From d90457f2c80ac2be43f1efe6bd1442ba9c508f39 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 13 Mar 2026 15:21:02 +0900 Subject: [PATCH 01/13] HIVE-29368: Could this be the truly accurate pessimistic stats combining? --- .../hadoop/hive/ql/stats/StatsUtils.java | 1 + .../estimator/PessimisticStatCombiner.java | 4 +- .../queries/clientpositive/ndv_case_const.q | 27 + .../clientpositive/llap/ndv_case_const.q.out | 755 ++++++++++++++++++ 4 files changed, 784 insertions(+), 3 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/ndv_case_const.q create mode 100644 ql/src/test/results/clientpositive/llap/ndv_case_const.q.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index c530633fbf1c..35fa75ebf7c4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1574,6 +1574,7 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis Optional res = se.estimate(csList); if (res.isPresent()) { ColStatistics newStats = res.get(); + newStats.setCountDistint(Math.min(newStats.getCountDistint(), numRows)); colType = colType.toLowerCase(); newStats.setColumnType(colType); newStats.setColumnName(colName); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index dde2019eadf7..5b1cfb73d722 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -41,9 +41,7 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - if (stat.getCountDistint() > result.getCountDistint()) { - result.setCountDistint(stat.getCountDistint()); - 
} + result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); if (stat.getNumNulls() > result.getNumNulls()) { result.setNumNulls(stat.getNumNulls()); } diff --git a/ql/src/test/queries/clientpositive/ndv_case_const.q b/ql/src/test/queries/clientpositive/ndv_case_const.q new file mode 100644 index 000000000000..42162d25c529 --- /dev/null +++ b/ql/src/test/queries/clientpositive/ndv_case_const.q @@ -0,0 +1,27 @@ +CREATE TABLE t (cond INT, c2 STRING, c100 STRING); +ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000'); +ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0'); +ALTER TABLE t UPDATE STATISTICS FOR COLUMN c2 SET('numDVs'='2','numNulls'='0','avgColLen'='5','maxColLen'='10'); +ALTER TABLE t UPDATE STATISTICS FOR COLUMN c100 SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE 'C' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE NULL END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' WHEN cond=3 THEN NULL ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 
THEN c100 ELSE c2 END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x; diff --git a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out new file mode 100644 index 000000000000..1d3c298a25c5 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out @@ -0,0 +1,755 @@ +PREHOOK: query: CREATE TABLE t (cond INT, c2 STRING, c100 STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t +POSTHOOK: query: CREATE TABLE t (cond INT, c2 STRING, c100 STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN c2 SET('numDVs'='2','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN 
c2 SET('numDVs'='2','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN c100 SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN c100 SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE 'C' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE 'C' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') ELSE ('C') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null 
sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') WHEN ((cond = 3)) THEN ('A') ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data 
size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE NULL END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE NULL END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num 
rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') ELSE (null) END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends 
on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (null) WHEN ((cond = 2)) THEN ('A') ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### 
+POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond) IN (1, 2)) THEN (null) WHEN ((cond = 3)) THEN ('A') ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: 
Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' WHEN cond=3 THEN NULL ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' WHEN cond=3 THEN NULL ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (null) WHEN ((cond = 2)) THEN ('A') WHEN ((cond = 3)) THEN (null) ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output 
Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond) IN (1, 2)) THEN (null) WHEN ((cond = 3)) THEN (c100) ELSE ('A') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9898 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + 
Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (c2) WHEN ((cond = 2)) THEN (c100) ELSE ('A') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9897 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 103 Data size: 9167 Basic stats: 
COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 103 Data size: 9167 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 103 Data size: 9167 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 103 Data size: 9167 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE c2 END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE c2 END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (c2) WHEN ((cond = 2)) THEN (c100) ELSE (c2) END (type: string) + outputColumnNames: 
_col0 + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9896 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern 
was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') WHEN ((cond = 3)) THEN ('C') ELSE (c2) END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY 
+POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') ELSE (c100) END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9898 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + From 46485cfa95d7df389186b482fdb42e3b924526b5 Mon Sep 17 00:00:00 2001 From: Konstantin 
Bereznyakov Date: Fri, 20 Mar 2026 11:00:52 -0700 Subject: [PATCH 02/13] HIVE-29368: NDV of 0 is "unknown", so combining it with anything else still remains "unknown" --- .../hive/ql/stats/estimator/PessimisticStatCombiner.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index 5b1cfb73d722..8272d0ff06d4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -41,7 +41,12 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); + // NDV=0 means "unknown" - if either stat has unknown NDV, preserve 0 to propagate uncertainty + if (result.getCountDistint() == 0L || stat.getCountDistint() == 0L) { + result.setCountDistint(0L); + } else { + result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); + } if (stat.getNumNulls() > result.getNumNulls()) { result.setNumNulls(stat.getNumNulls()); } From 1d3fb0591404d69bd8b7f6213e82c58b9df9f1fa Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 20 Mar 2026 11:59:51 -0700 Subject: [PATCH 03/13] HIVE-29368: use safeAdd --- .../hive/ql/stats/estimator/PessimisticStatCombiner.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index 7e4cb0cffed1..2c6867e6f7b9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ 
-21,6 +21,7 @@ import java.util.Optional; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.stats.StatsUtils; /** * Combines {@link ColStatistics} objects to provide the most pessimistic estimate. @@ -45,7 +46,7 @@ public void add(ColStatistics stat) { if (result.getCountDistint() == 0L || stat.getCountDistint() == 0L) { result.setCountDistint(0L); } else { - result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); + result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); } if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) { result.setNumNulls(-1); From 874996d9db21f7cec85525ae2c122a94ce76ad7c Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 20 Mar 2026 19:41:58 -0700 Subject: [PATCH 04/13] HIVE-29368: special handling of "const NULL" columns, unit tests, new .out files --- .../hadoop/hive/ql/plan/ColStatistics.java | 10 + .../hadoop/hive/ql/stats/StatsUtils.java | 5 +- .../estimator/PessimisticStatCombiner.java | 9 +- .../ql/stats/estimator/StatEstimator.java | 45 ++++- .../estimator/StatEstimatorProvider.java | 10 +- .../hive/ql/udf/generic/GenericUDFLower.java | 15 -- .../hive/ql/udf/generic/GenericUDFUpper.java | 15 -- .../TestPessimisticStatCombiner.java | 184 ++++++++++++++++++ .../ql/stats/estimator/TestStatEstimator.java | 169 ++++++++++++++++ .../llap/infer_bucket_sort_dyn_part.q.out | 10 +- .../llap/list_bucket_dml_6.q.out | 28 +-- .../llap/list_bucket_dml_7.q.out | 28 +-- .../llap/list_bucket_dml_8.q.out | 10 +- .../llap/merge_dynamic_partition4.q.out | 10 +- .../llap/merge_dynamic_partition5.q.out | 10 +- .../llap/scratch_col_issue.q.out | 6 +- 16 files changed, 469 insertions(+), 95 deletions(-) create mode 100644 ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java index 
717d1f8b6a7c..76b8c28691cc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java @@ -31,6 +31,7 @@ public class ColStatistics { private boolean isPrimaryKey; private boolean isEstimated; private boolean isFilteredColumn; + private boolean isConst; private byte[] bitVectors; private byte[] histogram; @@ -155,6 +156,8 @@ public String toString() { sb.append(" isEstimated: "); sb.append(isEstimated); + sb.append(" isConst: "); + sb.append(isConst); return sb.toString(); } @@ -171,6 +174,7 @@ public ColStatistics clone() { clone.setPrimaryKey(isPrimaryKey); clone.setIsEstimated(isEstimated); clone.setIsFilteredColumn(isFilteredColumn); + clone.setConst(isConst); if (range != null ) { clone.setRange(range.clone()); } @@ -191,6 +195,12 @@ public void setIsEstimated(boolean isEstimated) { public boolean isEstimated() { return isEstimated; } + public void setConst(boolean isConst) { + this.isConst = isConst; + } + + public boolean isConst() { return isConst; } + public static class Range { public final Number minValue; public final Number maxValue; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 4392ee905d86..f8aad54910e3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1578,10 +1578,9 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis csList.add(cs); } if (csList.size() == engfd.getChildren().size()) { - Optional res = se.estimate(csList); + Optional res = se.estimate(csList, numRows); if (res.isPresent()) { ColStatistics newStats = res.get(); - newStats.setCountDistint(Math.min(newStats.getCountDistint(), numRows)); colType = colType.toLowerCase(); newStats.setColumnType(colType); newStats.setColumnName(colName); @@ -1644,6 +1643,7 @@ private static ColStatistics 
buildColStatForConstant(HiveConf conf, long numRows colStats.setAvgColLen(avgColSize); colStats.setCountDistint(countDistincts); colStats.setNumNulls(numNulls); + colStats.setConst(true); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); @@ -2093,7 +2093,6 @@ public static long computeNDVGroupingColumns(List colStats, Stati return 0L; } if (ndvValues.isEmpty()) { - // No grouping columns, one row return 1L; } if (expDecay) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index 2c6867e6f7b9..b3086693ed52 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -42,12 +42,14 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - // NDV=0 means "unknown" - if either stat has unknown NDV, preserve 0 to propagate uncertainty - if (result.getCountDistint() == 0L || stat.getCountDistint() == 0L) { - result.setCountDistint(0L); + // NDV=0 is "unknown" only if the stat is NOT a constant. + // Constants with NDV=0 (e.g., NULL) are "known zero", not unknown. 
+ if ((result.getCountDistint() == 0 && !result.isConst()) || (stat.getCountDistint() == 0 && !stat.isConst())) { + result.setCountDistint(0); } else { result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); } + result.setConst(false); if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) { result.setNumNulls(-1); } else if (stat.getNumNulls() > result.getNumNulls()) { @@ -70,6 +72,5 @@ public void add(ColStatistics stat) { public Optional getResult() { return Optional.of(result); - } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java index 94aaa32ecfcb..98e96c48893a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java @@ -24,20 +24,53 @@ import org.apache.hadoop.hive.ql.plan.ColStatistics; /** - * Enables statistics related computation on UDFs + * Enables statistics related computation on UDFs. + * + *

<p>This interface provides two default implementations:
+ * <ul>
+ *   <li>{@link #estimate(List)} - clones the first argument's statistics (suitable for most UDFs)</li>
+ *   <li>{@link #estimate(List, long)} - calls estimate(List) and caps NDV at numRows</li>
+ * </ul>
+ *
+ * <p>
UDFs that simply pass through statistics (like LOWER, UPPER) can use the defaults. + * UDFs that combine statistics (like IF, WHEN, COALESCE) should override {@link #estimate(List)}. */ public interface StatEstimator { /** * Computes the output statistics of the actual UDF. * - * The estimator should return with a preferably overestimated {@link ColStatistics} object if possible. - * The actual estimation logic may decide to not give an estimation; it should return with {@link Optional#empty()}. + *

<p>The default implementation clones the first argument's statistics, which is suitable
+ * for most UDFs that don't significantly alter the statistical properties of their input.
+ *
+ * <p>
Override this method for UDFs that combine multiple inputs (like IF, WHEN, COALESCE) + * or significantly transform the data. + * + * @param argStats the statistics for every argument of the UDF + * @return {@link ColStatistics} estimate for the actual UDF, or empty if estimation is not possible + */ + default Optional estimate(List argStats) { + if (argStats.isEmpty()) { + return Optional.empty(); + } + return Optional.of(argStats.get(0).clone()); + } + + /** + * Computes the output statistics of the actual UDF, ensuring NDV does not exceed numRows. * - * Note: at the time of the call there will be {@link ColStatistics} for all the arguments; if that is not available - the estimation is skipped. + *
<p>
The default implementation calls {@link #estimate(List)} and caps the NDV at numRows. + * This ensures that estimators which combine statistics from multiple branches (producing + * potentially inflated NDV values) are automatically bounded by the number of rows. * * @param argStats the statistics for every argument of the UDF - * @return {@link ColStatistics} estimate for the actual UDF. + * @param numRows the number of rows, used to cap the NDV + * @return {@link ColStatistics} estimate for the actual UDF with NDV capped at numRows */ - public Optional estimate(List argStats); + default Optional estimate(List argStats, long numRows) { + return estimate(argStats).map(cs -> { + cs.setCountDistint(Math.min(cs.getCountDistint(), numRows)); + return cs; + }); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java index 96865d194c6e..c888493040e4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java @@ -19,11 +19,19 @@ /** * Marker interface for UDFs to communicate that the usage of StatEstimators is supported by the UDF. + * + *
<p>
The default implementation returns a {@link StatEstimator} that clones the first argument's + * statistics, which is suitable for most UDFs. Override {@link #getStatEstimator()} for UDFs + * that combine statistics from multiple inputs (like IF, WHEN, COALESCE). */ public interface StatEstimatorProvider { /** * Returns the {@link StatEstimator} for the given UDF instance. + * + *
<p>
The default implementation returns an estimator that clones the first argument's statistics. */ - public StatEstimator getStatEstimator(); + default StatEstimator getStatEstimator() { + return new StatEstimator() {}; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java index 411438907424..609274c0bfe4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java @@ -24,8 +24,6 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringLower; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -37,8 +35,6 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import java.util.List; -import java.util.Optional; /** * UDFLower. 
@@ -113,15 +109,4 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("lower", children); } - - @Override - public StatEstimator getStatEstimator() { - return new StatEstimator() { - @Override - public Optional estimate(List argStats) { - return Optional.of(argStats.get(0).clone()); - } - }; - } - } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java index 019cbe94a4ba..d0df8da9886b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java @@ -24,8 +24,6 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUpper; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -37,8 +35,6 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import java.util.List; -import java.util.Optional; /** * UDFUpper. 
@@ -115,15 +111,4 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("upper", children); } - - @Override - public StatEstimator getStatEstimator() { - return new StatEstimator() { - @Override - public Optional estimate(List argStats) { - return Optional.of(argStats.get(0).clone()); - } - }; - } - } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java index 98bc589e40d3..fb3eb09308e9 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java @@ -155,6 +155,184 @@ void testCombineBothUnknownNumTruesAndNumFalses() { assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)"); } + @Test + void testCombinePropagatesUnknownNdvFromFirst() { + ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); // NDV=0 means unknown + ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from first should be propagated"); + } + + @Test + void testCombinePropagatesUnknownNdvFromSecond() { + ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); // NDV=0 means unknown + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from second should be propagated"); + } + + @Test + void 
testCombineBothUnknownNdv() { + ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Both unknown NDV should result in unknown (0)"); + } + + @Test + void testCombineSumsNdvWhenBothKnown() { + ColStatistics stat1 = createStat("col1", "int", 50, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 30, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(80, combined.getCountDistint(), "Known NDVs should be summed"); + } + + @Test + void testCombineNdvOverflowProtection() { + ColStatistics stat1 = createStat("col1", "int", Long.MAX_VALUE - 10, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(Long.MAX_VALUE, combined.getCountDistint(), "NDV overflow should be capped at Long.MAX_VALUE"); + } + + @Test + void testCombineThreeStats() { + ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); + ColStatistics stat2 = createStat("col2", "int", 20, 10, 4.0); + ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + combiner.add(stat3); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(60, combined.getCountDistint(), "Three NDVs should be summed"); + assertEquals(15, combined.getNumNulls(), "Should take max numNulls"); + } + + @Test + void testCombineUnknownNdvInMiddle() { + 
ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 10, 4.0); // unknown + ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + combiner.add(stat3); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Unknown NDV in middle should propagate"); + } + + @Test + void testConstantWithNdvZeroIsNotTreatedAsUnknown() { + ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); + ColStatistics stat2 = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant: NDV=0, isConst=true + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(1, combined.getCountDistint(), "Constant with NDV=0 should not propagate as unknown"); + } + + @Test + void testNullConstantFirstThenOtherConstants() { + ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant + ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); + ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(nullConst); + combiner.add(constA); + combiner.add(constB); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(2, combined.getCountDistint(), "NULL(0) + A(1) + B(1) should sum to 2"); + } + + @Test + void testConstantsWithNullInMiddle() { + ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); + ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant + ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(constA); + combiner.add(nullConst); + 
combiner.add(constB); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(2, combined.getCountDistint(), "A(1) + NULL(0) + B(1) should sum to 2"); + } + + @Test + void testNonConstantNdvZeroStillPropagatesUnknown() { + ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); + ColStatistics stat2 = createStat("col2", "string", 0, 10, 5.0); // Column with unknown NDV (isConst=false) + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Non-constant with NDV=0 should still propagate as unknown"); + } + + @Test + void testMixedConstantAndNonConstantWithNdvZero() { + ColStatistics constStat = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant + ColStatistics colStat = createStat("col", "string", 0, 10, 5.0); // Column with unknown NDV + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(constStat); + combiner.add(colStat); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Non-constant with NDV=0 should propagate unknown even if combined with constant"); + } + + @Test + void testCombinedResultIsNotConst() { + ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); + ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(constA); + combiner.add(constB); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(false, combined.isConst(), "Combined result should not be marked as constant"); + } + private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { ColStatistics stat = new ColStatistics(name, type); stat.setCountDistint(ndv); @@ -162,4 +340,10 @@ private ColStatistics createStat(String name, String type, long ndv, 
long numNul stat.setAvgColLen(avgColLen); return stat; } + + private ColStatistics createConstStat(String name, String type, long ndv, long numNulls, double avgColLen) { + ColStatistics stat = createStat(name, type, ndv, numNulls, avgColLen); + stat.setConst(true); + return stat; + } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java new file mode 100644 index 000000000000..09140f3d15bb --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.stats.estimator; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Optional; + +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.junit.jupiter.api.Test; + +class TestStatEstimator { + + @Test + void testDefaultEstimateWithEmptyList() { + StatEstimator estimator = new StatEstimator() {}; + Optional result = estimator.estimate(Collections.emptyList()); + assertFalse(result.isPresent(), "Empty list should return empty Optional"); + } + + @Test + void testDefaultEstimateClonesFirstArg() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat)); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint()); + assertEquals(10, result.get().getNumNulls()); + assertEquals(4.0, result.get().getAvgColLen()); + } + + @Test + void testDefaultEstimateReturnsCloneNotSameReference() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat)); + + assertTrue(result.isPresent()); + assertNotSame(stat, result.get(), "Should return a clone, not the same reference"); + stat.setCountDistint(999); + assertEquals(100, result.get().getCountDistint(), "Clone should not be affected by original changes"); + } + + @Test + void testDefaultEstimateIgnoresSubsequentArgs() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 200, 20, 8.0); + + Optional result = 
estimator.estimate(Arrays.asList(stat1, stat2)); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint(), "Should use first arg's NDV"); + assertEquals(10, result.get().getNumNulls(), "Should use first arg's numNulls"); + } + + @Test + void testDefaultEstimateWithNumRowsCapsNdv() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "NDV should be capped at numRows"); + } + + @Test + void testDefaultEstimateWithNumRowsNoCappingNeeded() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint(), "NDV should remain unchanged when less than numRows"); + } + + @Test + void testDefaultEstimateWithNumRowsExactlyEqual() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 500, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "NDV should remain unchanged when equal to numRows"); + } + + @Test + void testDefaultEstimateWithNumRowsEmptyList() { + StatEstimator estimator = new StatEstimator() {}; + + Optional result = estimator.estimate(Collections.emptyList(), 500); + + assertFalse(result.isPresent(), "Empty list should return empty Optional"); + } + + @Test + void testDefaultEstimateWithNumRowsPreservesOtherStats() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); + stat.setNumTrues(50); + stat.setNumFalses(40); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + 
assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "NDV should be capped"); + assertEquals(10, result.get().getNumNulls(), "numNulls should be preserved"); + assertEquals(4.0, result.get().getAvgColLen(), "avgColLen should be preserved"); + } + + @Test + void testStatEstimatorProviderDefaultReturnsWorkingEstimator() { + StatEstimatorProvider provider = new StatEstimatorProvider() {}; + StatEstimator estimator = provider.getStatEstimator(); + + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + Optional result = estimator.estimate(Arrays.asList(stat)); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint()); + } + + @Test + void testStatEstimatorProviderDefaultCapsNdv() { + StatEstimatorProvider provider = new StatEstimatorProvider() {}; + StatEstimator estimator = provider.getStatEstimator(); + + ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "Default provider estimator should cap NDV"); + } + + private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { + ColStatistics stat = new ColStatistics(name, type); + stat.setCountDistint(ndv); + stat.setNumNulls(numNulls); + stat.setAvgColLen(avgColLen); + return stat; + } +} diff --git a/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out b/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out index 995733564a08..17db16415c01 100644 --- a/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out +++ b/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out @@ -492,13 +492,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 
Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) Reducer 3 Execution mode: vectorized, llap @@ -508,14 +508,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data 
size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out index dbcf49b202e7..df098011525b 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out @@ -96,7 +96,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -104,7 +104,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -199,18 +199,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator 
expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -317,7 +317,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +358,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 3 + numFiles 6 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -461,7 +461,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic 
stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -469,7 +469,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -564,18 +564,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 
compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -682,7 +682,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +723,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 3 + numFiles 6 numRows 984 rawDataSize 9488 totalSize #Masked# diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out index ad7051398156..d1e40c4588f0 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out @@ -96,7 +96,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -104,7 +104,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: 
COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -199,18 +199,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -317,7 +317,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: 
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +358,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 4 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -461,7 +461,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -469,7 +469,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -564,18 +564,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: 
bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -682,7 +682,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +723,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 4 numRows 984 rawDataSize 9488 totalSize #Masked# diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out index 148303926d66..4e5651cccc53 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out @@ -96,7 +96,7 @@ STAGE PLANS: 
minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -104,7 +104,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -199,18 +199,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data 
size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat diff --git a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out index 2c9c9015c173..85f1ea93c068 100644 --- a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out +++ b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out @@ -180,13 +180,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) Execution mode: llap LLAP IO: no inputs @@ -198,14 +198,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: 
_col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out index 5b1e537b938a..ab9805c19485 100644 --- a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out +++ b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out @@ -156,13 +156,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column 
stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) Execution mode: llap LLAP IO: no inputs @@ -174,14 +174,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE 
Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out index 5418ef860de3..1e23944fcaf7 100644 --- a/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out +++ b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out @@ -189,7 +189,7 @@ STAGE PLANS: outputColumnNames: _col1, _col2 input vertices: 1 Map 2 - Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: if((_col1) IN ('CertificateOfDeposit', 'RecurringDeposit', 'TermDeposit'), COALESCE(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'),' '), '') (type: string) outputColumnNames: _col0 @@ -198,13 +198,13 @@ STAGE PLANS: native: true projectedOutputColumnNums: [14] selectExpressions: IfExprCondExprColumn(col 9:boolean, col 13:string, col 5:string)(children: StringColumnInList(col 1, values CertificateOfDeposit, RecurringDeposit, TermDeposit) -> 9:boolean, VectorCoalesce(columns [5, 12])(children: VectorUDFAdaptor(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'))(children: VectorUDFUnixTimeStampDate(col 10)(children: CastStringToDate(col 2:string) -> 10:date) -> 11:bigint) -> 5:string, ConstantVectorExpression(val ) -> 12:string) -> 13:string, ConstantVectorExpression(val ) -> 5:string) -> 14:string - Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - 
Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat From f89edb7d4a26a2eb6abb096ca7fe39a04bba33af Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Sat, 21 Mar 2026 10:09:37 -0700 Subject: [PATCH 05/13] HIVE-29368: trigger a rebuild From d7aed0eb7afa55de0e3b143bd6adbd6cb549e9fb Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Tue, 24 Mar 2026 11:25:11 -0700 Subject: [PATCH 06/13] HIVE-29368: configured split file counts consistently between CI & localhost --- .../clientpositive/list_bucket_dml_6.q | 20 ++++++++----- .../clientpositive/list_bucket_dml_7.q | 11 ++++--- .../llap/list_bucket_dml_6.q.out | 30 ++++++++----------- .../llap/list_bucket_dml_7.q.out | 18 +++-------- 4 files changed, 34 insertions(+), 45 deletions(-) diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q index 2ce2ced59e04..de1c802e9ee7 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q @@ -1,10 +1,19 @@ --! qt:dataset:srcpart +-- Debug: DELETEME: show merge settings +set hive.merge.mapfiles; +set hive.merge.mapredfiles; +set hive.merge.tezfiles; +set hive.merge.smallfiles.avgsize; +-- Debug: DELETEME: end + +-- this ensures consistent split file counts between localhost & CI runs +set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; set hive.merge.smallfiles.avgsize=200; set mapred.input.dir.recursive=true; -set hive.merge.mapfiles=false; +set hive.merge.mapfiles=false; set hive.merge.mapredfiles=false; -- list bucketing DML: dynamic partition. 
multiple skewed columns. merge. @@ -43,13 +52,13 @@ set hive.merge.mapredfiles=false; -- 87 000000_0 -- 87 000001_0 -- with merge --- 118 000002_0 +-- 118 000002_0 -- SORT_QUERY_RESULTS -- create a skewed table -create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE; @@ -92,6 +101,3 @@ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; --- clean up -drop table list_bucketing_dynamic_part_n3; - diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q index f80585e56c6f..a4a21aaa1ceb 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q @@ -1,4 +1,6 @@ --! 
qt:dataset:srcpart +-- this ensures consistent split file counts between localhost & CI runs +set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; @@ -39,10 +41,10 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); -set hive.merge.mapfiles=true; +set hive.merge.mapfiles=true; set hive.merge.mapredfiles=true; -- list bucketing DML with merge. use bucketize to generate a few small files. explain extended @@ -54,7 +56,7 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); select count(1) from srcpart where ds = '2008-04-08'; @@ -65,6 +67,3 @@ explain extended select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; - --- clean up -drop table list_bucketing_dynamic_part; diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out index df098011525b..d9d5d18340eb 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out +++ 
b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out @@ -1,13 +1,17 @@ -PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +hive.merge.mapfiles=true +hive.merge.mapredfiles=false +hive.merge.tezfiles=false +hive.merge.smallfiles.avgsize=16000000 +PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@list_bucketing_dynamic_part_n3 -POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE @@ -317,7 +321,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +362,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 6 + numFiles 3 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -682,7 +686,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +727,7 @@ Table: 
list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 6 + numFiles 3 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -898,13 +902,3 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 -PREHOOK: query: drop table list_bucketing_dynamic_part_n3 -PREHOOK: type: DROPTABLE -PREHOOK: Input: default@list_bucketing_dynamic_part_n3 -PREHOOK: Output: database:default -PREHOOK: Output: default@list_bucketing_dynamic_part_n3 -POSTHOOK: query: drop table list_bucketing_dynamic_part_n3 -POSTHOOK: type: DROPTABLE -POSTHOOK: Input: default@list_bucketing_dynamic_part_n3 -POSTHOOK: Output: database:default -POSTHOOK: Output: default@list_bucketing_dynamic_part_n3 diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out index d1e40c4588f0..e1dbd260d038 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out @@ -317,7 +317,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +358,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 4 + numFiles 2 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -682,7 +682,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE 
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +723,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 4 + numFiles 2 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -898,13 +898,3 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 -PREHOOK: query: drop table list_bucketing_dynamic_part -PREHOOK: type: DROPTABLE -PREHOOK: Input: default@list_bucketing_dynamic_part -PREHOOK: Output: database:default -PREHOOK: Output: default@list_bucketing_dynamic_part -POSTHOOK: query: drop table list_bucketing_dynamic_part -POSTHOOK: type: DROPTABLE -POSTHOOK: Input: default@list_bucketing_dynamic_part -POSTHOOK: Output: database:default -POSTHOOK: Output: default@list_bucketing_dynamic_part From bf047c0b69a88801864c7cfabf6bc92bb69647cf Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Tue, 24 Mar 2026 17:01:47 -0700 Subject: [PATCH 07/13] HIVE-29368: removed debug info from the test file --- ql/src/test/queries/clientpositive/list_bucket_dml_6.q | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q index de1c802e9ee7..11986d696aff 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q @@ -1,11 +1,4 @@ --! 
qt:dataset:srcpart --- Debug: DELETEME: show merge settings -set hive.merge.mapfiles; -set hive.merge.mapredfiles; -set hive.merge.tezfiles; -set hive.merge.smallfiles.avgsize; --- Debug: DELETEME: end - -- this ensures consistent split file counts between localhost & CI runs set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; From d4826c2d877b5fb1a62e09ad86e561a6a297b595 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Wed, 25 Mar 2026 10:51:23 -0700 Subject: [PATCH 08/13] HIVE-29368: refactoring as per the PR feedback --- .../hadoop/hive/ql/plan/ColStatistics.java | 10 - .../hadoop/hive/ql/stats/StatsUtils.java | 16 +- .../estimator/PessimisticStatCombiner.java | 7 +- .../ql/stats/estimator/StatEstimator.java | 45 +--- .../estimator/StatEstimatorProvider.java | 10 +- .../hive/ql/udf/generic/GenericUDFLower.java | 15 ++ .../hive/ql/udf/generic/GenericUDFUpper.java | 15 ++ .../hadoop/hive/ql/stats/TestStatsUtils.java | 208 ++++++++++++++++ .../TestPessimisticStatCombiner.java | 223 +++--------------- .../ql/stats/estimator/TestStatEstimator.java | 169 ------------- .../clientpositive/list_bucket_dml_6.q | 13 +- .../clientpositive/list_bucket_dml_7.q | 11 +- .../queries/clientpositive/ndv_case_const.q | 13 + .../llap/list_bucket_dml_6.q.out | 22 +- .../llap/list_bucket_dml_7.q.out | 10 + .../clientpositive/llap/ndv_case_const.q.out | 121 +++++++++- 16 files changed, 462 insertions(+), 446 deletions(-) delete mode 100644 ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java index 76b8c28691cc..717d1f8b6a7c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java @@ -31,7 +31,6 @@ public class ColStatistics { private boolean isPrimaryKey; private boolean isEstimated; private boolean isFilteredColumn; - 
private boolean isConst; private byte[] bitVectors; private byte[] histogram; @@ -156,8 +155,6 @@ public String toString() { sb.append(" isEstimated: "); sb.append(isEstimated); - sb.append(" isConst: "); - sb.append(isConst); return sb.toString(); } @@ -174,7 +171,6 @@ public ColStatistics clone() { clone.setPrimaryKey(isPrimaryKey); clone.setIsEstimated(isEstimated); clone.setIsFilteredColumn(isFilteredColumn); - clone.setConst(isConst); if (range != null ) { clone.setRange(range.clone()); } @@ -195,12 +191,6 @@ public void setIsEstimated(boolean isEstimated) { public boolean isEstimated() { return isEstimated; } - public void setConst(boolean isConst) { - this.isConst = isConst; - } - - public boolean isConst() { return isConst; } - public static class Range { public final Number minValue; public final Number maxValue; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index f8aad54910e3..830c4b6c8cec 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1578,9 +1578,11 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis csList.add(cs); } if (csList.size() == engfd.getChildren().size()) { - Optional res = se.estimate(csList, numRows); + Optional res = se.estimate(csList); if (res.isPresent()) { ColStatistics newStats = res.get(); + // NDV cannot exceed numRows + newStats.setCountDistint(Math.min(newStats.getCountDistint(), numRows)); colType = colType.toLowerCase(); newStats.setColumnType(colType); newStats.setColumnName(colName); @@ -1626,14 +1628,10 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis } private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) { - long numNulls = 0; - long countDistincts = 0; + long countDistincts = 1; if (encd.getValue() == null) { - // null 
projection numNulls = numRows; - } else { - countDistincts = 1; } String colType = encd.getTypeString(); colType = colType.toLowerCase(); @@ -1643,7 +1641,6 @@ private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows colStats.setAvgColLen(avgColSize); colStats.setCountDistint(countDistincts); colStats.setNumNulls(numNulls); - colStats.setConst(true); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); @@ -2093,6 +2090,7 @@ public static long computeNDVGroupingColumns(List colStats, Stati return 0L; } if (ndvValues.isEmpty()) { + // No grouping columns, one row return 1L; } if (expDecay) { @@ -2109,7 +2107,9 @@ private static List extractNDVGroupingColumns(List colStats for (ColStatistics cs : colStats) { if (cs != null) { long ndv = cs.getCountDistint(); - if (cs.getNumNulls() > 0) { + // +1 for NULL group: source columns with partial nulls and known NDV only. + // Computed expressions include NULL. Ordered: numNulls>0 first (often false). + if (!cs.isEstimated() && cs.getNumNulls() > 0 && ndv > 0 && cs.getNumNulls() < parentStats.getNumRows()) { ndv = StatsUtils.safeAdd(ndv, 1); } ndvValues.add(ndv); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index b3086693ed52..f84484c456be 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -42,14 +42,13 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - // NDV=0 is "unknown" only if the stat is NOT a constant. - // Constants with NDV=0 (e.g., NULL) are "known zero", not unknown. 
- if ((result.getCountDistint() == 0 && !result.isConst()) || (stat.getCountDistint() == 0 && !stat.isConst())) { + // If any branch has NDV=0 (unknown stats), propagate unknown to result. + // Summing would treat unknown as zero, causing cardinality underestimates. + if (result.getCountDistint() == 0 || stat.getCountDistint() == 0) { result.setCountDistint(0); } else { result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); } - result.setConst(false); if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) { result.setNumNulls(-1); } else if (stat.getNumNulls() > result.getNumNulls()) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java index 98e96c48893a..94aaa32ecfcb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java @@ -24,53 +24,20 @@ import org.apache.hadoop.hive.ql.plan.ColStatistics; /** - * Enables statistics related computation on UDFs. - * - *

This interface provides two default implementations: - *

    - *
  • {@link #estimate(List)} - clones the first argument's statistics (suitable for most UDFs)
  • - *
  • {@link #estimate(List, long)} - calls estimate(List) and caps NDV at numRows
  • - *
- * - *

UDFs that simply pass through statistics (like LOWER, UPPER) can use the defaults. - * UDFs that combine statistics (like IF, WHEN, COALESCE) should override {@link #estimate(List)}. + * Enables statistics related computation on UDFs */ public interface StatEstimator { /** * Computes the output statistics of the actual UDF. * - *

The default implementation clones the first argument's statistics, which is suitable - * for most UDFs that don't significantly alter the statistical properties of their input. - * - *

Override this method for UDFs that combine multiple inputs (like IF, WHEN, COALESCE) - * or significantly transform the data. - * - * @param argStats the statistics for every argument of the UDF - * @return {@link ColStatistics} estimate for the actual UDF, or empty if estimation is not possible - */ - default Optional estimate(List argStats) { - if (argStats.isEmpty()) { - return Optional.empty(); - } - return Optional.of(argStats.get(0).clone()); - } - - /** - * Computes the output statistics of the actual UDF, ensuring NDV does not exceed numRows. + * The estimator should return with a preferably overestimated {@link ColStatistics} object if possible. + * The actual estimation logic may decide to not give an estimation; it should return with {@link Optional#empty()}. * - *

The default implementation calls {@link #estimate(List)} and caps the NDV at numRows. - * This ensures that estimators which combine statistics from multiple branches (producing - * potentially inflated NDV values) are automatically bounded by the number of rows. + * Note: at the time of the call there will be {@link ColStatistics} for all the arguments; if that is not available - the estimation is skipped. * * @param argStats the statistics for every argument of the UDF - * @param numRows the number of rows, used to cap the NDV - * @return {@link ColStatistics} estimate for the actual UDF with NDV capped at numRows + * @return {@link ColStatistics} estimate for the actual UDF. */ - default Optional estimate(List argStats, long numRows) { - return estimate(argStats).map(cs -> { - cs.setCountDistint(Math.min(cs.getCountDistint(), numRows)); - return cs; - }); - } + public Optional estimate(List argStats); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java index c888493040e4..96865d194c6e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java @@ -19,19 +19,11 @@ /** * Marker interface for UDFs to communicate that the usage of StatEstimators is supported by the UDF. - * - *

The default implementation returns a {@link StatEstimator} that clones the first argument's - * statistics, which is suitable for most UDFs. Override {@link #getStatEstimator()} for UDFs - * that combine statistics from multiple inputs (like IF, WHEN, COALESCE). */ public interface StatEstimatorProvider { /** * Returns the {@link StatEstimator} for the given UDF instance. - * - *

The default implementation returns an estimator that clones the first argument's statistics. */ - default StatEstimator getStatEstimator() { - return new StatEstimator() {}; - } + public StatEstimator getStatEstimator(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java index 609274c0bfe4..411438907424 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java @@ -24,6 +24,8 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringLower; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -35,6 +37,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import java.util.List; +import java.util.Optional; /** * UDFLower. 
@@ -109,4 +113,15 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("lower", children); } + + @Override + public StatEstimator getStatEstimator() { + return new StatEstimator() { + @Override + public Optional estimate(List argStats) { + return Optional.of(argStats.get(0).clone()); + } + }; + } + } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java index d0df8da9886b..019cbe94a4ba 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java @@ -24,6 +24,8 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUpper; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -35,6 +37,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import java.util.List; +import java.util.Optional; /** * UDFUpper. 
@@ -111,4 +115,15 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("upper", children); } + + @Override + public StatEstimator getStatEstimator() { + return new StatEstimator() { + @Override + public Optional estimate(List argStats) { + return Optional.of(argStats.get(0).clone()); + } + }; + } + } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index 3f76c554d446..bc54e749834d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -19,12 +19,14 @@ package org.apache.hadoop.hive.ql.stats; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import java.lang.reflect.Field; import java.lang.reflect.Modifier; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; @@ -39,7 +41,12 @@ import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ColStatistics.Range; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIf; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde.serdeConstants; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -499,4 +506,205 @@ void 
testScaleColStatisticsPreservesUnknownNumFalses() { assertEquals(-1, colStats.get(0).getNumFalses(), "Unknown numFalses (-1) should be preserved after scaling"); } + // Tests for buildColStatForConstant (via getColStatisticsFromExpression) + + @Test + void testGetColStatisticsFromExpressionNullConstant() { + HiveConf conf = new HiveConf(); + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + + ExprNodeConstantDesc nullConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, null); + ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, nullConst); + + assertNotNull(cs); + assertEquals(1, cs.getCountDistint(), "NULL constant should have NDV=1"); + assertEquals(1000, cs.getNumNulls(), "NULL constant should have numNulls=numRows"); + assertFalse(cs.isEstimated(), "Constant stats should not be marked as estimated"); + } + + @Test + void testGetColStatisticsFromExpressionNonNullConstant() { + HiveConf conf = new HiveConf(); + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + + ExprNodeConstantDesc strConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "hello"); + ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, strConst); + + assertNotNull(cs); + assertEquals(1, cs.getCountDistint(), "Non-NULL constant should have NDV=1"); + assertEquals(0, cs.getNumNulls(), "Non-NULL constant should have numNulls=0"); + } + + @Test + void testGetColStatisticsFromExpressionIntConstant() { + HiveConf conf = new HiveConf(); + Statistics parentStats = new Statistics(500, 4000, 0, 0); + + ExprNodeConstantDesc intConst = new ExprNodeConstantDesc(TypeInfoFactory.intTypeInfo, 42); + ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, intConst); + + assertNotNull(cs); + assertEquals(1, cs.getCountDistint(), "Integer constant should have NDV=1"); + assertEquals(0, cs.getNumNulls(), "Integer constant should have numNulls=0"); + assertNotNull(cs.getRange(), "Integer 
constant should have a range"); + assertEquals(42, cs.getRange().minValue.intValue()); + assertEquals(42, cs.getRange().maxValue.intValue()); + } + + // Tests for computeNDVGroupingColumns / extractNDVGroupingColumns + + @Test + void testComputeNDVGroupingColumnsSourceColumnWithNulls() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(100); + cs.setNumNulls(50); + cs.setIsEstimated(false); // source column + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(101, ndv, "Source column with nulls should get +1 for NULL: 100 + 1 = 101"); + } + + @Test + void testComputeNDVGroupingColumnsSourceColumnNoNulls() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(100); + cs.setNumNulls(0); + cs.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(100, ndv, "Source column without nulls should not get +1"); + } + + @Test + void testComputeNDVGroupingColumnsEstimatedExpression() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("case_expr", "string"); + cs.setCountDistint(3); + cs.setNumNulls(500); + cs.setIsEstimated(true); // computed expression (e.g., CASE) + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(3, ndv, "Estimated expression should NOT get +1 (already accounts for NULL)"); + } + + @Test + void testComputeNDVGroupingColumnsAllNullColumn() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); 
+ + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(1); + cs.setNumNulls(1000); // all rows are NULL + cs.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(1, ndv, "All-NULL column should NOT get +1 (numNulls == numRows)"); + } + + @Test + void testComputeNDVGroupingColumnsUnknownNdv() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(0); // unknown NDV + cs.setNumNulls(50); + cs.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(0, ndv, "Unknown NDV (0) should NOT get +1 to avoid false precision"); + } + + @Test + void testComputeNDVGroupingColumnsMultipleColumns() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs1 = new ColStatistics("col1", "string"); + cs1.setCountDistint(10); + cs1.setNumNulls(50); + cs1.setIsEstimated(false); + + ColStatistics cs2 = new ColStatistics("col2", "int"); + cs2.setCountDistint(5); + cs2.setNumNulls(0); + cs2.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs1, cs2), parentStats, false); + // col1: 10 + 1 = 11 (has nulls), col2: 5 (no nulls) + // Product: 11 * 5 = 55 + assertEquals(55, ndv, "Product of NDVs: (10+1) * 5 = 55"); + } + + @Test + void testComputeNDVGroupingColumnsMixedEstimatedAndSource() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics sourceCol = new ColStatistics("col1", "string"); + sourceCol.setCountDistint(10); + sourceCol.setNumNulls(50); + sourceCol.setIsEstimated(false); // source: gets +1 + + ColStatistics caseExpr = new 
ColStatistics("case_expr", "string"); + caseExpr.setCountDistint(3); + caseExpr.setNumNulls(200); + caseExpr.setIsEstimated(true); // estimated: no +1 + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(sourceCol, caseExpr), parentStats, false); + // sourceCol: 10 + 1 = 11, caseExpr: 3 (no +1) + // Product: 11 * 3 = 33 + assertEquals(33, ndv, "Mixed columns: source (10+1) * estimated (3) = 33"); + } + + // Test for NDV cap after StatEstimator (NDV cannot exceed numRows) + + @Test + void testGetColStatisticsFromExpressionNdvCappedAtNumRows() throws Exception { + HiveConf conf = new HiveConf(); + conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_ESTIMATORS_ENABLE, true); + + // Create parent stats with only 100 rows + Statistics parentStats = new Statistics(100, 800, 0, 0); + + // Create column stats for col1 and col2 with high NDV (each 80) + ColStatistics col1Stats = new ColStatistics("col1", "string"); + col1Stats.setCountDistint(80); + col1Stats.setNumNulls(0); + col1Stats.setAvgColLen(10); + + ColStatistics col2Stats = new ColStatistics("col2", "string"); + col2Stats.setCountDistint(80); + col2Stats.setNumNulls(0); + col2Stats.setAvgColLen(10); + + parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); + + // Create IF(true, col1, col2) expression + // IF uses PessimisticStatCombiner which sums NDVs: 80 + 80 = 160 + // But numRows is only 100, so NDV should be capped at 100 + GenericUDFIf udfIf = new GenericUDFIf(); + ExprNodeConstantDesc condExpr = new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, true); + ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); + ExprNodeColumnDesc col2Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col2", "t", false); + + ExprNodeGenericFuncDesc ifExpr = new ExprNodeGenericFuncDesc( + TypeInfoFactory.stringTypeInfo, udfIf, "if", + Arrays.asList(condExpr, col1Expr, col2Expr)); + + ColStatistics result = 
StatsUtils.getColStatisticsFromExpression(conf, parentStats, ifExpr); + + assertNotNull(result); + // PessimisticStatCombiner would produce 80 + 80 = 160, but cap ensures NDV <= numRows (100) + assertEquals(100, result.getCountDistint(), "NDV should be capped at numRows (100), not 160"); + } + } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java index fb3eb09308e9..9840cfeaf269 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java @@ -25,6 +25,45 @@ class TestPessimisticStatCombiner { + @Test + void testNdvSumWhenBothKnown() { + ColStatistics stat1 = createStat("col1", "int", 50, 0, 4.0); + ColStatistics stat2 = createStat("col2", "int", 30, 0, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics result = combiner.getResult().get(); + assertEquals(80, result.getCountDistint(), "NDV should be summed: 50 + 30 = 80"); + } + + @Test + void testNdvUnknownPropagatedFromFirst() { + ColStatistics stat1 = createStat("col1", "int", 0, 0, 4.0); + ColStatistics stat2 = createStat("col2", "int", 100, 0, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics result = combiner.getResult().get(); + assertEquals(0, result.getCountDistint(), "Unknown NDV (0) should propagate"); + } + + @Test + void testNdvUnknownPropagatedFromSecond() { + ColStatistics stat1 = createStat("col1", "int", 100, 0, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 0, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics result = combiner.getResult().get(); + 
assertEquals(0, result.getCountDistint(), "Unknown NDV (0) should propagate"); + } + @Test void testCombinePropagatesUnknownNumNullsFromFirst() { ColStatistics stat1 = createStat("col1", "int", 50, -1, 4.0); // unknown numNulls @@ -155,184 +194,6 @@ void testCombineBothUnknownNumTruesAndNumFalses() { assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)"); } - @Test - void testCombinePropagatesUnknownNdvFromFirst() { - ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); // NDV=0 means unknown - ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from first should be propagated"); - } - - @Test - void testCombinePropagatesUnknownNdvFromSecond() { - ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); // NDV=0 means unknown - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from second should be propagated"); - } - - @Test - void testCombineBothUnknownNdv() { - ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Both unknown NDV should result in unknown (0)"); - } - - @Test - void testCombineSumsNdvWhenBothKnown() { - ColStatistics stat1 = createStat("col1", "int", 50, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 30, 20, 4.0); - - 
PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(80, combined.getCountDistint(), "Known NDVs should be summed"); - } - - @Test - void testCombineNdvOverflowProtection() { - ColStatistics stat1 = createStat("col1", "int", Long.MAX_VALUE - 10, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(Long.MAX_VALUE, combined.getCountDistint(), "NDV overflow should be capped at Long.MAX_VALUE"); - } - - @Test - void testCombineThreeStats() { - ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); - ColStatistics stat2 = createStat("col2", "int", 20, 10, 4.0); - ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - combiner.add(stat3); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(60, combined.getCountDistint(), "Three NDVs should be summed"); - assertEquals(15, combined.getNumNulls(), "Should take max numNulls"); - } - - @Test - void testCombineUnknownNdvInMiddle() { - ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); - ColStatistics stat2 = createStat("col2", "int", 0, 10, 4.0); // unknown - ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - combiner.add(stat3); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Unknown NDV in middle should propagate"); - } - - @Test - void testConstantWithNdvZeroIsNotTreatedAsUnknown() { - ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); - 
ColStatistics stat2 = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant: NDV=0, isConst=true - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(1, combined.getCountDistint(), "Constant with NDV=0 should not propagate as unknown"); - } - - @Test - void testNullConstantFirstThenOtherConstants() { - ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant - ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); - ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(nullConst); - combiner.add(constA); - combiner.add(constB); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(2, combined.getCountDistint(), "NULL(0) + A(1) + B(1) should sum to 2"); - } - - @Test - void testConstantsWithNullInMiddle() { - ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); - ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant - ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(constA); - combiner.add(nullConst); - combiner.add(constB); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(2, combined.getCountDistint(), "A(1) + NULL(0) + B(1) should sum to 2"); - } - - @Test - void testNonConstantNdvZeroStillPropagatesUnknown() { - ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); - ColStatistics stat2 = createStat("col2", "string", 0, 10, 5.0); // Column with unknown NDV (isConst=false) - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, 
combined.getCountDistint(), "Non-constant with NDV=0 should still propagate as unknown"); - } - - @Test - void testMixedConstantAndNonConstantWithNdvZero() { - ColStatistics constStat = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant - ColStatistics colStat = createStat("col", "string", 0, 10, 5.0); // Column with unknown NDV - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(constStat); - combiner.add(colStat); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Non-constant with NDV=0 should propagate unknown even if combined with constant"); - } - - @Test - void testCombinedResultIsNotConst() { - ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); - ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(constA); - combiner.add(constB); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(false, combined.isConst(), "Combined result should not be marked as constant"); - } - private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { ColStatistics stat = new ColStatistics(name, type); stat.setCountDistint(ndv); @@ -340,10 +201,4 @@ private ColStatistics createStat(String name, String type, long ndv, long numNul stat.setAvgColLen(avgColLen); return stat; } - - private ColStatistics createConstStat(String name, String type, long ndv, long numNulls, double avgColLen) { - ColStatistics stat = createStat(name, type, ndv, numNulls, avgColLen); - stat.setConst(true); - return stat; - } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java deleted file mode 100644 index 09140f3d15bb..000000000000 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java +++ 
/dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.stats.estimator; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotSame; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Arrays; -import java.util.Collections; -import java.util.Optional; - -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.junit.jupiter.api.Test; - -class TestStatEstimator { - - @Test - void testDefaultEstimateWithEmptyList() { - StatEstimator estimator = new StatEstimator() {}; - Optional result = estimator.estimate(Collections.emptyList()); - assertFalse(result.isPresent(), "Empty list should return empty Optional"); - } - - @Test - void testDefaultEstimateClonesFirstArg() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat)); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint()); - assertEquals(10, result.get().getNumNulls()); - assertEquals(4.0, 
result.get().getAvgColLen()); - } - - @Test - void testDefaultEstimateReturnsCloneNotSameReference() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat)); - - assertTrue(result.isPresent()); - assertNotSame(stat, result.get(), "Should return a clone, not the same reference"); - stat.setCountDistint(999); - assertEquals(100, result.get().getCountDistint(), "Clone should not be affected by original changes"); - } - - @Test - void testDefaultEstimateIgnoresSubsequentArgs() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 200, 20, 8.0); - - Optional result = estimator.estimate(Arrays.asList(stat1, stat2)); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint(), "Should use first arg's NDV"); - assertEquals(10, result.get().getNumNulls(), "Should use first arg's numNulls"); - } - - @Test - void testDefaultEstimateWithNumRowsCapsNdv() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "NDV should be capped at numRows"); - } - - @Test - void testDefaultEstimateWithNumRowsNoCappingNeeded() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint(), "NDV should remain unchanged when less than numRows"); - } - - @Test - void testDefaultEstimateWithNumRowsExactlyEqual() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 
500, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "NDV should remain unchanged when equal to numRows"); - } - - @Test - void testDefaultEstimateWithNumRowsEmptyList() { - StatEstimator estimator = new StatEstimator() {}; - - Optional result = estimator.estimate(Collections.emptyList(), 500); - - assertFalse(result.isPresent(), "Empty list should return empty Optional"); - } - - @Test - void testDefaultEstimateWithNumRowsPreservesOtherStats() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); - stat.setNumTrues(50); - stat.setNumFalses(40); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "NDV should be capped"); - assertEquals(10, result.get().getNumNulls(), "numNulls should be preserved"); - assertEquals(4.0, result.get().getAvgColLen(), "avgColLen should be preserved"); - } - - @Test - void testStatEstimatorProviderDefaultReturnsWorkingEstimator() { - StatEstimatorProvider provider = new StatEstimatorProvider() {}; - StatEstimator estimator = provider.getStatEstimator(); - - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - Optional result = estimator.estimate(Arrays.asList(stat)); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint()); - } - - @Test - void testStatEstimatorProviderDefaultCapsNdv() { - StatEstimatorProvider provider = new StatEstimatorProvider() {}; - StatEstimator estimator = provider.getStatEstimator(); - - ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "Default provider estimator should cap NDV"); - } - - private ColStatistics 
createStat(String name, String type, long ndv, long numNulls, double avgColLen) { - ColStatistics stat = new ColStatistics(name, type); - stat.setCountDistint(ndv); - stat.setNumNulls(numNulls); - stat.setAvgColLen(avgColLen); - return stat; - } -} diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q index 11986d696aff..2ce2ced59e04 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q @@ -1,12 +1,10 @@ --! qt:dataset:srcpart --- this ensures consistent split file counts between localhost & CI runs -set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; set hive.merge.smallfiles.avgsize=200; set mapred.input.dir.recursive=true; -set hive.merge.mapfiles=false; +set hive.merge.mapfiles=false; set hive.merge.mapredfiles=false; -- list bucketing DML: dynamic partition. multiple skewed columns. merge. 
@@ -45,13 +43,13 @@ set hive.merge.mapredfiles=false; -- 87 000000_0 -- 87 000001_0 -- with merge --- 118 000002_0 +-- 118 000002_0 -- SORT_QUERY_RESULTS -- create a skewed table -create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE; @@ -94,3 +92,6 @@ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; +-- clean up +drop table list_bucketing_dynamic_part_n3; + diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q index a4a21aaa1ceb..f80585e56c6f 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q @@ -1,6 +1,4 @@ --! qt:dataset:srcpart --- this ensures consistent split file counts between localhost & CI runs -set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; @@ -41,10 +39,10 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); -set hive.merge.mapfiles=true; +set hive.merge.mapfiles=true; set hive.merge.mapredfiles=true; -- list bucketing DML with merge. 
use bucketize to generate a few small files. explain extended @@ -56,7 +54,7 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); select count(1) from srcpart where ds = '2008-04-08'; @@ -67,3 +65,6 @@ explain extended select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; + +-- clean up +drop table list_bucketing_dynamic_part; diff --git a/ql/src/test/queries/clientpositive/ndv_case_const.q b/ql/src/test/queries/clientpositive/ndv_case_const.q index 42162d25c529..7132e163aa13 100644 --- a/ql/src/test/queries/clientpositive/ndv_case_const.q +++ b/ql/src/test/queries/clientpositive/ndv_case_const.q @@ -1,3 +1,6 @@ +-- Tests for CASE expression NDV estimation in Group By Operator. +-- Verifies that "Statistics: Num rows" reflects accurate NDV computation +-- when CASE branches contain constants, NULLs, and column references. 
CREATE TABLE t (cond INT, c2 STRING, c100 STRING); ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000'); ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0'); @@ -25,3 +28,13 @@ EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELS EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x; EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x; + +-- Test NDV cap: sum of branch NDVs (100+100+100+1=301) exceeds numRows (200) +CREATE TABLE t_small (cond INT, c100a STRING, c100b STRING, c100c STRING); +ALTER TABLE t_small UPDATE STATISTICS SET('numRows'='200','rawDataSize'='20000'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100a SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100b SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100c SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c100a WHEN cond=2 THEN c100b WHEN cond=3 THEN c100c ELSE 'A' END x FROM t_small) sub GROUP BY x; diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out index d9d5d18340eb..4f4a0b3df537 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out @@ -1,17 +1,13 @@ -hive.merge.mapfiles=true -hive.merge.mapredfiles=false -hive.merge.tezfiles=false -hive.merge.smallfiles.avgsize=16000000 -PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds 
String, hr String) +PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@list_bucketing_dynamic_part_n3 -POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE @@ -902,3 +898,13 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 +PREHOOK: query: drop table list_bucketing_dynamic_part_n3 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@list_bucketing_dynamic_part_n3 +PREHOOK: Output: database:default +PREHOOK: Output: default@list_bucketing_dynamic_part_n3 +POSTHOOK: query: drop table list_bucketing_dynamic_part_n3 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@list_bucketing_dynamic_part_n3 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@list_bucketing_dynamic_part_n3 diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out index e1dbd260d038..6e45676ba107 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out @@ -898,3 +898,13 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 +PREHOOK: query: drop table list_bucketing_dynamic_part +PREHOOK: type: DROPTABLE +PREHOOK: Input: 
default@list_bucketing_dynamic_part +PREHOOK: Output: database:default +PREHOOK: Output: default@list_bucketing_dynamic_part +POSTHOOK: query: drop table list_bucketing_dynamic_part +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@list_bucketing_dynamic_part +POSTHOOK: Output: database:default +POSTHOOK: Output: default@list_bucketing_dynamic_part diff --git a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out index 1d3c298a25c5..6539159d2eaa 100644 --- a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out +++ b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out @@ -397,13 +397,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 @@ -413,10 +413,10 @@ STAGE PLANS: keys: KEY._col0 (type: string) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -753,3 +753,116 @@ STAGE PLANS: Processor Tree: 
ListSink +PREHOOK: query: CREATE TABLE t_small (cond INT, c100a STRING, c100b STRING, c100c STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_small +POSTHOOK: query: CREATE TABLE t_small (cond INT, c100a STRING, c100b STRING, c100c STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS SET('numRows'='200','rawDataSize'='20000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS SET('numRows'='200','rawDataSize'='20000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100a SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100a SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100b SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: 
default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100b SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100c SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100c SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c100a WHEN cond=2 THEN c100b WHEN cond=3 THEN c100c ELSE 'A' END x FROM t_small) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t_small +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c100a WHEN cond=2 THEN c100b WHEN cond=3 THEN c100c ELSE 'A' END x FROM t_small) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_small +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t_small + Statistics: Num rows: 200 Data size: 54200 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (c100a) WHEN ((cond = 2)) THEN (c100b) WHEN ((cond = 3)) THEN (c100c) ELSE ('A') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 200 Data size: 54200 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + 
keys: _col0 (type: string) + minReductionHashAggr: 0.4 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + From 4170c17b1951f5ff785ecf76536a19d863eb9395 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Wed, 25 Mar 2026 11:41:30 -0700 Subject: [PATCH 09/13] HIVE-29368: trigger a rebuild From dced9f51079c74ad0bb5e56266baa9d6ea1233c9 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Wed, 25 Mar 2026 17:57:33 -0700 Subject: [PATCH 10/13] HIVE-29368: buildColStatForConstant() can be simplified after removing NDV logic --- .../apache/hadoop/hive/ql/stats/StatsUtils.java | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 830c4b6c8cec..d75d87b81a4e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ 
b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1628,19 +1628,11 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis } private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) { - long numNulls = 0; - long countDistincts = 1; - if (encd.getValue() == null) { - numNulls = numRows; - } - String colType = encd.getTypeString(); - colType = colType.toLowerCase(); - ObjectInspector oi = encd.getWritableObjectInspector(); - double avgColSize = getAvgColLenOf(conf, oi, colType); + String colType = encd.getTypeString().toLowerCase(); ColStatistics colStats = new ColStatistics(encd.getName(), colType); - colStats.setAvgColLen(avgColSize); - colStats.setCountDistint(countDistincts); - colStats.setNumNulls(numNulls); + colStats.setAvgColLen(getAvgColLenOf(conf, encd.getWritableObjectInspector(), colType)); + colStats.setCountDistint(1); + colStats.setNumNulls(encd.getValue() == null ? numRows : 0); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); From 3690d4b61f7694743d578c5bf02238e514ef9a0e Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 27 Mar 2026 15:42:22 -0700 Subject: [PATCH 11/13] HIVE-29368: refactored StatEstimator to be aware of parent stats, and PessimisticStatCombiner to use numRows to identify "const NULL" ColStatistics instances --- .../hadoop/hive/ql/stats/StatsUtils.java | 8 +- .../estimator/PessimisticStatCombiner.java | 13 ++- .../ql/stats/estimator/StatEstimator.java | 17 +++- .../ql/udf/generic/GenericUDFCoalesce.java | 5 +- .../hive/ql/udf/generic/GenericUDFIf.java | 5 +- .../hive/ql/udf/generic/GenericUDFWhen.java | 5 +- .../hadoop/hive/ql/stats/TestStatsUtils.java | 2 +- .../TestPessimisticStatCombiner.java | 79 ++++++++++++++++--- .../clientpositive/llap/ndv_case_const.q.out | 8 +- 9 files changed, 111 insertions(+), 31 deletions(-) diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index d75d87b81a4e..9fd7593e2ce3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1578,7 +1578,7 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis csList.add(cs); } if (csList.size() == engfd.getChildren().size()) { - Optional res = se.estimate(csList); + Optional res = se.estimate(csList, parentStats); if (res.isPresent()) { ColStatistics newStats = res.get(); // NDV cannot exceed numRows @@ -1631,7 +1631,7 @@ private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows String colType = encd.getTypeString().toLowerCase(); ColStatistics colStats = new ColStatistics(encd.getName(), colType); colStats.setAvgColLen(getAvgColLenOf(conf, encd.getWritableObjectInspector(), colType)); - colStats.setCountDistint(1); + colStats.setCountDistint(encd.getValue() == null ? 0 : 1); colStats.setNumNulls(encd.getValue() == null ? numRows : 0); Optional value = getConstValue(encd); @@ -2099,9 +2099,7 @@ private static List extractNDVGroupingColumns(List colStats for (ColStatistics cs : colStats) { if (cs != null) { long ndv = cs.getCountDistint(); - // +1 for NULL group: source columns with partial nulls and known NDV only. - // Computed expressions include NULL. Ordered: numNulls>0 first (often false). 
- if (!cs.isEstimated() && cs.getNumNulls() > 0 && ndv > 0 && cs.getNumNulls() < parentStats.getNumRows()) { + if (cs.getNumNulls() > 0) { ndv = StatsUtils.safeAdd(ndv, 1); } ndvValues.add(ndv); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index f84484c456be..48bb90820439 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -28,10 +28,19 @@ */ public class PessimisticStatCombiner { + private final long numRows; private boolean inited; + private boolean hasUnknownNDV; private ColStatistics result; + public PessimisticStatCombiner(long numRows) { + this.numRows = numRows; + } + public void add(ColStatistics stat) { + // NDV==0 means unknown, unless it's a NULL constant (numNulls == numRows) + hasUnknownNDV = hasUnknownNDV || (stat.getCountDistint() == 0 && stat.getNumNulls() != numRows); + if (!inited) { inited = true; result = stat.clone(); @@ -42,9 +51,7 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - // If any branch has NDV=0 (unknown stats), propagate unknown to result. - // Summing would treat unknown as zero, causing cardinality underestimates. 
- if (result.getCountDistint() == 0 || stat.getCountDistint() == 0) { + if (hasUnknownNDV) { result.setCountDistint(0); } else { result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java index 94aaa32ecfcb..80846fa24d30 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java @@ -22,6 +22,7 @@ import java.util.Optional; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; /** * Enables statistics related computation on UDFs @@ -39,5 +40,19 @@ public interface StatEstimator { * @param argStats the statistics for every argument of the UDF * @return {@link ColStatistics} estimate for the actual UDF. */ - public Optional estimate(List argStats); + default Optional estimate(List argStats) { + throw new UnsupportedOperationException("This estimator requires parentStats"); + } + + /** + * Computes the output statistics with access to parent statistics. + * Override this method when the estimator uses more info for accurate estimation. + * + * @param argStats the statistics for every argument of the UDF + * @param parentStats statistics from the parent operator + * @return {@link ColStatistics} estimate for the actual UDF. 
+ */ + default Optional estimate(List argStats, Statistics parentStats) { + return estimate(argStats); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java index bbca9242ecaa..1799669bda57 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressionsSupportDecimal64; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.ql.stats.estimator.PessimisticStatCombiner; @@ -89,8 +90,8 @@ public StatEstimator getStatEstimator() { static class CoalesceStatEstimator implements StatEstimator { @Override - public Optional estimate(List argStats) { - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + public Optional estimate(List argStats, Statistics parentStats) { + PessimisticStatCombiner combiner = new PessimisticStatCombiner(parentStats.getNumRows()); for (int i = 0; i < argStats.size(); i++) { combiner.add(argStats.get(i)); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java index eaa352317267..74bd2459debf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressionsSupportDecimal64; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import 
org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.ql.stats.estimator.PessimisticStatCombiner; @@ -187,8 +188,8 @@ public StatEstimator getStatEstimator() { static class IfStatEstimator implements StatEstimator { @Override - public Optional estimate(List argStats) { - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + public Optional estimate(List argStats, Statistics parentStats) { + PessimisticStatCombiner combiner = new PessimisticStatCombiner(parentStats.getNumRows()); combiner.add(argStats.get(1)); combiner.add(argStats.get(2)); return combiner.getResult(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java index e6d3580692d3..5dab62ab959e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java @@ -25,6 +25,7 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.estimator.PessimisticStatCombiner; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; @@ -143,8 +144,8 @@ public StatEstimator getStatEstimator() { static class WhenStatEstimator implements StatEstimator { @Override - public Optional estimate(List argStats) { - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + public Optional estimate(List argStats, Statistics parentStats) { + PessimisticStatCombiner combiner = new PessimisticStatCombiner(parentStats.getNumRows()); for (int i = 1; i < argStats.size(); i += 2) { 
combiner.add(argStats.get(i)); } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index bc54e749834d..2b8cb07b6822 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -668,7 +668,7 @@ void testComputeNDVGroupingColumnsMixedEstimatedAndSource() { // Test for NDV cap after StatEstimator (NDV cannot exceed numRows) @Test - void testGetColStatisticsFromExpressionNdvCappedAtNumRows() throws Exception { + void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { HiveConf conf = new HiveConf(); conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_ESTIMATORS_ENABLE, true); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java index 9840cfeaf269..281e7b82c27a 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java @@ -30,7 +30,7 @@ void testNdvSumWhenBothKnown() { ColStatistics stat1 = createStat("col1", "int", 50, 0, 4.0); ColStatistics stat2 = createStat("col2", "int", 30, 0, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -43,7 +43,7 @@ void testNdvUnknownPropagatedFromFirst() { ColStatistics stat1 = createStat("col1", "int", 0, 0, 4.0); ColStatistics stat2 = createStat("col2", "int", 100, 0, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -56,7 +56,7 @@ void testNdvUnknownPropagatedFromSecond() { ColStatistics stat1 = createStat("col1", 
"int", 100, 0, 4.0); ColStatistics stat2 = createStat("col2", "int", 0, 0, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -69,7 +69,7 @@ void testCombinePropagatesUnknownNumNullsFromFirst() { ColStatistics stat1 = createStat("col1", "int", 50, -1, 4.0); // unknown numNulls ColStatistics stat2 = createStat("col2", "int", 30, 100, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -82,7 +82,7 @@ void testCombinePropagatesUnknownNumNullsFromSecond() { ColStatistics stat1 = createStat("col1", "int", 50, 100, 4.0); ColStatistics stat2 = createStat("col2", "int", 30, -1, 4.0); // unknown numNulls - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -100,7 +100,7 @@ void testCombinePropagatesUnknownNumTruesFromFirst() { stat2.setNumTrues(100); stat2.setNumFalses(150); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -118,7 +118,7 @@ void testCombinePropagatesUnknownNumTruesFromSecond() { stat2.setNumTrues(-1); // unknown stat2.setNumFalses(150); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -136,7 +136,7 @@ void testCombinePropagatesUnknownNumFalsesFromFirst() { stat2.setNumTrues(50); stat2.setNumFalses(150); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ 
-154,7 +154,7 @@ void testCombinePropagatesUnknownNumFalsesFromSecond() { stat2.setNumTrues(50); stat2.setNumFalses(-1); // unknown - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -167,7 +167,7 @@ void testCombineBothUnknownNumNulls() { ColStatistics stat1 = createStat("col1", "int", 50, -1, 4.0); ColStatistics stat2 = createStat("col2", "int", 30, -1, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -185,7 +185,7 @@ void testCombineBothUnknownNumTruesAndNumFalses() { stat2.setNumTrues(-1); stat2.setNumFalses(-1); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -194,6 +194,63 @@ void testCombineBothUnknownNumTruesAndNumFalses() { assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)"); } + @Test + void testNullConstantDoesNotContributeToNdv() { + long numRows = 100; + ColStatistics nullConstant = createStat("null", "int", 0, numRows, 0.0); + ColStatistics regularStat = createStat("col", "int", 50, 10, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(nullConstant); + combiner.add(regularStat); + + ColStatistics result = combiner.getResult().get(); + assertEquals(50, result.getCountDistint(), "NULL constant should not contribute to NDV"); + } + + @Test + void testNullConstantAsSecondDoesNotContributeToNdv() { + long numRows = 100; + ColStatistics regularStat = createStat("col", "int", 50, 10, 4.0); + ColStatistics nullConstant = createStat("null", "int", 0, numRows, 0.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(regularStat); + 
combiner.add(nullConstant); + + ColStatistics result = combiner.getResult().get(); + assertEquals(50, result.getCountDistint(), "NULL constant should not contribute to NDV"); + } + + @Test + void testMultipleNullConstantsResultInZeroNdv() { + long numRows = 100; + ColStatistics nullConstant1 = createStat("null1", "int", 0, numRows, 0.0); + ColStatistics nullConstant2 = createStat("null2", "int", 0, numRows, 0.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(nullConstant1); + combiner.add(nullConstant2); + + ColStatistics result = combiner.getResult().get(); + assertEquals(0, result.getCountDistint(), "Multiple NULL constants should result in NDV=0"); + assertEquals(numRows, result.getNumNulls(), "numNulls should be numRows"); + } + + @Test + void testUnknownNdvNotConfusedWithNullConstant() { + long numRows = 100; + ColStatistics unknownNdv = createStat("col", "int", 0, 10, 4.0); + ColStatistics regularStat = createStat("col2", "int", 50, 5, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(unknownNdv); + combiner.add(regularStat); + + ColStatistics result = combiner.getResult().get(); + assertEquals(0, result.getCountDistint(), "Unknown NDV should propagate as 0"); + } + private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { ColStatistics stat = new ColStatistics(name, type); stat.setCountDistint(ndv); diff --git a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out index 6539159d2eaa..a25b1e35b17d 100644 --- a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out +++ b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out @@ -397,13 +397,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 
85 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 @@ -413,10 +413,10 @@ STAGE PLANS: keys: KEY._col0 (type: string) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat From cca7fc6d4aa0cff5a9499c18090a59130177c11d Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 27 Mar 2026 15:47:46 -0700 Subject: [PATCH 12/13] HIVE-29368: fully reverted buildColStatForConstant to reduce the total diff --- .../hadoop/hive/ql/stats/StatsUtils.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 9fd7593e2ce3..1052e067966e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1628,11 +1628,23 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis } private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) { - String colType = encd.getTypeString().toLowerCase(); + + 
long numNulls = 0; + long countDistincts = 0; + if (encd.getValue() == null) { + // null projection + numNulls = numRows; + } else { + countDistincts = 1; + } + String colType = encd.getTypeString(); + colType = colType.toLowerCase(); + ObjectInspector oi = encd.getWritableObjectInspector(); + double avgColSize = getAvgColLenOf(conf, oi, colType); ColStatistics colStats = new ColStatistics(encd.getName(), colType); - colStats.setAvgColLen(getAvgColLenOf(conf, encd.getWritableObjectInspector(), colType)); - colStats.setCountDistint(encd.getValue() == null ? 0 : 1); - colStats.setNumNulls(encd.getValue() == null ? numRows : 0); + colStats.setAvgColLen(avgColSize); + colStats.setCountDistint(countDistincts); + colStats.setNumNulls(numNulls); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); From 8c2fd961d9ea31153f1debe7da3dceb8b56a6950 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Mon, 30 Mar 2026 10:53:15 -0700 Subject: [PATCH 13/13] HIVE-29368: tweaking extractNDVGroupingColumns conditions, test code fixes, new .out files --- .../hadoop/hive/ql/stats/StatsUtils.java | 5 +- .../hadoop/hive/ql/stats/TestStatsUtils.java | 147 ++++++++++-------- .../ql/stats/estimator/TestStatEstimator.java | 50 ++++++ .../llap/parquet_vectorization_13.q.out | 4 +- .../llap/parquet_vectorization_14.q.out | 2 +- .../llap/parquet_vectorization_15.q.out | 12 +- .../llap/parquet_vectorization_16.q.out | 2 +- .../llap/parquet_vectorization_9.q.out | 2 +- .../llap/vectorization_13.q.out | 4 +- .../llap/vectorization_14.q.out | 2 +- .../llap/vectorization_15.q.out | 12 +- .../llap/vectorization_16.q.out | 2 +- .../clientpositive/llap/vectorization_9.q.out | 2 +- .../llap/vectorization_short_regress.q.out | 18 +-- .../llap/vectorized_stats.q.out | 8 +- 15 files changed, 172 insertions(+), 100 deletions(-) create mode 100644 ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 1052e067966e..dbb5565ff239 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -2111,7 +2111,10 @@ private static List extractNDVGroupingColumns(List colStats for (ColStatistics cs : colStats) { if (cs != null) { long ndv = cs.getCountDistint(); - if (cs.getNumNulls() > 0) { + // NDV needs to be adjusted if a column has a known NDV along with NULL values + // or if a column happens to be "const NULL" + if ((ndv > 0 && cs.getNumNulls() > 0) || + (ndv == 0 && !cs.isEstimated() && cs.getNumNulls() == parentStats.getNumRows())) { ndv = StatsUtils.safeAdd(ndv, 1); } ndvValues.add(ndv); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index 2b8cb07b6822..48ee3b99cc35 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.stats; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -45,7 +44,9 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFCoalesce; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIf; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFWhen; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde.serdeConstants; import 
org.junit.jupiter.api.Test; @@ -506,51 +507,6 @@ void testScaleColStatisticsPreservesUnknownNumFalses() { assertEquals(-1, colStats.get(0).getNumFalses(), "Unknown numFalses (-1) should be preserved after scaling"); } - // Tests for buildColStatForConstant (via getColStatisticsFromExpression) - - @Test - void testGetColStatisticsFromExpressionNullConstant() { - HiveConf conf = new HiveConf(); - Statistics parentStats = new Statistics(1000, 8000, 0, 0); - - ExprNodeConstantDesc nullConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, null); - ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, nullConst); - - assertNotNull(cs); - assertEquals(1, cs.getCountDistint(), "NULL constant should have NDV=1"); - assertEquals(1000, cs.getNumNulls(), "NULL constant should have numNulls=numRows"); - assertFalse(cs.isEstimated(), "Constant stats should not be marked as estimated"); - } - - @Test - void testGetColStatisticsFromExpressionNonNullConstant() { - HiveConf conf = new HiveConf(); - Statistics parentStats = new Statistics(1000, 8000, 0, 0); - - ExprNodeConstantDesc strConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "hello"); - ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, strConst); - - assertNotNull(cs); - assertEquals(1, cs.getCountDistint(), "Non-NULL constant should have NDV=1"); - assertEquals(0, cs.getNumNulls(), "Non-NULL constant should have numNulls=0"); - } - - @Test - void testGetColStatisticsFromExpressionIntConstant() { - HiveConf conf = new HiveConf(); - Statistics parentStats = new Statistics(500, 4000, 0, 0); - - ExprNodeConstantDesc intConst = new ExprNodeConstantDesc(TypeInfoFactory.intTypeInfo, 42); - ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, intConst); - - assertNotNull(cs); - assertEquals(1, cs.getCountDistint(), "Integer constant should have NDV=1"); - assertEquals(0, cs.getNumNulls(), "Integer constant should have 
numNulls=0"); - assertNotNull(cs.getRange(), "Integer constant should have a range"); - assertEquals(42, cs.getRange().minValue.intValue()); - assertEquals(42, cs.getRange().maxValue.intValue()); - } - // Tests for computeNDVGroupingColumns / extractNDVGroupingColumns @Test @@ -592,7 +548,7 @@ void testComputeNDVGroupingColumnsEstimatedExpression() { cs.setIsEstimated(true); // computed expression (e.g., CASE) long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); - assertEquals(3, ndv, "Estimated expression should NOT get +1 (already accounts for NULL)"); + assertEquals(4, ndv, "NDV with nulls: 3 + 1 = 4"); } @Test @@ -601,12 +557,26 @@ void testComputeNDVGroupingColumnsAllNullColumn() { parentStats.setColumnStatsState(Statistics.State.COMPLETE); ColStatistics cs = new ColStatistics("col1", "string"); - cs.setCountDistint(1); + cs.setCountDistint(0); cs.setNumNulls(1000); // all rows are NULL cs.setIsEstimated(false); long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); - assertEquals(1, ndv, "All-NULL column should NOT get +1 (numNulls == numRows)"); + assertEquals(1, ndv, "All-NULL column: NDV=0 but numNulls==numRows, so NDV becomes 1"); + } + + @Test + void testComputeNDVGroupingColumnsAllNullEstimatedColumn() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("case_expr", "string"); + cs.setCountDistint(0); + cs.setNumNulls(1000); // all rows are NULL + cs.setIsEstimated(true); // from expression like CASE + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(0, ndv, "Estimated all-NULL column: NDV stays 0 (unknown from combiner)"); } @Test @@ -660,22 +630,15 @@ void testComputeNDVGroupingColumnsMixedEstimatedAndSource() { caseExpr.setIsEstimated(true); // estimated: no +1 long ndv = 
StatsUtils.computeNDVGroupingColumns(Arrays.asList(sourceCol, caseExpr), parentStats, false); - // sourceCol: 10 + 1 = 11, caseExpr: 3 (no +1) - // Product: 11 * 3 = 33 - assertEquals(33, ndv, "Mixed columns: source (10+1) * estimated (3) = 33"); + // sourceCol: 10 + 1 = 11, caseExpr: 3 + 1 = 4 + // Product: 11 * 4 = 44 + assertEquals(44, ndv, "Mixed columns: (10+1) * (3+1) = 44"); } - // Test for NDV cap after StatEstimator (NDV cannot exceed numRows) - @Test void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { - HiveConf conf = new HiveConf(); - conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_ESTIMATORS_ENABLE, true); - - // Create parent stats with only 100 rows Statistics parentStats = new Statistics(100, 800, 0, 0); - // Create column stats for col1 and col2 with high NDV (each 80) ColStatistics col1Stats = new ColStatistics("col1", "string"); col1Stats.setCountDistint(80); col1Stats.setNumNulls(0); @@ -688,9 +651,6 @@ void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); - // Create IF(true, col1, col2) expression - // IF uses PessimisticStatCombiner which sums NDVs: 80 + 80 = 160 - // But numRows is only 100, so NDV should be capped at 100 GenericUDFIf udfIf = new GenericUDFIf(); ExprNodeConstantDesc condExpr = new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, true); ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); @@ -700,11 +660,70 @@ void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { TypeInfoFactory.stringTypeInfo, udfIf, "if", Arrays.asList(condExpr, col1Expr, col2Expr)); - ColStatistics result = StatsUtils.getColStatisticsFromExpression(conf, parentStats, ifExpr); + ColStatistics result = StatsUtils.getColStatisticsFromExpression(new HiveConf(), parentStats, ifExpr); assertNotNull(result); - // PessimisticStatCombiner would produce 80 + 80 = 160, but cap ensures NDV <= numRows (100) assertEquals(100, 
result.getCountDistint(), "NDV should be capped at numRows (100), not 160"); } + @Test + void testGetColStatisticsFromExpressionWhenNdvCapped() { + Statistics parentStats = new Statistics(100, 800, 0, 0); + + ColStatistics col1Stats = new ColStatistics("col1", "string"); + col1Stats.setCountDistint(60); + col1Stats.setNumNulls(0); + col1Stats.setAvgColLen(10); + + ColStatistics col2Stats = new ColStatistics("col2", "string"); + col2Stats.setCountDistint(70); + col2Stats.setNumNulls(0); + col2Stats.setAvgColLen(10); + + parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); + + GenericUDFWhen udfWhen = new GenericUDFWhen(); + ExprNodeConstantDesc condExpr = new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, true); + ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); + ExprNodeColumnDesc col2Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col2", "t", false); + + ExprNodeGenericFuncDesc whenExpr = new ExprNodeGenericFuncDesc( + TypeInfoFactory.stringTypeInfo, udfWhen, "when", + Arrays.asList(condExpr, col1Expr, col2Expr)); + + ColStatistics result = StatsUtils.getColStatisticsFromExpression(new HiveConf(), parentStats, whenExpr); + + assertNotNull(result); + assertEquals(100, result.getCountDistint(), "NDV should be capped at numRows (100), not 130"); + } + + @Test + void testGetColStatisticsFromExpressionCoalesceNdvCapped() { + Statistics parentStats = new Statistics(100, 800, 0, 0); + + ColStatistics col1Stats = new ColStatistics("col1", "string"); + col1Stats.setCountDistint(50); + col1Stats.setNumNulls(20); + col1Stats.setAvgColLen(10); + + ColStatistics col2Stats = new ColStatistics("col2", "string"); + col2Stats.setCountDistint(80); + col2Stats.setNumNulls(10); + col2Stats.setAvgColLen(10); + + parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); + + GenericUDFCoalesce udfCoalesce = new GenericUDFCoalesce(); + ExprNodeColumnDesc col1Expr = new 
ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); + ExprNodeColumnDesc col2Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col2", "t", false); + + ExprNodeGenericFuncDesc coalesceExpr = new ExprNodeGenericFuncDesc( + TypeInfoFactory.stringTypeInfo, udfCoalesce, "coalesce", + Arrays.asList(col1Expr, col2Expr)); + + ColStatistics result = StatsUtils.getColStatisticsFromExpression(new HiveConf(), parentStats, coalesceExpr); + + assertNotNull(result); + assertEquals(100, result.getCountDistint(), "NDV should be capped at numRows (100), not 130"); + } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java new file mode 100644 index 000000000000..7fd715f4a98d --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.stats.estimator; + +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.junit.jupiter.api.Test; + +class TestStatEstimator { + + @Test + void testDefaultEstimateThrowsUnsupportedOperation() { + StatEstimator estimator = new StatEstimator() {}; + List argStats = Arrays.asList(new ColStatistics("col", "int")); + + assertThrows(UnsupportedOperationException.class, () -> estimator.estimate(argStats), + "Default estimate(argStats) should throw UnsupportedOperationException"); + } + + @Test + void testDefaultEstimateWithParentStatsCallsEstimate() { + StatEstimator estimator = new StatEstimator() {}; + List argStats = Arrays.asList(new ColStatistics("col", "int")); + Statistics parentStats = new Statistics(100, 800, 0, 0); + + assertThrows(UnsupportedOperationException.class, () -> estimator.estimate(argStats, parentStats), + "Default estimate(argStats, parentStats) should delegate to estimate(argStats) which throws"); + } +} diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out index 7125704c33d2..2a223991fe36 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out @@ -130,7 +130,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: 
COMPLETE Column stats: COMPLETE @@ -487,7 +487,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out index 5acc12c3b71d..12c88df0aebf 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out @@ -120,7 +120,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7] keys: _col2 (type: string), _col1 (type: float), _col4 (type: double), _col0 (type: timestamp), _col3 (type: boolean) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12 Statistics: Num rows: 758 Data size: 130530 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out index d7d3f4919183..3653c9466248 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out @@ -116,7 +116,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 
(type: int), _col6 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE @@ -154,16 +154,16 @@ STAGE PLANS: keys: KEY._col0 (type: float), KEY._col1 (type: boolean), KEY._col2 (type: double), KEY._col3 (type: string), KEY._col4 (type: tinyint), KEY._col5 (type: int), KEY._col6 (type: timestamp) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 - Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 1208432 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp), power(((_col7 - ((_col8 * _col8) / _col9)) / if((_col9 = 1L), null, (_col9 - 1))), 0.5) (type: double), (-26.28 - CAST( _col5 AS decimal(10,0))) (type: decimal(13,2)), _col10 (type: double), (_col2 * 79.553D) (type: double), (33.0 % _col0) (type: float), power(((_col11 - ((_col12 * _col12) / _col13)) / if((_col13 = 1L), null, (_col13 - 1))), 0.5) (type: double), ((_col11 - ((_col12 * _col12) / _col13)) / _col13) (type: double), (-23.0D % _col2) (type: double), (- _col4) (type: tinyint), ((_col14 - ((_col15 * _col15) / _col16)) / if((_col16 = 1L), null, (_col16 - 1))) (type: double), (UDFToFloat(_col5) - _col0) (type: float), (-23 % UDFToInteger(_col4)) (type: int), (- (-26.28 - CAST( _col5 AS decimal(10,0)))) (type: decimal(13,2)), power(((_col14 - ((_col15 * _col15) / _col16)) / _col16), 0.5) (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, 
_col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp) null sort order: zzzzzzz sort order: +++++++ - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col7 (type: double), _col8 (type: decimal(13,2)), _col9 (type: double), _col10 (type: double), _col11 (type: float), _col12 (type: double), _col13 (type: double), _col14 (type: double), _col15 (type: tinyint), _col16 (type: double), _col17 (type: float), _col18 (type: int), _col19 (type: decimal(13,2)), _col20 (type: double) Reducer 3 Execution mode: llap @@ -175,10 +175,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: float), KEY.reducesinkkey1 (type: boolean), KEY.reducesinkkey2 (type: double), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: tinyint), KEY.reducesinkkey5 (type: int), KEY.reducesinkkey6 (type: timestamp), VALUE._col0 (type: double), VALUE._col1 (type: decimal(13,2)), VALUE._col2 (type: double), VALUE._col3 (type: double), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: double), VALUE._col7 (type: double), VALUE._col8 (type: tinyint), VALUE._col9 (type: double), VALUE._col10 (type: float), VALUE._col11 (type: int), VALUE._col12 (type: decimal(13,2)), VALUE._col13 (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 
6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out index eeab9c89af72..a457b27af643 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out @@ -93,7 +93,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out index eeab9c89af72..a457b27af643 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out @@ -93,7 +93,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE 
Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_13.q.out b/ql/src/test/results/clientpositive/llap/vectorization_13.q.out index d1911fdb7f8b..0e96fffb2c09 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_13.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_13.q.out @@ -131,7 +131,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: COMPLETE Column stats: COMPLETE @@ -511,7 +511,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_14.q.out b/ql/src/test/results/clientpositive/llap/vectorization_14.q.out index 25bfeb19bfcf..62feb4c66cff 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_14.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_14.q.out @@ -121,7 +121,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7] keys: _col2 (type: string), _col1 (type: float), _col4 (type: double), _col0 (type: timestamp), _col3 (type: boolean) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: 
hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12 Statistics: Num rows: 758 Data size: 130530 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_15.q.out b/ql/src/test/results/clientpositive/llap/vectorization_15.q.out index 6732aba7edd2..8c20b9f5fca7 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_15.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_15.q.out @@ -117,7 +117,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE @@ -163,16 +163,16 @@ STAGE PLANS: keys: KEY._col0 (type: float), KEY._col1 (type: boolean), KEY._col2 (type: double), KEY._col3 (type: string), KEY._col4 (type: tinyint), KEY._col5 (type: int), KEY._col6 (type: timestamp) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 - Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 1208432 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp), power(((_col7 - ((_col8 * _col8) / _col9)) / if((_col9 = 1L), null, (_col9 - 1))), 0.5) (type: double), (-26.28 - CAST( _col5 AS 
decimal(10,0))) (type: decimal(13,2)), _col10 (type: double), (_col2 * 79.553D) (type: double), (33.0 % _col0) (type: float), power(((_col11 - ((_col12 * _col12) / _col13)) / if((_col13 = 1L), null, (_col13 - 1))), 0.5) (type: double), ((_col11 - ((_col12 * _col12) / _col13)) / _col13) (type: double), (-23.0D % _col2) (type: double), (- _col4) (type: tinyint), ((_col14 - ((_col15 * _col15) / _col16)) / if((_col16 = 1L), null, (_col16 - 1))) (type: double), (UDFToFloat(_col5) - _col0) (type: float), (-23 % UDFToInteger(_col4)) (type: int), (- (-26.28 - CAST( _col5 AS decimal(10,0)))) (type: decimal(13,2)), power(((_col14 - ((_col15 * _col15) / _col16)) / _col16), 0.5) (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp) null sort order: zzzzzzz sort order: +++++++ - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col7 (type: double), _col8 (type: decimal(13,2)), _col9 (type: double), _col10 (type: double), _col11 (type: float), _col12 (type: double), _col13 (type: double), _col14 (type: double), _col15 (type: tinyint), _col16 (type: double), _col17 (type: float), _col18 (type: int), _col19 (type: decimal(13,2)), _col20 (type: double) Reducer 3 Execution mode: llap @@ -184,10 +184,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: float), KEY.reducesinkkey1 (type: boolean), 
KEY.reducesinkkey2 (type: double), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: tinyint), KEY.reducesinkkey5 (type: int), KEY.reducesinkkey6 (type: timestamp), VALUE._col0 (type: double), VALUE._col1 (type: decimal(13,2)), VALUE._col2 (type: double), VALUE._col3 (type: double), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: double), VALUE._col7 (type: double), VALUE._col8 (type: tinyint), VALUE._col9 (type: double), VALUE._col10 (type: float), VALUE._col11 (type: int), VALUE._col12 (type: decimal(13,2)), VALUE._col13 (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/vectorization_16.q.out b/ql/src/test/results/clientpositive/llap/vectorization_16.q.out index 7e8cb81144fc..7de5092fc76d 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_16.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_16.q.out @@ -94,7 +94,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: 
COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_9.q.out b/ql/src/test/results/clientpositive/llap/vectorization_9.q.out index 7e8cb81144fc..7de5092fc76d 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_9.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_9.q.out @@ -94,7 +94,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out index da82903d7963..8f7d63935f59 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out @@ -2949,10 +2949,10 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] keys: _col0 (type: timestamp), _col1 (type: string) - minReductionHashAggr: 0.5133463 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22 - Statistics: Num rows: 5980 Data size: 1579124 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 1622368 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: timestamp), _col1 (type: string) null sort order: zz @@ -2962,7 +2962,7 @@ STAGE PLANS: className: VectorReduceSinkMultiKeyOperator native: true nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 5980 Data size: 1579124 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 1622368 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: double), _col3 (type: double), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: bigint), _col8 (type: tinyint), _col9 (type: double), _col10 (type: double), _col11 (type: bigint), _col12 (type: double), _col13 (type: double), _col14 (type: bigint), _col15 (type: bigint), _col16 (type: bigint), _col17 (type: double), _col18 (type: bigint), _col19 (type: double), _col20 (type: double), _col21 (type: double), _col22 (type: bigint) Execution mode: vectorized, llap LLAP IO: all inputs @@ -2997,7 +2997,7 @@ STAGE PLANS: keys: KEY._col0 (type: timestamp), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22 - Statistics: Num rows: 5980 Data size: 1579124 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 1578826 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: timestamp), _col1 (type: string), power(((_col2 - ((_col3 * _col3) / _col4)) / _col4), 0.5) (type: double), (UDFToDouble(_col5) / _col6) (type: double), _col7 (type: bigint), _col8 (type: tinyint), ((_col9 - ((_col10 * _col10) / _col11)) / if((_col11 = 1L), null, (_col11 - 1))) (type: double), ((_col12 - ((_col13 * _col13) / _col14)) / _col14) (type: double), (UDFToDouble(_col15) / _col16) (type: double), ((_col12 - ((_col13 * _col13) / _col14)) / if((_col14 = 1L), null, (_col14 - 1))) (type: double), 
(_col17 / _col18) (type: double), _col19 (type: double), ((_col9 - ((_col10 * _col10) / _col11)) / _col11) (type: double), power(((_col20 - ((_col21 * _col21) / _col22)) / _col22), 0.5) (type: double), _col15 (type: bigint) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 @@ -3006,12 +3006,12 @@ STAGE PLANS: native: true projectedOutputColumnNums: [0, 1, 27, 29, 7, 8, 36, 40, 42, 49, 50, 19, 54, 59, 15] selectExpressions: FuncPowerDoubleToDouble(col 26:double)(children: DoubleColDivideLongColumn(col 25:double, col 4:bigint)(children: DoubleColSubtractDoubleColumn(col 2:double, col 24:double)(children: DoubleColDivideLongColumn(col 23:double, col 4:bigint)(children: DoubleColMultiplyDoubleColumn(col 3:double, col 3:double) -> 23:double) -> 24:double) -> 25:double) -> 26:double) -> 27:double, DoubleColDivideLongColumn(col 28:double, col 6:bigint)(children: CastLongToDouble(col 5:bigint) -> 28:double) -> 29:double, DoubleColDivideLongColumn(col 32:double, col 35:bigint)(children: DoubleColSubtractDoubleColumn(col 9:double, col 31:double)(children: DoubleColDivideLongColumn(col 30:double, col 11:bigint)(children: DoubleColMultiplyDoubleColumn(col 10:double, col 10:double) -> 30:double) -> 31:double) -> 32:double, IfExprNullCondExpr(col 33:boolean, null, col 34:bigint)(children: LongColEqualLongScalar(col 11:bigint, val 1) -> 33:boolean, LongColSubtractLongScalar(col 11:bigint, val 1) -> 34:bigint) -> 35:bigint) -> 36:double, DoubleColDivideLongColumn(col 39:double, col 14:bigint)(children: DoubleColSubtractDoubleColumn(col 12:double, col 38:double)(children: DoubleColDivideLongColumn(col 37:double, col 14:bigint)(children: DoubleColMultiplyDoubleColumn(col 13:double, col 13:double) -> 37:double) -> 38:double) -> 39:double) -> 40:double, DoubleColDivideLongColumn(col 41:double, col 16:bigint)(children: CastLongToDouble(col 15:bigint) -> 41:double) -> 42:double, DoubleColDivideLongColumn(col 
45:double, col 48:bigint)(children: DoubleColSubtractDoubleColumn(col 12:double, col 44:double)(children: DoubleColDivideLongColumn(col 43:double, col 14:bigint)(children: DoubleColMultiplyDoubleColumn(col 13:double, col 13:double) -> 43:double) -> 44:double) -> 45:double, IfExprNullCondExpr(col 46:boolean, null, col 47:bigint)(children: LongColEqualLongScalar(col 14:bigint, val 1) -> 46:boolean, LongColSubtractLongScalar(col 14:bigint, val 1) -> 47:bigint) -> 48:bigint) -> 49:double, DoubleColDivideLongColumn(col 17:double, col 18:bigint) -> 50:double, DoubleColDivideLongColumn(col 53:double, col 11:bigint)(children: DoubleColSubtractDoubleColumn(col 9:double, col 52:double)(children: DoubleColDivideLongColumn(col 51:double, col 11:bigint)(children: DoubleColMultiplyDoubleColumn(col 10:double, col 10:double) -> 51:double) -> 52:double) -> 53:double) -> 54:double, FuncPowerDoubleToDouble(col 58:double)(children: DoubleColDivideLongColumn(col 57:double, col 22:bigint)(children: DoubleColSubtractDoubleColumn(col 20:double, col 56:double)(children: DoubleColDivideLongColumn(col 55:double, col 22:bigint)(children: DoubleColMultiplyDoubleColumn(col 21:double, col 21:double) -> 55:double) -> 56:double) -> 57:double) -> 58:double) -> 59:double - Statistics: Num rows: 5980 Data size: 1196404 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 1196170 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: +++++++++++++++++++++++++++++++++++++++ keys: _col0 (type: timestamp), _col1 (type: string), _col2 (type: double), (_col2 * 10.175D) (type: double), (- _col2) (type: double), _col3 (type: double), (- _col2) (type: double), (-26.28D - _col2) (type: double), _col4 (type: bigint), (- _col4) (type: bigint), ((-26.28D - _col2) * (- _col2)) (type: double), _col5 (type: tinyint), (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4))) (type: double), (- (_col2 * 10.175D)) (type: double), _col6 (type: double), (_col6 + 
(((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) (type: double), _col2 (type: double), (UDFToDouble((- _col4)) / _col2) (type: double), _col7 (type: double), (10.175D / _col3) (type: double), _col8 (type: double), _col9 (type: double), ((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) - (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) (type: double), (_col2 * 10.175D) (type: double), _col10 (type: double), (((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) - (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) * 10.175D) (type: double), (10.175D % (10.175D / _col3)) (type: double), (- _col5) (type: tinyint), _col11 (type: double), _col12 (type: double), (- ((-26.28D - _col2) * (- _col2))) (type: double), ((- _col2) % _col10) (type: double), (-26.28 / CAST( (- _col5) AS decimal(3,0))) (type: decimal(8,6)), _col13 (type: double), _col14 (type: bigint), ((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) / _col7) (type: double), _col4 (type: bigint), _col4 (type: bigint), ((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) % -26.28D) (type: double) null sort order: zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz - Statistics: Num rows: 5980 Data size: 1196404 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 1196170 Basic stats: COMPLETE Column stats: COMPLETE top n: 50 Top N Key Vectorization: className: VectorTopNKeyOperator @@ -3025,7 +3025,7 @@ STAGE PLANS: native: true projectedOutputColumnNums: [0, 1, 27, 23, 24, 29, 25, 26, 7, 35, 31, 8, 30, 32, 36, 28, 27, 38, 40, 37, 42, 49, 41, 39, 50, 43, 45, 48, 19, 54, 44, 52, 145, 59, 15, 53, 7, 7, 55] selectExpressions: DoubleColMultiplyDoubleScalar(col 27:double, val 10.175) -> 23:double, DoubleColUnaryMinus(col 27:double) -> 24:double, DoubleColUnaryMinus(col 27:double) -> 25:double, DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 26:double, LongColUnaryMinus(col 7:bigint) 
-> 35:bigint, DoubleColMultiplyDoubleColumn(col 28:double, col 30:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 28:double, DoubleColUnaryMinus(col 27:double) -> 30:double) -> 31:double, DoubleColMultiplyDoubleColumn(col 32:double, col 28:double)(children: DoubleColMultiplyDoubleColumn(col 28:double, col 30:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 28:double, DoubleColUnaryMinus(col 27:double) -> 30:double) -> 32:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 28:double) -> 30:double, DoubleColUnaryMinus(col 28:double)(children: DoubleColMultiplyDoubleScalar(col 27:double, val 10.175) -> 28:double) -> 32:double, DoubleColAddDoubleColumn(col 36:double, col 37:double)(children: DoubleColMultiplyDoubleColumn(col 38:double, col 28:double)(children: DoubleColMultiplyDoubleColumn(col 28:double, col 37:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 28:double, DoubleColUnaryMinus(col 27:double) -> 37:double) -> 38:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 28:double) -> 37:double) -> 28:double, DoubleColDivideDoubleColumn(col 37:double, col 27:double)(children: CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 37:double) -> 38:double, DoubleScalarDivideDoubleColumn(val 10.175, col 29:double) -> 37:double, DoubleColSubtractDoubleColumn(col 39:double, col 43:double)(children: DoubleColAddDoubleColumn(col 36:double, col 41:double)(children: DoubleColMultiplyDoubleColumn(col 43:double, col 39:double)(children: DoubleColMultiplyDoubleColumn(col 39:double, col 41:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 39:double, DoubleColUnaryMinus(col 27:double) -> 41:double) -> 43:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 39:double) -> 41:double) 
-> 39:double, DoubleColMultiplyDoubleColumn(col 44:double, col 41:double)(children: DoubleColMultiplyDoubleColumn(col 41:double, col 43:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 41:double, DoubleColUnaryMinus(col 27:double) -> 43:double) -> 44:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 41:double) -> 43:double) -> 41:double, DoubleColMultiplyDoubleScalar(col 27:double, val 10.175) -> 39:double, DoubleColMultiplyDoubleScalar(col 44:double, val 10.175)(children: DoubleColSubtractDoubleColumn(col 43:double, col 45:double)(children: DoubleColAddDoubleColumn(col 36:double, col 44:double)(children: DoubleColMultiplyDoubleColumn(col 45:double, col 43:double)(children: DoubleColMultiplyDoubleColumn(col 43:double, col 44:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 43:double, DoubleColUnaryMinus(col 27:double) -> 44:double) -> 45:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 43:double) -> 44:double) -> 43:double, DoubleColMultiplyDoubleColumn(col 51:double, col 44:double)(children: DoubleColMultiplyDoubleColumn(col 44:double, col 45:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 44:double, DoubleColUnaryMinus(col 27:double) -> 45:double) -> 51:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 44:double) -> 45:double) -> 44:double) -> 43:double, DoubleScalarModuloDoubleColumn(val 10.175, col 44:double)(children: DoubleScalarDivideDoubleColumn(val 10.175, col 29:double) -> 44:double) -> 45:double, LongColUnaryMinus(col 8:tinyint) -> 48:tinyint, DoubleColUnaryMinus(col 52:double)(children: DoubleColMultiplyDoubleColumn(col 44:double, col 51:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 44:double, DoubleColUnaryMinus(col 27:double) -> 51:double) -> 52:double) -> 44:double, 
DoubleColModuloDoubleColumn(col 51:double, col 50:double)(children: DoubleColUnaryMinus(col 27:double) -> 51:double) -> 52:double, DecimalScalarDivideDecimalColumn(val -26.28, col 127:decimal(3,0))(children: CastLongToDecimal(col 71:tinyint)(children: LongColUnaryMinus(col 8:tinyint) -> 71:tinyint) -> 127:decimal(3,0)) -> 145:decimal(8,6), DoubleColDivideDoubleColumn(col 51:double, col 40:double)(children: DoubleColAddDoubleColumn(col 36:double, col 53:double)(children: DoubleColMultiplyDoubleColumn(col 55:double, col 51:double)(children: DoubleColMultiplyDoubleColumn(col 51:double, col 53:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 51:double, DoubleColUnaryMinus(col 27:double) -> 53:double) -> 55:double, CastLongToDouble(col 71:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 71:bigint) -> 51:double) -> 53:double) -> 51:double) -> 53:double, DoubleColModuloDoubleScalar(col 51:double, val -26.28)(children: DoubleColAddDoubleColumn(col 36:double, col 55:double)(children: DoubleColMultiplyDoubleColumn(col 56:double, col 51:double)(children: DoubleColMultiplyDoubleColumn(col 51:double, col 55:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 51:double, DoubleColUnaryMinus(col 27:double) -> 55:double) -> 56:double, CastLongToDouble(col 71:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 71:bigint) -> 51:double) -> 55:double) -> 51:double) -> 55:double - Statistics: Num rows: 5980 Data size: 2739514 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 2738988 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: timestamp), _col1 (type: string), _col2 (type: double), _col3 (type: double), _col4 (type: double), _col5 (type: double), _col6 (type: double), _col7 (type: double), _col8 (type: bigint), _col9 (type: bigint), _col10 (type: double), _col11 (type: tinyint), _col12 (type: double), _col13 (type: double), _col14 
(type: double), _col15 (type: double), _col16 (type: double), _col17 (type: double), _col18 (type: double), _col19 (type: double), _col20 (type: double), _col21 (type: double), _col22 (type: double), _col23 (type: double), _col24 (type: double), _col25 (type: double), _col26 (type: double), _col27 (type: tinyint), _col28 (type: double), _col29 (type: double), _col30 (type: double), _col31 (type: double), _col32 (type: decimal(8,6)), _col33 (type: double), _col34 (type: bigint), _col35 (type: double), _col36 (type: bigint), _col37 (type: bigint), _col38 (type: double) null sort order: zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz @@ -3034,7 +3034,7 @@ STAGE PLANS: className: VectorReduceSinkObjectHashOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 5980 Data size: 2739514 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 2738988 Basic stats: COMPLETE Column stats: COMPLETE Reducer 3 Execution mode: vectorized, llap Reduce Vectorization: @@ -3051,7 +3051,7 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 17, 18, 19, 20, 21, 22, 3, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 8, 8, 38] - Statistics: Num rows: 5980 Data size: 2739514 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 2738988 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 50 Limit Vectorization: diff --git a/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out b/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out index af0c461861f3..0e1519cf20a9 100644 --- a/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out +++ 
b/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out @@ -1207,13 +1207,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 183480 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: timestamp) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: timestamp) - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 183480 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 @@ -1223,10 +1223,10 @@ STAGE PLANS: keys: KEY._col0 (type: timestamp) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3072 Data size: 91760 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3072 Data size: 91760 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat