From d90457f2c80ac2be43f1efe6bd1442ba9c508f39 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 13 Mar 2026 15:21:02 +0900 Subject: [PATCH 01/13] HIVE-29368: Could this be the truly accurate pessimistic stats combining? --- .../hadoop/hive/ql/stats/StatsUtils.java | 1 + .../estimator/PessimisticStatCombiner.java | 4 +- .../queries/clientpositive/ndv_case_const.q | 27 + .../clientpositive/llap/ndv_case_const.q.out | 755 ++++++++++++++++++ 4 files changed, 784 insertions(+), 3 deletions(-) create mode 100644 ql/src/test/queries/clientpositive/ndv_case_const.q create mode 100644 ql/src/test/results/clientpositive/llap/ndv_case_const.q.out diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index c530633fbf1c..35fa75ebf7c4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1574,6 +1574,7 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis Optional res = se.estimate(csList); if (res.isPresent()) { ColStatistics newStats = res.get(); + newStats.setCountDistint(Math.min(newStats.getCountDistint(), numRows)); colType = colType.toLowerCase(); newStats.setColumnType(colType); newStats.setColumnName(colName); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index dde2019eadf7..5b1cfb73d722 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -41,9 +41,7 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - if (stat.getCountDistint() > result.getCountDistint()) { - result.setCountDistint(stat.getCountDistint()); - 
} + result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); if (stat.getNumNulls() > result.getNumNulls()) { result.setNumNulls(stat.getNumNulls()); } diff --git a/ql/src/test/queries/clientpositive/ndv_case_const.q b/ql/src/test/queries/clientpositive/ndv_case_const.q new file mode 100644 index 000000000000..42162d25c529 --- /dev/null +++ b/ql/src/test/queries/clientpositive/ndv_case_const.q @@ -0,0 +1,27 @@ +CREATE TABLE t (cond INT, c2 STRING, c100 STRING); +ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000'); +ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0'); +ALTER TABLE t UPDATE STATISTICS FOR COLUMN c2 SET('numDVs'='2','numNulls'='0','avgColLen'='5','maxColLen'='10'); +ALTER TABLE t UPDATE STATISTICS FOR COLUMN c100 SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE 'C' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE NULL END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' WHEN cond=3 THEN NULL ELSE 'B' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 
THEN c100 ELSE c2 END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x; + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x; diff --git a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out new file mode 100644 index 000000000000..1d3c298a25c5 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out @@ -0,0 +1,755 @@ +PREHOOK: query: CREATE TABLE t (cond INT, c2 STRING, c100 STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t +POSTHOOK: query: CREATE TABLE t (cond INT, c2 STRING, c100 STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN c2 SET('numDVs'='2','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN 
c2 SET('numDVs'='2','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN c100 SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t +PREHOOK: Output: default@t +POSTHOOK: query: ALTER TABLE t UPDATE STATISTICS FOR COLUMN c100 SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t +POSTHOOK: Output: default@t +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE 'C' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE 'C' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') ELSE ('C') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null 
sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') WHEN ((cond = 3)) THEN ('A') ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data 
size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 4 Data size: 340 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE NULL END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE NULL END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num 
rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') ELSE (null) END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends 
on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (null) WHEN ((cond = 2)) THEN ('A') ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### 
+POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN 'A' ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond) IN (1, 2)) THEN (null) WHEN ((cond = 3)) THEN ('A') ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: 
Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' WHEN cond=3 THEN NULL ELSE 'B' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN 'A' WHEN cond=3 THEN NULL ELSE 'B' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (null) WHEN ((cond = 2)) THEN ('A') WHEN ((cond = 3)) THEN (null) ELSE ('B') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 40000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + File Output 
Operator + compressed: false + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN NULL WHEN cond=2 THEN NULL WHEN cond=3 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond) IN (1, 2)) THEN (null) WHEN ((cond = 3)) THEN (c100) ELSE ('A') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9898 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + 
Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 102 Data size: 89 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE 'A' END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (c2) WHEN ((cond = 2)) THEN (c100) ELSE ('A') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9897 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 103 Data size: 9167 Basic stats: 
COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 103 Data size: 9167 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 103 Data size: 9167 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 103 Data size: 9167 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE c2 END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELSE c2 END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (c2) WHEN ((cond = 2)) THEN (c100) ELSE (c2) END (type: string) + outputColumnNames: 
_col0 + Statistics: Num rows: 10000 Data size: 1820000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9896 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 104 Data size: 9256 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern 
was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') WHEN ((cond = 3)) THEN ('C') ELSE (c2) END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 445 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x +POSTHOOK: type: QUERY 
+POSTHOOK: Input: default@t +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN ('A') WHEN ((cond = 2)) THEN ('B') ELSE (c100) END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 10000 Data size: 930000 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: _col0 (type: string) + minReductionHashAggr: 0.9898 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 102 Data size: 9078 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + From 46485cfa95d7df389186b482fdb42e3b924526b5 Mon Sep 17 00:00:00 2001 From: Konstantin 
Bereznyakov Date: Fri, 20 Mar 2026 11:00:52 -0700 Subject: [PATCH 02/13] HIVE-29368: NDV of 0 is "unknown", so combining it with anything else still remains "unknown" --- .../hive/ql/stats/estimator/PessimisticStatCombiner.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index 5b1cfb73d722..8272d0ff06d4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -41,7 +41,12 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); + // NDV=0 means "unknown" - if either stat has unknown NDV, preserve 0 to propagate uncertainty + if (result.getCountDistint() == 0L || stat.getCountDistint() == 0L) { + result.setCountDistint(0L); + } else { + result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); + } if (stat.getNumNulls() > result.getNumNulls()) { result.setNumNulls(stat.getNumNulls()); } From 1d3fb0591404d69bd8b7f6213e82c58b9df9f1fa Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 20 Mar 2026 11:59:51 -0700 Subject: [PATCH 03/13] HIVE-29368: use safeAdd --- .../hive/ql/stats/estimator/PessimisticStatCombiner.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index 7e4cb0cffed1..2c6867e6f7b9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ 
-21,6 +21,7 @@ import java.util.Optional; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.stats.StatsUtils; /** * Combines {@link ColStatistics} objects to provide the most pessimistic estimate. @@ -45,7 +46,7 @@ public void add(ColStatistics stat) { if (result.getCountDistint() == 0L || stat.getCountDistint() == 0L) { result.setCountDistint(0L); } else { - result.setCountDistint(result.getCountDistint() + stat.getCountDistint()); + result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); } if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) { result.setNumNulls(-1); From 874996d9db21f7cec85525ae2c122a94ce76ad7c Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 20 Mar 2026 19:41:58 -0700 Subject: [PATCH 04/13] HIVE-29368: special handling of "const NULL" columns, unit tests, new .out files --- .../hadoop/hive/ql/plan/ColStatistics.java | 10 + .../hadoop/hive/ql/stats/StatsUtils.java | 5 +- .../estimator/PessimisticStatCombiner.java | 9 +- .../ql/stats/estimator/StatEstimator.java | 45 ++++- .../estimator/StatEstimatorProvider.java | 10 +- .../hive/ql/udf/generic/GenericUDFLower.java | 15 -- .../hive/ql/udf/generic/GenericUDFUpper.java | 15 -- .../TestPessimisticStatCombiner.java | 184 ++++++++++++++++++ .../ql/stats/estimator/TestStatEstimator.java | 169 ++++++++++++++++ .../llap/infer_bucket_sort_dyn_part.q.out | 10 +- .../llap/list_bucket_dml_6.q.out | 28 +-- .../llap/list_bucket_dml_7.q.out | 28 +-- .../llap/list_bucket_dml_8.q.out | 10 +- .../llap/merge_dynamic_partition4.q.out | 10 +- .../llap/merge_dynamic_partition5.q.out | 10 +- .../llap/scratch_col_issue.q.out | 6 +- 16 files changed, 469 insertions(+), 95 deletions(-) create mode 100644 ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java index 
717d1f8b6a7c..76b8c28691cc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java @@ -31,6 +31,7 @@ public class ColStatistics { private boolean isPrimaryKey; private boolean isEstimated; private boolean isFilteredColumn; + private boolean isConst; private byte[] bitVectors; private byte[] histogram; @@ -155,6 +156,8 @@ public String toString() { sb.append(" isEstimated: "); sb.append(isEstimated); + sb.append(" isConst: "); + sb.append(isConst); return sb.toString(); } @@ -171,6 +174,7 @@ public ColStatistics clone() { clone.setPrimaryKey(isPrimaryKey); clone.setIsEstimated(isEstimated); clone.setIsFilteredColumn(isFilteredColumn); + clone.setConst(isConst); if (range != null ) { clone.setRange(range.clone()); } @@ -191,6 +195,12 @@ public void setIsEstimated(boolean isEstimated) { public boolean isEstimated() { return isEstimated; } + public void setConst(boolean isConst) { + this.isConst = isConst; + } + + public boolean isConst() { return isConst; } + public static class Range { public final Number minValue; public final Number maxValue; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 4392ee905d86..f8aad54910e3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1578,10 +1578,9 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis csList.add(cs); } if (csList.size() == engfd.getChildren().size()) { - Optional res = se.estimate(csList); + Optional res = se.estimate(csList, numRows); if (res.isPresent()) { ColStatistics newStats = res.get(); - newStats.setCountDistint(Math.min(newStats.getCountDistint(), numRows)); colType = colType.toLowerCase(); newStats.setColumnType(colType); newStats.setColumnName(colName); @@ -1644,6 +1643,7 @@ private static ColStatistics 
buildColStatForConstant(HiveConf conf, long numRows colStats.setAvgColLen(avgColSize); colStats.setCountDistint(countDistincts); colStats.setNumNulls(numNulls); + colStats.setConst(true); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); @@ -2093,7 +2093,6 @@ public static long computeNDVGroupingColumns(List colStats, Stati return 0L; } if (ndvValues.isEmpty()) { - // No grouping columns, one row return 1L; } if (expDecay) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index 2c6867e6f7b9..b3086693ed52 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -42,12 +42,14 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - // NDV=0 means "unknown" - if either stat has unknown NDV, preserve 0 to propagate uncertainty - if (result.getCountDistint() == 0L || stat.getCountDistint() == 0L) { - result.setCountDistint(0L); + // NDV=0 is "unknown" only if the stat is NOT a constant. + // Constants with NDV=0 (e.g., NULL) are "known zero", not unknown. 
+ if ((result.getCountDistint() == 0 && !result.isConst()) || (stat.getCountDistint() == 0 && !stat.isConst())) { + result.setCountDistint(0); } else { result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); } + result.setConst(false); if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) { result.setNumNulls(-1); } else if (stat.getNumNulls() > result.getNumNulls()) { @@ -70,6 +72,5 @@ public void add(ColStatistics stat) { public Optional getResult() { return Optional.of(result); - } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java index 94aaa32ecfcb..98e96c48893a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java @@ -24,20 +24,53 @@ import org.apache.hadoop.hive.ql.plan.ColStatistics; /** - * Enables statistics related computation on UDFs + * Enables statistics related computation on UDFs. + * + *

<p>This interface provides two default implementations:
+ * <ul>
+ *   <li>{@link #estimate(List)} - clones the first argument's statistics (suitable for most UDFs)</li>
+ *   <li>{@link #estimate(List, long)} - calls estimate(List) and caps NDV at numRows</li>
+ * </ul>
+ *
+ * <p>
UDFs that simply pass through statistics (like LOWER, UPPER) can use the defaults. + * UDFs that combine statistics (like IF, WHEN, COALESCE) should override {@link #estimate(List)}. */ public interface StatEstimator { /** * Computes the output statistics of the actual UDF. * - * The estimator should return with a preferably overestimated {@link ColStatistics} object if possible. - * The actual estimation logic may decide to not give an estimation; it should return with {@link Optional#empty()}. + *

<p>The default implementation clones the first argument's statistics, which is suitable
+ * for most UDFs that don't significantly alter the statistical properties of their input.
+ *
+ * <p>
Override this method for UDFs that combine multiple inputs (like IF, WHEN, COALESCE) + * or significantly transform the data. + * + * @param argStats the statistics for every argument of the UDF + * @return {@link ColStatistics} estimate for the actual UDF, or empty if estimation is not possible + */ + default Optional estimate(List argStats) { + if (argStats.isEmpty()) { + return Optional.empty(); + } + return Optional.of(argStats.get(0).clone()); + } + + /** + * Computes the output statistics of the actual UDF, ensuring NDV does not exceed numRows. * - * Note: at the time of the call there will be {@link ColStatistics} for all the arguments; if that is not available - the estimation is skipped. + *
<p>
The default implementation calls {@link #estimate(List)} and caps the NDV at numRows. + * This ensures that estimators which combine statistics from multiple branches (producing + * potentially inflated NDV values) are automatically bounded by the number of rows. * * @param argStats the statistics for every argument of the UDF - * @return {@link ColStatistics} estimate for the actual UDF. + * @param numRows the number of rows, used to cap the NDV + * @return {@link ColStatistics} estimate for the actual UDF with NDV capped at numRows */ - public Optional estimate(List argStats); + default Optional estimate(List argStats, long numRows) { + return estimate(argStats).map(cs -> { + cs.setCountDistint(Math.min(cs.getCountDistint(), numRows)); + return cs; + }); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java index 96865d194c6e..c888493040e4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java @@ -19,11 +19,19 @@ /** * Marker interface for UDFs to communicate that the usage of StatEstimators is supported by the UDF. + * + *
<p>
The default implementation returns a {@link StatEstimator} that clones the first argument's + * statistics, which is suitable for most UDFs. Override {@link #getStatEstimator()} for UDFs + * that combine statistics from multiple inputs (like IF, WHEN, COALESCE). */ public interface StatEstimatorProvider { /** * Returns the {@link StatEstimator} for the given UDF instance. + * + *
<p>
The default implementation returns an estimator that clones the first argument's statistics. */ - public StatEstimator getStatEstimator(); + default StatEstimator getStatEstimator() { + return new StatEstimator() {}; + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java index 411438907424..609274c0bfe4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java @@ -24,8 +24,6 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringLower; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -37,8 +35,6 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import java.util.List; -import java.util.Optional; /** * UDFLower. 
@@ -113,15 +109,4 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("lower", children); } - - @Override - public StatEstimator getStatEstimator() { - return new StatEstimator() { - @Override - public Optional estimate(List argStats) { - return Optional.of(argStats.get(0).clone()); - } - }; - } - } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java index 019cbe94a4ba..d0df8da9886b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java @@ -24,8 +24,6 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUpper; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -37,8 +35,6 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; -import java.util.List; -import java.util.Optional; /** * UDFUpper. 
@@ -115,15 +111,4 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("upper", children); } - - @Override - public StatEstimator getStatEstimator() { - return new StatEstimator() { - @Override - public Optional estimate(List argStats) { - return Optional.of(argStats.get(0).clone()); - } - }; - } - } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java index 98bc589e40d3..fb3eb09308e9 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java @@ -155,6 +155,184 @@ void testCombineBothUnknownNumTruesAndNumFalses() { assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)"); } + @Test + void testCombinePropagatesUnknownNdvFromFirst() { + ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); // NDV=0 means unknown + ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from first should be propagated"); + } + + @Test + void testCombinePropagatesUnknownNdvFromSecond() { + ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); // NDV=0 means unknown + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from second should be propagated"); + } + + @Test + void 
testCombineBothUnknownNdv() { + ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Both unknown NDV should result in unknown (0)"); + } + + @Test + void testCombineSumsNdvWhenBothKnown() { + ColStatistics stat1 = createStat("col1", "int", 50, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 30, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(80, combined.getCountDistint(), "Known NDVs should be summed"); + } + + @Test + void testCombineNdvOverflowProtection() { + ColStatistics stat1 = createStat("col1", "int", Long.MAX_VALUE - 10, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(Long.MAX_VALUE, combined.getCountDistint(), "NDV overflow should be capped at Long.MAX_VALUE"); + } + + @Test + void testCombineThreeStats() { + ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); + ColStatistics stat2 = createStat("col2", "int", 20, 10, 4.0); + ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + combiner.add(stat3); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(60, combined.getCountDistint(), "Three NDVs should be summed"); + assertEquals(15, combined.getNumNulls(), "Should take max numNulls"); + } + + @Test + void testCombineUnknownNdvInMiddle() { + 
ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 10, 4.0); // unknown + ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + combiner.add(stat3); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Unknown NDV in middle should propagate"); + } + + @Test + void testConstantWithNdvZeroIsNotTreatedAsUnknown() { + ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); + ColStatistics stat2 = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant: NDV=0, isConst=true + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(1, combined.getCountDistint(), "Constant with NDV=0 should not propagate as unknown"); + } + + @Test + void testNullConstantFirstThenOtherConstants() { + ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant + ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); + ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(nullConst); + combiner.add(constA); + combiner.add(constB); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(2, combined.getCountDistint(), "NULL(0) + A(1) + B(1) should sum to 2"); + } + + @Test + void testConstantsWithNullInMiddle() { + ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); + ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant + ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(constA); + combiner.add(nullConst); + 
combiner.add(constB); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(2, combined.getCountDistint(), "A(1) + NULL(0) + B(1) should sum to 2"); + } + + @Test + void testNonConstantNdvZeroStillPropagatesUnknown() { + ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); + ColStatistics stat2 = createStat("col2", "string", 0, 10, 5.0); // Column with unknown NDV (isConst=false) + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Non-constant with NDV=0 should still propagate as unknown"); + } + + @Test + void testMixedConstantAndNonConstantWithNdvZero() { + ColStatistics constStat = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant + ColStatistics colStat = createStat("col", "string", 0, 10, 5.0); // Column with unknown NDV + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(constStat); + combiner.add(colStat); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(0, combined.getCountDistint(), "Non-constant with NDV=0 should propagate unknown even if combined with constant"); + } + + @Test + void testCombinedResultIsNotConst() { + ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); + ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(constA); + combiner.add(constB); + + ColStatistics combined = combiner.getResult().get(); + assertEquals(false, combined.isConst(), "Combined result should not be marked as constant"); + } + private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { ColStatistics stat = new ColStatistics(name, type); stat.setCountDistint(ndv); @@ -162,4 +340,10 @@ private ColStatistics createStat(String name, String type, long ndv, 
long numNul stat.setAvgColLen(avgColLen); return stat; } + + private ColStatistics createConstStat(String name, String type, long ndv, long numNulls, double avgColLen) { + ColStatistics stat = createStat(name, type, ndv, numNulls, avgColLen); + stat.setConst(true); + return stat; + } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java new file mode 100644 index 000000000000..09140f3d15bb --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.stats.estimator; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.Collections; +import java.util.Optional; + +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.junit.jupiter.api.Test; + +class TestStatEstimator { + + @Test + void testDefaultEstimateWithEmptyList() { + StatEstimator estimator = new StatEstimator() {}; + Optional result = estimator.estimate(Collections.emptyList()); + assertFalse(result.isPresent(), "Empty list should return empty Optional"); + } + + @Test + void testDefaultEstimateClonesFirstArg() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat)); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint()); + assertEquals(10, result.get().getNumNulls()); + assertEquals(4.0, result.get().getAvgColLen()); + } + + @Test + void testDefaultEstimateReturnsCloneNotSameReference() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat)); + + assertTrue(result.isPresent()); + assertNotSame(stat, result.get(), "Should return a clone, not the same reference"); + stat.setCountDistint(999); + assertEquals(100, result.get().getCountDistint(), "Clone should not be affected by original changes"); + } + + @Test + void testDefaultEstimateIgnoresSubsequentArgs() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); + ColStatistics stat2 = createStat("col2", "int", 200, 20, 8.0); + + Optional result = 
estimator.estimate(Arrays.asList(stat1, stat2)); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint(), "Should use first arg's NDV"); + assertEquals(10, result.get().getNumNulls(), "Should use first arg's numNulls"); + } + + @Test + void testDefaultEstimateWithNumRowsCapsNdv() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "NDV should be capped at numRows"); + } + + @Test + void testDefaultEstimateWithNumRowsNoCappingNeeded() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint(), "NDV should remain unchanged when less than numRows"); + } + + @Test + void testDefaultEstimateWithNumRowsExactlyEqual() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 500, 10, 4.0); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "NDV should remain unchanged when equal to numRows"); + } + + @Test + void testDefaultEstimateWithNumRowsEmptyList() { + StatEstimator estimator = new StatEstimator() {}; + + Optional result = estimator.estimate(Collections.emptyList(), 500); + + assertFalse(result.isPresent(), "Empty list should return empty Optional"); + } + + @Test + void testDefaultEstimateWithNumRowsPreservesOtherStats() { + StatEstimator estimator = new StatEstimator() {}; + ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); + stat.setNumTrues(50); + stat.setNumFalses(40); + + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + 
assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "NDV should be capped"); + assertEquals(10, result.get().getNumNulls(), "numNulls should be preserved"); + assertEquals(4.0, result.get().getAvgColLen(), "avgColLen should be preserved"); + } + + @Test + void testStatEstimatorProviderDefaultReturnsWorkingEstimator() { + StatEstimatorProvider provider = new StatEstimatorProvider() {}; + StatEstimator estimator = provider.getStatEstimator(); + + ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); + Optional result = estimator.estimate(Arrays.asList(stat)); + + assertTrue(result.isPresent()); + assertEquals(100, result.get().getCountDistint()); + } + + @Test + void testStatEstimatorProviderDefaultCapsNdv() { + StatEstimatorProvider provider = new StatEstimatorProvider() {}; + StatEstimator estimator = provider.getStatEstimator(); + + ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); + Optional result = estimator.estimate(Arrays.asList(stat), 500); + + assertTrue(result.isPresent()); + assertEquals(500, result.get().getCountDistint(), "Default provider estimator should cap NDV"); + } + + private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { + ColStatistics stat = new ColStatistics(name, type); + stat.setCountDistint(ndv); + stat.setNumNulls(numNulls); + stat.setAvgColLen(avgColLen); + return stat; + } +} diff --git a/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out b/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out index 995733564a08..17db16415c01 100644 --- a/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out +++ b/ql/src/test/results/clientpositive/llap/infer_bucket_sort_dyn_part.q.out @@ -492,13 +492,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 
Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) Reducer 3 Execution mode: vectorized, llap @@ -508,14 +508,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data 
size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out index dbcf49b202e7..df098011525b 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out @@ -96,7 +96,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -104,7 +104,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -199,18 +199,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator 
expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -317,7 +317,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +358,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 3 + numFiles 6 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -461,7 +461,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic 
stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -469,7 +469,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -564,18 +564,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 
compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -682,7 +682,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +723,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 3 + numFiles 6 numRows 984 rawDataSize 9488 totalSize #Masked# diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out index ad7051398156..d1e40c4588f0 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out @@ -96,7 +96,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -104,7 +104,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: 
COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -199,18 +199,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -317,7 +317,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: 
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +358,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 4 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -461,7 +461,7 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -469,7 +469,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -564,18 +564,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: 
bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat @@ -682,7 +682,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 1 + numFiles 2 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +723,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 4 numRows 984 rawDataSize 9488 totalSize #Masked# diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out index 148303926d66..4e5651cccc53 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_8.q.out @@ -96,7 +96,7 @@ STAGE PLANS: 
minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator bucketingVersion: 2 key expressions: _col0 (type: string), _col1 (type: string) @@ -104,7 +104,7 @@ STAGE PLANS: numBuckets: -1 sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE tag: -1 value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) auto parallelism: true @@ -199,18 +199,18 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data 
size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator bucketingVersion: 2 compressed: false GlobalTableId: 0 #### A masked pattern was here #### NumFilesPerFileSink: 1 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE #### A masked pattern was here #### table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat diff --git a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out index 2c9c9015c173..85f1ea93c068 100644 --- a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out +++ b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition4.q.out @@ -180,13 +180,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) Execution mode: llap LLAP IO: no inputs @@ -198,14 +198,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: 
_col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out index 5b1e537b938a..ab9805c19485 100644 --- a/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out +++ b/ql/src/test/results/clientpositive/llap/merge_dynamic_partition5.q.out @@ -156,13 +156,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column 
stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string), _col1 (type: string) null sort order: zz sort order: ++ Map-reduce partition columns: _col0 (type: string), _col1 (type: string) - Statistics: Num rows: 1 Data size: 652 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1304 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: int), _col3 (type: struct), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: binary), _col7 (type: int), _col8 (type: struct), _col9 (type: bigint), _col10 (type: binary) Execution mode: llap LLAP IO: no inputs @@ -174,14 +174,14 @@ STAGE PLANS: keys: KEY._col0 (type: string), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10 - Statistics: Num rows: 1 Data size: 516 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1032 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col2,0)) (type: bigint), COALESCE(_col3,0) (type: double), (_col4 - _col5) (type: bigint), COALESCE(ndv_compute_bit_vector(_col6),0) (type: bigint), _col6 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col7,0)) (type: bigint), COALESCE(_col8,0) (type: double), (_col4 - _col9) (type: bigint), COALESCE(ndv_compute_bit_vector(_col10),0) (type: bigint), _col10 (type: binary), _col0 (type: string), _col1 (type: string) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13 - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE 
Column stats: COMPLETE + Statistics: Num rows: 2 Data size: 1424 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out index 5418ef860de3..1e23944fcaf7 100644 --- a/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out +++ b/ql/src/test/results/clientpositive/llap/scratch_col_issue.q.out @@ -189,7 +189,7 @@ STAGE PLANS: outputColumnNames: _col1, _col2 input vertices: 1 Map 2 - Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: if((_col1) IN ('CertificateOfDeposit', 'RecurringDeposit', 'TermDeposit'), COALESCE(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'),' '), '') (type: string) outputColumnNames: _col0 @@ -198,13 +198,13 @@ STAGE PLANS: native: true projectedOutputColumnNums: [14] selectExpressions: IfExprCondExprColumn(col 9:boolean, col 13:string, col 5:string)(children: StringColumnInList(col 1, values CertificateOfDeposit, RecurringDeposit, TermDeposit) -> 9:boolean, VectorCoalesce(columns [5, 12])(children: VectorUDFAdaptor(from_unixtime(to_unix_timestamp(CAST( _col2 AS DATE)), 'MM-dd-yyyy'))(children: VectorUDFUnixTimeStampDate(col 10)(children: CastStringToDate(col 2:string) -> 10:date) -> 11:bigint) -> 5:string, ConstantVectorExpression(val ) -> 12:string) -> 13:string, ConstantVectorExpression(val ) -> 5:string) -> 14:string - Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false File Sink Vectorization: className: VectorFileSinkOperator native: false - 
Statistics: Num rows: 2 Data size: 368 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat From f89edb7d4a26a2eb6abb096ca7fe39a04bba33af Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Sat, 21 Mar 2026 10:09:37 -0700 Subject: [PATCH 05/13] HIVE-29368: trigger a rebuild From d7aed0eb7afa55de0e3b143bd6adbd6cb549e9fb Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Tue, 24 Mar 2026 11:25:11 -0700 Subject: [PATCH 06/13] HIVE-29368: configured split file counts consistently between CI & localhost --- .../clientpositive/list_bucket_dml_6.q | 20 ++++++++----- .../clientpositive/list_bucket_dml_7.q | 11 ++++--- .../llap/list_bucket_dml_6.q.out | 30 ++++++++----------- .../llap/list_bucket_dml_7.q.out | 18 +++-------- 4 files changed, 34 insertions(+), 45 deletions(-) diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q index 2ce2ced59e04..de1c802e9ee7 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q @@ -1,10 +1,19 @@ --! qt:dataset:srcpart +-- Debug: DELETEME: show merge settings +set hive.merge.mapfiles; +set hive.merge.mapredfiles; +set hive.merge.tezfiles; +set hive.merge.smallfiles.avgsize; +-- Debug: DELETEME: end + +-- this ensures consistent split file counts between localhost & CI runs +set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; set hive.merge.smallfiles.avgsize=200; set mapred.input.dir.recursive=true; -set hive.merge.mapfiles=false; +set hive.merge.mapfiles=false; set hive.merge.mapredfiles=false; -- list bucketing DML: dynamic partition. 
multiple skewed columns. merge. @@ -43,13 +52,13 @@ set hive.merge.mapredfiles=false; -- 87 000000_0 -- 87 000001_0 -- with merge --- 118 000002_0 +-- 118 000002_0 -- SORT_QUERY_RESULTS -- create a skewed table -create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE; @@ -92,6 +101,3 @@ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; --- clean up -drop table list_bucketing_dynamic_part_n3; - diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q index f80585e56c6f..a4a21aaa1ceb 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q @@ -1,4 +1,6 @@ --! 
qt:dataset:srcpart +-- this ensures consistent split file counts between localhost & CI runs +set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; @@ -39,10 +41,10 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); -set hive.merge.mapfiles=true; +set hive.merge.mapfiles=true; set hive.merge.mapredfiles=true; -- list bucketing DML with merge. use bucketize to generate a few small files. explain extended @@ -54,7 +56,7 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); select count(1) from srcpart where ds = '2008-04-08'; @@ -65,6 +67,3 @@ explain extended select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; - --- clean up -drop table list_bucketing_dynamic_part; diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out index df098011525b..d9d5d18340eb 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out +++ 
b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out @@ -1,13 +1,17 @@ -PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +hive.merge.mapfiles=true +hive.merge.mapredfiles=false +hive.merge.tezfiles=false +hive.merge.smallfiles.avgsize=16000000 +PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@list_bucketing_dynamic_part_n3 -POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE @@ -317,7 +321,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +362,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 6 + numFiles 3 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -682,7 +686,7 @@ Table: list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +727,7 @@ Table: 
list_bucketing_dynamic_part_n3 #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 6 + numFiles 3 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -898,13 +902,3 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 -PREHOOK: query: drop table list_bucketing_dynamic_part_n3 -PREHOOK: type: DROPTABLE -PREHOOK: Input: default@list_bucketing_dynamic_part_n3 -PREHOOK: Output: database:default -PREHOOK: Output: default@list_bucketing_dynamic_part_n3 -POSTHOOK: query: drop table list_bucketing_dynamic_part_n3 -POSTHOOK: type: DROPTABLE -POSTHOOK: Input: default@list_bucketing_dynamic_part_n3 -POSTHOOK: Output: database:default -POSTHOOK: Output: default@list_bucketing_dynamic_part_n3 diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out index d1e40c4588f0..e1dbd260d038 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out @@ -317,7 +317,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -358,7 +358,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 4 + numFiles 2 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -682,7 +682,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE 
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 2 + numFiles 1 numRows 16 rawDataSize 136 totalSize #Masked# @@ -723,7 +723,7 @@ Table: list_bucketing_dynamic_part #### A masked pattern was here #### Partition Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"key\":\"true\",\"value\":\"true\"}} - numFiles 4 + numFiles 2 numRows 984 rawDataSize 9488 totalSize #Masked# @@ -898,13 +898,3 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 -PREHOOK: query: drop table list_bucketing_dynamic_part -PREHOOK: type: DROPTABLE -PREHOOK: Input: default@list_bucketing_dynamic_part -PREHOOK: Output: database:default -PREHOOK: Output: default@list_bucketing_dynamic_part -POSTHOOK: query: drop table list_bucketing_dynamic_part -POSTHOOK: type: DROPTABLE -POSTHOOK: Input: default@list_bucketing_dynamic_part -POSTHOOK: Output: database:default -POSTHOOK: Output: default@list_bucketing_dynamic_part From bf047c0b69a88801864c7cfabf6bc92bb69647cf Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Tue, 24 Mar 2026 17:01:47 -0700 Subject: [PATCH 07/13] HIVE-29368: removed debug info from the test file --- ql/src/test/queries/clientpositive/list_bucket_dml_6.q | 7 ------- 1 file changed, 7 deletions(-) diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q index de1c802e9ee7..11986d696aff 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q @@ -1,11 +1,4 @@ --! 
qt:dataset:srcpart --- Debug: DELETEME: show merge settings -set hive.merge.mapfiles; -set hive.merge.mapredfiles; -set hive.merge.tezfiles; -set hive.merge.smallfiles.avgsize; --- Debug: DELETEME: end - -- this ensures consistent split file counts between localhost & CI runs set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; From d4826c2d877b5fb1a62e09ad86e561a6a297b595 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Wed, 25 Mar 2026 10:51:23 -0700 Subject: [PATCH 08/13] HIVE-29368: refactoring as per the PR feedback --- .../hadoop/hive/ql/plan/ColStatistics.java | 10 - .../hadoop/hive/ql/stats/StatsUtils.java | 16 +- .../estimator/PessimisticStatCombiner.java | 7 +- .../ql/stats/estimator/StatEstimator.java | 45 +--- .../estimator/StatEstimatorProvider.java | 10 +- .../hive/ql/udf/generic/GenericUDFLower.java | 15 ++ .../hive/ql/udf/generic/GenericUDFUpper.java | 15 ++ .../hadoop/hive/ql/stats/TestStatsUtils.java | 208 ++++++++++++++++ .../TestPessimisticStatCombiner.java | 223 +++--------------- .../ql/stats/estimator/TestStatEstimator.java | 169 ------------- .../clientpositive/list_bucket_dml_6.q | 13 +- .../clientpositive/list_bucket_dml_7.q | 11 +- .../queries/clientpositive/ndv_case_const.q | 13 + .../llap/list_bucket_dml_6.q.out | 22 +- .../llap/list_bucket_dml_7.q.out | 10 + .../clientpositive/llap/ndv_case_const.q.out | 121 +++++++++- 16 files changed, 462 insertions(+), 446 deletions(-) delete mode 100644 ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java index 76b8c28691cc..717d1f8b6a7c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/ColStatistics.java @@ -31,7 +31,6 @@ public class ColStatistics { private boolean isPrimaryKey; private boolean isEstimated; private boolean isFilteredColumn; - 
private boolean isConst; private byte[] bitVectors; private byte[] histogram; @@ -156,8 +155,6 @@ public String toString() { sb.append(" isEstimated: "); sb.append(isEstimated); - sb.append(" isConst: "); - sb.append(isConst); return sb.toString(); } @@ -174,7 +171,6 @@ public ColStatistics clone() { clone.setPrimaryKey(isPrimaryKey); clone.setIsEstimated(isEstimated); clone.setIsFilteredColumn(isFilteredColumn); - clone.setConst(isConst); if (range != null ) { clone.setRange(range.clone()); } @@ -195,12 +191,6 @@ public void setIsEstimated(boolean isEstimated) { public boolean isEstimated() { return isEstimated; } - public void setConst(boolean isConst) { - this.isConst = isConst; - } - - public boolean isConst() { return isConst; } - public static class Range { public final Number minValue; public final Number maxValue; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index f8aad54910e3..830c4b6c8cec 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1578,9 +1578,11 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis csList.add(cs); } if (csList.size() == engfd.getChildren().size()) { - Optional res = se.estimate(csList, numRows); + Optional res = se.estimate(csList); if (res.isPresent()) { ColStatistics newStats = res.get(); + // NDV cannot exceed numRows + newStats.setCountDistint(Math.min(newStats.getCountDistint(), numRows)); colType = colType.toLowerCase(); newStats.setColumnType(colType); newStats.setColumnName(colName); @@ -1626,14 +1628,10 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis } private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) { - long numNulls = 0; - long countDistincts = 0; + long countDistincts = 1; if (encd.getValue() == null) { - // null 
projection numNulls = numRows; - } else { - countDistincts = 1; } String colType = encd.getTypeString(); colType = colType.toLowerCase(); @@ -1643,7 +1641,6 @@ private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows colStats.setAvgColLen(avgColSize); colStats.setCountDistint(countDistincts); colStats.setNumNulls(numNulls); - colStats.setConst(true); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); @@ -2093,6 +2090,7 @@ public static long computeNDVGroupingColumns(List colStats, Stati return 0L; } if (ndvValues.isEmpty()) { + // No grouping columns, one row return 1L; } if (expDecay) { @@ -2109,7 +2107,9 @@ private static List extractNDVGroupingColumns(List colStats for (ColStatistics cs : colStats) { if (cs != null) { long ndv = cs.getCountDistint(); - if (cs.getNumNulls() > 0) { + // +1 for NULL group: source columns with partial nulls and known NDV only. + // Computed expressions include NULL. Ordered: numNulls>0 first (often false). + if (!cs.isEstimated() && cs.getNumNulls() > 0 && ndv > 0 && cs.getNumNulls() < parentStats.getNumRows()) { ndv = StatsUtils.safeAdd(ndv, 1); } ndvValues.add(ndv); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index b3086693ed52..f84484c456be 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -42,14 +42,13 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - // NDV=0 is "unknown" only if the stat is NOT a constant. - // Constants with NDV=0 (e.g., NULL) are "known zero", not unknown. 
- if ((result.getCountDistint() == 0 && !result.isConst()) || (stat.getCountDistint() == 0 && !stat.isConst())) { + // If any branch has NDV=0 (unknown stats), propagate unknown to result. + // Summing would treat unknown as zero, causing cardinality underestimates. + if (result.getCountDistint() == 0 || stat.getCountDistint() == 0) { result.setCountDistint(0); } else { result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); } - result.setConst(false); if (stat.getNumNulls() < 0 || result.getNumNulls() < 0) { result.setNumNulls(-1); } else if (stat.getNumNulls() > result.getNumNulls()) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java index 98e96c48893a..94aaa32ecfcb 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java @@ -24,53 +24,20 @@ import org.apache.hadoop.hive.ql.plan.ColStatistics; /** - * Enables statistics related computation on UDFs. - * - *

This interface provides two default implementations: - *

    - *
  • {@link #estimate(List)} - clones the first argument's statistics (suitable for most UDFs)
  • - *
  • {@link #estimate(List, long)} - calls estimate(List) and caps NDV at numRows
  • - *
- * - *

UDFs that simply pass through statistics (like LOWER, UPPER) can use the defaults. - * UDFs that combine statistics (like IF, WHEN, COALESCE) should override {@link #estimate(List)}. + * Enables statistics related computation on UDFs */ public interface StatEstimator { /** * Computes the output statistics of the actual UDF. * - *

The default implementation clones the first argument's statistics, which is suitable - * for most UDFs that don't significantly alter the statistical properties of their input. - * - *

Override this method for UDFs that combine multiple inputs (like IF, WHEN, COALESCE) - * or significantly transform the data. - * - * @param argStats the statistics for every argument of the UDF - * @return {@link ColStatistics} estimate for the actual UDF, or empty if estimation is not possible - */ - default Optional estimate(List argStats) { - if (argStats.isEmpty()) { - return Optional.empty(); - } - return Optional.of(argStats.get(0).clone()); - } - - /** - * Computes the output statistics of the actual UDF, ensuring NDV does not exceed numRows. + * The estimator should return with a preferably overestimated {@link ColStatistics} object if possible. + * The actual estimation logic may decide to not give an estimation; it should return with {@link Optional#empty()}. * - *

The default implementation calls {@link #estimate(List)} and caps the NDV at numRows. - * This ensures that estimators which combine statistics from multiple branches (producing - * potentially inflated NDV values) are automatically bounded by the number of rows. + * Note: at the time of the call there will be {@link ColStatistics} for all the arguments; if that is not available - the estimation is skipped. * * @param argStats the statistics for every argument of the UDF - * @param numRows the number of rows, used to cap the NDV - * @return {@link ColStatistics} estimate for the actual UDF with NDV capped at numRows + * @return {@link ColStatistics} estimate for the actual UDF. */ - default Optional estimate(List argStats, long numRows) { - return estimate(argStats).map(cs -> { - cs.setCountDistint(Math.min(cs.getCountDistint(), numRows)); - return cs; - }); - } + public Optional estimate(List argStats); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java index c888493040e4..96865d194c6e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimatorProvider.java @@ -19,19 +19,11 @@ /** * Marker interface for UDFs to communicate that the usage of StatEstimators is supported by the UDF. - * - *

The default implementation returns a {@link StatEstimator} that clones the first argument's - * statistics, which is suitable for most UDFs. Override {@link #getStatEstimator()} for UDFs - * that combine statistics from multiple inputs (like IF, WHEN, COALESCE). */ public interface StatEstimatorProvider { /** * Returns the {@link StatEstimator} for the given UDF instance. - * - *

The default implementation returns an estimator that clones the first argument's statistics. */ - default StatEstimator getStatEstimator() { - return new StatEstimator() {}; - } + public StatEstimator getStatEstimator(); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java index 609274c0bfe4..411438907424 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFLower.java @@ -24,6 +24,8 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringLower; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -35,6 +37,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import java.util.List; +import java.util.Optional; /** * UDFLower. 
@@ -109,4 +113,15 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("lower", children); } + + @Override + public StatEstimator getStatEstimator() { + return new StatEstimator() { + @Override + public Optional estimate(List argStats) { + return Optional.of(argStats.get(0).clone()); + } + }; + } + } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java index d0df8da9886b..019cbe94a4ba 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUpper.java @@ -24,6 +24,8 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUpper; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; @@ -35,6 +37,8 @@ import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import java.util.List; +import java.util.Optional; /** * UDFUpper. 
@@ -111,4 +115,15 @@ public Object evaluate(DeferredObject[] arguments) throws HiveException { public String getDisplayString(String[] children) { return getStandardDisplayString("upper", children); } + + @Override + public StatEstimator getStatEstimator() { + return new StatEstimator() { + @Override + public Optional estimate(List argStats) { + return Optional.of(argStats.get(0).clone()); + } + }; + } + } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index 3f76c554d446..bc54e749834d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -19,12 +19,14 @@ package org.apache.hadoop.hive.ql.stats; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; import java.lang.reflect.Field; import java.lang.reflect.Modifier; +import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; @@ -39,7 +41,12 @@ import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.ColStatistics.Range; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIf; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde.serdeConstants; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -499,4 +506,205 @@ void 
testScaleColStatisticsPreservesUnknownNumFalses() { assertEquals(-1, colStats.get(0).getNumFalses(), "Unknown numFalses (-1) should be preserved after scaling"); } + // Tests for buildColStatForConstant (via getColStatisticsFromExpression) + + @Test + void testGetColStatisticsFromExpressionNullConstant() { + HiveConf conf = new HiveConf(); + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + + ExprNodeConstantDesc nullConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, null); + ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, nullConst); + + assertNotNull(cs); + assertEquals(1, cs.getCountDistint(), "NULL constant should have NDV=1"); + assertEquals(1000, cs.getNumNulls(), "NULL constant should have numNulls=numRows"); + assertFalse(cs.isEstimated(), "Constant stats should not be marked as estimated"); + } + + @Test + void testGetColStatisticsFromExpressionNonNullConstant() { + HiveConf conf = new HiveConf(); + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + + ExprNodeConstantDesc strConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "hello"); + ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, strConst); + + assertNotNull(cs); + assertEquals(1, cs.getCountDistint(), "Non-NULL constant should have NDV=1"); + assertEquals(0, cs.getNumNulls(), "Non-NULL constant should have numNulls=0"); + } + + @Test + void testGetColStatisticsFromExpressionIntConstant() { + HiveConf conf = new HiveConf(); + Statistics parentStats = new Statistics(500, 4000, 0, 0); + + ExprNodeConstantDesc intConst = new ExprNodeConstantDesc(TypeInfoFactory.intTypeInfo, 42); + ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, intConst); + + assertNotNull(cs); + assertEquals(1, cs.getCountDistint(), "Integer constant should have NDV=1"); + assertEquals(0, cs.getNumNulls(), "Integer constant should have numNulls=0"); + assertNotNull(cs.getRange(), "Integer 
constant should have a range"); + assertEquals(42, cs.getRange().minValue.intValue()); + assertEquals(42, cs.getRange().maxValue.intValue()); + } + + // Tests for computeNDVGroupingColumns / extractNDVGroupingColumns + + @Test + void testComputeNDVGroupingColumnsSourceColumnWithNulls() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(100); + cs.setNumNulls(50); + cs.setIsEstimated(false); // source column + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(101, ndv, "Source column with nulls should get +1 for NULL: 100 + 1 = 101"); + } + + @Test + void testComputeNDVGroupingColumnsSourceColumnNoNulls() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(100); + cs.setNumNulls(0); + cs.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(100, ndv, "Source column without nulls should not get +1"); + } + + @Test + void testComputeNDVGroupingColumnsEstimatedExpression() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("case_expr", "string"); + cs.setCountDistint(3); + cs.setNumNulls(500); + cs.setIsEstimated(true); // computed expression (e.g., CASE) + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(3, ndv, "Estimated expression should NOT get +1 (already accounts for NULL)"); + } + + @Test + void testComputeNDVGroupingColumnsAllNullColumn() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); 
+ + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(1); + cs.setNumNulls(1000); // all rows are NULL + cs.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(1, ndv, "All-NULL column should NOT get +1 (numNulls == numRows)"); + } + + @Test + void testComputeNDVGroupingColumnsUnknownNdv() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("col1", "string"); + cs.setCountDistint(0); // unknown NDV + cs.setNumNulls(50); + cs.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(0, ndv, "Unknown NDV (0) should NOT get +1 to avoid false precision"); + } + + @Test + void testComputeNDVGroupingColumnsMultipleColumns() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs1 = new ColStatistics("col1", "string"); + cs1.setCountDistint(10); + cs1.setNumNulls(50); + cs1.setIsEstimated(false); + + ColStatistics cs2 = new ColStatistics("col2", "int"); + cs2.setCountDistint(5); + cs2.setNumNulls(0); + cs2.setIsEstimated(false); + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs1, cs2), parentStats, false); + // col1: 10 + 1 = 11 (has nulls), col2: 5 (no nulls) + // Product: 11 * 5 = 55 + assertEquals(55, ndv, "Product of NDVs: (10+1) * 5 = 55"); + } + + @Test + void testComputeNDVGroupingColumnsMixedEstimatedAndSource() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics sourceCol = new ColStatistics("col1", "string"); + sourceCol.setCountDistint(10); + sourceCol.setNumNulls(50); + sourceCol.setIsEstimated(false); // source: gets +1 + + ColStatistics caseExpr = new 
ColStatistics("case_expr", "string"); + caseExpr.setCountDistint(3); + caseExpr.setNumNulls(200); + caseExpr.setIsEstimated(true); // estimated: no +1 + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(sourceCol, caseExpr), parentStats, false); + // sourceCol: 10 + 1 = 11, caseExpr: 3 (no +1) + // Product: 11 * 3 = 33 + assertEquals(33, ndv, "Mixed columns: source (10+1) * estimated (3) = 33"); + } + + // Test for NDV cap after StatEstimator (NDV cannot exceed numRows) + + @Test + void testGetColStatisticsFromExpressionNdvCappedAtNumRows() throws Exception { + HiveConf conf = new HiveConf(); + conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_ESTIMATORS_ENABLE, true); + + // Create parent stats with only 100 rows + Statistics parentStats = new Statistics(100, 800, 0, 0); + + // Create column stats for col1 and col2 with high NDV (each 80) + ColStatistics col1Stats = new ColStatistics("col1", "string"); + col1Stats.setCountDistint(80); + col1Stats.setNumNulls(0); + col1Stats.setAvgColLen(10); + + ColStatistics col2Stats = new ColStatistics("col2", "string"); + col2Stats.setCountDistint(80); + col2Stats.setNumNulls(0); + col2Stats.setAvgColLen(10); + + parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); + + // Create IF(true, col1, col2) expression + // IF uses PessimisticStatCombiner which sums NDVs: 80 + 80 = 160 + // But numRows is only 100, so NDV should be capped at 100 + GenericUDFIf udfIf = new GenericUDFIf(); + ExprNodeConstantDesc condExpr = new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, true); + ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); + ExprNodeColumnDesc col2Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col2", "t", false); + + ExprNodeGenericFuncDesc ifExpr = new ExprNodeGenericFuncDesc( + TypeInfoFactory.stringTypeInfo, udfIf, "if", + Arrays.asList(condExpr, col1Expr, col2Expr)); + + ColStatistics result = 
StatsUtils.getColStatisticsFromExpression(conf, parentStats, ifExpr); + + assertNotNull(result); + // PessimisticStatCombiner would produce 80 + 80 = 160, but cap ensures NDV <= numRows (100) + assertEquals(100, result.getCountDistint(), "NDV should be capped at numRows (100), not 160"); + } + } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java index fb3eb09308e9..9840cfeaf269 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java @@ -25,6 +25,45 @@ class TestPessimisticStatCombiner { + @Test + void testNdvSumWhenBothKnown() { + ColStatistics stat1 = createStat("col1", "int", 50, 0, 4.0); + ColStatistics stat2 = createStat("col2", "int", 30, 0, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics result = combiner.getResult().get(); + assertEquals(80, result.getCountDistint(), "NDV should be summed: 50 + 30 = 80"); + } + + @Test + void testNdvUnknownPropagatedFromFirst() { + ColStatistics stat1 = createStat("col1", "int", 0, 0, 4.0); + ColStatistics stat2 = createStat("col2", "int", 100, 0, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics result = combiner.getResult().get(); + assertEquals(0, result.getCountDistint(), "Unknown NDV (0) should propagate"); + } + + @Test + void testNdvUnknownPropagatedFromSecond() { + ColStatistics stat1 = createStat("col1", "int", 100, 0, 4.0); + ColStatistics stat2 = createStat("col2", "int", 0, 0, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + combiner.add(stat1); + combiner.add(stat2); + + ColStatistics result = combiner.getResult().get(); + 
assertEquals(0, result.getCountDistint(), "Unknown NDV (0) should propagate"); + } + @Test void testCombinePropagatesUnknownNumNullsFromFirst() { ColStatistics stat1 = createStat("col1", "int", 50, -1, 4.0); // unknown numNulls @@ -155,184 +194,6 @@ void testCombineBothUnknownNumTruesAndNumFalses() { assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)"); } - @Test - void testCombinePropagatesUnknownNdvFromFirst() { - ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); // NDV=0 means unknown - ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from first should be propagated"); - } - - @Test - void testCombinePropagatesUnknownNdvFromSecond() { - ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); // NDV=0 means unknown - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Unknown NDV (0) from second should be propagated"); - } - - @Test - void testCombineBothUnknownNdv() { - ColStatistics stat1 = createStat("col1", "int", 0, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 0, 20, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Both unknown NDV should result in unknown (0)"); - } - - @Test - void testCombineSumsNdvWhenBothKnown() { - ColStatistics stat1 = createStat("col1", "int", 50, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 30, 20, 4.0); - - 
PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(80, combined.getCountDistint(), "Known NDVs should be summed"); - } - - @Test - void testCombineNdvOverflowProtection() { - ColStatistics stat1 = createStat("col1", "int", Long.MAX_VALUE - 10, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 100, 20, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(Long.MAX_VALUE, combined.getCountDistint(), "NDV overflow should be capped at Long.MAX_VALUE"); - } - - @Test - void testCombineThreeStats() { - ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); - ColStatistics stat2 = createStat("col2", "int", 20, 10, 4.0); - ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - combiner.add(stat3); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(60, combined.getCountDistint(), "Three NDVs should be summed"); - assertEquals(15, combined.getNumNulls(), "Should take max numNulls"); - } - - @Test - void testCombineUnknownNdvInMiddle() { - ColStatistics stat1 = createStat("col1", "int", 10, 5, 4.0); - ColStatistics stat2 = createStat("col2", "int", 0, 10, 4.0); // unknown - ColStatistics stat3 = createStat("col3", "int", 30, 15, 4.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - combiner.add(stat3); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Unknown NDV in middle should propagate"); - } - - @Test - void testConstantWithNdvZeroIsNotTreatedAsUnknown() { - ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); - 
ColStatistics stat2 = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant: NDV=0, isConst=true - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(1, combined.getCountDistint(), "Constant with NDV=0 should not propagate as unknown"); - } - - @Test - void testNullConstantFirstThenOtherConstants() { - ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant - ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); - ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(nullConst); - combiner.add(constA); - combiner.add(constB); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(2, combined.getCountDistint(), "NULL(0) + A(1) + B(1) should sum to 2"); - } - - @Test - void testConstantsWithNullInMiddle() { - ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); - ColStatistics nullConst = createConstStat("null", "string", 0, 1000, 5.0); // NULL constant - ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(constA); - combiner.add(nullConst); - combiner.add(constB); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(2, combined.getCountDistint(), "A(1) + NULL(0) + B(1) should sum to 2"); - } - - @Test - void testNonConstantNdvZeroStillPropagatesUnknown() { - ColStatistics stat1 = createStat("col1", "string", 1, 0, 5.0); - ColStatistics stat2 = createStat("col2", "string", 0, 10, 5.0); // Column with unknown NDV (isConst=false) - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(stat1); - combiner.add(stat2); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, 
combined.getCountDistint(), "Non-constant with NDV=0 should still propagate as unknown"); - } - - @Test - void testMixedConstantAndNonConstantWithNdvZero() { - ColStatistics constStat = createConstStat("const", "string", 0, 1000, 5.0); // NULL constant - ColStatistics colStat = createStat("col", "string", 0, 10, 5.0); // Column with unknown NDV - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(constStat); - combiner.add(colStat); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(0, combined.getCountDistint(), "Non-constant with NDV=0 should propagate unknown even if combined with constant"); - } - - @Test - void testCombinedResultIsNotConst() { - ColStatistics constA = createConstStat("A", "string", 1, 0, 5.0); - ColStatistics constB = createConstStat("B", "string", 1, 0, 5.0); - - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); - combiner.add(constA); - combiner.add(constB); - - ColStatistics combined = combiner.getResult().get(); - assertEquals(false, combined.isConst(), "Combined result should not be marked as constant"); - } - private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { ColStatistics stat = new ColStatistics(name, type); stat.setCountDistint(ndv); @@ -340,10 +201,4 @@ private ColStatistics createStat(String name, String type, long ndv, long numNul stat.setAvgColLen(avgColLen); return stat; } - - private ColStatistics createConstStat(String name, String type, long ndv, long numNulls, double avgColLen) { - ColStatistics stat = createStat(name, type, ndv, numNulls, avgColLen); - stat.setConst(true); - return stat; - } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java deleted file mode 100644 index 09140f3d15bb..000000000000 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java +++ 
/dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.hadoop.hive.ql.stats.estimator; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotSame; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.Arrays; -import java.util.Collections; -import java.util.Optional; - -import org.apache.hadoop.hive.ql.plan.ColStatistics; -import org.junit.jupiter.api.Test; - -class TestStatEstimator { - - @Test - void testDefaultEstimateWithEmptyList() { - StatEstimator estimator = new StatEstimator() {}; - Optional result = estimator.estimate(Collections.emptyList()); - assertFalse(result.isPresent(), "Empty list should return empty Optional"); - } - - @Test - void testDefaultEstimateClonesFirstArg() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat)); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint()); - assertEquals(10, result.get().getNumNulls()); - assertEquals(4.0, 
result.get().getAvgColLen()); - } - - @Test - void testDefaultEstimateReturnsCloneNotSameReference() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat)); - - assertTrue(result.isPresent()); - assertNotSame(stat, result.get(), "Should return a clone, not the same reference"); - stat.setCountDistint(999); - assertEquals(100, result.get().getCountDistint(), "Clone should not be affected by original changes"); - } - - @Test - void testDefaultEstimateIgnoresSubsequentArgs() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat1 = createStat("col1", "int", 100, 10, 4.0); - ColStatistics stat2 = createStat("col2", "int", 200, 20, 8.0); - - Optional result = estimator.estimate(Arrays.asList(stat1, stat2)); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint(), "Should use first arg's NDV"); - assertEquals(10, result.get().getNumNulls(), "Should use first arg's numNulls"); - } - - @Test - void testDefaultEstimateWithNumRowsCapsNdv() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "NDV should be capped at numRows"); - } - - @Test - void testDefaultEstimateWithNumRowsNoCappingNeeded() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint(), "NDV should remain unchanged when less than numRows"); - } - - @Test - void testDefaultEstimateWithNumRowsExactlyEqual() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 
500, 10, 4.0); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "NDV should remain unchanged when equal to numRows"); - } - - @Test - void testDefaultEstimateWithNumRowsEmptyList() { - StatEstimator estimator = new StatEstimator() {}; - - Optional result = estimator.estimate(Collections.emptyList(), 500); - - assertFalse(result.isPresent(), "Empty list should return empty Optional"); - } - - @Test - void testDefaultEstimateWithNumRowsPreservesOtherStats() { - StatEstimator estimator = new StatEstimator() {}; - ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); - stat.setNumTrues(50); - stat.setNumFalses(40); - - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "NDV should be capped"); - assertEquals(10, result.get().getNumNulls(), "numNulls should be preserved"); - assertEquals(4.0, result.get().getAvgColLen(), "avgColLen should be preserved"); - } - - @Test - void testStatEstimatorProviderDefaultReturnsWorkingEstimator() { - StatEstimatorProvider provider = new StatEstimatorProvider() {}; - StatEstimator estimator = provider.getStatEstimator(); - - ColStatistics stat = createStat("col1", "int", 100, 10, 4.0); - Optional result = estimator.estimate(Arrays.asList(stat)); - - assertTrue(result.isPresent()); - assertEquals(100, result.get().getCountDistint()); - } - - @Test - void testStatEstimatorProviderDefaultCapsNdv() { - StatEstimatorProvider provider = new StatEstimatorProvider() {}; - StatEstimator estimator = provider.getStatEstimator(); - - ColStatistics stat = createStat("col1", "int", 1000, 10, 4.0); - Optional result = estimator.estimate(Arrays.asList(stat), 500); - - assertTrue(result.isPresent()); - assertEquals(500, result.get().getCountDistint(), "Default provider estimator should cap NDV"); - } - - private ColStatistics 
createStat(String name, String type, long ndv, long numNulls, double avgColLen) { - ColStatistics stat = new ColStatistics(name, type); - stat.setCountDistint(ndv); - stat.setNumNulls(numNulls); - stat.setAvgColLen(avgColLen); - return stat; - } -} diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q index 11986d696aff..2ce2ced59e04 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_6.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_6.q @@ -1,12 +1,10 @@ --! qt:dataset:srcpart --- this ensures consistent split file counts between localhost & CI runs -set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; set hive.merge.smallfiles.avgsize=200; set mapred.input.dir.recursive=true; -set hive.merge.mapfiles=false; +set hive.merge.mapfiles=false; set hive.merge.mapredfiles=false; -- list bucketing DML: dynamic partition. multiple skewed columns. merge. 
@@ -45,13 +43,13 @@ set hive.merge.mapredfiles=false; -- 87 000000_0 -- 87 000001_0 -- with merge --- 118 000002_0 +-- 118 000002_0 -- SORT_QUERY_RESULTS -- create a skewed table -create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE; @@ -94,3 +92,6 @@ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_ select * from list_bucketing_dynamic_part_n3 where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; +-- clean up +drop table list_bucketing_dynamic_part_n3; + diff --git a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q index a4a21aaa1ceb..f80585e56c6f 100644 --- a/ql/src/test/queries/clientpositive/list_bucket_dml_7.q +++ b/ql/src/test/queries/clientpositive/list_bucket_dml_7.q @@ -1,6 +1,4 @@ --! qt:dataset:srcpart --- this ensures consistent split file counts between localhost & CI runs -set tez.grouping.split-count=1; set hive.mapred.mode=nonstrict; set hive.exec.dynamic.partition=true; set hive.input.format=org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat; @@ -41,10 +39,10 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); -set hive.merge.mapfiles=true; +set hive.merge.mapfiles=true; set hive.merge.mapredfiles=true; -- list bucketing DML with merge. 
use bucketize to generate a few small files. explain extended @@ -56,7 +54,7 @@ select key, value, if(key % 100 == 0, 'a1', 'b1') from srcpart where ds = '2008- -- check DML result show partitions list_bucketing_dynamic_part; -desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); +desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='a1'); desc formatted list_bucketing_dynamic_part partition (ds='2008-04-08', hr='b1'); select count(1) from srcpart where ds = '2008-04-08'; @@ -67,3 +65,6 @@ explain extended select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from list_bucketing_dynamic_part where key = '484' and value = 'val_484'; select * from srcpart where ds = '2008-04-08' and key = '484' and value = 'val_484'; + +-- clean up +drop table list_bucketing_dynamic_part; diff --git a/ql/src/test/queries/clientpositive/ndv_case_const.q b/ql/src/test/queries/clientpositive/ndv_case_const.q index 42162d25c529..7132e163aa13 100644 --- a/ql/src/test/queries/clientpositive/ndv_case_const.q +++ b/ql/src/test/queries/clientpositive/ndv_case_const.q @@ -1,3 +1,6 @@ +-- Tests for CASE expression NDV estimation in Group By Operator. +-- Verifies that "Statistics: Num rows" reflects accurate NDV computation +-- when CASE branches contain constants, NULLs, and column references. 
CREATE TABLE t (cond INT, c2 STRING, c100 STRING); ALTER TABLE t UPDATE STATISTICS SET('numRows'='10000','rawDataSize'='1000000'); ALTER TABLE t UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0'); @@ -25,3 +28,13 @@ EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c2 WHEN cond=2 THEN c100 ELS EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' WHEN cond=3 THEN 'C' ELSE c2 END x FROM t) sub GROUP BY x; EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN 'A' WHEN cond=2 THEN 'B' ELSE c100 END x FROM t) sub GROUP BY x; + +-- Test NDV cap: sum of branch NDVs (100+100+100+1=301) exceeds numRows (200) +CREATE TABLE t_small (cond INT, c100a STRING, c100b STRING, c100c STRING); +ALTER TABLE t_small UPDATE STATISTICS SET('numRows'='200','rawDataSize'='20000'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100a SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100b SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); +ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100c SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10'); + +EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c100a WHEN cond=2 THEN c100b WHEN cond=3 THEN c100c ELSE 'A' END x FROM t_small) sub GROUP BY x; diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out index d9d5d18340eb..4f4a0b3df537 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_6.q.out @@ -1,17 +1,13 @@ -hive.merge.mapfiles=true -hive.merge.mapredfiles=false -hive.merge.tezfiles=false -hive.merge.smallfiles.avgsize=16000000 -PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds 
String, hr String) +PREHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE PREHOOK: type: CREATETABLE PREHOOK: Output: database:default PREHOOK: Output: default@list_bucketing_dynamic_part_n3 -POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) - partitioned by (ds String, hr String) +POSTHOOK: query: create table list_bucketing_dynamic_part_n3 (key String, value String) + partitioned by (ds String, hr String) skewed by (key, value) on (('484','val_484'),('51','val_14'),('103','val_103')) stored as DIRECTORIES STORED AS RCFILE @@ -902,3 +898,13 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 +PREHOOK: query: drop table list_bucketing_dynamic_part_n3 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@list_bucketing_dynamic_part_n3 +PREHOOK: Output: database:default +PREHOOK: Output: default@list_bucketing_dynamic_part_n3 +POSTHOOK: query: drop table list_bucketing_dynamic_part_n3 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@list_bucketing_dynamic_part_n3 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@list_bucketing_dynamic_part_n3 diff --git a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out index e1dbd260d038..6e45676ba107 100644 --- a/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out +++ b/ql/src/test/results/clientpositive/llap/list_bucket_dml_7.q.out @@ -898,3 +898,13 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 #### A masked pattern was here #### 484 val_484 2008-04-08 11 484 val_484 2008-04-08 12 +PREHOOK: query: drop table list_bucketing_dynamic_part +PREHOOK: type: DROPTABLE +PREHOOK: Input: 
default@list_bucketing_dynamic_part +PREHOOK: Output: database:default +PREHOOK: Output: default@list_bucketing_dynamic_part +POSTHOOK: query: drop table list_bucketing_dynamic_part +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@list_bucketing_dynamic_part +POSTHOOK: Output: database:default +POSTHOOK: Output: default@list_bucketing_dynamic_part diff --git a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out index 1d3c298a25c5..6539159d2eaa 100644 --- a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out +++ b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out @@ -397,13 +397,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 @@ -413,10 +413,10 @@ STAGE PLANS: keys: KEY._col0 (type: string) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat @@ -753,3 +753,116 @@ STAGE PLANS: Processor Tree: 
ListSink +PREHOOK: query: CREATE TABLE t_small (cond INT, c100a STRING, c100b STRING, c100c STRING) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@t_small +POSTHOOK: query: CREATE TABLE t_small (cond INT, c100a STRING, c100b STRING, c100c STRING) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS SET('numRows'='200','rawDataSize'='20000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS SET('numRows'='200','rawDataSize'='20000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN cond SET('numDVs'='10','numNulls'='0') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100a SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100a SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100b SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: 
default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100b SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100c SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@t_small +PREHOOK: Output: default@t_small +POSTHOOK: query: ALTER TABLE t_small UPDATE STATISTICS FOR COLUMN c100c SET('numDVs'='100','numNulls'='0','avgColLen'='5','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@t_small +POSTHOOK: Output: default@t_small +PREHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c100a WHEN cond=2 THEN c100b WHEN cond=3 THEN c100c ELSE 'A' END x FROM t_small) sub GROUP BY x +PREHOOK: type: QUERY +PREHOOK: Input: default@t_small +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN SELECT x FROM (SELECT CASE WHEN cond=1 THEN c100a WHEN cond=2 THEN c100b WHEN cond=3 THEN c100c ELSE 'A' END x FROM t_small) sub GROUP BY x +POSTHOOK: type: QUERY +POSTHOOK: Input: default@t_small +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: t_small + Statistics: Num rows: 200 Data size: 54200 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: CASE WHEN ((cond = 1)) THEN (c100a) WHEN ((cond = 2)) THEN (c100b) WHEN ((cond = 3)) THEN (c100c) ELSE ('A') END (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 200 Data size: 54200 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + 
keys: _col0 (type: string) + minReductionHashAggr: 0.4 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: string) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 200 Data size: 17800 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + From 4170c17b1951f5ff785ecf76536a19d863eb9395 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Wed, 25 Mar 2026 11:41:30 -0700 Subject: [PATCH 09/13] HIVE-29368: trigger a rebuild From dced9f51079c74ad0bb5e56266baa9d6ea1233c9 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Wed, 25 Mar 2026 17:57:33 -0700 Subject: [PATCH 10/13] HIVE-29368: buildColStatForConstant() can be simplified after removing NDV logic --- .../apache/hadoop/hive/ql/stats/StatsUtils.java | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 830c4b6c8cec..d75d87b81a4e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ 
b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1628,19 +1628,11 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis } private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) { - long numNulls = 0; - long countDistincts = 1; - if (encd.getValue() == null) { - numNulls = numRows; - } - String colType = encd.getTypeString(); - colType = colType.toLowerCase(); - ObjectInspector oi = encd.getWritableObjectInspector(); - double avgColSize = getAvgColLenOf(conf, oi, colType); + String colType = encd.getTypeString().toLowerCase(); ColStatistics colStats = new ColStatistics(encd.getName(), colType); - colStats.setAvgColLen(avgColSize); - colStats.setCountDistint(countDistincts); - colStats.setNumNulls(numNulls); + colStats.setAvgColLen(getAvgColLenOf(conf, encd.getWritableObjectInspector(), colType)); + colStats.setCountDistint(1); + colStats.setNumNulls(encd.getValue() == null ? numRows : 0); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); From 3690d4b61f7694743d578c5bf02238e514ef9a0e Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 27 Mar 2026 15:42:22 -0700 Subject: [PATCH 11/13] HIVE-29368: refactored StatEstimator to be aware of parent stats, and PessimisticStatCombiner to use numRows to identify "const NULL" ColStatistics instances --- .../hadoop/hive/ql/stats/StatsUtils.java | 8 +- .../estimator/PessimisticStatCombiner.java | 13 ++- .../ql/stats/estimator/StatEstimator.java | 17 +++- .../ql/udf/generic/GenericUDFCoalesce.java | 5 +- .../hive/ql/udf/generic/GenericUDFIf.java | 5 +- .../hive/ql/udf/generic/GenericUDFWhen.java | 5 +- .../hadoop/hive/ql/stats/TestStatsUtils.java | 2 +- .../TestPessimisticStatCombiner.java | 79 ++++++++++++++++--- .../clientpositive/llap/ndv_case_const.q.out | 8 +- 9 files changed, 111 insertions(+), 31 deletions(-) diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index d75d87b81a4e..9fd7593e2ce3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1578,7 +1578,7 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis csList.add(cs); } if (csList.size() == engfd.getChildren().size()) { - Optional res = se.estimate(csList); + Optional res = se.estimate(csList, parentStats); if (res.isPresent()) { ColStatistics newStats = res.get(); // NDV cannot exceed numRows @@ -1631,7 +1631,7 @@ private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows String colType = encd.getTypeString().toLowerCase(); ColStatistics colStats = new ColStatistics(encd.getName(), colType); colStats.setAvgColLen(getAvgColLenOf(conf, encd.getWritableObjectInspector(), colType)); - colStats.setCountDistint(1); + colStats.setCountDistint(encd.getValue() == null ? 0 : 1); colStats.setNumNulls(encd.getValue() == null ? numRows : 0); Optional value = getConstValue(encd); @@ -2099,9 +2099,7 @@ private static List extractNDVGroupingColumns(List colStats for (ColStatistics cs : colStats) { if (cs != null) { long ndv = cs.getCountDistint(); - // +1 for NULL group: source columns with partial nulls and known NDV only. - // Computed expressions include NULL. Ordered: numNulls>0 first (often false). 
- if (!cs.isEstimated() && cs.getNumNulls() > 0 && ndv > 0 && cs.getNumNulls() < parentStats.getNumRows()) { + if (cs.getNumNulls() > 0) { ndv = StatsUtils.safeAdd(ndv, 1); } ndvValues.add(ndv); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java index f84484c456be..48bb90820439 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/PessimisticStatCombiner.java @@ -28,10 +28,19 @@ */ public class PessimisticStatCombiner { + private final long numRows; private boolean inited; + private boolean hasUnknownNDV; private ColStatistics result; + public PessimisticStatCombiner(long numRows) { + this.numRows = numRows; + } + public void add(ColStatistics stat) { + // NDV==0 means unknown, unless it's a NULL constant (numNulls == numRows) + hasUnknownNDV = hasUnknownNDV || (stat.getCountDistint() == 0 && stat.getNumNulls() != numRows); + if (!inited) { inited = true; result = stat.clone(); @@ -42,9 +51,7 @@ public void add(ColStatistics stat) { if (stat.getAvgColLen() > result.getAvgColLen()) { result.setAvgColLen(stat.getAvgColLen()); } - // If any branch has NDV=0 (unknown stats), propagate unknown to result. - // Summing would treat unknown as zero, causing cardinality underestimates. 
- if (result.getCountDistint() == 0 || stat.getCountDistint() == 0) { + if (hasUnknownNDV) { result.setCountDistint(0); } else { result.setCountDistint(StatsUtils.safeAdd(result.getCountDistint(), stat.getCountDistint())); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java index 94aaa32ecfcb..80846fa24d30 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/estimator/StatEstimator.java @@ -22,6 +22,7 @@ import java.util.Optional; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; /** * Enables statistics related computation on UDFs @@ -39,5 +40,19 @@ public interface StatEstimator { * @param argStats the statistics for every argument of the UDF * @return {@link ColStatistics} estimate for the actual UDF. */ - public Optional estimate(List argStats); + default Optional estimate(List argStats) { + throw new UnsupportedOperationException("This estimator requires parentStats"); + } + + /** + * Computes the output statistics with access to parent statistics. + * Override this method when the estimator uses more info for accurate estimation. + * + * @param argStats the statistics for every argument of the UDF + * @param parentStats statistics from the parent operator + * @return {@link ColStatistics} estimate for the actual UDF. 
+ */ + default Optional estimate(List argStats, Statistics parentStats) { + return estimate(argStats); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java index bbca9242ecaa..1799669bda57 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFCoalesce.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressionsSupportDecimal64; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.ql.stats.estimator.PessimisticStatCombiner; @@ -89,8 +90,8 @@ public StatEstimator getStatEstimator() { static class CoalesceStatEstimator implements StatEstimator { @Override - public Optional estimate(List argStats) { - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + public Optional estimate(List argStats, Statistics parentStats) { + PessimisticStatCombiner combiner = new PessimisticStatCombiner(parentStats.getNumRows()); for (int i = 0; i < argStats.size(); i++) { combiner.add(argStats.get(i)); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java index eaa352317267..74bd2459debf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFIf.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressionsSupportDecimal64; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import 
org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; import org.apache.hadoop.hive.ql.stats.estimator.PessimisticStatCombiner; @@ -187,8 +188,8 @@ public StatEstimator getStatEstimator() { static class IfStatEstimator implements StatEstimator { @Override - public Optional estimate(List argStats) { - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + public Optional estimate(List argStats, Statistics parentStats) { + PessimisticStatCombiner combiner = new PessimisticStatCombiner(parentStats.getNumRows()); combiner.add(argStats.get(1)); combiner.add(argStats.get(2)); return combiner.getResult(); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java index e6d3580692d3..5dab62ab959e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFWhen.java @@ -25,6 +25,7 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.stats.estimator.PessimisticStatCombiner; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator; import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider; @@ -143,8 +144,8 @@ public StatEstimator getStatEstimator() { static class WhenStatEstimator implements StatEstimator { @Override - public Optional estimate(List argStats) { - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + public Optional estimate(List argStats, Statistics parentStats) { + PessimisticStatCombiner combiner = new PessimisticStatCombiner(parentStats.getNumRows()); for (int i = 1; i < argStats.size(); i += 2) { 
combiner.add(argStats.get(i)); } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index bc54e749834d..2b8cb07b6822 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -668,7 +668,7 @@ void testComputeNDVGroupingColumnsMixedEstimatedAndSource() { // Test for NDV cap after StatEstimator (NDV cannot exceed numRows) @Test - void testGetColStatisticsFromExpressionNdvCappedAtNumRows() throws Exception { + void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { HiveConf conf = new HiveConf(); conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_ESTIMATORS_ENABLE, true); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java index 9840cfeaf269..281e7b82c27a 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestPessimisticStatCombiner.java @@ -30,7 +30,7 @@ void testNdvSumWhenBothKnown() { ColStatistics stat1 = createStat("col1", "int", 50, 0, 4.0); ColStatistics stat2 = createStat("col2", "int", 30, 0, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -43,7 +43,7 @@ void testNdvUnknownPropagatedFromFirst() { ColStatistics stat1 = createStat("col1", "int", 0, 0, 4.0); ColStatistics stat2 = createStat("col2", "int", 100, 0, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -56,7 +56,7 @@ void testNdvUnknownPropagatedFromSecond() { ColStatistics stat1 = createStat("col1", 
"int", 100, 0, 4.0); ColStatistics stat2 = createStat("col2", "int", 0, 0, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -69,7 +69,7 @@ void testCombinePropagatesUnknownNumNullsFromFirst() { ColStatistics stat1 = createStat("col1", "int", 50, -1, 4.0); // unknown numNulls ColStatistics stat2 = createStat("col2", "int", 30, 100, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -82,7 +82,7 @@ void testCombinePropagatesUnknownNumNullsFromSecond() { ColStatistics stat1 = createStat("col1", "int", 50, 100, 4.0); ColStatistics stat2 = createStat("col2", "int", 30, -1, 4.0); // unknown numNulls - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -100,7 +100,7 @@ void testCombinePropagatesUnknownNumTruesFromFirst() { stat2.setNumTrues(100); stat2.setNumFalses(150); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -118,7 +118,7 @@ void testCombinePropagatesUnknownNumTruesFromSecond() { stat2.setNumTrues(-1); // unknown stat2.setNumFalses(150); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -136,7 +136,7 @@ void testCombinePropagatesUnknownNumFalsesFromFirst() { stat2.setNumTrues(50); stat2.setNumFalses(150); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ 
-154,7 +154,7 @@ void testCombinePropagatesUnknownNumFalsesFromSecond() { stat2.setNumTrues(50); stat2.setNumFalses(-1); // unknown - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -167,7 +167,7 @@ void testCombineBothUnknownNumNulls() { ColStatistics stat1 = createStat("col1", "int", 50, -1, 4.0); ColStatistics stat2 = createStat("col2", "int", 30, -1, 4.0); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -185,7 +185,7 @@ void testCombineBothUnknownNumTruesAndNumFalses() { stat2.setNumTrues(-1); stat2.setNumFalses(-1); - PessimisticStatCombiner combiner = new PessimisticStatCombiner(); + PessimisticStatCombiner combiner = new PessimisticStatCombiner(1000); combiner.add(stat1); combiner.add(stat2); @@ -194,6 +194,63 @@ void testCombineBothUnknownNumTruesAndNumFalses() { assertEquals(-1, combined.getNumFalses(), "Both unknown should result in unknown (-1)"); } + @Test + void testNullConstantDoesNotContributeToNdv() { + long numRows = 100; + ColStatistics nullConstant = createStat("null", "int", 0, numRows, 0.0); + ColStatistics regularStat = createStat("col", "int", 50, 10, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(nullConstant); + combiner.add(regularStat); + + ColStatistics result = combiner.getResult().get(); + assertEquals(50, result.getCountDistint(), "NULL constant should not contribute to NDV"); + } + + @Test + void testNullConstantAsSecondDoesNotContributeToNdv() { + long numRows = 100; + ColStatistics regularStat = createStat("col", "int", 50, 10, 4.0); + ColStatistics nullConstant = createStat("null", "int", 0, numRows, 0.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(regularStat); + 
combiner.add(nullConstant); + + ColStatistics result = combiner.getResult().get(); + assertEquals(50, result.getCountDistint(), "NULL constant should not contribute to NDV"); + } + + @Test + void testMultipleNullConstantsResultInZeroNdv() { + long numRows = 100; + ColStatistics nullConstant1 = createStat("null1", "int", 0, numRows, 0.0); + ColStatistics nullConstant2 = createStat("null2", "int", 0, numRows, 0.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(nullConstant1); + combiner.add(nullConstant2); + + ColStatistics result = combiner.getResult().get(); + assertEquals(0, result.getCountDistint(), "Multiple NULL constants should result in NDV=0"); + assertEquals(numRows, result.getNumNulls(), "numNulls should be numRows"); + } + + @Test + void testUnknownNdvNotConfusedWithNullConstant() { + long numRows = 100; + ColStatistics unknownNdv = createStat("col", "int", 0, 10, 4.0); + ColStatistics regularStat = createStat("col2", "int", 50, 5, 4.0); + + PessimisticStatCombiner combiner = new PessimisticStatCombiner(numRows); + combiner.add(unknownNdv); + combiner.add(regularStat); + + ColStatistics result = combiner.getResult().get(); + assertEquals(0, result.getCountDistint(), "Unknown NDV should propagate as 0"); + } + private ColStatistics createStat(String name, String type, long ndv, long numNulls, double avgColLen) { ColStatistics stat = new ColStatistics(name, type); stat.setCountDistint(ndv); diff --git a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out index 6539159d2eaa..a25b1e35b17d 100644 --- a/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out +++ b/ql/src/test/results/clientpositive/llap/ndv_case_const.q.out @@ -397,13 +397,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 
85 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: string) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: string) - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 @@ -413,10 +413,10 @@ STAGE PLANS: keys: KEY._col0 (type: string) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 4 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3 Data size: 85 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat From cca7fc6d4aa0cff5a9499c18090a59130177c11d Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Fri, 27 Mar 2026 15:47:46 -0700 Subject: [PATCH 12/13] HIVE-29368: fully reverted buildColStatForConstant to reduce the total diff --- .../hadoop/hive/ql/stats/StatsUtils.java | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 9fd7593e2ce3..1052e067966e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -1628,11 +1628,23 @@ public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statis } private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) { - String colType = encd.getTypeString().toLowerCase(); + + 
long numNulls = 0; + long countDistincts = 0; + if (encd.getValue() == null) { + // null projection + numNulls = numRows; + } else { + countDistincts = 1; + } + String colType = encd.getTypeString(); + colType = colType.toLowerCase(); + ObjectInspector oi = encd.getWritableObjectInspector(); + double avgColSize = getAvgColLenOf(conf, oi, colType); ColStatistics colStats = new ColStatistics(encd.getName(), colType); - colStats.setAvgColLen(getAvgColLenOf(conf, encd.getWritableObjectInspector(), colType)); - colStats.setCountDistint(encd.getValue() == null ? 0 : 1); - colStats.setNumNulls(encd.getValue() == null ? numRows : 0); + colStats.setAvgColLen(avgColSize); + colStats.setCountDistint(countDistincts); + colStats.setNumNulls(numNulls); Optional value = getConstValue(encd); value.ifPresent(number -> colStats.setRange(number, number)); From 8c2fd961d9ea31153f1debe7da3dceb8b56a6950 Mon Sep 17 00:00:00 2001 From: Konstantin Bereznyakov Date: Mon, 30 Mar 2026 10:53:15 -0700 Subject: [PATCH 13/13] HIVE-29368: tweaking extractNDVGroupingColumns conditions, test code fixes, new .out files --- .../hadoop/hive/ql/stats/StatsUtils.java | 5 +- .../hadoop/hive/ql/stats/TestStatsUtils.java | 147 ++++++++++-------- .../ql/stats/estimator/TestStatEstimator.java | 50 ++++++ .../llap/parquet_vectorization_13.q.out | 4 +- .../llap/parquet_vectorization_14.q.out | 2 +- .../llap/parquet_vectorization_15.q.out | 12 +- .../llap/parquet_vectorization_16.q.out | 2 +- .../llap/parquet_vectorization_9.q.out | 2 +- .../llap/vectorization_13.q.out | 4 +- .../llap/vectorization_14.q.out | 2 +- .../llap/vectorization_15.q.out | 12 +- .../llap/vectorization_16.q.out | 2 +- .../clientpositive/llap/vectorization_9.q.out | 2 +- .../llap/vectorization_short_regress.q.out | 18 +-- .../llap/vectorized_stats.q.out | 8 +- 15 files changed, 172 insertions(+), 100 deletions(-) create mode 100644 ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index 1052e067966e..dbb5565ff239 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -2111,7 +2111,10 @@ private static List extractNDVGroupingColumns(List colStats for (ColStatistics cs : colStats) { if (cs != null) { long ndv = cs.getCountDistint(); - if (cs.getNumNulls() > 0) { + // NDV needs to be adjusted if a column has a known NDV along with NULL values + // or if a column happens to be "const NULL" + if ((ndv > 0 && cs.getNumNulls() > 0) || + (ndv == 0 && !cs.isEstimated() && cs.getNumNulls() == parentStats.getNumRows())) { ndv = StatsUtils.safeAdd(ndv, 1); } ndvValues.add(ndv); diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index 2b8cb07b6822..48ee3b99cc35 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -19,7 +19,6 @@ package org.apache.hadoop.hive.ql.stats; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -45,7 +44,9 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.Statistics; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFCoalesce; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIf; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFWhen; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; import org.apache.hadoop.hive.serde.serdeConstants; import 
org.junit.jupiter.api.Test; @@ -506,51 +507,6 @@ void testScaleColStatisticsPreservesUnknownNumFalses() { assertEquals(-1, colStats.get(0).getNumFalses(), "Unknown numFalses (-1) should be preserved after scaling"); } - // Tests for buildColStatForConstant (via getColStatisticsFromExpression) - - @Test - void testGetColStatisticsFromExpressionNullConstant() { - HiveConf conf = new HiveConf(); - Statistics parentStats = new Statistics(1000, 8000, 0, 0); - - ExprNodeConstantDesc nullConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, null); - ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, nullConst); - - assertNotNull(cs); - assertEquals(1, cs.getCountDistint(), "NULL constant should have NDV=1"); - assertEquals(1000, cs.getNumNulls(), "NULL constant should have numNulls=numRows"); - assertFalse(cs.isEstimated(), "Constant stats should not be marked as estimated"); - } - - @Test - void testGetColStatisticsFromExpressionNonNullConstant() { - HiveConf conf = new HiveConf(); - Statistics parentStats = new Statistics(1000, 8000, 0, 0); - - ExprNodeConstantDesc strConst = new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "hello"); - ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, strConst); - - assertNotNull(cs); - assertEquals(1, cs.getCountDistint(), "Non-NULL constant should have NDV=1"); - assertEquals(0, cs.getNumNulls(), "Non-NULL constant should have numNulls=0"); - } - - @Test - void testGetColStatisticsFromExpressionIntConstant() { - HiveConf conf = new HiveConf(); - Statistics parentStats = new Statistics(500, 4000, 0, 0); - - ExprNodeConstantDesc intConst = new ExprNodeConstantDesc(TypeInfoFactory.intTypeInfo, 42); - ColStatistics cs = StatsUtils.getColStatisticsFromExpression(conf, parentStats, intConst); - - assertNotNull(cs); - assertEquals(1, cs.getCountDistint(), "Integer constant should have NDV=1"); - assertEquals(0, cs.getNumNulls(), "Integer constant should have 
numNulls=0"); - assertNotNull(cs.getRange(), "Integer constant should have a range"); - assertEquals(42, cs.getRange().minValue.intValue()); - assertEquals(42, cs.getRange().maxValue.intValue()); - } - // Tests for computeNDVGroupingColumns / extractNDVGroupingColumns @Test @@ -592,7 +548,7 @@ void testComputeNDVGroupingColumnsEstimatedExpression() { cs.setIsEstimated(true); // computed expression (e.g., CASE) long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); - assertEquals(3, ndv, "Estimated expression should NOT get +1 (already accounts for NULL)"); + assertEquals(4, ndv, "NDV with nulls: 3 + 1 = 4"); } @Test @@ -601,12 +557,26 @@ void testComputeNDVGroupingColumnsAllNullColumn() { parentStats.setColumnStatsState(Statistics.State.COMPLETE); ColStatistics cs = new ColStatistics("col1", "string"); - cs.setCountDistint(1); + cs.setCountDistint(0); cs.setNumNulls(1000); // all rows are NULL cs.setIsEstimated(false); long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); - assertEquals(1, ndv, "All-NULL column should NOT get +1 (numNulls == numRows)"); + assertEquals(1, ndv, "All-NULL column: NDV=0 but numNulls==numRows, so NDV becomes 1"); + } + + @Test + void testComputeNDVGroupingColumnsAllNullEstimatedColumn() { + Statistics parentStats = new Statistics(1000, 8000, 0, 0); + parentStats.setColumnStatsState(Statistics.State.COMPLETE); + + ColStatistics cs = new ColStatistics("case_expr", "string"); + cs.setCountDistint(0); + cs.setNumNulls(1000); // all rows are NULL + cs.setIsEstimated(true); // from expression like CASE + + long ndv = StatsUtils.computeNDVGroupingColumns(Arrays.asList(cs), parentStats, false); + assertEquals(0, ndv, "Estimated all-NULL column: NDV stays 0 (unknown from combiner)"); } @Test @@ -660,22 +630,15 @@ void testComputeNDVGroupingColumnsMixedEstimatedAndSource() { caseExpr.setIsEstimated(true); // estimated: no +1 long ndv = 
StatsUtils.computeNDVGroupingColumns(Arrays.asList(sourceCol, caseExpr), parentStats, false); - // sourceCol: 10 + 1 = 11, caseExpr: 3 (no +1) - // Product: 11 * 3 = 33 - assertEquals(33, ndv, "Mixed columns: source (10+1) * estimated (3) = 33"); + // sourceCol: 10 + 1 = 11, caseExpr: 3 + 1 = 4 + // Product: 11 * 4 = 44 + assertEquals(44, ndv, "Mixed columns: (10+1) * (3+1) = 44"); } - // Test for NDV cap after StatEstimator (NDV cannot exceed numRows) - @Test void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { - HiveConf conf = new HiveConf(); - conf.setBoolVar(HiveConf.ConfVars.HIVE_STATS_ESTIMATORS_ENABLE, true); - - // Create parent stats with only 100 rows Statistics parentStats = new Statistics(100, 800, 0, 0); - // Create column stats for col1 and col2 with high NDV (each 80) ColStatistics col1Stats = new ColStatistics("col1", "string"); col1Stats.setCountDistint(80); col1Stats.setNumNulls(0); @@ -688,9 +651,6 @@ void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); - // Create IF(true, col1, col2) expression - // IF uses PessimisticStatCombiner which sums NDVs: 80 + 80 = 160 - // But numRows is only 100, so NDV should be capped at 100 GenericUDFIf udfIf = new GenericUDFIf(); ExprNodeConstantDesc condExpr = new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, true); ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); @@ -700,11 +660,70 @@ void testGetColStatisticsFromExpressionNdvCappedAtNumRows() { TypeInfoFactory.stringTypeInfo, udfIf, "if", Arrays.asList(condExpr, col1Expr, col2Expr)); - ColStatistics result = StatsUtils.getColStatisticsFromExpression(conf, parentStats, ifExpr); + ColStatistics result = StatsUtils.getColStatisticsFromExpression(new HiveConf(), parentStats, ifExpr); assertNotNull(result); - // PessimisticStatCombiner would produce 80 + 80 = 160, but cap ensures NDV <= numRows (100) assertEquals(100, 
result.getCountDistint(), "NDV should be capped at numRows (100), not 160"); } + @Test + void testGetColStatisticsFromExpressionWhenNdvCapped() { + Statistics parentStats = new Statistics(100, 800, 0, 0); + + ColStatistics col1Stats = new ColStatistics("col1", "string"); + col1Stats.setCountDistint(60); + col1Stats.setNumNulls(0); + col1Stats.setAvgColLen(10); + + ColStatistics col2Stats = new ColStatistics("col2", "string"); + col2Stats.setCountDistint(70); + col2Stats.setNumNulls(0); + col2Stats.setAvgColLen(10); + + parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); + + GenericUDFWhen udfWhen = new GenericUDFWhen(); + ExprNodeConstantDesc condExpr = new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, true); + ExprNodeColumnDesc col1Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); + ExprNodeColumnDesc col2Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col2", "t", false); + + ExprNodeGenericFuncDesc whenExpr = new ExprNodeGenericFuncDesc( + TypeInfoFactory.stringTypeInfo, udfWhen, "when", + Arrays.asList(condExpr, col1Expr, col2Expr)); + + ColStatistics result = StatsUtils.getColStatisticsFromExpression(new HiveConf(), parentStats, whenExpr); + + assertNotNull(result); + assertEquals(100, result.getCountDistint(), "NDV should be capped at numRows (100), not 130"); + } + + @Test + void testGetColStatisticsFromExpressionCoalesceNdvCapped() { + Statistics parentStats = new Statistics(100, 800, 0, 0); + + ColStatistics col1Stats = new ColStatistics("col1", "string"); + col1Stats.setCountDistint(50); + col1Stats.setNumNulls(20); + col1Stats.setAvgColLen(10); + + ColStatistics col2Stats = new ColStatistics("col2", "string"); + col2Stats.setCountDistint(80); + col2Stats.setNumNulls(10); + col2Stats.setAvgColLen(10); + + parentStats.setColumnStats(Arrays.asList(col1Stats, col2Stats)); + + GenericUDFCoalesce udfCoalesce = new GenericUDFCoalesce(); + ExprNodeColumnDesc col1Expr = new 
ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col1", "t", false); + ExprNodeColumnDesc col2Expr = new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col2", "t", false); + + ExprNodeGenericFuncDesc coalesceExpr = new ExprNodeGenericFuncDesc( + TypeInfoFactory.stringTypeInfo, udfCoalesce, "coalesce", + Arrays.asList(col1Expr, col2Expr)); + + ColStatistics result = StatsUtils.getColStatisticsFromExpression(new HiveConf(), parentStats, coalesceExpr); + + assertNotNull(result); + assertEquals(100, result.getCountDistint(), "NDV should be capped at numRows (100), not 130"); + } } diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java new file mode 100644 index 000000000000..7fd715f4a98d --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/estimator/TestStatEstimator.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.stats.estimator; + +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.plan.ColStatistics; +import org.apache.hadoop.hive.ql.plan.Statistics; +import org.junit.jupiter.api.Test; + +class TestStatEstimator { + + @Test + void testDefaultEstimateThrowsUnsupportedOperation() { + StatEstimator estimator = new StatEstimator() {}; + List argStats = Arrays.asList(new ColStatistics("col", "int")); + + assertThrows(UnsupportedOperationException.class, () -> estimator.estimate(argStats), + "Default estimate(argStats) should throw UnsupportedOperationException"); + } + + @Test + void testDefaultEstimateWithParentStatsCallsEstimate() { + StatEstimator estimator = new StatEstimator() {}; + List argStats = Arrays.asList(new ColStatistics("col", "int")); + Statistics parentStats = new Statistics(100, 800, 0, 0); + + assertThrows(UnsupportedOperationException.class, () -> estimator.estimate(argStats, parentStats), + "Default estimate(argStats, parentStats) should delegate to estimate(argStats) which throws"); + } +} diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out index 7125704c33d2..2a223991fe36 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_13.q.out @@ -130,7 +130,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: 
COMPLETE Column stats: COMPLETE @@ -487,7 +487,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out index 5acc12c3b71d..12c88df0aebf 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_14.q.out @@ -120,7 +120,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7] keys: _col2 (type: string), _col1 (type: float), _col4 (type: double), _col0 (type: timestamp), _col3 (type: boolean) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12 Statistics: Num rows: 758 Data size: 130530 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out index d7d3f4919183..3653c9466248 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_15.q.out @@ -116,7 +116,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 
(type: int), _col6 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE @@ -154,16 +154,16 @@ STAGE PLANS: keys: KEY._col0 (type: float), KEY._col1 (type: boolean), KEY._col2 (type: double), KEY._col3 (type: string), KEY._col4 (type: tinyint), KEY._col5 (type: int), KEY._col6 (type: timestamp) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 - Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 1208432 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp), power(((_col7 - ((_col8 * _col8) / _col9)) / if((_col9 = 1L), null, (_col9 - 1))), 0.5) (type: double), (-26.28 - CAST( _col5 AS decimal(10,0))) (type: decimal(13,2)), _col10 (type: double), (_col2 * 79.553D) (type: double), (33.0 % _col0) (type: float), power(((_col11 - ((_col12 * _col12) / _col13)) / if((_col13 = 1L), null, (_col13 - 1))), 0.5) (type: double), ((_col11 - ((_col12 * _col12) / _col13)) / _col13) (type: double), (-23.0D % _col2) (type: double), (- _col4) (type: tinyint), ((_col14 - ((_col15 * _col15) / _col16)) / if((_col16 = 1L), null, (_col16 - 1))) (type: double), (UDFToFloat(_col5) - _col0) (type: float), (-23 % UDFToInteger(_col4)) (type: int), (- (-26.28 - CAST( _col5 AS decimal(10,0)))) (type: decimal(13,2)), power(((_col14 - ((_col15 * _col15) / _col16)) / _col16), 0.5) (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, 
_col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp) null sort order: zzzzzzz sort order: +++++++ - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col7 (type: double), _col8 (type: decimal(13,2)), _col9 (type: double), _col10 (type: double), _col11 (type: float), _col12 (type: double), _col13 (type: double), _col14 (type: double), _col15 (type: tinyint), _col16 (type: double), _col17 (type: float), _col18 (type: int), _col19 (type: decimal(13,2)), _col20 (type: double) Reducer 3 Execution mode: llap @@ -175,10 +175,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: float), KEY.reducesinkkey1 (type: boolean), KEY.reducesinkkey2 (type: double), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: tinyint), KEY.reducesinkkey5 (type: int), KEY.reducesinkkey6 (type: timestamp), VALUE._col0 (type: double), VALUE._col1 (type: decimal(13,2)), VALUE._col2 (type: double), VALUE._col3 (type: double), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: double), VALUE._col7 (type: double), VALUE._col8 (type: tinyint), VALUE._col9 (type: double), VALUE._col10 (type: float), VALUE._col11 (type: int), VALUE._col12 (type: decimal(13,2)), VALUE._col13 (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 
6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out index eeab9c89af72..a457b27af643 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_16.q.out @@ -93,7 +93,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out b/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out index eeab9c89af72..a457b27af643 100644 --- a/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out +++ b/ql/src/test/results/clientpositive/llap/parquet_vectorization_9.q.out @@ -93,7 +93,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE 
Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_13.q.out b/ql/src/test/results/clientpositive/llap/vectorization_13.q.out index d1911fdb7f8b..0e96fffb2c09 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_13.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_13.q.out @@ -131,7 +131,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: COMPLETE Column stats: COMPLETE @@ -511,7 +511,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: boolean), _col1 (type: tinyint), _col2 (type: timestamp), _col3 (type: float), _col4 (type: string) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 Statistics: Num rows: 1386 Data size: 194258 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_14.q.out b/ql/src/test/results/clientpositive/llap/vectorization_14.q.out index 25bfeb19bfcf..62feb4c66cff 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_14.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_14.q.out @@ -121,7 +121,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7] keys: _col2 (type: string), _col1 (type: float), _col4 (type: double), _col0 (type: timestamp), _col3 (type: boolean) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: 
hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12 Statistics: Num rows: 758 Data size: 130530 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_15.q.out b/ql/src/test/results/clientpositive/llap/vectorization_15.q.out index 6732aba7edd2..8c20b9f5fca7 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_15.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_15.q.out @@ -117,7 +117,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] keys: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE @@ -163,16 +163,16 @@ STAGE PLANS: keys: KEY._col0 (type: float), KEY._col1 (type: boolean), KEY._col2 (type: double), KEY._col3 (type: string), KEY._col4 (type: tinyint), KEY._col5 (type: int), KEY._col6 (type: timestamp) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16 - Statistics: Num rows: 6144 Data size: 1216372 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 1208432 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp), power(((_col7 - ((_col8 * _col8) / _col9)) / if((_col9 = 1L), null, (_col9 - 1))), 0.5) (type: double), (-26.28 - CAST( _col5 AS 
decimal(10,0))) (type: decimal(13,2)), _col10 (type: double), (_col2 * 79.553D) (type: double), (33.0 % _col0) (type: float), power(((_col11 - ((_col12 * _col12) / _col13)) / if((_col13 = 1L), null, (_col13 - 1))), 0.5) (type: double), ((_col11 - ((_col12 * _col12) / _col13)) / _col13) (type: double), (-23.0D % _col2) (type: double), (- _col4) (type: tinyint), ((_col14 - ((_col15 * _col15) / _col16)) / if((_col16 = 1L), null, (_col16 - 1))) (type: double), (UDFToFloat(_col5) - _col0) (type: float), (-23 % UDFToInteger(_col4)) (type: int), (- (-26.28 - CAST( _col5 AS decimal(10,0)))) (type: decimal(13,2)), power(((_col14 - ((_col15 * _col15) / _col16)) / _col16), 0.5) (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: float), _col1 (type: boolean), _col2 (type: double), _col3 (type: string), _col4 (type: tinyint), _col5 (type: int), _col6 (type: timestamp) null sort order: zzzzzzz sort order: +++++++ - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col7 (type: double), _col8 (type: decimal(13,2)), _col9 (type: double), _col10 (type: double), _col11 (type: float), _col12 (type: double), _col13 (type: double), _col14 (type: double), _col15 (type: tinyint), _col16 (type: double), _col17 (type: float), _col18 (type: int), _col19 (type: decimal(13,2)), _col20 (type: double) Reducer 3 Execution mode: llap @@ -184,10 +184,10 @@ STAGE PLANS: Select Operator expressions: KEY.reducesinkkey0 (type: float), KEY.reducesinkkey1 (type: boolean), 
KEY.reducesinkkey2 (type: double), KEY.reducesinkkey3 (type: string), KEY.reducesinkkey4 (type: tinyint), KEY.reducesinkkey5 (type: int), KEY.reducesinkkey6 (type: timestamp), VALUE._col0 (type: double), VALUE._col1 (type: decimal(13,2)), VALUE._col2 (type: double), VALUE._col3 (type: double), VALUE._col4 (type: float), VALUE._col5 (type: double), VALUE._col6 (type: double), VALUE._col7 (type: double), VALUE._col8 (type: tinyint), VALUE._col9 (type: double), VALUE._col10 (type: float), VALUE._col11 (type: int), VALUE._col12 (type: decimal(13,2)), VALUE._col13 (type: double) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20 - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 6144 Data size: 2592628 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6104 Data size: 2575728 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat diff --git a/ql/src/test/results/clientpositive/llap/vectorization_16.q.out b/ql/src/test/results/clientpositive/llap/vectorization_16.q.out index 7e8cb81144fc..7de5092fc76d 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_16.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_16.q.out @@ -94,7 +94,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: 
COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_9.q.out b/ql/src/test/results/clientpositive/llap/vectorization_9.q.out index 7e8cb81144fc..7de5092fc76d 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_9.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_9.q.out @@ -94,7 +94,7 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3] keys: _col0 (type: string), _col1 (type: double), _col2 (type: timestamp) - minReductionHashAggr: 0.4 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6 Statistics: Num rows: 5979 Data size: 825318 Basic stats: COMPLETE Column stats: COMPLETE diff --git a/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out index da82903d7963..8f7d63935f59 100644 --- a/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out +++ b/ql/src/test/results/clientpositive/llap/vectorization_short_regress.q.out @@ -2949,10 +2949,10 @@ STAGE PLANS: vectorProcessingMode: HASH projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] keys: _col0 (type: timestamp), _col1 (type: string) - minReductionHashAggr: 0.5133463 + minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22 - Statistics: Num rows: 5980 Data size: 1579124 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 1622368 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: timestamp), _col1 (type: string) null sort order: zz @@ -2962,7 +2962,7 @@ STAGE PLANS: className: VectorReduceSinkMultiKeyOperator native: true nativeConditionsMet: 
hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 5980 Data size: 1579124 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 1622368 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: double), _col3 (type: double), _col4 (type: bigint), _col5 (type: bigint), _col6 (type: bigint), _col7 (type: bigint), _col8 (type: tinyint), _col9 (type: double), _col10 (type: double), _col11 (type: bigint), _col12 (type: double), _col13 (type: double), _col14 (type: bigint), _col15 (type: bigint), _col16 (type: bigint), _col17 (type: double), _col18 (type: bigint), _col19 (type: double), _col20 (type: double), _col21 (type: double), _col22 (type: bigint) Execution mode: vectorized, llap LLAP IO: all inputs @@ -2997,7 +2997,7 @@ STAGE PLANS: keys: KEY._col0 (type: timestamp), KEY._col1 (type: string) mode: mergepartial outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17, _col18, _col19, _col20, _col21, _col22 - Statistics: Num rows: 5980 Data size: 1579124 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 1578826 Basic stats: COMPLETE Column stats: COMPLETE Select Operator expressions: _col0 (type: timestamp), _col1 (type: string), power(((_col2 - ((_col3 * _col3) / _col4)) / _col4), 0.5) (type: double), (UDFToDouble(_col5) / _col6) (type: double), _col7 (type: bigint), _col8 (type: tinyint), ((_col9 - ((_col10 * _col10) / _col11)) / if((_col11 = 1L), null, (_col11 - 1))) (type: double), ((_col12 - ((_col13 * _col13) / _col14)) / _col14) (type: double), (UDFToDouble(_col15) / _col16) (type: double), ((_col12 - ((_col13 * _col13) / _col14)) / if((_col14 = 1L), null, (_col14 - 1))) (type: double), 
(_col17 / _col18) (type: double), _col19 (type: double), ((_col9 - ((_col10 * _col10) / _col11)) / _col11) (type: double), power(((_col20 - ((_col21 * _col21) / _col22)) / _col22), 0.5) (type: double), _col15 (type: bigint) outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14 @@ -3006,12 +3006,12 @@ STAGE PLANS: native: true projectedOutputColumnNums: [0, 1, 27, 29, 7, 8, 36, 40, 42, 49, 50, 19, 54, 59, 15] selectExpressions: FuncPowerDoubleToDouble(col 26:double)(children: DoubleColDivideLongColumn(col 25:double, col 4:bigint)(children: DoubleColSubtractDoubleColumn(col 2:double, col 24:double)(children: DoubleColDivideLongColumn(col 23:double, col 4:bigint)(children: DoubleColMultiplyDoubleColumn(col 3:double, col 3:double) -> 23:double) -> 24:double) -> 25:double) -> 26:double) -> 27:double, DoubleColDivideLongColumn(col 28:double, col 6:bigint)(children: CastLongToDouble(col 5:bigint) -> 28:double) -> 29:double, DoubleColDivideLongColumn(col 32:double, col 35:bigint)(children: DoubleColSubtractDoubleColumn(col 9:double, col 31:double)(children: DoubleColDivideLongColumn(col 30:double, col 11:bigint)(children: DoubleColMultiplyDoubleColumn(col 10:double, col 10:double) -> 30:double) -> 31:double) -> 32:double, IfExprNullCondExpr(col 33:boolean, null, col 34:bigint)(children: LongColEqualLongScalar(col 11:bigint, val 1) -> 33:boolean, LongColSubtractLongScalar(col 11:bigint, val 1) -> 34:bigint) -> 35:bigint) -> 36:double, DoubleColDivideLongColumn(col 39:double, col 14:bigint)(children: DoubleColSubtractDoubleColumn(col 12:double, col 38:double)(children: DoubleColDivideLongColumn(col 37:double, col 14:bigint)(children: DoubleColMultiplyDoubleColumn(col 13:double, col 13:double) -> 37:double) -> 38:double) -> 39:double) -> 40:double, DoubleColDivideLongColumn(col 41:double, col 16:bigint)(children: CastLongToDouble(col 15:bigint) -> 41:double) -> 42:double, DoubleColDivideLongColumn(col 
45:double, col 48:bigint)(children: DoubleColSubtractDoubleColumn(col 12:double, col 44:double)(children: DoubleColDivideLongColumn(col 43:double, col 14:bigint)(children: DoubleColMultiplyDoubleColumn(col 13:double, col 13:double) -> 43:double) -> 44:double) -> 45:double, IfExprNullCondExpr(col 46:boolean, null, col 47:bigint)(children: LongColEqualLongScalar(col 14:bigint, val 1) -> 46:boolean, LongColSubtractLongScalar(col 14:bigint, val 1) -> 47:bigint) -> 48:bigint) -> 49:double, DoubleColDivideLongColumn(col 17:double, col 18:bigint) -> 50:double, DoubleColDivideLongColumn(col 53:double, col 11:bigint)(children: DoubleColSubtractDoubleColumn(col 9:double, col 52:double)(children: DoubleColDivideLongColumn(col 51:double, col 11:bigint)(children: DoubleColMultiplyDoubleColumn(col 10:double, col 10:double) -> 51:double) -> 52:double) -> 53:double) -> 54:double, FuncPowerDoubleToDouble(col 58:double)(children: DoubleColDivideLongColumn(col 57:double, col 22:bigint)(children: DoubleColSubtractDoubleColumn(col 20:double, col 56:double)(children: DoubleColDivideLongColumn(col 55:double, col 22:bigint)(children: DoubleColMultiplyDoubleColumn(col 21:double, col 21:double) -> 55:double) -> 56:double) -> 57:double) -> 58:double) -> 59:double - Statistics: Num rows: 5980 Data size: 1196404 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 1196170 Basic stats: COMPLETE Column stats: COMPLETE Top N Key Operator sort order: +++++++++++++++++++++++++++++++++++++++ keys: _col0 (type: timestamp), _col1 (type: string), _col2 (type: double), (_col2 * 10.175D) (type: double), (- _col2) (type: double), _col3 (type: double), (- _col2) (type: double), (-26.28D - _col2) (type: double), _col4 (type: bigint), (- _col4) (type: bigint), ((-26.28D - _col2) * (- _col2)) (type: double), _col5 (type: tinyint), (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4))) (type: double), (- (_col2 * 10.175D)) (type: double), _col6 (type: double), (_col6 + 
(((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) (type: double), _col2 (type: double), (UDFToDouble((- _col4)) / _col2) (type: double), _col7 (type: double), (10.175D / _col3) (type: double), _col8 (type: double), _col9 (type: double), ((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) - (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) (type: double), (_col2 * 10.175D) (type: double), _col10 (type: double), (((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) - (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) * 10.175D) (type: double), (10.175D % (10.175D / _col3)) (type: double), (- _col5) (type: tinyint), _col11 (type: double), _col12 (type: double), (- ((-26.28D - _col2) * (- _col2))) (type: double), ((- _col2) % _col10) (type: double), (-26.28 / CAST( (- _col5) AS decimal(3,0))) (type: decimal(8,6)), _col13 (type: double), _col14 (type: bigint), ((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) / _col7) (type: double), _col4 (type: bigint), _col4 (type: bigint), ((_col6 + (((-26.28D - _col2) * (- _col2)) * UDFToDouble((- _col4)))) % -26.28D) (type: double) null sort order: zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz - Statistics: Num rows: 5980 Data size: 1196404 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 1196170 Basic stats: COMPLETE Column stats: COMPLETE top n: 50 Top N Key Vectorization: className: VectorTopNKeyOperator @@ -3025,7 +3025,7 @@ STAGE PLANS: native: true projectedOutputColumnNums: [0, 1, 27, 23, 24, 29, 25, 26, 7, 35, 31, 8, 30, 32, 36, 28, 27, 38, 40, 37, 42, 49, 41, 39, 50, 43, 45, 48, 19, 54, 44, 52, 145, 59, 15, 53, 7, 7, 55] selectExpressions: DoubleColMultiplyDoubleScalar(col 27:double, val 10.175) -> 23:double, DoubleColUnaryMinus(col 27:double) -> 24:double, DoubleColUnaryMinus(col 27:double) -> 25:double, DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 26:double, LongColUnaryMinus(col 7:bigint) 
-> 35:bigint, DoubleColMultiplyDoubleColumn(col 28:double, col 30:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 28:double, DoubleColUnaryMinus(col 27:double) -> 30:double) -> 31:double, DoubleColMultiplyDoubleColumn(col 32:double, col 28:double)(children: DoubleColMultiplyDoubleColumn(col 28:double, col 30:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 28:double, DoubleColUnaryMinus(col 27:double) -> 30:double) -> 32:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 28:double) -> 30:double, DoubleColUnaryMinus(col 28:double)(children: DoubleColMultiplyDoubleScalar(col 27:double, val 10.175) -> 28:double) -> 32:double, DoubleColAddDoubleColumn(col 36:double, col 37:double)(children: DoubleColMultiplyDoubleColumn(col 38:double, col 28:double)(children: DoubleColMultiplyDoubleColumn(col 28:double, col 37:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 28:double, DoubleColUnaryMinus(col 27:double) -> 37:double) -> 38:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 28:double) -> 37:double) -> 28:double, DoubleColDivideDoubleColumn(col 37:double, col 27:double)(children: CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 37:double) -> 38:double, DoubleScalarDivideDoubleColumn(val 10.175, col 29:double) -> 37:double, DoubleColSubtractDoubleColumn(col 39:double, col 43:double)(children: DoubleColAddDoubleColumn(col 36:double, col 41:double)(children: DoubleColMultiplyDoubleColumn(col 43:double, col 39:double)(children: DoubleColMultiplyDoubleColumn(col 39:double, col 41:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 39:double, DoubleColUnaryMinus(col 27:double) -> 41:double) -> 43:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 39:double) -> 41:double) 
-> 39:double, DoubleColMultiplyDoubleColumn(col 44:double, col 41:double)(children: DoubleColMultiplyDoubleColumn(col 41:double, col 43:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 41:double, DoubleColUnaryMinus(col 27:double) -> 43:double) -> 44:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 41:double) -> 43:double) -> 41:double, DoubleColMultiplyDoubleScalar(col 27:double, val 10.175) -> 39:double, DoubleColMultiplyDoubleScalar(col 44:double, val 10.175)(children: DoubleColSubtractDoubleColumn(col 43:double, col 45:double)(children: DoubleColAddDoubleColumn(col 36:double, col 44:double)(children: DoubleColMultiplyDoubleColumn(col 45:double, col 43:double)(children: DoubleColMultiplyDoubleColumn(col 43:double, col 44:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 43:double, DoubleColUnaryMinus(col 27:double) -> 44:double) -> 45:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 43:double) -> 44:double) -> 43:double, DoubleColMultiplyDoubleColumn(col 51:double, col 44:double)(children: DoubleColMultiplyDoubleColumn(col 44:double, col 45:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 44:double, DoubleColUnaryMinus(col 27:double) -> 45:double) -> 51:double, CastLongToDouble(col 48:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 48:bigint) -> 44:double) -> 45:double) -> 44:double) -> 43:double, DoubleScalarModuloDoubleColumn(val 10.175, col 44:double)(children: DoubleScalarDivideDoubleColumn(val 10.175, col 29:double) -> 44:double) -> 45:double, LongColUnaryMinus(col 8:tinyint) -> 48:tinyint, DoubleColUnaryMinus(col 52:double)(children: DoubleColMultiplyDoubleColumn(col 44:double, col 51:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 44:double, DoubleColUnaryMinus(col 27:double) -> 51:double) -> 52:double) -> 44:double, 
DoubleColModuloDoubleColumn(col 51:double, col 50:double)(children: DoubleColUnaryMinus(col 27:double) -> 51:double) -> 52:double, DecimalScalarDivideDecimalColumn(val -26.28, col 127:decimal(3,0))(children: CastLongToDecimal(col 71:tinyint)(children: LongColUnaryMinus(col 8:tinyint) -> 71:tinyint) -> 127:decimal(3,0)) -> 145:decimal(8,6), DoubleColDivideDoubleColumn(col 51:double, col 40:double)(children: DoubleColAddDoubleColumn(col 36:double, col 53:double)(children: DoubleColMultiplyDoubleColumn(col 55:double, col 51:double)(children: DoubleColMultiplyDoubleColumn(col 51:double, col 53:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 51:double, DoubleColUnaryMinus(col 27:double) -> 53:double) -> 55:double, CastLongToDouble(col 71:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 71:bigint) -> 51:double) -> 53:double) -> 51:double) -> 53:double, DoubleColModuloDoubleScalar(col 51:double, val -26.28)(children: DoubleColAddDoubleColumn(col 36:double, col 55:double)(children: DoubleColMultiplyDoubleColumn(col 56:double, col 51:double)(children: DoubleColMultiplyDoubleColumn(col 51:double, col 55:double)(children: DoubleScalarSubtractDoubleColumn(val -26.28, col 27:double) -> 51:double, DoubleColUnaryMinus(col 27:double) -> 55:double) -> 56:double, CastLongToDouble(col 71:bigint)(children: LongColUnaryMinus(col 7:bigint) -> 71:bigint) -> 51:double) -> 55:double) -> 51:double) -> 55:double - Statistics: Num rows: 5980 Data size: 2739514 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 2738988 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: timestamp), _col1 (type: string), _col2 (type: double), _col3 (type: double), _col4 (type: double), _col5 (type: double), _col6 (type: double), _col7 (type: double), _col8 (type: bigint), _col9 (type: bigint), _col10 (type: double), _col11 (type: tinyint), _col12 (type: double), _col13 (type: double), _col14 
(type: double), _col15 (type: double), _col16 (type: double), _col17 (type: double), _col18 (type: double), _col19 (type: double), _col20 (type: double), _col21 (type: double), _col22 (type: double), _col23 (type: double), _col24 (type: double), _col25 (type: double), _col26 (type: double), _col27 (type: tinyint), _col28 (type: double), _col29 (type: double), _col30 (type: double), _col31 (type: double), _col32 (type: decimal(8,6)), _col33 (type: double), _col34 (type: bigint), _col35 (type: double), _col36 (type: bigint), _col37 (type: bigint), _col38 (type: double) null sort order: zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz @@ -3034,7 +3034,7 @@ STAGE PLANS: className: VectorReduceSinkObjectHashOperator native: true nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true - Statistics: Num rows: 5980 Data size: 2739514 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 2738988 Basic stats: COMPLETE Column stats: COMPLETE Reducer 3 Execution mode: vectorized, llap Reduce Vectorization: @@ -3051,7 +3051,7 @@ STAGE PLANS: className: VectorSelectOperator native: true projectedOutputColumnNums: [0, 1, 2, 3, 4, 5, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 17, 18, 19, 20, 21, 22, 3, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 8, 8, 38] - Statistics: Num rows: 5980 Data size: 2739514 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 5979 Data size: 2738988 Basic stats: COMPLETE Column stats: COMPLETE Limit Number of rows: 50 Limit Vectorization: diff --git a/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out b/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out index af0c461861f3..0e1519cf20a9 100644 --- a/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out +++ 
b/ql/src/test/results/clientpositive/llap/vectorized_stats.q.out @@ -1207,13 +1207,13 @@ STAGE PLANS: minReductionHashAggr: 0.99 mode: hash outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 183480 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator key expressions: _col0 (type: timestamp) null sort order: z sort order: + Map-reduce partition columns: _col0 (type: timestamp) - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 6144 Data size: 183480 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs Reducer 2 @@ -1223,10 +1223,10 @@ STAGE PLANS: keys: KEY._col0 (type: timestamp) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3072 Data size: 91760 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 40 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 3072 Data size: 91760 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat