Skip to content

Commit 9cf07c3

Browse files
committed
HIVE-29432: Autogather column statistics missing for tables containing a column with an unsupported type
1 parent 24e835c commit 9cf07c3

50 files changed

Lines changed: 1509 additions & 667 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsAutoGatherContext.java

Lines changed: 73 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
import java.util.HashMap;
2525
import java.util.List;
2626
import java.util.Map;
27-
import java.util.stream.Collectors;
27+
import java.util.function.Supplier;
2828

2929
import org.apache.hadoop.hive.conf.HiveConf;
3030
import org.apache.hadoop.hive.metastore.api.FieldSchema;
@@ -256,14 +256,32 @@ private void replaceSelectOperatorProcess(SelectOperator operator, Operator<? ex
256256
// |
257257

258258
// 1. deal with non-partition columns
259+
Map<String, Integer> columnNameToIndex = new HashMap<>();
260+
List<ColumnInfo> selRSSig = selRS.getSignature();
261+
for (int i = 0; i < selRSSig.size(); i++) {
262+
columnNameToIndex.putIfAbsent(selRSSig.get(i).getAlias(), i);
263+
}
259264
for (int i = 0; i < this.columns.size(); i++) {
260265
ColumnInfo col = columns.get(i);
266+
ObjectInspector objectInspector = col.getObjectInspector();
267+
if (objectInspector == null) {
268+
continue;
269+
}
270+
boolean columnSupported = isColumnSupported(objectInspector.getCategory(), col::getType);
271+
if (!columnSupported) {
272+
continue;
273+
}
274+
275+
Integer selRSIdx = columnNameToIndex.get(this.columns.get(i).getName());
276+
if (selRSIdx == null) {
277+
continue;
278+
}
261279
ExprNodeDesc exprNodeDesc = new ExprNodeColumnDesc(col);
262280
colList.add(exprNodeDesc);
263-
String internalName = selRS.getColumnNames().get(i);
281+
String internalName = selRS.getColumnNames().get(selRSIdx);
264282
columnNames.add(internalName);
265283
columnExprMap.put(internalName, exprNodeDesc);
266-
signature.add(selRS.getSignature().get(i));
284+
signature.add(selRSSig.get(selRSIdx));
267285
}
268286
// if there is any partition column (in static partition or dynamic
269287
// partition or mixed case)
@@ -280,7 +298,7 @@ private void replaceSelectOperatorProcess(SelectOperator operator, Operator<? ex
280298
}
281299
exprNodeDesc = new ExprNodeConstantDesc(partSpec.get(partColName));
282300
TypeInfo srcType = exprNodeDesc.getTypeInfo();
283-
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
301+
TypeInfo destType = selRSSig.get(this.columns.size() + i).getType();
284302
if (!srcType.equals(destType)) {
285303
// This may be possible when srcType is string but destType is integer
286304
exprNodeDesc = ExprNodeTypeCheck.getExprNodeDefaultExprProcessor()
@@ -292,7 +310,7 @@ private void replaceSelectOperatorProcess(SelectOperator operator, Operator<? ex
292310
dynamicPartBegin++;
293311
ColumnInfo col = columns.get(this.columns.size() + dynamicPartBegin);
294312
TypeInfo srcType = col.getType();
295-
TypeInfo destType = selRS.getSignature().get(this.columns.size() + i).getType();
313+
TypeInfo destType = selRSSig.get(this.columns.size() + i).getType();
296314
exprNodeDesc = new ExprNodeColumnDesc(col);
297315
if (!srcType.equals(destType)) {
298316
exprNodeDesc = ExprNodeTypeCheck.getExprNodeDefaultExprProcessor()
@@ -303,7 +321,7 @@ private void replaceSelectOperatorProcess(SelectOperator operator, Operator<? ex
303321
String internalName = selRS.getColumnNames().get(this.columns.size() + i);
304322
columnNames.add(internalName);
305323
columnExprMap.put(internalName, exprNodeDesc);
306-
signature.add(selRS.getSignature().get(this.columns.size() + i));
324+
signature.add(selRSSig.get(this.columns.size() + i));
307325
}
308326
operator.setConf(new SelectDesc(colList, columnNames));
309327
operator.setColumnExprMap(columnExprMap);
@@ -319,36 +337,61 @@ public boolean isInsertInto() {
319337
return isInsertInto;
320338
}
321339

322-
public static boolean canRunAutogatherStats(Operator curr) {
340+
/**
 * Tells whether a column of the given category/type is on the list of types accepted
 * for column statistics gathering.
 *
 * <p>Any category other than {@code PRIMITIVE} is rejected up front, without invoking the
 * supplier. For {@code PRIMITIVE} the supplied type is cast to {@code PrimitiveTypeInfo}
 * and accepted only when its primitive category is one of the cases enumerated in the
 * switch below; every other primitive category falls through to {@code default}.
 *
 * @param category         object-inspector category of the column
 * @param typeInfoSupplier supplies the column's {@code TypeInfo}; only called when
 *                         {@code category} is {@code PRIMITIVE}
 * @return {@code true} if the type is in the accepted list, {@code false} otherwise
 */
public static boolean isColumnSupported(ObjectInspector.Category category, Supplier<TypeInfo> typeInfoSupplier) {
341+
if (category != ObjectInspector.Category.PRIMITIVE) {
342+
return false;
343+
}
344+
// Resolved lazily: this point is only reached for PRIMITIVE columns.
// NOTE(review): the cast below assumes a PRIMITIVE category always comes with a
// PrimitiveTypeInfo and that the supplier never returns null - confirm against callers.
TypeInfo t = typeInfoSupplier.get();
345+
switch (((PrimitiveTypeInfo) t).getPrimitiveCategory()) {
346+
case BOOLEAN:
347+
case BYTE:
348+
case SHORT:
349+
case INT:
350+
case LONG:
351+
case TIMESTAMP:
352+
case FLOAT:
353+
case DOUBLE:
354+
case STRING:
355+
case CHAR:
356+
case VARCHAR:
357+
case BINARY:
358+
case DECIMAL:
359+
case DATE:
360+
return true;
361+
default:
362+
return false;
363+
}
364+
}
365+
366+
/**
 * Decides whether column statistics autogathering can run for the given destination
 * table and operator.
 *
 * <p>For a non-native table whose storage handler supports partitioning, every column
 * in the operator's schema must pass the check in {@code areAllColumnsSupported}.
 * For all other tables, a single column accepted by {@code isAnyColumnSupported} is enough.
 *
 * @param destinationTable table being written to; dereferenced without a null check
 * @param curr             operator whose schema signature is inspected
 * @return {@code true} if autogathering may run
 */
public static boolean canRunAutogatherStats(Table destinationTable, Operator curr) {
367+
if (destinationTable.isNonNative() && destinationTable.getStorageHandler().supportsPartitioning()) {
368+
// On partitioned tables, the partition key is needed to store the stats.
369+
// However, external tables (e.g. stored by iceberg) may not define partition keys,
370+
// i.e., org.apache.hadoop.hive.ql.metadata.Table.getPartitionKeys() returns null.
371+
// So keep the same behavior as before HIVE-29432, and only run stats autogather if all columns are supported.
372+
return areAllColumnsSupported(curr);
373+
}
374+
// Every other table: one supported column is sufficient.
return isAnyColumnSupported(curr);
375+
}
376+
377+
private static boolean areAllColumnsSupported(Operator curr) {
323378
// check the ObjectInspector
324379
for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
325-
if (cinfo.getIsVirtualCol()) {
326-
return false;
327-
} else if (cinfo.getObjectInspector().getCategory() != ObjectInspector.Category.PRIMITIVE) {
380+
if (cinfo.getIsVirtualCol() || !isColumnSupported(cinfo.getObjectInspector().getCategory(), cinfo::getType)) {
328381
return false;
329-
} else {
330-
switch (((PrimitiveTypeInfo) cinfo.getType()).getPrimitiveCategory()) {
331-
case BOOLEAN:
332-
case BYTE:
333-
case SHORT:
334-
case INT:
335-
case LONG:
336-
case TIMESTAMP:
337-
case FLOAT:
338-
case DOUBLE:
339-
case STRING:
340-
case CHAR:
341-
case VARCHAR:
342-
case BINARY:
343-
case DECIMAL:
344-
case DATE:
345-
break;
346-
default:
347-
return false;
348-
}
349382
}
350383
}
351384
return true;
352385
}
353386

387+
/**
 * Scans the operator's schema signature and reports whether at least one column is
 * both non-virtual and accepted by {@code isColumnSupported}.
 *
 * @param curr operator whose schema signature is inspected
 * @return {@code true} on the first qualifying column; {@code false} if none qualifies
 *         (including when the signature is empty)
 */
private static boolean isAnyColumnSupported(Operator curr) {
388+
// check the ObjectInspector
389+
for (ColumnInfo cinfo : curr.getSchema().getSignature()) {
390+
// Virtual columns never count, whatever their type.
if (!cinfo.getIsVirtualCol() && isColumnSupported(cinfo.getObjectInspector().getCategory(), cinfo::getType)) {
391+
return true;
392+
}
393+
}
394+
return false;
395+
}
396+
354397
}

ql/src/java/org/apache/hadoop/hive/ql/parse/ColumnStatsSemanticAnalyzer.java

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949
import org.apache.hadoop.hive.ql.stats.ColStatsProcessor.ColumnStatsField;
5050
import org.apache.hadoop.hive.ql.stats.ColStatsProcessor.ColumnStatsType;
5151
import org.apache.hadoop.hive.ql.stats.StatsUtils;
52-
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
5352
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
5453
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
5554
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
@@ -103,11 +102,27 @@ private boolean shouldRewrite(ASTNode tree) {
103102
return rwt;
104103
}
105104

105+
/**
106+
* Get the names of the columns that support column statistics.
107+
*/
108+
private static List<String> getColumnNames(Table tbl) {
109+
List<String> colNames = new ArrayList<>();
110+
// Walk tbl.getCols() in order, keeping only the columns whose type passes the check.
for (FieldSchema col : tbl.getCols()) {
111+
String type = col.getType();
112+
// Parse the declared type string into a TypeInfo so its category can be checked.
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
113+
// The TypeInfo is already resolved, so it is handed over through a constant supplier.
boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo);
114+
if (isSupported) {
115+
colNames.add(col.getName());
116+
}
117+
}
118+
return colNames;
119+
}
120+
106121
private List<String> getColumnName(ASTNode tree) throws SemanticException {
107122

108123
switch (tree.getChildCount()) {
109124
case 2:
110-
return Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
125+
return getColumnNames(tbl);
111126
case 3:
112127
int numCols = tree.getChild(2).getChildCount();
113128
List<String> colName = new ArrayList<>(numCols);
@@ -212,7 +227,8 @@ protected static List<String> getColumnTypes(Table tbl, List<String> colNames) {
212227
if (colName.equalsIgnoreCase(col.getName())) {
213228
String type = col.getType();
214229
TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(type);
215-
if (typeInfo.getCategory() != ObjectInspector.Category.PRIMITIVE) {
230+
boolean isSupported = ColumnStatsAutoGatherContext.isColumnSupported(typeInfo.getCategory(), () -> typeInfo);
231+
if (!isSupported) {
216232
logTypeWarning(colName, type);
217233
colNames.remove(colName);
218234
} else {
@@ -241,7 +257,7 @@ private String genRewrittenQuery(List<String> colNames, List<String> colTypes, H
241257
protected static String genRewrittenQuery(Table tbl,
242258
HiveConf conf, List<TransformSpec> partTransformSpec, Map<String, String> partSpec,
243259
boolean isPartitionStats) {
244-
List<String> colNames = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
260+
List<String> colNames = getColumnNames(tbl);
245261
List<String> colTypes = ColumnStatsSemanticAnalyzer.getColumnTypes(tbl, colNames);
246262
return ColumnStatsSemanticAnalyzer.genRewrittenQuery(
247263
tbl, colNames, colTypes, conf, partTransformSpec, -1, partSpec, isPartitionStats, true);
@@ -733,7 +749,7 @@ static AnalyzeRewriteContext genAnalyzeRewriteContext(HiveConf conf, Table tbl)
733749
AnalyzeRewriteContext analyzeRewrite = new AnalyzeRewriteContext();
734750
analyzeRewrite.setTableName(tbl.getFullyQualifiedName());
735751
analyzeRewrite.setTblLvl(!(conf.getBoolVar(ConfVars.HIVE_STATS_COLLECT_PART_LEVEL_STATS) && tbl.isPartitioned()));
736-
List<String> colNames = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
752+
List<String> colNames = getColumnNames(tbl);
737753
List<String> colTypes = getColumnTypes(tbl, colNames);
738754
analyzeRewrite.setColName(colNames);
739755
analyzeRewrite.setColType(colTypes);

ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8260,8 +8260,9 @@ protected Operator genFileSinkPlan(String dest, QB qb, Operator input)
82608260
&& enableColumnStatsCollecting()
82618261
&& destinationTable != null
82628262
&& (!destinationTable.isNonNative() || destinationTable.getStorageHandler().commitInMoveTask())
8263-
&& !destTableIsTemporary && !destTableIsMaterialization
8264-
&& ColumnStatsAutoGatherContext.canRunAutogatherStats(fso)) {
8263+
&& !destTableIsTemporary
8264+
&& !destTableIsMaterialization
8265+
&& ColumnStatsAutoGatherContext.canRunAutogatherStats(destinationTable, fso)) {
82658266
if (destType == QBMetaData.DEST_TABLE) {
82668267
genAutoColumnStatsGatheringPipeline(destinationTable, partSpec, input,
82678268
qb.getParseInfo().isInsertIntoTable(destinationTable.getDbName(), destinationTable.getTableName(),
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
set hive.stats.autogather=true;
2+
set hive.stats.column.autogather=true;
3+
4+
-- create a table with a type that does not support column stats autogather
5+
-- the columns before and after that column should still obtain statistics
6+
CREATE TABLE test_stats1 (a int, b uniontype<int, string>, c int) STORED AS TEXTFILE;
7+
INSERT INTO test_stats1 (a, b, c) VALUES (1, create_union(0, 2, ""), 3);
8+
DESCRIBE FORMATTED test_stats1 a;
9+
DESCRIBE FORMATTED test_stats1 b;
10+
DESCRIBE FORMATTED test_stats1 c;
11+

ql/src/test/results/clientnegative/avro_non_nullable_union.q.out

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,8 @@ Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Avro could not vali
5656
Caused by: Avro could not validate record against schema (record = {"id": 3, "value": null}) (schema = {"type":"record","name":"nullable","fields":[{"name":"id","type":"int"},{"name":"value","type":["int","double"]}]})
5757
#### A masked pattern was here ####
5858
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE]
59-
DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:0
59+
[Masked Vertex killed due to OTHER_VERTEX_FAILURE]
60+
DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:1
6061
FAILED: Execution Error, return code 2 from org.apache.hadoop.hive.ql.exec.tez.TezTask. Vertex failed, vertexName=Map 1, vertexId=vertex_#ID#, diagnostics=[Task failed, taskId=task_#ID#, diagnostics=[TaskAttempt 0 failed, info=[Error: Error while running task ( failure ) : attempt_#ID#:java.lang.RuntimeException: java.lang.RuntimeException: Hive Runtime Error while closing operators
6162
#### A masked pattern was here ####
6263
Caused by: java.lang.RuntimeException: Hive Runtime Error while closing operators
@@ -73,4 +74,4 @@ Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Avro could not vali
7374
#### A masked pattern was here ####
7475
Caused by: Avro could not validate record against schema (record = {"id": 3, "value": null}) (schema = {"type":"record","name":"nullable","fields":[{"name":"id","type":"int"},{"name":"value","type":["int","double"]}]})
7576
#### A masked pattern was here ####
76-
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE]DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:0
77+
]], Vertex did not succeed due to OWN_TASK_FAILURE, failedTasks:1 killedTasks:0, Vertex vertex_#ID# [Map 1] killed/failed due to:OWN_TASK_FAILURE][Masked Vertex killed due to OTHER_VERTEX_FAILURE]DAG did not succeed due to VERTEX_FAILURE. failedVertices:1 killedVertices:1

ql/src/test/results/clientpositive/llap/cast_null_to_complex.q.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ Retention: 0
8787
#### A masked pattern was here ####
8888
Table Type: MANAGED_TABLE
8989
Table Parameters:
90-
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
90+
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"_c2\":\"true\"}}
9191
bucketing_version 2
9292
numFiles 1
9393
numRows 1

ql/src/test/results/clientpositive/llap/columnarserde_create_shortcut.q.out

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ STAGE PLANS:
3030
#### A masked pattern was here ####
3131
Edges:
3232
Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
33+
Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
3334
#### A masked pattern was here ####
3435
Vertices:
3536
Map 1
@@ -64,6 +65,40 @@ STAGE PLANS:
6465
output format: org.apache.hadoop.hive.ql.io.RCFileOutputFormat
6566
serde: org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe
6667
name: default.columnarserde_create_shortcut
68+
Select Operator
69+
expressions: _col3 (type: int), _col4 (type: string)
70+
outputColumnNames: d, e
71+
Statistics: Num rows: 11 Data size: 34628 Basic stats: COMPLETE Column stats: NONE
72+
Group By Operator
73+
aggregations: min(d), max(d), count(1), count(d), compute_bit_vector_hll(d), max(length(e)), avg(COALESCE(length(e),0)), count(e), compute_bit_vector_hll(e)
74+
minReductionHashAggr: 0.99
75+
mode: hash
76+
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
77+
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
78+
Reduce Output Operator
79+
null sort order:
80+
sort order:
81+
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
82+
value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7 (type: bigint), _col8 (type: binary)
83+
Reducer 3
84+
Execution mode: vectorized, llap
85+
Reduce Operator Tree:
86+
Group By Operator
87+
aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8)
88+
mode: mergepartial
89+
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
90+
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
91+
Select Operator
92+
expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint), COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary)
93+
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11
94+
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
95+
File Output Operator
96+
compressed: false
97+
Statistics: Num rows: 1 Data size: 3548 Basic stats: COMPLETE Column stats: NONE
98+
table:
99+
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
100+
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
101+
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
67102

68103
Stage: Stage-2
69104
Dependency Collection
@@ -81,6 +116,10 @@ STAGE PLANS:
81116
Stage: Stage-3
82117
Stats Work
83118
Basic Stats Work:
119+
Column Stats Desc:
120+
Columns: d, e
121+
Column Types: int, string
122+
Table: default.columnarserde_create_shortcut
84123

85124
PREHOOK: query: FROM src_thrift
86125
INSERT OVERWRITE TABLE columnarserde_create_shortcut SELECT src_thrift.lint, src_thrift.lstring, src_thrift.mstringstring, src_thrift.aint, src_thrift.astring DISTRIBUTE BY 1

ql/src/test/results/clientpositive/llap/empty_result_ctas.q.out

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ Retention: 0
4141
#### A masked pattern was here ####
4242
Table Type: MANAGED_TABLE
4343
Table Parameters:
44-
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
44+
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c_primitive\":\"true\"}}
4545
bucketing_version 2
4646
numFiles 0
4747
numRows 0
@@ -92,7 +92,7 @@ Retention: 0
9292
#### A masked pattern was here ####
9393
Table Type: MANAGED_TABLE
9494
Table Parameters:
95-
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
95+
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"c_primitive\":\"true\"}}
9696
bucketing_version 2
9797
numFiles 0
9898
numRows 0
@@ -161,7 +161,7 @@ Retention: 0
161161
#### A masked pattern was here ####
162162
Table Type: MANAGED_TABLE
163163
Table Parameters:
164-
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
164+
COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\"}}
165165
bucketing_version 2
166166
numFiles 0
167167
numRows 0

0 commit comments

Comments
 (0)