Skip to content

Commit 0176c0e

Browse files
committed
HIVE-29481: Fix NumberFormatException and zero results when querying insert_only tables with copy suffixes
1 parent ae366be commit 0176c0e

4 files changed

Lines changed: 145 additions & 18 deletions

File tree

ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -456,8 +456,14 @@ else if(ORIGINAL_PATTERN_COPY.matcher(bucketFileName).matches()) {
456456
return new BucketMetaData(bucketId, copyNumber);
457457
}
458458
else if (bucketFileName.startsWith(BUCKET_PREFIX)) {
459-
return new BucketMetaData(Integer
460-
.parseInt(bucketFileName.substring(bucketFileName.indexOf('_') + 1)), 0);
459+
String rest = bucketFileName.substring(BUCKET_PREFIX.length());
460+
int idxOfCopy = rest.indexOf(Utilities.COPY_KEYWORD);
461+
int copyNumber = 0;
462+
if (idxOfCopy >= 0) {
463+
copyNumber = Integer.parseInt(rest.substring(idxOfCopy + Utilities.COPY_KEYWORD.length()));
464+
rest = rest.substring(0, idxOfCopy);
465+
}
466+
return new BucketMetaData(Integer.parseInt(rest), copyNumber);
461467
}
462468
return INVALID;
463469
}
@@ -1443,7 +1449,7 @@ private static void findBestWorkingDeltas(ValidWriteIdList writeIdList, AcidDire
14431449
//and we want to end up with the best set containing all relevant data: delta_5_20 delta_51_60,
14441450
//subject to list of 'exceptions' in 'writeIdList' (not show in above example).
14451451
List<ParsedDelta> deltas = new ArrayList<>();
1446-
long current = directory.getBase() == null ? 0 : directory.getBase().getWriteId();
1452+
long current = directory.getBase() == null ? -1 : directory.getBase().getWriteId();
14471453
int lastStmtId = -1;
14481454
ParsedDelta prev = null;
14491455
for(ParsedDelta next: directory.getCurrentDirectories()) {
@@ -1943,6 +1949,9 @@ private static void processDeltaDir(Path deltadir, ValidWriteIdList writeIdList,
19431949
* checks {@code visibilityTxnId} to see if {@code child} is committed in current snapshot
19441950
*/
19451951
private static boolean isDirUsable(Path child, long visibilityTxnId, List<Path> aborted, ValidTxnList validTxnList) {
1952+
if (visibilityTxnId <= 0) {
1953+
return true;
1954+
}
19461955
if (validTxnList == null) {
19471956
throw new IllegalArgumentException("No ValidTxnList for " + child);
19481957
}

ql/src/test/org/apache/hadoop/hive/ql/io/TestAcidUtils.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,17 @@ public void testParsingBaseCopySuffix() throws Exception {
179179
assertEquals(123, p.getWriteId());
180180
}
181181

182+
@Test
183+
public void testBucketMetaDataParsing() throws Exception {
184+
AcidUtils.BucketMetaData b = AcidUtils.BucketMetaData.parse("bucket_-001_copy_1");
185+
assertEquals(-1, b.bucketId);
186+
assertEquals(1, b.copyNumber);
187+
188+
b = AcidUtils.BucketMetaData.parse("bucket_00123_copy_5");
189+
assertEquals(123, b.bucketId);
190+
assertEquals(5, b.copyNumber);
191+
}
192+
182193
@Test
183194
public void testOriginal() throws Exception {
184195
Configuration conf = new Configuration();
@@ -212,6 +223,30 @@ public void testOriginal() throws Exception {
212223
result.get(6).getFileStatus().getPath().toString());
213224
}
214225

226+
@Test
227+
public void testGetAcidStateWithNullValidTxnList() throws Exception {
228+
Configuration conf = new Configuration();
229+
MockFileSystem fs = new MockFileSystem(conf,
230+
new MockFile("mock:/tbl/part1/delta_0000001_0000001/bucket_0", 500, new byte[0]));
231+
// Not setting VALID_TXNS_KEY in conf, so validTxnList will be null
232+
AcidDirectory dir = AcidUtils.getAcidState(fs, new MockPath(fs, "mock:/tbl/part1"), conf,
233+
new ValidReaderWriteIdList("tbl:100:" + Long.MAX_VALUE + ":"), null, false);
234+
assertEquals(1, dir.getCurrentDirectories().size());
235+
assertEquals("mock:/tbl/part1/delta_0000001_0000001", dir.getCurrentDirectories().get(0).getPath().toString());
236+
}
237+
238+
@Test
239+
public void testGetAcidStateWithDeltaZero() throws Exception {
240+
Configuration conf = new Configuration();
241+
MockFileSystem fs = new MockFileSystem(conf,
242+
new MockFile("mock:/tbl/part1/delta_0000000_0000000_-001/bucket_0", 500, new byte[0]));
243+
// For non-transactional tables, the writeIdList might be empty or start from 0
244+
AcidDirectory dir = AcidUtils.getAcidState(fs, new MockPath(fs, "mock:/tbl/part1"), conf,
245+
new ValidReaderWriteIdList("tbl:100:0:"), null, false);
246+
// This is currently failing (returns 0) but should return 1
247+
assertEquals(1, dir.getCurrentDirectories().size());
248+
}
249+
215250
@Test
216251
public void testOriginalDeltas() throws Exception {
217252
Configuration conf = new Configuration();
Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,40 @@
11
set hive.support.concurrency=true;
22
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
33

4-
-- Test reproduction of HIVE-29481: NumberFormatException when querying insert_only tables with copy suffixes
5-
6-
-- Create a table with insert_only transactional property
4+
-- 1. Test Insert-Only Table (Reproduce HIVE-29481)
75
CREATE TABLE hive_29481_io(id INT) STORED AS ORC
86
LOCATION '${system:test.tmp.dir}/hive_29481_io'
97
TBLPROPERTIES('transactional'='true', 'transactional_properties'='insert_only');
108

11-
-- Insert some data to create a valid ACID delta directory
129
INSERT INTO hive_29481_io VALUES (1);
10+
INSERT INTO hive_29481_io VALUES (4),(5),(6);
11+
12+
-- Manually create a directory with a _copy_ suffix (common during name collisions in moves)
13+
-- AcidUtils used to fail with NumberFormatException on the suffix or ignore it if writeId was 0.
14+
dfs -mkdir ${system:test.tmp.dir}/hive_29481_io/delta_0000000_0000000_copy_1;
15+
-- Create a dummy file inside to ensure it's processed
16+
dfs -touchz ${system:test.tmp.dir}/hive_29481_io/delta_0000000_0000000_copy_1/bucket_00000;
1317

14-
-- Show existing directories
15-
dfs -ls -R ${system:test.tmp.dir}/hive_29481_io;
18+
SELECT * FROM hive_29481_io;
1619

17-
-- Manually create a directory with a _copy_ suffix to simulate the issue
18-
-- AcidUtils.ParsedDeltaLight.parse will be called on any directory starting with delta_
19-
-- We'll create it with writeId 2 (which is after the first insert's writeId 1)
20-
dfs -mkdir ${system:test.tmp.dir}/hive_29481_io/delta_0000002_0000002_0000_copy_1;
20+
-- 2. Test Full ACID Table (CRUD)
21+
CREATE TABLE hive_29481_crud(id INT, val STRING) STORED AS ORC
22+
LOCATION '${system:test.tmp.dir}/hive_29481_crud'
23+
TBLPROPERTIES('transactional'='true');
2124

22-
-- Verify it was created
23-
dfs -ls -R ${system:test.tmp.dir}/hive_29481_io;
25+
INSERT INTO hive_29481_crud VALUES (1, 'a'), (2, 'b');
2426

25-
-- This query should fail with NumberFormatException: For input string: "0000_copy_1"
26-
SELECT * FROM hive_29481_io;
27+
-- Update a row (creates delta directory)
28+
UPDATE hive_29481_crud SET val = 'updated' WHERE id = 1;
29+
30+
-- Delete a row (creates delete_delta directory)
31+
DELETE FROM hive_29481_crud WHERE id = 2;
32+
33+
-- Simulate a _copy_ suffix on a delete_delta directory
34+
dfs -mkdir ${system:test.tmp.dir}/hive_29481_crud/delete_delta_0000004_0000004_0000_copy_1;
35+
36+
SELECT * FROM hive_29481_crud;
2737

2838
-- Clean up
2939
DROP TABLE hive_29481_io;
40+
DROP TABLE hive_29481_crud;

ql/src/test/results/clientpositive/hive_29481.q.out

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,15 @@ POSTHOOK: type: QUERY
2121
POSTHOOK: Input: _dummy_database@_dummy_table
2222
POSTHOOK: Output: default@hive_29481_io
2323
POSTHOOK: Lineage: hive_29481_io.id SCRIPT []
24-
#### A masked pattern was here ####
24+
PREHOOK: query: INSERT INTO hive_29481_io VALUES (4),(5),(6)
25+
PREHOOK: type: QUERY
26+
PREHOOK: Input: _dummy_database@_dummy_table
27+
PREHOOK: Output: default@hive_29481_io
28+
POSTHOOK: query: INSERT INTO hive_29481_io VALUES (4),(5),(6)
29+
POSTHOOK: type: QUERY
30+
POSTHOOK: Input: _dummy_database@_dummy_table
31+
POSTHOOK: Output: default@hive_29481_io
32+
POSTHOOK: Lineage: hive_29481_io.id SCRIPT []
2533
PREHOOK: query: SELECT * FROM hive_29481_io
2634
PREHOOK: type: QUERY
2735
PREHOOK: Input: default@hive_29481_io
@@ -31,6 +39,62 @@ POSTHOOK: type: QUERY
3139
POSTHOOK: Input: default@hive_29481_io
3240
#### A masked pattern was here ####
3341
1
42+
4
43+
5
44+
6
45+
PREHOOK: query: CREATE TABLE hive_29481_crud(id INT, val STRING) STORED AS ORC
46+
#### A masked pattern was here ####
47+
TBLPROPERTIES('transactional'='true')
48+
PREHOOK: type: CREATETABLE
49+
#### A masked pattern was here ####
50+
PREHOOK: Output: database:default
51+
PREHOOK: Output: default@hive_29481_crud
52+
POSTHOOK: query: CREATE TABLE hive_29481_crud(id INT, val STRING) STORED AS ORC
53+
#### A masked pattern was here ####
54+
TBLPROPERTIES('transactional'='true')
55+
POSTHOOK: type: CREATETABLE
56+
#### A masked pattern was here ####
57+
POSTHOOK: Output: database:default
58+
POSTHOOK: Output: default@hive_29481_crud
59+
PREHOOK: query: INSERT INTO hive_29481_crud VALUES (1, 'a'), (2, 'b')
60+
PREHOOK: type: QUERY
61+
PREHOOK: Input: _dummy_database@_dummy_table
62+
PREHOOK: Output: default@hive_29481_crud
63+
POSTHOOK: query: INSERT INTO hive_29481_crud VALUES (1, 'a'), (2, 'b')
64+
POSTHOOK: type: QUERY
65+
POSTHOOK: Input: _dummy_database@_dummy_table
66+
POSTHOOK: Output: default@hive_29481_crud
67+
POSTHOOK: Lineage: hive_29481_crud.id SCRIPT []
68+
POSTHOOK: Lineage: hive_29481_crud.val SCRIPT []
69+
PREHOOK: query: UPDATE hive_29481_crud SET val = 'updated' WHERE id = 1
70+
PREHOOK: type: QUERY
71+
PREHOOK: Input: default@hive_29481_crud
72+
PREHOOK: Output: default@hive_29481_crud
73+
PREHOOK: Output: default@hive_29481_crud
74+
POSTHOOK: query: UPDATE hive_29481_crud SET val = 'updated' WHERE id = 1
75+
POSTHOOK: type: QUERY
76+
POSTHOOK: Input: default@hive_29481_crud
77+
POSTHOOK: Output: default@hive_29481_crud
78+
POSTHOOK: Output: default@hive_29481_crud
79+
POSTHOOK: Lineage: hive_29481_crud.id SIMPLE []
80+
POSTHOOK: Lineage: hive_29481_crud.val SIMPLE []
81+
PREHOOK: query: DELETE FROM hive_29481_crud WHERE id = 2
82+
PREHOOK: type: QUERY
83+
PREHOOK: Input: default@hive_29481_crud
84+
PREHOOK: Output: default@hive_29481_crud
85+
POSTHOOK: query: DELETE FROM hive_29481_crud WHERE id = 2
86+
POSTHOOK: type: QUERY
87+
POSTHOOK: Input: default@hive_29481_crud
88+
POSTHOOK: Output: default@hive_29481_crud
89+
PREHOOK: query: SELECT * FROM hive_29481_crud
90+
PREHOOK: type: QUERY
91+
PREHOOK: Input: default@hive_29481_crud
92+
#### A masked pattern was here ####
93+
POSTHOOK: query: SELECT * FROM hive_29481_crud
94+
POSTHOOK: type: QUERY
95+
POSTHOOK: Input: default@hive_29481_crud
96+
#### A masked pattern was here ####
97+
1 updated
3498
PREHOOK: query: DROP TABLE hive_29481_io
3599
PREHOOK: type: DROPTABLE
36100
PREHOOK: Input: default@hive_29481_io
@@ -39,3 +103,11 @@ POSTHOOK: query: DROP TABLE hive_29481_io
39103
POSTHOOK: type: DROPTABLE
40104
POSTHOOK: Input: default@hive_29481_io
41105
POSTHOOK: Output: default@hive_29481_io
106+
PREHOOK: query: DROP TABLE hive_29481_crud
107+
PREHOOK: type: DROPTABLE
108+
PREHOOK: Input: default@hive_29481_crud
109+
PREHOOK: Output: default@hive_29481_crud
110+
POSTHOOK: query: DROP TABLE hive_29481_crud
111+
POSTHOOK: type: DROPTABLE
112+
POSTHOOK: Input: default@hive_29481_crud
113+
POSTHOOK: Output: default@hive_29481_crud

0 commit comments

Comments
 (0)