Skip to content

Commit 7de8dd3

Browse files
popematt and tgregg authored
Adds class to generate bytecode for Ion 1.0 symbol tables (#1138)
--------- Co-authored-by: Tyler Gregg <greggt@amazon.com>
1 parent 4ead384 commit 7de8dd3

6 files changed

Lines changed: 1124 additions & 0 deletions

File tree

Lines changed: 376 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,376 @@
1+
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
package com.amazon.ion.bytecode.bin10
4+
5+
import com.amazon.ion.IonException
6+
import com.amazon.ion.SystemSymbols
7+
import com.amazon.ion.bytecode.ir.Instructions.I_DIRECTIVE_ADD_SYMBOLS
8+
import com.amazon.ion.bytecode.ir.Instructions.I_DIRECTIVE_SET_SYMBOLS
9+
import com.amazon.ion.bytecode.ir.Instructions.I_DIRECTIVE_USE
10+
import com.amazon.ion.bytecode.ir.Instructions.I_END_CONTAINER
11+
import com.amazon.ion.bytecode.ir.Instructions.I_INT_I32
12+
import com.amazon.ion.bytecode.ir.Instructions.I_NULL_NULL
13+
import com.amazon.ion.bytecode.ir.Instructions.I_STRING_CP
14+
import com.amazon.ion.bytecode.ir.Instructions.I_SYMBOL_CP
15+
import com.amazon.ion.bytecode.ir.Instructions.packInstructionData
16+
import com.amazon.ion.bytecode.ir.OperationKind
17+
import com.amazon.ion.bytecode.util.AppendableConstantPoolView
18+
import com.amazon.ion.bytecode.util.BytecodeBuffer
19+
import com.amazon.ion.bytecode.util.unsignedToInt
20+
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings
21+
22+
/**
23+
* Helper to generate Bytecode instructions for Ion 1.0 style symbol tables.
24+
*
25+
* We cannot meaningfully read a partial symbol table, so it seems a reasonable requirement that the entire symbol
26+
* table must be buffered before we generate bytecode for it. Therefore, this is re-usable for both continuable and
27+
* non-continuable bytecode generators.
28+
*
29+
* The Bytecode uses Ion 1.1 style directives, which don't quite align with Ion 1.0 directives.
30+
* So, we have two options:
31+
* 1. We can output the bytecode that is functionally equivalent to "classic" symbol tables
32+
* 2. We can add a "classic" symbol table directive to the bytecode
33+
*
34+
* For option 1, the generated bytecode is as follows:
35+
* - if there's no LST append, and no imports, then we can generate a SET_SYMBOLS instruction
36+
* - if there's LST append, and no imports, then we can generate an ADD_SYMBOLS instruction
37+
* - if there are imports, there cannot also be LST append.
38+
* - if there are imports, and it's not LST append, we can generate an empty SET_SYMBOLS, followed by a USE for all the
39+
* imports, followed by ADD_SYMBOLS with the local symbols.
40+
* - if there's LST append with no local symbols added, the ADD_SYMBOLS directive is elided completely, because
40+
* there is nothing to append.
42+
* - if there are imports and no local symbols, we can generate an empty SET_SYMBOLS, followed by USE for all the imports.
43+
* - if there are no imports, no LST append, and no local symbols, we can generate an empty SET_SYMBOLS.
44+
*
45+
* This means we need to buffer some data before emitting the bytecode, but that's okay.
46+
* We can put the strings into the constant pool, and keep track of the min and max, since there will be nothing else
47+
* that we would put in the constant pool while we're processing a symbol table.
48+
*
49+
* It's actually beneficial to put the symbol text strings in the constant pool now. They can be added to the symbol
50+
* table from the constant pool comparatively cheaply, and we can decode the strings eagerly without having as much
51+
* overhead from the control flow of calling `readTextReference()` over and over.
52+
*
53+
* Logic is roughly this:
54+
*
55+
* ```pseudocode
56+
* let symbolsStartInclusive = 0
57+
* let symbolsEndExclusive = 0
58+
* let isAppend = false
59+
* while hasMoreFields():
60+
* let fieldName = readFieldName()
61+
* switch(fieldName):
62+
* "imports":
63+
* let valueType = readValueType()
64+
* switch(valueType):
65+
* symbol:
66+
* readAndValidate "$ion_symbol_table"
67+
* isAppend = true
68+
* list:
69+
* bytecode.add2(SET_SYMBOLS, END_CONTAINER)
70+
* bytecode.add(USE)
71+
* while hasMoreListElements():
72+
* compileImport()
73+
* bytecode.add(END_CONTAINER)
74+
* isAppend = true
75+
* "symbols":
76+
* symbolsStartInclusive = sizeOf(constantPool)
77+
* for value in list:
78+
* if value is symbol:
79+
* constantPool.add(readText())
80+
* else:
81+
* constantPool.add(null)
82+
* symbolsEndExclusive = sizeOf(constantPool)
83+
*
84+
* if (isAppend):
85+
* bytecode.add(ADD_SYMBOLS)
86+
* else:
87+
* bytecode.add(SET_SYMBOLS)
88+
* for i in symbolsStartInclusive .. symbolsEndExclusive:
89+
* bytecode.add(SYMBOL_CP(i))
90+
* bytecode.add(END_CONTAINER)
91+
* ```
92+
*/
93+
internal object SymbolTableHelper {
94+
95+
/**
 * Mask for the low byte of a packed "value and length" long. In this file that byte is the number of
 * bytes by which the read position is advanced after decoding the value.
 */
private const val ONE_BYTE_MASK = 0xFF
96+
/**
 * Shift that separates the value portion of a packed "value and length" long from its low size byte.
 */
private const val ONE_BYTE_SHIFT = 8
97+
98+
/**
 * Translates one fully buffered Ion 1.0 local symbol table struct into symbol table directives in [dest].
 *
 * The struct is walked exactly once:
 * - an `imports` field holding a list is compiled immediately by [readImportsList];
 * - an `imports` field holding the symbol `$ion_symbol_table` marks the table as an append;
 * - the entries of a `symbols` list are copied into [cp], where they occupy one contiguous run of indices.
 *
 * After the walk, that run of constant pool entries is emitted as `SYMBOL_CP` instructions wrapped in an
 * `ADD_SYMBOLS` directive (for an append or a list of imports) or a `SET_SYMBOLS` directive (otherwise).
 * If `ADD_SYMBOLS` would contain no symbols, no directive is written for the local symbols at all.
 *
 * Any other field, and any `imports` or `symbols` field whose value has an unexpected type, is ignored.
 *
 * @param source buffer holding the symbol table struct
 * @param position index in [source] of the first field of the struct
 * @param structLength number of bytes occupied by the struct's fields
 * @param dest buffer that receives the generated instructions
 * @param cp constant pool that receives the symbol text and import names
 * @throws IonException if the struct has more than one `imports` field or more than one `symbols` field
 */
@JvmStatic
@SuppressFBWarnings("SF_SWITCH_NO_DEFAULT")
fun compileSymbolTable(
    source: ByteArray,
    position: Int,
    structLength: Int,
    dest: BytecodeBuffer,
    cp: AppendableConstantPoolView
) {
    var importsFieldSeen = false
    var symbolsFieldSeen = false
    var appendToExistingSymbols = false

    // Half-open range [firstSymbolCpIndex, endSymbolCpIndex) of constant pool slots holding local symbol text.
    var firstSymbolCpIndex = 0
    var endSymbolCpIndex = 0

    iterateStruct(source, position, structLength) { fieldSid, fieldTid, valueStart, valueLength ->
        val valueKind = TypeIdHelper.operationKindForTypeId(fieldTid)
        when (fieldSid) {
            SystemSymbols.IMPORTS_SID -> {
                if (importsFieldSeen) throw IonException("Multiple imports fields found within a single local symbol table.")
                importsFieldSeen = true
                if (valueKind == OperationKind.SYMBOL) {
                    // Only the symbol $ion_symbol_table requests an append; any other symbol is ignored.
                    val importedSid = readUInt(source, valueStart, valueLength).toInt()
                    if (importedSid == SystemSymbols.ION_SYMBOL_TABLE_SID) appendToExistingSymbols = true
                } else if (valueKind == OperationKind.LIST) {
                    // The import directives are written right away; local symbols are appended after them.
                    readImportsList(source, valueStart, valueLength, dest, cp)
                    appendToExistingSymbols = true
                }
            }
            SystemSymbols.SYMBOLS_SID -> {
                if (symbolsFieldSeen) throw IonException("Multiple symbols fields found within a single local symbol table.")
                symbolsFieldSeen = true
                if (valueKind == OperationKind.LIST) {
                    firstSymbolCpIndex = cp.size
                    readSymbolsList(source, valueStart, valueLength, cp)
                    endSymbolCpIndex = cp.size
                }
            }
        }
    }

    // Nothing to append, so no ADD_SYMBOLS directive is written.
    if (appendToExistingSymbols && firstSymbolCpIndex == endSymbolCpIndex) return

    dest.add(if (appendToExistingSymbols) I_DIRECTIVE_ADD_SYMBOLS else I_DIRECTIVE_SET_SYMBOLS)
    var cpIndex = firstSymbolCpIndex
    while (cpIndex < endSymbolCpIndex) {
        dest.add(I_SYMBOL_CP.packInstructionData(cpIndex))
        cpIndex++
    }
    dest.add(I_END_CONTAINER)
}
155+
156+
/**
 * Compiles the list form of the `imports` field.
 *
 * An empty `SET_SYMBOLS` directive is always written first, to clear the default module symbols. If at least
 * one element of the list produces import bytecode, that bytecode follows inside a `USE` directive. Otherwise
 * the already-written `USE` opcode is removed again, leaving only the empty `SET_SYMBOLS`.
 *
 * List elements that are not non-null structs are skipped.
 *
 * @param source buffer holding the imports list
 * @param listStart index in [source] of the first list element
 * @param listLength number of bytes occupied by the list elements
 * @param dest buffer that receives the generated instructions
 * @param cp constant pool that receives the names of the imported tables
 */
@JvmStatic
private fun readImportsList(
    source: ByteArray,
    listStart: Int,
    listLength: Int,
    dest: BytecodeBuffer,
    cp: AppendableConstantPoolView
) {
    dest.add3(I_DIRECTIVE_SET_SYMBOLS, I_END_CONTAINER, I_DIRECTIVE_USE)

    // Size of the buffer with the USE opcode written but no import arguments yet.
    val sizeBeforeImports = dest.size()

    iterateList(source, listStart, listLength) { elementTid: Int, elementStart: Int, elementLength: Int ->
        if (TypeIdHelper.isNonNullStruct(elementTid)) {
            readImportStruct(source, elementStart, elementLength, dest, cp)
        }
    }

    val wroteAnyImport = dest.size() != sizeBeforeImports
    if (wroteAnyImport) {
        // Close the USE directive
        dest.add(I_END_CONTAINER)
    } else {
        // No usable import was found; drop the USE opcode that was written speculatively.
        dest.truncate(sizeBeforeImports - 1)
    }
}
184+
185+
/**
 * Compiles a single import struct, as described at https://amazon-ion.github.io/ion-docs/docs/symbols.html#imports
 *
 * The struct is scanned for its `name`, `version` and `max_id` fields; any other field is ignored. A field
 * whose value has the wrong type is treated as if it were absent.
 *
 * Nothing is emitted when the name is missing, empty, or `$ion`. Otherwise three values are appended to [dest]:
 * 1. a `STRING_CP` instruction referring to the name, which is added to [cp];
 * 2. the version as an `INT_I32`, with any value below 1 replaced by 1;
 * 3. the max id as an `INT_I32`, or `NULL_NULL` when no usable max id was present.
 *
 * NOTE(review): `version` and `max_id` are narrowed with `toInt()`. If `readUInt` can return a magnitude above
 * `Int.MAX_VALUE`, the value would wrap here — confirm whether that is validated elsewhere.
 *
 * @param source buffer holding the import struct
 * @param contentStart index in [source] of the first field of the struct
 * @param contentLength number of bytes occupied by the struct's fields
 * @param dest buffer that receives the generated instructions
 * @param cp constant pool that receives the import name
 * @throws IonException if `name`, `version` or `max_id` appears more than once
 */
@JvmStatic
@SuppressFBWarnings("SF_SWITCH_NO_DEFAULT")
private fun readImportStruct(
    source: ByteArray,
    contentStart: Int,
    contentLength: Int,
    dest: BytecodeBuffer,
    cp: AppendableConstantPoolView
) {
    var nameSeen = false
    var versionSeen = false
    var maxIdSeen = false

    // Negative values mean "no usable value was found".
    var name: String? = null
    var version: Int = -1
    var maxId: Int = -1

    iterateStruct(source, contentStart, contentLength) { fieldSid, fieldTid, valueStart, valueLength ->
        when (fieldSid) {
            SystemSymbols.NAME_SID -> {
                if (nameSeen) throw IonException("Multiple name fields found within a single import.")
                nameSeen = true
                if (TypeIdHelper.isNonNullString(fieldTid)) name = String(source, valueStart, valueLength, Charsets.UTF_8)
            }
            SystemSymbols.VERSION_SID -> {
                if (versionSeen) throw IonException("Multiple version fields found within a single import.")
                versionSeen = true
                if (TypeIdHelper.isNonNullPositiveInt(fieldTid)) version = readUInt(source, valueStart, valueLength).toInt()
            }
            SystemSymbols.MAX_ID_SID -> {
                if (maxIdSeen) throw IonException("Multiple max_id fields found within a single import.")
                maxIdSeen = true
                if (TypeIdHelper.isNonNullPositiveInt(fieldTid)) maxId = readUInt(source, valueStart, valueLength).toInt()
            }
        }
    }

    // An import without a usable name, or an import of $ion, is ignored.
    if (name == null || name == "" || name == "\$ion") return

    dest.add(I_STRING_CP.packInstructionData(cp.add(name)))
    dest.add2(I_INT_I32, if (version < 1) 1 else version)
    if (maxId >= 0) {
        dest.add2(I_INT_I32, maxId)
    } else {
        dest.add(I_NULL_NULL)
    }
}
244+
245+
/**
 * Appends one constant pool entry to [cp] for each value that [iterateList] yields from a `symbols` list,
 * in list order.
 *
 * A non-null string contributes its text, decoded as UTF-8. Any other value stands for a symbol with unknown
 * text and contributes `null`.
 *
 * @param source buffer holding the symbols list
 * @param position index in [source] of the first list element
 * @param listLength number of bytes occupied by the list elements
 * @param cp constant pool that receives the symbol text
 */
@JvmStatic
private fun readSymbolsList(source: ByteArray, position: Int, listLength: Int, cp: AppendableConstantPoolView) {
    iterateList(source, position, listLength) { elementTid, textStart, textLength ->
        when {
            TypeIdHelper.isNonNullString(elementTid) -> cp.add(String(source, textStart, textLength, Charsets.UTF_8))
            else -> cp.add(null)
        }
    }
}
259+
260+
// ==== General helpers for traversing through the symbol table struct ====
261+
262+
/**
 * Walks the fields of a struct, calling [fieldHandler] once per field.
 *
 * Fields whose value is a NOP are skipped without calling the handler. When a value is wrapped in
 * annotations, the annotations are skipped and the handler receives the wrapped value, because annotations
 * are ignored inside symbol table structs and import structs.
 *
 * The handler receives the field name's symbol id, the type id of the value, the index of the first byte after
 * the value's type id and length, and the value's length in bytes.
 *
 * NOTE(review): nothing here checks that a value's declared length stays inside the struct — presumably the
 * caller has validated the buffered data; confirm.
 *
 * @param source buffer holding the struct
 * @param start index in [source] of the first field
 * @param length number of bytes occupied by the struct's fields
 * @param fieldHandler callback invoked for each field that is not a NOP
 */
@JvmStatic
@SuppressFBWarnings("SF_SWITCH_NO_DEFAULT")
private inline fun iterateStruct(
    source: ByteArray,
    start: Int,
    length: Int,
    fieldHandler: (fieldSid: Int, valueTid: Int, pos: Int, len: Int) -> Unit
) {
    val limit = start + length
    var cursor = start

    while (cursor < limit) {
        // Field name: a VarUInt symbol id, packed as (value shl 8) or (number of bytes read).
        val packedFieldSid = VarIntHelper.readVarUIntValueAndLength(source, cursor)
        cursor += packedFieldSid.toInt().and(ONE_BYTE_MASK)
        val fieldSid = packedFieldSid.shr(ONE_BYTE_SHIFT).toInt()

        var typeId = source[cursor++].unsignedToInt()
        // The kind of the outermost type id; deliberately not recomputed for an annotated value.
        val outerKind = TypeIdHelper.operationKindForTypeId(typeId)

        if (outerKind == OperationKind.ANNOTATIONS) {
            // We ignore annotations inside all symbol table structs and import structs
            cursor += skipAnnotations(typeId, source, cursor)
            typeId = source[cursor++].unsignedToInt()
        }

        val packedLength = getLengthForTypeId(typeId, source, cursor)
        cursor += packedLength.toInt().and(ONE_BYTE_MASK)
        val valueLength = packedLength.shr(ONE_BYTE_SHIFT).toInt()

        // A NOP carries no value, so it is stepped over without notifying the handler.
        if (outerKind != OperationKind.UNSET) {
            fieldHandler(fieldSid, typeId, cursor, valueLength)
        }
        cursor += valueLength
    }
}
308+
309+
/**
 * Walks the values of a list, calling [valueHandler] once per value.
 *
 * NOP values are skipped without calling the handler. When an annotation wrapper is found, only its
 * annotations are skipped, so the wrapped value is handled by the next iteration as an ordinary list element;
 * annotations are ignored on anything inside a local symbol table.
 *
 * The handler receives the value's type id, the index of the first byte after the value's type id and length,
 * and the value's length in bytes.
 *
 * @param source buffer holding the list
 * @param position index in [source] of the first list element
 * @param length number of bytes occupied by the list elements
 * @param valueHandler callback invoked for each value that is neither a NOP nor an annotation wrapper
 */
@JvmStatic
private inline fun iterateList(
    source: ByteArray,
    position: Int,
    length: Int,
    valueHandler: (typeId: Int, position: Int, length: Int) -> Unit
) {
    val limit = position + length
    var cursor = position

    while (cursor < limit) {
        val typeId = source[cursor++].unsignedToInt()
        val kind = TypeIdHelper.operationKindForTypeId(typeId)

        if (kind == OperationKind.ANNOTATIONS) {
            // Step over the annotations only; the annotated value is read on the next pass.
            cursor += skipAnnotations(typeId, source, cursor)
            continue
        }

        val packedLength = getLengthForTypeId(typeId, source, cursor)
        cursor += packedLength.toInt().and(ONE_BYTE_MASK)
        val valueLength = packedLength.shr(ONE_BYTE_SHIFT).toInt()

        // A NOP carries no value, so it is stepped over without notifying the handler.
        if (kind != OperationKind.UNSET) {
            valueHandler(typeId, cursor, valueLength)
        }
        cursor += valueLength
    }
}
348+
349+
/**
 * Computes how many bytes lie between [position] and the value wrapped by an annotation wrapper.
 *
 * The count covers the wrapper's own VarUInt length (only when [typeId] is variable-length), the VarUInt that
 * gives the length of the annotations, and the annotations themselves. The wrapped value is not included.
 *
 * @param typeId type id of the annotation wrapper
 * @param source buffer holding the annotation wrapper
 * @param position index in [source] of the byte immediately after the wrapper's type id
 * @return the number of bytes to advance from [position] to reach the annotated value
 */
@JvmStatic
private fun skipAnnotations(typeId: Int, source: ByteArray, position: Int): Int {
    // Size of the wrapper's own length field; zero when the length is carried by the type id itself.
    val wrapperLengthSize = if (TypeIdHelper.isVariableLength(typeId)) {
        VarIntHelper.readVarUIntValueAndLength(source, position).toInt().and(ONE_BYTE_MASK)
    } else {
        0
    }

    val packedAnnotationsLength = VarIntHelper.readVarUIntValueAndLength(source, position + wrapperLengthSize)
    val annotationsLengthSize = packedAnnotationsLength.toInt().and(ONE_BYTE_MASK)
    val annotationsLength = packedAnnotationsLength.shr(ONE_BYTE_SHIFT).toInt()

    return wrapperLengthSize + annotationsLengthSize + annotationsLength
}
361+
362+
/**
 * Determines the length of the value introduced by [typeId], reading a VarUInt length from [source] if needed.
 *
 * The result is packed the way the callers in this file unpack it: the value's length is stored shifted left
 * by [ONE_BYTE_SHIFT], and the low byte holds the number of bytes to skip to get past the length field. For a
 * type id whose length comes from the lookup table, that low byte is zero.
 *
 * NOTE(review): what [VarIntHelper] returns when the VarUInt is truncated is not visible here — confirm
 * before relying on a sentinel value.
 *
 * @param typeId type id of the value
 * @param source buffer holding the value
 * @param position index in [source] of the byte immediately after the type id
 * @return the packed length and length-field size
 * @throws IonException if the typeId is not a legal typeId in Ion 1.0
 */
@JvmStatic
private fun getLengthForTypeId(typeId: Int, source: ByteArray, position: Int): Long =
    when (val tableEntry = TypeIdHelper.TYPE_LENGTHS[typeId]) {
        // -2 marks a type id that is not legal.
        -2 -> throw IonException("Invalid Type ID: $typeId")
        // -1 marks a type id whose length follows as a VarUInt.
        -1 -> VarIntHelper.readVarUIntValueAndLength(source, position)
        // Any other entry is the length itself; no bytes were consumed, so the low byte stays zero.
        else -> tableEntry.toLong().shl(ONE_BYTE_SHIFT)
    }
376+
}

0 commit comments

Comments
 (0)