From f1a29af91d34fa34001544f87b031793fd7e93f4 Mon Sep 17 00:00:00 2001 From: Wolfgang De Salvador Date: Wed, 8 Apr 2026 15:07:34 +0200 Subject: [PATCH] Calculate parquet random tensor per column rather than per batch --- .../data_generator/parquet_generator.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/dlio_benchmark/data_generator/parquet_generator.py b/dlio_benchmark/data_generator/parquet_generator.py index 7dc431cb..28c87db5 100755 --- a/dlio_benchmark/data_generator/parquet_generator.py +++ b/dlio_benchmark/data_generator/parquet_generator.py @@ -303,6 +303,16 @@ def generate(self): writer_target = pa.BufferOutputStream() with pq.ParquetWriter(writer_target, schema, compression=compression) as writer: + # Generate all column data upfront for the entire file. + # This reduces function call overhead from (num_batches * num_columns) + # to just num_columns calls to gen_random_tensor. + if self.parquet_columns: + full_columns = self._generate_batch_columns(self.num_samples, rng) + else: + full_columns = self._generate_legacy_batch(elem_size, self.num_samples, rng) + + full_table = pa.table(full_columns) + num_batches = ( self.num_samples + self.generation_batch_size - 1 ) // self.generation_batch_size @@ -312,13 +322,8 @@ def generate(self): batch_end = min(batch_start + self.generation_batch_size, self.num_samples) current_batch_size = batch_end - batch_start - # rng advances per batch — each batch gets unique data. - if self.parquet_columns: - columns = self._generate_batch_columns(current_batch_size, rng) - else: - columns = self._generate_legacy_batch(elem_size, current_batch_size, rng) - - batch_table = pa.table(columns) + # Slice the pre-generated table for this batch (zero-copy). + batch_table = full_table.slice(batch_start, current_batch_size) writer.write_table(batch_table, row_group_size=self.row_group_size) if not is_local: