From 83201b72170a80d23caa160309964425ee56982d Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 7 Apr 2026 14:47:15 -0700 Subject: [PATCH 1/4] Exclude calibration dataset from testing --- .../shopify_product_catalogue/__init__.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py index 6bae9f43..0397ae7c 100644 --- a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py +++ b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py @@ -32,6 +32,28 @@ logger = getLogger(__name__) +CALIBRATION_SAMPLE_INDEX = [ + 20232, + 21162, + 33584, + 46825, + 45190, + 46143, + 14189, + 16658, + 26406, + 9565, + 33733, + 31057, + 47465, + 33503, + 42293, + 7768, + 1962, + 39746, + 13568, + 22527, +] def _process_sample_to_row(sample: dict[str, Any]) -> dict[str, Any]: """Convert a single HF dataset sample to a row dict for parquet storage. @@ -148,6 +170,8 @@ def generate( desc=f"Converting images ({split_key})", unit="rows", ): + if i in CALIBRATION_SAMPLE_INDEX: + continue sample = ds[i] all_rows.append(_process_sample_to_row(sample)) From 7026abb47c1eed6023d6b1525becf2b750cbc3db Mon Sep 17 00:00:00 2001 From: Mingyuan Ma Date: Tue, 7 Apr 2026 15:00:44 -0700 Subject: [PATCH 2/4] precommit-fix --- .../predefined/shopify_product_catalogue/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py index 0397ae7c..b267eeda 100644 --- a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py +++ b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py @@ -55,6 +55,7 @@ 22527, ] + def _process_sample_to_row(sample: dict[str, Any]) -> dict[str, Any]: """Convert a single HF dataset sample to a row dict for parquet storage. From a9f7a8d60b42b673169eb5b115b7c31813bad091 Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Tue, 7 Apr 2026 15:08:32 -0700 Subject: [PATCH 3/4] Set may be better --- .../predefined/shopify_product_catalogue/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py index b267eeda..2b13823c 100644 --- a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py +++ b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py @@ -32,7 +32,7 @@ logger = getLogger(__name__) -CALIBRATION_SAMPLE_INDEX = [ +CALIBRATION_SAMPLE_INDEX = { 20232, 21162, 33584, @@ -53,7 +53,7 @@ 39746, 13568, 22527, -] +} def _process_sample_to_row(sample: dict[str, Any]) -> dict[str, Any]: From 7dcf936550c42abd2c03be5e7d256e47a6c7e85b Mon Sep 17 00:00:00 2001 From: mingyuanm Date: Thu, 9 Apr 2026 11:03:29 -0700 Subject: [PATCH 4/4] calibration sample index passed as a kwarg --- .../predefined/shopify_product_catalogue/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py index 2b13823c..e0e3a9ac 100644 --- a/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py +++ b/src/inference_endpoint/dataset_manager/predefined/shopify_product_catalogue/__init__.py @@ -32,7 +32,7 @@ logger = getLogger(__name__) -CALIBRATION_SAMPLE_INDEX = { +DEFAULT_CALIBRATION_SAMPLE_INDEX = { 20232, 21162, 33584, @@ -124,6 +124,7 @@ def generate( force: bool = False, token: str | None = None, revision: str = "main", + calibration_sample_index: set[int] | None = DEFAULT_CALIBRATION_SAMPLE_INDEX, **kwargs: Any, ) -> pd.DataFrame: """Generate the Shopify product catalogue dataset. @@ -171,7 +172,7 @@ def generate( desc=f"Converting images ({split_key})", unit="rows", ): - if i in CALIBRATION_SAMPLE_INDEX: + if calibration_sample_index is not None and i in calibration_sample_index: continue sample = ds[i] all_rows.append(_process_sample_to_row(sample))