diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b4d3174d1..6b936106f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -109,6 +109,7 @@ repos:
           examples/speculative_decoding/main.py|
           examples/speculative_decoding/medusa_utils.py|
           examples/speculative_decoding/server_generate.py|
+          examples/puzzletron/evaluation/hf_deployable_anymodel\.py|
           modelopt/torch/puzzletron/decilm/deci_lm_hf_code/transformers_.*\.py|
         )$
diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index 6da67fd44..993dc6e1a 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -15,11 +15,11 @@ In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/
 
 ## Environment
 
-- Install Model-Optimizer in editable mode with the corresponding dependencies:
+- Install Model-Optimizer in editable mode with the corresponding dependencies (run from the repo root):
 
 ```bash
 pip install -e .[hf,puzzletron]
-pip install -r requirements.txt
+pip install -r examples/puzzletron/requirements.txt
 ```
 
-- For this example we are using 2x NVIDIA H100 80GB HBM3 to show multi-GPU steps. You can use also use s single GPU.
+- For this example we are using 2x NVIDIA H100 80GB HBM3 to show multi-GPU steps. You can also use a single GPU.
@@ -199,16 +199,38 @@ block_14: attention no_op ffn intermediate_3072
 
 ## Evaluation
 
-Once the model is ready, you can evaluate it using [Language Model Evaluation Harness](https://pypi.org/project/lm-eval/). For example, run the following to evaluate the model on [Massive Multitask Language Understanding](https://huggingface.co/datasets/cais/mmlu) benchmark.
+Evaluate AnyModel checkpoints by deploying a local OpenAI-compatible completions endpoint and running benchmarks against it.
+
+**1. Deploy the model (2-GPU example):**
+
+```bash
+# Install the AnyModel-patched deployable (first time only: backs up the original)
+# /opt/Export-Deploy is the default path in NeMo containers — adjust if needed
+cp /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py.bak
+cp examples/puzzletron/evaluation/hf_deployable_anymodel.py /opt/Export-Deploy/nemo_deploy/llm/hf_deployable.py
+
+# Start the server (blocks while running — use a separate terminal)
+ray start --head --num-gpus 2 --port 6379 --disable-usage-stats
+python /opt/Export-Deploy/scripts/deploy/nlp/deploy_ray_hf.py \
+    --model_path path/to/checkpoint \
+    --model_id anymodel-hf \
+    --num_gpus 2 --num_gpus_per_replica 2 --num_cpus_per_replica 16 \
+    --trust_remote_code --port 8083 --device_map "auto" --cuda_visible_devices "0,1"
+```
+
+**2. Run MMLU:**
 
 ```bash
-lm_eval --model hf \
-    --model_args pretrained=path/to/model,dtype=bfloat16,trust_remote_code=true,parallelize=True \
-    --tasks mmlu \
-    --num_fewshot 5 \
-    --batch_size 4
+eval-factory run_eval \
+    --eval_type mmlu \
+    --model_id anymodel-hf \
+    --model_type completions \
+    --model_url http://0.0.0.0:8083/v1/completions/ \
+    --output_dir examples/puzzletron/evals/mmlu_anymodel
 ```
 
+For a quick debug run, add `--overrides "config.params.limit_samples=5"`.
+
 ## Inference Performance Benchmarking
 
 Now let's evaluate how much speedup we get with the compressed model in terms of throughput and latency.
diff --git a/examples/puzzletron/evaluation/hf_deployable_anymodel.py b/examples/puzzletron/evaluation/hf_deployable_anymodel.py
new file mode 100644
index 000000000..f4fd4e414
--- /dev/null
+++ b/examples/puzzletron/evaluation/hf_deployable_anymodel.py
@@ -0,0 +1,724 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+from typing import Any
+
+import numpy as np
+import torch
+from nemo_deploy import ITritonDeployable
+from nemo_deploy.utils import broadcast_list, cast_output, str_ndarray2list
+from nemo_export_deploy_common.import_utils import (
+    MISSING_TRITON_MSG,
+    UnavailableError,
+    null_decorator,
+)
+from peft import PeftModel
+from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+
+try:
+    from pytriton.decorators import batch
+    from pytriton.model_config import Tensor
+
+    HAVE_TRITON = True
+except (ImportError, ModuleNotFoundError):
+    from unittest.mock import MagicMock
+
+    HAVE_TRITON = False
+    Tensor = MagicMock()
+    batch = null_decorator
+
+
+LOGGER = logging.getLogger("NeMo")
+
+SUPPORTED_TASKS = ["text-generation"]
+
+
+class HuggingFaceLLMDeploy(ITritonDeployable):
+    """A Triton inference server compatible wrapper for HuggingFace models.
+
+    This class provides a standardized interface for deploying HuggingFace models
+    in the Triton inference server. It supports various NLP tasks and handles model
+    loading, inference, and deployment configurations.
+
+    Args:
+        hf_model_id_path (Optional[str]): Path to the HuggingFace model or model identifier.
+            Can be a local path or a model ID from HuggingFace Hub.
+        hf_peft_model_id_path (Optional[str]): Path to the PEFT model or model identifier.
+            Can be a local path or a model ID from HuggingFace Hub.
+        tokenizer_id_path (Optional[str]): Path to the tokenizer or tokenizer identifier.
+            If None, will use the same path as hf_model_id_path.
+        model (Optional[AutoModel]): Pre-loaded HuggingFace model.
+        tokenizer (Optional[AutoTokenizer]): Pre-loaded HuggingFace tokenizer.
+        tokenizer_padding (bool): Whether to enable padding in tokenizer. Defaults to True.
+        tokenizer_truncation (bool): Whether to enable truncation in tokenizer. Defaults to True.
+        tokenizer_padding_side (str): Which side to pad on ('left' or 'right'). Defaults to 'left'.
+        task (str): HuggingFace task type (e.g., "text-generation"). Defaults to "text-generation".
+        **hf_kwargs: Additional keyword arguments to pass to HuggingFace model loading.
+    """
+
+    def __init__(
+        self,
+        hf_model_id_path: str | None = None,
+        hf_peft_model_id_path: str | None = None,
+        tokenizer_id_path: str | None = None,
+        model: AutoModel | None = None,
+        tokenizer: AutoTokenizer | None = None,
+        tokenizer_padding=True,
+        tokenizer_truncation=True,
+        tokenizer_padding_side="left",
+        task: str | None = "text-generation",
+        torch_dtype: torch.dtype | None = "auto",
+        device_map: str | None = "auto",
+        **hf_kwargs,
+    ):
+        if not HAVE_TRITON:
+            raise UnavailableError(MISSING_TRITON_MSG)
+
+        if hf_model_id_path is None and model is None:
+            raise ValueError("Either the hf_model_id_path or the model parameter has to be passed.")
+        elif hf_model_id_path is not None and model is not None:
+            LOGGER.warning(
+                "hf_model_id_path will be ignored and the HuggingFace model set with the model parameter will be used."
+            )
+
+        assert task in SUPPORTED_TASKS, "Task {} is not a supported task.".format(task)
+
+        self.hf_model_id_path = hf_model_id_path
+        self.hf_peft_model_id_path = hf_peft_model_id_path
+        self.task = task
+        self.model = model
+        self.tokenizer = tokenizer
+        self.tokenizer_padding = tokenizer_padding
+        self.tokenizer_truncation = tokenizer_truncation
+        self.tokenizer_padding_side = tokenizer_padding_side
+
+        if tokenizer_id_path is None:
+            self.tokenizer_id_path = hf_model_id_path
+        else:
+            self.tokenizer_id_path = tokenizer_id_path
+
+        if model is None:
+            self._load(torch_dtype=torch_dtype, device_map=device_map, **hf_kwargs)
+
+    def _load(
+        self, torch_dtype: torch.dtype | None = "auto", device_map: str | None = "auto", **hf_kwargs
+    ) -> None:
+        """Load the HuggingFace pipeline with the specified model and task.
+
+        This method initializes the HuggingFace AutoModel classes using the provided model
+        configuration and task type. It handles the model and tokenizer loading
+        process.
+
+        Args:
+            torch_dtype (torch.dtype): Data type for the model. Defaults to "auto".
+            device_map (str): Device map for the model. Defaults to "auto".
+            **hf_kwargs: Additional keyword arguments to pass to the HuggingFace model loading.
+
+        Raises:
+            AssertionError: If task is not specified.
+        """
+        assert self.task is not None, "A task has to be specified for generation."
+
+        if self.task == "text-generation":
+            # =========================================================================
+            # BEGIN ANYMODEL PATCH
+            # Wraps model loading with deci_x_patcher for heterogeneous layer configs.
+            # See: modelopt/torch/puzzletron/anymodel/puzzformer/utils.py
+            # =========================================================================
+            import os
+            import sys
+
+            modelopt_workdir = os.environ.get("MODELOPT_WORKDIR") or os.environ.get(
+                "PUZZLE_WORKDIR"
+            )
+            if modelopt_workdir and modelopt_workdir not in sys.path:
+                sys.path.insert(0, modelopt_workdir)
+            from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
+            from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
+
+            with deci_x_patcher(model_descriptor=LlamaModelDescriptor):
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.hf_model_id_path,
+                    torch_dtype=torch_dtype,
+                    device_map=device_map,
+                    **hf_kwargs,
+                )
+            # =========================================================================
+            # END ANYMODEL PATCH
+            # =========================================================================
+
+            if self.hf_peft_model_id_path is not None:
+                self.model = PeftModel.from_pretrained(self.model, self.hf_peft_model_id_path)
+        else:
+            raise ValueError("Task {} is not supported.".format(self.task))
+        num_gpus = torch.cuda.device_count()
+        # If there is only one GPU, move the model to GPU. If you are using device_map as "auto" or "balanced",
+        # the model will be moved to GPU automatically.
+        if device_map is None and num_gpus >= 1 and self.model.device.type != "cuda":
+            self.model.cuda()
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer_id_path,
+            trust_remote_code=hf_kwargs.pop("trust_remote_code", False),
+            padding=self.tokenizer_padding,
+            truncation=self.tokenizer_truncation,
+            padding_side=self.tokenizer_padding_side,
+        )
+
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    def generate(
+        self,
+        **kwargs: Any,
+    ) -> list[str]:
+        """Generate text based on the provided input prompts.
+
+        This method processes input prompts through the loaded pipeline and
+        generates text according to the specified parameters.
+
+        Args:
+            **kwargs: Generation parameters including:
+                - text_inputs: List of input prompts
+                - max_length: Maximum number of tokens to generate
+                - num_return_sequences: Number of sequences to generate per prompt
+                - temperature: Sampling temperature
+                - top_k: Number of highest probability tokens to consider
+                - top_p: Cumulative probability threshold for token sampling
+                - do_sample: Whether to use sampling, default is False for greedy decoding
+                - echo: Whether to return prompt + generated text (True) or just generated text (False)
+                - return_full_text: Whether to return full text or only generated part
+
+        Returns:
+            If output logits and output scores are False:
+                List[str]: A list of generated texts, one for each input prompt.
+            If output logits and output scores are True:
+                Dict: A dictionary containing:
+                    - sentences: List of generated texts
+                    - logits: List of logits
+                    - scores: List of scores
+                    - input_lengths: List of input token lengths (for echo processing)
+
+        Raises:
+            RuntimeError: If the pipeline is not initialized.
+        """
+        if not self.model:
+            raise RuntimeError("Model is not initialized")
+
+        inputs = self.tokenizer(
+            kwargs["text_inputs"],
+            return_tensors="pt",
+            padding=self.tokenizer_padding,
+            truncation=self.tokenizer_truncation,
+        )
+
+        # Store input lengths to extract only generated tokens later
+        input_lengths = [len(input_ids) for input_ids in inputs["input_ids"]]
+
+        # Get echo parameter (default False - only return generated text)
+        echo = kwargs.pop("echo", False)
+        kwargs.pop("text_inputs")  # Remove text_inputs as it's already been tokenized
+
+        kwargs = {**inputs, **kwargs}
+        for key, val in kwargs.items():
+            if torch.is_tensor(val):
+                kwargs[key] = val.cuda()
+
+        with torch.no_grad():
+            generated_ids = self.model.generate(**kwargs)
+
+        return_dict_in_generate = kwargs.get("return_dict_in_generate", False)
+        if return_dict_in_generate:
+            # Handle dict output (when logits/scores are requested)
+            sequences = generated_ids["sequences"]
+            output = {"sentences": [], "input_lengths": input_lengths, "sequences": sequences}
+
+            if echo:
+                # Return full text (prompt + generated).
+                # HF model's generate returns the input/prompt tokens as well by default.
+                for i, seq in enumerate(sequences):
+                    full_text = self.tokenizer.decode(seq, skip_special_tokens=True)
+                    output["sentences"].append(full_text)
+            else:
+                # Extract only the generated tokens (skip input tokens).
+                # This is required as HF model's generate returns the input/prompt tokens
+                # as well by default. (return_full_text is specific to some models)
+                for i, seq in enumerate(sequences):
+                    input_len = input_lengths[i] if i < len(input_lengths) else 0
+                    generated_tokens = seq[input_len:]  # Skip input tokens
+                    generated_text = self.tokenizer.decode(
+                        generated_tokens, skip_special_tokens=True
+                    )
+                    output["sentences"].append(generated_text)
+
+            if kwargs.get("output_logits", False):
+                output["logits"] = generated_ids["logits"]
+            if kwargs.get("output_scores", False):
+                output["scores"] = generated_ids["scores"]
+        else:
+            # Handle list output (normal case)
+            output = []
+            if echo:
+                # Return full text (prompt + generated), which is the default in case of HF model generate.
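+                # For illustration (hypothetical tokens): a prompt tokenized to
+                # [BOS, "The", "sky"] with generated tokens ["is", "blue"] decodes
+                # here to the full string "The sky is blue", prompt included.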
+                for i, seq in enumerate(generated_ids):
+                    full_text = self.tokenizer.decode(seq, skip_special_tokens=True)
+                    output.append(full_text)
+            else:
+                # Extract only the generated tokens (skip input tokens) as the default
+                # behavior returns the input/prompt tokens as well.
+                for i, seq in enumerate(generated_ids):
+                    input_len = input_lengths[i] if i < len(input_lengths) else 0
+                    generated_tokens = seq[input_len:]  # Skip input tokens
+                    generated_text = self.tokenizer.decode(
+                        generated_tokens, skip_special_tokens=True
+                    )
+                    output.append(generated_text)
+
+        return output
+
+    def generate_other_ranks(self):
+        """Generate function for ranks other than rank 0."""
+        while True:
+            message = torch.empty(1, dtype=torch.long, device="cuda")
+            torch.distributed.broadcast(message, src=0)
+            if message == 0:
+                prompts = broadcast_list(data=[None], src=0)
+                (
+                    temperature,
+                    top_k,
+                    top_p,
+                    num_tokens_to_generate,
+                    output_logits,
+                    output_scores,
+                ) = broadcast_list(data=[None], src=0)
+
+                return_dict_in_generate = False
+                if output_logits or output_scores:
+                    return_dict_in_generate = True
+
+                self.generate(
+                    text_inputs=prompts,
+                    do_sample=False,  # do_sample=False for greedy decoding
+                    top_k=top_k,
+                    top_p=top_p,
+                    temperature=temperature,
+                    max_new_tokens=num_tokens_to_generate,
+                    output_logits=output_logits,
+                    output_scores=output_scores,
+                    return_dict_in_generate=return_dict_in_generate,
+                )
+            else:
+                return
+
+    @property
+    def get_triton_input(self):
+        inputs = (
+            Tensor(name="prompts", shape=(-1,), dtype=bytes),
+            Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="max_batch_size", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="random_seed", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="output_logits", shape=(-1,), dtype=np.bool_, optional=True),
+            Tensor(name="output_scores", shape=(-1,), dtype=np.bool_, optional=True),
+        )
+        return inputs
+
+    @property
+    def get_triton_output(self):
+        return (
+            Tensor(name="sentences", shape=(-1,), dtype=bytes),
+            Tensor(name="logits", shape=(-1,), dtype=np.single),
+            Tensor(name="scores", shape=(-1,), dtype=np.single),
+        )
+
+    @batch
+    def triton_infer_fn(self, **inputs: np.ndarray):
+        output_infer = {}
+
+        try:
+            prompts = str_ndarray2list(inputs.pop("prompts"))
+            temperature = inputs.pop("temperature")[0][0] if "temperature" in inputs else 1.0
+            top_k = int(inputs.pop("top_k")[0][0] if "top_k" in inputs else 1)
+            top_p = inputs.pop("top_p")[0][0] if "top_p" in inputs else 0.0
+            num_tokens_to_generate = (
+                inputs.pop("max_length")[0][0] if "max_length" in inputs else 256
+            )
+            output_logits = (
+                inputs.pop("output_logits")[0][0] if "output_logits" in inputs else False
+            )
+            output_scores = (
+                inputs.pop("output_scores")[0][0] if "output_scores" in inputs else False
+            )
+            return_dict_in_generate = False
+            if output_logits or output_scores:
+                return_dict_in_generate = True
+
+            if torch.distributed.is_initialized():
+                if torch.distributed.get_world_size() > 1:
+                    torch.distributed.broadcast(
+                        torch.tensor([0], dtype=torch.long, device="cuda"), src=0
+                    )
+                    broadcast_list(prompts, src=0)
+                    broadcast_list(
+                        data=[
+                            temperature,
+                            top_k,
+                            top_p,
+                            num_tokens_to_generate,
+                            output_logits,
+                            output_scores,
+                        ],
+                        src=0,
+                    )
+
+            output = self.generate(
+                text_inputs=prompts,
+                do_sample=False,  # do_sample=False for greedy decoding
+                top_k=top_k,
+                top_p=top_p,
+                temperature=temperature,
+                max_new_tokens=num_tokens_to_generate,
+                output_logits=output_logits,
+                output_scores=output_scores,
+                return_dict_in_generate=return_dict_in_generate,
+                echo=False,
+            )
+
+            if isinstance(output, dict):
+                output_infer = {"sentences": cast_output(output["sentences"], np.bytes_)}
+
+                if "scores" in output:
+                    scores_list = []
+                    for r in output["scores"]:
+                        lp = torch.tensor(r).cpu().detach().numpy()
+                        if len(lp) == 0:
+                            scores_list.append([0])
+                        else:
+                            scores_list.append(lp)
+                    output_infer["scores"] = np.array(scores_list).transpose(1, 0, 2)
+
+                if "logits" in output:
+                    logits_list = []
+                    for r in output["logits"]:
+                        lp = torch.tensor(r).cpu().detach().numpy()
+                        if len(lp) == 0:
+                            logits_list.append([0])
+                        else:
+                            logits_list.append(lp)
+                    output_infer["logits"] = np.array(logits_list).transpose(1, 0, 2)
+            else:
+                output_infer = {"sentences": cast_output(output, np.bytes_)}
+
+        except Exception as error:
+            err_msg = "An error occurred: {}".format(str(error))
+            output_infer["sentences"] = cast_output([err_msg], np.bytes_)
+
+        return output_infer
+
+    def _compute_logprobs(
+        self,
+        prompts: list[str],
+        output_infer: dict[str, Any],
+        compute_logprob: bool,
+        n_top_logprobs: int,
+        echo: bool,
+    ):
+        """Compute log probabilities and top log probabilities from model scores.
+
+        Used by ray_infer_fn to provide OpenAI-API-compatible output for evaluations.
+
+        This method processes the raw scores from model generation to compute:
+        - Log probabilities for chosen tokens
+        - Top-k log probabilities for each position (if requested)
+        - Handles both prompt tokens (when echo=True) and generated tokens
+
+        Args:
+            prompts: List of input prompts
+            output_infer: Dictionary containing model outputs including scores, sequences, and input_lengths
+            compute_logprob: Whether to compute log probabilities
+            n_top_logprobs: Number of top log probabilities to return (0 to disable)
+            echo: Whether to include prompt token log probabilities
+
+        Returns:
+            Tuple[Optional[List], Optional[List]]:
+                - log_probs_list: List of log probabilities for each sample (None if not computed)
+                - top_logprobs_list: List of top-k log probabilities for each sample (None if not computed)
+        """
+        # Tokenize the prompts to get prompt token IDs (needed for echo)
+        prompt_token_ids = None
+        prompt_inputs = None
+        if echo:
+            prompt_inputs = self.tokenizer(
+                prompts,
+                return_tensors="pt",
+                padding=self.tokenizer_padding,
+                truncation=self.tokenizer_truncation,
+            )
+            prompt_token_ids = prompt_inputs["input_ids"]
+            # Move to same device as model
+            for key, val in prompt_inputs.items():
+                if torch.is_tensor(val):
+                    prompt_inputs[key] = val.cuda()
+
+        # Process each sample
+        log_probs_list = []
+        top_logprobs_list = []
+
+        for sample_idx in range(len(prompts)):
+            sample_log_probs = []
+            sample_top_logprobs = []
+
+            # Get the generated sequence for this sample
+            sequences = output_infer["sequences"][sample_idx]
+
+            # For echo, compute prompt token logprobs by running forward pass
+            if echo and prompt_token_ids is not None:
+                prompt_len = len(prompt_token_ids[sample_idx])
+
+                # Run a forward pass on the prompt to get logits for the prompt tokens, since
+                # the scores in output_infer cover the generated tokens only.
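+                # Alignment note: logits[i - 1] is the distribution that predicts the
+                # token at position i, so the loop below starts at 1 and the first
+                # prompt token (BOS) gets no logprob of its own.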
+                with torch.no_grad():
+                    # Create input for this specific sample
+                    sample_prompt_input = {
+                        key: val[sample_idx : sample_idx + 1] for key, val in prompt_inputs.items()
+                    }
+                    prompt_outputs = self.model(**sample_prompt_input)
+                    prompt_logits = prompt_outputs.logits[0]  # Shape: [seq_len, vocab_size]
+
+                # Calculate log probs for each prompt token (except the first BOS token)
+                for token_pos in range(1, prompt_len):  # Start from 1 to skip BOS
+                    # The logit at position i-1 predicts the token at position i
+                    logit_for_current_token = prompt_logits[token_pos - 1]
+                    current_token_id = prompt_token_ids[sample_idx][token_pos].item()
+
+                    # Calculate log probabilities
+                    log_probs = torch.nn.functional.log_softmax(logit_for_current_token, dim=-1)
+                    chosen_log_prob = log_probs[current_token_id].item()
+                    sample_log_probs.append(chosen_log_prob)
+
+                    # Calculate top log probabilities if requested
+                    if n_top_logprobs > 0:
+                        top_log_probs_dict = {}
+                        top_k_values, top_k_indices = torch.topk(
+                            log_probs, min(n_top_logprobs, len(log_probs))
+                        )
+                        for k_idx in range(len(top_k_indices)):
+                            token_id = top_k_indices[k_idx].item()
+                            token_str = self.tokenizer.decode([token_id])
+                            top_log_probs_dict[token_str] = top_k_values[k_idx].item()
+                        sample_top_logprobs.append(top_log_probs_dict)
+
+            # Process the scores for generated tokens
+            for token_idx, score_tensor in enumerate(output_infer["scores"]):
+                # Get the chosen token ID from the sequence
+                # Scores start after the prompt, so we need to offset
+                input_len = (
+                    output_infer.get("input_lengths", [0])[sample_idx]
+                    if "input_lengths" in output_infer
+                    else 0
+                )
+                seq_idx = input_len + token_idx
+
+                if seq_idx < len(sequences):
+                    chosen_token_id = (
+                        sequences[seq_idx].item()
+                        if hasattr(sequences[seq_idx], "item")
+                        else sequences[seq_idx]
+                    )
+
+                    # Calculate log probabilities
+                    log_probs = torch.nn.functional.log_softmax(score_tensor[sample_idx], dim=-1)
+                    chosen_log_prob = log_probs[chosen_token_id].item()
+                    sample_log_probs.append(chosen_log_prob)
+
+                    # Calculate top log probabilities if requested
+                    if n_top_logprobs > 0:
+                        top_log_probs_dict = {}
+                        top_k_values, top_k_indices = torch.topk(
+                            log_probs, min(n_top_logprobs, len(log_probs))
+                        )
+                        for k_idx in range(len(top_k_indices)):
+                            token_id = top_k_indices[k_idx].item()
+                            token_str = self.tokenizer.decode([token_id])
+                            top_log_probs_dict[token_str] = top_k_values[k_idx].item()
+                        sample_top_logprobs.append(top_log_probs_dict)
+
+            log_probs_list.append(sample_log_probs)
+            if n_top_logprobs > 0:
+                top_logprobs_list.append(sample_top_logprobs)
+
+        # Return log probs and top logprobs
+        return_log_probs = log_probs_list if compute_logprob else None
+        return_top_logprobs = top_logprobs_list if n_top_logprobs > 0 else None
+
+        return return_log_probs, return_top_logprobs
+
+    def ray_infer_fn(self, inputs: dict[Any, Any]):
+        """Perform inference using Ray with dictionary inputs and outputs.
+
+        Args:
+            inputs (Dict[Any, Any]): Dictionary containing input parameters:
+                - prompts: List of input prompts
+                - temperature: Sampling temperature (optional)
+                - top_k: Number of highest probability tokens to consider (optional)
+                - top_p: Cumulative probability threshold for token sampling (optional)
+                - max_tokens: Maximum number of tokens to generate (optional)
+                - compute_logprob: Whether to compute log probabilities (optional)
+                - n_top_logprobs: Number of top log probabilities to return (optional)
+                - echo: Whether to echo the prompt in output (optional)
+
+        Returns:
+            Dict[str, Any]: Dictionary containing:
+                - sentences: List of generated texts
+                - log_probs: Optional list of log probabilities if compute_logprob is True
+                - top_logprobs: Optional list of top log probabilities if n_top_logprobs > 0
+        """
+        import json
+
+        try:
+            prompts = inputs.pop("prompts")
+            temperature = inputs.pop("temperature", 1.0)
+            top_k = int(inputs.pop("top_k", 1))
+            top_p = inputs.pop("top_p", 0.0)
+            num_tokens_to_generate = inputs.pop("max_tokens", 256)
+            output_logits = inputs.pop("output_logits", False)
+            output_scores = inputs.pop("output_scores", False)
+            compute_logprob = inputs.pop("compute_logprob", False)
+            n_top_logprobs = inputs.pop("n_top_logprobs", 0)
+            echo = inputs.pop("echo", False)
+
+            output_infer = self._infer_fn_ray(
+                prompts=prompts,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                num_tokens_to_generate=num_tokens_to_generate,
+                output_logits=output_logits,
+                output_scores=output_scores,
+                compute_logprob=compute_logprob,
+                n_top_logprobs=n_top_logprobs,
+                echo=echo,
+            )
+            # Derive logprobs (required by the OpenAI API format used for evals) from the scores in output_infer.
+            if (
+                (compute_logprob or n_top_logprobs > 0)
+                and "scores" in output_infer
+                and output_infer["scores"]
+            ):
+                log_probs_list, top_logprobs_list = self._compute_logprobs(
+                    prompts=prompts,
+                    output_infer=output_infer,
+                    compute_logprob=compute_logprob,
+                    n_top_logprobs=n_top_logprobs,
+                    echo=echo,
+                )
+
+                # Add to output
+                if log_probs_list is not None:
+                    output_infer["log_probs"] = log_probs_list
+                if top_logprobs_list is not None:
+                    # Convert to JSON strings for compatibility
+                    output_infer["top_logprobs"] = [
+                        json.dumps(top_logprobs) for top_logprobs in top_logprobs_list
+                    ]
+
+            # Remove raw outputs that are not needed in the final response
+            output_infer.pop("scores", None)
+            output_infer.pop("sequences", None)
+            output_infer.pop("input_lengths", None)
+            return output_infer
+        except Exception as error:
+            err_msg = "An error occurred: {}".format(str(error))
+            return {"sentences": [err_msg]}
+
+    def _infer_fn_ray(
+        self,
+        prompts,
+        temperature=1.0,
+        top_k=1,
+        top_p=0.0,
+        num_tokens_to_generate=256,
+        output_logits=False,
+        output_scores=False,
+        compute_logprob=False,
+        n_top_logprobs=0,
+        echo=False,
+        cast_output_func=None,
+    ):
+        """Common internal function for inference operations.
+
+        Args:
+            prompts: List of input prompts
+            temperature: Sampling temperature
+            top_k: Number of highest probability tokens to consider
+            top_p: Cumulative probability threshold for token sampling
+            num_tokens_to_generate: Maximum number of tokens to generate
+            output_logits: Whether to output logits
+            output_scores: Whether to output scores
+            compute_logprob: Whether to compute log probabilities
+            n_top_logprobs: Number of top log probabilities to return
+            echo: Whether to echo the prompt in output
+            cast_output_func: Optional function to cast output values
+
+        Returns:
+            Dict containing inference results with raw outputs
+        """
+        # Enable return_dict if we need scores for logprobs or if output_logits/scores are requested
+        return_dict_in_generate = (
+            output_logits or output_scores or compute_logprob or n_top_logprobs > 0
+        )
+        # Enable output_scores if we need to compute logprobs. With greedy decoding, the scores
+        # and logits returned by generate are identical, so output_scores is set to True whenever
+        # compute_logprob or n_top_logprobs > 0.
+        if compute_logprob or n_top_logprobs > 0:
+            output_scores = True
+
+        if torch.distributed.is_initialized():
+            if torch.distributed.get_world_size() > 1:
+                torch.distributed.broadcast(
+                    torch.tensor([0], dtype=torch.long, device="cuda"), src=0
+                )
+                broadcast_list(prompts, src=0)
+                broadcast_list(
+                    data=[
+                        temperature,
+                        top_k,
+                        top_p,
+                        num_tokens_to_generate,
+                        output_logits,
+                        output_scores,
+                    ],
+                    src=0,
+                )
+
+        output = self.generate(
+            text_inputs=prompts,
+            do_sample=False,  # do_sample=False for greedy decoding
+            top_k=top_k,
+            top_p=top_p,
+            temperature=temperature,
+            max_new_tokens=num_tokens_to_generate,
+            output_logits=output_logits,
+            output_scores=output_scores,
+            return_dict_in_generate=return_dict_in_generate,
+            echo=echo,
+        )
+
+        if isinstance(output, dict):
+            return output
+        else:
+            return {"sentences": output}
diff --git a/examples/puzzletron/requirements.txt b/examples/puzzletron/requirements.txt
index fe63c413b..0511fb473 100644
--- a/examples/puzzletron/requirements.txt
+++ b/examples/puzzletron/requirements.txt
@@ -1 +1,3 @@
-lm-eval==0.4.9
+lm-eval==0.4.10
+math-verify
+ray
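
As a quick smoke test of the deployment flow above, the endpoint can be queried directly before launching the full MMLU run. This is a minimal sketch, not part of the diff: it assumes the Ray server from the README steps is listening on port 8083 and that the endpoint accepts the standard OpenAI completions request shape.

```python
# Hypothetical smoke test: adjust host/port/model_id if you changed the deploy flags.
import json
import urllib.request

payload = {
    "model": "anymodel-hf",  # matches --model_id from the deploy step
    "prompt": "The capital of France is",
    "max_tokens": 8,
}
req = urllib.request.Request(
    "http://0.0.0.0:8083/v1/completions/",  # matches --port from the deploy step
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))  # expect an OpenAI-style completions response
```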
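The patched deployable can also be driven in-process, bypassing HTTP entirely, which is handy when debugging `_compute_logprobs`. A sketch under stated assumptions: the patched file has been copied over `nemo_deploy/llm/hf_deployable.py` as shown in the README (so the import below resolves to the AnyModel version), a CUDA device is available (`generate` moves inputs to GPU), and `path/to/checkpoint` is a placeholder for a real AnyModel checkpoint.

```python
# Sketch only: exercises ray_infer_fn directly, without Ray Serve in the loop.
from nemo_deploy.llm.hf_deployable import HuggingFaceLLMDeploy

deployer = HuggingFaceLLMDeploy(
    hf_model_id_path="path/to/checkpoint",  # placeholder, as in the README
    trust_remote_code=True,
)

# compute_logprob / n_top_logprobs force output_scores=True internally, so the
# result carries OpenAI-style per-token logprobs alongside the generated text.
result = deployer.ray_infer_fn(
    {
        "prompts": ["The capital of France is"],
        "max_tokens": 8,
        "compute_logprob": True,
        "n_top_logprobs": 3,
    }
)
print(result["sentences"])     # generated completions
print(result["log_probs"])     # per-token logprobs, one list per prompt
print(result["top_logprobs"])  # JSON-encoded top-k alternatives per token
```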