diff --git a/.gitignore b/.gitignore
index 3c7cc70072..a1e9611cc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,7 @@ __pycache__/
 test.py
 # C extensions
 *.so
-
+.claude
 # Distribution / packaging
 .Python
 build/
diff --git a/docs/source_en/Customization/Custom-dataset.md b/docs/source_en/Customization/Custom-dataset.md
index 69d49e4ed0..8a45c8ef3c 100644
--- a/docs/source_en/Customization/Custom-dataset.md
+++ b/docs/source_en/Customization/Custom-dataset.md
@@ -9,7 +9,7 @@ There are three methods for accessing custom datasets, each offering progressive
 
 The following is an introduction to the dataset formats that `AutoPreprocessor` can handle:
 
-The standard dataset format for ms-swift accepts keys such as: 'messages', 'rejected_response', 'label', 'images', 'videos', 'audios', 'tools', and 'objects'. Among these, 'messages' is a required key. 'rejected_response' is used for DPO and other RLHF training, 'label' is used for KTO training and classification model training. The keys 'images', 'videos', and 'audios' are used to store paths or URLs for multimodal data, 'tools' is used for Agent tasks, and 'objects' is used for grounding tasks.
+The standard dataset format for ms-swift accepts keys such as: 'messages', 'rejected_response', 'label', 'images', 'videos', 'audios', 'tensors', 'tools', and 'objects'. Among these, 'messages' is a required key. 'rejected_response' is used for DPO and other RLHF training, 'label' is used for KTO training and classification model training. The keys 'images', 'videos', 'audios', and 'tensors' are used to store paths or URLs for multimodal data, 'tools' is used for Agent tasks, and 'objects' is used for grounding tasks.
 
 There are three core preprocessors in ms-swift: `MessagesPreprocessor`, `AlpacaPreprocessor`, and `ResponsePreprocessor`. `MessagesPreprocessor` is used to convert datasets in the messages and sharegpt format into the standard format. `AlpacaPreprocessor` converts datasets in the alpaca format, while `ResponsePreprocessor` converts datasets in the query/response format. `AutoPreprocessor` automatically selects the appropriate preprocessor for the task.
 
@@ -173,7 +173,9 @@ Please refer to [Reranker training document](../BestPractices/Reranker.md#datase
 
 ### Multimodal
 
-For multimodal datasets, the format is the same as the aforementioned tasks. The difference lies in the addition of several keys: `images`, `videos`, and `audios`, which represent the URLs or paths (preferably absolute paths) of multimodal resources. The tags `<image>`, `<video>`, and `<audio>` indicate where to insert images, videos, or audio. MS-Swift supports multiple images, videos, and audio files. These special tokens will be replaced during preprocessing, as referenced [here](https://github.com/modelscope/ms-swift/blob/main/swift/llm/template/template/qwen.py#L198). The four examples below respectively demonstrate the data format for plain text, as well as formats containing image, video, and audio data.
+For multimodal datasets, the format is the same as the aforementioned tasks. The difference lies in the addition of several keys: `images`, `videos`, `audios`, and `tensors`, which represent the URLs or paths (preferably absolute paths) of multimodal resources. The tags `<image>`, `<video>`, `<audio>`, and `<tensor>` indicate where to insert images, videos, audio, or tensor data. MS-Swift supports multiple images, videos, audio files, and tensors. These special tokens will be replaced during preprocessing, as referenced [here](https://github.com/modelscope/ms-swift/blob/main/swift/llm/template/template/qwen.py#L198). The examples below demonstrate the data format for plain text, as well as formats containing image, video, audio, and tensor data.
+
+> **Note**: For detailed information about tensor support, including supported formats, use cases, and examples, see the [Tensor Support Documentation](Tensor-support.md).
 
 
 Pre-training:
@@ -182,6 +184,7 @@ Pre-training:
 {"messages": [{"role": "assistant", "content": "<image>is a puppy, <image>is a kitten"}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
 {"messages": [{"role": "assistant", "content": "<audio>describes how nice the weather is today"}], "audios": ["/xxx/x.wav"]}
 {"messages": [{"role": "assistant", "content": "<image>is an elephant, <video>is a lion running"}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
+{"messages": [{"role": "assistant", "content": "<tensor>represents a normal medical scan"}], "tensors": ["/xxx/scan.pt"]}
 ```
 
 Supervised Fine-tuning:
@@ -191,6 +194,7 @@ Supervised Fine-tuning:
 {"messages": [{"role": "user", "content": "<image><image>What is the difference between the two images?"}, {"role": "assistant", "content": "The first one is a kitten, and the second one is a puppy."}], "images": ["/xxx/x.jpg", "/xxx/x.png"]}
 {"messages": [{"role": "user", "content": "<audio>What did the audio say?"}, {"role": "assistant", "content": "The weather is really nice today."}], "audios": ["/xxx/x.mp3"]}
 {"messages": [{"role": "system", "content": "You are a helpful and harmless assistant."}, {"role": "user", "content": "<image>What is in the image, <video>What is in the video?"}, {"role": "assistant", "content": "The image shows an elephant, and the video shows a puppy running on the grass."}], "images": ["/xxx/x.jpg"], "videos": ["/xxx/x.mp4"]}
+{"messages": [{"role": "user", "content": "<tensor>Generate a report for this medical scan"}, {"role": "assistant", "content": "The scan shows normal cardiac function with no abnormalities detected."}], "tensors": ["/xxx/cardiac_scan.pt"]}
 ```
 - Note: The following fields will be automatically converted to the corresponding images, videos, and audios fields.
   - images: image, images.
@@ -199,7 +203,7 @@ Supervised Fine-tuning:
 - If you need to pass base64 data instead of file paths, here are sample examples: `"videos": ['data:video/mp4;base64,{base64_encoded}']`, `"images": ['data:image/jpg;base64,{base64_encoded}']`.
 - If you wish to directly pass in video frames instead of a video file, you can use the following format (requires `ms-swift>=3.8.3`): `"videos": [["/xxx/x.png", "/xxx/y.png"], ["/xxx/a.png", "/xxx/b.png", "/xxx/c.png"]]`. This format is supported only by certain models, including Qwen2/2.5/3-VL, Qwen2.5/3-Omni, and their derivative models.
 
-The data format for RLHF and sequence classification of multimodal models can reference the format of pure text large models, with additional fields such as `images` added on top of that.
+The data format for RLHF and sequence classification of multimodal models can reference the format of pure text large models, with additional fields such as `images` and `tensors` added on top of that.
 
 #### Grounding
 
diff --git a/docs/source_en/Customization/Tensor-support.md b/docs/source_en/Customization/Tensor-support.md
new file mode 100644
index 0000000000..ccdd50a7ed
--- /dev/null
+++ b/docs/source_en/Customization/Tensor-support.md
@@ -0,0 +1,371 @@
+# Tensor Input Support
+
+MS-Swift supports loading and processing tensor files (`.pt` format) as an input modality, similar to how images, videos, and audio are handled. This feature is particularly useful for specialized applications such as:
+
+- Medical imaging and report generation (e.g., loading pre-processed scan tensors)
+- Scientific data visualization
+- Custom feature representations
+- Pre-computed embeddings or feature maps (stored in one of the image-like shapes listed under "Supported Tensor Formats")
+
+## Overview
+
+The tensor support follows the same pattern as other multimodal inputs in MS-Swift:
+
+- Use `<tensor>` tags in messages (similar to `<image>`, `<video>`, `<audio>`)
+- Provide tensor file paths via the `tensors` parameter
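+- Tensors are converted to PIL Images for model processing by templates that call `replace_tensor2image` (see "Custom Template with Tensor-to-Image Conversion"); the base `Template.replace_tag` only returns the `<tensor>` placeholder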
+- Supports both single tensors and batched tensors
+
+## Supported Tensor Formats
+
+The tensor loader supports various tensor shapes:
+
+| Tensor Shape | Description | Output |
+|--------------|-------------|--------|
+| `(C, H, W)` | Single image tensor (e.g., `(3, 224, 224)` for RGB) | 1 PIL Image |
+| `(B, C, H, W)` | Batch of image tensors (e.g., `(4, 3, 224, 224)`) | B PIL Images |
+| `(1, H, W)` | Single channel (grayscale) tensor | 1 PIL Image (grayscale) |
+| `(H, W)` | 2D tensor (grayscale) | 1 PIL Image (grayscale) |
+
+**Supported channel formats:**
+- RGB: 3 channels `(3, H, W)`
+- Grayscale: 1 channel `(1, H, W)` or 2D `(H, W)`
+
+**Value ranges:**
+- Tensors whose maximum value is `<= 1` are treated as `[0, 1]` data and scaled to `[0, 255]`
+- All other tensors are assumed to already be in `[0, 255]` and are used directly
+- All tensors are converted to uint8 format
+
+## Usage Examples
+
+### Basic Usage
+
+#### Method 1: Using `tensors` parameter
+
+```python
+from swift.llm import InferRequest
+
+request = InferRequest(
+    messages=[
+        {
+            "role": "user",
+            "content": "Analyze this tensor data: <tensor>"
+        }
+    ],
+    tensors=["path/to/tensor.pt"]
+)
+```
+
+#### Method 2: Using content with tensor type
+
+```python
+from swift.llm import InferRequest
+
+request = InferRequest(
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"type": "tensor", "tensor": "path/to/tensor.pt"},
+                {"type": "text", "text": "Analyze this tensor data."}
+            ]
+        }
+    ]
+)
+```
+
+### Multiple Tensors
+
+```python
+request = InferRequest(
+    messages=[
+        {
+            "role": "user",
+            "content": "Compare these tensors: <tensor><tensor>"
+        }
+    ],
+    tensors=["tensor1.pt", "tensor2.pt"]
+)
+```
+
+### Mixed Media Types
+
+Tensors can be combined with images, videos, and audios:
+
+```python
+request = InferRequest(
+    messages=[
+        {
+            "role": "user",
+            "content": "Here's an image: <image> and tensor data: <tensor>"
+        }
+    ],
+    images=["photo.jpg"],
+    tensors=["data.pt"]
+)
+```
+
+## Dataset Format
+
+### Training Data Format
+
+Tensors follow the same dataset format as other multimodal data:
+
+#### Supervised Fine-tuning
+
+```jsonl
+{"messages": [{"role": "user", "content": "<tensor>Analyze this medical scan"}, {"role": "assistant", "content": "This scan shows..."}], "tensors": ["/path/to/scan.pt"]}
+{"messages": [{"role": "user", "content": "Compare <tensor><tensor>"}, {"role": "assistant", "content": "The first tensor shows..."}], "tensors": ["/path/to/tensor1.pt", "/path/to/tensor2.pt"]}
+```
+
+#### Pre-training
+
+```jsonl
+{"messages": [{"role": "assistant", "content": "<tensor> represents a normal cardiac scan"}], "tensors": ["/path/to/cardiac_scan.pt"]}
+```
+
+#### RLHF (DPO/ORPO/CPO/SimPO)
+
+```jsonl
+{"messages": [{"role": "user", "content": "<tensor>What does this show?"}, {"role": "assistant", "content": "This is a detailed analysis..."}], "tensors": ["/path/to/scan.pt"], "rejected_response": "I don't know"}
+```
+
+#### Mixed Modality
+
+```jsonl
+{"messages": [{"role": "user", "content": "<image><tensor>Compare the image and tensor"}, {"role": "assistant", "content": "The image shows... while the tensor indicates..."}], "images": ["/path/to/image.jpg"], "tensors": ["/path/to/tensor.pt"]}
+```
+
+### Command Line Usage
+
+Use tensors directly with command line parameters:
+
+```bash
+# Training with tensor dataset
+swift sft \
+    --model Qwen/Qwen2-VL-2B-Instruct \
+    --dataset /path/to/tensor_dataset.jsonl \
+    --max_length 2048
+
+# The dataset should contain 'tensors' field
+# Example: {"messages": [...], "tensors": ["tensor.pt"]}
+```
+
+## Creating Tensor Files
+
+### Example: Saving Tensors
+
+```python
+import torch
+
+# Create a sample RGB tensor (simulating image data)
+tensor = torch.randn(3, 224, 224)
+
+# Normalize to [0, 1] range
+tensor = torch.clamp((tensor + 1) / 2, 0, 1)
+
+# Save the tensor
+torch.save(tensor, "sample_tensor.pt")
+```
+
+### Example: Batch Tensor
+
+```python
+import torch
+
+# Create a batch of tensors
+batch_tensor = torch.randn(4, 3, 224, 224)
+batch_tensor = torch.clamp((batch_tensor + 1) / 2, 0, 1)
+
+torch.save(batch_tensor, "batch_tensor.pt")
+```
+
+### Example: Medical Imaging Data
+
+```python
+import torch
+import numpy as np
+from PIL import Image
+
+# Load a medical scan (e.g., from DICOM, NIfTI, etc.)
+# Assuming you have a numpy array from your medical imaging library
+scan_array = np.load("medical_scan.npy")  # Shape: (H, W) or (H, W, C)
+
+# Convert to tensor
+if len(scan_array.shape) == 2:  # Grayscale
+    tensor = torch.from_numpy(scan_array).unsqueeze(0)  # Add channel dim
+elif len(scan_array.shape) == 3:  # Multi-channel
+    tensor = torch.from_numpy(scan_array).permute(2, 0, 1)  # HWC -> CHW
+
+# Normalize to [0, 1] if needed
+tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())
+
+# Save for use with MS-Swift
+torch.save(tensor, "medical_scan.pt")
+```
+
+## Custom Template with Tensor-to-Image Conversion
+
+For advanced use cases, you can create custom templates that handle tensor conversion:
+
+```python
+from typing import List, Literal
+from swift.llm.template.base import Template, Context
+from swift.llm.template.template_inputs import StdTemplateInputs
+from swift.llm.template.vision_utils import load_tensor
+
+
+class TensorToImageTemplate(Template):
+    """Custom template that converts tensors to images."""
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio', 'tensor'],
+                    index: int, inputs: StdTemplateInputs) -> List[Context]:
+        if media_type == 'tensor':
+            # Convert tensor to images using built-in method
+            return self.replace_tensor2image(
+                load_tensor,
+                inputs,
+                lambda i: self.image_placeholder
+            )
+        else:
+            return super().replace_tag(media_type, index, inputs)
+```
+
+## Loading Tensors Programmatically
+
+```python
+from swift.llm.template.vision_utils import load_tensor
+
+# Load a tensor from file
+tensor = load_tensor("path/to/tensor.pt")
+
+# Load from URL
+tensor = load_tensor("https://example.com/tensor.pt")
+
+# The loaded tensor is always on CPU
+print(f"Tensor shape: {tensor.shape}")
+print(f"Tensor device: {tensor.device}")  # cpu
+```
+
+## Technical Details
+
+### Tensor Loading Pipeline
+
+1. **Load**: Tensor file is loaded using `torch.load()` with `map_location='cpu'`
+2. **Validate**: Checks that the loaded object is a `torch.Tensor`
+3. **Convert**: Tensor is converted to PIL Image(s) using `_tensor_to_images()`
+4. **Process**: Images are processed like regular image inputs
+
+### Conversion Process
+
+The tensor-to-image conversion follows these steps:
+
+1. **Shape Detection**: Determines if tensor is 2D, 3D, or 4D
+2. **Batch Handling**: For 4D tensors, each batch item is converted separately
+3. **Channel Formatting**: Converts from `(C, H, W)` to `(H, W, C)` for RGB
+4. **Normalization**: Values in `[0, 1]` are scaled to `[0, 255]`
+5. **Type Casting**: Converts to `uint8` format
+6. **PIL Conversion**: Creates PIL Images (RGB or grayscale mode)
+
+### Integration with Template System
+
+Tensors are integrated into the template system:
+
+- **Special Token**: `<tensor>` tag in messages
+- **Placeholder**: `Template.tensor_placeholder = ['<tensor>']`
+- **Index Tracking**: `inputs.tensor_idx` tracks current tensor position
+- **Multimodal Detection**: Tensors contribute to `inputs.is_multimodal`
+
+## Use Cases
+
+### Medical Report Generation
+
+```python
+# Training dataset for medical report generation
+dataset = [
+    {
+        "messages": [
+            {"role": "user", "content": "<tensor>Generate a report for this scan"},
+            {"role": "assistant", "content": "Findings: Normal cardiac function..."}
+        ],
+        "tensors": ["scans/patient001_cardiac.pt"]
+    },
+    # ... more examples
+]
+```
+
+### Scientific Data Analysis
+
+```python
+# Analyzing scientific measurements
+request = InferRequest(
+    messages=[
+        {
+            "role": "user",
+            "content": "<tensor>Analyze this spectral data and identify anomalies"
+        }
+    ],
+    tensors=["spectra/sample_001.pt"]
+)
+```
+
+### Feature Representation Learning
+
+```python
+# Using pre-computed features
+request = InferRequest(
+    messages=[
+        {
+            "role": "user",
+            "content": "<tensor>Classify this feature representation"
+        }
+    ],
+    tensors=["features/embedding_vector.pt"]
+)
+```
+
+## Limitations
+
+1. **Format**: Only `.pt` files (PyTorch tensor format) are supported; because `torch.load()` unpickles its input, load tensors only from trusted files and URLs
+2. **Channels**: Limited to 1 (grayscale) or 3 (RGB) channels
+3. **Dimensions**: Tensors must be 2D, 3D, or 4D
+4. **Memory**: Large batch tensors may consume significant memory
+5. **Conversion**: Tensors are always converted to images for model processing
+
+## Troubleshooting
+
+### Common Issues
+
+**Issue**: `ValueError: Expected a torch.Tensor, but got <type>`
+- **Solution**: Ensure the `.pt` file contains a PyTorch tensor, not other objects
+
+**Issue**: `ValueError: Unsupported tensor shape: torch.Size([...])`
+- **Solution**: Reshape your tensor to supported formats (2D, 3D, or 4D)
+
+**Issue**: `ValueError: Unsupported number of channels: X`
+- **Solution**: Convert to 1 (grayscale) or 3 (RGB) channels
+
+**Issue**: Images appear too dark/bright
+- **Solution**: Normalize your tensor values to [0, 1] range before saving
+
+### Debugging
+
+Enable debug mode to see tensor processing details:
+
+```bash
+export SWIFT_DEBUG=1
+python your_script.py
+```
+
+## References
+
+- [Custom Dataset Documentation](Custom-dataset.md)
+- [Multimodal Training Best Practices](../BestPractices/Rapidly-Training-VL-model.md)
+- [Template System Documentation](../Instruction/Template.md)
+
+## Example Scripts
+
+Complete example scripts are available in the `examples/` directory:
+
+- `examples/tensor_example.py`: Basic tensor loading and usage
+- `examples/tensor_template_example.py`: Custom template implementation
diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py
index 11123841bc..65c5ae088d 100644
--- a/swift/llm/dataset/preprocessor/core.py
+++ b/swift/llm/dataset/preprocessor/core.py
@@ -20,7 +20,7 @@
 
 logger = get_logger()
 
-_pair_keys = ['messages', 'images', 'videos', 'audios', 'tools', 'objects']
+_pair_keys = ['messages', 'images', 'videos', 'audios', 'tensors', 'tools', 'objects']
 
 
 class RowPreprocessor:
@@ -45,7 +45,8 @@ def __init__(self,
         images_keys = ['images', 'image']
         audios_keys = ['audios', 'audio']
         videos_keys = ['videos', 'video']
-        for mm_type in ['images', 'audios', 'videos']:
+        tensors_keys = ['tensors', 'tensor']
+        for mm_type in ['images', 'audios', 'videos', 'tensors']:
             keys = locals()[f'{mm_type}_keys']
             for key in keys:
                 self.columns[key] = mm_type
diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
index fce20eb7d2..fbdcdd5af2 100644
--- a/swift/llm/template/base.py
+++ b/swift/llm/template/base.py
@@ -28,7 +28,7 @@
 from ..utils import Processor, ProcessorMixin
 from .template_inputs import InferRequest, StdTemplateInputs, TemplateInputs
 from .utils import Context, ContextType, StopWordsCriteria, fetch_one, findall, split_str_parts_by
-from .vision_utils import load_audio, load_batch, load_image, rescale_image
+from .vision_utils import load_audio, load_batch, load_image, load_tensor, rescale_image
 
 logger = get_logger()
 if TYPE_CHECKING:
@@ -40,12 +40,13 @@ class MaxLengthError(ValueError):
 
 
 class Template(ProcessorMixin):
-    special_tokens = ['<image>', '<video>', '<audio>', '<bbox>', '<ref-object>', '<cot-process>', '<start-image>']
-    special_keys = ['images', 'videos', 'audios', 'objects']
+    special_tokens = ['<image>', '<video>', '<audio>', '<tensor>', '<bbox>', '<ref-object>', '<cot-process>', '<start-image>']
+    special_keys = ['images', 'videos', 'audios', 'tensors', 'objects']
 
     image_placeholder = ['<image>']
     video_placeholder = ['<video>']
     audio_placeholder = ['<audio>']
+    tensor_placeholder = ['<tensor>']
     cot_process_placeholder = ['ки']
     placeholder_tokens = []  # For clearer printing
     load_images = True
@@ -763,7 +764,7 @@ def _split_special_tokens(context_list: List[Context],
     def _tokenize(self, context, **kwargs):
         return self.tokenizer(context, return_attention_mask=False, add_special_tokens=False, **kwargs)['input_ids']
 
-    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio', 'tensor'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:
         """Override this function to do your own replace operation.
 
@@ -794,6 +795,8 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int
                 return self.video_placeholder
         elif media_type == 'audio':
             return self.audio_placeholder
+        elif media_type == 'tensor':
+            return self.tensor_placeholder
 
     def replace_ref(self, ref: str, index: int, inputs: StdTemplateInputs) -> List[Context]:
         """Replace objects referenced by the bbox to contents or input_ids. This is useful in the grounding task.
@@ -880,11 +883,11 @@ def _pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float
         res_loss_scale: List[float] = []  # result of loss_scale_list
 
         # reset
-        for k in ['video', 'audio', 'object', 'box']:
+        for k in ['video', 'audio', 'tensor', 'object', 'box']:
             setattr(inputs, f'{k}_idx', 0)
 
         for context, loss_scale in zip(context_list, loss_scale_list):
-            for k in ['video', 'audio']:
+            for k in ['video', 'audio', 'tensor']:
                 if context == f'<{k}>' and inputs.is_multimodal and getattr(inputs, f'{k}_idx') < len(
                         getattr(inputs, f'{k}s')):
                     c_list = self.replace_tag(k, getattr(inputs, f'{k}_idx'), inputs)
@@ -1320,6 +1323,84 @@ def replace_video2image(self, load_video_func, inputs, replace_tag: Callable) ->
         inputs.image_idx += len(new_images)
         return context_list
 
+    def replace_tensor2image(self, load_tensor_func, inputs, replace_tag: Callable) -> List[Context]:
+        """Expand the current `<tensor>` entry into image inputs.
+
+        The tensor at `inputs.tensor_idx` is loaded with `load_tensor_func`, turned into PIL images,
+        and spliced into `inputs.images` at `inputs.image_idx`; `replace_tag(i)` supplies the
+        context for the i-th new image. Follows the same pattern as `replace_video2image`.
+        """
+        if self.mode in {'vllm', 'lmdeploy'}:
+            # The entry is removed from the list, so the index is stepped back
+            # (presumably to offset the caller's increment -- confirm against `_pre_tokenize`).
+            tensor_path = inputs.tensors.pop(inputs.tensor_idx)
+            inputs.tensor_idx -= 1
+        else:
+            tensor_path = inputs.tensors[inputs.tensor_idx]
+        new_images = self._tensor_to_images(load_tensor_func(tensor_path))
+        insert_at = inputs.image_idx
+        inputs.images = inputs.images[:insert_at] + new_images + inputs.images[insert_at:]
+
+        context_list = []
+        for image_no, _ in enumerate(new_images):
+            context_list += replace_tag(image_no)
+        inputs.image_idx += len(new_images)
+        return context_list
+
+    def _tensor_to_images(self, tensor: torch.Tensor) -> List[Image.Image]:
+        """Convert an image-like tensor to a list of PIL Images.
+
+        Supported layouts, selected by `tensor.dim()`:
+            4 -> `(B, C, H, W)`: one image per batch item.
+            3 -> `(C, H, W)`: a single image.
+            2 -> `(H, W)`: a single grayscale image (a channel axis is prepended).
+
+        Args:
+            tensor: The tensor to convert.
+
+        Returns:
+            The converted images, in batch order.
+
+        Raises:
+            ValueError: If the tensor is not 2-, 3- or 4-dimensional.
+        """
+        ndim = tensor.dim()
+        if ndim == 4:
+            return [self._single_tensor_to_image(item) for item in tensor]
+        if ndim in (2, 3):
+            return [self._single_tensor_to_image(tensor if ndim == 3 else tensor.unsqueeze(0))]
+        raise ValueError(f"Unsupported tensor shape: {tensor.shape}")
+
+    def _single_tensor_to_image(self, tensor: torch.Tensor) -> Image.Image:
+        """Convert one `(3, H, W)` or `(1, H, W)` tensor to an RGB or grayscale PIL Image.
+
+        Data whose maximum is <= 1 is treated as `[0, 1]` and scaled by 255. Values are clipped
+        to `[0, 255]` before the uint8 cast so out-of-range inputs saturate instead of wrapping.
+
+        Raises:
+            ValueError: If the leading (channel) dimension is neither 1 nor 3.
+        """
+        from PIL import Image
+        import numpy as np
+
+        # detach(): `.numpy()` refuses tensors that require grad.
+        tensor = tensor.detach().cpu()
+        if tensor.dtype == torch.bfloat16:
+            tensor = tensor.float()  # NumPy cannot represent bfloat16
+
+        if tensor.shape[0] == 3:  # RGB: (C, H, W) -> (H, W, C)
+            img_array = tensor.permute(1, 2, 0).numpy()
+        elif tensor.shape[0] == 1:  # Grayscale: (1, H, W) -> (H, W)
+            img_array = tensor.squeeze(0).numpy()
+        else:
+            raise ValueError(f"Unsupported number of channels: {tensor.shape[0]}")
+
+        # Multiplying by a float scale also promotes integer data, so the clip bounds always fit.
+        scale = 255.0 if img_array.max() <= 1.0 else 1.0
+        img_array = np.clip(img_array * scale, 0, 255).astype(np.uint8)
+
+        return Image.fromarray(img_array, mode='L' if img_array.ndim == 2 else 'RGB')
+
     def get_generate_ids(self, generate_ids: Union[torch.Tensor, List[int]],
                          num_prompt_tokens: int) -> Union[torch.Tensor, List[int]]:
         if self.skip_prompt:
diff --git a/swift/llm/template/template_inputs.py b/swift/llm/template/template_inputs.py
index 4c4ea31f2e..43757116cb 100644
--- a/swift/llm/template/template_inputs.py
+++ b/swift/llm/template/template_inputs.py
@@ -26,7 +26,7 @@ class InferRequest:
                     "role": "user",
                     "content": [
                         {
-                            "type": "image",  # can also be audio/video
+                            "type": "image",  # can also be audio/video/tensor
                             "image": "<url/path/base64/PIL.Image>",
                         },
                         {"type": "text", "text": "Please describe the picture."},
@@ -59,12 +59,13 @@ class InferRequest:
     images: List[Union[str, Image.Image]] = field(default_factory=list)
     audios: List[str] = field(default_factory=list)
     videos: List[str] = field(default_factory=list)
+    tensors: List[str] = field(default_factory=list)
 
     tools: Optional[List[Tool]] = None
     objects: Dict[str, List[Any]] = field(default_factory=dict)
 
     def __post_init__(self):
-        for key in ['images', 'audios', 'videos']:
+        for key in ['images', 'audios', 'videos', 'tensors']:
             val = getattr(self, key)
             if isinstance(val, str):
                 setattr(self, key, [val])
@@ -144,6 +145,7 @@ class StdTemplateInputs:
     images: List[Union[str, Image.Image]] = field(default_factory=list)
     videos: List[str] = field(default_factory=list)
     audios: List[str] = field(default_factory=list)
+    tensors: List[str] = field(default_factory=list)
     objects: Dict[str, List[Any]] = field(default_factory=dict)
 
     margin: Optional[float] = None  # for reward modeling
@@ -156,6 +158,7 @@ def __post_init__(self):
         self.image_idx = 0
         self.audio_idx = 0
         self.video_idx = 0
+        self.tensor_idx = 0
         self.ref_idx = 0
         self.bbox_idx = 0
         if self.images and not isinstance(self.images, (list, tuple)):
@@ -164,6 +167,8 @@ def __post_init__(self):
             self.videos = [self.videos]
         if self.audios and not isinstance(self.audios, (list, tuple)):
             self.audios = [self.audios]
+        if self.tensors and not isinstance(self.tensors, (list, tuple)):
+            self.tensors = [self.tensors]
         if self.rejected_response:
             assert isinstance(self.rejected_response, list) and all(
                 isinstance(item, str) for item in self.rejected_response)
@@ -175,7 +180,7 @@ def to_history(self):
 
     @property
     def is_multimodal(self):
-        return bool(self.images or self.audios or self.videos or self.objects)
+        return bool(self.images or self.audios or self.videos or self.tensors or self.objects)
 
     @classmethod
     def from_dict(cls, inputs: Dict[str, Any]) -> 'StdTemplateInputs':
@@ -226,7 +231,7 @@ def from_dict(cls, inputs: Dict[str, Any]) -> 'StdTemplateInputs':
 
     @staticmethod
     def remove_messages_media(messages: Messages) -> Dict[str, Any]:
-        res = {'images': [], 'audios': [], 'videos': []}
+        res = {'images': [], 'audios': [], 'videos': [], 'tensors': [], 'rejected_images': []}
         for message in messages:
             content = message['content']
             if isinstance(content, str):
diff --git a/swift/llm/template/vision_utils.py b/swift/llm/template/vision_utils.py
index e967fb8c3b..017599c514 100644
--- a/swift/llm/template/vision_utils.py
+++ b/swift/llm/template/vision_utils.py
@@ -309,6 +309,29 @@ def load_video_ovis2(video_path, num_frames):
     return frames
 
 
+def load_tensor(tensor_path: Union[str, bytes]) -> torch.Tensor:
+    """Load a tensor from a .pt file.
+
+    Args:
+        tensor_path: Path to the .pt file, can be a URL, local file path, or bytes
+
+    Returns:
+        The loaded tensor
+
+    Raises:
+        ValueError: If the loaded object is not a `torch.Tensor`.
+    """
+    tensor = load_file(tensor_path)
+    if isinstance(tensor, BytesIO):
+        # The source may be a remote URL, so restrict unpickling to tensor data
+        # (weights_only=True) rather than executing arbitrary pickled objects.
+        tensor = torch.load(tensor, map_location='cpu', weights_only=True)
+    # Otherwise `load_file` returned a non-BytesIO object, which is used as-is.
+    if not isinstance(tensor, torch.Tensor):
+        raise ValueError(f"Expected a torch.Tensor, but got {type(tensor)}")
+    return tensor
+
+
 def load_video_ovis2_5(video_path, num_frames):
     from moviepy.editor import VideoFileClip
     with VideoFileClip(video_path) as clip:
diff --git a/tests/test_align/test_template/test_tensor.py b/tests/test_align/test_template/test_tensor.py
new file mode 100644
index 0000000000..7069a10297
--- /dev/null
+++ b/tests/test_align/test_template/test_tensor.py
@@ -0,0 +1,305 @@
+import os
+import tempfile
+from typing import List
+
+import torch
+from PIL import Image
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # expose only CUDA device 0 to this process
+os.environ['SWIFT_DEBUG'] = '1'  # NOTE(review): presumably turns on swift debug output — confirm
+
+
+def test_load_tensor():
+    """Round-trip a (3, 224, 224) tensor through torch.save and load_tensor."""
+    from swift.llm.template.vision_utils import load_tensor
+
+    sample_tensor = torch.randn(3, 224, 224)
+
+    # Only reserve a path here; the handle is closed on leaving the `with` so the
+    # file can be reopened by name (required on Windows) and unlinked afterwards.
+    with tempfile.NamedTemporaryFile(suffix='.pt', delete=False) as tmp_file:
+        tensor_path = tmp_file.name
+
+    try:
+        torch.save(sample_tensor, tensor_path)
+        loaded_tensor = load_tensor(tensor_path)
+        assert isinstance(loaded_tensor, torch.Tensor), "Loaded object is not a tensor"
+        assert torch.allclose(sample_tensor, loaded_tensor), "Tensor values don't match"
+        assert loaded_tensor.shape == (3, 224, 224), f"Unexpected shape: {loaded_tensor.shape}"
+        print("✓ test_load_tensor passed")
+    finally:
+        os.unlink(tensor_path)
+
+
+def test_load_batch_tensor():
+    """Round-trip a batched (4, 3, 64, 64) tensor through torch.save and load_tensor."""
+    from swift.llm.template.vision_utils import load_tensor
+
+    batch_tensor = torch.randn(4, 3, 64, 64)
+
+    # Only reserve a path here; the handle is closed on leaving the `with` so the
+    # file can be reopened by name (required on Windows) and unlinked afterwards.
+    with tempfile.NamedTemporaryFile(suffix='.pt', delete=False) as tmp_file:
+        tensor_path = tmp_file.name
+
+    try:
+        torch.save(batch_tensor, tensor_path)
+        loaded_tensor = load_tensor(tensor_path)
+        assert isinstance(loaded_tensor, torch.Tensor), "Loaded object is not a tensor"
+        assert torch.allclose(batch_tensor, loaded_tensor), "Tensor values don't match"
+        assert loaded_tensor.shape == (4, 3, 64, 64), f"Unexpected shape: {loaded_tensor.shape}"
+        print("✓ test_load_batch_tensor passed")
+    finally:
+        os.unlink(tensor_path)
+
+
+def test_tensor_to_images_single():
+    """A (3, 64, 64) tensor should yield exactly one 64x64 RGB PIL image."""
+    from swift.llm.template.base import Template
+
+    # Template built with processor=None / template_meta=None; only _tensor_to_images is called.
+    converter = Template(processor=None, template_meta=None)
+    rgb = torch.rand(3, 64, 64)
+
+    images = converter._tensor_to_images(rgb)
+
+    # Validate the container before indexing into it.
+    assert isinstance(images, list), "Output should be a list"
+    assert len(images) == 1, f"Expected 1 image, got {len(images)}"
+
+    first = images[0]
+    assert isinstance(first, Image.Image), "Output should contain PIL Images"
+    assert first.size == (64, 64), f"Unexpected image size: {first.size}"
+    assert first.mode == 'RGB', f"Unexpected image mode: {first.mode}"
+    print("✓ test_tensor_to_images_single passed")
+
+
+def test_tensor_to_images_batch():
+    """A (4, 3, 64, 64) batch should yield four 64x64 RGB PIL images."""
+    from swift.llm.template.base import Template
+
+    # Template built with processor=None / template_meta=None; only _tensor_to_images is called.
+    converter = Template(processor=None, template_meta=None)
+    batch = torch.rand(4, 3, 64, 64)
+
+    images = converter._tensor_to_images(batch)
+
+    assert isinstance(images, list), "Output should be a list"
+    assert len(images) == 4, f"Expected 4 images, got {len(images)}"
+
+    # Per-image checks: type first, then size and mode.
+    assert all(isinstance(im, Image.Image) for im in images), "All outputs should be PIL Images"
+    assert not any(im.size != (64, 64) for im in images), "All images should have size (64, 64)"
+    assert not any(im.mode != 'RGB' for im in images), "All images should be RGB"
+    print("✓ test_tensor_to_images_batch passed")
+
+
+def test_tensor_to_images_grayscale():
+    """A single-channel (1, 64, 64) tensor should become one 64x64 image in mode 'L'."""
+    from swift.llm.template.base import Template
+
+    converter = Template(processor=None, template_meta=None)
+    gray = torch.rand(1, 64, 64)
+
+    images = converter._tensor_to_images(gray)
+
+    # Validate the container before indexing into it.
+    assert isinstance(images, list), "Output should be a list"
+    assert len(images) == 1, f"Expected 1 image, got {len(images)}"
+
+    only = images[0]
+    assert isinstance(only, Image.Image), "Output should contain PIL Images"
+    assert only.size == (64, 64), f"Unexpected image size: {only.size}"
+    assert only.mode == 'L', f"Expected grayscale (L) mode, got: {only.mode}"
+    print("✓ test_tensor_to_images_grayscale passed")
+
+
+def test_tensor_to_images_2d():
+    """A channel-less (64, 64) tensor should become one 64x64 image in mode 'L'."""
+    from swift.llm.template.base import Template
+
+    converter = Template(processor=None, template_meta=None)
+    plane = torch.rand(64, 64)
+
+    images = converter._tensor_to_images(plane)
+
+    # Validate the container before indexing into it.
+    assert isinstance(images, list), "Output should be a list"
+    assert len(images) == 1, f"Expected 1 image, got {len(images)}"
+
+    only = images[0]
+    assert isinstance(only, Image.Image), "Output should contain PIL Images"
+    assert only.size == (64, 64), f"Unexpected image size: {only.size}"
+    assert only.mode == 'L', f"Expected grayscale (L) mode, got: {only.mode}"
+    print("✓ test_tensor_to_images_2d passed")
+
+
+def test_tensor_normalization():
+    """Check that converted images only contain pixel values within [0, 255]."""
+    from swift.llm.template.base import Template
+    import numpy as np
+
+    converter = Template(processor=None, template_meta=None)
+
+    def pixels_of(tensor):
+        # First converted image, viewed as a numpy array.
+        return np.array(converter._tensor_to_images(tensor)[0])
+
+    # Input drawn from torch.rand, i.e. values in [0, 1).
+    full_range = pixels_of(torch.rand(3, 32, 32))
+    assert full_range.min() >= 0 and full_range.max() <= 255, "Image values should be in [0, 255]"
+
+    # Same draw scaled by 0.5, i.e. values in [0, 0.5).
+    half_range = pixels_of(torch.rand(3, 32, 32) * 0.5)
+    assert half_range.min() >= 0 and half_range.max() <= 255, "Image values should be in [0, 255]"
+
+    print("✓ test_tensor_normalization passed")
+
+
+def test_template_inputs_with_tensors():
+    """StdTemplateInputs should accept `tensors` given as a list or as a single string."""
+    from swift.llm.template.template_inputs import StdTemplateInputs
+
+    # Case 1: tensors passed as a list.
+    prompt = [{"role": "user", "content": "Analyze this: <tensor>"}]
+    listed = StdTemplateInputs(messages=prompt, tensors=["path/to/tensor.pt"])
+
+    # tensor_idx must exist on the instance and start at 0.
+    assert hasattr(listed, 'tensor_idx'), "tensor_idx should be initialized"
+    assert listed.tensor_idx == 0, f"tensor_idx should be 0, got {listed.tensor_idx}"
+
+    # Having tensors must be enough for is_multimodal to be truthy.
+    assert listed.is_multimodal, "Should be detected as multimodal with tensors"
+
+    # Case 2: a bare string is expected to come back as a one-element list.
+    single = StdTemplateInputs(
+        messages=[{"role": "user", "content": "<tensor>"}],
+        tensors="single_tensor.pt",
+    )
+
+    wrapped = single.tensors
+    assert isinstance(wrapped, list), "Single tensor should be converted to list"
+    assert len(wrapped) == 1, f"Expected 1 tensor, got {len(wrapped)}"
+
+    print("✓ test_template_inputs_with_tensors passed")
+
+
+def test_infer_request_with_tensors():
+    """InferRequest should take tensors via the `tensors` field or via typed message content."""
+    from swift.llm.template.template_inputs import InferRequest
+
+    # Method 1: explicit `tensors` list alongside a <tensor> tag in the text.
+    tagged = [{"role": "user", "content": "Analyze this: <tensor>"}]
+    request1 = InferRequest(
+        messages=tagged,
+        tensors=["tensor1.pt", "tensor2.pt"],
+    )
+    assert len(request1.tensors) == 2, f"Expected 2 tensors, got {len(request1.tensors)}"
+
+    # Method 2: the tensor travels as a typed part of the message content.
+    # Only the type of `messages` is checked for this request.
+    typed_parts = [
+        {"type": "tensor", "tensor": "tensor.pt"},
+        {"type": "text", "text": "Analyze this."},
+    ]
+    request2 = InferRequest(
+        messages=[{"role": "user", "content": typed_parts}],
+    )
+    assert isinstance(request2.messages, list), "Messages should be a list"
+
+    # A bare string for `tensors` is expected to come back as a list.
+    request3 = InferRequest(
+        messages=[{"role": "user", "content": "<tensor>"}],
+        tensors="single.pt",
+    )
+    assert isinstance(request3.tensors, list), "Single tensor should be converted to list"
+
+    print("✓ test_infer_request_with_tensors passed")
+
+
+def test_mixed_media():
+    """Tensors should coexist with images and videos in one StdTemplateInputs."""
+    from swift.llm.template.template_inputs import StdTemplateInputs
+
+    # One item per modality, each with a matching tag in the prompt.
+    media = {'images': ["image.jpg"], 'tensors': ["tensor.pt"], 'videos': ["video.mp4"]}
+    prompt = [{"role": "user", "content": "<image><tensor><video>"}]
+    inputs = StdTemplateInputs(messages=prompt, **media)
+
+    assert inputs.is_multimodal, "Should be multimodal"
+
+    # Every modality must still hold the single entry it was given.
+    assert len(inputs.images) == 1, "Should have 1 image"
+    assert len(inputs.tensors) == 1, "Should have 1 tensor"
+    assert len(inputs.videos) == 1, "Should have 1 video"
+    assert inputs.tensor_idx == 0, "tensor_idx should be initialized"
+
+    print("✓ test_mixed_media passed")
+
+
+def test_dataset_preprocessor_tensor_support():
+    """RowPreprocessor should list `tensors` as a standard key and map both column names to it."""
+    from swift.llm.dataset.preprocessor.core import RowPreprocessor
+
+    preprocessor = RowPreprocessor()
+
+    # Class-level check: 'tensors' is one of the standard keys.
+    assert 'tensors' in RowPreprocessor.standard_keys, "tensors should be in standard_keys"
+
+    # Instance-level check: singular and plural column names both resolve to 'tensors'.
+    columns = preprocessor.columns
+    assert 'tensor' in columns, "tensor column should be mapped"
+    assert columns['tensor'] == 'tensors', "tensor should map to tensors"
+    assert 'tensors' in columns, "tensors column should be mapped"
+    assert columns['tensors'] == 'tensors', "tensors should map to tensors"
+    print("✓ test_dataset_preprocessor_tensor_support passed")
+
+
+def test_special_tokens():
+    """The Template class should expose the <tensor> token, the tensors key and a placeholder."""
+    from swift.llm.template.base import Template
+
+    # All checks read class attributes; no Template instance is created.
+    assert '<tensor>' in Template.special_tokens, "<tensor> should be in special_tokens"
+
+    assert 'tensors' in Template.special_keys, "tensors should be in special_keys"
+
+    # The placeholder attribute must exist and equal ['<tensor>'].
+    assert hasattr(Template, 'tensor_placeholder'), "Template should have tensor_placeholder"
+    placeholder = Template.tensor_placeholder
+    assert placeholder == ['<tensor>'], "tensor_placeholder should be ['<tensor>']"
+
+    print("✓ test_special_tokens passed")
+
+
+if __name__ == '__main__':
+    print("=== Running Tensor Support Tests ===\n")
+
+    all_tests = (
+        # Basic loading tests
+        test_load_tensor,
+        test_load_batch_tensor,
+        # Tensor to image conversion tests
+        test_tensor_to_images_single,
+        test_tensor_to_images_batch,
+        test_tensor_to_images_grayscale,
+        test_tensor_to_images_2d,
+        test_tensor_normalization,
+        # Template inputs tests
+        test_template_inputs_with_tensors,
+        test_infer_request_with_tensors,
+        test_mixed_media,
+        # Integration tests
+        test_dataset_preprocessor_tensor_support,
+        test_special_tokens,
+    )
+
+    try:
+        for run_test in all_tests:
+            run_test()
+        print("\n🎉 All tensor support tests passed!")
+    except Exception as e:
+        print(f"\n❌ Test failed with error: {e}")
+        import traceback
+        traceback.print_exc()
+        raise