diff --git a/README.md b/README.md index 829f1134..ee3708ec 100644 --- a/README.md +++ b/README.md @@ -129,7 +129,7 @@ supported on Twinkle✨ framework. > For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it > is currently provided via the Tinker-compatible APIs. We will be rolling out services that support > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed -> by one training base at a time, and currently it is [Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507). +> by one training base at a time, and currently it is [Qwen3.5-4B](https://modelscope.cn/models/Qwen/Qwen3.5-4B). | Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -234,7 +234,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'ms://Qwen/Qwen3.5-4B' base_url='your-base-url' api_key='your-api-key' diff --git a/README_ZH.md b/README_ZH.md index 1fc8267a..bdab0e41 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -112,7 +112,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 >[!Note] -> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3-30B-A3B-Instruct-2507](https://modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507)。 +> 通过 
`base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.5-4B](https://modelscope.cn/models/Qwen/Qwen3.5-4B)。 | Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -216,7 +216,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'ms://Qwen/Qwen3.5-4B' base_url='your-base-url' api_key='your-api-key' diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml index becda8b0..6b5cce0e 100644 --- a/cookbook/client/server/megatron/server_config.yaml +++ b/cookbook/client/server/megatron/server_config.yaml @@ -36,11 +36,11 @@ applications: # 3. Sampler Service - Runs inference / sampling using vLLM engine # Used for generating text from the model (e.g., evaluating LoRA results). - - name: sampler-Qwen3-30B-A3B-Instruct-2507 - route_prefix: /api/v1/sampler/Qwen/Qwen3-30B-A3B-Instruct-2507 + - name: sampler-Qwen3.5-4B + route_prefix: /api/v1/sampler/Qwen/Qwen3.5-4B import_path: sampler args: - model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier nproc_per_node: 4 # Number of GPU processes per node sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) engine_args: # vLLM engine-specific settings @@ -73,12 +73,12 @@ applications: # 2. 
Model Service (commented out) - Would host the base model for training. # Uncomment and configure if you need a training model worker. - - name: models-Qwen3-30B-A3B-Instruct-2507 - route_prefix: /api/v1/model/Qwen/Qwen3-30B-A3B-Instruct-2507 + - name: models-Qwen3.5-4B + route_prefix: /api/v1/model/Qwen/Qwen3.5-4B import_path: model args: use_megatron: true # Use HuggingFace Transformers backend - model_id: "ms://Qwen/Qwen3-30B-A3B-Instruct-2507" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier max_length: 16000 # model max length max_loras: 5 # model max loras nproc_per_node: 4 # Number of GPU processes per node diff --git a/cookbook/client/server/megatron/server_config_4b.yaml b/cookbook/client/server/megatron/server_config_4b.yaml index 0ea99551..e191b981 100644 --- a/cookbook/client/server/megatron/server_config_4b.yaml +++ b/cookbook/client/server/megatron/server_config_4b.yaml @@ -39,6 +39,7 @@ applications: import_path: model args: use_megatron: true + model_cls: Qwen3_5ForConditionalGeneration model_id: "ms://Qwen/Qwen3.5-4B" # ModelScope model identifier max_length: 10240 nproc_per_node: 2 # Number of GPU processes per node diff --git a/cookbook/client/tinker/modelscope/sample.py b/cookbook/client/tinker/modelscope/sample.py index 34a0064c..137d1b2b 100644 --- a/cookbook/client/tinker/modelscope/sample.py +++ b/cookbook/client/tinker/modelscope/sample.py @@ -16,7 +16,7 @@ from tinker import ServiceClient -base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'Qwen/Qwen3.5-4B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -29,7 +29,7 @@ # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3-30B-A3B-Instruct-2507-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index cb3b1700..1d653fb1 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -23,7 +23,7 @@ from tinker import ServiceClient # The base model to fine-tune / evaluate -base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'Qwen/Qwen3.5-4B' base_url = 'http://www.modelscope.cn/twinkle' diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py index 225f8219..424d460d 100644 --- a/cookbook/client/tinker/modelscope/short_math_grpo.py +++ b/cookbook/client/tinker/modelscope/short_math_grpo.py @@ -38,7 +38,7 @@ logger = get_logger() # ========== Configuration ========== -BASE_MODEL = 'Qwen/Qwen3-30B-A3B-Instruct-2507' +BASE_MODEL = 'Qwen/Qwen3.5-4B' NUM_GENERATIONS = 8 MAX_NEW_TOKENS = 4096 LEARNING_RATE = 1e-4 diff --git a/cookbook/client/tinker/self_host/sample.py b/cookbook/client/tinker/self_host/sample.py index 132eb63a..8b0be8ce 100644 --- a/cookbook/client/tinker/self_host/sample.py +++ b/cookbook/client/tinker/self_host/sample.py @@ -27,7 +27,7 @@ # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3-30B-A3B-Instruct-2507-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git a/cookbook/client/twinkle/modelscope/multi_modal.py b/cookbook/client/twinkle/modelscope/multi_modal.py new file mode 100644 index 00000000..f3b8cd24 --- /dev/null +++ b/cookbook/client/twinkle/modelscope/multi_modal.py @@ -0,0 +1,168 @@ +# Twinkle Client - Multi-Modal Transformers LoRA Training Example +# +# This script demonstrates how to fine-tune a multi-modal (vision-language) model using LoRA +# (Low-Rank Adaptation) through the Twinkle client-server architecture. +# The server must be running first (see server.py and server_config.yaml). + +# Step 1: Load environment variables from a .env file (e.g., API tokens) +import dotenv +import os +from twinkle.data_format import Trajectory, Message +from twinkle.preprocessor import Preprocessor + +dotenv.load_dotenv('.env') +import numpy as np +import torch +from peft import LoraConfig + +from twinkle import get_logger +from twinkle.dataset import DatasetMeta +from twinkle_client import init_twinkle_client +from twinkle.dataloader import DataLoader +from twinkle.dataset import LazyDataset +from twinkle_client.model import MultiLoraTransformersModel + +logger = get_logger() + +base_model = 'Qwen/Qwen3.5-4B' +base_url = 'http://www.modelscope.cn/twinkle' + +# Step 2: Initialize the Twinkle client to communicate with the remote server. +# - base_url: the address of the running Twinkle server +# - api_key: authentication token (loaded from environment variable) +client = init_twinkle_client(base_url=base_url, api_key=os.environ.get('MODELSCOPE_TOKEN')) + +# Step 3: Query the server for existing training runs and their checkpoints. +# This is useful for resuming a previous training session. 
+runs = client.list_training_runs() + +resume_path = None +for run in runs: + logger.info(run.model_dump_json(indent=2)) + # List all saved checkpoints for this training run + checkpoints = client.list_checkpoints(run.training_run_id) + + for checkpoint in checkpoints: + logger.info(checkpoint.model_dump_json(indent=2)) + # Uncomment the line below to resume from a specific checkpoint: + # resume_path = checkpoint.twinkle_path + + +class LatexOCRProcessor(Preprocessor): + + def __call__(self, rows): + rows = self.map_col_to_row(rows) + rows = [self.preprocess(row) for row in rows] + rows = self.map_row_to_col(rows) + return rows + + def preprocess(self, row) -> Trajectory: + return Trajectory( + messages=[ + Message(role='user', content='Using LaTeX to perform OCR on the image.', images=[row['image']]), + Message(role='assistant', content=row['text']), + ] + ) + + +def train(): + # Step 4: Prepare the dataset + + # Load the latex dataset from ModelScope + dataset = LazyDataset(dataset_meta=DatasetMeta('ms://AI-ModelScope/LaTeX_OCR', data_slice=range(500))) + + # Apply a chat template so the data matches the model's expected input format + dataset.set_template('Qwen3_5Template', model_id=f'ms://{base_model}', max_length=512) + + # Convert each row into a user/assistant Trajectory (image + OCR prompt -> LaTeX text) + dataset.map(LatexOCRProcessor) + + # Tokenize and encode the dataset into model-ready input features + dataset.encode(batched=True) + + # Wrap the dataset into a DataLoader that yields batches of size 4 + dataloader = DataLoader(dataset=dataset, batch_size=4) + + # Step 5: Configure the model + + # Create a multi-LoRA Transformers model pointing to the base model on ModelScope + model = MultiLoraTransformersModel(model_id=f'ms://{base_model}') + + # Define LoRA configuration: apply low-rank adapters to all linear layers + lora_config = LoraConfig(target_modules='all-linear') + + # Attach the LoRA adapter named 'default' to the model. 
+ # gradient_accumulation_steps=2 means gradients are accumulated over 2 micro-batches + # before an optimizer step, effectively doubling the batch size. + model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2) + + # Set the same chat template used during data preprocessing + model.set_template('Qwen3_5Template') + + # Set the input processor (pads sequences on the right side) + model.set_processor('InputProcessor', padding_side='right') + + # Use cross-entropy loss for language modeling + model.set_loss('CrossEntropyLoss') + + # Use Adam optimizer with a learning rate of 1e-4 (only the Adam optimizer is supported if the server uses Megatron) + model.set_optimizer('Adam', lr=1e-4) + + # Use a linear learning rate scheduler (LR schedulers are not supported if the server uses Megatron) + # model.set_lr_scheduler('LinearLR') + + # Step 6: Optionally resume from a previous checkpoint + if resume_path: + logger.info(f'Resuming training from {resume_path}') + model.load(resume_path, load_optimizer=True) + + # Step 7: Run the training loop + logger.info(model.get_train_configs().model_dump()) + + for epoch in range(3): + logger.info(f'Starting epoch {epoch}') + for step, batch in enumerate(dataloader): + for sample in batch: + for key in sample: + if isinstance(sample[key], np.ndarray): + sample[key] = sample[key].tolist() + elif isinstance(sample[key], torch.Tensor): + sample[key] = sample[key].cpu().numpy().tolist() + # Forward pass + backward pass (computes gradients) + model.forward_backward(inputs=batch) + + # Step + model.clip_grad_and_step() + # Equivalent to the following steps: + # # Clip gradients to prevent exploding gradients (max norm = 1.0) + # model.clip_grad_norm(1.0) + # # Perform one optimizer step (update model weights) + # model.step() + # # Reset gradients to zero for the next iteration + # model.zero_grad() + # # Advance the learning rate scheduler by one step + # model.lr_step() + + # Log the loss every 2 steps (aligned with gradient accumulation) + 
if step % 2 == 0: + # Print metric + metric = model.calculate_metric(is_training=True) + logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric.result}') + + # Step 8: Save the trained checkpoint + twinkle_path = model.save(name=f'twinkle-epoch-{epoch}', save_optimizer=True) + logger.info(f'Saved checkpoint: {twinkle_path}') + + # Step 9: Upload the checkpoint to ModelScope Hub + # YOUR_USER_NAME = "your_username" + # hub_model_id = f'{YOUR_USER_NAME}/twinkle-multi-modal' + # model.upload_to_hub( + # checkpoint_dir=twinkle_path, + # hub_model_id=hub_model_id, + # async_upload=False + # ) + # logger.info(f"Uploaded checkpoint to hub: {hub_model_id}") + + +if __name__ == '__main__': + train() diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py b/cookbook/client/twinkle/modelscope/self_congnition.py index 25c8a0e4..ed44b4b1 100644 --- a/cookbook/client/twinkle/modelscope/self_congnition.py +++ b/cookbook/client/twinkle/modelscope/self_congnition.py @@ -21,7 +21,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'Qwen/Qwen3.5-4B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. 
diff --git a/cookbook/transformers/ep_fsdp_qwen3_moe.py b/cookbook/transformers/ep_fsdp_qwen3_moe.py index 673e64fd..3c02b218 100644 --- a/cookbook/transformers/ep_fsdp_qwen3_moe.py +++ b/cookbook/transformers/ep_fsdp_qwen3_moe.py @@ -11,7 +11,7 @@ logger = get_logger() -MODEL_ID = os.environ.get('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507') +MODEL_ID = os.environ.get('QWEN3_MODEL_ID', 'ms://Qwen/Qwen3.5-4B') DATASET_ID = os.environ.get('DATASET_ID', 'ms://swift/self-cognition') TEMPLATE_ID = os.environ.get('TEMPLATE_ID', 'Template') _num_layers_env = os.environ.get('NUM_LAYERS') diff --git a/cookbook/transformers/fsdp2_moe.py b/cookbook/transformers/fsdp2_moe.py index 3ea649d3..2a92794a 100644 --- a/cookbook/transformers/fsdp2_moe.py +++ b/cookbook/transformers/fsdp2_moe.py @@ -20,7 +20,7 @@ def eval(model): # 100 Samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100))) - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) dataset.encode() dataloader = DataLoader(dataset=dataset, batch_size=4) @@ -35,7 +35,7 @@ def train(): # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding - dataset.set_template('Template', model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507') + dataset.set_template('Template', model_id='ms://Qwen/Qwen3.5-4B') # Preprocess the dataset to standard format dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区')) # Encode dataset @@ -43,7 +43,7 @@ def train(): # Global batch size = 4, for GPUs, so 1 sample per GPU dataloader = DataLoader(dataset=dataset, batch_size=8) # Use a TransformersModel, transformer_cls_names_to_wrap=Qwen3MoeSparseMoeBlock to avoid hang of fsdp2 - model = 
TransformersModel(model_id='ms://Qwen/Qwen3-30B-A3B-Instruct-2507', fsdp_config={'transformer_cls_names_to_wrap':['Qwen3MoeSparseMoeBlock']}) + model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B', fsdp_config={'transformer_cls_names_to_wrap':['Qwen3MoeSparseMoeBlock']}) # Patch MoE model to fix the hang bug, support transformers==4.* model.apply_patch('ms://twinkle-kit/qwen3_moe_transformers4_patch') lora_config = LoraConfig( diff --git a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md index 978b5af1..d1eba8cc 100644 --- a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md +++ b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md @@ -530,7 +530,7 @@ Alongside the open-source release of Twinkle, ModelScope provides a hosted model ```python base_url = 'https://www.modelscope.cn/twinkle' -base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' # Model currently deployed in the official environment +base_model = 'Qwen/Qwen3.5-4B' # Model currently deployed in the official environment ``` --- diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index 1fdcb4a4..f5571773 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -2,7 +2,7 @@ Alongside the open-source release of the Twinkle framework, we also provide a hosted model training service (Training as a Service) powered by ModelScope's backend infrastructure. Developers can use this service to experience Twinkle's training API for free. -The model currently running on the cluster is [Qwen/Qwen3-30B-A3B-Instruct-2507](https://www.modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507). Below are the detailed usage instructions: +The model currently running on the cluster is [Qwen/Qwen3.5-4B](https://www.modelscope.cn/models/Qwen/Qwen3.5-4B). Below are the detailed usage instructions: ## Step 1. 
Register a ModelScope Account and Apply to Join the twinkle-explorers Organization @@ -30,7 +30,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'ms://Qwen/Qwen3.5-4B' base_url='http://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -64,7 +64,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3-30B-A3B-Instruct-2507`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: +With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.5-4B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: ```python import os @@ -79,7 +79,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'Qwen/Qwen3.5-4B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -92,7 +92,7 @@ service_client = ServiceClient( # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3-30B-A3B-Instruct-2507-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" index c8e92c3b..ad78e28d 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -530,7 +530,7 @@ Twinkle 框架开源的同时,魔搭社区依托自身算力基础设施,提 ```python base_url = 'https://www.modelscope.cn/twinkle' -base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' # 官方环境当前部署的模型 +base_model = 'Qwen/Qwen3.5-4B' # 官方环境当前部署的模型 ``` --- diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index b07c728b..cfff81e3 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -3,7 +3,7 @@ 在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 -目前在集群中运行的模型是[Qwen/Qwen3-30B-A3B-Instruct-2507](https://www.modelscope.cn/models/Qwen/Qwen3-30B-A3B-Instruct-2507)。下面介绍具体的使用方法: +目前在集群中运行的模型是[Qwen/Qwen3.5-4B](https://www.modelscope.cn/models/Qwen/Qwen3.5-4B)。下面介绍具体的使用方法: ## Step 1. 
注册ModelScope用户并申请加入 twinkle-explorers 组织 @@ -33,7 +33,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'ms://Qwen/Qwen3.5-4B' base_url='http://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -67,7 +67,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -通过上述代码,你可以训练一个原模型为`Qwen/Qwen3-30B-A3B-Instruct-2507`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: +通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.5-4B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: ```python import os @@ -82,7 +82,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3-30B-A3B-Instruct-2507' +base_model = 'Qwen/Qwen3.5-4B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -95,7 +95,7 @@ service_client = ServiceClient( # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3-30B-A3B-Instruct-2507-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.5-4B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git a/src/twinkle/dataloader/device_mesh_fetcher.py b/src/twinkle/dataloader/device_mesh_fetcher.py index bd89285f..9560fa0f 100644 --- a/src/twinkle/dataloader/device_mesh_fetcher.py +++ b/src/twinkle/dataloader/device_mesh_fetcher.py @@ -61,6 +61,8 @@ def fetch(self, _): except StopIteration as e: raise e except Exception: # noqa + import traceback + traceback.print_exc() continue else: break diff --git a/src/twinkle/dataloader/retry_sampler.py b/src/twinkle/dataloader/retry_sampler.py index 3e731bae..62f05660 100644 --- a/src/twinkle/dataloader/retry_sampler.py +++ b/src/twinkle/dataloader/retry_sampler.py @@ -33,6 +33,8 @@ def __iter__(self): total += 1 break except Exception: # noqa + import traceback + traceback.print_exc() continue else: raise StopIteration(f'Max retries exceeded: {self.max_retries}, no valid data found.') @@ -53,6 +55,8 @@ def __iter__(self): yield idx total += 1 except Exception: # noqa + import traceback + traceback.print_exc() continue else: raise ValueError(f'Max retries exceeded: {self.max_retries}, no valid data found.') diff --git a/src/twinkle/model/transformers/multi_lora_transformers.py b/src/twinkle/model/transformers/multi_lora_transformers.py index 6033d943..0900f52b 100644 --- a/src/twinkle/model/transformers/multi_lora_transformers.py +++ b/src/twinkle/model/transformers/multi_lora_transformers.py @@ -1,5 +1,6 @@ # Copyright (c) ModelScope Contributors. All rights reserved. 
import os +import transformers from peft import LoraConfig, PeftConfig, PeftModel, load_peft_weights from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler @@ -36,6 +37,8 @@ def __init__( os.environ['TOKENIZERS_PARALLELISM'] = 'true' super(PreTrainedModel, self).__init__() model_id = HubOperation.download_model(model_id) + if isinstance(model_cls, str): + model_cls = getattr(transformers, model_cls) self.model = model_cls.from_pretrained(model_id, config=config, **kwargs) self.model_id = model_id self.tokenizer_id = kwargs.get('tokenizer_id', self.model_id) diff --git a/src/twinkle/processor/base.py b/src/twinkle/processor/base.py index 03493689..576db8cd 100644 --- a/src/twinkle/processor/base.py +++ b/src/twinkle/processor/base.py @@ -35,6 +35,7 @@ class InputProcessor: 'video_grid_thw': 0, 'input_features': 0.0, 'feature_attention_mask': 0, + 'mm_token_type_ids': 0, } # VLM fields to concatenate (not pad) in batch @@ -370,7 +371,6 @@ def _collate_macro_batch(self, inputs: List[InputFeature]) -> InputFeature: else: result[key] = values result = InputFeature(**result) - for field, values in vlm_fields.items(): if values: if values[0].dim() == 1: diff --git a/src/twinkle/server/gateway/server.py b/src/twinkle/server/gateway/server.py index dd591ccf..bf55efda 100644 --- a/src/twinkle/server/gateway/server.py +++ b/src/twinkle/server/gateway/server.py @@ -36,7 +36,7 @@ def __init__(self, self.http_options = http_options or {} self.proxy = ServiceProxy(http_options=http_options, route_prefix=self.route_prefix) self.supported_models = self._normalize_models(supported_models) or [ - tinker_types.SupportedModel(model_name='Qwen/Qwen3-30B-A3B-Instruct-2507'), + tinker_types.SupportedModel(model_name='Qwen/Qwen3.5-4B'), ] self._modelscope_config_lock = asyncio.Lock() diff --git a/src/twinkle/server/model/twinkle_handlers.py b/src/twinkle/server/model/twinkle_handlers.py index 35c87441..416a4a0d 100644 --- 
a/src/twinkle/server/model/twinkle_handlers.py +++ b/src/twinkle/server/model/twinkle_handlers.py @@ -8,10 +8,11 @@ """ from __future__ import annotations +import torch import traceback from fastapi import Depends, FastAPI, HTTPException, Request from peft import LoraConfig -from typing import TYPE_CHECKING, Any, Callable, Optional +from typing import TYPE_CHECKING, Any, Callable if TYPE_CHECKING: from .app import ModelManagement @@ -137,11 +138,22 @@ async def forward_backward( ) -> types.ForwardBackwardResponse: adapter_name = _get_twinkle_adapter_name(request, body.adapter_name) + def first_element(data): + while isinstance(data, list): + if len(data) == 0: + return None + data = data[0] + return data + async def _task(): self.assert_adapter_exists(adapter_name=adapter_name) extra_kwargs = body.model_extra or {} - inputs = _parse_inputs(body.inputs) - ret = self.model.forward_backward(inputs=inputs, adapter_name=adapter_name, **extra_kwargs) + all_inputs = _parse_inputs(body.inputs) + for inputs in all_inputs: + for key in inputs: + if isinstance(inputs[key], list) and isinstance(first_element(inputs[key]), (int, float)): + inputs[key] = torch.tensor(inputs[key]) + ret = self.model.forward_backward(inputs=all_inputs, adapter_name=adapter_name, **extra_kwargs) return {'result': ret} return await run_task(self.schedule_task_and_wait(_task, task_type='forward_backward')) diff --git a/tests/moe/test_ep_fsdp_vs_single.py b/tests/moe/test_ep_fsdp_vs_single.py index d365c2be..ce46c3da 100644 --- a/tests/moe/test_ep_fsdp_vs_single.py +++ b/tests/moe/test_ep_fsdp_vs_single.py @@ -7,7 +7,7 @@ Requirements: - 4 CUDA GPUs - - Model weights accessible via QWEN3_MOE_MODEL_ID (default: Qwen/Qwen3-30B-A3B-Instruct-2507) + - Model weights accessible via QWEN3_MOE_MODEL_ID (default: Qwen/Qwen3.5-4B) Launch (requires 4 CUDA GPUs; skipped automatically if fewer GPUs are available): @@ -456,7 +456,7 @@ def test_alignment(self): if torch.cuda.device_count() < 4: self.skipTest('Need 4 
GPUs') - model_id = os.environ.get('QWEN3_MOE_MODEL_ID', 'Qwen/Qwen3-30B-A3B-Instruct-2507') + model_id = os.environ.get('QWEN3_MOE_MODEL_ID', 'Qwen/Qwen3.5-4B') local_only = os.environ.get('QWEN3_MOE_LOCAL_ONLY', '1') != '0' try: diff --git a/tests/moe/test_expert_parallel_qwen3_fsdp_sp.py b/tests/moe/test_expert_parallel_qwen3_fsdp_sp.py index a2031d14..b035db3c 100644 --- a/tests/moe/test_expert_parallel_qwen3_fsdp_sp.py +++ b/tests/moe/test_expert_parallel_qwen3_fsdp_sp.py @@ -472,7 +472,7 @@ def test_qwen3_moe_pretrained_ep_fsdp_sp_alignment(self): world_size = 4 if torch.cuda.device_count() < world_size: self.skipTest('Requires at least 4 GPUs for EP+FSDP+SP alignment test.') - model_id = os.environ.get('QWEN3_MOE_MODEL_ID', 'Qwen/Qwen3-30B-A3B-Instruct-2507') + model_id = os.environ.get('QWEN3_MOE_MODEL_ID', 'Qwen/Qwen3.5-4B') local_files_only = os.environ.get('QWEN3_MOE_LOCAL_ONLY', '1') != '0' try: _load_qwen3_moe_config(model_id, local_files_only)