468 changes: 467 additions & 1 deletion fast_llm/engine/checkpoint/external.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion fast_llm/layers/attention/config.py
@@ -62,7 +62,7 @@ class AttentionConfig(MixerConfig):
     )
     dense_layer: AffineLinearConfig = Field(
         desc="Initialization configuration for the dense layer.",
-        hint=FieldHint.feature,
+        hint=FieldHint.architecture,
     )
     # TODO: Review names
     rotary: RotaryConfig = Field(
@@ -115,6 +115,7 @@ class AttentionConfig(MixerConfig):
         " Under Standard Parameterization (SP): default to 0.5. "
         " Under muP (if scaling head_size size): use 1. "
         " Under muP (if scaling number of heads instead of head_size): use 0.5.",
+        hint=FieldHint.architecture,
         valid=skip_valid_if_none(check_field(Assert.geq, 0)),
     )
     implementation: AttentionImplementation = Field(
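
For context, the softmax-scale power documented in the second hunk controls how attention scores are normalized. A minimal sketch of the scaling the field description implies, assuming scores are multiplied by head_size ** (-power) (the names `power` and `head_size` are illustrative, not Fast-LLM's exact internals):

# Sketch: effect of the softmax-scale power under SP vs. muP (assumption:
# attention scores are multiplied by head_size ** -power, per the field desc).
head_size = 64
for power, label in [(0.5, "SP (default)"), (1.0, "muP, scaling head_size")]:
    softmax_scale = head_size ** -power
    print(f"{label}: scale = {softmax_scale:.6f}")
# power = 0.5 recovers the standard 1 / sqrt(head_size) = 0.125 scaling.
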
18 changes: 9 additions & 9 deletions fast_llm/layers/attention/rotary/config.py
@@ -78,10 +78,10 @@ class Llama3RotaryConfig(DefaultRotaryConfig):
     """

     # TODO: Add descriptions.
-    scale_factor: float = Field(default=8.0, hint=FieldHint.feature)
-    low_frequency_factor: float = Field(default=1.0, hint=FieldHint.feature)
-    high_frequency_factor: float = Field(default=4.0, hint=FieldHint.feature)
-    original_context_length: int = Field(default=8192, hint=FieldHint.feature)
+    scale_factor: float = Field(default=8.0, hint=FieldHint.architecture)
+    low_frequency_factor: float = Field(default=1.0, hint=FieldHint.architecture)
+    high_frequency_factor: float = Field(default=4.0, hint=FieldHint.architecture)
+    original_context_length: int = Field(default=8192, hint=FieldHint.architecture)

     def _validate(self) -> None:
         super()._validate()
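
These four fields parameterize Llama-3-style RoPE frequency scaling. A minimal sketch of that scheme as published for Llama 3 (assuming Fast-LLM follows the reference formulation; `inv_freq` is the usual per-dimension rotary frequency vector):

import math

def llama3_scaled_inv_freq(
    inv_freq: list[float],
    scale_factor: float = 8.0,
    low_frequency_factor: float = 1.0,
    high_frequency_factor: float = 4.0,
    original_context_length: int = 8192,
) -> list[float]:
    # Low-frequency components (long wavelengths) are divided by scale_factor,
    # high-frequency ones are kept, and the band in between is interpolated.
    low_wavelen = original_context_length / low_frequency_factor
    high_wavelen = original_context_length / high_frequency_factor
    out = []
    for f in inv_freq:
        wavelen = 2 * math.pi / f
        if wavelen > low_wavelen:
            out.append(f / scale_factor)
        elif wavelen < high_wavelen:
            out.append(f)
        else:
            smooth = (original_context_length / wavelen - low_frequency_factor) / (
                high_frequency_factor - low_frequency_factor
            )
            out.append((1 - smooth) * f / scale_factor + smooth * f)
    return out
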
@@ -102,20 +102,20 @@ class YarnRotaryConfig(DefaultRotaryConfig):
     """

     # TODO: Add descriptions.
-    scale_factor: float = Field(default=8.0, hint=FieldHint.feature)
+    scale_factor: float = Field(default=8.0, hint=FieldHint.architecture)
     attention_factor: None | float = Field(
         default=None,
-        hint=FieldHint.feature,
+        hint=FieldHint.architecture,
     )
     beta_fast: float = Field(
         default=32.0,
-        hint=FieldHint.feature,
+        hint=FieldHint.architecture,
     )
     beta_slow: float = Field(
         default=1.0,
-        hint=FieldHint.feature,
+        hint=FieldHint.architecture,
     )
-    original_context_length: int = Field(default=8192, hint=FieldHint.feature)
+    original_context_length: int = Field(default=8192, hint=FieldHint.architecture)

     def _validate(self) -> None:
         if self.attention_factor is None:
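
The `_validate` hunk fills in `attention_factor` when it is None. A minimal sketch of the conventional YaRN default, 0.1 * ln(s) + 1.0 from the YaRN paper (whether Fast-LLM's `_validate` uses exactly this expression is an assumption); `beta_fast` and `beta_slow` set the ramp between the fully interpolated and uninterpolated frequency bands:

import math

def default_attention_factor(scale_factor: float) -> float:
    # YaRN's recommended attention-temperature scaling when none is given.
    return 0.1 * math.log(scale_factor) + 1.0

print(default_attention_factor(8.0))  # ~1.2079
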
5 changes: 4 additions & 1 deletion fast_llm/layers/block/config.py
@@ -146,7 +146,10 @@ def last_block_config(self) -> BlockConfig:
 @config_class(dynamic_type={BlockSequenceConfig: "pattern"})
 class PatternBlockSequenceConfig(BlockSequenceConfig):
     _abstract = False
-    blocks: dict[str, BlockConfig] = Field()
+    blocks: dict[str, BlockConfig] = Field(
+        desc="Named block configurations referenced by `pattern`.",
+        hint=FieldHint.architecture,
+    )
     pattern: list[str] = Field(
         default=None,
         desc="The name of each block (key in `blocks`) in the repeated pattern.",
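
The new `desc` makes the contract explicit: every entry of `pattern` must be a key of `blocks`. A minimal sketch of how such a pattern could resolve a per-layer config, assuming the pattern is cycled over the layer index (block names and dict payloads here are hypothetical, not Fast-LLM's types):

# Hypothetical two-mixer pattern; keys in `blocks` are referenced by `pattern`.
blocks = {"attn": {"mixer": "attention"}, "mamba": {"mixer": "mamba"}}
pattern = ["attn", "attn", "mamba"]

def block_config_for_layer(index: int) -> dict:
    # Layer i uses the block named at pattern[i % len(pattern)].
    return blocks[pattern[index % len(pattern)]]

assert block_config_for_layer(5) == {"mixer": "mamba"}
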
2 changes: 1 addition & 1 deletion fast_llm/layers/decoder/config.py
@@ -156,7 +156,7 @@ class StochasticMixerConfig(MixerConfig):
         "Used for inference/eval, checkpoint loading (receives pretrained weights), "
         "and checkpoint saving (only this mixer is exported). "
         "If None, uses the first mixer in the dict.",
-        hint=FieldHint.feature,
+        hint=FieldHint.architecture,
     )

     seed_shift: int = Field(
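
As the description says, the named mixer is the deterministic one used outside training. A minimal sketch of that contract, assuming per-step stochastic sampling during training (the sampling scheme, `seed_shift` usage, and names are assumptions, not Fast-LLM's implementation):

import random

mixers = {"attention": "attn-module", "mamba": "mamba-module"}
main_mixer_name = None  # None -> fall back to the first mixer in the dict

def pick_mixer(step: int, seed_shift: int = 0, training: bool = True) -> str:
    if not training:  # inference/eval always uses the main mixer
        return mixers[main_mixer_name or next(iter(mixers))]
    rng = random.Random(step + seed_shift)  # deterministic per training step
    return mixers[rng.choice(list(mixers))]
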
4 changes: 2 additions & 2 deletions fast_llm/layers/decoder/mlp/config.py
@@ -62,7 +62,7 @@ class MLPConfig(MLPBaseConfig):
     activation: ActivationType = Field(
         default=None,
         desc="The MLP intermediate activation type. Default: SiLU for gated MLP, GeLU otherwise.",
-        hint=FieldHint.core,
+        hint=FieldHint.architecture,
     )
     # normalization_implementation: NormalizationImplementation = NormalizationImplementation.auto
     recompute_level: MLPRecomputeLevel = Field(
@@ -95,7 +95,7 @@ class MoEMLPConfig(MLPConfig):
     router: LinearConfig = Field(
         # TODO: Improve default?
         desc="Configuration for the MoE router.",
-        hint=FieldHint.feature,
+        hint=FieldHint.architecture,
     )
     experts: int = Field(
         default=2,
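
The activation default in the first hunk is conditional rather than fixed. A minimal sketch of the rule the description states (SiLU for gated MLPs such as SwiGLU, GeLU otherwise); the function name is illustrative:

def default_mlp_activation(gated: bool) -> str:
    # Per the field description: gated MLPs default to SiLU, ungated to GeLU.
    return "silu" if gated else "gelu"

assert default_mlp_activation(True) == "silu"
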
4 changes: 2 additions & 2 deletions fast_llm/layers/vision/config.py
@@ -34,12 +34,12 @@ class PatchEmbeddingsConfig(BlockConfig):
     patch_height: int = Field(
         default=16,
         desc="Height of image patches, in pixels.",
-        hint=FieldHint.core,
+        hint=FieldHint.architecture,
     )
     patch_width: int = Field(
         default=16,
         desc="Width of image patches, in pixels.",
-        hint=FieldHint.core,
+        hint=FieldHint.architecture,
     )
     full_precision_residual: bool = Field(
         default=False,
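
For scale, the two patch-size fields determine how many tokens a vision input produces. A minimal worked example with the defaults, assuming non-overlapping tiling (the input resolution is an arbitrary illustration):

patch_height, patch_width = 16, 16     # defaults above
image_height, image_width = 224, 224   # hypothetical input resolution

# One embedding token per non-overlapping patch.
num_patches = (image_height // patch_height) * (image_width // patch_width)
print(num_patches)  # 196 patch tokens for a 224x224 image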