4 changes: 4 additions & 0 deletions src/lighteval/models/vllm/vllm_model.py
@@ -97,6 +97,8 @@ class VLLMModelConfig(ModelConfig):
Number of GPUs to use for pipeline parallelism. Defaults to 1.
gpu_memory_utilization (NonNegativeFloat):
Fraction of GPU memory to use. Lower this if running out of memory. Defaults to 0.9.
enable_prefix_caching (bool | None):
Whether to enable prefix caching to speed up generation. May use more memory. Should be disabled for LFM2. Defaults to None, which keeps vLLM's own default behavior.
max_model_length (PositiveInt | None):
Maximum sequence length for the model. If None, automatically inferred.
Reduce this if encountering OOM issues (4096 is usually sufficient).
@@ -156,6 +158,7 @@ class VLLMModelConfig(ModelConfig):
data_parallel_size: PositiveInt = 1 # how many GPUs to use for data parallelism
pipeline_parallel_size: PositiveInt = 1 # how many GPUs to use for pipeline parallelism
gpu_memory_utilization: NonNegativeFloat = 0.9 # lower this if you are running out of memory
enable_prefix_caching: bool | None = None # whether to enable prefix caching to speed up generation; may use more memory. Should be disabled for LFM2
max_model_length: PositiveInt | None = (
None # maximum length of the model, usually inferred automatically. Reduce this if you encounter OOM issues; 4096 is usually enough
)
@@ -245,6 +248,7 @@ def _create_auto_model(self, config: VLLMModelConfig) -> Optional[LLM]:
self.model_args = {
"model": config.model_name,
"gpu_memory_utilization": config.gpu_memory_utilization,
"enable_prefix_caching": config.enable_prefix_caching,
"revision": config.revision + (f"/{config.subfolder}" if config.subfolder is not None else ""),
"dtype": config.dtype,
"trust_remote_code": config.trust_remote_code,
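For context, a minimal sketch of how the new flag might be used from Python. The keyword-style construction and the model id below are illustrative assumptions, not part of this diff:

from lighteval.models.vllm.vllm_model import VLLMModelConfig

# Sketch only: disable prefix caching for an LFM2 checkpoint, as the new
# docstring recommends. The model id is a hypothetical example.
config = VLLMModelConfig(
    model_name="LiquidAI/LFM2-1.2B",  # illustrative model id
    gpu_memory_utilization=0.9,
    enable_prefix_caching=False,      # new field introduced in this diff
)

# Leaving enable_prefix_caching unset (None) forwards None to vLLM through
# model_args, so vLLM applies its own default for prefix caching.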