vllm.entrypoints.pooling.embed.protocol

EmbeddingRequest module-attribute

EmbeddingRequest = EmbeddingCompletionRequest | EmbeddingChatRequest

EmbeddingBytesResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py
class EmbeddingBytesResponse(OpenAIBaseModel):
    body: list[bytes]
    metadata: str
    media_type: str = "application/octet-stream"

body instance-attribute

body: list[bytes]

media_type class-attribute instance-attribute

media_type: str = 'application/octet-stream'

metadata instance-attribute

metadata: str
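
The raw-bytes response lets clients skip base64 overhead entirely. As a rough client-side illustration (not part of this module), each entry of body can be decoded with numpy, assuming the server packed one embedding per entry using the request's embed_dtype and endianness; the metadata format is not specified here, so it is left untouched:

import numpy as np

def decode_embedding_bytes(body: list[bytes], dtype: str = "float32"):
    # One numpy vector per buffer; `dtype` must match the request's
    # embed_dtype, and the byte order must match its endianness.
    return [np.frombuffer(buf, dtype=dtype) for buf in body]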

EmbeddingChatRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py
class EmbeddingChatRequest(OpenAIBaseModel):
    model: str | None = None
    messages: list[ChatCompletionMessageParam]

    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None
    user: str | None = None
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:chat-embedding-extra-params]
    add_generation_prompt: bool = Field(
        default=False,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )

    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior."
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:chat-embedding-extra-params]

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        if data.get("continue_final_message") and data.get("add_generation_prompt"):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
        )
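
For orientation, a chat-style embedding request is an ordinary JSON payload carrying messages plus any of the extra parameters above. A minimal sketch, assuming a vLLM server on localhost:8000 serving the embeddings route (the URL and model name are placeholders):

import requests

payload = {
    "model": "my-embedding-model",  # placeholder served model name
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
    "encoding_format": "float",
    "normalize": True,   # extra param defined above
    "priority": 0,       # only meaningful with priority scheduling
}
resp = requests.post("http://localhost:8000/v1/embeddings", json=payload)
print(resp.json()["data"][0]["embedding"][:8])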

add_generation_prompt class-attribute instance-attribute

add_generation_prompt: bool = Field(
    default=False,
    description="If true, the generation prompt will be added to the chat template. This is a parameter used by chat template in tokenizer config of the model.",
)

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=False,
    description="If true, special tokens (e.g. BOS) will be added to the prompt on top of what is added by the chat template. For most models, the chat template takes care of adding the special tokens so this should be set to false (as is the default).",
)

chat_template class-attribute instance-attribute

chat_template: str | None = Field(
    default=None,
    description="A Jinja template to use for this conversion. As of transformers v4.44, default chat template is no longer allowed, so you must provide a chat template if the tokenizer does not define one.",
)

chat_template_kwargs class-attribute instance-attribute

chat_template_kwargs: dict[str, Any] | None = Field(
    default=None,
    description="Additional keyword args to pass to the template renderer. Will be accessible by the chat template.",
)

dimensions class-attribute instance-attribute

dimensions: int | None = None

embed_dtype class-attribute instance-attribute

embed_dtype: EmbedDType = Field(
    default="float32",
    description="What dtype to use for encoding. Default to using float32 for base64 encoding to match the OpenAI python client behavior. This parameter will affect base64 and binary_response.",
)

encoding_format class-attribute instance-attribute

encoding_format: EncodingFormat = 'float'

endianness class-attribute instance-attribute

endianness: Endianness = Field(
    default="native",
    description="What endianness to use for encoding. Default to using native for base64 encoding to match the OpenAI python client behavior.This parameter will affect base64 and binary_response.",
)

messages instance-attribute

messages: list[ChatCompletionMessageParam]

mm_processor_kwargs class-attribute instance-attribute

mm_processor_kwargs: dict[str, Any] | None = Field(
    default=None,
    description="Additional kwargs to pass to the HF processor.",
)

model class-attribute instance-attribute

model: str | None = None

normalize class-attribute instance-attribute

normalize: bool | None = Field(
    default=None,
    description="Whether to normalize the embeddings outputs. Default is True.",
)

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=random_uuid,
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: (
    Annotated[int, Field(ge=-1)] | None
) = None

user class-attribute instance-attribute

user: str | None = None

check_generation_prompt classmethod

check_generation_prompt(data)
Source code in vllm/entrypoints/pooling/embed/protocol.py
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    if data.get("continue_final_message") and data.get("add_generation_prompt"):
        raise ValueError(
            "Cannot set both `continue_final_message` and "
            "`add_generation_prompt` to True."
        )
    return data
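
Because the validator runs in "before" mode, it inspects the raw request dict, so a conflicting continue_final_message key is caught even though it is not a declared field of this model. An illustrative check (assuming the base model tolerates the extra key up to this point):

from pydantic import ValidationError

try:
    EmbeddingChatRequest(
        messages=[{"role": "user", "content": "hi"}],
        add_generation_prompt=True,
        continue_final_message=True,  # extra key seen by the "before" validator
    )
except (ValueError, ValidationError) as exc:
    print(exc)  # Cannot set both `continue_final_message` and `add_generation_prompt` ...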

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/pooling/embed/protocol.py
def to_pooling_params(self):
    return PoolingParams(
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        dimensions=self.dimensions,
        normalize=self.normalize,
    )
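
Only the three pooling-relevant fields are forwarded; everything else in the request stays at the serving layer. A quick sketch of the mapping:

req = EmbeddingChatRequest(
    messages=[{"role": "user", "content": "hello"}],
    dimensions=256,
    truncate_prompt_tokens=512,
)
params = req.to_pooling_params()
assert params.dimensions == 256
assert params.truncate_prompt_tokens == 512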

EmbeddingCompletionRequest

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py
class EmbeddingCompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
    # https://platform.openai.com/docs/api-reference/embeddings
    model: str | None = None
    input: list[int] | list[list[int]] | str | list[str]
    encoding_format: EncodingFormat = "float"
    dimensions: int | None = None
    user: str | None = None
    truncate_prompt_tokens: Annotated[int, Field(ge=-1)] | None = None

    # --8<-- [start:embedding-extra-params]
    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    priority: int = Field(
        default=0,
        description=(
            "The priority of the request (lower means earlier handling; "
            "default: 0). Any priority other than 0 will raise an error "
            "if the served model does not use priority scheduling."
        ),
    )
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    normalize: bool | None = Field(
        default=None,
        description="Whether to normalize the embeddings outputs. Default is True.",
    )
    embed_dtype: EmbedDType = Field(
        default="float32",
        description=(
            "What dtype to use for encoding. Default to using float32 for base64 "
            "encoding to match the OpenAI python client behavior. "
            "This parameter will affect base64 and binary_response."
        ),
    )
    endianness: Endianness = Field(
        default="native",
        description=(
            "What endianness to use for encoding. Default to using native for "
            "base64 encoding to match the OpenAI python client behavior."
            "This parameter will affect base64 and binary_response."
        ),
    )
    # --8<-- [end:embedding-extra-params]

    def to_pooling_params(self):
        return PoolingParams(
            truncate_prompt_tokens=self.truncate_prompt_tokens,
            dimensions=self.dimensions,
            normalize=self.normalize,
        )
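
The completion-style request mirrors the OpenAI embeddings API: input accepts a single string, a batch of strings, a single token-id list, or a batch of token-id lists. A minimal sketch against an assumed local server (URL and model name are placeholders):

import requests

payload = {
    "model": "my-embedding-model",            # placeholder
    "input": ["first text", "second text"],   # or [101, 7592] / [[...], [...]]
    "encoding_format": "base64",
    "embed_dtype": "float32",                 # affects base64/binary encoding
    "truncate_prompt_tokens": 512,
}
resp = requests.post("http://localhost:8000/v1/embeddings", json=payload)
print(resp.json()["usage"])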

add_special_tokens class-attribute instance-attribute

add_special_tokens: bool = Field(
    default=True,
    description="If true (the default), special tokens (e.g. BOS) will be added to the prompt.",
)

dimensions class-attribute instance-attribute

dimensions: int | None = None

embed_dtype class-attribute instance-attribute

embed_dtype: EmbedDType = Field(
    default="float32",
    description="What dtype to use for encoding. Default to using float32 for base64 encoding to match the OpenAI python client behavior. This parameter will affect base64 and binary_response.",
)

encoding_format class-attribute instance-attribute

encoding_format: EncodingFormat = 'float'

endianness class-attribute instance-attribute

endianness: Endianness = Field(
    default="native",
    description="What endianness to use for encoding. Default to using native for base64 encoding to match the OpenAI python client behavior.This parameter will affect base64 and binary_response.",
)

input instance-attribute

input: list[int] | list[list[int]] | str | list[str]

model class-attribute instance-attribute

model: str | None = None

normalize class-attribute instance-attribute

normalize: bool | None = Field(
    default=None,
    description="Whether to normalize the embeddings outputs. Default is True.",
)

priority class-attribute instance-attribute

priority: int = Field(
    default=0,
    description="The priority of the request (lower means earlier handling; default: 0). Any priority other than 0 will raise an error if the served model does not use priority scheduling.",
)

request_id class-attribute instance-attribute

request_id: str = Field(
    default_factory=random_uuid,
    description="The request_id related to this request. If the caller does not set it, a random_uuid will be generated. This id is used through out the inference process and return in response.",
)

truncate_prompt_tokens class-attribute instance-attribute

truncate_prompt_tokens: (
    Annotated[int, Field(ge=-1)] | None
) = None

user class-attribute instance-attribute

user: str | None = None

to_pooling_params

to_pooling_params()
Source code in vllm/entrypoints/pooling/embed/protocol.py
def to_pooling_params(self):
    return PoolingParams(
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        dimensions=self.dimensions,
        normalize=self.normalize,
    )

EmbeddingResponse

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py
class EmbeddingResponse(OpenAIBaseModel):
    id: str = Field(default_factory=lambda: f"embd-{random_uuid()}")
    object: str = "list"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    data: list[EmbeddingResponseData]
    usage: UsageInfo

created class-attribute instance-attribute

created: int = Field(default_factory=lambda: int(time.time()))

data instance-attribute

data: list[EmbeddingResponseData]

id class-attribute instance-attribute

id: str = Field(
    default_factory=lambda: f"embd-{random_uuid()}"
)

model instance-attribute

model: str

object class-attribute instance-attribute

object: str = 'list'

usage instance-attribute

usage: UsageInfo
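
Putting the pieces together, a serialized response validates back into this model. The values below are made up for illustration:

example = {
    "id": "embd-1234",
    "object": "list",
    "created": 1700000000,
    "model": "my-embedding-model",
    "data": [{"index": 0, "object": "embedding", "embedding": [0.01, -0.02, 0.03]}],
    "usage": {"prompt_tokens": 5, "total_tokens": 5},
}
resp = EmbeddingResponse.model_validate(example)
print(resp.data[0].embedding)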

EmbeddingResponseData

Bases: OpenAIBaseModel

Source code in vllm/entrypoints/pooling/embed/protocol.py
class EmbeddingResponseData(OpenAIBaseModel):
    index: int
    object: str = "embedding"
    embedding: list[float] | str

embedding instance-attribute

embedding: list[float] | str

index instance-attribute

index: int

object class-attribute instance-attribute

object: str = 'embedding'
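
When the request sets encoding_format="base64", embedding arrives as a base64 string of raw values rather than a list of floats. A minimal client-side decode sketch, assuming float32 values in native byte order (match these to the request's embed_dtype and endianness):

import base64
import numpy as np

def decode_embedding(field) -> np.ndarray:
    # str -> base64-packed raw values; list -> plain floats.
    if isinstance(field, str):
        return np.frombuffer(base64.b64decode(field), dtype=np.float32)
    return np.asarray(field, dtype=np.float32)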