Text Embedding Models
gfmrag.text_emb_models
¶
BaseTextEmbModel
¶
A base class for text embedding models using SentenceTransformer.
This class provides functionality to encode text into embeddings using various SentenceTransformer models with configurable parameters.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text_emb_model_name
|
str
|
Name or path of the SentenceTransformer model to use |
required |
normalize
|
bool
|
Whether to L2-normalize the embeddings. Defaults to False. |
False
|
batch_size
|
int
|
Batch size for encoding. Defaults to 32. |
32
|
query_instruct
|
str | None
|
Instruction/prompt to prepend to queries. Defaults to None. |
None
|
passage_instruct
|
str | None
|
Instruction/prompt to prepend to passages. Defaults to None. |
None
|
model_kwargs
|
dict | None
|
Additional keyword arguments for the model. Defaults to None. |
None
|
Attributes:
| Name | Type | Description |
|---|---|---|
text_emb_model |
SentenceTransformer
|
The underlying SentenceTransformer model |
text_emb_model_name |
str
|
Name of the model being used |
normalize |
bool
|
Whether embeddings are L2-normalized |
batch_size |
int
|
Batch size used for encoding |
query_instruct |
str | None
|
Instruction text for queries |
passage_instruct |
str | None
|
Instruction text for passages |
model_kwargs |
dict | None
|
Additional model configuration parameters |
Methods:
| Name | Description |
|---|---|
| encode | `encode(text: list[str], is_query: bool = False, show_progress_bar: bool = True) -> torch.Tensor`: Encodes a list of texts into embeddings. |
Source code in gfmrag/text_emb_models/base_model.py
class BaseTextEmbModel:
    """Base wrapper around a SentenceTransformer for producing text embeddings.

    Encodes lists of strings into a single CPU float tensor, batching the work
    and optionally prepending different instruction prompts for queries vs.
    passages.

    Args:
        text_emb_model_name (str): Name or path of the SentenceTransformer model to use
        normalize (bool, optional): Whether to L2-normalize the embeddings. Defaults to False.
        batch_size (int, optional): Batch size for encoding. Defaults to 32.
        query_instruct (str | None, optional): Instruction/prompt to prepend to queries. Defaults to None.
        passage_instruct (str | None, optional): Instruction/prompt to prepend to passages. Defaults to None.
        model_kwargs (dict | None, optional): Additional keyword arguments for the model. Defaults to None.

    Attributes:
        text_emb_model (SentenceTransformer): The underlying SentenceTransformer model
        text_emb_model_name (str): Name of the model being used
        normalize (bool): Whether embeddings are L2-normalized
        batch_size (int): Batch size used for encoding
        query_instruct (str | None): Instruction text for queries
        passage_instruct (str | None): Instruction text for passages
        model_kwargs (dict | None): Additional model configuration parameters
    """

    def __init__(
        self,
        text_emb_model_name: str,
        normalize: bool = False,
        batch_size: int = 32,
        query_instruct: str | None = None,
        passage_instruct: str | None = None,
        model_kwargs: dict | None = None,
    ) -> None:
        """Store the configuration and load the SentenceTransformer model.

        Args:
            text_emb_model_name (str): Name or path of the SentenceTransformer model to use
            normalize (bool, optional): Whether to L2-normalize the embeddings. Defaults to False.
            batch_size (int, optional): Batch size for encoding. Defaults to 32.
            query_instruct (str | None, optional): Instruction/prompt to prepend to queries. Defaults to None.
            passage_instruct (str | None, optional): Instruction/prompt to prepend to passages. Defaults to None.
            model_kwargs (dict | None, optional): Additional keyword arguments for the model. Defaults to None.
        """
        self.text_emb_model_name = text_emb_model_name
        self.batch_size = batch_size
        self.normalize = normalize
        self.query_instruct = query_instruct
        self.passage_instruct = passage_instruct
        self.model_kwargs = model_kwargs
        # trust_remote_code permits models that ship their own modeling code.
        self.text_emb_model = SentenceTransformer(
            self.text_emb_model_name,
            trust_remote_code=True,
            model_kwargs=self.model_kwargs,
        )

    def encode(
        self, text: list[str], is_query: bool = False, show_progress_bar: bool = True
    ) -> torch.Tensor:
        """Encode a list of strings into a CPU tensor of embeddings.

        Args:
            text (list[str]): List of text strings to encode
            is_query (bool, optional): Whether the text is a query (True) or passage (False).
                Selects which instruction prompt is used. Defaults to False.
            show_progress_bar (bool, optional): Whether to display a progress bar
                during encoding. Defaults to True.

        Returns:
            torch.Tensor: Tensor containing the encoded embeddings for the input text

        Examples:
            >>> text_emb_model = BaseTextEmbModel("sentence-transformers/all-mpnet-base-v2")
            >>> text = ["Hello, world!", "This is a test."]
            >>> embeddings = text_emb_model.encode(text)
        """
        if not text:
            return torch.empty((0, 0), dtype=torch.float32)
        instruction = self.query_instruct if is_query else self.passage_instruct
        chunks: list[torch.Tensor] = []
        batch_starts = range(0, len(text), self.batch_size)
        for start in tqdm(batch_starts, disable=not show_progress_bar):
            piece = text[start : start + self.batch_size]
            emb = self.text_emb_model.encode(
                piece,
                device="cuda" if torch.cuda.is_available() else "cpu",
                normalize_embeddings=self.normalize,
                batch_size=self.batch_size,
                prompt=instruction,
                show_progress_bar=False,
                convert_to_tensor=True,
            ).float()
            # Move each chunk to CPU right away so GPU memory stays bounded.
            chunks.append(emb.cpu())
            del emb
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return torch.cat(chunks, dim=0)
__init__(text_emb_model_name, normalize=False, batch_size=32, query_instruct=None, passage_instruct=None, model_kwargs=None)
¶
Initialize the BaseTextEmbModel.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text_emb_model_name
|
str
|
Name or path of the SentenceTransformer model to use |
required |
normalize
|
bool
|
Whether to L2-normalize the embeddings. Defaults to False. |
False
|
batch_size
|
int
|
Batch size for encoding. Defaults to 32. |
32
|
query_instruct
|
str | None
|
Instruction/prompt to prepend to queries. Defaults to None. |
None
|
passage_instruct
|
str | None
|
Instruction/prompt to prepend to passages. Defaults to None. |
None
|
model_kwargs
|
dict | None
|
Additional keyword arguments for the model. Defaults to None. |
None
|
Source code in gfmrag/text_emb_models/base_model.py
def __init__(
    self,
    text_emb_model_name: str,
    normalize: bool = False,
    batch_size: int = 32,
    query_instruct: str | None = None,
    passage_instruct: str | None = None,
    model_kwargs: dict | None = None,
) -> None:
    """Record the embedding configuration and instantiate the model.

    Args:
        text_emb_model_name (str): Name or path of the SentenceTransformer model to use
        normalize (bool, optional): Whether to L2-normalize the embeddings. Defaults to False.
        batch_size (int, optional): Batch size for encoding. Defaults to 32.
        query_instruct (str | None, optional): Instruction/prompt to prepend to queries. Defaults to None.
        passage_instruct (str | None, optional): Instruction/prompt to prepend to passages. Defaults to None.
        model_kwargs (dict | None, optional): Additional keyword arguments for the model. Defaults to None.
    """
    # Plain configuration attributes, kept exactly as passed in.
    self.text_emb_model_name = text_emb_model_name
    self.batch_size = batch_size
    self.normalize = normalize
    self.query_instruct = query_instruct
    self.passage_instruct = passage_instruct
    self.model_kwargs = model_kwargs
    # trust_remote_code permits models that ship their own modeling code.
    self.text_emb_model = SentenceTransformer(
        self.text_emb_model_name,
        trust_remote_code=True,
        model_kwargs=self.model_kwargs,
    )
encode(text, is_query=False, show_progress_bar=True)
¶
Encodes a list of text strings into embeddings using the text embedding model.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text
|
list[str]
|
List of text strings to encode |
required |
is_query
|
bool
|
Whether the text is a query (True) or passage (False). Determines which instruction prompt to use. Defaults to False. |
False
|
show_progress_bar
|
bool
|
Whether to display progress bar during encoding. Defaults to True. |
True
|
Returns:
| Type | Description |
|---|---|
Tensor
|
torch.Tensor: Tensor containing the encoded embeddings for the input text |
Examples:
>>> text_emb_model = BaseTextEmbModel("sentence-transformers/all-mpnet-base-v2")
>>> text = ["Hello, world!", "This is a test."]
>>> embeddings = text_emb_model.encode(text)
Source code in gfmrag/text_emb_models/base_model.py
def encode(
    self, text: list[str], is_query: bool = False, show_progress_bar: bool = True
) -> torch.Tensor:
    """Encode a list of strings into a CPU tensor of embeddings.

    Args:
        text (list[str]): List of text strings to encode
        is_query (bool, optional): Whether the text is a query (True) or passage (False).
            Selects which instruction prompt is used. Defaults to False.
        show_progress_bar (bool, optional): Whether to display a progress bar
            during encoding. Defaults to True.

    Returns:
        torch.Tensor: Tensor containing the encoded embeddings for the input text

    Examples:
        >>> text_emb_model = BaseTextEmbModel("sentence-transformers/all-mpnet-base-v2")
        >>> text = ["Hello, world!", "This is a test."]
        >>> embeddings = text_emb_model.encode(text)
    """
    if not text:
        return torch.empty((0, 0), dtype=torch.float32)
    instruction = self.query_instruct if is_query else self.passage_instruct
    chunks: list[torch.Tensor] = []
    batch_starts = range(0, len(text), self.batch_size)
    for start in tqdm(batch_starts, disable=not show_progress_bar):
        piece = text[start : start + self.batch_size]
        emb = self.text_emb_model.encode(
            piece,
            device="cuda" if torch.cuda.is_available() else "cpu",
            normalize_embeddings=self.normalize,
            batch_size=self.batch_size,
            prompt=instruction,
            show_progress_bar=False,
            convert_to_tensor=True,
        ).float()
        # Move each chunk to CPU right away so GPU memory stays bounded.
        chunks.append(emb.cpu())
        del emb
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return torch.cat(chunks, dim=0)
NVEmbedV2
¶
Bases: BaseTextEmbModel
A text embedding model class that extends BaseTextEmbModel specifically for Nvidia models.
This class customizes the base embedding model by: 1. Setting a larger max sequence length of 32768 2. Setting right-side padding 3. Adding EOS tokens to input text
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text_emb_model_name
|
str
|
Name or path of the text embedding model |
required |
normalize
|
bool
|
Whether to normalize the output embeddings |
required |
batch_size
|
int
|
Batch size for processing |
required |
query_instruct
|
str
|
Instruction prefix for query texts. Defaults to "". |
''
|
passage_instruct
|
str
|
Instruction prefix for passage texts. Defaults to "". |
''
|
model_kwargs
|
dict | None
|
Additional keyword arguments for model initialization. Defaults to None. |
None
|
Methods:
| Name | Description |
|---|---|
add_eos |
Adds EOS token to each input example |
encode |
Encodes text by first adding EOS tokens then calling parent encode method |
Attributes:
| Name | Type | Description |
|---|---|---|
text_emb_model |
The underlying text embedding model with customized max_seq_length and padding_side |
Source code in gfmrag/text_emb_models/nv_embed.py
class NVEmbedV2(BaseTextEmbModel):
    """A text embedding model class that extends BaseTextEmbModel specifically for Nvidia models.

    This class customizes the base embedding model by:
    1. Setting a larger max sequence length of 32768
    2. Setting right-side padding
    3. Adding EOS tokens to input text

    Args:
        text_emb_model_name (str): Name or path of the text embedding model
        normalize (bool): Whether to normalize the output embeddings
        batch_size (int): Batch size for processing
        query_instruct (str, optional): Instruction prefix for query texts. Defaults to "".
        passage_instruct (str, optional): Instruction prefix for passage texts. Defaults to "".
        model_kwargs (dict | None, optional): Additional keyword arguments for model initialization. Defaults to None.

    Methods:
        add_eos: Adds EOS token to each input example
        encode: Encodes text by first adding EOS tokens then embedding it

    Attributes:
        text_emb_model: The underlying text embedding model with customized padding_side
    """

    def __init__(
        self,
        text_emb_model_name: str,
        normalize: bool,
        batch_size: int,
        query_instruct: str = "",
        passage_instruct: str = "",
        model_kwargs: Mapping[str, Any] | None = None,
    ) -> None:
        self.text_emb_model_name = text_emb_model_name
        self.normalize = normalize
        self.batch_size = batch_size
        self.query_instruct = query_instruct
        self.passage_instruct = passage_instruct
        # Copy to a plain dict so the stored kwargs are decoupled from the caller.
        self.model_kwargs: dict[str, Any] | None = (
            dict(model_kwargs) if model_kwargs is not None else None
        )
        config = NVEmbedConfig.from_pretrained(self.text_emb_model_name)
        self.text_emb_model = cast(
            NVEmbedModel,
            NVEmbedModel.from_pretrained(
                self.text_emb_model_name,
                config=config,
                device_map="auto",
                **dict(self.model_kwargs or {}),
            ),
        )
        # NV-Embed supports long inputs; this limit is passed to encode().
        self.max_seq_length = 32768
        self.text_emb_model.padding_side = "right"
        tokenizer = self.text_emb_model.tokenizer
        if tokenizer is None:
            raise ValueError("NVEmbedModel tokenizer must be initialized.")
        tokenizer.padding_side = "right"

    def add_eos(self, input_examples: list[str]) -> list[str]:
        """Append the tokenizer's EOS token to every input string."""
        tokenizer = self.text_emb_model.tokenizer
        if tokenizer is None or tokenizer.eos_token is None:
            raise ValueError("NVEmbedModel tokenizer EOS token must be initialized.")
        return [input_example + tokenizer.eos_token for input_example in input_examples]

    def encode(
        self, text: list[str], is_query: bool = False, show_progress_bar: bool = True
    ) -> torch.Tensor:
        """
        Encode a list of text strings into embeddings with added EOS token.

        This method adds an EOS (end of sequence) token to each text string before encoding.

        Args:
            text (list[str]): List of text strings to encode
            is_query (bool): Whether the text is being encoded as a query.
            show_progress_bar (bool): Whether to display a progress bar during encoding.

        Returns:
            torch.Tensor: Encoded text embeddings tensor

        Examples:
            >>> encoder = NVEmbedder()
            >>> texts = ["Hello world", "Another text"]
            >>> embeddings = encoder.encode(texts)
        """
        if len(text) == 0:
            return torch.empty((0, 0), dtype=torch.float32)
        # Bug fix: the class/docstring contract promises that an EOS token is
        # appended to each input, but add_eos was never invoked here.
        text = self.add_eos(text)
        prompt = self.query_instruct if is_query else self.passage_instruct
        prompt = prompt or ""
        all_embeddings: list[torch.Tensor] = []
        for i in tqdm(
            range(0, len(text), self.batch_size), disable=not show_progress_bar
        ):
            batch = text[i : min(i + self.batch_size, len(text))]
            raw_batch_embeddings = self.text_emb_model.encode(
                batch,
                instruction=prompt,
                max_length=self.max_seq_length,
            )
            # The underlying model may return either a numpy array or a tensor.
            if isinstance(raw_batch_embeddings, np.ndarray):
                batch_embeddings = torch.from_numpy(raw_batch_embeddings).float()
            else:
                batch_embeddings = raw_batch_embeddings.float()
            if self.normalize:
                batch_embeddings = torch.nn.functional.normalize(
                    batch_embeddings, p=2, dim=1
                )
            # Move each batch to CPU immediately to keep GPU memory bounded.
            all_embeddings.append(batch_embeddings.cpu())
            del batch_embeddings
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return torch.cat(all_embeddings, dim=0)
encode(text, is_query=False, show_progress_bar=True)
¶
Encode a list of text strings into embeddings with added EOS token.
This method adds an EOS (end of sequence) token to each text string before encoding.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text
|
list[str]
|
List of text strings to encode |
required |
is_query
|
bool
|
Whether the text is being encoded as a query. |
False
|
show_progress_bar
|
bool
|
Whether to display a progress bar during encoding. |
True
|
Returns:
| Type | Description |
|---|---|
Tensor
|
torch.Tensor: Encoded text embeddings tensor |
Examples:
>>> encoder = NVEmbedder()
>>> texts = ["Hello world", "Another text"]
>>> embeddings = encoder.encode(texts)
Source code in gfmrag/text_emb_models/nv_embed.py
def encode(
    self, text: list[str], is_query: bool = False, show_progress_bar: bool = True
) -> torch.Tensor:
    """
    Encode a list of text strings into embeddings with added EOS token.

    This method adds an EOS (end of sequence) token to each text string before encoding.

    Args:
        text (list[str]): List of text strings to encode
        is_query (bool): Whether the text is being encoded as a query.
        show_progress_bar (bool): Whether to display a progress bar during encoding.

    Returns:
        torch.Tensor: Encoded text embeddings tensor

    Examples:
        >>> encoder = NVEmbedder()
        >>> texts = ["Hello world", "Another text"]
        >>> embeddings = encoder.encode(texts)
    """
    if len(text) == 0:
        return torch.empty((0, 0), dtype=torch.float32)
    # Bug fix: the docstring promises that an EOS token is appended to each
    # input, but add_eos was never invoked here.
    text = self.add_eos(text)
    prompt = self.query_instruct if is_query else self.passage_instruct
    prompt = prompt or ""
    all_embeddings: list[torch.Tensor] = []
    for i in tqdm(
        range(0, len(text), self.batch_size), disable=not show_progress_bar
    ):
        batch = text[i : min(i + self.batch_size, len(text))]
        raw_batch_embeddings = self.text_emb_model.encode(
            batch,
            instruction=prompt,
            max_length=self.max_seq_length,
        )
        # The underlying model may return either a numpy array or a tensor.
        if isinstance(raw_batch_embeddings, np.ndarray):
            batch_embeddings = torch.from_numpy(raw_batch_embeddings).float()
        else:
            batch_embeddings = raw_batch_embeddings.float()
        if self.normalize:
            batch_embeddings = torch.nn.functional.normalize(
                batch_embeddings, p=2, dim=1
            )
        # Move each batch to CPU immediately to keep GPU memory bounded.
        all_embeddings.append(batch_embeddings.cpu())
        del batch_embeddings
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return torch.cat(all_embeddings, dim=0)
Qwen3TextEmbModel
¶
Bases: BaseTextEmbModel
A text embedding model class that extends BaseTextEmbModel specifically for Qwen3 embedding models.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text_emb_model_name
|
str
|
Name or path of the SentenceTransformer model to use |
required |
normalize
|
bool
|
Whether to L2-normalize the embeddings. Defaults to False. |
False
|
batch_size
|
int
|
Batch size for encoding. Defaults to 32. |
32
|
query_instruct
|
str | None
|
Instruction/prompt to prepend to queries. Defaults to None. |
None
|
passage_instruct
|
str | None
|
Instruction/prompt to prepend to passages. Defaults to None. |
None
|
truncate_dim
|
int | None
|
Dimension to truncate the embeddings to. Defaults to None. |
None
|
model_kwargs
|
dict | None
|
Additional keyword arguments for the model. Defaults to None. |
None
|
tokenizer_kwargs
|
dict | None
|
Additional keyword arguments for the tokenizer. Defaults to None. |
None
|
api_base
|
str
|
Base URL for the vLLM server. If no URL is provided, a local server will be started. |
None
|
api_key
|
str
|
API key for authentication. Defaults to "EMPTY". |
'EMPTY'
|
vllm_timeout
|
int
|
Timeout for vLLM requests in seconds. Defaults to 600. |
600
|
Attributes:
| Name | Type | Description |
|---|---|---|
client |
OpenAI
|
The OpenAI client for making requests to vLLM server |
async_client |
AsyncOpenAI
|
The async OpenAI client for concurrent requests |
text_emb_model_name |
str
|
Name of the model being used |
normalize |
bool
|
Whether embeddings are L2-normalized |
batch_size |
int
|
Batch size used for encoding |
query_instruct |
str | None
|
Instruction text for queries |
passage_instruct |
str | None
|
Instruction text for passages |
truncate_dim |
int | None
|
Dimension to truncate the embeddings to |
model_kwargs |
dict | None
|
Additional model configuration parameters |
tokenizer_kwargs |
dict | None
|
Additional tokenizer configuration parameters |
Methods:
| Name | Description |
|---|---|
| encode | `encode(text: list[str], is_query: bool = False, show_progress_bar: bool = True) -> torch.Tensor`: Encodes a list of texts into embeddings. |
Source code in gfmrag/text_emb_models/qwen3_model.py
class Qwen3TextEmbModel(BaseTextEmbModel):
    """A text embedding model class that extends BaseTextEmbModel specifically for Qwen3 embedding models.

    Embeddings are produced either by a locally started vLLM engine (when no
    ``api_base`` is given) or by requests to an OpenAI-compatible vLLM server.

    Args:
        text_emb_model_name (str): Name or path of the SentenceTransformer model to use
        normalize (bool, optional): Whether to L2-normalize the embeddings. Defaults to False.
        batch_size (int, optional): Batch size for encoding. Defaults to 32.
        query_instruct (str | None, optional): Instruction/prompt to prepend to queries. Defaults to None.
        passage_instruct (str | None, optional): Instruction/prompt to prepend to passages. Defaults to None.
        truncate_dim (int | None, optional): Dimension to truncate the embeddings to. Defaults to None.
        model_kwargs (dict | None, optional): Additional keyword arguments for the model. Defaults to None.
        tokenizer_kwargs (dict | None, optional): Additional keyword arguments for the tokenizer. Defaults to None.
        api_base (str, optional): Base URL for the vLLM server. If no URL is provided, a local server will be started.
        api_key (str, optional): API key for authentication. Defaults to "EMPTY".
        vllm_timeout (int, optional): Timeout for vLLM requests in seconds. Defaults to 600.

    Attributes:
        client (OpenAI): The OpenAI client for making requests to vLLM server
        text_emb_model_name (str): Name of the model being used
        normalize (bool): Whether embeddings are L2-normalized
        batch_size (int): Batch size used for encoding
        query_instruct (str | None): Instruction text for queries
        passage_instruct (str | None): Instruction text for passages
        truncate_dim (int | None): Dimension to truncate the embeddings to
        model_kwargs (dict | None): Additional model configuration parameters
        tokenizer_kwargs (dict | None): Additional tokenizer configuration parameters
    """

    def __init__(
        self,
        text_emb_model_name: str,
        normalize: bool = False,
        batch_size: int = 32,
        query_instruct: str | None = None,
        passage_instruct: str | None = None,
        truncate_dim: int | None = None,
        model_kwargs: dict | None = None,
        tokenizer_kwargs: dict | None = None,
        api_base: str | None = None,
        api_key: str = "EMPTY",
        vllm_timeout: int = 600,
    ) -> None:
        """
        Initialize the Qwen3TextEmbModel.

        Args:
            text_emb_model_name (str): Name or path of the SentenceTransformer model to use
            normalize (bool, optional): Whether to L2-normalize the embeddings. Defaults to False.
            batch_size (int, optional): Batch size for encoding. Defaults to 32.
            query_instruct (str | None, optional): Instruction/prompt to prepend to queries. Defaults to None.
            passage_instruct (str | None, optional): Instruction/prompt to prepend to passages. Defaults to None.
            truncate_dim (int | None, optional): Dimension to truncate the embeddings to. Defaults to None.
            model_kwargs (dict | None, optional): Additional keyword arguments for the model. Defaults to None.
            tokenizer_kwargs (dict | None, optional): Additional keyword arguments for the tokenizer. Defaults to None.
            api_base (str | None, optional): Base URL for the vLLM server. If no url is provided we would start a local server.
            api_key (str | None, optional): API key for authentication. Defaults to "EMPTY".
            vllm_timeout (int, optional): Timeout for vLLM requests in seconds. Defaults to 600.
        """
        self.text_emb_model_name = text_emb_model_name
        self.normalize = normalize
        self.batch_size = batch_size
        self.query_instruct = query_instruct
        self.passage_instruct = passage_instruct
        self.truncate_dim = truncate_dim
        self.model_kwargs = model_kwargs
        self.tokenizer_kwargs = tokenizer_kwargs
        self.api_base = api_base
        self.api_key = api_key
        self.vllm_timeout = vllm_timeout
        if api_base is None:
            # No server URL given: run the model in-process via vLLM.
            self.text_emb_model = self._start_vllm_server()
        else:
            # Check if API is available before constructing the client.
            if not self._is_api_available():
                raise RuntimeError("vLLM API is not available")
            self.client = OpenAI(
                api_key=self.api_key,
                base_url=self.api_base,
            )

    def _is_api_available(self) -> bool:
        """Check if the vLLM API is available at the specified URL."""
        try:
            health_url = self.api_base.replace("/v1", "/health")  # type: ignore
            response = requests.get(health_url, timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    def _start_vllm_server(self) -> LLM:
        """Start a vLLM engine for embedding generation.

        Temporarily strips torch distributed/elastic environment variables so
        vLLM does not mistake this process for a distributed worker, then
        restores them afterwards.
        """
        dist_keys = [
            "RANK",
            "LOCAL_RANK",
            "WORLD_SIZE",
            "LOCAL_WORLD_SIZE",
            "GROUP_RANK",
            "ROLE_RANK",
            "ROLE_NAME",
            "OMP_NUM_THREADS",
            "MASTER_ADDR",
            "MASTER_PORT",
            "TORCHELASTIC_USE_AGENT_STORE",
            "TORCHELASTIC_MAX_RESTARTS",
            "TORCHELASTIC_RUN_ID",
            "TORCH_NCCL_ASYNC_ERROR_HANDLING",
            "TORCHELASTIC_ERROR_FILE",
        ]
        old_env = {}
        for dist_key in dist_keys:
            if dist_key in os.environ:
                old_env[dist_key] = os.environ.pop(dist_key)
        # Pin this process to the GPU matching its (former) local rank.
        os.environ["CUDA_VISIBLE_DEVICES"] = old_env.get("LOCAL_RANK", "0")
        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        text_emb_model = LLM(
            model=self.text_emb_model_name,
            enforce_eager=True,
            task="embed",
            hf_overrides={"is_matryoshka": True},
        )
        # Restore environment variables
        os.environ.pop("CUDA_VISIBLE_DEVICES")
        for dist_key, dist_value in old_env.items():
            if dist_value is not None:
                os.environ[dist_key] = dist_value
        return text_emb_model

    def add_instruct(self, instruct: str | None, query: str) -> str:
        """Adds an instruction prefix to the query text if provided.

        Args:
            instruct (str | None): Instruction text to prepend to the query
            query (str): The query text to which the instruction will be added

        Returns:
            str: The query text with the instruction prepended, or just the query if no instruction is provided
        """
        if instruct is None:
            return query
        else:
            return f"{instruct}{query}"

    def _make_request(self, text: list[str], show_progress_bar: bool) -> torch.Tensor:
        """Makes a request to the vLLM API to get embeddings for the provided text.

        Args:
            text (list[str]): List of text strings to encode
            show_progress_bar (bool): Whether to display a progress bar during the request.

        Returns:
            torch.Tensor: Tensor containing the embeddings for the input text
        """
        # Make request to vLLM server
        dimensions = (
            self.truncate_dim
            if self.truncate_dim is not None and self.truncate_dim > 0
            else NOT_GIVEN
        )
        # Process in batches
        all_embeddings = []
        for i in tqdm(
            range(0, len(text), self.batch_size), disable=not show_progress_bar
        ):
            batch = text[i : min(i + self.batch_size, len(text))]
            response = self.client.embeddings.create(
                model=self.text_emb_model_name,
                input=batch,
                dimensions=dimensions,
                timeout=self.vllm_timeout,
            )
            batch_embeddings = [data.embedding for data in response.data]
            all_embeddings.extend(batch_embeddings)
        result = torch.tensor(all_embeddings, device="cpu", dtype=torch.float32)
        # Bug fix: honor the documented `normalize` option (previously ignored
        # on this code path). L2 normalization is idempotent, so this is safe
        # even if the server already returns unit-norm vectors.
        if self.normalize and result.numel() > 0:
            result = torch.nn.functional.normalize(result, p=2, dim=1)
        return result

    def embed(self, text: list[str], show_progress_bar: bool = True) -> torch.Tensor:
        """
        Embeds a list of text strings using the text embedding model.

        Args:
            text (list[str]): List of text strings to embed.
            show_progress_bar (bool, optional): Whether to display a progress bar during embedding.
                Defaults to True.

        Returns:
            torch.Tensor: Tensor containing the embeddings for the input text.
        """
        all_embeddings = []
        for i in tqdm(
            range(0, len(text), self.batch_size), disable=not show_progress_bar
        ):
            batch = text[i : min(i + self.batch_size, len(text))]
            if self.truncate_dim is not None and self.truncate_dim > 0:
                output = self.text_emb_model.embed(
                    batch,
                    pooling_params=PoolingParams(dimensions=self.truncate_dim),
                    use_tqdm=False,
                )
            else:
                output = self.text_emb_model.embed(batch, use_tqdm=False)
            # Move each batch to CPU immediately to avoid holding the full
            # embedding matrix on GPU when indexing large fact collections.
            batch_embeddings = torch.tensor(
                [o.outputs.embedding for o in output],
                device="cpu",
                dtype=torch.float32,
            )
            all_embeddings.append(batch_embeddings)
            del output
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        if not all_embeddings:
            return torch.empty((0, 0), dtype=torch.float32)
        result = torch.cat(all_embeddings, dim=0)
        # Bug fix: honor the documented `normalize` option (previously ignored
        # on this code path).
        if self.normalize:
            result = torch.nn.functional.normalize(result, p=2, dim=1)
        return result

    def encode(
        self, text: list[str], is_query: bool = False, show_progress_bar: bool = True
    ) -> torch.Tensor:
        """
        Encodes a list of text strings into embeddings using the text embedding model.

        Args:
            text (list[str]): List of text strings to encode
            is_query (bool, optional): Whether the text is a query (True) or passage (False).
                Determines which instruction prompt to use. Defaults to False.
            show_progress_bar (bool, optional): Whether to display progress bar during encoding.
                Defaults to True.

        Returns:
            torch.Tensor: Tensor containing the encoded embeddings for the input text

        Examples:
            >>> text_emb_model = Qwen3TextEmbModel("Qwen/Qwen3-Embedding-0.6B")
            >>> text = ["Hello, world!", "This is a test."]
            >>> embeddings = text_emb_model.encode(text)
        """
        # Consistency fix: guard empty input like the base class so both the
        # API path and the local path return an empty (0, 0) tensor.
        if len(text) == 0:
            return torch.empty((0, 0), dtype=torch.float32)
        text_with_instruct = [
            self.add_instruct(self.query_instruct, t)
            if is_query
            else self.add_instruct(self.passage_instruct, t)
            for t in text
        ]
        if self.api_base:
            return self._make_request(text_with_instruct, show_progress_bar)
        else:
            return self.embed(text_with_instruct, show_progress_bar)
__init__(text_emb_model_name, normalize=False, batch_size=32, query_instruct=None, passage_instruct=None, truncate_dim=None, model_kwargs=None, tokenizer_kwargs=None, api_base=None, api_key='EMPTY', vllm_timeout=600)
¶
Initialize the BaseTextEmbModel.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
text_emb_model_name
|
str
|
Name or path of the SentenceTransformer model to use |
required |
normalize
|
bool
|
Whether to L2-normalize the embeddings. Defaults to False. |
False
|
batch_size
|
int
|
Batch size for encoding. Defaults to 32. |
32
|
query_instruct
|
str | None
|
Instruction/prompt to prepend to queries. Defaults to None. |
None
|
passage_instruct
|
str | None
|
Instruction/prompt to prepend to passages. Defaults to None. |
None
|
truncate_dim
|
int | None
|
Dimension to truncate the embeddings to. Defaults to None. |
None
|
model_kwargs
|
dict | None
|
Additional keyword arguments for the model. Defaults to None. |
None
|
tokenizer_kwargs
|
dict | None
|
Additional keyword arguments for the tokenizer. Defaults to None. |
None
|
api_base
|
str | None
|
Base URL for the vLLM server. If no url is provided we would start a local server. |
None
|
api_key
|
str | None
|
API key for authentication. Defaults to "EMPTY". |
'EMPTY'
|
vllm_timeout
|
int
|
Timeout for vLLM requests in seconds. Defaults to 600. |
600
|
Source code in gfmrag/text_emb_models/qwen3_model.py
def __init__(
    self,
    text_emb_model_name: str,
    normalize: bool = False,
    batch_size: int = 32,
    query_instruct: str | None = None,
    passage_instruct: str | None = None,
    truncate_dim: int | None = None,
    model_kwargs: dict | None = None,
    tokenizer_kwargs: dict | None = None,
    api_base: str | None = None,
    api_key: str = "EMPTY",
    vllm_timeout: int = 600,
) -> None:
    """Record the configuration and set up either an in-process vLLM engine
    or an OpenAI-compatible client for a remote vLLM server.

    Args:
        text_emb_model_name (str): Name or path of the SentenceTransformer model to use
        normalize (bool, optional): Whether to L2-normalize the embeddings. Defaults to False.
        batch_size (int, optional): Batch size for encoding. Defaults to 32.
        query_instruct (str | None, optional): Instruction/prompt to prepend to queries. Defaults to None.
        passage_instruct (str | None, optional): Instruction/prompt to prepend to passages. Defaults to None.
        truncate_dim (int | None, optional): Dimension to truncate the embeddings to. Defaults to None.
        model_kwargs (dict | None, optional): Additional keyword arguments for the model. Defaults to None.
        tokenizer_kwargs (dict | None, optional): Additional keyword arguments for the tokenizer. Defaults to None.
        api_base (str | None, optional): Base URL for the vLLM server. If no url is provided we would start a local server.
        api_key (str | None, optional): API key for authentication. Defaults to "EMPTY".
        vllm_timeout (int, optional): Timeout for vLLM requests in seconds. Defaults to 600.
    """
    # Plain configuration attributes.
    self.text_emb_model_name = text_emb_model_name
    self.batch_size = batch_size
    self.normalize = normalize
    self.query_instruct = query_instruct
    self.passage_instruct = passage_instruct
    self.truncate_dim = truncate_dim
    self.model_kwargs = model_kwargs
    self.tokenizer_kwargs = tokenizer_kwargs
    self.api_base = api_base
    self.api_key = api_key
    self.vllm_timeout = vllm_timeout
    if api_base is not None:
        # Remote server: verify it responds before building a client for it.
        if not self._is_api_available():
            raise RuntimeError("vLLM API is not available")
        self.client = OpenAI(
            api_key=self.api_key,
            base_url=self.api_base,
        )
    else:
        # No URL given: spin up a local vLLM engine instead.
        self.text_emb_model = self._start_vllm_server()
add_instruct(instruct, query)
¶
Adds an instruction prefix to the query text if provided.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
instruct
|
str | None
|
Instruction text to prepend to the query |
required |
query
|
str
|
The query text to which the instruction will be added |
required |
Source code in gfmrag/text_emb_models/qwen3_model.py
def add_instruct(self, instruct: str | None, query: str) -> str:
"""Adds an instruction prefix to the query text if provided.
Args:
instruct (str | None): Instruction text to prepend to the query
query (str): The query text to which the instruction will be added
Returns:
str: The query text with the instruction prepended, or just the query if no instruction is provided
"""
if instruct is None:
return query
else:
return f"{instruct}{query}"
embed(text, show_progress_bar=True)
¶
Embeds a list of text strings using the text embedding model.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `text` | `list[str]` | List of text strings to embed. | required |
| `show_progress_bar` | `bool` | Whether to display a progress bar during embedding. Defaults to True. | `True` |

Returns:

| Type | Description |
|---|---|
| `torch.Tensor` | Tensor containing the embeddings for the input text. |
Source code in gfmrag/text_emb_models/qwen3_model.py
def embed(self, text: list[str], show_progress_bar: bool = True) -> torch.Tensor:
    """Embed a list of text strings using the underlying vLLM embedding model.

    Texts are processed in batches of ``self.batch_size``. Each batch's
    embeddings are moved to CPU immediately so that the full embedding matrix
    is never held on GPU at once.

    Args:
        text (list[str]): List of text strings to embed.
        show_progress_bar (bool, optional): Whether to display a progress bar
            during embedding. Defaults to True.

    Returns:
        torch.Tensor: Float32 CPU tensor containing the embeddings for the
            input text, or an empty ``(0, 0)`` tensor when ``text`` is empty.
    """
    all_embeddings = []
    for i in tqdm(
        range(0, len(text), self.batch_size), disable=not show_progress_bar
    ):
        # Python slicing already clamps to len(text); no explicit min() needed.
        batch = text[i : i + self.batch_size]
        if self.truncate_dim is not None and self.truncate_dim > 0:
            # Request server-side truncation of embeddings to truncate_dim.
            output = self.text_emb_model.embed(
                batch,
                pooling_params=PoolingParams(dimensions=self.truncate_dim),
                use_tqdm=False,
            )
        else:
            output = self.text_emb_model.embed(batch, use_tqdm=False)
        # Move each batch to CPU immediately to avoid holding the full
        # embedding matrix on GPU when indexing large fact collections.
        batch_embeddings = torch.tensor(
            [o.outputs.embedding for o in output],
            device="cpu",
            dtype=torch.float32,
        )
        all_embeddings.append(batch_embeddings)
        del output
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    if not all_embeddings:
        return torch.empty((0, 0), dtype=torch.float32)
    return torch.cat(all_embeddings, dim=0)
encode(text, is_query=False, show_progress_bar=True)
¶
Encodes a list of text strings into embeddings using the text embedding model.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `text` | `list[str]` | List of text strings to encode | required |
| `is_query` | `bool` | Whether the text is a query (True) or passage (False). Determines which instruction prompt to use. Defaults to False. | `False` |
| `show_progress_bar` | `bool` | Whether to display progress bar during encoding. Defaults to True. | `True` |

Returns:

| Type | Description |
|---|---|
| `torch.Tensor` | Tensor containing the encoded embeddings for the input text |
Examples:
>>> text_emb_model = Qwen3TextEmbModel("Qwen/Qwen3-Embedding-0.6B")
>>> text = ["Hello, world!", "This is a test."]
>>> embeddings = text_emb_model.encode(text)
Source code in gfmrag/text_emb_models/qwen3_model.py
def encode(
    self, text: list[str], is_query: bool = False, show_progress_bar: bool = True
) -> torch.Tensor:
    """
    Encodes a list of text strings into embeddings using the text embedding model.

    The query or passage instruction (chosen by ``is_query``) is prepended to
    each input, then the batch is embedded either via the remote API (when
    ``self.api_base`` is set) or the local model.

    Args:
        text (list[str]): List of text strings to encode
        is_query (bool, optional): Whether the text is a query (True) or passage (False).
            Determines which instruction prompt to use. Defaults to False.
        show_progress_bar (bool, optional): Whether to display progress bar during encoding.
            Defaults to True.

    Returns:
        torch.Tensor: Tensor containing the encoded embeddings for the input text

    Examples:
        >>> text_emb_model = Qwen3TextEmbModel("Qwen/Qwen3-Embedding-0.6B")
        >>> text = ["Hello, world!", "This is a test."]
        >>> embeddings = text_emb_model.encode(text)
    """
    # Pick the instruction once instead of branching per element.
    instruct = self.query_instruct if is_query else self.passage_instruct
    prepared = [self.add_instruct(instruct, item) for item in text]
    if self.api_base:
        return self._make_request(prepared, show_progress_bar)
    return self.embed(prepared, show_progress_bar)