Large Language Models
gfmrag.llms
¶
BaseLanguageModel
¶
Bases: ABC
Abstract interface for language-model backends.
Source code in gfmrag/llms/base_language_model.py
class BaseLanguageModel(ABC):
"""Abstract interface for language-model backends."""
@abstractmethod
def __init__(self, model_name_or_path: str):
pass
@abstractmethod
def token_len(self, text: str) -> int:
"""Return the tokenized length of ``text``."""
pass
@abstractmethod
def generate_sentence(
self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
"""Generate text from ``llm_input`` with an optional system prompt."""
pass
ChatGPT
¶
Bases: BaseLanguageModel
A class that interacts with OpenAI's ChatGPT models through their API.
This class provides functionality to generate text using ChatGPT models while handling token limits, retries, and various input formats.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
model_name_or_path
|
str
|
The name or path of the ChatGPT model to use |
required |
retry
|
int
|
Number of retries for failed API calls. Defaults to 5 |
5
|
Attributes:
| Name | Type | Description |
|---|---|---|
retry |
int
|
Maximum number of retry attempts for failed API calls |
model_name |
str
|
Name of the ChatGPT model being used |
maximun_token |
int
|
Maximum token limit for the specified model |
client |
OpenAI
|
OpenAI client instance for API interactions |
Methods:
| Name | Description |
|---|---|
token_len |
Calculate the number of tokens in a given text |
generate_sentence |
Generate response using the ChatGPT model |
Raises:
| Type | Description |
|---|---|
KeyError
|
If the specified model is not found when calculating tokens |
Exception
|
If generation fails after maximum retries |
Source code in gfmrag/llms/chatgpt.py
class ChatGPT(BaseLanguageModel):
    """A class that interacts with OpenAI's ChatGPT models through their API.

    This class provides functionality to generate text using ChatGPT models while
    handling token limits, retries, and various input formats.

    Args:
        model_name_or_path (str): The name or path of the ChatGPT model to use
        retry (int, optional): Number of retries for failed API calls. Defaults to 5

    Attributes:
        retry (int): Maximum number of retry attempts for failed API calls
        model_name (str): Name of the ChatGPT model being used
        maximun_token (int): Maximum token limit for the specified model
        client (OpenAI): OpenAI client instance for API interactions

    Raises:
        KeyError: If the specified model is not found when calculating tokens
    """

    def __init__(self, model_name_or_path: str, retry: int = 5):
        self.retry = retry
        self.model_name = model_name_or_path
        # NOTE: attribute name kept as ``maximun_token`` (sic) for backward
        # compatibility with existing callers.
        self.maximun_token = get_token_limit(self.model_name)
        self.client = OpenAI()

    def token_len(self, text: str) -> int:
        """Return the number of tokens ``text`` occupies for this model.

        Raises:
            KeyError: If tiktoken has no encoding registered for the model.
        """
        try:
            encoding = tiktoken.encoding_for_model(self.model_name)
            num_tokens = len(encoding.encode(text))
        except KeyError as e:
            raise KeyError(f"Warning: model {self.model_name} not found.") from e
        return num_tokens

    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """Generate a response using the ChatGPT API.

        Handles both a single prompt string and a ready-made message list, with
        retry logic for failed attempts.

        Args:
            llm_input (Union[str, list]): Either the user's input string or a list
                of message dicts in the format
                ``[{"role": "role_type", "content": "message_content"}, ...]``
            system_input (str, optional): System message prepended to the
                conversation (ignored when ``llm_input`` is already a message
                list). Defaults to "".

        Returns:
            Union[str, Exception]: The generated response text (stripped of
            surrounding whitespace) if successful, or the last encountered
            exception if all retries fail.

        Notes:
            - Inputs exceeding the model's token limit are right-truncated
              token-wise before the request is sent
            - Waits a fixed 30 seconds between retries
            - Temperature is 0.0 for deterministic outputs; timeout is 60 seconds
              per API call
        """
        # A list input is assumed to already be a chat-message list. Copy it so
        # truncation below cannot mutate the caller's list.
        if isinstance(llm_input, list):
            message = list(llm_input)
        else:
            message = []
            if system_input:
                message.append({"role": "system", "content": system_input})
            message.append({"role": "user", "content": llm_input})

        # Check if the input is too long.
        message_string = "\n".join([m["content"] for m in message])
        input_length = self.token_len(message_string)
        if input_length > self.maximun_token:
            print(
                f"Input length {input_length} is too long. The maximum token is {self.maximun_token}.\n Right truncate the input to {self.maximun_token} tokens."
            )
            # BUG FIX: the original sliced ``llm_input`` by *characters* after
            # ``message`` was already built, so the request was never actually
            # shortened. Truncate the final message's content token-wise instead.
            encoding = tiktoken.encoding_for_model(self.model_name)
            overflow = input_length - self.maximun_token
            tokens = encoding.encode(message[-1]["content"])
            message[-1] = {
                **message[-1],
                "content": encoding.decode(tokens[: max(0, len(tokens) - overflow)]),
            }

        error: Exception = Exception("Failed to generate sentence")
        cur_retry = 0
        # <= means one initial attempt plus ``retry`` retries.
        while cur_retry <= self.retry:
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name, messages=message, timeout=60, temperature=0.0
                )
                return response.choices[0].message.content.strip()  # type: ignore
            except Exception as e:
                # Lazy %-formatting so the values are actually rendered
                # (the original passed them as unused format args).
                logger.error("Message: %s", llm_input)
                logger.error("Number of token: %s", self.token_len(message_string))
                logger.error(e)
                time.sleep(30)
                cur_retry += 1
                error = e
        return error
generate_sentence(llm_input, system_input='')
¶
Generate a response using the ChatGPT API.
This method sends a request to the ChatGPT API and returns the generated response. It handles both single string inputs and message lists, with retry logic for failed attempts.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
llm_input
|
Union[str, list]
|
Either a string containing the user's input or a list of message dictionaries in the format [{"role": "role_type", "content": "message_content"}, ...] |
required |
system_input
|
str
|
System message to be prepended to the conversation. Defaults to "". |
''
|
Returns:
| Type | Description |
|---|---|
str | Exception
|
Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail. The response is stripped of leading/trailing whitespace. |
Raises:
| Type | Description |
|---|---|
Exception
|
If all retry attempts fail, returns the last encountered exception. |
Notes
- Automatically truncates inputs that exceed the maximum token limit
- Waits a fixed 30 seconds between retries (no exponential backoff)
- Sets temperature to 0.0 for deterministic outputs
- Timeout is set to 60 seconds per API call
Source code in gfmrag/llms/chatgpt.py
def generate_sentence(
    self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
    """Generate a response using the ChatGPT API.

    Handles both a single prompt string and a ready-made message list, with
    retry logic for failed attempts.

    Args:
        llm_input (Union[str, list]): Either the user's input string or a list
            of message dicts in the format
            ``[{"role": "role_type", "content": "message_content"}, ...]``
        system_input (str, optional): System message prepended to the
            conversation (ignored when ``llm_input`` is already a message
            list). Defaults to "".

    Returns:
        Union[str, Exception]: The generated response text (stripped of
        surrounding whitespace) if successful, or the last encountered
        exception if all retries fail.

    Notes:
        - Inputs exceeding the model's token limit are right-truncated
          token-wise before the request is sent
        - Waits a fixed 30 seconds between retries
        - Temperature is 0.0 for deterministic outputs; timeout is 60 seconds
          per API call
    """
    # A list input is assumed to already be a chat-message list. Copy it so
    # truncation below cannot mutate the caller's list.
    if isinstance(llm_input, list):
        message = list(llm_input)
    else:
        message = []
        if system_input:
            message.append({"role": "system", "content": system_input})
        message.append({"role": "user", "content": llm_input})

    # Check if the input is too long.
    message_string = "\n".join([m["content"] for m in message])
    input_length = self.token_len(message_string)
    if input_length > self.maximun_token:
        print(
            f"Input length {input_length} is too long. The maximum token is {self.maximun_token}.\n Right truncate the input to {self.maximun_token} tokens."
        )
        # BUG FIX: the original sliced ``llm_input`` by *characters* after
        # ``message`` was already built, so the request was never actually
        # shortened. Truncate the final message's content token-wise instead.
        encoding = tiktoken.encoding_for_model(self.model_name)
        overflow = input_length - self.maximun_token
        tokens = encoding.encode(message[-1]["content"])
        message[-1] = {
            **message[-1],
            "content": encoding.decode(tokens[: max(0, len(tokens) - overflow)]),
        }

    error: Exception = Exception("Failed to generate sentence")
    cur_retry = 0
    # <= means one initial attempt plus ``self.retry`` retries.
    while cur_retry <= self.retry:
        try:
            response = self.client.chat.completions.create(
                model=self.model_name, messages=message, timeout=60, temperature=0.0
            )
            return response.choices[0].message.content.strip()  # type: ignore
        except Exception as e:
            # Lazy %-formatting so the values are actually rendered
            # (the original passed them as unused format args).
            logger.error("Message: %s", llm_input)
            logger.error("Number of token: %s", self.token_len(message_string))
            logger.error(e)
            time.sleep(30)
            cur_retry += 1
            error = e
    return error
token_len(text)
¶
Returns the number of tokens used by the given text.
Source code in gfmrag/llms/chatgpt.py
def token_len(self, text: str) -> int:
    """Return the number of tokens ``text`` occupies under this model's encoding.

    Raises:
        KeyError: If tiktoken has no encoding registered for ``self.model_name``.
    """
    try:
        encoding = tiktoken.encoding_for_model(self.model_name)
    except KeyError as err:
        raise KeyError(f"Warning: model {self.model_name} not found.") from err
    return len(encoding.encode(text))
HfCausalModel
¶
Bases: BaseLanguageModel
Hugging Face causal language-model wrapper.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
model_name_or_path
|
str
|
Pretrained model name or local path. |
required |
maximun_token
|
int
|
Maximum number of input tokens. |
4096
|
max_new_tokens
|
int
|
Maximum number of generated tokens. |
1024
|
dtype
|
str
|
Runtime dtype name. |
'bf16'
|
quant
|
None | str
|
Optional quantization mode. |
None
|
attn_implementation
|
str
|
Attention backend name. |
'flash_attention_2'
|
Source code in gfmrag/llms/base_hf_causal_model.py
class HfCausalModel(BaseLanguageModel):
    """Hugging Face causal language-model wrapper.

    Args:
        model_name_or_path: Pretrained model name or local path.
        maximun_token: Maximum number of input tokens. NOTE(review): currently
            superseded by the tokenizer's ``model_max_length`` (see ``__init__``).
        max_new_tokens: Maximum number of generated tokens.
        dtype: Runtime dtype name, one of ``fp32``/``fp16``/``bf16``.
        quant: Optional quantization mode (``None``, ``"4bit"`` or ``"8bit"``).
        attn_implementation: Attention backend name.

    Raises:
        ValueError: If ``quant``, ``attn_implementation`` or ``dtype`` is not a
            supported option.
    """

    DTYPE = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
    QUANT = [None, "4bit", "8bit"]
    ATTEN_IMPLEMENTATION = ["eager", "sdpa", "flash_attention_2"]

    def __init__(
        self,
        model_name_or_path: str,
        maximun_token: int = 4096,
        max_new_tokens: int = 1024,
        dtype: str = "bf16",
        quant: None | str = None,
        attn_implementation: str = "flash_attention_2",
    ):
        # Validate with explicit exceptions instead of ``assert`` so the checks
        # survive ``python -O`` (asserts are stripped under optimization).
        if quant not in self.QUANT:
            raise ValueError(f"quant should be one of {self.QUANT}")
        if attn_implementation not in self.ATTEN_IMPLEMENTATION:
            raise ValueError(
                f"attn_implementation should be one of {self.ATTEN_IMPLEMENTATION}"
            )
        if dtype not in self.DTYPE:
            raise ValueError(f"dtype should be one of {self.DTYPE}")
        self.max_new_tokens = max_new_tokens
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, token=HF_TOKEN, trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            device_map="auto",
            token=HF_TOKEN,
            torch_dtype=self.DTYPE.get(dtype, None),
            load_in_8bit=quant == "8bit",
            load_in_4bit=quant == "4bit",
            trust_remote_code=True,
            attn_implementation=attn_implementation,
        )
        # NOTE(review): the original assigned ``maximun_token`` here and then
        # immediately overwrote it with the tokenizer's limit, so the parameter
        # is effectively ignored. The dead store is removed; confirm whether the
        # explicit argument should take precedence instead.
        self.maximun_token = self.tokenizer.model_max_length
        self.generator = pipeline(
            "text-generation", model=model, tokenizer=self.tokenizer
        )

    def token_len(self, text: str) -> int:
        """Return the number of tokens ``text`` occupies under this tokenizer."""
        return len(self.tokenizer.tokenize(text))

    @torch.inference_mode()
    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """Generate text from a prompt string or a chat message list.

        Args:
            llm_input: Prompt string or chat message list.
            system_input: Optional system prompt used with string input (ignored
                when ``llm_input`` is already a message list).

        Returns:
            Generated text, or the raised exception instance when generation fails.
        """
        # A list input is assumed to already be a chat-message list.
        if isinstance(llm_input, list):
            message = llm_input
        else:
            message = []
            if system_input:
                message.append({"role": "system", "content": system_input})
            message.append({"role": "user", "content": llm_input})
        try:
            outputs = self.generator(
                message,
                return_full_text=False,
                max_new_tokens=self.max_new_tokens,
                handle_long_generation="hole",
            )
            return outputs[0]["generated_text"].strip()  # type: ignore
        except Exception as e:
            return e
generate_sentence(llm_input, system_input='')
¶
Generate text from a prompt string or a chat message list.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
llm_input
|
str | list
|
Prompt string or chat message list. |
required |
system_input
|
str
|
Optional system prompt used with string input. |
''
|
Returns:
| Type | Description |
|---|---|
str | Exception
|
Generated text, or the raised exception instance when generation fails. |
Source code in gfmrag/llms/base_hf_causal_model.py
@torch.inference_mode()
def generate_sentence(
self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
"""Generate text from a prompt string or a chat message list.
Args:
llm_input: Prompt string or chat message list.
system_input: Optional system prompt used with string input.
Returns:
Generated text, or the raised exception instance when generation fails.
"""
# If the input is a list, it is assumed that the input is a list of messages
if isinstance(llm_input, list):
message = llm_input
else:
message = []
if system_input:
message.append({"role": "system", "content": system_input})
message.append({"role": "user", "content": llm_input})
try:
outputs = self.generator(
message,
return_full_text=False,
max_new_tokens=self.max_new_tokens,
handle_long_generation="hole",
)
return outputs[0]["generated_text"].strip() # type: ignore
except Exception as e:
return e