Large Language Models
gfmrag.llms
BaseLanguageModel
Bases: ABC
Base language model. Defines how to generate a sentence using a LM.
Source code in gfmrag/llms/base_language_model.py
class BaseLanguageModel(ABC):
"""
Base lanuage model. Define how to generate sentence by using a LM
"""
@abstractmethod
def __init__(self, model_name_or_path: str):
pass
@abstractmethod
def token_len(self, text: str) -> int:
"""
Return tokenized length of text
Args:
text (str): input text
"""
pass
@abstractmethod
def generate_sentence(
self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
"""
Generate sentence by using a LM
Args:
lm_input (LMInput): input for LM
"""
pass
generate_sentence(llm_input, system_input='')
abstractmethod
Generate a sentence using a LM.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `llm_input` | `str \| list` | Input for the language model | *required* |
| `system_input` | `str` | Optional system message prepended to the conversation | `''` |
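To make the abstract contract concrete, here is a minimal sketch of a custom backend. The `EchoLM` class and its whitespace "tokenizer" are hypothetical, purely for illustration; real implementations such as `ChatGPT` and `HfCausalModel` are documented below.

```python
from gfmrag.llms.base_language_model import BaseLanguageModel


class EchoLM(BaseLanguageModel):
    """Hypothetical toy backend that echoes its input, for illustration only."""

    def __init__(self, model_name_or_path: str):
        self.model_name = model_name_or_path

    def token_len(self, text: str) -> int:
        # Toy tokenizer: whitespace splitting stands in for a real tokenizer.
        return len(text.split())

    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        # Accept both a raw prompt string and a chat-style message list,
        # mirroring the concrete implementations below.
        if isinstance(llm_input, list):
            return " ".join(m["content"] for m in llm_input)
        return f"{system_input} {llm_input}".strip()


lm = EchoLM("echo")
print(lm.token_len("hello world"))  # 2
print(lm.generate_sentence("Hi there", system_input="Be brief."))
```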
ChatGPT
Bases: BaseLanguageModel
A class that interacts with OpenAI's ChatGPT models through their API.
This class provides functionality to generate text using ChatGPT models while handling token limits, retries, and various input formats.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model_name_or_path` | `str` | The name or path of the ChatGPT model to use | *required* |
| `retry` | `int` | Number of retries for failed API calls. Defaults to 5 | `5` |
Attributes:

| Name | Type | Description |
|---|---|---|
| `retry` | `int` | Maximum number of retry attempts for failed API calls |
| `model_name` | `str` | Name of the ChatGPT model being used |
| `maximun_token` | `int` | Maximum token limit for the specified model |
| `client` | `OpenAI` | OpenAI client instance for API interactions |
Methods:

| Name | Description |
|---|---|
| `token_len` | Calculate the number of tokens in a given text |
| `generate_sentence` | Generate a response using the ChatGPT model |
Raises:

| Type | Description |
|---|---|
| `KeyError` | If the specified model is not found when calculating tokens |
| `Exception` | If generation fails after maximum retries |
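A minimal usage sketch, assuming `OPENAI_API_KEY` is set in the environment (the constructor reads it from `os.environ`) and that `ChatGPT` is importable from `gfmrag.llms`; the model name is only an example:

```python
from gfmrag.llms import ChatGPT

llm = ChatGPT("gpt-4o-mini", retry=3)

# A plain string is wrapped into a single user message; the optional
# system_input is prepended as a system message.
answer = llm.generate_sentence(
    "What is a knowledge graph?",
    system_input="Answer in one sentence.",
)

# On repeated failure the method returns the last Exception instead of
# raising it, so check the result type before using it.
if isinstance(answer, Exception):
    print(f"Generation failed: {answer}")
else:
    print(answer)
```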
Source code in gfmrag/llms/chatgpt.py
class ChatGPT(BaseLanguageModel):
"""A class that interacts with OpenAI's ChatGPT models through their API.
This class provides functionality to generate text using ChatGPT models while handling
token limits, retries, and various input formats.
Args:
model_name_or_path (str): The name or path of the ChatGPT model to use
retry (int, optional): Number of retries for failed API calls. Defaults to 5
Attributes:
retry (int): Maximum number of retry attempts for failed API calls
model_name (str): Name of the ChatGPT model being used
maximun_token (int): Maximum token limit for the specified model
client (OpenAI): OpenAI client instance for API interactions
Methods:
token_len(text): Calculate the number of tokens in a given text
generate_sentence(llm_input, system_input): Generate response using the ChatGPT model
Raises:
KeyError: If the specified model is not found when calculating tokens
Exception: If generation fails after maximum retries
"""
def __init__(self, model_name_or_path: str, retry: int = 5):
self.retry = retry
self.model_name = model_name_or_path
self.maximun_token = get_token_limit(self.model_name)
client = OpenAI(
api_key=os.environ[
"OPENAI_API_KEY"
], # this is also the default, it can be omitted
)
self.client = client
def token_len(self, text: str) -> int:
"""Returns the number of tokens used by a list of messages."""
try:
encoding = tiktoken.encoding_for_model(self.model_name)
num_tokens = len(encoding.encode(text))
except KeyError as e:
raise KeyError(f"Warning: model {self.model_name} not found.") from e
return num_tokens
def generate_sentence(
self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
"""Generate a response using the ChatGPT API.
This method sends a request to the ChatGPT API and returns the generated response.
It handles both single string inputs and message lists, with retry logic for failed attempts.
Args:
llm_input (Union[str, list]): Either a string containing the user's input or a list of message dictionaries
in the format [{"role": "role_type", "content": "message_content"}, ...]
system_input (str, optional): System message to be prepended to the conversation. Defaults to "".
Returns:
Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail.
The response is stripped of leading/trailing whitespace.
Raises:
Exception: If all retry attempts fail, returns the last encountered exception.
Notes:
- Automatically truncates inputs that exceed the maximum token limit
            - Waits 30 seconds between retry attempts
- Sets temperature to 0.0 for deterministic outputs
- Timeout is set to 60 seconds per API call
"""
# If the input is a list, it is assumed that the input is a list of messages
if isinstance(llm_input, list):
message = llm_input
else:
message = []
if system_input:
message.append({"role": "system", "content": system_input})
message.append({"role": "user", "content": llm_input})
cur_retry = 0
num_retry = self.retry
# Check if the input is too long
message_string = "\n".join([m["content"] for m in message])
input_length = self.token_len(message_string)
        if input_length > self.maximun_token:
            print(
                f"Input length {input_length} is too long. The maximum token limit is {self.maximun_token}.\n Right-truncate the input to {self.maximun_token} tokens."
            )
            # Note: slicing truncates characters, not tokens.
            llm_input = llm_input[: self.maximun_token]
error = Exception("Failed to generate sentence")
while cur_retry <= num_retry:
try:
response = self.client.chat.completions.create(
model=self.model_name, messages=message, timeout=60, temperature=0.0
)
result = response.choices[0].message.content.strip() # type: ignore
return result
except Exception as e:
logger.error("Message: ", llm_input)
logger.error("Number of token: ", self.token_len(message_string))
logger.error(e)
time.sleep(30)
cur_retry += 1
error = e
continue
return error
generate_sentence(llm_input, system_input='')
¶
Generate a response using the ChatGPT API.
This method sends a request to the ChatGPT API and returns the generated response. It handles both single string inputs and message lists, with retry logic for failed attempts.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `llm_input` | `Union[str, list]` | Either a string containing the user's input or a list of message dictionaries in the format `[{"role": "role_type", "content": "message_content"}, ...]` | *required* |
| `system_input` | `str` | System message to be prepended to the conversation. Defaults to `""`. | `''` |
Returns:

| Type | Description |
|---|---|
| `str \| Exception` | The generated response text if successful, or the Exception if all retries fail. The response is stripped of leading/trailing whitespace. |
Raises:

| Type | Description |
|---|---|
| `Exception` | If all retry attempts fail, the last encountered exception is returned. |
Notes
- Automatically truncates inputs that exceed the maximum token limit
- Waits 30 seconds between retry attempts
- Sets temperature to 0.0 for deterministic outputs
- Timeout is set to 60 seconds per API call
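For multi-turn conversations the method also accepts a pre-built message list, which is passed to the API verbatim; note that `system_input` is ignored in this case, so any system message must be included in the list itself. A sketch, reusing the `llm` instance from the earlier example:

```python
# Chat-style input: the list becomes the API `messages` payload as-is.
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Name one use case for GraphRAG."},
    {"role": "assistant", "content": "Multi-hop question answering."},
    {"role": "user", "content": "Name another."},
]
result = llm.generate_sentence(messages)
```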
Source code in gfmrag/llms/chatgpt.py
def generate_sentence(
self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
"""Generate a response using the ChatGPT API.
This method sends a request to the ChatGPT API and returns the generated response.
It handles both single string inputs and message lists, with retry logic for failed attempts.
Args:
llm_input (Union[str, list]): Either a string containing the user's input or a list of message dictionaries
in the format [{"role": "role_type", "content": "message_content"}, ...]
system_input (str, optional): System message to be prepended to the conversation. Defaults to "".
Returns:
Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail.
The response is stripped of leading/trailing whitespace.
Raises:
Exception: If all retry attempts fail, returns the last encountered exception.
Notes:
- Automatically truncates inputs that exceed the maximum token limit
        - Waits 30 seconds between retry attempts
- Sets temperature to 0.0 for deterministic outputs
- Timeout is set to 60 seconds per API call
"""
# If the input is a list, it is assumed that the input is a list of messages
if isinstance(llm_input, list):
message = llm_input
else:
message = []
if system_input:
message.append({"role": "system", "content": system_input})
message.append({"role": "user", "content": llm_input})
cur_retry = 0
num_retry = self.retry
# Check if the input is too long
message_string = "\n".join([m["content"] for m in message])
input_length = self.token_len(message_string)
    if input_length > self.maximun_token:
        print(
            f"Input length {input_length} is too long. The maximum token limit is {self.maximun_token}.\n Right-truncate the input to {self.maximun_token} tokens."
        )
        # Note: slicing truncates characters, not tokens.
        llm_input = llm_input[: self.maximun_token]
error = Exception("Failed to generate sentence")
while cur_retry <= num_retry:
try:
response = self.client.chat.completions.create(
model=self.model_name, messages=message, timeout=60, temperature=0.0
)
result = response.choices[0].message.content.strip() # type: ignore
return result
except Exception as e:
logger.error("Message: ", llm_input)
logger.error("Number of token: ", self.token_len(message_string))
logger.error(e)
time.sleep(30)
cur_retry += 1
error = e
continue
return error
token_len(text)
Returns the number of tokens used by the given text.
Source code in gfmrag/llms/chatgpt.py
def token_len(self, text: str) -> int:
"""Returns the number of tokens used by a list of messages."""
try:
encoding = tiktoken.encoding_for_model(self.model_name)
num_tokens = len(encoding.encode(text))
except KeyError as e:
raise KeyError(f"Warning: model {self.model_name} not found.") from e
return num_tokens
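`token_len` delegates to `tiktoken.encoding_for_model`, so counts match the tokenizer the OpenAI API uses for that model. A short sketch, reusing the `llm` instance from the earlier example:

```python
text = "GraphRAG combines knowledge graphs with retrieval."
print(llm.token_len(text))  # exact count depends on the model's encoding

# A KeyError is raised if tiktoken has no encoding registered for the model name.
```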
HfCausalModel
Bases: BaseLanguageModel
A class for handling Hugging Face causal language models with various configurations.
This class provides functionality to load and use Hugging Face's causal language models with different precision types, quantization options, and attention implementations.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `model_name_or_path` | `str` | The name or path of the pre-trained model to load | *required* |
| `maximun_token` | `int` | Maximum number of tokens for the model input | `4096` |
| `max_new_tokens` | `int` | Maximum number of new tokens to generate | `1024` |
| `dtype` | `str` | Data type for model computation (`'fp32'`, `'fp16'`, or `'bf16'`) | `'bf16'` |
| `quant` | `str \| None` | Quantization option (`None`, `'4bit'`, or `'8bit'`) | `None` |
| `attn_implementation` | `str` | Attention implementation method (`'eager'`, `'sdpa'`, or `'flash_attention_2'`) | `'flash_attention_2'` |
Methods:

| Name | Description |
|---|---|
| `token_len(text: str) -> int` | Returns the number of tokens in the input text |
| `generate_sentence(llm_input: Union[str, list], system_input: str = "") -> Union[str, Exception]` | Generates text based on the input prompt or message list |
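A minimal instantiation sketch, assuming a CUDA-capable GPU and that `HfCausalModel` is importable from `gfmrag.llms`. The checkpoint name is only an example (gated models additionally require a Hugging Face token), and `attn_implementation="sdpa"` is used here because the default `flash_attention_2` needs the separate `flash-attn` package:

```python
from gfmrag.llms import HfCausalModel

llm = HfCausalModel(
    "meta-llama/Llama-3.1-8B-Instruct",  # example checkpoint
    max_new_tokens=256,
    dtype="bf16",                 # one of 'fp32', 'fp16', 'bf16'
    quant="4bit",                 # None, '4bit', or '8bit'
    attn_implementation="sdpa",   # 'eager', 'sdpa', or 'flash_attention_2'
)

output = llm.generate_sentence(
    "Summarize what a causal language model does.",
    system_input="Reply in two sentences.",
)
if isinstance(output, Exception):
    print(f"Generation failed: {output}")
else:
    print(output)
```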
Source code in gfmrag/llms/base_hf_causal_model.py
class HfCausalModel(BaseLanguageModel):
"""A class for handling Hugging Face causal language models with various configurations.
This class provides functionality to load and use Hugging Face's causal language models
with different precision types, quantization options, and attention implementations.
Args:
model_name_or_path : str
The name or path of the pre-trained model to load
maximun_token : int, optional
Maximum number of tokens for the model input, by default 4096
max_new_tokens : int, optional
Maximum number of new tokens to generate, by default 1024
dtype : str, optional
Data type for model computation ('fp32', 'fp16', or 'bf16'), by default 'bf16'
quant : str or None, optional
Quantization option (None, '4bit', or '8bit'), by default None
attn_implementation : str, optional
Attention implementation method ('eager', 'sdpa', or 'flash_attention_2'),
by default 'flash_attention_2'
Methods:
token_len(text: str) -> int
Returns the number of tokens in the input text
generate_sentence(llm_input: Union[str, list], system_input: str = "") -> Union[str, Exception]
Generates text based on the input prompt or message list
"""
DTYPE = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
QUANT = [None, "4bit", "8bit"]
ATTEN_IMPLEMENTATION = ["eager", "sdpa", "flash_attention_2"]
def __init__(
self,
model_name_or_path: str,
maximun_token: int = 4096,
max_new_tokens: int = 1024,
dtype: str = "bf16",
quant: None | str = None,
attn_implementation: str = "flash_attention_2",
):
assert quant in self.QUANT, f"quant should be one of {self.QUANT}"
assert (
attn_implementation in self.ATTEN_IMPLEMENTATION
), f"attn_implementation should be one of {self.ATTEN_IMPLEMENTATION}"
assert dtype in self.DTYPE, f"dtype should be one of {self.DTYPE}"
self.maximun_token = maximun_token
self.max_new_tokens = max_new_tokens
self.tokenizer = AutoTokenizer.from_pretrained(
model_name_or_path, token=HF_TOKEN, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
device_map="auto",
token=HF_TOKEN,
torch_dtype=self.DTYPE.get(dtype, None),
load_in_8bit=quant == "8bit",
load_in_4bit=quant == "4bit",
trust_remote_code=True,
attn_implementation=attn_implementation,
)
        # Note: this overrides the `maximun_token` argument with the tokenizer's model_max_length.
        self.maximun_token = self.tokenizer.model_max_length
self.generator = pipeline(
"text-generation", model=model, tokenizer=self.tokenizer
)
def token_len(self, text: str) -> int:
return len(self.tokenizer.tokenize(text))
@torch.inference_mode()
def generate_sentence(
self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
"""
        Generate a sentence using a language model.
This method processes input (either a string or a list of messages) and generates text using the configured language model.
If a system prompt is provided along with a string input, it will be included in the message structure.
Args:
llm_input (Union[str, list]): Input for the language model. Can be either a string containing the prompt,
or a list of message dictionaries with 'role' and 'content' keys.
system_input (str, optional): System prompt to be prepended to the input. Only used when llm_input is a string.
Defaults to empty string.
        Returns:
            Union[str, Exception]: Generated text output from the language model if successful,
                or the Exception object if generation fails.
Examples:
>>> # Using string input with system prompt
>>> model.generate_sentence("Tell me a joke", system_input="Be funny")
>>> # Using message list input
>>> messages = [
... {"role": "system", "content": "Be helpful"},
... {"role": "user", "content": "Tell me a joke"}
... ]
>>> model.generate_sentence(messages)
"""
# If the input is a list, it is assumed that the input is a list of messages
if isinstance(llm_input, list):
message = llm_input
else:
message = []
if system_input:
message.append({"role": "system", "content": system_input})
message.append({"role": "user", "content": llm_input})
try:
outputs = self.generator(
message,
return_full_text=False,
max_new_tokens=self.max_new_tokens,
handle_long_generation="hole",
)
return outputs[0]["generated_text"].strip() # type: ignore
except Exception as e:
return e
generate_sentence(llm_input, system_input='')
Generate a sentence using a language model.
This method processes input (either a string or a list of messages) and generates text using the configured language model. If a system prompt is provided along with a string input, it will be included in the message structure.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `llm_input` | `Union[str, list]` | Input for the language model. Can be either a string containing the prompt, or a list of message dictionaries with 'role' and 'content' keys. | *required* |
| `system_input` | `str` | System prompt to be prepended to the input. Only used when `llm_input` is a string. Defaults to `""`. | `''` |

Returns:

| Type | Description |
|---|---|
| `Union[str, Exception]` | Generated text output from the language model if successful, or the Exception object if generation fails. |
Examples:
>>> # Using string input with system prompt
>>> model.generate_sentence("Tell me a joke", system_input="Be funny")
>>> # Using message list input
>>> messages = [
... {"role": "system", "content": "Be helpful"},
... {"role": "user", "content": "Tell me a joke"}
... ]
>>> model.generate_sentence(messages)
Source code in gfmrag/llms/base_hf_causal_model.py
@torch.inference_mode()
def generate_sentence(
self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
"""
    Generate a sentence using a language model.
This method processes input (either a string or a list of messages) and generates text using the configured language model.
If a system prompt is provided along with a string input, it will be included in the message structure.
Args:
llm_input (Union[str, list]): Input for the language model. Can be either a string containing the prompt,
or a list of message dictionaries with 'role' and 'content' keys.
system_input (str, optional): System prompt to be prepended to the input. Only used when llm_input is a string.
Defaults to empty string.
    Returns:
        Union[str, Exception]: Generated text output from the language model if successful,
            or the Exception object if generation fails.
Examples:
>>> # Using string input with system prompt
>>> model.generate_sentence("Tell me a joke", system_input="Be funny")
>>> # Using message list input
>>> messages = [
... {"role": "system", "content": "Be helpful"},
... {"role": "user", "content": "Tell me a joke"}
... ]
>>> model.generate_sentence(messages)
"""
# If the input is a list, it is assumed that the input is a list of messages
if isinstance(llm_input, list):
message = llm_input
else:
message = []
if system_input:
message.append({"role": "system", "content": system_input})
message.append({"role": "user", "content": llm_input})
try:
outputs = self.generator(
message,
return_full_text=False,
max_new_tokens=self.max_new_tokens,
handle_long_generation="hole",
)
return outputs[0]["generated_text"].strip() # type: ignore
except Exception as e:
return e