
Large Language Models

gfmrag.llms

BaseLanguageModel

Bases: ABC

Base language model. Defines how to generate a sentence using an LM.

Source code in gfmrag/llms/base_language_model.py
Python
class BaseLanguageModel(ABC):
    """
    Base language model. Defines how to generate a sentence using an LM.
    """

    @abstractmethod
    def __init__(self, model_name_or_path: str):
        pass

    @abstractmethod
    def token_len(self, text: str) -> int:
        """
        Return tokenized length of text

        Args:
            text (str): input text
        """
        pass

    @abstractmethod
    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """
        Generate sentence by using a LM

        Args:
            lm_input (LMInput): input for LM
        """
        pass
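
A concrete implementation only needs to supply these two methods. Below is a minimal sketch; the EchoModel class and its toy behavior are illustrative, not part of the library:

Python
from gfmrag.llms.base_language_model import BaseLanguageModel


class EchoModel(BaseLanguageModel):
    """Toy subclass that satisfies the interface without calling a real LM."""

    def __init__(self, model_name_or_path: str):
        self.model_name = model_name_or_path

    def token_len(self, text: str) -> int:
        # Whitespace split as a crude stand-in for a real tokenizer
        return len(text.split())

    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        # Echo the last user message instead of calling a model
        if isinstance(llm_input, list):
            return llm_input[-1]["content"]
        return llm_input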

generate_sentence(llm_input, system_input='') abstractmethod

Generate a sentence using an LM.

Parameters:

Name Type Description Default
llm_input str | list

input for the LM

required
system_input str

system message prepended to the conversation

''
Source code in gfmrag/llms/base_language_model.py
Python
@abstractmethod
def generate_sentence(
    self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
    """
    Generate sentence by using a LM

    Args:
        lm_input (LMInput): input for LM
    """
    pass

token_len(text) abstractmethod

Return tokenized length of text

Parameters:

Name Type Description Default
text str

input text

required
Source code in gfmrag/llms/base_language_model.py
Python
@abstractmethod
def token_len(self, text: str) -> int:
    """
    Return tokenized length of text

    Args:
        text (str): input text
    """
    pass

ChatGPT

Bases: BaseLanguageModel

A class that interacts with OpenAI's ChatGPT models through their API.

This class provides functionality to generate text using ChatGPT models while handling token limits, retries, and various input formats.

Parameters:

Name Type Description Default
model_name_or_path str

The name or path of the ChatGPT model to use

required
retry int

Number of retries for failed API calls. Defaults to 5

5

Attributes:

Name Type Description
retry int

Maximum number of retry attempts for failed API calls

model_name str

Name of the ChatGPT model being used

maximun_token int

Maximum token limit for the specified model

client OpenAI

OpenAI client instance for API interactions

Methods:

Name Description
token_len

Calculate the number of tokens in a given text

generate_sentence

Generate response using the ChatGPT model

Raises:

Type Description
KeyError

If the specified model is not found when calculating tokens

Exception

If generation fails after maximum retries
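
A minimal usage sketch, assuming ChatGPT is exported from gfmrag.llms; the model name and prompt are illustrative:

Python
import os

from gfmrag.llms import ChatGPT

# The constructor reads OPENAI_API_KEY from the environment, so it must be set first
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

llm = ChatGPT("gpt-4o-mini", retry=3)  # assumed model name
result = llm.generate_sentence("Summarize retrieval-augmented generation in one sentence.")

# generate_sentence returns the last Exception instead of raising when all retries fail
if isinstance(result, Exception):
    print(f"Generation failed: {result}")
else:
    print(result)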

Source code in gfmrag/llms/chatgpt.py
Python
class ChatGPT(BaseLanguageModel):
    """A class that interacts with OpenAI's ChatGPT models through their API.

    This class provides functionality to generate text using ChatGPT models while handling
    token limits, retries, and various input formats.

    Args:
        model_name_or_path (str): The name or path of the ChatGPT model to use
        retry (int, optional): Number of retries for failed API calls. Defaults to 5

    Attributes:
        retry (int): Maximum number of retry attempts for failed API calls
        model_name (str): Name of the ChatGPT model being used
        maximun_token (int): Maximum token limit for the specified model
        client (OpenAI): OpenAI client instance for API interactions

    Methods:
        token_len(text): Calculate the number of tokens in a given text
        generate_sentence(llm_input, system_input): Generate response using the ChatGPT model

    Raises:
        KeyError: If the specified model is not found when calculating tokens
        Exception: If generation fails after maximum retries
    """

    def __init__(self, model_name_or_path: str, retry: int = 5):
        self.retry = retry
        self.model_name = model_name_or_path
        self.maximun_token = get_token_limit(self.model_name)

        client = OpenAI(
            api_key=os.environ[
                "OPENAI_API_KEY"
            ],  # this is also the default, it can be omitted
        )
        self.client = client

    def token_len(self, text: str) -> int:
        """Returns the number of tokens used by a list of messages."""
        try:
            encoding = tiktoken.encoding_for_model(self.model_name)
            num_tokens = len(encoding.encode(text))
        except KeyError as e:
            raise KeyError(f"Warning: model {self.model_name} not found.") from e
        return num_tokens

    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """Generate a response using the ChatGPT API.

        This method sends a request to the ChatGPT API and returns the generated response.
        It handles both single string inputs and message lists, with retry logic for failed attempts.

        Args:
            llm_input (Union[str, list]): Either a string containing the user's input or a list of message dictionaries
                in the format [{"role": "role_type", "content": "message_content"}, ...]
            system_input (str, optional): System message to be prepended to the conversation. Defaults to "".

        Returns:
            Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail.
                The response is stripped of leading/trailing whitespace.

        Raises:
            Exception: If all retry attempts fail, returns the last encountered exception.

        Notes:
            - Automatically truncates inputs that exceed the maximum token limit
            - Waits a fixed 30 seconds between retries
            - Sets temperature to 0.0 for deterministic outputs
            - Timeout is set to 60 seconds per API call
        """

        # If the input is a list, it is assumed that the input is a list of messages
        if isinstance(llm_input, list):
            message = llm_input
        else:
            message = []
            if system_input:
                message.append({"role": "system", "content": system_input})
            message.append({"role": "user", "content": llm_input})
        cur_retry = 0
        num_retry = self.retry
        # Check if the input is too long
        message_string = "\n".join([m["content"] for m in message])
        input_length = self.token_len(message_string)
        if input_length > self.maximun_token:
            print(
                f"Input length {input_length} is too long. The maximum token is {self.maximun_token}.\n"
                f"Right-truncating the input to {self.maximun_token} tokens."
            )
            # Truncate the last message by tokens so the request fits the model's limit
            encoding = tiktoken.encoding_for_model(self.model_name)
            tokens = encoding.encode(message[-1]["content"])
            message[-1]["content"] = encoding.decode(tokens[: self.maximun_token])
        error = Exception("Failed to generate sentence")
        while cur_retry <= num_retry:
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name, messages=message, timeout=60, temperature=0.0
                )
                result = response.choices[0].message.content.strip()  # type: ignore
                return result
            except Exception as e:
                logger.error("Message: ", llm_input)
                logger.error("Number of token: ", self.token_len(message_string))
                logger.error(e)
                time.sleep(30)
                cur_retry += 1
                error = e
                continue
        return error

generate_sentence(llm_input, system_input='')

Generate a response using the ChatGPT API.

This method sends a request to the ChatGPT API and returns the generated response. It handles both single string inputs and message lists, with retry logic for failed attempts.

Parameters:

Name Type Description Default
llm_input Union[str, list]

Either a string containing the user's input or a list of message dictionaries in the format [{"role": "role_type", "content": "message_content"}, ...]

required
system_input str

System message to be prepended to the conversation. Defaults to "".

''

Returns:

Type Description
str | Exception

Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail. The response is stripped of leading/trailing whitespace.

Raises:

Type Description
Exception

If all retry attempts fail, returns the last encountered exception.

Notes
  • Automatically truncates inputs that exceed the maximum token limit
  • Waits a fixed 30 seconds between retries
  • Sets temperature to 0.0 for deterministic outputs
  • Timeout is set to 60 seconds per API call
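
Because the method returns the last Exception rather than raising it, callers should check the result type. A short sketch reusing the llm instance from the earlier example; the messages are illustrative:

Python
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Name one use case for knowledge graphs."},
]

result = llm.generate_sentence(messages)
if isinstance(result, Exception):
    raise result  # surface the failure to the caller
print(result)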
Source code in gfmrag/llms/chatgpt.py
Python
def generate_sentence(
    self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
    """Generate a response using the ChatGPT API.

    This method sends a request to the ChatGPT API and returns the generated response.
    It handles both single string inputs and message lists, with retry logic for failed attempts.

    Args:
        llm_input (Union[str, list]): Either a string containing the user's input or a list of message dictionaries
            in the format [{"role": "role_type", "content": "message_content"}, ...]
        system_input (str, optional): System message to be prepended to the conversation. Defaults to "".

    Returns:
        Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail.
            The response is stripped of leading/trailing whitespace.

    Raises:
        Exception: If all retry attempts fail, returns the last encountered exception.

    Notes:
        - Automatically truncates inputs that exceed the maximum token limit
        - Waits a fixed 30 seconds between retries
        - Sets temperature to 0.0 for deterministic outputs
        - Timeout is set to 60 seconds per API call
    """

    # If the input is a list, it is assumed that the input is a list of messages
    if isinstance(llm_input, list):
        message = llm_input
    else:
        message = []
        if system_input:
            message.append({"role": "system", "content": system_input})
        message.append({"role": "user", "content": llm_input})
    cur_retry = 0
    num_retry = self.retry
    # Check if the input is too long
    message_string = "\n".join([m["content"] for m in message])
    input_length = self.token_len(message_string)
    if input_length > self.maximun_token:
        print(
            f"Input length {input_length} is too long. The maximum token is {self.maximun_token}.\n"
            f"Right-truncating the input to {self.maximun_token} tokens."
        )
        # Truncate the last message by tokens so the request fits the model's limit
        encoding = tiktoken.encoding_for_model(self.model_name)
        tokens = encoding.encode(message[-1]["content"])
        message[-1]["content"] = encoding.decode(tokens[: self.maximun_token])
    error = Exception("Failed to generate sentence")
    while cur_retry <= num_retry:
        try:
            response = self.client.chat.completions.create(
                model=self.model_name, messages=message, timeout=60, temperature=0.0
            )
            result = response.choices[0].message.content.strip()  # type: ignore
            return result
        except Exception as e:
            logger.error("Message: ", llm_input)
            logger.error("Number of token: ", self.token_len(message_string))
            logger.error(e)
            time.sleep(30)
            cur_retry += 1
            error = e
            continue
    return error

token_len(text)

Returns the number of tokens in the given text.

Source code in gfmrag/llms/chatgpt.py
Python
def token_len(self, text: str) -> int:
    """Returns the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(self.model_name)
        num_tokens = len(encoding.encode(text))
    except KeyError as e:
        raise KeyError(f"Warning: model {self.model_name} not found.") from e
    return num_tokens
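
Because generate_sentence right-truncates over-long inputs, a caller may want to check the token budget up front. A small sketch reusing the llm instance from above; the prompt is illustrative:

Python
prompt = "Describe the GFM-RAG pipeline. " * 2000  # deliberately long input
if llm.token_len(prompt) > llm.maximun_token:
    print("Prompt exceeds the model's context window and would be truncated.")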

HfCausalModel

Bases: BaseLanguageModel

A class for handling Hugging Face causal language models with various configurations.

This class provides functionality to load and use Hugging Face's causal language models with different precision types, quantization options, and attention implementations.

Parameters:

Name Type Description Default
model_name_or_path str

The name or path of the pre-trained model to load

required
maximun_token int

Maximum number of tokens for the model input, by default 4096

4096
max_new_tokens int

Maximum number of new tokens to generate, by default 1024

1024
dtype str

Data type for model computation ('fp32', 'fp16', or 'bf16'), by default 'bf16'

'bf16'
quant str or None

Quantization option (None, '4bit', or '8bit'), by default None

None
attn_implementation str

Attention implementation method ('eager', 'sdpa', or 'flash_attention_2'), by default 'flash_attention_2'

'flash_attention_2'

Methods:

Name Description
token_len

Returns the number of tokens in the input text

generate_sentence

Generates text based on the input prompt or message list
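
A minimal instantiation sketch, assuming HfCausalModel is exported from gfmrag.llms. The model name is illustrative, and the more portable sdpa backend is chosen because flash_attention_2 requires the flash-attn package; gated models also need a valid Hugging Face token:

Python
from gfmrag.llms import HfCausalModel

llm = HfCausalModel(
    "meta-llama/Llama-3.1-8B-Instruct",  # assumed model name
    max_new_tokens=256,
    dtype="bf16",
    quant="4bit",  # optional; requires the bitsandbytes package
    attn_implementation="sdpa",
)

result = llm.generate_sentence("Explain GraphRAG in two sentences.")
if isinstance(result, Exception):
    raise result
print(result)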

Source code in gfmrag/llms/base_hf_causal_model.py
Python
class HfCausalModel(BaseLanguageModel):
    """A class for handling Hugging Face causal language models with various configurations.

    This class provides functionality to load and use Hugging Face's causal language models
    with different precision types, quantization options, and attention implementations.

    Args:
        model_name_or_path (str): The name or path of the pre-trained model to load
        maximun_token (int, optional): Maximum number of tokens for the model input, by default 4096
        max_new_tokens (int, optional): Maximum number of new tokens to generate, by default 1024
        dtype (str, optional): Data type for model computation ('fp32', 'fp16', or 'bf16'), by default 'bf16'
        quant (str or None, optional): Quantization option (None, '4bit', or '8bit'), by default None
        attn_implementation (str, optional): Attention implementation method ('eager', 'sdpa', or
            'flash_attention_2'), by default 'flash_attention_2'

    Methods:
        token_len(text: str) -> int
            Returns the number of tokens in the input text
        generate_sentence(llm_input: Union[str, list], system_input: str = "") -> Union[str, Exception]
            Generates text based on the input prompt or message list
    """

    DTYPE = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
    QUANT = [None, "4bit", "8bit"]
    ATTEN_IMPLEMENTATION = ["eager", "sdpa", "flash_attention_2"]

    def __init__(
        self,
        model_name_or_path: str,
        maximun_token: int = 4096,
        max_new_tokens: int = 1024,
        dtype: str = "bf16",
        quant: None | str = None,
        attn_implementation: str = "flash_attention_2",
    ):
        assert quant in self.QUANT, f"quant should be one of {self.QUANT}"
        assert (
            attn_implementation in self.ATTEN_IMPLEMENTATION
        ), f"attn_implementation should be one of {self.ATTEN_IMPLEMENTATION}"
        assert dtype in self.DTYPE, f"dtype should be one of {self.DTYPE}"
        self.maximun_token = maximun_token
        self.max_new_tokens = max_new_tokens

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, token=HF_TOKEN, trust_remote_code=True
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            device_map="auto",
            token=HF_TOKEN,
            torch_dtype=self.DTYPE.get(dtype, None),
            load_in_8bit=quant == "8bit",
            load_in_4bit=quant == "4bit",
            trust_remote_code=True,
            attn_implementation=attn_implementation,
        )
        # Respect the smaller of the user-specified limit and the tokenizer's own limit
        self.maximun_token = min(maximun_token, self.tokenizer.model_max_length)
        self.generator = pipeline(
            "text-generation", model=model, tokenizer=self.tokenizer
        )

    def token_len(self, text: str) -> int:
        return len(self.tokenizer.tokenize(text))

    @torch.inference_mode()
    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """
        Generate a sentence using a language model.

        This method processes input (either a string or a list of messages) and generates text using the configured language model.
        If a system prompt is provided along with a string input, it will be included in the message structure.

        Args:
            llm_input (Union[str, list]): Input for the language model. Can be either a string containing the prompt,
                or a list of message dictionaries with 'role' and 'content' keys.
            system_input (str, optional): System prompt to be prepended to the input. Only used when llm_input is a string.
                Defaults to empty string.

        Returns:
            Union[str, Exception]: Generated text output from the language model if successful,
                or the Exception object if generation fails.

        Examples:
            >>> # Using string input with system prompt
            >>> model.generate_sentence("Tell me a joke", system_input="Be funny")

            >>> # Using message list input
            >>> messages = [
            ...     {"role": "system", "content": "Be helpful"},
            ...     {"role": "user", "content": "Tell me a joke"}
            ... ]
            >>> model.generate_sentence(messages)
        """
        # If the input is a list, it is assumed that the input is a list of messages
        if isinstance(llm_input, list):
            message = llm_input
        else:
            message = []
            if system_input:
                message.append({"role": "system", "content": system_input})
            message.append({"role": "user", "content": llm_input})
        try:
            outputs = self.generator(
                message,
                return_full_text=False,
                max_new_tokens=self.max_new_tokens,
                handle_long_generation="hole",
            )
            return outputs[0]["generated_text"].strip()  # type: ignore
        except Exception as e:
            return e

generate_sentence(llm_input, system_input='')

Generate a sentence using a language model.

This method processes input (either a string or a list of messages) and generates text using the configured language model. If a system prompt is provided along with a string input, it will be included in the message structure.

Parameters:

Name Type Description Default
llm_input Union[str, list]

Input for the language model. Can be either a string containing the prompt, or a list of message dictionaries with 'role' and 'content' keys.

required
system_input str

System prompt to be prepended to the input. Only used when llm_input is a string. Defaults to empty string.

''

Returns:

Type Description
str | Exception

Generated text output from the language model if successful, or the Exception object if generation fails.

Examples:

Python Console Session
>>> # Using string input with system prompt
>>> model.generate_sentence("Tell me a joke", system_input="Be funny")
Python Console Session
>>> # Using message list input
>>> messages = [
...     {"role": "system", "content": "Be helpful"},
...     {"role": "user", "content": "Tell me a joke"}
... ]
>>> model.generate_sentence(messages)
Source code in gfmrag/llms/base_hf_causal_model.py
Python
@torch.inference_mode()
def generate_sentence(
    self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
    """
    Generate a sentence using a language model.

    This method processes input (either a string or a list of messages) and generates text using the configured language model.
    If a system prompt is provided along with a string input, it will be included in the message structure.

    Args:
        llm_input (Union[str, list]): Input for the language model. Can be either a string containing the prompt,
            or a list of message dictionaries with 'role' and 'content' keys.
        system_input (str, optional): System prompt to be prepended to the input. Only used when llm_input is a string.
            Defaults to empty string.

    Returns:
        Union[str, Exception]: Generated text output from the language model if successful,
            or the Exception object if generation fails.

    Examples:
        >>> # Using string input with system prompt
        >>> model.generate_sentence("Tell me a joke", system_input="Be funny")

        >>> # Using message list input
        >>> messages = [
        ...     {"role": "system", "content": "Be helpful"},
        ...     {"role": "user", "content": "Tell me a joke"}
        ... ]
        >>> model.generate_sentence(messages)
    """
    # If the input is a list, it is assumed that the input is a list of messages
    if isinstance(llm_input, list):
        message = llm_input
    else:
        message = []
        if system_input:
            message.append({"role": "system", "content": system_input})
        message.append({"role": "user", "content": llm_input})
    try:
        outputs = self.generator(
            message,
            return_full_text=False,
            max_new_tokens=self.max_new_tokens,
            handle_long_generation="hole",
        )
        return outputs[0]["generated_text"].strip()  # type: ignore
    except Exception as e:
        return e