Skip to content

Large Language Models

gfmrag.llms

BaseLanguageModel

Bases: ABC

Abstract interface for language-model backends.

Source code in gfmrag/llms/base_language_model.py
Python
class BaseLanguageModel(ABC):
    """Abstract interface for language-model backends."""

    @abstractmethod
    def __init__(self, model_name_or_path: str):
        pass

    @abstractmethod
    def token_len(self, text: str) -> int:
        """Return the tokenized length of ``text``."""
        pass

    @abstractmethod
    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """Generate text from ``llm_input`` with an optional system prompt."""
        pass

generate_sentence(llm_input, system_input='') abstractmethod

Generate text from llm_input with an optional system prompt.

Source code in gfmrag/llms/base_language_model.py
Python
@abstractmethod
def generate_sentence(
    self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
    """Generate text from ``llm_input`` with an optional system prompt."""
    pass

token_len(text) abstractmethod

Return the tokenized length of text.

Source code in gfmrag/llms/base_language_model.py
Python
@abstractmethod
def token_len(self, text: str) -> int:
    """Return how many tokens ``text`` occupies once tokenized.

    Args:
        text: The string to measure.
    """

ChatGPT

Bases: BaseLanguageModel

A class that interacts with OpenAI's ChatGPT models through their API.

This class provides functionality to generate text using ChatGPT models while handling token limits, retries, and various input formats.

Parameters:

Name Type Description Default
model_name_or_path str

The name or path of the ChatGPT model to use

required
retry int

Number of retries for failed API calls. Defaults to 5

5

Attributes:

Name Type Description
retry int

Maximum number of retry attempts for failed API calls

model_name str

Name of the ChatGPT model being used

maximun_token int

Maximum token limit for the specified model

client OpenAI

OpenAI client instance for API interactions

Methods:

Name Description
token_len

Calculate the number of tokens in a given text

generate_sentence

Generate response using the ChatGPT model

Raises:

Type Description
KeyError

If the specified model is not found when calculating tokens

Exception

If generation fails after maximum retries

Source code in gfmrag/llms/chatgpt.py
Python
class ChatGPT(BaseLanguageModel):
    """A class that interacts with OpenAI's ChatGPT models through their API.

    This class provides functionality to generate text using ChatGPT models while handling
    token limits, retries, and various input formats.

    Args:
        model_name_or_path (str): The name or path of the ChatGPT model to use
        retry (int, optional): Number of retries for failed API calls. Defaults to 5

    Attributes:
        retry (int): Maximum number of retry attempts for failed API calls
        model_name (str): Name of the ChatGPT model being used
        maximun_token (int): Token limit for the model, as returned by ``get_token_limit``
        client (OpenAI): OpenAI client instance for API interactions

    Methods:
        token_len(text): Calculate the number of tokens in a given text
        generate_sentence(llm_input, system_input): Generate response using the ChatGPT model

    Raises:
        KeyError: If the specified model is not found when calculating tokens
    """

    def __init__(self, model_name_or_path: str, retry: int = 5):
        self.retry = retry
        self.model_name = model_name_or_path
        self.maximun_token = get_token_limit(self.model_name)
        self.client = OpenAI()

    def token_len(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to for this model.

        Args:
            text (str): The string to measure.

        Raises:
            KeyError: If ``tiktoken`` has no encoding for ``self.model_name``.
        """
        try:
            encoding = tiktoken.encoding_for_model(self.model_name)
            num_tokens = len(encoding.encode(text))
        except KeyError as e:
            raise KeyError(f"Warning: model {self.model_name} not found.") from e
        return num_tokens

    def _truncate_messages(self, messages: list, overflow: int) -> list:
        """Return a copy of ``messages`` with ``overflow`` tokens cut from the end.

        Tokens are removed from the tail of the last message first; if that
        message is exhausted, trimming continues with the one before it. The
        caller's list and dictionaries are left untouched.
        """
        encoding = tiktoken.encoding_for_model(self.model_name)
        trimmed = []
        for m in reversed(messages):
            if overflow > 0:
                tokens = encoding.encode(m["content"])
                keep = max(len(tokens) - overflow, 0)
                overflow -= len(tokens) - keep
                m = {**m, "content": encoding.decode(tokens[:keep])}
            trimmed.append(m)
        trimmed.reverse()
        return trimmed

    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """Generate a response using the ChatGPT API.

        This method sends a request to the ChatGPT API and returns the generated response.
        It handles both single string inputs and message lists, with retry logic for failed attempts.

        Args:
            llm_input (Union[str, list]): Either a string containing the user's input or a list of message dictionaries
                in the format [{"role": "role_type", "content": "message_content"}, ...]
            system_input (str, optional): System message prepended to the conversation when ``llm_input``
                is a string; ignored for list input. Defaults to "".

        Returns:
            Union[str, Exception]: The generated response text, stripped of leading/trailing whitespace,
                or the last encountered exception (returned, not raised) if every attempt fails.

        Raises:
            KeyError: If the model is unknown to ``tiktoken`` while measuring the input length.

        Notes:
            - Inputs whose estimated length exceeds ``maximun_token`` are right-truncated
              before being sent; the estimate is the token count of the joined message contents
            - Makes up to ``retry + 1`` attempts, waiting a fixed 30 seconds between attempts
            - Sets temperature to 0.0 for deterministic outputs
            - Timeout is set to 60 seconds per API call
        """

        # If the input is a list, it is assumed that the input is a list of messages
        if isinstance(llm_input, list):
            message = llm_input
        else:
            message = []
            if system_input:
                message.append({"role": "system", "content": system_input})
            message.append({"role": "user", "content": llm_input})

        # Check if the input is too long
        message_string = "\n".join([m["content"] for m in message])
        input_length = self.token_len(message_string)
        if input_length > self.maximun_token:
            print(
                f"Input lengt {input_length} is too long. The maximum token is {self.maximun_token}.\n Right tuncate the input to {self.maximun_token} tokens."
            )
            # Truncate the messages that are actually sent. The previous code
            # sliced ``llm_input`` only, which never changed the request.
            message = self._truncate_messages(
                message, input_length - self.maximun_token
            )
            input_length = self.token_len("\n".join([m["content"] for m in message]))

        error: Exception = Exception("Failed to generate sentence")
        for attempt in range(self.retry + 1):
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name, messages=message, timeout=60, temperature=0.0
                )
                return response.choices[0].message.content.strip()  # type: ignore
            except Exception as e:
                # Pre-format the text: extra positional arguments without a
                # placeholder are dropped or rejected by the logger.
                logger.error(f"Message: {llm_input}")
                logger.error(f"Number of token: {input_length}")
                logger.error(e)
                error = e
                # No point in waiting once the last attempt has failed.
                if attempt < self.retry:
                    time.sleep(30)
        return error

generate_sentence(llm_input, system_input='')

Generate a response using the ChatGPT API.

This method sends a request to the ChatGPT API and returns the generated response. It handles both single string inputs and message lists, with retry logic for failed attempts.

Parameters:

Name Type Description Default
llm_input Union[str, list]

Either a string containing the user's input or a list of message dictionaries in the format [{"role": "role_type", "content": "message_content"}, ...]

required
system_input str

System message to be prepended to the conversation. Defaults to "".

''

Returns:

Type Description
str | Exception

Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail. The response is stripped of leading/trailing whitespace.

Raises:

Type Description
Exception

Not raised to the caller: if all retry attempts fail, the last encountered exception is returned instead of a string.

Notes
  • Automatically truncates inputs that exceed the maximum token limit
  • Waits a fixed 30 seconds between retries (no exponential backoff)
  • Sets temperature to 0.0 for deterministic outputs
  • Timeout is set to 60 seconds per API call
Source code in gfmrag/llms/chatgpt.py
Python
def generate_sentence(
    self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
    """Generate a response using the ChatGPT API.

    This method sends a request to the ChatGPT API and returns the generated response.
    It handles both single string inputs and message lists, with retry logic for failed attempts.

    Args:
        llm_input (Union[str, list]): Either a string containing the user's input or a list of message dictionaries
            in the format [{"role": "role_type", "content": "message_content"}, ...]
        system_input (str, optional): System message to be prepended to the conversation. Defaults to "".

    Returns:
        Union[str, Exception]: The generated response text if successful, or the Exception if all retries fail.
            The response is stripped of leading/trailing whitespace.

    Raises:
        Exception: If all retry attempts fail, returns the last encountered exception.

    Notes:
        - Automatically truncates inputs that exceed the maximum token limit
        - Uses exponential backoff with 30 second delays between retries
        - Sets temperature to 0.0 for deterministic outputs
        - Timeout is set to 60 seconds per API call
    """

    # If the input is a list, it is assumed that the input is a list of messages
    if isinstance(llm_input, list):
        message = llm_input
    else:
        message = []
        if system_input:
            message.append({"role": "system", "content": system_input})
        message.append({"role": "user", "content": llm_input})
    cur_retry = 0
    num_retry = self.retry
    # Check if the input is too long
    message_string = "\n".join([m["content"] for m in message])
    input_length = self.token_len(message_string)
    if input_length > self.maximun_token:
        print(
            f"Input lengt {input_length} is too long. The maximum token is {self.maximun_token}.\n Right tuncate the input to {self.maximun_token} tokens."
        )
        llm_input = llm_input[: self.maximun_token]
    error = Exception("Failed to generate sentence")
    while cur_retry <= num_retry:
        try:
            response = self.client.chat.completions.create(
                model=self.model_name, messages=message, timeout=60, temperature=0.0
            )
            result = response.choices[0].message.content.strip()  # type: ignore
            return result
        except Exception as e:
            logger.error("Message: ", llm_input)
            logger.error("Number of token: ", self.token_len(message_string))
            logger.error(e)
            time.sleep(30)
            cur_retry += 1
            error = e
            continue
    return error

token_len(text)

Return the number of tokens that text encodes to for the configured model.

Source code in gfmrag/llms/chatgpt.py
Python
def token_len(self, text: str) -> int:
    """Return the number of tokens ``text`` encodes to for this model.

    Args:
        text: The string to measure.

    Raises:
        KeyError: If ``tiktoken`` has no encoding for ``self.model_name``.
    """
    try:
        return len(tiktoken.encoding_for_model(self.model_name).encode(text))
    except KeyError as missing:
        raise KeyError(f"Warning: model {self.model_name} not found.") from missing

HfCausalModel

Bases: BaseLanguageModel

Hugging Face causal language-model wrapper.

Parameters:

Name Type Description Default
model_name_or_path str

Pretrained model name or local path.

required
maximun_token int

Maximum number of input tokens.

4096
max_new_tokens int

Maximum number of generated tokens.

1024
dtype str

Runtime dtype name.

'bf16'
quant None | str

Optional quantization mode.

None
attn_implementation str

Attention backend name.

'flash_attention_2'
Source code in gfmrag/llms/base_hf_causal_model.py
Python
class HfCausalModel(BaseLanguageModel):
    """Wrapper that runs a Hugging Face causal LM through a text-generation pipeline.

    Args:
        model_name_or_path: Pretrained model name or local path.
        maximun_token: Maximum number of input tokens. NOTE(review): the stored
            value is replaced by ``tokenizer.model_max_length`` at the end of
            ``__init__``, so this argument currently has no lasting effect --
            confirm which value is intended.
        max_new_tokens: Maximum number of generated tokens.
        dtype: Runtime dtype name; one of the keys of ``DTYPE``.
        quant: Optional quantization mode; one of ``QUANT``.
        attn_implementation: Attention backend name; one of ``ATTEN_IMPLEMENTATION``.
    """

    # Accepted values for the ``dtype``, ``quant`` and ``attn_implementation`` arguments.
    DTYPE = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
    QUANT = [None, "4bit", "8bit"]
    ATTEN_IMPLEMENTATION = ["eager", "sdpa", "flash_attention_2"]

    def __init__(
        self,
        model_name_or_path: str,
        maximun_token: int = 4096,
        max_new_tokens: int = 1024,
        dtype: str = "bf16",
        quant: None | str = None,
        attn_implementation: str = "flash_attention_2",
    ):
        # Reject unsupported options before any weights are loaded.
        assert quant in self.QUANT, f"quant should be one of {self.QUANT}"
        assert attn_implementation in self.ATTEN_IMPLEMENTATION, (
            f"attn_implementation should be one of {self.ATTEN_IMPLEMENTATION}"
        )
        assert dtype in self.DTYPE, f"dtype should be one of {self.DTYPE}"
        self.maximun_token = maximun_token
        self.max_new_tokens = max_new_tokens

        # NOTE(review): trust_remote_code=True lets the checkpoint's own Python
        # code run; only point this at repositories you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, token=HF_TOKEN, trust_remote_code=True
        )
        load_options = {
            "device_map": "auto",
            "token": HF_TOKEN,
            "torch_dtype": self.DTYPE.get(dtype),
            "load_in_8bit": quant == "8bit",
            "load_in_4bit": quant == "4bit",
            "trust_remote_code": True,
            "attn_implementation": attn_implementation,
        }
        causal_lm = AutoModelForCausalLM.from_pretrained(
            model_name_or_path, **load_options
        )
        # Overrides the constructor argument stored above (see class docstring).
        self.maximun_token = self.tokenizer.model_max_length
        self.generator = pipeline(
            "text-generation", model=causal_lm, tokenizer=self.tokenizer
        )

    def token_len(self, text: str) -> int:
        """Return the number of tokens the model's tokenizer splits ``text`` into."""
        pieces = self.tokenizer.tokenize(text)
        return len(pieces)

    @torch.inference_mode()
    def generate_sentence(
        self, llm_input: str | list, system_input: str = ""
    ) -> str | Exception:
        """Run the pipeline on a prompt string or a ready-made chat message list.

        Args:
            llm_input: Prompt string, or a list that is passed to the pipeline
                unchanged as the chat messages.
            system_input: Optional system prompt; only used when ``llm_input``
                is a string.

        Returns:
            The generated text with surrounding whitespace stripped, or the
            exception instance (returned, not raised) if generation fails.
        """
        if isinstance(llm_input, list):
            # A list is taken to be the chat history already.
            chat = llm_input
        elif system_input:
            chat = [
                {"role": "system", "content": system_input},
                {"role": "user", "content": llm_input},
            ]
        else:
            chat = [{"role": "user", "content": llm_input}]

        try:
            generations = self.generator(
                chat,
                return_full_text=False,
                max_new_tokens=self.max_new_tokens,
                handle_long_generation="hole",
            )
            first = generations[0]
            return first["generated_text"].strip()  # type: ignore
        except Exception as failure:
            return failure

generate_sentence(llm_input, system_input='')

Generate text from a prompt string or a chat message list.

Parameters:

Name Type Description Default
llm_input str | list

Prompt string or chat message list.

required
system_input str

Optional system prompt used with string input.

''

Returns:

Type Description
str | Exception

Generated text, or the raised exception instance when generation fails.

Source code in gfmrag/llms/base_hf_causal_model.py
Python
@torch.inference_mode()
def generate_sentence(
    self, llm_input: str | list, system_input: str = ""
) -> str | Exception:
    """Generate text from a prompt string or a chat message list.

    Args:
        llm_input: Prompt string or chat message list.
        system_input: Optional system prompt used with string input.

    Returns:
        Generated text, or the raised exception instance when generation fails.
    """
    # If the input is a list, it is assumed that the input is a list of messages
    if isinstance(llm_input, list):
        message = llm_input
    else:
        message = []
        if system_input:
            message.append({"role": "system", "content": system_input})
        message.append({"role": "user", "content": llm_input})
    try:
        outputs = self.generator(
            message,
            return_full_text=False,
            max_new_tokens=self.max_new_tokens,
            handle_long_generation="hole",
        )
        return outputs[0]["generated_text"].strip()  # type: ignore
    except Exception as e:
        return e