Skip to content

OpenIE Model

gfmrag.kg_construction.openie_model

BaseOPENIEModel

Bases: ABC

Source code in gfmrag/kg_construction/openie_model/base_model.py
Python
class BaseOPENIEModel(ABC):
    """Abstract interface for Open Information Extraction (OpenIE) models.

    Concrete implementations must provide a constructor and a callable
    entry point that maps a passage of text to its extracted entities
    and relation triples.
    """

    @abstractmethod
    def __init__(self, **kwargs: Any) -> None:
        """Initialize the model with implementation-specific options."""
        pass

    @abstractmethod
    def __call__(self, text: str) -> dict:
        """Run the full OpenIE pipeline on a single passage.

        Args:
            text (str): input text

        Returns:
            dict: dict of passage, extracted_entities, extracted_triples

                - passage (str): input text
                - extracted_entities (list): list of extracted entities
                - extracted_triples (list): list of extracted triples

        Examples:
            >>> openie_model = OPENIEModel()
            >>> result = openie_model("Emmanuel Macron is the president of France")
            >>> print(result)
            {'passage': 'Emmanuel Macron is the president of France', 'extracted_entities': ['Emmanuel Macron', 'France'], 'extracted_triples': [['Emmanuel Macron', 'president of', 'France']]}
        """
        pass

__call__(text) abstractmethod

Perform OpenIE on the given text.

Parameters:

Name Type Description Default
text str

input text

required

Returns:

Name Type Description
dict dict

dict of passage, extracted_entities, extracted_triples

  • passage (str): input text
  • extracted_entities (list): list of extracted entities
  • extracted_triples (list): list of extracted triples

Examples:

Python Console Session
>>> openie_model = OPENIEModel()
>>> result = openie_model("Emmanuel Macron is the president of France")
>>> print(result)
{'passage': 'Emmanuel Macron is the president of France', 'extracted_entities': ['Emmanuel Macron', 'France'], 'extracted_triples': [['Emmanuel Macron', 'president of', 'France']]}
Source code in gfmrag/kg_construction/openie_model/base_model.py
Python
@abstractmethod
def __call__(self, text: str) -> dict:
    """Run the full OpenIE pipeline on a single passage.

    Args:
        text (str): the passage to process.

    Returns:
        dict: a mapping with three keys:

            - passage (str): input text
            - extracted_entities (list): list of extracted entities
            - extracted_triples (list): list of extracted triples

    Examples:
        >>> openie_model = OPENIEModel()
        >>> result = openie_model("Emmanuel Macron is the president of France")
        >>> print(result)
        {'passage': 'Emmanuel Macron is the president of France', 'extracted_entities': ['Emmanuel Macron', 'France'], 'extracted_triples': [['Emmanuel Macron', 'president of', 'France']]}
    """
    pass

LLMOPENIEModel

Bases: BaseOPENIEModel

A class for performing Open Information Extraction (OpenIE) using Large Language Models.

This class implements OpenIE functionality by performing Named Entity Recognition (NER) and relation extraction using various LLM backends like OpenAI, Together, Ollama, or llama.cpp.

Parameters:

Name Type Description Default
llm_api Literal['openai', 'together', 'ollama', 'llama.cpp']

The LLM backend to use. Defaults to "openai".

'openai'
model_name str

Name of the specific model to use. Defaults to "gpt-4o-mini".

'gpt-4o-mini'
max_ner_tokens int

Maximum number of tokens for NER output. Defaults to 1024.

1024
max_triples_tokens int

Maximum number of tokens for relation triples output. Defaults to 4096.

4096

Attributes:

Name Type Description
llm_api

The LLM backend being used

model_name

Name of the model being used

max_ner_tokens

Token limit for NER

max_triples_tokens

Token limit for relation triples

client

Initialized language model client

Methods:

Name Description
ner

Performs Named Entity Recognition on input text

openie_post_ner_extract

Extracts relation triples after NER

__call__

Main method to perform complete OpenIE pipeline

Examples:

Python Console Session
>>> openie_model = LLMOPENIEModel()
>>> result = openie_model("Emmanuel Macron is the president of France")
>>> print(result)
{'passage': 'Emmanuel Macron is the president of France', 'extracted_entities': ['Emmanuel Macron', 'France'], 'extracted_triples': [['Emmanuel Macron', 'president of', 'France']]}
Source code in gfmrag/kg_construction/openie_model/llm_openie_model.py
Python
class LLMOPENIEModel(BaseOPENIEModel):
    """
    A class for performing Open Information Extraction (OpenIE) using Large Language Models.

    This class implements OpenIE functionality by performing Named Entity Recognition (NER)
    and relation extraction using various LLM backends like OpenAI, Together, Ollama, or llama.cpp.

    Args:
        llm_api (Literal["openai", "together", "ollama", "llama.cpp"]): The LLM backend to use.
            Defaults to "openai".
        model_name (str): Name of the specific model to use. Defaults to "gpt-4o-mini".
        max_ner_tokens (int): Maximum number of tokens for NER output. Defaults to 1024.
        max_triples_tokens (int): Maximum number of tokens for relation triples output.
            Defaults to 4096.

    Attributes:
        llm_api: The LLM backend being used
        model_name: Name of the model being used
        max_ner_tokens: Token limit for NER
        max_triples_tokens: Token limit for relation triples
        client: Initialized language model client

    Methods:
        ner: Performs Named Entity Recognition on input text
        openie_post_ner_extract: Extracts relation triples after NER
        __call__: Main method to perform complete OpenIE pipeline

    Examples:
        >>> openie_model = LLMOPENIEModel()
        >>> result = openie_model("Emmanuel Macron is the president of France")
        >>> print(result)
        {'passage': 'Emmanuel Macron is the president of France', 'extracted_entities': ['Emmanuel Macron', 'France'], 'extracted_triples': [['Emmanuel Macron', 'president of', 'France']]}
    """

    def __init__(
        self,
        llm_api: Literal["openai", "together", "ollama", "llama.cpp"] = "openai",
        model_name: str = "gpt-4o-mini",
        max_ner_tokens: int = 1024,
        max_triples_tokens: int = 4096,
    ):
        """Initialize LLM-based OpenIE model.

        Args:
            llm_api (Literal["openai", "together", "ollama", "llama.cpp"]): The LLM API provider to use.
                Defaults to "openai".
            model_name (str): Name of the language model to use. Defaults to "gpt-4o-mini".
            max_ner_tokens (int): Maximum number of tokens for NER processing. Defaults to 1024.
            max_triples_tokens (int): Maximum number of tokens for triple extraction. Defaults to 4096.

        Attributes:
            llm_api: The selected LLM API provider
            model_name: Name of the language model
            max_ner_tokens: Token limit for NER
            max_triples_tokens: Token limit for triples
            client: Initialized language model client
        """
        self.llm_api = llm_api
        self.model_name = model_name
        self.max_ner_tokens = max_ner_tokens
        self.max_triples_tokens = max_triples_tokens

        self.client = init_langchain_model(llm_api, model_name)

    def ner(self, text: str) -> list:
        """
        Performs Named Entity Recognition (NER) on the input text using different LLM clients.

        Args:
            text (str): Input text to extract named entities from.

        Returns:
            list: A list of named entities extracted from the text. Returns empty list if extraction fails.

        Note:
            - For OpenAI client, uses JSON mode with specific parameters
            - For Ollama and LlamaCpp clients, extracts JSON from regular response
            - For other clients, extracts JSON from regular response without JSON mode
            - Handles exceptions by returning empty list and logging error
        """
        ner_messages = ner_prompts.format_prompt(user_input=text)

        try:
            if isinstance(self.client, ChatOpenAI):  # JSON mode
                chat_completion = self.client.invoke(
                    ner_messages.to_messages(),
                    temperature=0,
                    max_tokens=self.max_ner_tokens,
                    stop=["\n\n"],
                    response_format={"type": "json_object"},
                )
                response_content = chat_completion.content
                # JSON mode guarantees a JSON object; parse it with json.loads
                # instead of eval(), which would execute model-controlled text.
                response_content = json.loads(response_content)

            elif isinstance(self.client, (ChatOllama, ChatLlamaCpp)):
                response_content = self.client.invoke(ner_messages.to_messages())
                response_content = extract_json_dict(response_content)

            else:  # no JSON mode
                chat_completion = self.client.invoke(
                    ner_messages.to_messages(), temperature=0
                )
                response_content = chat_completion.content
                response_content = extract_json_dict(response_content)

            if "named_entities" not in response_content:
                response_content = []
            else:
                response_content = response_content["named_entities"]

        except Exception as e:
            logger.error(f"Error in extracting named entities: {e}")
            response_content = []

        return response_content

    def openie_post_ner_extract(self, text: str, entities: list) -> str:
        """
        Extracts open information (triples) from text using LLM, considering pre-identified named entities.

        Args:
            text (str): The input text to extract information from.
            entities (list): List of pre-identified named entities in the text.

        Returns:
            str: JSON string containing the extracted triples. Returns empty JSON object "{}" if extraction fails.

        Raises:
            Exception: Logs any errors that occur during the extraction process.

        Notes:
            - For ChatOpenAI client, uses JSON mode for structured output
            - For ChatOllama and ChatLlamaCpp clients, extracts JSON from unstructured response
            - For other clients, extracts JSON from response content
            - Uses temperature=0 and configured max_tokens for consistent outputs
        """
        named_entity_json = {"named_entities": entities}
        openie_messages = openie_post_ner_prompts.format_prompt(
            passage=text, named_entity_json=json.dumps(named_entity_json)
        )
        try:
            if isinstance(self.client, ChatOpenAI):  # JSON mode
                chat_completion = self.client.invoke(
                    openie_messages.to_messages(),
                    temperature=0,
                    max_tokens=self.max_triples_tokens,
                    response_format={"type": "json_object"},
                )
                response_content = chat_completion.content

            elif isinstance(self.client, (ChatOllama, ChatLlamaCpp)):
                response_content = self.client.invoke(openie_messages.to_messages())
                response_content = extract_json_dict(response_content)
                # Downstream parsing expects a string (Python repr of the dict).
                response_content = str(response_content)
            else:  # no JSON mode
                chat_completion = self.client.invoke(
                    openie_messages.to_messages(),
                    temperature=0,
                    max_tokens=self.max_triples_tokens,
                )
                response_content = chat_completion.content
                response_content = extract_json_dict(response_content)
                response_content = str(response_content)

        except Exception as e:
            logger.error(f"Error in OpenIE: {e}")
            response_content = "{}"

        return response_content

    def __call__(self, text: str) -> dict:
        """
        Perform OpenIE on the given text.

        Args:
            text (str): input text

        Returns:
            dict: dict of passage, extracted entities, extracted_triples

                - passage (str): input text
                - extracted_entities (list): list of extracted entities
                - extracted_triples (list): list of extracted triples
        """
        res = {"passage": text, "extracted_entities": [], "extracted_triples": []}

        doc_entities = self.ner(text)
        try:
            doc_entities = list(np.unique(doc_entities))
        except Exception as e:
            logger.error(f"Results has nested lists: {e}")
            doc_entities = list(np.unique(list(chain.from_iterable(doc_entities))))

        triples = self.openie_post_ner_extract(text, doc_entities)
        res["extracted_entities"] = doc_entities
        try:
            import ast  # local import: only needed for the repr-string fallback

            try:
                # JSON-mode backends return a JSON string.
                parsed = json.loads(triples)
            except (json.JSONDecodeError, TypeError):
                # Non-JSON-mode backends return str(dict) (a Python literal);
                # parse it safely instead of eval()-ing model-controlled text.
                parsed = ast.literal_eval(triples)
            res["extracted_triples"] = parsed["triples"]
        except Exception:
            logger.error(f"Error in parsing triples: {triples}")

        return res

__call__(text)

Perform OpenIE on the given text.

Parameters:

Name Type Description Default
text str

input text

required

Returns:

Name Type Description
dict dict

dict of passage, extracted_entities, extracted_triples

  • passage (str): input text
  • extracted_entities (list): list of extracted entities
  • extracted_triples (list): list of extracted triples
Source code in gfmrag/kg_construction/openie_model/llm_openie_model.py
Python
def __call__(self, text: str) -> dict:
    """
    Perform OpenIE on the given text.

    Args:
        text (str): input text

    Returns:
        dict: dict of passage, extracted entities, extracted_triples

            - passage (str): input text
            - extracted_entities (list): list of extracted entities
            - extracted_triples (list): list of extracted triples
    """
    res = {"passage": text, "extracted_entities": [], "extracted_triples": []}

    doc_entities = self.ner(text)
    try:
        doc_entities = list(np.unique(doc_entities))
    except Exception as e:
        logger.error(f"Results has nested lists: {e}")
        doc_entities = list(np.unique(list(chain.from_iterable(doc_entities))))

    triples = self.openie_post_ner_extract(text, doc_entities)
    res["extracted_entities"] = doc_entities
    try:
        import ast  # local import: only needed for the repr-string fallback

        try:
            # JSON-mode backends return a JSON string.
            parsed = json.loads(triples)
        except (json.JSONDecodeError, TypeError):
            # Non-JSON-mode backends return str(dict) (a Python literal);
            # parse it safely instead of eval()-ing model-controlled text.
            parsed = ast.literal_eval(triples)
        res["extracted_triples"] = parsed["triples"]
    except Exception:
        logger.error(f"Error in parsing triples: {triples}")

    return res

__init__(llm_api='openai', model_name='gpt-4o-mini', max_ner_tokens=1024, max_triples_tokens=4096)

Initialize LLM-based OpenIE model.

Parameters:

Name Type Description Default
llm_api Literal['openai', 'together', 'ollama', 'llama.cpp']

The LLM API provider to use. Defaults to "openai".

'openai'
model_name str

Name of the language model to use. Defaults to "gpt-4o-mini".

'gpt-4o-mini'
max_ner_tokens int

Maximum number of tokens for NER processing. Defaults to 1024.

1024
max_triples_tokens int

Maximum number of tokens for triple extraction. Defaults to 4096.

4096

Attributes:

Name Type Description
llm_api

The selected LLM API provider

model_name

Name of the language model

max_ner_tokens

Token limit for NER

max_triples_tokens

Token limit for triples

client

Initialized language model client

Source code in gfmrag/kg_construction/openie_model/llm_openie_model.py
Python
def __init__(
    self,
    llm_api: Literal["openai", "together", "ollama", "llama.cpp"] = "openai",
    model_name: str = "gpt-4o-mini",
    max_ner_tokens: int = 1024,
    max_triples_tokens: int = 4096,
):
    """Set up the LLM-backed OpenIE model.

    Args:
        llm_api (Literal["openai", "together", "ollama", "llama.cpp"]): Which
            LLM backend to talk to. Defaults to "openai".
        model_name (str): Model identifier for the chosen backend. Defaults
            to "gpt-4o-mini".
        max_ner_tokens (int): Output-token budget for the NER step.
            Defaults to 1024.
        max_triples_tokens (int): Output-token budget for triple extraction.
            Defaults to 4096.

    Attributes:
        llm_api: The selected LLM API provider
        model_name: Name of the language model
        max_ner_tokens: Token limit for NER
        max_triples_tokens: Token limit for triples
        client: Initialized language model client
    """
    # Remember the configuration so the NER / triple-extraction steps can
    # reuse it, then build the langchain client once up front.
    self.llm_api = llm_api
    self.model_name = model_name
    self.max_ner_tokens = max_ner_tokens
    self.max_triples_tokens = max_triples_tokens
    self.client = init_langchain_model(llm_api, model_name)

ner(text)

Performs Named Entity Recognition (NER) on the input text using different LLM clients.

Parameters:

Name Type Description Default
text str

Input text to extract named entities from.

required

Returns:

Name Type Description
list list

A list of named entities extracted from the text. Returns empty list if extraction fails.

Note
  • For OpenAI client, uses JSON mode with specific parameters
  • For Ollama and LlamaCpp clients, extracts JSON from regular response
  • For other clients, extracts JSON from regular response without JSON mode
  • Handles exceptions by returning empty list and logging error
Source code in gfmrag/kg_construction/openie_model/llm_openie_model.py
Python
def ner(self, text: str) -> list:
    """
    Performs Named Entity Recognition (NER) on the input text using different LLM clients.

    Args:
        text (str): Input text to extract named entities from.

    Returns:
        list: A list of named entities extracted from the text. Returns empty list if extraction fails.

    Note:
        - For OpenAI client, uses JSON mode with specific parameters
        - For Ollama and LlamaCpp clients, extracts JSON from regular response
        - For other clients, extracts JSON from regular response without JSON mode
        - Handles exceptions by returning empty list and logging error
    """
    ner_messages = ner_prompts.format_prompt(user_input=text)

    try:
        if isinstance(self.client, ChatOpenAI):  # JSON mode
            chat_completion = self.client.invoke(
                ner_messages.to_messages(),
                temperature=0,
                max_tokens=self.max_ner_tokens,
                stop=["\n\n"],
                response_format={"type": "json_object"},
            )
            response_content = chat_completion.content
            # JSON mode guarantees a JSON object; parse it with json.loads
            # instead of eval(), which would execute model-controlled text.
            response_content = json.loads(response_content)

        elif isinstance(self.client, (ChatOllama, ChatLlamaCpp)):
            response_content = self.client.invoke(ner_messages.to_messages())
            response_content = extract_json_dict(response_content)

        else:  # no JSON mode
            chat_completion = self.client.invoke(
                ner_messages.to_messages(), temperature=0
            )
            response_content = chat_completion.content
            response_content = extract_json_dict(response_content)

        if "named_entities" not in response_content:
            response_content = []
        else:
            response_content = response_content["named_entities"]

    except Exception as e:
        logger.error(f"Error in extracting named entities: {e}")
        response_content = []

    return response_content

openie_post_ner_extract(text, entities)

Extracts open information (triples) from text using LLM, considering pre-identified named entities.

Parameters:

Name Type Description Default
text str

The input text to extract information from.

required
entities list

List of pre-identified named entities in the text.

required

Returns:

Name Type Description
str str

JSON string containing the extracted triples. Returns empty JSON object "{}" if extraction fails.

Raises:

Type Description
Exception

Logs any errors that occur during the extraction process.

Notes
  • For ChatOpenAI client, uses JSON mode for structured output
  • For ChatOllama and ChatLlamaCpp clients, extracts JSON from unstructured response
  • For other clients, extracts JSON from response content
  • Uses temperature=0 and configured max_tokens for consistent outputs
Source code in gfmrag/kg_construction/openie_model/llm_openie_model.py
Python
def openie_post_ner_extract(self, text: str, entities: list) -> str:
    """Extract relation triples from ``text`` given already-recognized entities.

    Args:
        text (str): passage to extract relations from.
        entities (list): named entities previously found in the passage.

    Returns:
        str: JSON string containing the extracted triples, or the empty
            JSON object ``"{}"`` when extraction fails.

    Notes:
        - ChatOpenAI uses JSON mode for structured output.
        - ChatOllama / ChatLlamaCpp responses have their JSON extracted from
          free-form text.
        - All other clients have the JSON extracted from the message content.
        - temperature=0 and the configured max_tokens keep outputs consistent.
    """
    prompt = openie_post_ner_prompts.format_prompt(
        passage=text,
        named_entity_json=json.dumps({"named_entities": entities}),
    )
    try:
        if isinstance(self.client, ChatOpenAI):
            # JSON mode: the model is constrained to emit a JSON object.
            completion = self.client.invoke(
                prompt.to_messages(),
                temperature=0,
                max_tokens=self.max_triples_tokens,
                response_format={"type": "json_object"},
            )
            return completion.content
        if isinstance(self.client, (ChatOllama, ChatLlamaCpp)):
            raw = self.client.invoke(prompt.to_messages())
            # Callers expect a string, so stringify the extracted dict.
            return str(extract_json_dict(raw))
        # No JSON mode: pull the JSON object out of the free-form reply.
        completion = self.client.invoke(
            prompt.to_messages(),
            temperature=0,
            max_tokens=self.max_triples_tokens,
        )
        return str(extract_json_dict(completion.content))
    except Exception as e:
        logger.error(f"Error in OpenIE: {e}")
        return "{}"