ShawnRu committed on
Commit 009d93e · 1 Parent(s): 36208ce
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. src/__pycache__/pipeline.cpython-311.pyc +0 -0
  2. src/__pycache__/pipeline.cpython-39.pyc +0 -0
  3. src/config.yaml +19 -0
  4. src/generate_memory.py +181 -0
  5. src/main.py +233 -0
  6. src/models/__init__.py +3 -0
  7. src/models/__pycache__/__init__.cpython-311.pyc +0 -0
  8. src/models/__pycache__/__init__.cpython-37.pyc +0 -0
  9. src/models/__pycache__/__init__.cpython-39.pyc +0 -0
  10. src/models/__pycache__/llm_def.cpython-311.pyc +0 -0
  11. src/models/__pycache__/llm_def.cpython-37.pyc +0 -0
  12. src/models/__pycache__/llm_def.cpython-39.pyc +0 -0
  13. src/models/__pycache__/prompt_example.cpython-311.pyc +0 -0
  14. src/models/__pycache__/prompt_example.cpython-39.pyc +0 -0
  15. src/models/__pycache__/prompt_template.cpython-311.pyc +0 -0
  16. src/models/__pycache__/prompt_template.cpython-39.pyc +0 -0
  17. src/models/llm_def.py +212 -0
  18. src/models/prompt_example.py +137 -0
  19. src/models/prompt_template.py +174 -0
  20. src/modules/__init__.py +4 -0
  21. src/modules/__pycache__/__init__.cpython-311.pyc +0 -0
  22. src/modules/__pycache__/__init__.cpython-39.pyc +0 -0
  23. src/modules/__pycache__/extraction_agent.cpython-311.pyc +0 -0
  24. src/modules/__pycache__/extraction_agent.cpython-39.pyc +0 -0
  25. src/modules/__pycache__/reflection_agent.cpython-311.pyc +0 -0
  26. src/modules/__pycache__/reflection_agent.cpython-39.pyc +0 -0
  27. src/modules/__pycache__/schema_agent.cpython-311.pyc +0 -0
  28. src/modules/__pycache__/schema_agent.cpython-39.pyc +0 -0
  29. src/modules/extraction_agent.py +85 -0
  30. src/modules/knowledge_base/__pycache__/case_repository.cpython-311.pyc +0 -0
  31. src/modules/knowledge_base/__pycache__/case_repository.cpython-39.pyc +0 -0
  32. src/modules/knowledge_base/__pycache__/schema_repository.cpython-311.pyc +0 -0
  33. src/modules/knowledge_base/__pycache__/schema_repository.cpython-39.pyc +0 -0
  34. src/modules/knowledge_base/case_repository.json +0 -0
  35. src/modules/knowledge_base/case_repository.py +391 -0
  36. src/modules/knowledge_base/schema_repository.py +91 -0
  37. src/modules/reflection_agent.py +74 -0
  38. src/modules/schema_agent.py +151 -0
  39. src/pipeline.py +98 -0
  40. src/run.py +88 -0
  41. src/utils/__init__.py +3 -0
  42. src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  43. src/utils/__pycache__/__init__.cpython-39.pyc +0 -0
  44. src/utils/__pycache__/data_def.cpython-311.pyc +0 -0
  45. src/utils/__pycache__/data_def.cpython-39.pyc +0 -0
  46. src/utils/__pycache__/process.cpython-311.pyc +0 -0
  47. src/utils/__pycache__/process.cpython-39.pyc +0 -0
  48. src/utils/data_def.py +59 -0
  49. src/utils/process.py +183 -0
  50. src/webui/__init__.py +1 -0
src/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (5.34 kB)
src/__pycache__/pipeline.cpython-39.pyc ADDED
Binary file (3.56 kB)
src/config.yaml ADDED
@@ -0,0 +1,19 @@
+agent:
+  default_schema: The final extraction result should be formatted as a JSON object.
+  default_ner: Extract the Named Entities in the given text.
+  default_re: Extract Relationships between Named Entities in the given text.
+  default_ee: Extract the Events in the given text.
+  chunk_token_limit: 1024
+  mode:
+    quick:
+      schema_agent: get_deduced_schema
+      extraction_agent: extract_information_direct
+    standard:
+      schema_agent: get_deduced_schema
+      extraction_agent: extract_information_with_case
+      reflection_agent: reflect_with_case
+    customized:
+      schema_agent: get_retrieved_schema
+      extraction_agent: extract_information_direct
+
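For reference, a minimal sketch of how this configuration might be consumed (assuming PyYAML, and that `mode` nests under `agent`, which matches the `config['agent'][...]` lookups elsewhere in this commit; the snippet is illustrative, not part of the commit):

```python
import yaml  # PyYAML is an assumed dependency here

with open("src/config.yaml") as f:
    config = yaml.safe_load(f)

print(config["agent"]["default_ner"])           # default NER instruction
standard = config["agent"]["mode"]["standard"]  # agent methods used in "standard" mode
print(standard["extraction_agent"])             # -> "extract_information_with_case"
```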
src/generate_memory.py ADDED
@@ -0,0 +1,181 @@
+from typing import Literal
+from models import *
+from utils import *
+from modules import *
+
+
+class Pipeline:
+    def __init__(self, llm: BaseEngine):
+        self.llm = llm
+        self.case_repo = CaseRepositoryHandler(llm=llm)
+        self.schema_agent = SchemaAgent(llm=llm)
+        self.extraction_agent = ExtractionAgent(llm=llm, case_repo=self.case_repo)
+        self.reflection_agent = ReflectionAgent(llm=llm, case_repo=self.case_repo)
+
+    def __init_method(self, data: DataPoint, process_method):
+        default_order = ["schema_agent", "extraction_agent", "reflection_agent"]
+        if "schema_agent" not in process_method:
+            process_method["schema_agent"] = "get_default_schema"
+        if data.task != "Base":
+            process_method["schema_agent"] = "get_retrieved_schema"
+        if "extraction_agent" not in process_method:
+            process_method["extraction_agent"] = "extract_information_direct"
+        sorted_process_method = {key: process_method[key] for key in default_order if key in process_method}
+        return sorted_process_method
+
+    def __init_data(self, data: DataPoint):
+        if data.task == "NER":
+            data.instruction = config['agent']['default_ner']
+            data.output_schema = "EntityList"
+        elif data.task == "RE":
+            data.instruction = config['agent']['default_re']
+            data.output_schema = "RelationList"
+        elif data.task == "EE":
+            data.instruction = config['agent']['default_ee']
+            data.output_schema = "EventList"
+        return data
+
+    # main entry
+    def get_extract_result(self,
+                           task: TaskType,
+                           instruction: str = "",
+                           text: str = "",
+                           output_schema: str = "",
+                           constraint: str = "",
+                           use_file: bool = False,
+                           truth: str = "",
+                           mode: str = "quick",
+                           update_case: bool = False
+                           ):
+
+        data = DataPoint(task=task, instruction=instruction, text=text, output_schema=output_schema, constraint=constraint, use_file=use_file, truth=truth)
+        data = self.__init_data(data)
+        data.instruction = "In the tranquil seaside town, the summer evening cast a golden glow over everything. The townsfolk gathered at the café by the pier, enjoying the sea breeze while eagerly anticipating the annual Ocean Festival's opening ceremony. \nFirst to arrive was Mayor William, dressed in a deep blue suit, holding a roll of his speech. He smiled and greeted the residents, who held deep respect for their community-minded mayor. Beside him trotted Max, his loyal golden retriever, wagging his tail excitedly at every familiar face he saw. \nFollowing closely was Emily, the town’s high school teacher, accompanied by a group of students ready to perform a musical piece they'd rehearsed. One of the girls carried Polly, a vibrant green parrot, on her shoulder. Polly occasionally chimed in with cheerful squawks, adding to the lively atmosphere. \nNot far away, Captain Jack, with his trusty pipe in hand, chatted with old friends about this year's catch. His fleet was the town’s economic backbone, and his seasoned face and towering presence were complemented by the presence of Whiskers, his orange tabby cat, who loved lounging on the dock, attentively watching the gentle waves. \nInside the café, Kate was bustling about, serving guests. As the owner, with her fiery red curls and vivacious spirit, she was the heart of the place. Her friend Susan, an artist living in a tiny cottage nearby, was helping her prepare refreshing beverages. Slinky, Susan's mischievous ferret, darted playfully between the tables, much to the delight of the children present. \nLeaning on the café's railing, a young boy named Tommy watched the sea with wide, gleaming eyes, filled with dreams of the future. By his side sat Daisy, a spirited little dachshund, barking excitedly at the seagulls flying overhead. Tommy's mother, Lucy, stood beside him, smiling softly as she held a seashell he had just found on the beach. \nAmong the crowd, a group of unnamed tourists snapped photos, capturing memories of the charming festival. Street vendors called out, selling their wares—handmade jewelry and sweet confections—as the scent of grilled seafood wafted through the air. \nSuddenly, a burst of laughter erupted—it was James and his band making their grand entrance. Accompanying them was Benny, a friendly border collie who \"performed\" with the band, delighting the crowd with his antics. Set to play a big concert after the opening ceremony, James, the town's star musician, had won the hearts of locals with his soulful tunes. \nAs dusk settled, lights were strung across the streets, casting a magical glow over the town. Mayor William took the stage to deliver his speech, with Max sitting proudly by his side. The festival atmosphere reached its vibrant peak, and in this small town, each person—and animal—carried their own dreams and stories, yet at this moment, they were united by the shared celebration."
+        data.chunk_text_list.append("In the tranquil seaside town, the summer evening cast a golden glow over everything. The townsfolk gathered at the café by the pier, enjoying the sea breeze while eagerly anticipating the annual Ocean Festival's opening ceremony. \nFirst to arrive was Mayor William, dressed in a deep blue suit, holding a roll of his speech. He smiled and greeted the residents, who held deep respect for their community-minded mayor. Beside him trotted Max, his loyal golden retriever, wagging his tail excitedly at every familiar face he saw. \nFollowing closely was Emily, the town’s high school teacher, accompanied by a group of students ready to perform a musical piece they'd rehearsed. One of the girls carried Polly, a vibrant green parrot, on her shoulder. Polly occasionally chimed in with cheerful squawks, adding to the lively atmosphere. \nNot far away, Captain Jack, with his trusty pipe in hand, chatted with old friends about this year's catch. His fleet was the town’s economic backbone, and his seasoned face and towering presence were complemented by the presence of Whiskers, his orange tabby cat, who loved lounging on the dock, attentively watching the gentle waves. \nInside the café, Kate was bustling about, serving guests. As the owner, with her fiery red curls and vivacious spirit, she was the heart of the place. Her friend Susan, an artist living in a tiny cottage nearby, was helping her prepare refreshing beverages. Slinky, Susan's mischievous ferret, darted playfully between the tables, much to the delight of the children present. \nLeaning on the café's railing, a young boy named Tommy watched the sea with wide, gleaming eyes, filled with dreams of the future. By his side sat Daisy, a spirited little dachshund, barking excitedly at the seagulls flying overhead. Tommy's mother, Lucy, stood beside him, smiling softly as she held a seashell he had just found on the beach. \nAmong the crowd, a group of unnamed tourists snapped photos, capturing memories of the charming festival. Street vendors called out, selling their wares—handmade jewelry and sweet confections—as the scent of grilled seafood wafted through the air. \nSuddenly, a burst of laughter erupted—it was James and his band making their grand entrance. Accompanying them was Benny, a friendly border collie who \"performed\" with the band, delighting the crowd with his antics. Set to play a big concert after the opening ceremony, James, the town's star musician, had won the hearts of locals with his soulful tunes. \nAs dusk settled, lights were strung across the streets, casting a magical glow over the town. Mayor William took the stage to deliver his speech, with Max sitting proudly by his side. The festival atmosphere reached its vibrant peak, and in this small town, each person—and animal—carried their own dreams and stories, yet at this moment, they were united by the shared celebration.")
+        data.distilled_text = "This text is from the field of Slice of Life and represents the genre of Novel."
+        data.pred = {
+            "characters": [
+                {
+                    "name": "Mayor William",
+                    "role": "Mayor"
+                },
+                {
+                    "name": "Max",
+                    "role": "Golden Retriever, Mayor William's dog"
+                },
+                {
+                    "name": "Emily",
+                    "role": "High school teacher"
+                },
+                {
+                    "name": "Polly",
+                    "role": "Parrot, accompanying a student"
+                },
+                {
+                    "name": "Captain Jack",
+                    "role": "Captain"
+                },
+                {
+                    "name": "Whiskers",
+                    "role": "Orange tabby cat, Captain Jack's pet"
+                },
+                {
+                    "name": "Kate",
+                    "role": "Café owner"
+                },
+                {
+                    "name": "Susan",
+                    "role": "Artist, Kate's friend"
+                },
+                {
+                    "name": "Slinky",
+                    "role": "Ferret, Susan's pet"
+                },
+                {
+                    "name": "Tommy",
+                    "role": "Young boy"
+                },
+                {
+                    "name": "Daisy",
+                    "role": "Dachshund, Tommy's pet"
+                },
+                {
+                    "name": "Lucy",
+                    "role": "Tommy's mother"
+                },
+                {
+                    "name": "James",
+                    "role": "Musician, band leader"
+                },
+                {
+                    "name": "Benny",
+                    "role": "Border Collie, accompanying James and his band"
+                },
+                {
+                    "name": "Unnamed Tourists",
+                    "role": "Visitors at the festival"
+                },
+                {
+                    "name": "Street Vendors",
+                    "role": "Sellers at the festival"
+                }
+            ]
+        }
+
+        data.truth = {
+            "characters": [
+                {
+                    "name": "Mayor William",
+                    "role": "The friendly and respected mayor of the seaside town."
+                },
+                {
+                    "name": "Emily",
+                    "role": "A high school teacher guiding students in a festival performance."
+                },
+                {
+                    "name": "Captain Jack",
+                    "role": "A seasoned sailor whose fleet supports the town."
+                },
+                {
+                    "name": "Kate",
+                    "role": "The welcoming owner of the local café."
+                },
+                {
+                    "name": "Susan",
+                    "role": "An artist known for her ocean-themed paintings."
+                },
+                {
+                    "name": "Tommy",
+                    "role": "A young boy with dreams of the sea."
+                },
+                {
+                    "name": "Lucy",
+                    "role": "Tommy's caring and supportive mother."
+                },
+                {
+                    "name": "James",
+                    "role": "A charismatic musician and band leader."
+                }
+            ]
+        }
+
+        # Case Update
+        if update_case:
+            if data.truth == "":
+                truth = input("Please enter the correct answer you prefer, or press Enter to accept the current answer: ")
+                if truth.strip() == "":
+                    data.truth = data.pred
+                else:
+                    data.truth = extract_json_dict(truth)
+            self.case_repo.update_case(data)
+
+        # return result
+        result = data.pred
+        trajectory = data.get_result_trajectory()
+
+        return result, trajectory, "a", "b"
+
+
+model = DeepSeek(model_name_or_path="deepseek-chat", api_key="")
+pipeline = Pipeline(model)
+result, trajectory, *_ = pipeline.get_extract_result(update_case=True, task="Base")
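Note that `api_key=""` makes the `DeepSeek` engine fall back to the environment (see `src/models/llm_def.py`), so the key can be supplied without editing this script; a small sketch of that setup step:

```python
import os

# Hypothetical setup: the engine reads DEEPSEEK_API_KEY when api_key == "".
os.environ["DEEPSEEK_API_KEY"] = "sk-..."  # placeholder value, not a real key
```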
src/main.py ADDED
@@ -0,0 +1,233 @@
+import random
+import json
+import gradio as gr
+
+from pipeline import Pipeline
+from models import *
+
+
+examples = [
+    {
+        "task": "NER",
+        "use_file": False,
+        "text": "Finally, every other year , ELRA organizes a major conference LREC , the International Language Resources and Evaluation Conference .",
+        "instruction": "",
+        "constraint": """["nationality", "country capital", "place of death", "children", "location contains", "place of birth", "place lived", "administrative division of country", "country of administrative divisions", "company", "neighborhood of", "company founders"]""",
+        "file_path": None,
+    },
+    {
+        "task": "RE",
+        "use_file": False,
+        "text": "The aid group Doctors Without Borders said that since Saturday , more than 275 wounded people had been admitted and treated at Donka Hospital in the capital of Guinea , Conakry .",
+        "instruction": "",
+        "constraint": """["nationality", "country capital", "place of death", "children", "location contains", "place of birth", "place lived", "administrative division of country", "country of administrative divisions", "company", "neighborhood of", "company founders"]""",
+        "file_path": None,
+    },
+    {
+        "task": "EE",
+        "use_file": False,
+        "text": "The file suggested to the user contains no software related to video streaming and simply carries the malicious payload that later compromises victim \u2019s account and sends out the deceptive messages to all victim \u2019s contacts .",
+        "instruction": "",
+        "constraint": """{"phishing": ["damage amount", "attack pattern", "tool", "victim", "place", "attacker", "purpose", "trusted entity", "time"], "data breach": ["damage amount", "attack pattern", "number of data", "number of victim", "tool", "compromised data", "victim", "place", "attacker", "purpose", "time"], "ransom": ["damage amount", "attack pattern", "payment method", "tool", "victim", "place", "attacker", "price", "time"], "discover vulnerability": ["vulnerable system", "vulnerability", "vulnerable system owner", "vulnerable system version", "supported platform", "common vulnerabilities and exposures", "capabilities", "time", "discoverer"], "patch vulnerability": ["vulnerable system", "vulnerability", "issues addressed", "vulnerable system version", "releaser", "supported platform", "common vulnerabilities and exposures", "patch number", "time", "patch"]}""",
+        "file_path": None,
+    },
+    # {
+    #     "task": "Base",
+    #     "use_file": True,
+    #     "file_path": "data/Harry_Potter_Chapter_1.pdf",
+    #     "instruction": "Extract main characters and the background setting from this chapter.",
+    #     "constraint": "",
+    #     "text": "",
+    # },
+    # {
+    #     "task": "Base",
+    #     "use_file": True,
+    #     "file_path": "data/Tulsi_Gabbard_News.html",
+    #     "instruction": "Extract key information from the given text.",
+    #     "constraint": "",
+    #     "text": "",
+    # },
+]
+
+
+def create_interface():
+    with gr.Blocks(title="OneKE Demo") as demo:
+        gr.HTML("""
+        <div style="text-align:center;">
+            <p align="center">
+                <a href="https://github.com/zjunlp/DeepKE/blob/main/example/llm/assets/oneke_logo.png">
+                    <img src="https://raw.githubusercontent.com/zjunlp/DeepKE/refs/heads/main/example/llm/assets/oneke_logo.png" width="240"/>
+                </a>
+            </p>
+            <h1>OneKE: A Dockerized Schema-Guided LLM Agent-based Knowledge Extraction System</h1>
+            <p>
+                🌐[<a href="https://oneke.openkg.cn/" target="_blank">Web</a>]
+                ⌨️[<a href="https://github.com/zjunlp/OneKE" target="_blank">Code</a>]
+                📹[<a href="http://oneke.openkg.cn/demo.mp4" target="_blank">Video</a>]
+            </p>
+        </div>
+        """)
+
+        example_button_gr = gr.Button("🎲 Quick Start with an Example 🎲")
+
+        with gr.Row():
+            with gr.Column():
+                model_gr = gr.Dropdown(choices=["gpt-3.5-turbo", "gpt-4o", "gpt-4o-mini"], label="🤖 Select your Model")
+                api_key_gr = gr.Textbox(label="🔑 Enter your API-Key")
+            with gr.Column():
+                task_gr = gr.Dropdown(choices=["Base", "NER", "RE", "EE"], label="🎯 Select your Task")
+                use_file_gr = gr.Checkbox(label="📂 Use File", value=True)
+
+        file_path_gr = gr.File(label="📖 Upload a File", visible=True)
+        text_gr = gr.Textbox(label="📖 Text", placeholder="Enter your Text", visible=False)
+        instruction_gr = gr.Textbox(label="🕹️ Instruction", visible=True)
+        constraint_gr = gr.Textbox(label="🕹️ Constraint", visible=False)
+
+        def update_fields(task):
+            if task == "Base":
+                return gr.update(visible=True, label="🕹️ Instruction", placeholder="Enter your Instruction"), gr.update(visible=False)
+            elif task == "NER":
+                return gr.update(visible=False), gr.update(visible=True, label="🕹️ Constraint", placeholder="Enter your NER Constraint")
+            elif task == "RE":
+                return gr.update(visible=False), gr.update(visible=True, label="🕹️ Constraint", placeholder="Enter your RE Constraint")
+            elif task == "EE":
+                return gr.update(visible=False), gr.update(visible=True, label="🕹️ Constraint", placeholder="Enter your EE Constraint")
+
+        def update_input_fields(use_file):
+            if use_file:
+                return gr.update(visible=False), gr.update(visible=True)
+            else:
+                return gr.update(visible=True), gr.update(visible=False)
+
+        def start_with_example():
+            example_index = random.randint(0, len(examples) - 1)
+            example = examples[example_index]
+            return (
+                gr.update(value=example["task"]),
+                gr.update(value=example["use_file"]),
+                gr.update(value=example["file_path"], visible=example["use_file"]),
+                gr.update(value=example["text"], visible=not example["use_file"]),
+                gr.update(value=example["instruction"], visible=example["task"] == "Base"),
+                gr.update(value=example["constraint"], visible=example["task"] in ["NER", "RE", "EE"]),
+            )
+
+        def submit(model, api_key, task, instruction, constraint, text, use_file, file_path):
+            try:
+                # Create the Pipeline instance
+                pipeline = Pipeline(ChatGPT(model_name_or_path=model, api_key=api_key))
+                # Only "Base" tasks use a free-form instruction; the others use a constraint.
+                if task == "Base":
+                    constraint = ""
+                else:
+                    instruction = ""
+                # File input and raw-text input are mutually exclusive.
+                if use_file:
+                    text = ""
+                else:
+                    file_path = None
+
+                # Invoke the Pipeline
+                _, _, ger_frontend_schema, ger_frontend_res = pipeline.get_extract_result(
+                    task=task,
+                    instruction=instruction,
+                    constraint=constraint,
+                    use_file=use_file,
+                    file_path=file_path,
+                    text=text,
+                )
+
+                ger_frontend_schema = str(ger_frontend_schema)
+                ger_frontend_res = json.dumps(ger_frontend_res, ensure_ascii=False, indent=4) if isinstance(ger_frontend_res, dict) else str(ger_frontend_res)
+                return ger_frontend_schema, ger_frontend_res, gr.update(value="", visible=False)
+
+            except Exception as e:
+                error_message = f"⚠️ Error:\n {str(e)}"
+                return "", "", gr.update(value=error_message, visible=True)
+
+        def clear_all():
+            return (
+                gr.update(value=""),                  # model
+                gr.update(value=""),                  # API Key
+                gr.update(value=""),                  # task
+                gr.update(value="", visible=False),   # instruction
+                gr.update(value="", visible=False),   # constraint
+                gr.update(value=True),                # use_file
+                gr.update(value="", visible=False),   # text
+                gr.update(value=None, visible=True),  # file_path
+                gr.update(value=""),                  # schema output
+                gr.update(value=""),                  # answer output
+                gr.update(value="", visible=False),   # error_output
+            )
+
+        with gr.Row():
+            submit_button_gr = gr.Button("Submit", variant="primary", scale=8)
+            clear_button = gr.Button("Clear", scale=5)
+        gr.HTML("""
+        <div style="width: 100%; text-align: center; font-size: 16px; font-weight: bold; position: relative; margin: 20px 0;">
+            <span style="position: absolute; left: 0; top: 50%; transform: translateY(-50%); width: 45%; border-top: 1px solid #ccc;"></span>
+            <span style="position: relative; z-index: 1; background-color: white; padding: 0 10px;">Output:</span>
+            <span style="position: absolute; right: 0; top: 50%; transform: translateY(-50%); width: 45%; border-top: 1px solid #ccc;"></span>
+        </div>
+        """)
+        error_output_gr = gr.Textbox(label="😵‍💫 Oops, an Error Occurred", visible=False)
+        with gr.Row():
+            with gr.Column(scale=1):
+                py_output_gr = gr.Code(label="🤔 Generated Schema", language="python", lines=10, interactive=False)
+            with gr.Column(scale=1):
+                json_output_gr = gr.Code(label="😉 Final Answer", language="json", lines=10, interactive=False)
+
+        task_gr.change(fn=update_fields, inputs=task_gr, outputs=[instruction_gr, constraint_gr])
+        use_file_gr.change(fn=update_input_fields, inputs=use_file_gr, outputs=[text_gr, file_path_gr])
+
+        example_button_gr.click(
+            fn=start_with_example,
+            inputs=[],
+            outputs=[
+                task_gr,
+                use_file_gr,
+                file_path_gr,
+                text_gr,
+                instruction_gr,
+                constraint_gr,
+            ],
+        )
+        submit_button_gr.click(
+            fn=submit,
+            inputs=[
+                model_gr,
+                api_key_gr,
+                task_gr,
+                instruction_gr,
+                constraint_gr,
+                text_gr,
+                use_file_gr,
+                file_path_gr,
+            ],
+            outputs=[py_output_gr, json_output_gr, error_output_gr],
+            show_progress=True,
+        )
+        clear_button.click(
+            fn=clear_all,
+            outputs=[
+                model_gr,
+                api_key_gr,
+                task_gr,
+                instruction_gr,
+                constraint_gr,
+                use_file_gr,
+                text_gr,
+                file_path_gr,
+                py_output_gr,
+                json_output_gr,
+                error_output_gr,
+            ],
+        )
+
+    return demo
+
+
+interface = create_interface()
+interface.launch()
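Since the module launches the app at import time, a guarded variant may be preferable when importing `create_interface` elsewhere; a minimal sketch (the `server_name`/`server_port` values are assumptions, not taken from this commit):

```python
# Sketch: launch only when run as a script, so imports stay side-effect free.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)  # assumed defaults
```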
src/models/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .llm_def import BaseEngine, LLaMA, Qwen, MiniCPM, ChatGLM, ChatGPT, DeepSeek
+from .prompt_example import *
+from .prompt_template import *
src/models/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (434 Bytes)
src/models/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (315 Bytes)
src/models/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (359 Bytes)
src/models/__pycache__/llm_def.cpython-311.pyc ADDED
Binary file (11.8 kB)
src/models/__pycache__/llm_def.cpython-37.pyc ADDED
Binary file (7.14 kB)
src/models/__pycache__/llm_def.cpython-39.pyc ADDED
Binary file (6.8 kB)
src/models/__pycache__/prompt_example.cpython-311.pyc ADDED
Binary file (5.67 kB)
src/models/__pycache__/prompt_example.cpython-39.pyc ADDED
Binary file (5.66 kB)
src/models/__pycache__/prompt_template.cpython-311.pyc ADDED
Binary file (5.42 kB)
src/models/__pycache__/prompt_template.cpython-39.pyc ADDED
Binary file (4.95 kB)
src/models/llm_def.py ADDED
@@ -0,0 +1,212 @@
+"""
+Supported models:
+- Open Source: LLaMA3, Qwen2.5, MiniCPM3, ChatGLM4
+- Closed Source: ChatGPT, DeepSeek
+"""
+
+from transformers import pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import openai
+import os
+from openai import OpenAI
+
+# The inference code follows each model's official documentation.
+
+class BaseEngine:
+    def __init__(self, model_name_or_path: str):
+        self.name = None
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+        self.temperature = 0.2
+        self.top_p = 0.9
+        self.max_tokens = 1024
+
+    def get_chat_response(self, prompt):
+        raise NotImplementedError
+
+    def set_hyperparameter(self, temperature: float = 0.2, top_p: float = 0.9, max_tokens: int = 1024):
+        self.temperature = temperature
+        self.top_p = top_p
+        self.max_tokens = max_tokens
+
+class LLaMA(BaseEngine):
+    def __init__(self, model_name_or_path: str):
+        super().__init__(model_name_or_path)
+        self.name = "LLaMA"
+        self.model_id = model_name_or_path
+        self.pipeline = pipeline(
+            "text-generation",
+            model=self.model_id,
+            model_kwargs={"torch_dtype": torch.bfloat16},
+            device_map="auto",
+        )
+        self.terminators = [
+            self.pipeline.tokenizer.eos_token_id,
+            self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+        ]
+
+    def get_chat_response(self, prompt):
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt},
+        ]
+        outputs = self.pipeline(
+            messages,
+            max_new_tokens=self.max_tokens,
+            eos_token_id=self.terminators,
+            do_sample=True,
+            temperature=self.temperature,
+            top_p=self.top_p,
+        )
+        return outputs[0]["generated_text"][-1]['content'].strip()
+
+class Qwen(BaseEngine):
+    def __init__(self, model_name_or_path: str):
+        super().__init__(model_name_or_path)
+        self.name = "Qwen"
+        self.model_id = model_name_or_path
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+
+    def get_chat_response(self, prompt):
+        messages = [
+            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        text = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)
+        generated_ids = self.model.generate(
+            **model_inputs,
+            temperature=self.temperature,
+            top_p=self.top_p,
+            max_new_tokens=self.max_tokens
+        )
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+        return response
+
+class MiniCPM(BaseEngine):
+    def __init__(self, model_name_or_path: str):
+        super().__init__(model_name_or_path)
+        self.name = "MiniCPM"
+        self.model_id = model_name_or_path
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+
+    def get_chat_response(self, prompt):
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(self.model.device)
+        model_outputs = self.model.generate(
+            model_inputs,
+            temperature=self.temperature,
+            top_p=self.top_p,
+            max_new_tokens=self.max_tokens
+        )
+        output_token_ids = [
+            model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs))
+        ]
+        response = self.tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0].strip()
+
+        return response
+
+class ChatGLM(BaseEngine):
+    def __init__(self, model_name_or_path: str):
+        super().__init__(model_name_or_path)
+        self.name = "ChatGLM"
+        self.model_id = model_name_or_path
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            low_cpu_mem_usage=True,
+            trust_remote_code=True
+        )
+
+    def get_chat_response(self, prompt):
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True, add_generation_prompt=True, tokenize=True).to(self.model.device)
+        model_outputs = self.model.generate(
+            **model_inputs,
+            temperature=self.temperature,
+            top_p=self.top_p,
+            max_new_tokens=self.max_tokens
+        )
+        model_outputs = model_outputs[:, model_inputs['input_ids'].shape[1]:]
+        response = self.tokenizer.batch_decode(model_outputs, skip_special_tokens=True)[0].strip()
+
+        return response
+
+class ChatGPT(BaseEngine):
+    def __init__(self, model_name_or_path: str, api_key: str, base_url=openai.base_url):
+        self.name = "ChatGPT"
+        self.model = model_name_or_path
+        self.base_url = base_url
+        self.temperature = 0.2
+        self.top_p = 0.9
+        self.max_tokens = 1024
+        if api_key != "":
+            self.api_key = api_key
+        else:
+            self.api_key = os.environ["OPENAI_API_KEY"]
+        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+    def get_chat_response(self, input):
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "user", "content": input},
+            ],
+            stream=False,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+            stop=None
+        )
+        return response.choices[0].message.content
+
+class DeepSeek(BaseEngine):
+    def __init__(self, model_name_or_path: str, api_key: str, base_url="https://api.deepseek.com"):
+        self.name = "DeepSeek"
+        self.model = model_name_or_path
+        self.base_url = base_url
+        self.temperature = 0.2
+        self.top_p = 0.9
+        self.max_tokens = 1024
+        if api_key != "":
+            self.api_key = api_key
+        else:
+            self.api_key = os.environ["DEEPSEEK_API_KEY"]
+        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+    def get_chat_response(self, input):
+        response = self.client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "user", "content": input},
+            ],
+            stream=False,
+            temperature=self.temperature,
+            max_tokens=self.max_tokens,
+            stop=None
+        )
+        return response.choices[0].message.content
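A minimal usage sketch for these engines (the model name and key are placeholders; only methods defined above are used):

```python
from models import ChatGPT  # DeepSeek works the same way

engine = ChatGPT(model_name_or_path="gpt-4o-mini", api_key="sk-...")  # placeholder key
engine.set_hyperparameter(temperature=0.0, max_tokens=512)
print(engine.get_chat_response("Extract the Named Entities in: 'ELRA organizes LREC.'"))
```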
src/models/prompt_example.py ADDED
@@ -0,0 +1,137 @@
+json_schema_examples = """
+Example1:
+**Task**: Please extract all economic policies affecting the stock market between 2015 and 2023 and the exact dates of their implementation.
+**Text**: This text is from the field of Economics and represents the genre of Article.
+...(example text)...
+**Output Schema**:
+{
+    "economic_policies": [
+        {
+            "name": null,
+            "implementation_date": null
+        }
+    ]
+}
+
+Example2:
+**Task**: Tell me the main content of papers related to NLP between 2022 and 2023.
+**Text**: This text is from the field of AI and represents the genre of Research Paper.
+...(example text)...
+**Output Schema**:
+{
+    "papers": [
+        {
+            "title": null,
+            "content": null
+        }
+    ]
+}
+
+Example3:
+**Task**: Extract all the information in the given text.
+**Text**: This text is from the field of Political and represents the genre of News Report.
+...(example text)...
+**Output Schema**:
+{
+    "news_report":
+    {
+        "title": null,
+        "summary": null,
+        "publication_date": null,
+        "keywords": [],
+        "events": [
+            {
+                "name": null,
+                "time": null,
+                "people_involved": [],
+                "cause": null,
+                "process": null,
+                "result": null
+            }
+        ],
+        "quotes": [],
+        "viewpoints": []
+    }
+}
+"""
+
+code_schema_examples = """
+Example1:
+**Task**: Extract all the entities in the given text.
+**Text**:
+...(example text)...
+**Output Schema**:
+```python
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+class Entity(BaseModel):
+    label: str = Field(description="The type or category of the entity, such as 'Process', 'Technique', 'Data Structure', 'Methodology', 'Person', etc.")
+    name: str = Field(description="The specific name of the entity. It should represent a single, distinct concept and must not be an empty string. For example, if the entity is a 'Technique', the name could be 'Neural Networks'.")
+
+class ExtractionTarget(BaseModel):
+    entity_list: List[Entity] = Field(description="All the entities presented in the context. The entities should encode ONE concept.")
+```
+
+Example2:
+**Task**: Extract all the information in the given text.
+**Text**: This text is from the field of Political and represents the genre of News Article.
+...(example text)...
+**Output Schema**:
+```python
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+class Person(BaseModel):
+    name: str = Field(description="The name of the person")
+    identity: Optional[str] = Field(description="The occupation, status or characteristics of the person.")
+    role: Optional[str] = Field(description="The role or function the person plays in an event.")
+
+class Event(BaseModel):
+    name: str = Field(description="Name of the event")
+    time: Optional[str] = Field(description="Time when the event took place")
+    people_involved: Optional[List[Person]] = Field(description="People involved in the event")
+    cause: Optional[str] = Field(default=None, description="Reason for the event, if applicable")
+    process: Optional[str] = Field(description="Details of the event process")
+    result: Optional[str] = Field(default=None, description="Result or outcome of the event")
+
+class ExtractionTarget(BaseModel):
+    title: str = Field(description="The title or headline of the news article")
+    summary: str = Field(description="A brief summary of the news article")
+    publication_date: Optional[str] = Field(description="The publication date of the article")
+    keywords: Optional[List[str]] = Field(description="List of keywords or topics covered in the article")
+    events: List[Event] = Field(description="Events covered in the article")
+    quotes: Optional[List[str]] = Field(default=None, description="Quotes related to the news, if any")
+    viewpoints: Optional[List[str]] = Field(default=None, description="Different viewpoints regarding the news")
+```
+
+Example3:
+**Task**: Extract the key information in the given text.
+**Text**: This text is from the field of AI and represents the genre of Research Paper.
+...(example text)...
+**Output Schema**:
+```python
+from typing import List, Optional
+from pydantic import BaseModel, Field
+
+class MetaData(BaseModel):
+    title: str = Field(description="The title of the article")
+    authors: List[str] = Field(description="The list of the article's authors")
+    abstract: str = Field(description="The article's abstract")
+    key_words: List[str] = Field(description="The key words associated with the article")
+
+class Baseline(BaseModel):
+    method_name: str = Field(description="The name of the baseline method")
+    proposed_solution: str = Field(description="The proposed solution in detail")
+    performance_metrics: str = Field(description="The performance metrics of the method and comparative analysis")
+
+class ExtractionTarget(BaseModel):
+    key_contributions: List[str] = Field(description="The key contributions of the article")
+    limitation_of_sota: str = Field(description="The summarized limitations of the existing work")
+    proposed_solution: str = Field(description="The proposed solution in detail")
+    baselines: List[Baseline] = Field(description="The list of baseline methods and their details")
+    performance_metrics: str = Field(description="The performance metrics of the method and comparative analysis")
+    paper_limitations: str = Field(description="The limitations of the proposed solution of the paper")
+```
+"""
src/models/prompt_template.py ADDED
@@ -0,0 +1,174 @@
+from langchain.prompts import PromptTemplate
+from .prompt_example import *
+
+# ==================================================================== #
+#                            SCHEMA AGENT                              #
+# ==================================================================== #
+
+# Get Text Analysis
+TEXT_ANALYSIS_INSTRUCTION = """
+**Instruction**: Please analyze and categorize the given text.
+{examples}
+**Text**: {text}
+
+**Output Schema**: {schema}
+"""
+
+text_analysis_instruction = PromptTemplate(
+    input_variables=["examples", "text", "schema"],
+    template=TEXT_ANALYSIS_INSTRUCTION,
+)
+
+# Get Deduced Schema (JSON)
+DEDUCE_SCHEMA_JSON_INSTRUCTION = """
+**Instruction**: Generate an output format that meets the requirements as described in the task. Pay attention to the following requirements:
+- Format: Return your responses in dictionary format as a JSON object.
+- Content: Do not include any actual data; all attribute values should be set to None.
+- Note: Attributes not mentioned in the task description should be ignored.
+{examples}
+**Task**: {instruction}
+
+**Text**: {distilled_text}
+{text}
+
+Now please deduce the output schema in JSON format. All attribute values should be set to None.
+**Output Schema**:
+"""
+
+deduced_schema_json_instruction = PromptTemplate(
+    input_variables=["examples", "instruction", "distilled_text", "text", "schema"],
+    template=DEDUCE_SCHEMA_JSON_INSTRUCTION,
+)
+
+# Get Deduced Schema (Code)
+DEDUCE_SCHEMA_CODE_INSTRUCTION = """
+**Instruction**: Based on the provided text and task description, define the output schema in Python using Pydantic. Name the final extraction target class 'ExtractionTarget'.
+{examples}
+**Task**: {instruction}
+
+**Text**: {distilled_text}
+{text}
+
+Now please deduce the output schema. Ensure that the output code snippet is wrapped in '```', and can be directly parsed by the Python interpreter.
+**Output Schema**: """
+deduced_schema_code_instruction = PromptTemplate(
+    input_variables=["examples", "instruction", "distilled_text", "text"],
+    template=DEDUCE_SCHEMA_CODE_INSTRUCTION,
+)
+
+
+# ==================================================================== #
+#                          EXTRACTION AGENT                            #
+# ==================================================================== #
+
+EXTRACT_INSTRUCTION = """
+**Instruction**: You are an agent skilled in information extraction. {instruction}
+{examples}
+**Text**: {text}
+{additional_info}
+**Output Schema**: {schema}
+
+Now please extract the corresponding information from the text. Ensure that the information you extract has a clear reference in the given text. Set any property not explicitly mentioned in the text to null.
+"""
+
+extract_instruction = PromptTemplate(
+    input_variables=["instruction", "examples", "text", "schema", "additional_info"],
+    template=EXTRACT_INSTRUCTION,
+)
+
+SUMMARIZE_INSTRUCTION = """
+**Instruction**: Below is a list of results obtained after segmenting and extracting information from a long article. Please consolidate all the answers to generate a final response.
+{examples}
+**Task**: {instruction}
+
+**Result List**: {answer_list}
+
+**Output Schema**: {schema}
+Now summarize all the information from the Result List.
+"""
+summarize_instruction = PromptTemplate(
+    input_variables=["instruction", "examples", "answer_list", "schema"],
+    template=SUMMARIZE_INSTRUCTION,
+)
+
+
+# ==================================================================== #
+#                          REFLECTION AGENT                            #
+# ==================================================================== #
+REFLECT_INSTRUCTION = """**Instruction**: You are an agent skilled in reflection and optimization based on the original result. Refer to **Reflection Reference** to identify potential issues in the current extraction results.
+
+**Reflection Reference**: {examples}
+
+Now please review each element in the extraction result. Identify and improve any potential issues in the result based on the reflection. NOTE: If the original result is correct, no modifications are needed!
+
+**Task**: {instruction}
+
+**Text**: {text}
+
+**Output Schema**: {schema}
+
+**Original Result**: {result}
+
+"""
+reflect_instruction = PromptTemplate(
+    input_variables=["instruction", "examples", "text", "schema", "result"],
+    template=REFLECT_INSTRUCTION,
+)
+
+# NOTE: this second definition adds {additional_info} and supersedes the
+# summarize_instruction defined above.
+SUMMARIZE_INSTRUCTION = """
+**Instruction**: Below is a list of results obtained after segmenting and extracting information from a long article. Please consolidate all the answers to generate a final response.
+
+**Task**: {instruction}
+
+**Result List**: {answer_list}
+{additional_info}
+**Output Schema**: {schema}
+Now summarize the information from the Result List.
+"""
+summarize_instruction = PromptTemplate(
+    input_variables=["instruction", "answer_list", "additional_info", "schema"],
+    template=SUMMARIZE_INSTRUCTION,
+)
+
+
+# ==================================================================== #
+#                           CASE REPOSITORY                            #
+# ==================================================================== #
+
+GOOD_CASE_ANALYSIS_INSTRUCTION = """
+**Instruction**: Below is an information extraction task and its corresponding correct answer. Provide the reasoning steps that led to the correct answer, along with a brief explanation of the answer. Your response should be brief and organized.
+
+**Task**: {instruction}
+
+**Text**: {text}
+{additional_info}
+**Correct Answer**: {result}
+
+Now please generate the reasoning steps and a brief analysis of the **Correct Answer** given above. DO NOT generate your own extraction result.
+**Analysis**:
+"""
+good_case_analysis_instruction = PromptTemplate(
+    input_variables=["instruction", "text", "result", "additional_info"],
+    template=GOOD_CASE_ANALYSIS_INSTRUCTION,
+)
+
+BAD_CASE_REFLECTION_INSTRUCTION = """
+**Instruction**: Based on the task description, compare the original answer with the correct one. Your output should be a brief reflection or concise summarized rules.
+
+**Task**: {instruction}
+
+**Text**: {text}
+{additional_info}
+**Original Answer**: {original_answer}
+
+**Correct Answer**: {correct_answer}
+
+Now please generate a brief and organized reflection. DO NOT generate your own extraction result.
+**Reflection**:
+"""
+
+bad_case_reflection_instruction = PromptTemplate(
+    input_variables=["instruction", "text", "original_answer", "correct_answer", "additional_info"],
+    template=BAD_CASE_REFLECTION_INSTRUCTION,
+)
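A quick sketch of how one of these templates is filled (illustrative values; `extract_instruction` is the template defined above):

```python
from models.prompt_template import extract_instruction

prompt = extract_instruction.format(
    instruction="Extract the Named Entities in the given text.",
    examples="",  # good cases would be injected here in "standard" mode
    text="Conakry is the capital of Guinea.",
    additional_info="",
    schema="EntityList",
)
print(prompt)
```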
src/modules/__init__.py ADDED
@@ -0,0 +1,4 @@
+from .schema_agent import SchemaAgent
+from .extraction_agent import ExtractionAgent
+from .reflection_agent import ReflectionAgent
+from .knowledge_base.case_repository import CaseRepositoryHandler
src/modules/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (459 Bytes)
src/modules/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (392 Bytes)
src/modules/__pycache__/extraction_agent.cpython-311.pyc ADDED
Binary file (6.66 kB)
src/modules/__pycache__/extraction_agent.cpython-39.pyc ADDED
Binary file (4.06 kB)
src/modules/__pycache__/reflection_agent.cpython-311.pyc ADDED
Binary file (6.98 kB)
src/modules/__pycache__/reflection_agent.cpython-39.pyc ADDED
Binary file (4.01 kB)
src/modules/__pycache__/schema_agent.cpython-311.pyc ADDED
Binary file (10.7 kB)
src/modules/__pycache__/schema_agent.cpython-39.pyc ADDED
Binary file (6.64 kB)
src/modules/extraction_agent.py ADDED
@@ -0,0 +1,85 @@
+from models import *
+from utils import *
+from .knowledge_base.case_repository import CaseRepositoryHandler
+
+class InformationExtractor:
+    def __init__(self, llm: BaseEngine):
+        self.llm = llm
+
+    def extract_information(self, instruction="", text="", examples="", schema="", additional_info=""):
+        examples = good_case_wrapper(examples)
+        prompt = extract_instruction.format(instruction=instruction, examples=examples, text=text, additional_info=additional_info, schema=schema)
+        response = self.llm.get_chat_response(prompt)
+        response = extract_json_dict(response)
+        print(f"prompt: {prompt}")
+        print("========================================")
+        print(f"response: {response}")
+        return response
+
+    def summarize_answer(self, instruction="", answer_list="", schema="", additional_info=""):
+        prompt = summarize_instruction.format(instruction=instruction, answer_list=answer_list, schema=schema, additional_info=additional_info)
+        response = self.llm.get_chat_response(prompt)
+        response = extract_json_dict(response)
+        return response
+
+class ExtractionAgent:
+    def __init__(self, llm: BaseEngine, case_repo: CaseRepositoryHandler):
+        self.llm = llm
+        self.module = InformationExtractor(llm=llm)
+        self.case_repo = case_repo
+        self.methods = ["extract_information_direct", "extract_information_with_case"]
+
+    def __get_constraint(self, data: DataPoint):
+        if data.constraint == "":
+            return data
+        if data.task == "NER":
+            constraint = json.dumps(data.constraint)
+            if "**Entity Type Constraint**" in constraint:
+                return data
+            data.constraint = f"\n**Entity Type Constraint**: The type of entities must be chosen from the following list.\n{constraint}\n"
+        elif data.task == "RE":
+            constraint = json.dumps(data.constraint)
+            if "**Relation Type Constraint**" in constraint:
+                return data
+            data.constraint = f"\n**Relation Type Constraint**: The type of relations must be chosen from the following list.\n{constraint}\n"
+        elif data.task == "EE":
+            constraint = json.dumps(data.constraint)
+            if "**Event Extraction Constraint**" in constraint:
+                return data
+            data.constraint = f"\n**Event Extraction Constraint**: The event type must be selected from the following dictionary keys, and its event arguments should be chosen from its corresponding dictionary values.\n{constraint}\n"
+        return data
+
+    def extract_information_direct(self, data: DataPoint):
+        data = self.__get_constraint(data)
+        result_list = []
+        for chunk_text in data.chunk_text_list:
+            extract_direct_result = self.module.extract_information(instruction=data.instruction, text=chunk_text, schema=data.output_schema, examples="", additional_info=data.constraint)
+            result_list.append(extract_direct_result)
+        function_name = current_function_name()
+        data.set_result_list(result_list)
+        data.update_trajectory(function_name, result_list)
+        return data
+
+    def extract_information_with_case(self, data: DataPoint):
+        data = self.__get_constraint(data)
+        result_list = []
+        for chunk_text in data.chunk_text_list:
+            examples = self.case_repo.query_good_case(data)
+            extract_case_result = self.module.extract_information(instruction=data.instruction, text=chunk_text, schema=data.output_schema, examples=examples, additional_info=data.constraint)
+            result_list.append(extract_case_result)
+        function_name = current_function_name()
+        data.set_result_list(result_list)
+        data.update_trajectory(function_name, result_list)
+        return data
+
+    def summarize_answer(self, data: DataPoint):
+        if len(data.result_list) == 0:
+            return data
+        if len(data.result_list) == 1:
+            data.set_pred(data.result_list[0])
+            return data
+        summarized_result = self.module.summarize_answer(instruction=data.instruction, answer_list=data.result_list, schema=data.output_schema, additional_info=data.constraint)
+        function_name = current_function_name()
+        data.set_pred(summarized_result)
+        data.update_trajectory(function_name, summarized_result)
+        return data
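For illustration, the NER branch of `__get_constraint` wraps a plain type list like this (a standalone sketch of the same string manipulation; the type list is illustrative):

```python
import json

constraint = ["person", "organization", "location"]  # illustrative entity types
wrapped = (
    "\n**Entity Type Constraint**: The type of entities must be chosen from the following list.\n"
    f"{json.dumps(constraint)}\n"
)
print(wrapped)  # this string is passed to the prompt as additional_info
```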
src/modules/knowledge_base/__pycache__/case_repository.cpython-311.pyc ADDED
Binary file (4.64 kB)
src/modules/knowledge_base/__pycache__/case_repository.cpython-39.pyc ADDED
Binary file (3.8 kB)
src/modules/knowledge_base/__pycache__/schema_repository.cpython-311.pyc ADDED
Binary file (9.25 kB)
src/modules/knowledge_base/__pycache__/schema_repository.cpython-39.pyc ADDED
Binary file (5.94 kB)
src/modules/knowledge_base/case_repository.json ADDED
The diff for this file is too large to render.
src/modules/knowledge_base/case_repository.py ADDED
@@ -0,0 +1,391 @@
+# import json
+# import os
+# import torch
+# import numpy as np
+# from utils import *
+# from sentence_transformers import SentenceTransformer
+# from rapidfuzz import process
+# from models import *
+# import copy
+
+# import warnings
+# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# warnings.filterwarnings("ignore", category=FutureWarning, message=r".*clean_up_tokenization_spaces*")
+
+# class CaseRepository:
+#     def __init__(self):
+#         self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
+#         self.embedder.to(device)
+#         self.corpus = self.load_corpus()
+#         self.embedded_corpus = self.embed_corpus()
+
+#     def load_corpus(self):
+#         with open(os.path.join(os.path.dirname(__file__), "case_repository.json")) as file:
+#             corpus = json.load(file)
+#         return corpus
+
+#     def update_corpus(self):
+#         try:
+#             with open(os.path.join(os.path.dirname(__file__), "case_repository.json"), "w") as file:
+#                 json.dump(self.corpus, file, indent=2)
+#         except Exception as e:
+#             print(f"Error when updating corpus: {e}")
+
+#     def embed_corpus(self):
+#         embedded_corpus = {}
+#         for key, content in self.corpus.items():
+#             good_index = [item['index']['embed_index'] for item in content['good']]
+#             encoded_good_index = self.embedder.encode(good_index, convert_to_tensor=True).to(device)
+#             bad_index = [item['index']['embed_index'] for item in content['bad']]
+#             encoded_bad_index = self.embedder.encode(bad_index, convert_to_tensor=True).to(device)
+#             embedded_corpus[key] = {"good": encoded_good_index, "bad": encoded_bad_index}
+#         return embedded_corpus
+
+#     def get_similarity_scores(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2):
+#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#         # Embedding similarity match
+#         encoded_embed_query = self.embedder.encode(embed_index, convert_to_tensor=True).to(device)
+#         embedding_similarity_matrix = self.embedder.similarity(encoded_embed_query, self.embedded_corpus[task][case_type])
+#         embedding_similarity_scores = embedding_similarity_matrix[0].to(device)
+
+#         # String similarity match
+#         str_match_corpus = [item['index']['str_index'] for item in self.corpus[task][case_type]]
+#         str_similarity_results = process.extract(str_index, str_match_corpus, limit=len(str_match_corpus))
+#         scores_dict = {match[0]: match[1] for match in str_similarity_results}
+#         scores_in_order = [scores_dict[candidate] for candidate in str_match_corpus]
+#         str_similarity_scores = torch.tensor(scores_in_order, dtype=torch.float32).to(device)
+
+#         # Normalize scores
+#         embedding_score_range = embedding_similarity_scores.max() - embedding_similarity_scores.min()
+#         str_score_range = str_similarity_scores.max() - str_similarity_scores.min()
+#         if embedding_score_range > 0:
+#             embed_norm_scores = (embedding_similarity_scores - embedding_similarity_scores.min()) / embedding_score_range
+#         else:
+#             embed_norm_scores = embedding_similarity_scores
+#         if str_score_range > 0:
+#             str_norm_scores = (str_similarity_scores - str_similarity_scores.min()) / str_score_range
+#         else:
+#             str_norm_scores = str_similarity_scores / 100
+
+#         # Combine the scores with weights
+#         combined_scores = 0.5 * embed_norm_scores + 0.5 * str_norm_scores
+#         original_combined_scores = 0.5 * embedding_similarity_scores + 0.5 * str_similarity_scores / 100
+
+#         scores, indices = torch.topk(combined_scores, k=min(top_k, combined_scores.size(0)))
+#         original_scores, original_indices = torch.topk(original_combined_scores, k=min(top_k, original_combined_scores.size(0)))
+#         return scores, indices, original_scores, original_indices
+
+#     def query_case(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2) -> list:
+#         _, indices, _, _ = self.get_similarity_scores(task, embed_index, str_index, case_type, top_k)
+#         top_matches = [self.corpus[task][case_type][idx]["content"] for idx in indices]
+#         return top_matches
+
+#     def update_case(self, task: TaskType, embed_index="", str_index="", content="", case_type=""):
+#         self.corpus[task][case_type].append({"index": {"embed_index": embed_index, "str_index": str_index}, "content": content})
+#         self.embedded_corpus[task][case_type] = torch.cat([self.embedded_corpus[task][case_type], self.embedder.encode([embed_index], convert_to_tensor=True).to(device)], dim=0)
+#         print(f"Case updated for {task} task.")
+
+# class CaseRepositoryHandler:
+#     def __init__(self, llm: BaseEngine):
+#         self.repository = CaseRepository()
+#         self.llm = llm
+
+#     def __get_good_case_analysis(self, instruction="", text="", result="", additional_info=""):
+#         prompt = good_case_analysis_instruction.format(
+#             instruction=instruction, text=text, result=result, additional_info=additional_info
+#         )
+#         for _ in range(3):
+#             response = self.llm.get_chat_response(prompt)
+#             response = extract_json_dict(response)
+#             if not isinstance(response, dict):
+#                 return response
+#         return None
+
+#     def __get_bad_case_reflection(self, instruction="", text="", original_answer="", correct_answer="", additional_info=""):
+#         prompt = bad_case_reflection_instruction.format(
+#             instruction=instruction, text=text, original_answer=original_answer, correct_answer=correct_answer, additional_info=additional_info
+#         )
+#         for _ in range(3):
+#             response = self.llm.get_chat_response(prompt)
+#             response = extract_json_dict(response)
+#             if not isinstance(response, dict):
+#                 return response
+#         return None
+
+#     def __get_index(self, data: DataPoint, case_type: str):
+#         # set embed_index
117
+ # embed_index = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
118
+
119
+ # # set str_index
120
+ # if data.task == "Base":
121
+ # str_index = f"**Task**: {data.instruction}"
122
+ # else:
123
+ # str_index = f"{data.constraint}"
124
+
125
+ # if case_type == "bad":
126
+ # str_index += f"\n\n**Original Result**: {json.dumps(data.pred)}"
127
+
128
+ # return embed_index, str_index
129
+
130
+ # def query_good_case(self, data: DataPoint):
131
+ # embed_index, str_index = self.__get_index(data, "good")
132
+ # return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="good")
133
+
134
+ # def query_bad_case(self, data: DataPoint):
135
+ # embed_index, str_index = self.__get_index(data, "bad")
136
+ # return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="bad")
137
+
138
+ # def update_good_case(self, data: DataPoint):
139
+ # if data.truth == "" :
140
+ # print("No truth value provided.")
141
+ # return
142
+ # embed_index, str_index = self.__get_index(data, "good")
143
+ # _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "good", 1)
144
+ # original_scores = original_scores.tolist()
145
+ # if original_scores[0] >= 0.9:
146
+ # print("The similar good case is already in the corpus. Similarity Score: ", original_scores[0])
147
+ # return
148
+ # good_case_alaysis = self.__get_good_case_analysis(instruction=data.instruction, text=data.distilled_text, result=data.truth, additional_info=data.constraint)
149
+ # wrapped_good_case_analysis = f"**Analysis**: {good_case_alaysis}"
150
+ # wrapped_instruction = f"**Task**: {data.instruction}"
151
+ # wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
152
+ # wrapped_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
153
+ # if data.task == "Base":
154
+ # content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
155
+ # else:
156
+ # content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
157
+ # self.repository.update_case(data.task, embed_index, str_index, content, "good")
158
+
159
+ # def update_bad_case(self, data: DataPoint):
160
+ # if data.truth == "" :
161
+ # print("No truth value provided.")
162
+ # return
163
+ # if normalize_obj(data.pred) == normalize_obj(data.truth):
164
+ # return
165
+ # embed_index, str_index = self.__get_index(data, "bad")
166
+ # _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "bad", 1)
167
+ # original_scores = original_scores.tolist()
168
+ # if original_scores[0] >= 0.9:
169
+ # print("The similar bad case is already in the corpus. Similarity Score: ", original_scores[0])
170
+ # return
171
+ # bad_case_reflection = self.__get_bad_case_reflection(instruction=data.instruction, text=data.distilled_text, original_answer=data.pred, correct_answer=data.truth, additional_info=data.constraint)
172
+ # wrapped_bad_case_reflection = f"**Reflection**: {bad_case_reflection}"
173
+ # wrapper_original_answer = f"**Original Answer**: {json.dumps(data.pred)}"
174
+ # wrapper_correct_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
175
+ # wrapped_instruction = f"**Task**: {data.instruction}"
176
+ # wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
177
+ # if data.task == "Base":
178
+ # content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
179
+ # else:
180
+ # content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
181
+ # self.repository.update_case(data.task, embed_index, str_index, content, "bad")
182
+
183
+ # def update_case(self, data: DataPoint):
184
+ # self.update_good_case(data)
185
+ # self.update_bad_case(data)
186
+ # self.repository.update_corpus()
187
+
188
+
189
+
+ import json
+ import os
+ import torch
+ import numpy as np
+ from utils import *
+ from sentence_transformers import SentenceTransformer
+ from rapidfuzz import process
+ from models import *
+ import copy
+
+ import warnings
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ warnings.filterwarnings("ignore", category=FutureWarning, message=r".*clean_up_tokenization_spaces*")
+
+ # NOTE: the case repository is disabled in this commit; every method below keeps
+ # its original implementation as comments and currently just returns None.
+ class CaseRepository:
+     def __init__(self):
+         # self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
+         # self.embedder.to(device)
+         # self.corpus = self.load_corpus()
+         # self.embedded_corpus = self.embed_corpus()
+         pass
+
+     def load_corpus(self):
+         # with open(os.path.join(os.path.dirname(__file__), "case_repository.json")) as file:
+         #     corpus = json.load(file)
+         # return corpus
+         pass
+
+     def update_corpus(self):
+         # try:
+         #     with open(os.path.join(os.path.dirname(__file__), "case_repository.json"), "w") as file:
+         #         json.dump(self.corpus, file, indent=2)
+         # except Exception as e:
+         #     print(f"Error when updating corpus: {e}")
+         pass
+
+     def embed_corpus(self):
+         # embedded_corpus = {}
+         # for key, content in self.corpus.items():
+         #     good_index = [item['index']['embed_index'] for item in content['good']]
+         #     encoded_good_index = self.embedder.encode(good_index, convert_to_tensor=True).to(device)
+         #     bad_index = [item['index']['embed_index'] for item in content['bad']]
+         #     encoded_bad_index = self.embedder.encode(bad_index, convert_to_tensor=True).to(device)
+         #     embedded_corpus[key] = {"good": encoded_good_index, "bad": encoded_bad_index}
+         # return embedded_corpus
+         pass
+
+     def get_similarity_scores(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2):
+         # # Embedding similarity match
+         # encoded_embed_query = self.embedder.encode(embed_index, convert_to_tensor=True).to(device)
+         # embedding_similarity_matrix = self.embedder.similarity(encoded_embed_query, self.embedded_corpus[task][case_type])
+         # embedding_similarity_scores = embedding_similarity_matrix[0].to(device)
+
+         # # String similarity match
+         # str_match_corpus = [item['index']['str_index'] for item in self.corpus[task][case_type]]
+         # str_similarity_results = process.extract(str_index, str_match_corpus, limit=len(str_match_corpus))
+         # scores_dict = {match[0]: match[1] for match in str_similarity_results}
+         # scores_in_order = [scores_dict[candidate] for candidate in str_match_corpus]
+         # str_similarity_scores = torch.tensor(scores_in_order, dtype=torch.float32).to(device)
+
+         # # Normalize scores, then combine embedding and string similarity 50/50
+         # embedding_score_range = embedding_similarity_scores.max() - embedding_similarity_scores.min()
+         # str_score_range = str_similarity_scores.max() - str_similarity_scores.min()
+         # if embedding_score_range > 0:
+         #     embed_norm_scores = (embedding_similarity_scores - embedding_similarity_scores.min()) / embedding_score_range
+         # else:
+         #     embed_norm_scores = embedding_similarity_scores
+         # if str_score_range > 0:
+         #     str_norm_scores = (str_similarity_scores - str_similarity_scores.min()) / str_score_range
+         # else:
+         #     str_norm_scores = str_similarity_scores / 100
+
+         # combined_scores = 0.5 * embed_norm_scores + 0.5 * str_norm_scores
+         # original_combined_scores = 0.5 * embedding_similarity_scores + 0.5 * str_similarity_scores / 100
+
+         # scores, indices = torch.topk(combined_scores, k=min(top_k, combined_scores.size(0)))
+         # original_scores, original_indices = torch.topk(original_combined_scores, k=min(top_k, original_combined_scores.size(0)))
+         # return scores, indices, original_scores, original_indices
+         pass
+
+     def query_case(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2) -> list:
+         # _, indices, _, _ = self.get_similarity_scores(task, embed_index, str_index, case_type, top_k)
+         # top_matches = [self.corpus[task][case_type][idx]["content"] for idx in indices]
+         # return top_matches
+         pass
+
+     def update_case(self, task: TaskType, embed_index="", str_index="", content="", case_type=""):
+         # self.corpus[task][case_type].append({"index": {"embed_index": embed_index, "str_index": str_index}, "content": content})
+         # self.embedded_corpus[task][case_type] = torch.cat([self.embedded_corpus[task][case_type], self.embedder.encode([embed_index], convert_to_tensor=True).to(device)], dim=0)
+         # print(f"Case updated for {task} task.")
+         pass
+
+ class CaseRepositoryHandler:
+     def __init__(self, llm: BaseEngine):
+         self.repository = CaseRepository()
+         self.llm = llm
+
+     def __get_good_case_analysis(self, instruction="", text="", result="", additional_info=""):
+         # prompt = good_case_analysis_instruction.format(
+         #     instruction=instruction, text=text, result=result, additional_info=additional_info
+         # )
+         # for _ in range(3):
+         #     response = self.llm.get_chat_response(prompt)
+         #     response = extract_json_dict(response)
+         #     if not isinstance(response, dict):
+         #         return response
+         # return None
+         pass
+
+     def __get_bad_case_reflection(self, instruction="", text="", original_answer="", correct_answer="", additional_info=""):
+         # prompt = bad_case_reflection_instruction.format(
+         #     instruction=instruction, text=text, original_answer=original_answer, correct_answer=correct_answer, additional_info=additional_info
+         # )
+         # for _ in range(3):
+         #     response = self.llm.get_chat_response(prompt)
+         #     response = extract_json_dict(response)
+         #     if not isinstance(response, dict):
+         #         return response
+         # return None
+         pass
+
+     def __get_index(self, data: DataPoint, case_type: str):
+         # set embed_index
+         # embed_index = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
+
+         # # set str_index
+         # if data.task == "Base":
+         #     str_index = f"**Task**: {data.instruction}"
+         # else:
+         #     str_index = f"{data.constraint}"
+
+         # if case_type == "bad":
+         #     str_index += f"\n\n**Original Result**: {json.dumps(data.pred)}"
+
+         # return embed_index, str_index
+         pass
+
+     def query_good_case(self, data: DataPoint):
+         # embed_index, str_index = self.__get_index(data, "good")
+         # return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="good")
+         pass
+
+     def query_bad_case(self, data: DataPoint):
+         # embed_index, str_index = self.__get_index(data, "bad")
+         # return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="bad")
+         pass
+
+     def update_good_case(self, data: DataPoint):
+         # if data.truth == "":
+         #     print("No truth value provided.")
+         #     return
+         # embed_index, str_index = self.__get_index(data, "good")
+         # _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "good", 1)
+         # original_scores = original_scores.tolist()
+         # if original_scores[0] >= 0.9:
+         #     print("The similar good case is already in the corpus. Similarity Score: ", original_scores[0])
+         #     return
+         # good_case_analysis = self.__get_good_case_analysis(instruction=data.instruction, text=data.distilled_text, result=data.truth, additional_info=data.constraint)
+         # wrapped_good_case_analysis = f"**Analysis**: {good_case_analysis}"
+         # wrapped_instruction = f"**Task**: {data.instruction}"
+         # wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
+         # wrapped_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
+         # if data.task == "Base":
+         #     content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
+         # else:
+         #     content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
+         # self.repository.update_case(data.task, embed_index, str_index, content, "good")
+         pass
+
+     def update_bad_case(self, data: DataPoint):
+         # if data.truth == "":
+         #     print("No truth value provided.")
+         #     return
+         # if normalize_obj(data.pred) == normalize_obj(data.truth):
+         #     return
+         # embed_index, str_index = self.__get_index(data, "bad")
+         # _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "bad", 1)
+         # original_scores = original_scores.tolist()
+         # if original_scores[0] >= 0.9:
+         #     print("The similar bad case is already in the corpus. Similarity Score: ", original_scores[0])
+         #     return
+         # bad_case_reflection = self.__get_bad_case_reflection(instruction=data.instruction, text=data.distilled_text, original_answer=data.pred, correct_answer=data.truth, additional_info=data.constraint)
+         # wrapped_bad_case_reflection = f"**Reflection**: {bad_case_reflection}"
+         # wrapped_original_answer = f"**Original Answer**: {json.dumps(data.pred)}"
+         # wrapped_correct_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
+         # wrapped_instruction = f"**Task**: {data.instruction}"
+         # wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
+         # if data.task == "Base":
+         #     content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapped_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapped_correct_answer}"
+         # else:
+         #     content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapped_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapped_correct_answer}"
+         # self.repository.update_case(data.task, embed_index, str_index, content, "bad")
+         pass
+
+     def update_case(self, data: DataPoint):
+         # self.update_good_case(data)
+         # self.update_bad_case(data)
+         # self.repository.update_corpus()
+         pass
+
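For reference, the commented-out `get_similarity_scores` ranks stored cases by a 50/50 blend of min-max-normalized embedding similarity and rapidfuzz string similarity (the latter on a 0-100 scale). A toy illustration of that blending, with made-up scores:

import torch
embed = torch.tensor([0.82, 0.40, 0.91])   # cosine similarities against the query
fuzzy = torch.tensor([55.0, 90.0, 70.0])   # rapidfuzz scores on a 0-100 scale
embed_norm = (embed - embed.min()) / (embed.max() - embed.min())
fuzzy_norm = (fuzzy - fuzzy.min()) / (fuzzy.max() - fuzzy.min())
combined = 0.5 * embed_norm + 0.5 * fuzzy_norm   # tensor([0.4118, 0.5000, 0.7143])
top2 = torch.topk(combined, k=2).indices         # tensor([2, 1])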
src/modules/knowledge_base/schema_repository.py ADDED
@@ -0,0 +1,91 @@
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+ from langchain_core.output_parsers import JsonOutputParser
+
+ # ==================================================================== #
+ #                              NER TASK                                #
+ # ==================================================================== #
+ class Entity(BaseModel):
+     name: str = Field(description="The specific name of the entity.")
+     type: str = Field(description="The type or category that the entity belongs to.")
+
+ class EntityList(BaseModel):
+     entity_list: List[Entity] = Field(description="Named entities appearing in the text.")
+
+ # ==================================================================== #
+ #                               RE TASK                                #
+ # ==================================================================== #
+ class Relation(BaseModel):
+     head: str = Field(description="The starting entity in the relationship.")
+     tail: str = Field(description="The ending entity in the relationship.")
+     relation: str = Field(description="The predicate that defines the relationship between the two entities.")
+
+ class RelationList(BaseModel):
+     relation_list: List[Relation] = Field(description="The collection of relationships between various entities.")
+
+ # ==================================================================== #
+ #                               EE TASK                                #
+ # ==================================================================== #
+ class Event(BaseModel):
+     event_type: str = Field(description="The type of the event.")
+     event_trigger: str = Field(description="A specific word or phrase that indicates the occurrence of the event.")
+     event_argument: dict = Field(description="The arguments or participants involved in the event.")
+
+ class EventList(BaseModel):
+     event_list: List[Event] = Field(description="The events presented in the text.")
+
+ # ==================================================================== #
+ #                          TEXT DESCRIPTION                            #
+ # ==================================================================== #
+ class TextDescription(BaseModel):
+     field: str = Field(description="The field of the given text, such as 'Science', 'Literature', 'Business', 'Medicine', 'Entertainment', etc.")
+     genre: str = Field(description="The genre of the given text, such as 'Article', 'Novel', 'Dialog', 'Blog', 'Manual', 'Expository', 'News Report', 'Research Paper', etc.")
+
+ # ==================================================================== #
+ #                         USER DEFINED SCHEMA                          #
+ # ==================================================================== #
+
+ # --------------------------- Research Paper ----------------------- #
+ class MetaData(BaseModel):
+     title: str = Field(description="The title of the article")
+     authors: List[str] = Field(description="The list of the article's authors")
+     abstract: str = Field(description="The article's abstract")
+     key_words: List[str] = Field(description="The key words associated with the article")
+
+ class Baseline(BaseModel):
+     method_name: str = Field(description="The name of the baseline method")
+     proposed_solution: str = Field(description="The proposed solution in detail")
+     performance_metrics: str = Field(description="The performance metrics of the method and comparative analysis")
+
+ class ExtractionTarget(BaseModel):
+     key_contributions: List[str] = Field(description="The key contributions of the article")
+     limitation_of_sota: str = Field(description="The summarized limitations of existing work")
+     proposed_solution: str = Field(description="The proposed solution in detail")
+     baselines: List[Baseline] = Field(description="The list of baseline methods and their details")
+     performance_metrics: str = Field(description="The performance metrics of the method and comparative analysis")
+     paper_limitations: str = Field(description="The limitations of the paper's proposed solution")
+
+ # --------------------------- News ----------------------- #
+ class Person(BaseModel):
+     name: str = Field(description="The name of the person")
+     identity: Optional[str] = Field(description="The occupation, status or characteristics of the person.")
+     role: Optional[str] = Field(description="The role or function the person plays in an event.")
+
+ class NewsEvent(BaseModel):  # renamed from `Event` to avoid shadowing the EE-task class above
+     name: str = Field(description="Name of the event")
+     time: Optional[str] = Field(description="Time when the event took place")
+     people_involved: Optional[List[Person]] = Field(description="People involved in the event")
+     cause: Optional[str] = Field(default=None, description="Reason for the event, if applicable")
+     process: Optional[str] = Field(description="Details of the event process")
+     result: Optional[str] = Field(default=None, description="Result or outcome of the event")
+
+ class NewsReport(BaseModel):
+     title: str = Field(description="The title or headline of the news report")
+     summary: str = Field(description="A brief summary of the news report")
+     publication_date: Optional[str] = Field(description="The publication date of the report")
+     keywords: Optional[List[str]] = Field(description="List of keywords or topics covered in the news report")
+     events: List[NewsEvent] = Field(description="Events covered in the news report")
+     quotes: Optional[List[str]] = Field(default=None, description="Quotes related to the news, if any")
+     viewpoints: Optional[List[str]] = Field(default=None, description="Different viewpoints regarding the news")
+
+ # --------- You can customize new extraction schemas below -------- #
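As the closing comment says, new extraction targets are added here as ordinary pydantic models; `SchemaAgent.get_retrieved_schema` looks them up by class name via the `output_schema` field. A minimal sketch of such a custom schema (the `Contract` model is illustrative, not part of the commit):

class Contract(BaseModel):
    parties: List[str] = Field(description="The parties signing the contract")
    effective_date: Optional[str] = Field(default=None, description="When the contract takes effect")
    obligations: List[str] = Field(description="The obligations each party agrees to")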
src/modules/reflection_agent.py ADDED
@@ -0,0 +1,74 @@
+ from models import *
+ from utils import *
+ from .extraction_agent import ExtractionAgent
+ from .knowledge_base.case_repository import CaseRepositoryHandler
+
+ class ReflectionGenerator:
+     def __init__(self, llm: BaseEngine):
+         self.llm = llm
+
+     def get_reflection(self, instruction="", examples="", text="", schema="", result=""):
+         result = json.dumps(result)
+         examples = bad_case_wrapper(examples)
+         prompt = reflect_instruction.format(instruction=instruction, examples=examples, text=text, schema=schema, result=result)
+         response = self.llm.get_chat_response(prompt)
+         response = extract_json_dict(response)
+         return response
+
+ class ReflectionAgent:
+     def __init__(self, llm: BaseEngine, case_repo: CaseRepositoryHandler):
+         self.llm = llm
+         self.module = ReflectionGenerator(llm=llm)
+         self.extractor = ExtractionAgent(llm=llm, case_repo=case_repo)
+         self.case_repo = case_repo
+         self.methods = ["reflect_with_case"]
+
+     def __select_result(self, result_list):
+         dict_objects = [obj for obj in result_list if isinstance(obj, dict)]  # prefer the longest dict result
+         if dict_objects:
+             selected_obj = max(dict_objects, key=lambda d: len(json.dumps(d)))
+         else:
+             selected_obj = max(result_list, key=lambda o: len(json.dumps(o)))
+         return selected_obj
+
+     def __self_consistency_check(self, data: DataPoint):
+         extract_func = list(data.result_trajectory.keys())[-1]
+         if hasattr(self.extractor, extract_func):
+             result_trials = []
+             result_trials.append(data.result_list)
+             extract_func = getattr(self.extractor, extract_func)
+             temperature = [0.5, 1]  # re-sample twice at higher temperatures for the vote
+             for index in range(2):
+                 self.module.llm.set_hyperparameter(temperature=temperature[index])
+                 data = extract_func(data)
+                 result_trials.append(data.result_list)
+             self.module.llm.set_hyperparameter()
+             consistent_result = []
+             reflect_index = []  # chunks with no majority result are sent to reflection
+             for index, elements in enumerate(zip(*result_trials)):
+                 normalized_elements = [normalize_obj(e) for e in elements]
+                 element_counts = Counter(normalized_elements)
+                 selected_element = next((elements[i] for i, element in enumerate(normalized_elements)
+                                          if element_counts[element] >= 2), None)
+                 if selected_element is None:
+                     selected_element = self.__select_result(elements)
+                     reflect_index.append(index)
+                 consistent_result.append(selected_element)
+             data.set_result_list(consistent_result)
+             return reflect_index
+
+     def reflect_with_case(self, data: DataPoint):
+         if not data.result_list:
+             return data
+         reflect_index = self.__self_consistency_check(data)
+         reflected_result_list = data.result_list
+         for idx in reflect_index:
+             text = data.chunk_text_list[idx]
+             result = data.result_list[idx]
+             examples = json.dumps(self.case_repo.query_bad_case(data))
+             reflected_res = self.module.get_reflection(instruction=data.instruction, examples=examples, text=text, schema=data.output_schema, result=result)
+             reflected_result_list[idx] = reflected_res
+         data.set_result_list(reflected_result_list)
+         function_name = current_function_name()
+         data.update_trajectory(function_name, data.result_list)
+         return data
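The self-consistency check above runs the same extraction three times (the two re-runs at temperatures 0.5 and 1), normalizes each chunk's result with `normalize_obj`, and keeps any result that appears at least twice; only chunks with no such majority are reflected on. A rough sketch of the vote for a single chunk (values illustrative):

from collections import Counter

trials = [{"name": "Apple"}, {"name": "apple"}, {"name": "Banana"}]   # one chunk, three runs
normalized = [normalize_obj(t) for t in trials]                       # case/space-insensitive keys
counts = Counter(normalized)
majority = next((trials[i] for i, n in enumerate(normalized) if counts[n] >= 2), None)
# majority == {"name": "Apple"}, since "Apple" and "apple" normalize identically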
src/modules/schema_agent.py ADDED
@@ -0,0 +1,148 @@
+ from models import *
+ from utils import *
+ from .knowledge_base import schema_repository
+ from langchain_core.output_parsers import JsonOutputParser
+
+ class SchemaAnalyzer:
+     def __init__(self, llm: BaseEngine):
+         self.llm = llm
+
+     def serialize_schema(self, schema) -> str:
+         if isinstance(schema, (str, list, dict, set, tuple)):
+             return schema
+         try:
+             parser = JsonOutputParser(pydantic_object=schema)
+             schema_description = parser.get_format_instructions()
+             schema_content = re.search(r'```(.*?)```', schema_description, re.DOTALL).group(1)  # take the schema block itself, not the findall list
+             explanation = "For example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}}, the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance."
+             schema = f"{schema_content}\n\n{explanation}"
+         except Exception:
+             return schema
+         return schema
+
+     def redefine_text(self, text_analysis):
+         try:
+             field = text_analysis['field']
+             genre = text_analysis['genre']
+         except Exception:
+             return text_analysis
+         prompt = f"This text is from the field of {field} and represents the genre of {genre}."
+         return prompt
+
+     def get_text_analysis(self, text: str):
+         output_schema = self.serialize_schema(schema_repository.TextDescription)
+         prompt = text_analysis_instruction.format(examples="", text=text, schema=output_schema)
+         response = self.llm.get_chat_response(prompt)
+         response = extract_json_dict(response)
+         response = self.redefine_text(response)
+         return response
+
+     def get_deduced_schema_json(self, instruction: str, text: str, distilled_text: str):
+         prompt = deduced_schema_json_instruction.format(examples=example_wrapper(json_schema_examples), instruction=instruction, distilled_text=distilled_text, text=text)
+         response = self.llm.get_chat_response(prompt)
+         response = extract_json_dict(response)
+         code = response
+         print(f"Deduced Schema in JSON: \n{response}\n\n")
+         return code, response
+
+     def get_deduced_schema_code(self, instruction: str, text: str, distilled_text: str):
+         prompt = deduced_schema_code_instruction.format(examples=example_wrapper(code_schema_examples), instruction=instruction, distilled_text=distilled_text, text=text)
+         response = self.llm.get_chat_response(prompt)
+         code_blocks = re.findall(r'```[^\n]*\n(.*?)\n```', response, re.DOTALL)
+         if code_blocks:
+             try:
+                 code_block = code_blocks[-1]
+                 namespace = {}
+                 exec(code_block, namespace)  # note: executes model-generated code in a fresh namespace
+                 schema = namespace.get('ExtractionTarget')
+                 if schema is not None:
+                     index = code_block.find("class")
+                     code = code_block[index:]
+                     print(f"Deduced Schema in Code: \n{code}\n\n")
+                     schema = self.serialize_schema(schema)
+                     return code, schema
+             except Exception as e:
+                 print(e)
+                 return self.get_deduced_schema_json(instruction, text, distilled_text)
+         return self.get_deduced_schema_json(instruction, text, distilled_text)
+
+ class SchemaAgent:
+     def __init__(self, llm: BaseEngine):
+         self.llm = llm
+         self.module = SchemaAnalyzer(llm=llm)
+         self.schema_repo = schema_repository
+         self.methods = ["get_default_schema", "get_retrieved_schema", "get_deduced_schema"]
+
+     def __preprocess_text(self, data: DataPoint):
+         if data.use_file:
+             data.chunk_text_list = chunk_file(data.file_path)
+         else:
+             data.chunk_text_list = chunk_str(data.text)
+         if data.task == "NER":
+             data.print_schema = """
+             class Entity(BaseModel):
+                 name : str = Field(description="The specific name of the entity.")
+                 type : str = Field(description="The type or category that the entity belongs to.")
+             class EntityList(BaseModel):
+                 entity_list : List[Entity] = Field(description="Named entities appearing in the text.")
+             """
+         elif data.task == "RE":
+             data.print_schema = """
+             class Relation(BaseModel):
+                 head : str = Field(description="The starting entity in the relationship.")
+                 tail : str = Field(description="The ending entity in the relationship.")
+                 relation : str = Field(description="The predicate that defines the relationship between the two entities.")
+
+             class RelationList(BaseModel):
+                 relation_list : List[Relation] = Field(description="The collection of relationships between various entities.")
+             """
+         elif data.task == "EE":
+             data.print_schema = """
+             class Event(BaseModel):
+                 event_type : str = Field(description="The type of the event.")
+                 event_trigger : str = Field(description="A specific word or phrase that indicates the occurrence of the event.")
+                 event_argument : dict = Field(description="The arguments or participants involved in the event.")
+
+             class EventList(BaseModel):
+                 event_list : List[Event] = Field(description="The events presented in the text.")
+             """
+         return data
+
+     def get_default_schema(self, data: DataPoint):
+         data = self.__preprocess_text(data)
+         default_schema = config['agent']['default_schema']
+         data.set_schema(default_schema)
+         function_name = current_function_name()
+         data.update_trajectory(function_name, default_schema)
+         return data
+
+     def get_retrieved_schema(self, data: DataPoint):
+         data = self.__preprocess_text(data)
+         schema_name = data.output_schema
+         schema_class = getattr(self.schema_repo, schema_name, None)
+         if schema_class is not None:
+             schema = self.module.serialize_schema(schema_class)
+             default_schema = config['agent']['default_schema']
+             data.set_schema(f"{default_schema}\n{schema}")
+             function_name = current_function_name()
+             data.update_trajectory(function_name, schema)
+         else:
+             return self.get_default_schema(data)
+         return data
+
+     def get_deduced_schema(self, data: DataPoint):
+         data = self.__preprocess_text(data)
+         target_text = data.chunk_text_list[0]
+         analysed_text = self.module.get_text_analysis(target_text)
+         if len(data.chunk_text_list) > 1:
+             prefix = "Below is a portion of the text to be extracted. "
+             analysed_text = f"{prefix}\n{target_text}"
+         distilled_text = analysed_text  # get_text_analysis already returns the distilled description
+         code, deduced_schema = self.module.get_deduced_schema_code(data.instruction, target_text, distilled_text)
+         data.print_schema = code
+         data.set_distilled_text(distilled_text)
+         default_schema = config['agent']['default_schema']
+         data.set_schema(f"{default_schema}\n{deduced_schema}")
+         function_name = current_function_name()
+         data.update_trajectory(function_name, deduced_schema)
+         return data
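`get_deduced_schema_code` expects the model's reply to contain a triple-backtick code block whose last occurrence defines a pydantic class named `ExtractionTarget`; the block is exec'd in a fresh namespace (so it must carry its own imports) and the class is then serialized into format instructions, falling back to the JSON variant on any failure. An illustrative reply body that would parse correctly:

from pydantic import BaseModel, Field
from typing import List

class ExtractionTarget(BaseModel):
    product_name: str = Field(description="The product being reviewed")
    pros: List[str] = Field(description="Positive points mentioned in the review")
    cons: List[str] = Field(description="Negative points mentioned in the review")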
src/pipeline.py ADDED
@@ -0,0 +1,92 @@
+ from typing import Literal
+ from models import *
+ from utils import *
+ from modules import *
+
+ class Pipeline:
+     def __init__(self, llm: BaseEngine):
+         self.llm = llm
+         self.case_repo = CaseRepositoryHandler(llm=llm)
+         self.schema_agent = SchemaAgent(llm=llm)
+         self.extraction_agent = ExtractionAgent(llm=llm, case_repo=self.case_repo)
+         self.reflection_agent = ReflectionAgent(llm=llm, case_repo=self.case_repo)
+
+     def __init_method(self, data: DataPoint, process_method):
+         default_order = ["schema_agent", "extraction_agent", "reflection_agent"]
+         # The schema method follows the task type: open-ended "Base" tasks deduce
+         # a schema, while predefined tasks (NER/RE/EE) retrieve their preset one.
+         if data.task == "Base":
+             process_method["schema_agent"] = "get_deduced_schema"
+         else:
+             process_method["schema_agent"] = "get_retrieved_schema"
+         if "extraction_agent" not in process_method:
+             process_method["extraction_agent"] = "extract_information_direct"
+         sorted_process_method = {key: process_method[key] for key in default_order if key in process_method}
+         return sorted_process_method
+
+     def __init_data(self, data: DataPoint):
+         if data.task == "NER":
+             data.instruction = config['agent']['default_ner']
+             data.output_schema = "EntityList"
+         elif data.task == "RE":
+             data.instruction = config['agent']['default_re']
+             data.output_schema = "RelationList"
+         elif data.task == "EE":
+             data.instruction = config['agent']['default_ee']
+             data.output_schema = "EventList"
+         return data
+
+     # main entry
+     def get_extract_result(self,
+                            task: TaskType,
+                            instruction: str = "",
+                            text: str = "",
+                            output_schema: str = "",
+                            constraint: str = "",
+                            use_file: bool = False,
+                            file_path: str = "",
+                            truth: str = "",
+                            mode: str = "quick",  # a mode name from config.yaml, or a dict mapping agents to methods
+                            update_case: bool = False
+                            ):
+         data = DataPoint(task=task, instruction=instruction, text=text, output_schema=output_schema, constraint=constraint, use_file=use_file, file_path=file_path, truth=truth)
+         data = self.__init_data(data)
+         if mode in config['agent']['mode'].keys():
+             process_method = config['agent']['mode'][mode]
+         else:
+             process_method = mode
+         sorted_process_method = self.__init_method(data, process_method)
+         print_schema = False
+         frontend_schema = ""
+         frontend_res = ""
+         # Information extraction
+         for agent_name, method_name in sorted_process_method.items():
+             agent = getattr(self, agent_name, None)
+             if not agent:
+                 raise AttributeError(f"{agent_name} does not exist.")
+             method = getattr(agent, method_name, None)
+             if not method:
+                 raise AttributeError(f"Method '{method_name}' not found in {agent_name}.")
+             data = method(data)
+             if not print_schema and data.print_schema:
+                 print("Schema: \n", data.print_schema)
+                 frontend_schema = data.print_schema
+                 print_schema = True
+         data = self.extraction_agent.summarize_answer(data)
+         print("Extraction Result: \n", json.dumps(data.pred, indent=2))
+         frontend_res = data.pred
+         # Case update
+         if update_case:
+             if data.truth == "":
+                 truth = input("Please enter the correct answer you prefer, or press Enter to accept the current answer: ")
+                 if truth.strip() == "":
+                     data.truth = data.pred
+                 else:
+                     data.truth = extract_json_dict(truth)
+             self.case_repo.update_case(data)
+
+         # return result
+         result = data.pred
+         trajectory = data.get_result_trajectory()
+
+         return result, trajectory, frontend_schema, frontend_res
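A minimal usage sketch of the pipeline (the engine class name `ChatGPT` is hypothetical; as `run.py` below shows, the real class is resolved from `models` by the config's `category` field):

import models
from pipeline import Pipeline

llm = getattr(models, "ChatGPT")("gpt-4o", "sk-...")   # hypothetical engine class and credentials
pipeline = Pipeline(llm)
result, trajectory, schema, res = pipeline.get_extract_result(
    task="NER",
    text="Steve Jobs co-founded Apple in Cupertino.",
)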
src/run.py ADDED
@@ -0,0 +1,90 @@
+ import argparse
+ import os
+ import yaml
+ from pipeline import Pipeline
+ from typing import Literal
+ import models
+ from models import *
+ from utils import *
+ from modules import *
+
+ def load_extraction_config(yaml_path):
+     # Read the YAML content from the given file path
+     if not os.path.exists(yaml_path):
+         print(f"Error: The config file '{yaml_path}' does not exist.")
+         return {}
+
+     with open(yaml_path, 'r') as file:
+         config = yaml.safe_load(file)
+
+     # Pull out the 'model' and 'extraction' sections of the config
+     model_config = config.get('model', {})
+     extraction_config = config.get('extraction', {})
+     # model config
+     model_name_or_path = model_config.get('model_name_or_path', "")
+     model_category = model_config.get('category', "")
+     api_key = model_config.get('api_key', "")
+     base_url = model_config.get('base_url', "")
+
+     # extraction config
+     task = extraction_config.get('task', "")
+     instruction = extraction_config.get('instruction', "")
+     text = extraction_config.get('text', "")
+     output_schema = extraction_config.get('output_schema', "")
+     constraint = extraction_config.get('constraint', "")
+     truth = extraction_config.get('truth', "")
+     use_file = extraction_config.get('use_file', False)
+     file_path = extraction_config.get('file_path', "")  # file to extract from when use_file is true
+     mode = extraction_config.get('mode', "quick")
+     update_case = extraction_config.get('update_case', False)
+
+     # Return a dict containing these values
+     return {
+         "model": {
+             "model_name_or_path": model_name_or_path,
+             "category": model_category,
+             "api_key": api_key,
+             "base_url": base_url
+         },
+         "extraction": {
+             "task": task,
+             "instruction": instruction,
+             "text": text,
+             "output_schema": output_schema,
+             "constraint": constraint,
+             "truth": truth,
+             "use_file": use_file,
+             "file_path": file_path,
+             "mode": mode,
+             "update_case": update_case
+         }
+     }
+
+
+ def main():
+     # Create the command-line argument parser
+     parser = argparse.ArgumentParser(description='Run the extraction model.')
+     parser.add_argument('--config', type=str, required=True,
+                         help='Path to the YAML configuration file.')
+
+     # Parse command-line arguments
+     args = parser.parse_args()
+
+     # Load the configuration
+     config = load_extraction_config(args.config)
+     model_config = config['model']
+     extraction_config = config['extraction']
+     clazz = getattr(models, model_config['category'], None)
+     if clazz is None:
+         print(f"Error: The model category '{model_config['category']}' is not supported.")
+         return
+     if model_config['api_key'] == "":
+         model = clazz(model_config['model_name_or_path'])
+     else:
+         model = clazz(model_config['model_name_or_path'], model_config['api_key'], model_config['base_url'])
+     pipeline = Pipeline(model)
+     result, trajectory, *_ = pipeline.get_extract_result(task=extraction_config['task'], instruction=extraction_config['instruction'], text=extraction_config['text'], output_schema=extraction_config['output_schema'], constraint=extraction_config['constraint'], use_file=extraction_config['use_file'], file_path=extraction_config['file_path'], truth=extraction_config['truth'], mode=extraction_config['mode'], update_case=extraction_config['update_case'])
+     return
+
+ if __name__ == "__main__":
+     main()
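For reference, a YAML file consumed by `load_extraction_config` would look like this (values illustrative; only the keys read above are used, and the `category` name is hypothetical):

model:
  model_name_or_path: gpt-4o
  category: ChatGPT          # resolved via getattr(models, category)
  api_key: sk-...
  base_url: ""
extraction:
  task: NER
  text: "Steve Jobs co-founded Apple in Cupertino."
  use_file: false
  mode: quick
  update_case: false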
src/utils/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .process import *
+ from .data_def import DataPoint, TaskType
+
src/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (274 Bytes). View file
 
src/utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (239 Bytes). View file
 
src/utils/__pycache__/data_def.cpython-311.pyc ADDED
Binary file (3.07 kB). View file
 
src/utils/__pycache__/data_def.cpython-39.pyc ADDED
Binary file (2.3 kB). View file
 
src/utils/__pycache__/process.cpython-311.pyc ADDED
Binary file (10.7 kB). View file
 
src/utils/__pycache__/process.cpython-39.pyc ADDED
Binary file (5.98 kB). View file
 
src/utils/data_def.py ADDED
@@ -0,0 +1,59 @@
+ from typing import Literal
+ from models import *
+ from .process import *
+ # predefined processing logic for routine extraction tasks
+ TaskType = Literal["NER", "RE", "EE", "Base"]
+ ModelType = Literal["gpt-3.5-turbo", "gpt-4o"]
+
+ class DataPoint:
+     def __init__(self,
+                  task: TaskType = "Base",
+                  instruction: str = "",
+                  text: str = "",
+                  output_schema: str = "",
+                  constraint: str = "",
+                  use_file: bool = False,
+                  file_path: str = "",
+                  truth: str = ""):
+         """
+         Initialize a DataPoint instance.
+         """
+         # task information
+         self.task = task
+         self.instruction = instruction
+         self.text = text
+         self.output_schema = output_schema
+         self.constraint = constraint
+         self.use_file = use_file
+         self.file_path = file_path
+         self.truth = extract_json_dict(truth)
+         # temp storage
+         self.print_schema = ""
+         self.distilled_text = ""
+         self.chunk_text_list = []
+         # result feedback
+         self.result_list = []
+         self.result_trajectory = {}
+         self.pred = ""
+
+     def set_constraint(self, constraint):
+         self.constraint = constraint
+
+     def set_schema(self, output_schema):
+         self.output_schema = output_schema
+
+     def set_pred(self, pred):
+         self.pred = pred
+
+     def set_result_list(self, result_list):
+         self.result_list = result_list
+
+     def set_distilled_text(self, distilled_text):
+         self.distilled_text = distilled_text
+
+     def update_trajectory(self, function, result):
+         if function not in self.result_trajectory:
+             self.result_trajectory.update({function: result})
+
+     def get_result_trajectory(self):
+         return {"instruction": self.instruction, "text": self.text, "constraint": self.constraint, "trajectory": self.result_trajectory, "pred": self.pred}
src/utils/process.py ADDED
@@ -0,0 +1,181 @@
+ """
+ Data Processing Functions.
+ Supports:
+ - Segmentation of long text
+ - Segmentation of file content
+ """
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader, BSHTMLLoader, JSONLoader
+ # NOTE: sent_tokenize requires the NLTK "punkt" tokenizer data (nltk.download('punkt')).
+ from nltk.tokenize import sent_tokenize
+ from collections import Counter
+ import re
+ import json
+ import yaml
+ import os
+ import inspect
+ with open(os.path.join(os.path.dirname(__file__), "..", "config.yaml")) as file:
+     config = yaml.safe_load(file)
+
+ # Split a long string into sentence-aligned chunks under the configured token limit
+ def chunk_str(text):
+     sentences = sent_tokenize(text)
+     chunks = []
+     current_chunk = []
+     current_length = 0
+
+     for sentence in sentences:
+         token_count = len(sentence.split())
+         if current_length + token_count <= config['agent']['chunk_token_limit']:
+             current_chunk.append(sentence)
+             current_length += token_count
+         else:
+             if current_chunk:
+                 chunks.append(' '.join(current_chunk))
+             current_chunk = [sentence]
+             current_length = token_count
+     if current_chunk:
+         chunks.append(' '.join(current_chunk))
+     return chunks
+
+ # Load and split the content of a file
+ def chunk_file(file_path):
+     pages = []
+
+     if file_path.endswith(".pdf"):
+         loader = PyPDFLoader(file_path)
+     elif file_path.endswith(".txt"):
+         loader = TextLoader(file_path)
+     elif file_path.endswith(".docx"):
+         loader = Docx2txtLoader(file_path)
+     elif file_path.endswith(".html"):
+         loader = BSHTMLLoader(file_path)
+     elif file_path.endswith(".json"):
+         loader = JSONLoader(file_path)
+     else:
+         raise ValueError("Unsupported file format")
+
+     pages = loader.load_and_split()
+     docs = ""
+     for item in pages:
+         docs += item.page_content
+     pages = chunk_str(docs)
+
+     return pages
+
+ def process_single_quotes(text):
+     result = re.sub(r"(?<!\w)'|'(?!\w)", '"', text)
+     return result
+
+ def remove_empty_values(data):
+     def is_empty(value):
+         return value is None or value == [] or value == "" or value == {}
+     if isinstance(data, dict):
+         return {
+             k: remove_empty_values(v)
+             for k, v in data.items()
+             if not is_empty(v)
+         }
+     elif isinstance(data, list):
+         return [
+             remove_empty_values(item)
+             for item in data
+             if not is_empty(item)
+         ]
+     else:
+         return data
+
+ def extract_json_dict(text):
+     if isinstance(text, dict):
+         return text
+     pattern = r'\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\})*)*\})*)*\}'
+     matches = re.findall(pattern, text)
+     if matches:
+         json_string = matches[-1]
+         json_string = process_single_quotes(json_string)
+         try:
+             json_dict = json.loads(json_string)
+             json_dict = remove_empty_values(json_dict)
+             if json_dict is None:
+                 return "No valid information found."
+             return json_dict
+         except json.JSONDecodeError:
+             return json_string
+     else:
+         return text
+
+ def good_case_wrapper(example: str):
+     if example is None or example == "":
+         return ""
+     example = f"\nHere are some examples:\n{example}\n(END OF EXAMPLES)\nRefer to the reasoning steps and analysis in the examples to help complete the extraction task below.\n\n"
+     return example
+
+ def bad_case_wrapper(example: str):
+     if example is None or example == "":
+         return ""
+     example = f"\nHere are some examples of bad cases:\n{example}\n(END OF EXAMPLES)\nRefer to the reflection rules and reflection steps in the examples to help optimize the original result below.\n\n"
+     return example
+
+ def example_wrapper(example: str):
+     if example is None or example == "":
+         return ""
+     example = f"\nHere are some examples:\n{example}\n(END OF EXAMPLES)\n\n"
+     return example
+
+ def remove_redundant_space(s):
+     s = ' '.join(s.split())
+     s = re.sub(r"\s*(,|:|\(|\)|\.|_|;|'|-)\s*", r'\1', s)
+     return s
+
+ def format_string(s):
+     s = remove_redundant_space(s)
+     s = s.lower()
+     s = s.replace('{', '').replace('}', '')
+     s = re.sub(r',+', ',', s)
+     s = re.sub(r'\.+', '.', s)
+     s = re.sub(r';+', ';', s)
+     s = s.replace('’', "'")
+     return s
+
+ def calculate_metrics(y_truth: set, y_pred: set):
+     TP = len(y_truth & y_pred)
+     FN = len(y_truth - y_pred)
+     FP = len(y_pred - y_truth)
+     precision = TP / (TP + FP) if (TP + FP) > 0 else 0
+     recall = TP / (TP + FN) if (TP + FN) > 0 else 0
+     f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+     return precision, recall, f1_score
+
+ def current_function_name():
+     try:
+         stack = inspect.stack()
+         if len(stack) > 1:
+             outer_func_name = stack[1].function
+             return outer_func_name
+         else:
+             print("No caller function found")
+             return None
+
+     except Exception as e:
+         print(f"An error occurred: {e}")
+         return None
+
+ def normalize_obj(value):
+     if isinstance(value, dict):
+         return frozenset((k, normalize_obj(v)) for k, v in value.items())
+     elif isinstance(value, (list, set, tuple)):
+         # Convert the Counter to a tuple so that it can be hashed
+         return tuple(Counter(map(normalize_obj, value)).items())
+     elif isinstance(value, str):
+         return format_string(value)
+     return value
+
+ def dict_list_to_set(data_list):
+     result_set = set()
+     try:
+         for dictionary in data_list:
+             value_tuple = tuple(format_string(value) for value in dictionary.values())
+             result_set.add(value_tuple)
+         return result_set
+     except Exception as e:
+         print(f"Failed to convert dictionary list to set: {data_list}")
+         return result_set
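As a quick illustration of `extract_json_dict` on a typical model reply: it grabs the last brace-balanced object, converts single-quoted keys and values to double quotes, and drops empty values.

reply = "Here is the result: {'entity_list': [{'name': 'Apple', 'type': 'Company'}], 'note': ''}"
extract_json_dict(reply)
# -> {'entity_list': [{'name': 'Apple', 'type': 'Company'}]}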
src/webui/__init__.py ADDED
@@ -0,0 +1 @@
+ from .interface import InterFace