diff --git a/config/environment.py b/config/environment.py index 3f52064..27ea6fc 100644 --- a/config/environment.py +++ b/config/environment.py @@ -24,8 +24,15 @@ class Environment(BaseSettings): NEO4J_USERNAME: Optional[str] = Field(default=os.getenv("NEO4J_USERNAME")) NEO4J_PASSWORD: Optional[str] = Field(default=os.getenv("NEO4J_PASSWORD")) # Bedrock + ## Generative Model BEDROCK_MODEL_ID: Optional[str] = Field( default=os.getenv("BEDROCK_MODEL_ID", "us.anthropic.claude-3-5-sonnet-20240620-v1:0")) + ## Embedding Model + BEDROCK_EMBEDDING_MODEL_ID: Optional[str] = Field( + default=os.getenv("BEDROCK_EMBEDDING_MODEL_ID", "amazon.titan-embed-text-v2:0")) # MongoDB MONGO_URI: Optional[str] = Field(default=os.getenv("MONGO_URI", "mongodb://root:pwd@127.0.0.1:27017?authSource=admin")) - MONGO_DB_NAME: Optional[str] = Field(default=os.getenv("MONGO_DB_NAME", "db0")) \ No newline at end of file + MONGO_DB_NAME: Optional[str] = Field(default=os.getenv("MONGO_DB_NAME", "db0")) + # QDrant + QDRANT_URL: Optional[str] = Field(default=os.getenv("QDRANT_URL", "http://localhost:6333")) + QDRANT_API_KEY: Optional[str] = Field(default=os.getenv("QDRANT_API_KEY")) \ No newline at end of file diff --git a/core/agent.py b/core/agent.py index 45a4503..8073dbb 100644 --- a/core/agent.py +++ b/core/agent.py @@ -6,6 +6,7 @@ from langgraph.graph import StateGraph from server import SocketManager +from vectorstore import QdrantClientManager from .base import LLMBedRockBase, GraphState from .graph import GraphAgent from .manager import ChatManager @@ -35,8 +36,10 @@ def __init__( username=env.NEO4J_USERNAME, password=env.NEO4J_PASSWORD, ) + self._vectorstore = QdrantClientManager() self._agent = GraphAgent( graph=self._graph, + vectorstore=self._vectorstore, llm=self._llm, chat_manager=self._chat_manager, sio=sio, @@ -81,7 +84,7 @@ async def invoke(self, question: str, **kwargs) -> str: self.route_status, { "search_graph": "SearchGraph", - "search_vector": "SearchGraph", # TODO: Implement 
vector search + "search_vector": "SearchVector", "answer_final": "Answer", } ) diff --git a/core/graph.py b/core/graph.py index fd8a866..edd80e7 100644 --- a/core/graph.py +++ b/core/graph.py @@ -7,6 +7,7 @@ from schemas import AgentGraphSubquery, AgentGraphRoute, AgentGraphStart from server import SocketManager +from vectorstore import QdrantClientManager from .base import GraphNodesBase from .manager import ChatManager from .prompt import ( @@ -24,12 +25,14 @@ class GraphAgent(GraphNodesBase): def __init__( self, graph: Neo4jGraph, + vectorstore: QdrantClientManager, llm: ChatBedrock, chat_manager: ChatManager, sio: Optional[SocketManager] = None ): self._chat_manager = chat_manager self._graph = graph + self._vectorstore = vectorstore self._llm = llm self._sio = sio @@ -60,10 +63,34 @@ async def search_vector(self, state: dict) -> dict: :param state: :return: """ - # TODO: Implement the vector search logic print("--SEARCHING VECTOR--") depth: int = state["depth"] depth += 1 + information_text = 'Buscando vetores no banco de dados...' + await self._emit("agent_updated", {"status": information_text}) + documents: list[dict] = state["documents"] + for q in state["subqueries"]: + print(f"Processing query: {q}") + information_text += f"\n- Consultando: {q}" + try: + result = await self._vectorstore.asearch( + query=q, + k=10, + filters=None, # TODO: Implement filters if needed + ) + documents.extend([ + { + "query": q, + "content": doc.page_content, + "source": doc.metadata['source'] + } + for doc in result + ]) + information_text += " **OK**" + except Exception as e: + print(f"Error during vector search: {e}") + information_text += f"\n- Erro ao buscar vetores: {e}" + await self._emit("agent_updated", {"status": information_text}) return {"documents": state["documents"], "depth": depth} async def search_graph(self, state: dict) -> dict: @@ -76,15 +103,14 @@ async def search_graph(self, state: dict) -> dict: information_text = 'Buscando relacionamentos em grafos...' 
await self._emit("agent_updated", {"status": information_text}) documents: list[dict] = state["documents"] - subqueries: list[dict] = state["subqueries"] depth: int = state["depth"] cypher_prompt_copy = CYPHER_PROMPT.model_copy() cypher_prompt_copy.template = cypher_prompt_copy.template.replace( "{{chat_history}}", await self._chat_manager.get_history_as_string()) # Search the graph using the LLM - for query in subqueries: - print(f"Processing query: {query}") - information_text += f"\n- Consultando: {query}" + for q in state["subqueries"]: + print(f"Processing query: {q}") + information_text += f"\n- Consultando: {q}" try: cypher_chain = GraphCypherQAChain.from_llm( self._llm, @@ -96,7 +122,7 @@ async def search_graph(self, state: dict) -> dict: allow_dangerous_requests=True, validate_cypher=True, ) - document = await cypher_chain.ainvoke(query) + document = await cypher_chain.ainvoke(q) documents.append(document) information_text += " **OK**" except CypherSyntaxError as e: diff --git a/core/prompt.py b/core/prompt.py index e4ecfae..e96fcf6 100644 --- a/core/prompt.py +++ b/core/prompt.py @@ -1,5 +1,6 @@ from langchain.prompts import PromptTemplate + ROUTING_CONSTANTS = { "search_graph": "Consultando os relacionamentos do grafo", "search_vector": "Consultando o contexto semântico", @@ -119,8 +120,24 @@ template="""Você é um assistente jurídico responsável por fornecer respostas claras, objetivas e fundamentadas a perguntas legais. Utilize exclusivamente as informações do contexto fornecido para elaborar sua resposta. Caso o contexto esteja vazio ou não contenha detalhes suficientes, informe educadamente que não há informações suficientes para responder à pergunta. +Caso houver as fontes utilizadas, informe-as ao final da resposta. Informações disponíveis: {context}""", input_variables=["context"], +) + +EXTRACT_ENTITIES_PROMPT = PromptTemplate( + template="""You are a legal extraction assistant specialized in the Brazilian legal domain. 
+Your task is to extract structured legal information from text in order to build a Brazilian legal knowledge graph. +Identify legal entities strictly following the user prompt. + +You must produce output in JSON format, containing a single JSON object with the keys: +{entities}. Use only the explicit information in the text. +If something is missing, leave it empty or null. Do not guess or hallucinate. Extract precisely. + +Extract the following legal entities from the provided text: +{text} +""", + input_variables=["entities", "text"], ) \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 5671566..6643e20 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -10,6 +10,14 @@ services: + qdrant: + image: qdrant/qdrant:latest + ports: + - "6333:6333" # HTTP API + - "6334:6334" # gRPC API + volumes: + - qdrant-data:/qdrant/storage + mongo: image: mongo:latest environment: @@ -29,6 +37,7 @@ services: - .env environment: NEO4J_URL: bolt://neo4j:7687 + QDRANT_URL: http://qdrant:6333 command: celery -A workers.tasks worker --loglevel=INFO volumes: - ./:/app @@ -41,6 +50,7 @@ services: - .env environment: NEO4J_URL: bolt://neo4j:7687 + QDRANT_URL: http://qdrant:6333 MONGO_URI: mongodb://root:pwd@mongo:27017/?authSource=admin ports: - "8000:8000" @@ -50,4 +60,5 @@ volumes: neo4j-data: - mongodb-data: \ No newline at end of file + mongodb-data: + qdrant-data: \ No newline at end of file diff --git a/main.py b/main.py index 6531e1e..524bcbc 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,9 @@ import os +import tempfile +from uuid import uuid4 + +from services import S3Client +from botocore.exceptions import ClientError from fastapi import FastAPI, Depends, Request from starlette.staticfiles import StaticFiles @@ -8,8 +13,10 @@ KnowledgeUploadSchema, KnowledgeUpdateResponse, AgentGraphRAGRequest, AgentGraphRAGResponse, ) + from core import AgentGraphRAGBedRock, ChatManager from server import 
SocketManager +from workers import aupload_knowledge_base app = FastAPI( title="Chat GraphRAG API", @@ -104,20 +111,38 @@ async def update_knowledge(upload: KnowledgeUploadSchema = Depends(KnowledgeUplo :param upload: The uploaded files containing the knowledge base document. :return: A confirmation message. """ - from workers import aupload_knowledge_base if len(upload.files) == 0: return KnowledgeUpdateResponse( success=False, message="No files uploaded." ) job_ids: list[str] = [] + s3_client = S3Client() for file in upload.files: if not file.filename.endswith(('.pdf', '.txt', '.md')): return KnowledgeUpdateResponse( success=False, message="Unsupported file type. Only PDF, TXT, and MD files are allowed." ) - job_id = await aupload_knowledge_base(key=file.filename) + ext = file.filename.split('.')[-1] + key = f"knowledge/{file.filename}-{str(uuid4())}.{ext}" + try: + temp = tempfile.NamedTemporaryFile(delete=False) + with open(temp.name, 'wb') as f: + f.write(file.file.read()) + s3_client.upload_file(temp.name, key) + os.remove(temp.name) + except ClientError as e: + return KnowledgeUpdateResponse( + success=False, + message=f"Failed to upload file to S3: {e}" + ) + except FileNotFoundError as e: + return KnowledgeUpdateResponse( + success=False, + message=f"File not found: {e}" + ) + job_id = await aupload_knowledge_base(key=key) job_ids.append(job_id) return KnowledgeUpdateResponse( success=True, diff --git a/pyproject.toml b/pyproject.toml index e11749d..67f4792 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ requires-python = ">=3.12" dependencies = [ "amazon-textract-caller>=0.2.4", "amazon-textract-textractor>=1.9.2", + "boto3>=1.39.3", "celery[sqs]>=5.5.3", "fastapi[standard]>=0.115.14", "langchain>=0.3.26", @@ -15,6 +16,7 @@ dependencies = [ "langchain-mongodb>=0.6.2", "langchain-neo4j>=0.4.0", "langchain-openai>=0.3.27", + "langchain-qdrant>=0.2.0", "langgraph>=0.5.1", "loguru>=0.7.3", "neo4j>=5.28.1", diff --git a/schemas/__init__.py 
b/schemas/__init__.py index 3673c3f..0f8d2dd 100644 --- a/schemas/__init__.py +++ b/schemas/__init__.py @@ -4,4 +4,5 @@ AgentGraphRAGResponse, AgentGraphRAGRequest, ) -from .agent_schema import AgentGraphSubquery, AgentGraphRoute, AgentGraphStart \ No newline at end of file +from .agent_schema import AgentGraphSubquery, AgentGraphRoute, AgentGraphStart +from .document_schema import LegalDocumentMetadata \ No newline at end of file diff --git a/schemas/document_schema.py b/schemas/document_schema.py new file mode 100644 index 0000000..9894b0f --- /dev/null +++ b/schemas/document_schema.py @@ -0,0 +1,48 @@ +from typing import Optional + +from pydantic import BaseModel, Field + + +class LegalDocumentMetadata(BaseModel): + # Document Identification + title: Optional[str] = Field(None, description="Title of the legal document") + type: Optional[str] = Field(None, description="Type of the legal document") + case_number: Optional[str] = Field(None, description="Official case number") + document_number: Optional[str] = Field(None, description="Internal document number") + creation_date: Optional[str] = Field(None, description="Date when the document was created") + filing_date: Optional[str] = Field(None, description="Date when the document was filed") + signature_date: Optional[str] = Field(None, description="Date when the document was signed") + version: Optional[str] = Field(None, description="Version or revision of the document") + place_of_issue: Optional[str] = Field(None, description="Place where the document was issued") + + # Parties Involved + plaintiffs: Optional[list[str]] = Field(None, description="List of plaintiffs") + defendants: Optional[list[str]] = Field(None, description="List of defendants") + lawyers: Optional[list[str]] = Field(None, description="List of lawyers involved") + bar_number: Optional[list[str]] = Field(None, description="List of bar registration numbers") + legal_representatives: Optional[list[str]] = Field(None, description="Other legal 
representatives") + judge_or_rapporteur: Optional[str] = Field(None, description="Name of judge or rapporteur") + third_parties: Optional[list[str]] = Field(None, description="Interested third parties") + + # Procedural Data + court: Optional[str] = Field(None, description="Court where the case is processed") + jurisdiction: Optional[str] = Field(None, description="Jurisdiction") + district: Optional[str] = Field(None, description="Judicial district") + adjudicating_body: Optional[str] = Field(None, description="Adjudicating body or chamber") + case_class: Optional[str] = Field(None, description="Class of the legal action") + nature_of_action: Optional[str] = Field(None, description="Nature of the action") + main_subject: Optional[str] = Field(None, description="Main subject of the action") + secondary_subjects: Optional[list[str]] = Field(None, description="Other related subjects") + case_progress: Optional[str] = Field(None, description="Current case progress stage") + case_stage: Optional[str] = Field(None, description="Current stage in process") + + # Legal Information + legal_basis: Optional[list[str]] = Field(None, description="Articles, laws or norms cited") + jurisprudence: Optional[list[str]] = Field(None, description="Precedents or case law cited") + legal_thesis: Optional[str] = Field(None, description="Legal thesis argued") + claims: Optional[list[str]] = Field(None, description="Claims requested") + legal_reasoning: Optional[str] = Field(None, description="Legal reasoning or justification") + provisions: Optional[list[str]] = Field(None, description="Provisions applied") + decision: Optional[str] = Field(None, description="Decision content") + case_value: Optional[str] = Field(None, description="Value attributed to the case") + attorney_fees: Optional[str] = Field(None, description="Agreed or court-appointed attorney fees") diff --git a/services/__init__.py b/services/__init__.py new file mode 100644 index 0000000..e2c0756 --- /dev/null +++ 
b/services/__init__.py @@ -0,0 +1 @@ +from .s3_client import S3Client \ No newline at end of file diff --git a/services/s3_client.py b/services/s3_client.py new file mode 100644 index 0000000..4fc5aef --- /dev/null +++ b/services/s3_client.py @@ -0,0 +1,44 @@ +import boto3 +from config import env + + +class S3Client: + def __init__(self, bucket_name: str = env.S3_BUCKET_NAME): + self._bucket_name = bucket_name + self._session = boto3.Session( + aws_access_key_id=env.AWS_ACCESS_KEY_ID, + aws_secret_access_key=env.AWS_SECRET_ACCESS_KEY, + region_name=env.AWS_REGION, + ) + self._client = self._session.client('s3') + + def upload_file(self, file_path: str, object_name: str) -> None: + """ + Upload a file to the S3 bucket. + :param file_path: The path to the file to upload. + :param object_name: The name of the object in the S3 bucket. + """ + try: + self._client.upload_file(file_path, self._bucket_name, object_name) + print(f"File {file_path} uploaded to {self._bucket_name}/{object_name}.") + except Exception as e: + print(f"Error uploading file: {e}") + + def delete_object(self, file_path: str) -> None: + """ + Delete a file from the S3 bucket. + :param file_path: The path to the file to delete. + """ + try: + self._client.delete_object(Bucket=self._bucket_name, Key=file_path) + print(f"File {file_path} deleted from {self._bucket_name}.") + except Exception as e: + print(f"Error deleting file: {e}") + + @property + def bucket_name(self) -> str: + """ + Get the name of the S3 bucket. + :return: The name of the S3 bucket. 
+ """ + return self._bucket_name \ No newline at end of file diff --git a/template.env b/template.env index 4268576..6ad605d 100644 --- a/template.env +++ b/template.env @@ -7,4 +7,6 @@ S3_BUCKET_NAME=your_s3_bucket_name_here NEO4J_URL=your_neo4j_url_here NEO4J_USERNAME=your_neo4j_username_here NEO4J_PASSWORD=your_neo4j_password_here -BEDROCK_MODEL_ID=your_bedrock_model_id_here \ No newline at end of file +BEDROCK_MODEL_ID=your_bedrock_model_id_here +QDRANT_URL=your_qdrant_url_here +QDRANT_API_KEY=your_qdrant_api_key_here \ No newline at end of file diff --git a/uv.lock b/uv.lock index 5da31fd..969015e 100644 --- a/uv.lock +++ b/uv.lock @@ -462,6 +462,7 @@ source = { virtual = "." } dependencies = [ { name = "amazon-textract-caller" }, { name = "amazon-textract-textractor" }, + { name = "boto3" }, { name = "celery", extra = ["sqs"] }, { name = "fastapi", extra = ["standard"] }, { name = "langchain" }, @@ -471,6 +472,7 @@ dependencies = [ { name = "langchain-mongodb" }, { name = "langchain-neo4j" }, { name = "langchain-openai" }, + { name = "langchain-qdrant" }, { name = "langgraph" }, { name = "loguru" }, { name = "neo4j" }, @@ -494,6 +496,7 @@ dev = [ requires-dist = [ { name = "amazon-textract-caller", specifier = ">=0.2.4" }, { name = "amazon-textract-textractor", specifier = ">=1.9.2" }, + { name = "boto3", specifier = ">=1.39.3" }, { name = "celery", extras = ["sqs"], specifier = ">=5.5.3" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.115.14" }, { name = "langchain", specifier = ">=0.3.26" }, @@ -503,6 +506,7 @@ requires-dist = [ { name = "langchain-mongodb", specifier = ">=0.6.2" }, { name = "langchain-neo4j", specifier = ">=0.4.0" }, { name = "langchain-openai", specifier = ">=0.3.27" }, + { name = "langchain-qdrant", specifier = ">=0.2.0" }, { name = "langgraph", specifier = ">=0.5.1" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "neo4j", specifier = ">=5.28.1" }, @@ -912,6 +916,34 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/5c/4f/aab73ecaa6b3086a4c89863d94cf26fa84cbff63f52ce9bc4342b3087a06/greenlet-3.2.3-cp314-cp314-win_amd64.whl", hash = "sha256:8c47aae8fbbfcf82cc13327ae802ba13c9c36753b67e760023fd116bc124a62a", size = 301236, upload_time = "2025-06-05T16:15:20.111Z" }, ] +[[package]] +name = "grpcio" +version = "1.73.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/79/e8/b43b851537da2e2f03fa8be1aef207e5cbfb1a2e014fbb6b40d24c177cd3/grpcio-1.73.1.tar.gz", hash = "sha256:7fce2cd1c0c1116cf3850564ebfc3264fba75d3c74a7414373f1238ea365ef87", size = 12730355, upload_time = "2025-06-26T01:53:24.622Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/41/456caf570c55d5ac26f4c1f2db1f2ac1467d5bf3bcd660cba3e0a25b195f/grpcio-1.73.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:921b25618b084e75d424a9f8e6403bfeb7abef074bb6c3174701e0f2542debcf", size = 5334621, upload_time = "2025-06-26T01:52:23.602Z" }, + { url = "https://files.pythonhosted.org/packages/2a/c2/9a15e179e49f235bb5e63b01590658c03747a43c9775e20c4e13ca04f4c4/grpcio-1.73.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:277b426a0ed341e8447fbf6c1d6b68c952adddf585ea4685aa563de0f03df887", size = 10601131, upload_time = "2025-06-26T01:52:25.691Z" }, + { url = "https://files.pythonhosted.org/packages/0c/1d/1d39e90ef6348a0964caa7c5c4d05f3bae2c51ab429eb7d2e21198ac9b6d/grpcio-1.73.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:96c112333309493c10e118d92f04594f9055774757f5d101b39f8150f8c25582", size = 5759268, upload_time = "2025-06-26T01:52:27.631Z" }, + { url = "https://files.pythonhosted.org/packages/8a/2b/2dfe9ae43de75616177bc576df4c36d6401e0959833b2e5b2d58d50c1f6b/grpcio-1.73.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f48e862aed925ae987eb7084409a80985de75243389dc9d9c271dd711e589918", size = 6409791, upload_time = "2025-06-26T01:52:29.711Z" }, + { url = 
"https://files.pythonhosted.org/packages/6e/66/e8fe779b23b5a26d1b6949e5c70bc0a5fd08f61a6ec5ac7760d589229511/grpcio-1.73.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83a6c2cce218e28f5040429835fa34a29319071079e3169f9543c3fbeff166d2", size = 6003728, upload_time = "2025-06-26T01:52:31.352Z" }, + { url = "https://files.pythonhosted.org/packages/a9/39/57a18fcef567784108c4fc3f5441cb9938ae5a51378505aafe81e8e15ecc/grpcio-1.73.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:65b0458a10b100d815a8426b1442bd17001fdb77ea13665b2f7dc9e8587fdc6b", size = 6103364, upload_time = "2025-06-26T01:52:33.028Z" }, + { url = "https://files.pythonhosted.org/packages/c5/46/28919d2aa038712fc399d02fa83e998abd8c1f46c2680c5689deca06d1b2/grpcio-1.73.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:0a9f3ea8dce9eae9d7cb36827200133a72b37a63896e0e61a9d5ec7d61a59ab1", size = 6749194, upload_time = "2025-06-26T01:52:34.734Z" }, + { url = "https://files.pythonhosted.org/packages/3d/56/3898526f1fad588c5d19a29ea0a3a4996fb4fa7d7c02dc1be0c9fd188b62/grpcio-1.73.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:de18769aea47f18e782bf6819a37c1c528914bfd5683b8782b9da356506190c8", size = 6283902, upload_time = "2025-06-26T01:52:36.503Z" }, + { url = "https://files.pythonhosted.org/packages/dc/64/18b77b89c5870d8ea91818feb0c3ffb5b31b48d1b0ee3e0f0d539730fea3/grpcio-1.73.1-cp312-cp312-win32.whl", hash = "sha256:24e06a5319e33041e322d32c62b1e728f18ab8c9dbc91729a3d9f9e3ed336642", size = 3668687, upload_time = "2025-06-26T01:52:38.678Z" }, + { url = "https://files.pythonhosted.org/packages/3c/52/302448ca6e52f2a77166b2e2ed75f5d08feca4f2145faf75cb768cccb25b/grpcio-1.73.1-cp312-cp312-win_amd64.whl", hash = "sha256:303c8135d8ab176f8038c14cc10d698ae1db9c480f2b2823f7a987aa2a4c5646", size = 4334887, upload_time = "2025-06-26T01:52:40.743Z" }, + { url = 
"https://files.pythonhosted.org/packages/37/bf/4ca20d1acbefabcaba633ab17f4244cbbe8eca877df01517207bd6655914/grpcio-1.73.1-cp313-cp313-linux_armv7l.whl", hash = "sha256:b310824ab5092cf74750ebd8a8a8981c1810cb2b363210e70d06ef37ad80d4f9", size = 5335615, upload_time = "2025-06-26T01:52:42.896Z" }, + { url = "https://files.pythonhosted.org/packages/75/ed/45c345f284abec5d4f6d77cbca9c52c39b554397eb7de7d2fcf440bcd049/grpcio-1.73.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:8f5a6df3fba31a3485096ac85b2e34b9666ffb0590df0cd044f58694e6a1f6b5", size = 10595497, upload_time = "2025-06-26T01:52:44.695Z" }, + { url = "https://files.pythonhosted.org/packages/a4/75/bff2c2728018f546d812b755455014bc718f8cdcbf5c84f1f6e5494443a8/grpcio-1.73.1-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:052e28fe9c41357da42250a91926a3e2f74c046575c070b69659467ca5aa976b", size = 5765321, upload_time = "2025-06-26T01:52:46.871Z" }, + { url = "https://files.pythonhosted.org/packages/70/3b/14e43158d3b81a38251b1d231dfb45a9b492d872102a919fbf7ba4ac20cd/grpcio-1.73.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c0bf15f629b1497436596b1cbddddfa3234273490229ca29561209778ebe182", size = 6415436, upload_time = "2025-06-26T01:52:49.134Z" }, + { url = "https://files.pythonhosted.org/packages/e5/3f/81d9650ca40b54338336fd360f36773be8cb6c07c036e751d8996eb96598/grpcio-1.73.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ab860d5bfa788c5a021fba264802e2593688cd965d1374d31d2b1a34cacd854", size = 6007012, upload_time = "2025-06-26T01:52:51.076Z" }, + { url = "https://files.pythonhosted.org/packages/55/f4/59edf5af68d684d0f4f7ad9462a418ac517201c238551529098c9aa28cb0/grpcio-1.73.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:ad1d958c31cc91ab050bd8a91355480b8e0683e21176522bacea225ce51163f2", size = 6105209, upload_time = "2025-06-26T01:52:52.773Z" }, + { url = 
"https://files.pythonhosted.org/packages/e4/a8/700d034d5d0786a5ba14bfa9ce974ed4c976936c2748c2bd87aa50f69b36/grpcio-1.73.1-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:f43ffb3bd415c57224c7427bfb9e6c46a0b6e998754bfa0d00f408e1873dcbb5", size = 6753655, upload_time = "2025-06-26T01:52:55.064Z" }, + { url = "https://files.pythonhosted.org/packages/1f/29/efbd4ac837c23bc48e34bbaf32bd429f0dc9ad7f80721cdb4622144c118c/grpcio-1.73.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:686231cdd03a8a8055f798b2b54b19428cdf18fa1549bee92249b43607c42668", size = 6287288, upload_time = "2025-06-26T01:52:57.33Z" }, + { url = "https://files.pythonhosted.org/packages/d8/61/c6045d2ce16624bbe18b5d169c1a5ce4d6c3a47bc9d0e5c4fa6a50ed1239/grpcio-1.73.1-cp313-cp313-win32.whl", hash = "sha256:89018866a096e2ce21e05eabed1567479713ebe57b1db7cbb0f1e3b896793ba4", size = 3668151, upload_time = "2025-06-26T01:52:59.405Z" }, + { url = "https://files.pythonhosted.org/packages/c2/d7/77ac689216daee10de318db5aa1b88d159432dc76a130948a56b3aa671a2/grpcio-1.73.1-cp313-cp313-win_amd64.whl", hash = "sha256:4a68f8c9966b94dff693670a5cf2b54888a48a5011c5d9ce2295a1a1465ee84f", size = 4335747, upload_time = "2025-06-26T01:53:01.233Z" }, +] + [[package]] name = "h11" version = "0.16.0" @@ -921,6 +953,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload_time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "h2" +version = "4.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hpack" }, + { name = "hyperframe" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1b/38/d7f80fd13e6582fb8e0df8c9a653dcc02b03ca34f4d72f34869298c5baf8/h2-4.2.0.tar.gz", hash = "sha256:c8a52129695e88b1a0578d8d2cc6842bbd79128ac685463b887ee278126ad01f", size = 2150682, 
upload_time = "2025-02-02T07:43:51.815Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/9e/984486f2d0a0bd2b024bf4bc1c62688fcafa9e61991f041fb0e2def4a982/h2-4.2.0-py3-none-any.whl", hash = "sha256:479a53ad425bb29af087f3458a61d30780bc818e4ebcf01f0b536ba916462ed0", size = 60957, upload_time = "2025-02-01T11:02:26.481Z" }, +] + +[[package]] +name = "hpack" +version = "4.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/48/71de9ed269fdae9c8057e5a4c0aa7402e8bb16f2c6e90b3aa53327b113f8/hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca", size = 51276, upload_time = "2025-01-22T21:44:58.347Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/c6/80c95b1b2b94682a72cbdbfb85b81ae2daffa4291fbfa1b1464502ede10d/hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496", size = 34357, upload_time = "2025-01-22T21:44:56.92Z" }, +] + [[package]] name = "html5lib" version = "1.1" @@ -984,6 +1038,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload_time = "2024-12-06T15:37:21.509Z" }, ] +[package.optional-dependencies] +http2 = [ + { name = "h2" }, +] + [[package]] name = "httpx-sse" version = "0.4.1" @@ -993,6 +1052,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/25/0a/6269e3473b09aed2dab8aa1a600c70f31f00ae1349bee30658f7e358a159/httpx_sse-0.4.1-py3-none-any.whl", hash = "sha256:cba42174344c3a5b06f255ce65b350880f962d99ead85e776f23c6618a377a37", size = 8054, upload_time = "2025-06-24T13:21:04.772Z" }, ] +[[package]] +name = "hyperframe" +version = "6.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/02/e7/94f8232d4a74cc99514c13a9f995811485a6903d48e5d952771ef6322e30/hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08", size = 26566, upload_time = "2025-01-22T21:41:49.302Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/30/47d0bf6072f7252e6521f3447ccfa40b421b6824517f82854703d0f5a98b/hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5", size = 13007, upload_time = "2025-01-22T21:41:47.295Z" }, +] + [[package]] name = "idna" version = "3.10" @@ -1557,6 +1625,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/aa/31/1f0baf6490b082bf4d06f355c5e9c28728931dbf321f3ca03137617a692e/langchain_openai-0.3.27-py3-none-any.whl", hash = "sha256:efe636c3523978c44adc41cf55c8b3766c05c77547982465884d1258afe705df", size = 70368, upload_time = "2025-06-27T17:56:28.726Z" }, ] +[[package]] +name = "langchain-qdrant" +version = "0.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "langchain-core" }, + { name = "pydantic" }, + { name = "qdrant-client" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f2/8c/f006636b4cc2d95ba072a57df3f2f99d8cf7cb47a79cc447a7e3e391f7ee/langchain_qdrant-0.2.0.tar.gz", hash = "sha256:41b8573cbb1b4706f76dc769251d8e6b3e4107ecd5fa97c58141977ec19fba75", size = 21429, upload_time = "2024-11-05T20:51:15.122Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/01/22dad84373ba282237a3351547443c9c94c39fe75f71a1759f97cfa89725/langchain_qdrant-0.2.0-py3-none-any.whl", hash = "sha256:8eab5b8a553204ddb809d8183a6f1bc12fc265688592d9d897388f6939c79bf8", size = 23406, upload_time = "2024-11-05T20:51:13.472Z" }, +] + [[package]] name = "langchain-text-splitters" version = "0.3.8" @@ -2280,6 +2362,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload_time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "portalocker" +version = "2.10.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/d3/c6c64067759e87af98cc668c1cc75171347d0f1577fab7ca3749134e3cd4/portalocker-2.10.1.tar.gz", hash = "sha256:ef1bf844e878ab08aee7e40184156e1151f228f103aa5c6bd0724cc330960f8f", size = 40891, upload_time = "2024-07-13T23:15:34.86Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/fb/a70a4214956182e0d7a9099ab17d50bfcba1056188e9b14f35b9e2b62a0d/portalocker-2.10.1-py3-none-any.whl", hash = "sha256:53a5984ebc86a025552264b459b46a2086e269b21823cb572f8f28ee759e45bf", size = 18423, upload_time = "2024-07-13T23:15:32.602Z" }, +] + [[package]] name = "prometheus-client" version = "0.22.1" @@ -2358,6 +2452,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cc/35/cc0aaecf278bb4575b8555f2b137de5ab821595ddae9da9d3cd1da4072c7/propcache-0.3.2-py3-none-any.whl", hash = "sha256:98f1ec44fb675f5052cccc8e609c46ed23a35a1cfd18545ad4e29002d858a43f", size = 12663, upload_time = "2025-06-09T22:56:04.484Z" }, ] +[[package]] +name = "protobuf" +version = "6.31.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/52/f3/b9655a711b32c19720253f6f06326faf90580834e2e83f840472d752bc8b/protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a", size = 441797, upload_time = "2025-05-28T19:25:54.947Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/f3/6f/6ab8e4bf962fd5570d3deaa2d5c38f0a363f57b4501047b5ebeb83ab1125/protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9", size = 423603, upload_time = "2025-05-28T19:25:41.198Z" }, + { url = "https://files.pythonhosted.org/packages/44/3a/b15c4347dd4bf3a1b0ee882f384623e2063bb5cf9fa9d57990a4f7df2fb6/protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447", size = 435283, upload_time = "2025-05-28T19:25:44.275Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c9/b9689a2a250264a84e66c46d8862ba788ee7a641cdca39bccf64f59284b7/protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402", size = 425604, upload_time = "2025-05-28T19:25:45.702Z" }, + { url = "https://files.pythonhosted.org/packages/76/a1/7a5a94032c83375e4fe7e7f56e3976ea6ac90c5e85fac8576409e25c39c3/protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39", size = 322115, upload_time = "2025-05-28T19:25:47.128Z" }, + { url = "https://files.pythonhosted.org/packages/fa/b1/b59d405d64d31999244643d88c45c8241c58f17cc887e73bcb90602327f8/protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6", size = 321070, upload_time = "2025-05-28T19:25:50.036Z" }, + { url = "https://files.pythonhosted.org/packages/f7/af/ab3c51ab7507a7325e98ffe691d9495ee3d3aa5f589afad65ec920d39821/protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e", size = 168724, upload_time = "2025-05-28T19:25:53.926Z" }, +] + [[package]] name = "psutil" version = "7.0.0" @@ -2753,6 +2861,24 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/eb/bc/1709dc55f0970cf4cb8259e435e6773f9946f41a045c2cb90e870b7072da/pyzmq-27.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d8229f2efece6a660ee211d74d91dbc2a76b95544d46c74c615e491900dc107f", size = 639933, upload_time = "2025-06-13T14:08:00.777Z" }, ] +[[package]] +name = "qdrant-client" +version = "1.14.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "grpcio" }, + { name = "httpx", extra = ["http2"] }, + { name = "numpy" }, + { name = "portalocker" }, + { name = "protobuf" }, + { name = "pydantic" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1d/56/3f355f931c239c260b4fe3bd6433ec6c9e6185cd5ae0970fe89d0ca6daee/qdrant_client-1.14.3.tar.gz", hash = "sha256:bb899e3e065b79c04f5e47053d59176150c0a5dabc09d7f476c8ce8e52f4d281", size = 286766, upload_time = "2025-06-16T11:13:47.838Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/5e/8174c845707e60b60b65c58f01e40bbc1d8181b5ff6463f25df470509917/qdrant_client-1.14.3-py3-none-any.whl", hash = "sha256:66faaeae00f9b5326946851fe4ca4ddb1ad226490712e2f05142266f68dfc04d", size = 328969, upload_time = "2025-06-16T11:13:46.636Z" }, +] + [[package]] name = "rapidfuzz" version = "3.13.0" diff --git a/vectorstore/__init__.py b/vectorstore/__init__.py new file mode 100644 index 0000000..72da478 --- /dev/null +++ b/vectorstore/__init__.py @@ -0,0 +1,5 @@ +from .qdrant_client import QdrantClientManager + +__all__ = [ + "QdrantClientManager" +] \ No newline at end of file diff --git a/vectorstore/base.py b/vectorstore/base.py new file mode 100644 index 0000000..8f5f229 --- /dev/null +++ b/vectorstore/base.py @@ -0,0 +1,27 @@ +from abc import ABC, abstractmethod +from typing import Optional + +from langchain_core.documents import Document +from qdrant_client.http.models import Filter + + +class VectorDBManagerBase(ABC): + @abstractmethod + def search(self, query: str, k: int = 10, filters: Optional[Filter] = 
None) -> list: + """ + Search for vectors similar to the query vector. + :param query: The query vector or text to search for. + :param k: The number of similar vectors to return. + :param filters: Optional filters to apply to the search. + :return: A list of metadata associated with the found vectors. + """ + pass + + @abstractmethod + def add_documents(self, documents: list[Document]) -> None: + """ + Add a list of Document objects to the vector store. + :param documents: A list of Document objects to add. + :return: None + """ + pass \ No newline at end of file diff --git a/vectorstore/qdrant_client.py b/vectorstore/qdrant_client.py new file mode 100644 index 0000000..4f07beb --- /dev/null +++ b/vectorstore/qdrant_client.py @@ -0,0 +1,67 @@ +from typing import Optional + +from langchain_aws import BedrockEmbeddings +from langchain_core.documents import Document +from langchain_qdrant import QdrantVectorStore +from qdrant_client import QdrantClient, models +from .base import VectorDBManagerBase +from config import env + +COLLECTION_NAME = "documents" + +embeddings = BedrockEmbeddings( + model_id=env.BEDROCK_EMBEDDING_MODEL_ID, + region_name=env.AWS_REGION, + aws_access_key_id=env.AWS_ACCESS_KEY_ID, + aws_secret_access_key=env.AWS_SECRET_ACCESS_KEY, +) + +try: + QdrantClient( + url=env.QDRANT_URL, + api_key=env.QDRANT_API_KEY, + prefer_grpc=True, + ).create_collection( + collection_name=COLLECTION_NAME, + vectors_config=models.VectorParams( + size=1024, # Bedrock embedding size + distance=models.Distance.COSINE, # Distance metric + ), + ) +except Exception: # noqa + pass + +class QdrantClientManager(VectorDBManagerBase): + def __init__(self): + self._vectorstore = QdrantVectorStore.from_existing_collection( + url=env.QDRANT_URL, + api_key=env.QDRANT_API_KEY, + prefer_grpc=True, + collection_name=COLLECTION_NAME, + embedding=embeddings, + ) + + def search(self, query: str, k: int = 10, filters: Optional[models.Filter] = None) -> list[Document]: + return 
self._vectorstore.similarity_search(query, k=k, filter=filters) + + def add_documents(self, documents: list[Document]) -> None: + self._vectorstore.add_documents(documents) + + async def asearch(self, query: str, k: int = 10, filters: Optional[models.Filter] = None) -> list[Document]: + """ + Asynchronous search method to find similar vectors. + :param query: The query vector or text to search for. + :param k: The number of similar vectors to return. + :param filters: Optional filters to apply to the search. + :return: A list of Document objects matching the query. + """ + return await self._vectorstore.asimilarity_search(query, k=k, filter=filters) + + @property + def vectorstore(self) -> QdrantVectorStore: + """ + Returns the underlying QdrantVectorStore instance, + created once in __init__ from the existing collection. + :return: QdrantVectorStore instance. + """ + return self._vectorstore \ No newline at end of file diff --git a/workers/knowledge.py b/workers/knowledge.py index 974d271..5b55b64 100644 --- a/workers/knowledge.py +++ b/workers/knowledge.py @@ -1,4 +1,7 @@ +import hashlib + from typing import Optional +from uuid import uuid4 from langchain_community.document_loaders import S3FileLoader, AmazonTextractPDFLoader from langchain_core.messages import SystemMessage @@ -12,14 +15,16 @@ from langchain_neo4j import Neo4jGraph from config import env +from core.prompt import EXTRACT_ENTITIES_PROMPT +from schemas import LegalDocumentMetadata +from services import S3Client +from vectorstore import QdrantClientManager -examples = [ +examples: list[dict[str, str]] = [ { - "text": ( - "Maria Silva é autora em um processo contra a Empresa XYZ " - "no Tribunal de Justiça de São Paulo."
- ), + "text": "Maria Silva é autora em um processo contra a Empresa XYZ " + "no Tribunal de Justiça de São Paulo.", "head": "Maria Silva", "head_type": "Person", "relation": "PARTY_TO", @@ -27,9 +32,7 @@ "tail_type": "Legal_Case", }, { - "text": ( - "A Empresa XYZ foi representada pelo escritório de advocacia Souza & Associados." - ), + "text": "A Empresa XYZ foi representada pelo escritório de advocacia Souza & Associados.", "head": "Souza & Associados", "head_type": "Organization", "relation": "REPRESENTS", @@ -37,9 +40,7 @@ "tail_type": "Organization", }, { - "text": ( - "O processo está sendo julgado pelo Tribunal de Justiça de São Paulo." - ), + "text": "O processo está sendo julgado pelo Tribunal de Justiça de São Paulo.", "head": "Processo 1020304-55.2023.8.26.0100", "head_type": "Legal_Case", "relation": "HANDLED_BY", @@ -47,9 +48,7 @@ "tail_type": "Court", }, { - "text": ( - "O Tribunal de Justiça de São Paulo está localizado na cidade de São Paulo." - ), + "text": "O Tribunal de Justiça de São Paulo está localizado na cidade de São Paulo.", "head": "Tribunal de Justiça de São Paulo", "head_type": "Court", "relation": "LOCATED_IN", @@ -57,9 +56,7 @@ "tail_type": "Location", }, { - "text": ( - "A petição inicial do processo faz referência ao Artigo 927 do Código Civil." - ), + "text": "A petição inicial do processo faz referência ao Artigo 927 do Código Civil.", "head": "Petição Inicial", "head_type": "Legal_Document", "relation": "REFERS_TO", @@ -67,9 +64,7 @@ "tail_type": "Law_Article", }, { - "text": ( - "A decisão judicial resultou em condenação por danos morais." - ), + "text": "A decisão judicial resultou em condenação por danos morais.", "head": "Decisão Judicial", "head_type": "Legal_Document", "relation": "RESULTS_IN", @@ -77,9 +72,7 @@ "tail_type": "Penalty", }, { - "text": ( - "Foi interposto recurso de apelação pela Empresa XYZ." 
- ), + "text": "Foi interposto recurso de apelação pela Empresa XYZ.", "head": "Processo 1020304-55.2023.8.26.0100", "head_type": "Legal_Case", "relation": "HAS_ACTION", @@ -87,9 +80,7 @@ "tail_type": "Appeal", }, { - "text": ( - "A audiência de instrução ocorreu no Fórum João Mendes." - ), + "text": "A audiência de instrução ocorreu no Fórum João Mendes.", "head": "Audiência de Instrução", "head_type": "Event", "relation": "HELD_ON", @@ -97,9 +88,7 @@ "tail_type": "Location", }, { - "text": ( - "O laudo pericial foi incluído como prova no processo." - ), + "text": "O laudo pericial foi incluído como prova no processo.", "head": "Laudo Pericial", "head_type": "Evidence", "relation": "EVIDENCE_IN", @@ -107,9 +96,7 @@ "tail_type": "Legal_Case", }, { - "text": ( - "O juiz Pedro Almeida decidiu o processo em favor de Maria Silva." - ), + "text": "O juiz Pedro Almeida decidiu o processo em favor de Maria Silva.", "head": "Pedro Almeida", "head_type": "Person", "relation": "DECIDED_BY", @@ -118,7 +105,7 @@ }, ] -nodes = [ +nodes: list[str] = [ "Person", "Organization", "Court", @@ -136,7 +123,7 @@ "Appeal", ] -relationships = [ +relationships: list[str] = [ "PARTY_TO", "REPRESENTS", "EMPLOYED_BY", @@ -154,6 +141,51 @@ "CITES", ] +legal_document_metadata_keys: list[str] = [ + # Document Identification + "title", + "type", + "case_number", + "document_number", + "creation_date", + "filing_date", + "signature_date", + "version", + "place_of_issue", + + # Parties Involved + "plaintiffs", + "defendants", + "lawyers", + "bar_number", + "legal_representatives", + "judge_or_rapporteur", + "third_parties", + + # Procedural Data + "court", + "jurisdiction", + "district", + "adjudicating_body", + "case_class", + "nature_of_action", + "main_subject", + "secondary_subjects", + "case_progress", + "case_stage", + + # Legal Information + "legal_basis", + "jurisprudence", + "legal_thesis", + "claims", + "legal_reasoning", + "provisions", + "decision", + "case_value", + "attorney_fees", 
+] + class KnowledgeService: def __init__(self): self._splitter = CharacterTextSplitter.from_tiktoken_encoder( @@ -163,7 +195,24 @@ def __init__(self): ) @staticmethod - def _create_unstructured_prompt( + def calc_document_hash(contents: str) -> str: + """ + Calculate a SHA-256 hash for the document contents. + :param contents: The document contents as a string (UTF-8 encoded before hashing). + :return: A string representing the hash of the document. + """ + return hashlib.sha256(contents.encode()).hexdigest() + + @staticmethod + def get_document_id() -> str: + """ + Generate a unique document ID as a random UUID4 hex string. + :return: A string representing the document ID. + """ + return uuid4().hex + + @staticmethod + def _create_unstructured_relationships_prompt( node_labels: Optional[list[str]] = None, rel_types: Optional[list[str] | list[tuple[str, str, str]]] = None, relationship_type: Optional[str] = None, @@ -266,9 +315,9 @@ def _read(key: str) -> str: return "\n".join([page.page_content for page in loader.load()]) raise RuntimeError(f"Unsupported file type: {_ext}") - def update(self, key: str): + def process(self, key: str): """ - Update the knowledge base with the given S3 object ID. + Process a document from S3, split it into chunks, and add it to the knowledge base.
:param key: :return: """ @@ -277,7 +326,8 @@ def update(self, key: str): # Split the document into smaller chunks texts = self._splitter.split_text(contents) # Create Document objects from the split texts - documents = [Document(page_content=text, source=key.split('/')[-1]) for text in texts] + source = key.split('/')[-1] + documents = [Document(id=uuid4().hex, page_content=text, source=source) for text in texts] print(f"{len(documents)} Chunks created from {key}.") # Convert the documents to graph documents using LLMGraphTransformer @@ -292,7 +342,7 @@ def update(self, key: str): llm, allowed_nodes=nodes, allowed_relationships=relationships, - prompt= self._create_unstructured_prompt( + prompt=self._create_unstructured_relationships_prompt( node_labels=nodes, rel_types=relationships, ) @@ -305,6 +355,49 @@ def update(self, key: str): # Refresh the schema to ensure the new documents are indexed graph.refresh_schema() - # TODO: Implement vector database update logic here + # Calculate the document hash + document_hash = self.calc_document_hash(contents) + document_id = self.get_document_id() + # Update the metadata with the extracted information + metadatas: dict[str, list[str]|str] = { + "document_id": document_id, + "document_hash": document_hash, + "source": source, + } + # Extract metadata from the document using the LLM + structured = llm.with_structured_output(LegalDocumentMetadata) + chain_legal_document = EXTRACT_ENTITIES_PROMPT | structured + # Iterate over the texts and extract metadata + for text in texts: + # Extract metadata from the document + metadata_extraction_result: LegalDocumentMetadata = chain_legal_document.invoke( # type: ignore + {"entities": ", ".join(legal_document_metadata_keys), "text": text}) + # Parse the metadata extraction result + metadata: dict = metadata_extraction_result.model_dump(exclude_none=True) + for k in metadata.keys(): + if k in ['document_id', 'document_hash', 'source']: continue + if k in list(metadatas.keys()): + if 
isinstance(metadatas[k], list): + metadatas[k].extend(metadata[k]) + continue + if isinstance(metadatas[k], str): + metadatas[k] += "\n" + metadata[k] + else: + metadatas[k] = metadata[k] - print(f"Knowledge base updated with {len(graph_documents)} documents from {key}.") \ No newline at end of file + # Add the document to the vector database + vectorstore = QdrantClientManager() + # Build the Document objects with the metadata + documents = [ + Document( + id=doc.id, + page_content=doc.page_content, + metadata=metadatas + ) for doc in documents + ] + # Add the documents to the vector database + vectorstore.add_documents(documents=documents) + # Delete object from S3 + S3Client().delete_object(key) + # Log the update + print(f"Knowledge base updated with {len(documents)} documents from {key}.") \ No newline at end of file diff --git a/workers/tasks.py b/workers/tasks.py index bb1fdce..af5f172 100644 --- a/workers/tasks.py +++ b/workers/tasks.py @@ -1,26 +1,15 @@ import asyncio - +from .knowledge import KnowledgeService from .connection import app -@app.task( - name="knowledge.upload_knowledge_base", - bind=True, - autoretry_for=(Exception,), - max_retries=3, - countdown=60, -) -def _upload_knowledge_base(self, key: str): +@app.task(name="knowledge.upload_knowledge_base") +def _upload_knowledge_base(key: str): """ - Update the knowledge base with the given ID. + Synchronous task to update the knowledge base with the given S3 object ID. """ - from .knowledge import KnowledgeService - try: - service = KnowledgeService() - service.update(key) - except Exception as e: - self.retry(exc=e) - + service = KnowledgeService() + service.process(key) async def aupload_knowledge_base(key: str) -> str: