Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,18 @@ Images and tables are **embedded into chunk vectors** — not stored separately.

</details>

<details>
<summary><b>Custom Document Metadata</b></summary>

Enhance RAG accuracy and organization by attaching custom key-value metadata during document upload:

- **Metadata Filtering** — Perform hybrid search (semantic + metadata filtering) to narrow down the search space and prevent hallucinations.
- **Flexible Organization** — Tag documents with attributes like `year`, `category`, or `author` without needing separate workspaces.
- **Optimized Retrieval** — Pre-filtering in ChromaDB reduces processing time and latency during vector search.
- **Supported Endpoints** — Pass `custom_metadata` (a list of `{"key": ..., "value": ...}` objects) in the upload API, and `metadata_filter` in query/chat APIs.

</details>

<details>
<summary><b>Citation System</b></summary>

Expand Down Expand Up @@ -526,7 +538,7 @@ All endpoints prefixed with `/api/v1`. Interactive docs at http://localhost:8080

| Method | Endpoint | Description |
|---|---|---|
| `POST` | `/documents/upload/{workspace_id}` | Upload file |
| `POST` | `/documents/upload/{workspace_id}` | Upload file (supports `custom_metadata`) |
| `GET` | `/documents/{id}/markdown` | Get parsed content |
| `GET` | `/documents/{id}/images` | List extracted images |
| `DELETE` | `/documents/{id}` | Delete document |
Expand All @@ -538,8 +550,8 @@ All endpoints prefixed with `/api/v1`. Interactive docs at http://localhost:8080

| Method | Endpoint | Description |
|---|---|---|
| `POST` | `/rag/query/{workspace_id}` | Hybrid search |
| `POST` | `/rag/chat/{workspace_id}/stream` | Agentic streaming chat (SSE) |
| `POST` | `/rag/query/{workspace_id}` | Hybrid search (supports `metadata_filter`) |
| `POST` | `/rag/chat/{workspace_id}/stream` | Agentic streaming chat (SSE) (supports `metadata_filter`) |
| `GET` | `/rag/chat/{workspace_id}/history` | Chat history |
| `POST` | `/rag/process/{document_id}` | Process document |
| `GET` | `/rag/graph/{workspace_id}` | Knowledge graph data |
Expand Down
89 changes: 89 additions & 0 deletions backend/alembic.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# A generic, single database configuration.

[alembic]
# path to migration scripts
script_location = alembic

# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# file_template = %%(rev)s_%%(slug)s

# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .

# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python-dateutil library that can be
# installed by adding it to the alembic section in setup.py
# timezone =

# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40

# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false

# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false

# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions

# version path separator; As mentioned above, this is the character used to split
# version_locations.
# version_path_separator = :

# sqlalchemy.url = driver://user:pass@localhost/dbname


[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples

# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME

[loggers]
keys = root,sqlalchemy,alembic

[handlers]
keys = console

[formatters]
keys = generic

[logger_root]
level = WARN
handlers = console
qualname =

[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine

[logger_alembic]
level = INFO
handlers =
qualname = alembic

[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic

[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
1 change: 1 addition & 0 deletions backend/alembic/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Generic single-database configuration.
97 changes: 97 additions & 0 deletions backend/alembic/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from logging.config import fileConfig

from sqlalchemy import engine_from_config
from sqlalchemy import pool

from alembic import context

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
import os
import sys

# Add the parent directory of 'alembic' (the backend package root) to the
# Python path so `app.*` imports resolve regardless of the working directory
# alembic is invoked from.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.core.database import Base
from app.core.config import settings

# Import all models here so their tables are registered on Base.metadata,
# which is what `alembic revision --autogenerate` diffs against the live DB.
from app.models.document import Document, DocumentImage, DocumentTable
from app.models.knowledge_base import KnowledgeBase
from app.models.chat_message import ChatMessage

target_metadata = Base.metadata

# Inject the application's database URL into the Alembic config.
# NOTE: set_main_option values go through configparser %-interpolation, so a
# literal '%' in the URL (e.g. in a password) must be escaped as '%%';
# get_main_option()/get_section() un-escape it on read.
config.set_main_option("sqlalchemy.url", settings.DATABASE_URL.replace("%", "%%"))


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    Configures the context with only a database URL — no Engine and thus no
    DBAPI required. Each call to context.execute() emits the generated SQL
    to the script output instead of executing it against a live database.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    Builds a synchronous Engine and binds a live connection to the Alembic
    context. The application uses an async driver (asyncpg), so the URL is
    rewritten to the plain ``postgresql`` dialect before creating the engine —
    Alembic's migration runner here is synchronous.
    """
    section = config.get_section(config.config_ini_section, {})
    db_url = config.get_main_option("sqlalchemy.url")
    if db_url and db_url.startswith("postgresql+asyncpg"):
        # Swap the async driver for the default sync one.
        db_url = db_url.replace("postgresql+asyncpg", "postgresql")
    section["sqlalchemy.url"] = db_url

    # NullPool: migrations are a one-shot process; no need to pool connections.
    connectable = engine_from_config(
        section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


# Entry point: Alembic executes env.py directly; pick the migration mode
# based on whether the CLI requested SQL script generation (--sql) or a
# live database run.
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
28 changes: 28 additions & 0 deletions backend/alembic/script.py.mako
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}


def upgrade() -> None:
"""Upgrade schema."""
${upgrades if upgrades else "pass"}


def downgrade() -> None:
"""Downgrade schema."""
${downgrades if downgrades else "pass"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Add custom_metadata to documents

Revision ID: 2047460692d0
Revises:
Create Date: 2026-03-17 14:09:11.881981

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '2047460692d0'
down_revision: Union[str, Sequence[str], None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema: add the `custom_metadata` JSON column to documents."""
    # Nullable so existing rows need no backfill when the column is added.
    op.add_column(
        'documents',
        sa.Column('custom_metadata', sa.JSON(), nullable=True),
    )


def downgrade() -> None:
    """Downgrade schema: drop the `custom_metadata` column from documents."""
    # Destructive: any stored per-document metadata is lost on downgrade.
    op.drop_column('documents', 'custom_metadata')
24 changes: 23 additions & 1 deletion backend/app/api/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import logging
from pathlib import Path

from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, BackgroundTasks
from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, BackgroundTasks, Form
from fastapi.responses import PlainTextResponse
from sqlalchemy.ext.asyncio import AsyncSession
import json
from sqlalchemy import select

from app.core.config import settings
Expand Down Expand Up @@ -126,9 +127,29 @@ async def process_document_background(document_id: int, file_path: str, workspac
async def upload_document(
workspace_id: int,
file: UploadFile = File(...),
custom_metadata: str | None = Form(None),
db: AsyncSession = Depends(get_db),
):
"""Upload a document to a knowledge base. Processing must be triggered separately."""

parsed_metadata = None
if custom_metadata:
try:
raw_metadata = json.loads(custom_metadata)
if not isinstance(raw_metadata, list):
raise ValueError("Metadata must be a list of key-value objects")

parsed_metadata = {}
for item in raw_metadata:
if not isinstance(item, dict) or "key" not in item or "value" not in item:
raise ValueError("Each metadata item must contain 'key' and 'value' fields")
parsed_metadata[item["key"]] = item["value"]
except Exception as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Invalid custom_metadata format: {e}"
)

result = await db.execute(select(KnowledgeBase).where(KnowledgeBase.id == workspace_id))
kb = result.scalar_one_or_none()

Expand Down Expand Up @@ -163,6 +184,7 @@ async def upload_document(
file_type=ext[1:],
file_size=len(content),
status=DocumentStatus.PENDING,
custom_metadata=parsed_metadata,
)
db.add(document)
await db.commit()
Expand Down
1 change: 1 addition & 0 deletions backend/app/api/rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ async def query_documents(
top_k=request.top_k,
document_ids=request.document_ids,
mode=request.mode,
metadata_filter=request.metadata_filter,
)

chunks_response = []
Expand Down
Loading