Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,18 @@ Images and tables are **embedded into chunk vectors** — not stored separately.

</details>

<details>
<summary><b>Custom Document Metadata</b></summary>

Enhance RAG accuracy and organization by attaching custom key-value metadata during document upload:

- **Metadata Filtering** — Perform hybrid search (semantic + metadata filtering) to narrow down the search space and prevent hallucinations.
- **Flexible Organization** — Tag documents with attributes like `year`, `category`, or `author` without needing separate workspaces.
- **Optimized Retrieval** — Pre-filtering in ChromaDB reduces processing time and latency during vector search.
- **Supported Endpoints** — Pass `custom_metadata` (a list of `{"key": ..., "value": ...}` objects) in the upload API, and `metadata_filter` in query/chat APIs.

</details>

<details>
<summary><b>Citation System</b></summary>

Expand Down Expand Up @@ -526,7 +538,7 @@ All endpoints prefixed with `/api/v1`. Interactive docs at http://localhost:8080

| Method | Endpoint | Description |
|---|---|---|
| `POST` | `/documents/upload/{workspace_id}` | Upload file |
| `POST` | `/documents/upload/{workspace_id}` | Upload file (supports `custom_metadata`) |
| `GET` | `/documents/{id}/markdown` | Get parsed content |
| `GET` | `/documents/{id}/images` | List extracted images |
| `DELETE` | `/documents/{id}` | Delete document |
Expand All @@ -538,8 +550,8 @@ All endpoints prefixed with `/api/v1`. Interactive docs at http://localhost:8080

| Method | Endpoint | Description |
|---|---|---|
| `POST` | `/rag/query/{workspace_id}` | Hybrid search |
| `POST` | `/rag/chat/{workspace_id}/stream` | Agentic streaming chat (SSE) |
| `POST` | `/rag/query/{workspace_id}` | Hybrid search (supports `metadata_filter`) |
| `POST` | `/rag/chat/{workspace_id}/stream` | Agentic streaming chat (SSE) (supports `metadata_filter`) |
| `GET` | `/rag/chat/{workspace_id}/history` | Chat history |
| `POST` | `/rag/process/{document_id}` | Process document |
| `GET` | `/rag/graph/{workspace_id}` | Knowledge graph data |
Expand Down
89 changes: 89 additions & 0 deletions backend/alembic.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# A generic, single database configuration.

[alembic]
# path to migration scripts
script_location = alembic

# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# file_template = %%(rev)s_%%(slug)s

# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .

# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python-dateutil library that can be
# installed by adding it to the alembic section in setup.py
# timezone =

# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40

# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false

# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false

# version location specification; This defaults
# to alembic/versions. When using multiple version
# directories, initial revisions must be specified with --version-path.
# The path separator used here should be the separator specified by "version_path_separator" below.
# version_locations = %(here)s/bar:%(here)s/bat:alembic/versions

# version path separator; As mentioned above, this is the character used to split
# version_locations.
# version_path_separator = :

# sqlalchemy.url = driver://user:pass@localhost/dbname


[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples

# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME

[loggers]
keys = root,sqlalchemy,alembic

[handlers]
keys = console

[formatters]
keys = generic

[logger_root]
level = WARN
handlers = console
qualname =

[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine

[logger_alembic]
level = INFO
handlers =
qualname = alembic

[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic

[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
1 change: 1 addition & 0 deletions backend/alembic/README
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Generic single-database configuration.
97 changes: 97 additions & 0 deletions backend/alembic/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from logging.config import fileConfig

from sqlalchemy import engine_from_config
from sqlalchemy import pool

from alembic import context

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
import os
import sys

# Add the parent directory of 'alembic' (the backend package root) to the
# Python path so `app.*` imports resolve regardless of the working directory
# alembic is invoked from.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.core.database import Base
from app.core.config import settings

# Import all models here so their tables are registered on Base.metadata,
# which is what `alembic revision --autogenerate` diffs against the live DB.
from app.models.document import Document, DocumentImage, DocumentTable
from app.models.knowledge_base import KnowledgeBase
from app.models.chat_message import ChatMessage

target_metadata = Base.metadata

# Inject the application's database URL into the Alembic config.
# NOTE: set_main_option values go through configparser %-interpolation, so a
# literal '%' in the URL (e.g. in a password) must be escaped as '%%';
# get_main_option()/get_section() un-escape it on read.
config.set_main_option("sqlalchemy.url", settings.DATABASE_URL.replace("%", "%%"))


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    Configures the context with only a database URL — no Engine and thus no
    DBAPI required. Each call to context.execute() emits the generated SQL
    to the script output instead of executing it against a live database.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    Builds a synchronous Engine and binds a live connection to the Alembic
    context. The application uses an async driver (asyncpg), so the URL is
    rewritten to the plain ``postgresql`` dialect before creating the engine —
    Alembic's migration runner here is synchronous.
    """
    section = config.get_section(config.config_ini_section, {})
    db_url = config.get_main_option("sqlalchemy.url")
    if db_url and db_url.startswith("postgresql+asyncpg"):
        # Swap the async driver for the default sync one.
        db_url = db_url.replace("postgresql+asyncpg", "postgresql")
    section["sqlalchemy.url"] = db_url

    # NullPool: migrations are a one-shot process; no need to pool connections.
    connectable = engine_from_config(
        section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()


# Entry point: Alembic executes env.py directly; pick the migration mode
# based on whether the CLI requested SQL script generation (--sql) or a
# live database run.
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
28 changes: 28 additions & 0 deletions backend/alembic/script.py.mako
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision: str = ${repr(up_revision)}
down_revision: Union[str, Sequence[str], None] = ${repr(down_revision)}
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}


def upgrade() -> None:
"""Upgrade schema."""
${upgrades if upgrades else "pass"}


def downgrade() -> None:
"""Downgrade schema."""
${downgrades if downgrades else "pass"}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Add custom_metadata to documents

Revision ID: 2047460692d0
Revises:
Create Date: 2026-03-17 14:09:11.881981

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = '2047460692d0'
down_revision: Union[str, Sequence[str], None] = None
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Upgrade schema: add the `custom_metadata` JSON column to documents."""
    # Nullable so existing rows need no backfill when the column is added.
    op.add_column(
        'documents',
        sa.Column('custom_metadata', sa.JSON(), nullable=True),
    )


def downgrade() -> None:
    """Downgrade schema: drop the `custom_metadata` column from documents."""
    # Destructive: any stored per-document metadata is lost on downgrade.
    op.drop_column('documents', 'custom_metadata')
24 changes: 23 additions & 1 deletion backend/app/api/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import logging
from pathlib import Path

from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, BackgroundTasks
from fastapi import APIRouter, Depends, HTTPException, status, UploadFile, File, BackgroundTasks, Form
from fastapi.responses import PlainTextResponse
from sqlalchemy.ext.asyncio import AsyncSession
import json
from sqlalchemy import select

from app.core.config import settings
Expand Down Expand Up @@ -126,9 +127,29 @@ async def process_document_background(document_id: int, file_path: str, workspac
async def upload_document(
workspace_id: int,
file: UploadFile = File(...),
custom_metadata: str | None = Form(None),
db: AsyncSession = Depends(get_db),
):
"""Upload a document to a knowledge base. Processing must be triggered separately."""

parsed_metadata = None
if custom_metadata:
try:
raw_metadata = json.loads(custom_metadata)
if not isinstance(raw_metadata, list):
raise ValueError("Metadata must be a list of key-value objects")

parsed_metadata = {}
for item in raw_metadata:
if not isinstance(item, dict) or "key" not in item or "value" not in item:
raise ValueError("Each metadata item must contain 'key' and 'value' fields")
parsed_metadata[item["key"]] = item["value"]
except Exception as e:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Invalid custom_metadata format: {e}"
)

result = await db.execute(select(KnowledgeBase).where(KnowledgeBase.id == workspace_id))
kb = result.scalar_one_or_none()

Expand Down Expand Up @@ -163,6 +184,7 @@ async def upload_document(
file_type=ext[1:],
file_size=len(content),
status=DocumentStatus.PENDING,
custom_metadata=parsed_metadata,
)
db.add(document)
await db.commit()
Expand Down
1 change: 1 addition & 0 deletions backend/app/api/rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ async def query_documents(
top_k=request.top_k,
document_ids=request.document_ids,
mode=request.mode,
metadata_filter=request.metadata_filter,
)

chunks_response = []
Expand Down
Loading