Skip to content

Commit b4aeb35

Browse files
committed
Add full path metadata test and last_modified formatting
1 parent 8273720 commit b4aeb35

File tree

4 files changed

+56
-5
lines changed

4 files changed

+56
-5
lines changed

databend_aiserver/udfs/docparse.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -239,6 +239,11 @@ def ai_parse_document(stage_location: StageLocation, path: str) -> Dict[str, Any
239239
# Output shape:
240240
# { "chunks": [...], "metadata": {...}, "error_information": [...] }
241241
resolved_path = resolve_stage_subpath(stage_location, path)
242+
root_prefix = ""
243+
storage_root = stage_location.storage.get("root") if stage_location.storage else ""
244+
if storage_root:
245+
root_prefix = storage_root.rstrip("/")
246+
full_path = f"{root_prefix}/{resolved_path}" if root_prefix else resolved_path
242247

243248
# Keep metadata first for predictable JSON ordering.
244249
payload: Dict[str, Any] = {
@@ -248,7 +253,7 @@ def ai_parse_document(stage_location: StageLocation, path: str) -> Dict[str, Any
248253
"duration_ms": duration_ms,
249254
"file_size": file_size if file_size is not None else 0,
250255
"filename": Path(path).name,
251-
"path": resolved_path or path,
256+
"path": full_path or path,
252257
"timings_ms": {
253258
"convert": (t_convert_end_ns - t_convert_start_ns) / 1_000_000.0,
254259
"chunk": (t_chunk_end_ns - t_convert_end_ns) / 1_000_000.0,

databend_aiserver/udfs/stage.py

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,15 @@
3030
)
3131

3232

33+
def _format_last_modified(value: Any) -> Optional[str]:
    """Return ``value`` as a string suitable for a ``last_modified`` field.

    Args:
        value: The raw last-modified value, or ``None`` when unavailable.

    Returns:
        ``None`` when ``value`` is ``None``; the result of
        ``value.isoformat()`` when ``value`` exposes a callable
        ``isoformat`` attribute; otherwise ``str(value)``.
    """
    if value is None:
        return None
    # OpenDAL returns datetime objects; fall back to string otherwise.
    # The attribute must be callable: an object that merely carries an
    # ``isoformat`` data attribute should take the string fallback rather
    # than raise TypeError when it is invoked.
    isoformat = getattr(value, "isoformat", None)
    if callable(isoformat):
        return isoformat()
    return str(value)
40+
41+
3342
def _collect_stage_files(
3443
stage_location: StageLocation, max_files: Optional[int]
3544
) -> tuple[List[Dict[str, Any]], bool]:
@@ -85,9 +94,7 @@ def _collect_stage_files(
8594
if metadata.etag:
8695
file_info["etag"] = metadata.etag
8796
if hasattr(metadata, "last_modified"):
88-
lm = metadata.last_modified
89-
if lm:
90-
file_info["last_modified"] = lm
97+
file_info["last_modified"] = _format_last_modified(metadata.last_modified)
9198

9299
entries.append(file_info)
93100
if max_entries is not None and len(entries) >= max_entries:
@@ -167,7 +174,7 @@ def ai_list_files(
167174

168175
# Convert mode to string if it exists, otherwise None
169176
mode_str = str(metadata.mode) if metadata.mode is not None else None
170-
last_modified = getattr(metadata, "last_modified", None)
177+
last_modified = _format_last_modified(getattr(metadata, "last_modified", None))
171178

172179
yield {
173180
"stage_name": stage_location.stage_name,

tests/unit/conftest.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,3 +52,20 @@ def memory_stage() -> StageLocation:
5252
operator.write(f"{RELATIVE_PATH}/lorem_ipsum.docx", DOCX_SRC.read_bytes())
5353
operator.write(f"{RELATIVE_PATH}/subdir/note.txt", b"hello from memory")
5454
return stage
55+
56+
57+
@pytest.fixture
def memory_stage_with_root() -> StageLocation:
    """Provide an in-memory external stage whose storage config has a root.

    The stage's storage dict carries ``"root": "s3://wizardbend/dataset"``.
    Before the stage is returned, the ``RELATIVE_PATH`` directory is created
    through the stage's operator and the sample PDF is written into it as
    ``2206.01062.pdf``.
    """
    storage_config = {"type": "memory", "root": "s3://wizardbend/dataset"}
    rooted_stage = StageLocation(
        name="stage",
        stage_name="memory_stage_root",
        stage_type="External",
        storage=storage_config,
        relative_path=RELATIVE_PATH,
        raw_info={},
    )

    # Seed the stage: directory first, then the single PDF document.
    op = get_operator(rooted_stage)
    op.create_dir(f"{RELATIVE_PATH}/")
    pdf_bytes = PDF_SRC.read_bytes()
    op.write(f"{RELATIVE_PATH}/2206.01062.pdf", pdf_bytes)
    return rooted_stage

tests/unit/test_docparse_path.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Copyright 2025 Databend Labs
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from databend_aiserver.udfs.docparse import ai_parse_document
16+
17+
18+
def test_docparse_metadata_path_uses_root(memory_stage_with_root):
    """Check that parsed-document metadata reports the root-prefixed path.

    Parses ``2206.01062.pdf`` from the rooted in-memory stage and verifies
    both the ``path`` and ``filename`` entries of the returned metadata.
    """
    document_name = "2206.01062.pdf"
    result = ai_parse_document(memory_stage_with_root, document_name)
    metadata = result.get("metadata", {})
    # The path must include the storage root ahead of the stage-relative part.
    assert metadata["path"] == "s3://wizardbend/dataset/data/2206.01062.pdf"
    assert metadata["filename"] == "2206.01062.pdf"

0 commit comments

Comments
 (0)