From a4346f799da2cb4a0533accd161f811373bbfb4a Mon Sep 17 00:00:00 2001
From: Artsiom Beida
Date: Wed, 8 Oct 2025 10:38:49 +0200
Subject: [PATCH 01/18] UI fixes: Estonian translation did not show the report
agency name and URL in the report delete confirmation form; fix report links
to problematic URLs
---
GUI/src/pages/Reports/Report.tsx | 2 +-
GUI/src/pages/Reports/index.tsx | 1 +
GUI/translations/en/common.json | 2 +-
GUI/translations/et/common.json | 2 +-
4 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/GUI/src/pages/Reports/Report.tsx b/GUI/src/pages/Reports/Report.tsx
index 7f45aca..089c89a 100644
--- a/GUI/src/pages/Reports/Report.tsx
+++ b/GUI/src/pages/Reports/Report.tsx
@@ -148,7 +148,7 @@ const Report: FC = () => {
cell: ({ row }) => (
{
{t('reports.deleteConfirmation', {
agency: deleteModal.agencyName,
domain: deleteModal.url,
+ interpolation: { escapeValue: false }
})}
diff --git a/GUI/translations/en/common.json b/GUI/translations/en/common.json
index b5e25ea..2a8c810 100644
--- a/GUI/translations/en/common.json
+++ b/GUI/translations/en/common.json
@@ -615,7 +615,7 @@
"startedAt": "Started at",
"finishedAt": "Finished at",
"deleteTitle": "Delete Report",
- "deleteConfirmation": "Are you sure you want to delete the report?",
+ "deleteConfirmation": "Are you sure you want to delete the report {{agency}} - {{domain}}?",
"deleteSuccess": "Report deleted successfully",
"deleteError": "Failed to delete report",
"errorType": "Error type",
diff --git a/GUI/translations/et/common.json b/GUI/translations/et/common.json
index a2f92c7..8ed8534 100644
--- a/GUI/translations/et/common.json
+++ b/GUI/translations/et/common.json
@@ -615,7 +615,7 @@
"startedAt": "Alustatud",
"finishedAt": "Lõpetatud",
"deleteTitle": "Kustuta aruanne",
- "deleteConfirmation": "Kas oled kindel, et soovid kustutada aruande asutusele {agency} - {domain}?",
+ "deleteConfirmation": "Kas oled kindel, et soovid kustutada aruande asutusele {{agency}} - {{domain}}?",
"deleteSuccess": "Aruanne kustutatud edukalt",
"deleteError": "Aruande kustutamine ebaõnnestus",
"errorType": "Vea tüüp",
From 78dbc7e92d42dc0c00e63549bfa89f1f55541dd0 Mon Sep 17 00:00:00 2001
From: Artsiom Beida
Date: Wed, 8 Oct 2025 17:04:19 +0200
Subject: [PATCH 02/18] Increase scrapper timeout to 30s to account for slow
archive pages
---
scrapper/scrapper/settings.py | 6 +++---
scrapper/scrapper/spiders/base_spider.py | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/scrapper/scrapper/settings.py b/scrapper/scrapper/settings.py
index ccdd372..a54195d 100644
--- a/scrapper/scrapper/settings.py
+++ b/scrapper/scrapper/settings.py
@@ -106,14 +106,14 @@
ALLOWED_FILETYPES = os.environ.get('SUPPORTED_TYPES', '.html,.docx,.doc,.pdf').split(',')
SCRAPED_DIRECTORY = os.environ.get('SCRAPED_DIRECTORY', "/scrapped-data")
RUUTER_INTERNAL = os.environ.get('RUUTER_INTERNAL', "http://ruuter-internal:8089")
-DOWNLOAD_DELAY = 0.1
+DOWNLOAD_DELAY = 0.2
DOWNLOAD_HANDLERS = {
"http": "scrapper.download_handler.DownloadHandler",
"https": "scrapper.download_handler.DownloadHandler",
}
-PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 10_000
+PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 30_000
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
PLAYWRIGHT_MAX_CONTEXTS = 1
PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 1
-RETRY_TIMES = 3
+RETRY_TIMES = 10
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index ee2c609..dfcd2dc 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -48,7 +48,7 @@ def get_meta(self):
'playwright': True,
'playwright_include_page': True,
'playwright_page_goto_kwargs': {
- 'timeout': 5_000,
+ 'timeout': 30_000,
'wait_until': 'load',
},
"playwright_context_kwargs": {
From 952f92c035dba7a01c600cd0b9b61580d98b6f29 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Thu, 16 Oct 2025 01:17:54 +0500
Subject: [PATCH 03/18] playwright download fix
---
scrapper/scrapper/download_handler.py | 76 +++++++++++++++++--
scrapper/scrapper/spiders/base_spider.py | 28 +++++--
.../scrapper/spiders/uploaded_file_spider.py | 7 ++
3 files changed, 96 insertions(+), 15 deletions(-)
diff --git a/scrapper/scrapper/download_handler.py b/scrapper/scrapper/download_handler.py
index 6870353..a124ba9 100644
--- a/scrapper/scrapper/download_handler.py
+++ b/scrapper/scrapper/download_handler.py
@@ -1,25 +1,87 @@
import asyncio
from scrapy import Request, Spider
from scrapy.http import Response
+from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
+from playwright._impl._errors import Error as PlaywrightError
+from twisted.internet import defer
class DownloadHandler(ScrapyPlaywrightDownloadHandler):
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.args = args
- self.kwargs = kwargs
+ def __init__(self, crawler):
+ super().__init__(crawler)
+ self.crawler = crawler
+ # Initialize standard HTTP handler for non-Playwright requests
+ self._http_handler = HTTPDownloadHandler(
+ settings=crawler.settings,
+ crawler=crawler
+ )
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
+
+ def download_request(self, request: Request, spider: Spider):
+ """
+ Main entry point for downloading requests.
+ Check if Playwright is needed, otherwise use direct HTTP.
+ """
+ # Check if Playwright is requested in meta
+ if not request.meta.get('playwright'):
+ # Use direct HTTP download for uploaded files
+ spider.logger.info(f'Direct HTTP download (no Playwright): {request.url}')
+ return self._http_handler.download_request(request, spider)
+
+ # Use Playwright for regular web pages
+ spider.logger.info(f'Playwright download: {request.url}')
+ return super().download_request(request, spider)
async def _download_request(self, request: Request, spider: Spider) -> Response:
+ """
+ Internal async download method with fallback for download errors.
+ This is called by the parent's download_request when using Playwright.
+ """
try:
- spider.logger.info(f'request started: {request.url}')
+ spider.logger.info(f'Playwright request started: {request.url}')
async with asyncio.timeout(30):
r = await super()._download_request(request, spider)
- spider.logger.info(f'request finished: {request.url}')
+ spider.logger.info(f'Playwright request finished: {request.url}')
return r
+ except Exception as e:
+ # Catch "Download is starting" and similar download errors as safety net
+ if "Download is starting" in str(e) or "net::ERR_ABORTED" in str(e):
+ spider.logger.info(f'Download error detected for {request.url}, falling back to direct HTTP download')
+ # Fall back to direct HTTP download using Scrapy's HTTP handler (consistent with Method 1)
+ try:
+ import requests
+ # Use requests for simplicity in async context
+ response = requests.get(
+ request.url,
+ headers=dict(request.headers.to_unicode_dict()),
+ timeout=30
+ )
+ response.raise_for_status()
+
+ spider.logger.info(f'Direct HTTP download completed: {request.url} ({len(response.content)} bytes)')
+
+ from scrapy.http import HtmlResponse
+ return HtmlResponse(
+ url=response.url,
+ status=response.status_code,
+ headers=dict(response.headers),
+ body=response.content,
+ encoding='utf-8',
+ request=request,
+ )
+ except Exception as download_error:
+ spider.logger.error(f'Direct HTTP download failed for {request.url}: {download_error}')
+ raise
+ else:
+ # Other Playwright errors - re-raise
+ spider.logger.error(f'Playwright error for {request.url}: {str(e)}')
+ raise
except TimeoutError:
spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again')
await self._close()
- super().__init__(*self.args, **self.kwargs)
await self._launch()
return await self._download_request(request, spider)
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index dfcd2dc..b6c2f15 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -109,17 +109,29 @@ async def errback(self, failure: Failure):
await page.close()
async def parse(self, response: Response, **kwargs):
- async with self.close_page(response) as page:
- self.check_source_is_stopping()
+ self.check_source_is_stopping()
- page: Page
-
- file_extension = self.guess_file_extension(
- response.headers.get(b'Content-Type', 'text/html').decode('utf-8')
- )
+ file_extension = self.guess_file_extension(
+ response.headers.get(b'Content-Type', 'text/html').decode('utf-8')
+ )
+ # Check if Playwright page is available (might not be if direct HTTP download was used)
+ playwright_page = response.meta.get("playwright_page")
+ if playwright_page:
+ # Use Playwright page for title extraction
+ async with self.close_page(response) as page:
+ page: Page
+ if file_extension == '.html':
+ title = await page.title()
+ else:
+ title = response.url
+ else:
+ # Direct HTTP download (no Playwright page available)
if file_extension == '.html':
- title = await page.title()
+ # Extract title from HTML using BeautifulSoup
+ soup = BeautifulSoup(response.body, 'lxml')
+ title_tag = soup.find('title')
+ title = title_tag.get_text() if title_tag else response.url
else:
title = response.url
diff --git a/scrapper/scrapper/spiders/uploaded_file_spider.py b/scrapper/scrapper/spiders/uploaded_file_spider.py
index 86cb9ab..fce08cc 100644
--- a/scrapper/scrapper/spiders/uploaded_file_spider.py
+++ b/scrapper/scrapper/spiders/uploaded_file_spider.py
@@ -7,6 +7,13 @@
class UploadedFileSpider(SpecifiedPagesSpider):
name = 'uploaded_file'
+ def get_meta(self):
+ """
+ Override to disable Playwright for uploaded files.
+ Uploaded files from S3 should use direct HTTP download.
+ """
+ return {}
+
async def parse(self, response: Response, **kwargs):
base_id, _ = self.get_base_id_and_hash(response.request.url)
From f9a5275f6de59d6933cf7aff2de5374bdf7ed0d7 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Thu, 16 Oct 2025 01:19:07 +0500
Subject: [PATCH 04/18] remove .txt from upload
---
GUI/src/components/FileUploader/FileUploader.tsx | 2 +-
GUI/src/pages/Agency/Agency.tsx | 2 +-
GUI/src/pages/UploadedFiles/index.tsx | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/GUI/src/components/FileUploader/FileUploader.tsx b/GUI/src/components/FileUploader/FileUploader.tsx
index 8c37fd0..4de9583 100644
--- a/GUI/src/components/FileUploader/FileUploader.tsx
+++ b/GUI/src/components/FileUploader/FileUploader.tsx
@@ -36,7 +36,7 @@ const FileUploader: FC = ({
onFilesChange,
onFileDelete,
maxFileSize = 30 * 1024 * 1024, // 30MB default
- acceptedTypes = '.pdf,.doc,.docx,.txt,.html,.htm',
+ acceptedTypes = '.pdf,.doc,.docx,.html,.htm',
multiple = true,
className = '',
uploadProgress,
diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index 0876918..d2b9a9c 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -684,7 +684,7 @@ const Agency: FC = () => {
onFilesChange={handleFilesChange}
onFileDelete={handleFileDelete}
maxFileSize={30 * 1024 * 1024} // 30MB
- acceptedTypes=".pdf,.doc,.docx,.txt,.html,.htm"
+ acceptedTypes=".pdf,.doc,.docx,.html,.htm"
multiple={true}
uploadProgress={uploadProgress} // Pass upload progress to FileUploader
/>
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 6ed3aeb..bb0be3d 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -811,7 +811,7 @@ const UploadedFiles: FC = () => {
onFilesChange={handleFilesChange}
onFileDelete={handleFileDelete}
maxFileSize={30 * 1024 * 1024} // 30MB
- acceptedTypes=".pdf,.doc,.docx,.txt,.html,.htm"
+ acceptedTypes=".pdf,.doc,.docx,.html,.htm"
multiple={true}
uploadProgress={uploadProgress}
/>
From 7321342b2573b5bcba5ce6f1d8afd0773105314a Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Thu, 16 Oct 2025 21:34:34 +0500
Subject: [PATCH 05/18] Add Language to metadata
---
cleaning/requirements.txt | 1 +
cleaning/worker/tasks.py | 11 +++++++++++
file-processing/app/services/zip_service.py | 5 +++--
3 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/cleaning/requirements.txt b/cleaning/requirements.txt
index 00c3182..986160a 100644
--- a/cleaning/requirements.txt
+++ b/cleaning/requirements.txt
@@ -3,3 +3,4 @@ uvicorn==0.34.2
unstructured[pdf,docx,doc]==0.18.2
pydantic-settings==2.10.1
beautifulsoup4==4.13.4
+langdetect==1.0.9
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index f234036..974e6ae 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -2,6 +2,7 @@
import logging
import requests
+from langdetect import detect, LangDetectException
from unstructured.partition.auto import partition
from bs4 import BeautifulSoup
@@ -55,6 +56,15 @@ def clean_file_task(entity: EntityToClean):
cleaned_text = clean_any_file(entity)
logger.info(f'Cleaned as unstructured file for {entity.file_path.as_posix()}')
+ # Detect language from cleaned text
+ detected_language = None
+ if cleaned_text and len(cleaned_text.strip()) > 0:
+ try:
+ detected_language = detect(cleaned_text)
+ print(f'Detected language: {detected_language} for {entity.file_path.as_posix()}')
+ except LangDetectException as e:
+ print(f'Language detection failed for {entity.file_path.as_posix()}: {e}')
+
cleaned_text_filename = entity.directory_path / 'cleaned.txt'
with cleaned_text_filename.open("w") as f:
@@ -74,6 +84,7 @@ def clean_file_task(entity: EntityToClean):
cleaned_metadata_filename = entity.directory_path / "cleaned.meta.json"
with cleaned_metadata_filename.open("w") as f:
metadata['metadata']['cleaned'] = True
+ metadata['language'] = detected_language
json.dump(metadata, f)
r = requests.post(
diff --git a/file-processing/app/services/zip_service.py b/file-processing/app/services/zip_service.py
index bc18283..d842b4c 100644
--- a/file-processing/app/services/zip_service.py
+++ b/file-processing/app/services/zip_service.py
@@ -139,12 +139,13 @@ def exclusion_filter(relative_path: str) -> bool:
# Create zip file in temp directory
base_name = os.path.join(temp_dir, "folder_content")
-
- logger.info(f"Creating zip file {temp_zip_path} with {successful_count} files")
+
shutil.make_archive(base_name, 'zip', local_folder_path)
temp_zip_path = base_name + ".zip"
+ logger.info(f"Creating zip file {temp_zip_path} with {successful_count} files")
+
# Get zip file size
zip_size = os.path.getsize(temp_zip_path)
From bbaa0b2f2a6fdf3489f79c5adc1d5ea351f5c350 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Thu, 16 Oct 2025 21:35:54 +0500
Subject: [PATCH 06/18] lang metadata
---
scrapper/scrapper/items.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/scrapper/scrapper/items.py b/scrapper/scrapper/items.py
index fc18309..fecb6a1 100644
--- a/scrapper/scrapper/items.py
+++ b/scrapper/scrapper/items.py
@@ -25,6 +25,7 @@ class MetadataItem:
version: str = "1.0"
created_at: str = field(default_factory=lambda: str(datetime.now()))
edited_at: str | None = None
+ language: Optional[str] = None
@dataclass
From ef25df999d64a9f87c6fc3a0fcdf80c9b37b2de4 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Thu, 16 Oct 2025 21:36:10 +0500
Subject: [PATCH 07/18] fix callbacks
---
scrapper/scrapper/download_handler.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/scrapper/scrapper/download_handler.py b/scrapper/scrapper/download_handler.py
index a124ba9..5b6f1ee 100644
--- a/scrapper/scrapper/download_handler.py
+++ b/scrapper/scrapper/download_handler.py
@@ -47,11 +47,17 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
r = await super()._download_request(request, spider)
spider.logger.info(f'Playwright request finished: {request.url}')
return r
+ except TimeoutError:
+ spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again')
+ await self._close()
+ super().__init__(self.crawler) # Re-initialize with the same crawler
+ await self._launch()
+ return await self._download_request(request, spider)
except Exception as e:
# Catch "Download is starting" and similar download errors as safety net
if "Download is starting" in str(e) or "net::ERR_ABORTED" in str(e):
spider.logger.info(f'Download error detected for {request.url}, falling back to direct HTTP download')
- # Fall back to direct HTTP download using Scrapy's HTTP handler (consistent with Method 1)
+ # Fall back to direct HTTP download
try:
import requests
# Use requests for simplicity in async context
@@ -80,8 +86,3 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
# Other Playwright errors - re-raise
spider.logger.error(f'Playwright error for {request.url}: {str(e)}')
raise
- except TimeoutError:
- spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again')
- await self._close()
- await self._launch()
- return await self._download_request(request, spider)
From 9c4a991b0e3168e42afee28307b1989d6271d973 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Sat, 18 Oct 2025 00:28:30 +0500
Subject: [PATCH 08/18] ignore archived urls
---
.../app/services/download_service.py | 21 +++----
scrapper/scrapper/spiders/base_spider.py | 7 ++-
.../spiders/sitemap_collect_spider.py | 6 ++
scrapper/scrapper/utils.py | 56 +++++++++++++++++++
4 files changed, 79 insertions(+), 11 deletions(-)
diff --git a/file-processing/app/services/download_service.py b/file-processing/app/services/download_service.py
index d5c9702..0d9d46a 100644
--- a/file-processing/app/services/download_service.py
+++ b/file-processing/app/services/download_service.py
@@ -19,11 +19,12 @@
FileDeleteResult
)
from app.services.blob_storage import storage_provider, BlobStorageException
+from app.core.config import settings
logger = logging.getLogger(__name__)
-# Volume path configuration
-VOLUME_PATH = '/app/data'
+# Source path configuration from settings
+SOURCE_PATH = settings.source_path
# In-memory task store for download tasks
_download_tasks: Dict[str, dict] = {}
@@ -77,7 +78,7 @@ def process_single_file_download(file_item: FileDownloadItem) -> FileDownloadRes
clean_s3_path = parts[0]
# Use volume path configuration
- local_path_str = f"{VOLUME_PATH}/{file_item.local_path}"
+ local_path_str = f"{SOURCE_PATH}/{file_item.local_path}"
local_path = Path(local_path_str)
# Ensure local directory exists
@@ -109,7 +110,7 @@ def process_single_file_download(file_item: FileDownloadItem) -> FileDownloadRes
error_msg = f"Unexpected error: {str(e)}"
return FileDownloadResult(
s3_path=file_item.s3_path,
- local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+ local_path=f"{SOURCE_PATH}/{file_item.local_path}",
status="failed",
error_message=error_msg,
is_folder=False,
@@ -131,7 +132,7 @@ def process_folder_download(file_item: FileDownloadItem) -> FileDownloadResult:
clean_s3_path = parts[0]
# Use volume path configuration
- local_folder_path = f"{VOLUME_PATH}/{file_item.local_path}"
+ local_folder_path = f"{SOURCE_PATH}/{file_item.local_path}"
# Ensure local directory exists
Path(local_folder_path).mkdir(parents=True, exist_ok=True)
@@ -176,7 +177,7 @@ def process_folder_download(file_item: FileDownloadItem) -> FileDownloadResult:
error_msg = f"Folder download error: {str(e)}"
return FileDownloadResult(
s3_path=file_item.s3_path,
- local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+ local_path=f"{SOURCE_PATH}/{file_item.local_path}",
status="failed",
error_message=error_msg,
is_folder=True,
@@ -218,7 +219,7 @@ def process_download_task(task_id: str) -> None:
error_msg = f"Unexpected error processing {file_item.s3_path}: {str(e)}"
results.append(FileDownloadResult(
s3_path=file_item.s3_path,
- local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+ local_path=f"{SOURCE_PATH}/{file_item.local_path}",
status="failed",
error_message=error_msg,
is_folder=file_item.is_folder,
@@ -328,7 +329,7 @@ def delete_files_from_volume(request: DeleteFromVolumeRequest) -> DeleteFromVolu
for file_item in request.files:
try:
# Use volume path configuration
- local_path_str = f"{VOLUME_PATH}/{file_item.local_path}"
+ local_path_str = f"{SOURCE_PATH}/{file_item.local_path}"
local_path = Path(local_path_str)
if local_path.exists():
@@ -369,7 +370,7 @@ def delete_files_from_volume(request: DeleteFromVolumeRequest) -> DeleteFromVolu
except Exception as e:
error_msg = f"Unexpected error: {str(e)}"
results.append(FileDeleteResult(
- local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+ local_path=f"{SOURCE_PATH}/{file_item.local_path}",
status="failed",
error_message=error_msg
))
@@ -445,7 +446,7 @@ def download_files_to_volume(request: DownloadToVolumeRequest) -> DownloadToVolu
error_msg = f"Unexpected error processing {file_item.s3_path}: {str(e)}"
results.append(FileDownloadResult(
s3_path=file_item.s3_path,
- local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+ local_path=f"{SOURCE_PATH}/{file_item.local_path}",
status="failed",
error_message=error_msg,
is_folder=file_item.is_folder,
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index b6c2f15..6595757 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -16,7 +16,7 @@
from api.models import BaseObject
from scrapper.items import FileItem, MetadataItem, Metadata, ScrappedItem
-from scrapper.utils import send_error
+from scrapper.utils import send_error, is_archive_url
class BaseSpider(Spider):
@@ -111,6 +111,11 @@ async def errback(self, failure: Failure):
async def parse(self, response: Response, **kwargs):
self.check_source_is_stopping()
+ # Check if URL is an archive page and skip if it is
+ if is_archive_url(response.url):
+ self.logger.info(f'Skipping archive URL: {response.url}')
+ return
+
file_extension = self.guess_file_extension(
response.headers.get(b'Content-Type', 'text/html').decode('utf-8')
)
diff --git a/scrapper/scrapper/spiders/sitemap_collect_spider.py b/scrapper/scrapper/spiders/sitemap_collect_spider.py
index 89846b7..5d6301c 100644
--- a/scrapper/scrapper/spiders/sitemap_collect_spider.py
+++ b/scrapper/scrapper/spiders/sitemap_collect_spider.py
@@ -6,6 +6,7 @@
from api.models import SitemapCollectScrapperTask
from scrapper.spiders.base_spider import BaseSpider
+from scrapper.utils import is_archive_url
class SitemapCollectSpider(BaseSpider):
@@ -143,6 +144,11 @@ async def parse(self, response: Response, **kwargs):
if self.get_pure_domain(next_url) not in self.pure_allowed_domains:
continue
+ # Skip archive URLs
+ if is_archive_url(next_url):
+ self.logger.info(f'Skipping archive URL: {next_url}')
+ continue
+
if next_url not in self.visited_urls:
self.visited_urls.add(next_url)
self.logger.info(f'Schedule scrape for url: {next_url}')
diff --git a/scrapper/scrapper/utils.py b/scrapper/scrapper/utils.py
index 19f6e59..b082621 100644
--- a/scrapper/scrapper/utils.py
+++ b/scrapper/scrapper/utils.py
@@ -3,6 +3,7 @@
import functools
import typing
import requests
+from urllib.parse import urlparse
from scrapper.items import ScrappedItem
@@ -66,3 +67,58 @@ def decorator(self, spider: BaseSpider):
return r
return decorator
+
+
+# Archive URL detection keywords in multiple languages
+ARCHIVE_KEYWORDS = [
+ # Estonian
+ 'arhiiv', 'arhiivi', 'archive',
+ # English
+ 'archived', 'archives',
+ # Russian transliteration
+ 'arkhiv', 'arhiv',
+]
+
+
+def is_archive_url(url: str) -> bool:
+ """
+ Check if a URL points to an archived/historical page.
+
+ Detects archive pages by checking for archive-related keywords in:
+ - Subdomain (e.g., arhiiv.example.ee)
+ - Path segments (e.g., example.ee/arhiiv/2020/)
+
+ Args:
+ url: The URL to check
+
+ Returns:
+ True if the URL appears to be an archive page, False otherwise
+
+ Examples:
+ >>> is_archive_url('https://arhiiv.lastekaitseliit.ee/et/2016/06/7203/')
+ True
+ >>> is_archive_url('https://example.com/arhiiv/old-content')
+ True
+ >>> is_archive_url('https://example.com/current-page')
+ False
+ """
+ try:
+ parsed = urlparse(url.lower())
+
+ # Check subdomain for archive keywords
+ hostname_parts = parsed.hostname.split('.') if parsed.hostname else []
+ for part in hostname_parts:
+ if any(keyword in part for keyword in ARCHIVE_KEYWORDS):
+ return True
+
+ # Check path segments for archive keywords
+ path_parts = parsed.path.split('/')
+ for part in path_parts:
+ if any(keyword in part for keyword in ARCHIVE_KEYWORDS):
+ return True
+
+ return False
+
+ except Exception:
+ # If URL parsing fails, don't filter it out
+ return False
From f6224a84487f1aa8c26e8db5ca129195e05e7a96 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Mon, 20 Oct 2025 22:52:53 +0500
Subject: [PATCH 09/18]
https://github.com/buerokratt/Common-Knowledge/issues/88
---
.../ckb/TEMPLATES/pipeline/clean-file.yml | 10 ++++
cleaning/api/models.py | 5 +-
cleaning/worker/tasks.py | 53 +++++++++++++++++--
3 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml b/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml
index c6a5817..26f3b70 100644
--- a/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml
+++ b/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml
@@ -4,6 +4,11 @@ extractRequestData:
meta_data_path: ${incoming.body.meta_data_path}
directory_path: ${incoming.body.directory_path}
source_file_id: ${incoming.body.source_file_id}
+ logs_path: ${incoming.body.logs_path}
+ url: ${incoming.body.url}
+ source_base_id: ${incoming.body.source_base_id}
+ agency_base_id: ${incoming.body.agency_base_id}
+ source_run_report_base_id: ${incoming.body.source_run_report_base_id}
cleanData:
@@ -15,6 +20,11 @@ cleanData:
meta_data_path: ${meta_data_path}
directory_path: ${directory_path}
source_file_id: ${source_file_id}
+ logs_path: ${logs_path}
+ url: ${url}
+ source_base_id: ${source_base_id}
+ agency_base_id: ${agency_base_id}
+ source_run_report_base_id: ${source_run_report_base_id}
result: cleanedResult
diff --git a/cleaning/api/models.py b/cleaning/api/models.py
index ff96d4e..833db05 100644
--- a/cleaning/api/models.py
+++ b/cleaning/api/models.py
@@ -7,4 +7,7 @@ class EntityToClean(BaseModel):
directory_path: DirectoryPath
source_file_id: str
url: str
- logs_path: FilePath
\ No newline at end of file
+ logs_path: FilePath
+ source_base_id: str
+ agency_base_id: str
+ source_run_report_base_id: str
\ No newline at end of file
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 974e6ae..1bcabe2 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -1,10 +1,12 @@
import json
import logging
+import re
import requests
from langdetect import detect, LangDetectException
from unstructured.partition.auto import partition
+from unstructured.partition.html import partition_html
from bs4 import BeautifulSoup
from api.config import settings
@@ -14,11 +16,52 @@
logger = logging.getLogger(__name__)
+def normalize_newlines(text: str) -> str:
+ """
+ Normalize excessive newlines to maximum of 2 consecutive newlines.
+ Replace 3 or more consecutive newlines with exactly 2 newlines.
+ """
+ # Replace 3 or more newlines with exactly 2 newlines
+ normalized_text = re.sub(r'\n{3,}', '\n\n', text)
+ return normalized_text
+
+
def clean_html(entity: EntityToClean):
- with entity.file_path.open('r') as f:
- soup = BeautifulSoup(f.read(), 'lxml')
+ """Clean HTML files using a multi-step approach for better content extraction."""
+ soup = BeautifulSoup(open(entity.file_path.as_posix(), 'r'), 'lxml')
+
+ # Step 1: Check if there's a element and use only that
+ main_element = soup.find('main')
+ if main_element:
+ logger.info(f'Found element, using only main content for {entity.file_path.as_posix()}')
+ # Remove unwanted elements from main
+ for element in main_element(['script', 'style', 'nav', 'aside', 'form']):
+ element.decompose()
+ cleaned_text = main_element.get_text(separator='\n', strip=True)
+ logger.info(f'Extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
+ return cleaned_text
+
+ # Step 2: Try partition_html with skip_headers_and_footers flag
+ logger.info(f'No element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
+ partitioned = partition_html(
+ filename=entity.file_path.as_posix(),
+ languages=settings.languages,
+ skip_headers_and_footers=True
+ )
+ cleaned_text = '\n\n'.join([str(el) for el in partitioned])
+
+ # Step 3: If partition_html returns empty, fallback to BeautifulSoup
+ if len(partitioned) == 0:
+ logger.warning(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
- return soup.get_text()
+ # Remove unwanted elements (headers, footers, nav, scripts, styles)
+ for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
+ element.decompose()
+
+ cleaned_text = soup.get_text(separator='\n', strip=True)
+ logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
+
+ return cleaned_text
def clean_any_file(entity: EntityToClean):
@@ -56,6 +99,10 @@ def clean_file_task(entity: EntityToClean):
cleaned_text = clean_any_file(entity)
logger.info(f'Cleaned as unstructured file for {entity.file_path.as_posix()}')
+ # Normalize excessive newlines (max 2 consecutive newlines)
+ cleaned_text = normalize_newlines(cleaned_text)
+ logger.info(f'Normalized newlines for {entity.file_path.as_posix()}')
+
# Detect language from cleaned text
detected_language = None
if cleaned_text and len(cleaned_text.strip()) > 0:
From f588e979d22964e49040b2bddbee637e3396e68e Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Tue, 21 Oct 2025 00:23:06 +0500
Subject: [PATCH 10/18]
https://github.com/buerokratt/Common-Knowledge/issues/90
---
.../ckb/hbs/extract_file_names.handlebars | 5 ++
.../check_duplicate_file_names.sql | 34 +++++++++++++
.../get-upload-urls-for-existing-source.yml | 38 ++++++++++++++
GUI/src/pages/UploadedFiles/index.tsx | 51 ++++++++++++++++---
GUI/src/services/api.ts | 17 +++++--
5 files changed, 134 insertions(+), 11 deletions(-)
create mode 100644 DSL/DMapper/ckb/hbs/extract_file_names.handlebars
create mode 100644 DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql
diff --git a/DSL/DMapper/ckb/hbs/extract_file_names.handlebars b/DSL/DMapper/ckb/hbs/extract_file_names.handlebars
new file mode 100644
index 0000000..cb14b65
--- /dev/null
+++ b/DSL/DMapper/ckb/hbs/extract_file_names.handlebars
@@ -0,0 +1,5 @@
+[
+ {{#each files}}
+ "{{this.name}}"{{#unless @last}},{{/unless}}
+ {{/each}}
+]
diff --git a/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql b/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql
new file mode 100644
index 0000000..8532779
--- /dev/null
+++ b/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql
@@ -0,0 +1,34 @@
+/*
+declaration:
+ version: 0.1
+ description: "Check if file names already exist for a source (latest version, non-deleted)"
+ method: post
+ accepts: json
+ returns: json
+ namespace: source_file
+ allowlist:
+ body:
+ - field: source_id
+ type: string
+ description: "Source base ID"
+ - field: file_names
+ type: string
+ description: "Comma-separated file names to check"
+ response:
+ fields:
+ - field: file_name
+ type: string
+ description: "Duplicate file name found"
+*/
+WITH latest_files AS (
+ SELECT DISTINCT ON (base_id)
+ base_id, file_name, is_deleted
+ FROM data_collection.source_file
+ WHERE source_base_id = :source_id::UUID
+ AND type = 'uploaded_file'
+ ORDER BY base_id, updated_at DESC
+)
+SELECT DISTINCT file_name
+FROM latest_files
+WHERE is_deleted = FALSE
+ AND file_name = ANY(string_to_array(:fileNames, ','));
diff --git a/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml b/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml
index 6206ba0..f637e7c 100644
--- a/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml
+++ b/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml
@@ -29,6 +29,32 @@ extractRequestData:
files: "${incoming.body.files}"
expires_in: "${incoming.body.expiresIn || 3600}"
+extractFileNames:
+ call: http.post
+ args:
+ url: "[#CKB_DMAPPER_HBS]/extract_file_names"
+ headers:
+ type: json
+ body:
+ files: ${files}
+ result: fileNamesResult
+
+checkDuplicateFileNames:
+ call: http.post
+ args:
+ url: "[#CKB_RESQL]/source_file/check_duplicate_file_names"
+ headers:
+ type: json
+ body:
+ source_id: ${source_base_id}
+ fileNames: ${fileNamesResult.response.body?.join(',') ?? ''}
+ result: duplicatesResult
+
+validateNoDuplicates:
+ switch:
+ - condition: ${duplicatesResult.response.body !== null && duplicatesResult.response.body.length > 0}
+ next: returnDuplicateError
+
generateSourceFileIds:
call: http.post
args:
@@ -83,3 +109,15 @@ prepareResponse:
returnResponse:
return: ${response}
next: end
+
+returnDuplicateError:
+ assign:
+ errorResponse:
+ error: "Duplicate file names found"
+ duplicateFiles: ${duplicatesResult.response.body}
+ next: returnDuplicateErrorResponse
+
+returnDuplicateErrorResponse:
+ return: ${errorResponse}
+ status: 400
+ next: end
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index bb0be3d..1e4f1ec 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -252,21 +252,56 @@ const UploadedFiles: FC = () => {
currentFileName: '',
});
+ // Extract error message from various formats
+ // Backend returns: {response: {error: "...", duplicateFiles: [...]}}
+ const responseData =
+ error.response?.data?.response || error.response?.data || {};
+
+ const errorMessage =
+ responseData.error || error.message || t('knowledgeBase.uploadError');
+
+ // Get duplicate file names if available
+ const duplicateFiles = responseData.duplicateFiles || [];
+
// Set failed files to error status
setFormData((prev) => ({
...prev,
- files: prev.files.map((file) => ({
- ...file,
- status:
- file.status === 'uploading' ? ('error' as const) : file.status,
- message: file.status === 'uploading' ? error.message : file.message,
- })),
+ files: prev.files.map((file) => {
+ const isDuplicate = duplicateFiles.some(
+ (df: any) =>
+ df.fileName === file.name ||
+ df.file_name === file.name ||
+ df === file.name
+ );
+
+ // Only mark duplicates as error, leave other files as-is
+ if (isDuplicate) {
+ return {
+ ...file,
+ status: 'error' as const,
+ message: errorMessage,
+ };
+ }
+
+ return file;
+ }),
}));
+ // Build detailed error message
+ let displayMessage = errorMessage;
+ if (duplicateFiles.length > 0) {
+ const fileNames = duplicateFiles
+ .map((df: any) =>
+ typeof df === 'string' ? df : df.fileName || df.file_name
+ )
+ .join(', ');
+ displayMessage = `${errorMessage}: ${fileNames}`;
+ }
+
toast.open({
type: 'error',
title: t('global.notificationError'),
- message: error.message || t('knowledgeBase.uploadError'),
+ message: displayMessage,
});
},
});
@@ -781,7 +816,7 @@ const UploadedFiles: FC = () => {
uploadProgress.isUploading ||
!formData.subsector ||
formData.files.length === 0 ||
- formData.files.every((file) => file.status === 'error')
+ formData.files.some((file) => file.status === 'error')
}
>
{uploadProgress.isUploading
diff --git a/GUI/src/services/api.ts b/GUI/src/services/api.ts
index c5a68b8..93fbfba 100644
--- a/GUI/src/services/api.ts
+++ b/GUI/src/services/api.ts
@@ -35,10 +35,21 @@ const AxiosInterceptor = ({ children }) => {
const errInterceptor = (error: any) => {
import.meta.env.DEBUG_ENABLED && console.debug(error);
- let message =
- error?.response?.data?.response || t('global.notificationErrorMsg');
+ // Keep the original error structure for proper error handling
+ // If there's a response, attach it to a new error with proper message
+ if (error?.response?.data?.response) {
+ const responseData = error.response.data.response;
+ const errorMessage = typeof responseData === 'string'
+ ? responseData
+ : responseData.error || t('global.notificationErrorMsg');
- return Promise.reject(new Error(message));
+ const newError = new Error(errorMessage);
+ // Preserve the original response for error handlers
+ (newError as any).response = error.response;
+ return Promise.reject(newError);
+ }
+
+ return Promise.reject(new Error(error?.message || t('global.notificationErrorMsg')));
};
const apiInterceptor = api.interceptors.response.use(
From d630a6d120967b3da901d5ab5aa5beaf120e77a5 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Tue, 21 Oct 2025 07:48:12 +0500
Subject: [PATCH 11/18]
https://github.com/buerokratt/Common-Knowledge/issues/92
---
API_SPECIFICATION.md | 25 +++++++++++++++--
DATABASE_SCHEMA.md | 2 ++
.../enrich_files_with_uploaded_by.handlebars | 12 ++++++++
...21071904-add-uploaded-by-and-file-size.sql | 10 +++++++
...21071904-add-uploaded-by-and-file-size.xml | 12 ++++++++
.../changelog/20251021071904-rollback.sql | 2 ++
.../create_uploaded_source_files.sql | 10 +++++--
.../POST/source-file/add-uploaded-files.yml | 28 ++++++++++++++++++-
GUI/src/pages/Agency/Agency.tsx | 9 ++----
GUI/src/pages/UploadedFiles/index.tsx | 9 ++----
GUI/src/services/sources.ts | 2 ++
11 files changed, 103 insertions(+), 18 deletions(-)
create mode 100644 DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars
create mode 100644 DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql
create mode 100644 DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml
create mode 100644 DSL/Liquibase/changelog/20251021071904-rollback.sql
diff --git a/API_SPECIFICATION.md b/API_SPECIFICATION.md
index 7e0c5fc..9f09f5d 100644
--- a/API_SPECIFICATION.md
+++ b/API_SPECIFICATION.md
@@ -262,20 +262,41 @@ List source files.
#### POST /ckb/source-file/add-uploaded-files
Add uploaded files to a source.
+**Request Headers:**
+- `Cookie`: Contains JWT with user information for tracking uploader
+
**Request Body:**
```json
{
- "source_base_id": "uuid",
+ "agencyId": "uuid",
+ "sourceId": "uuid",
"files": [
{
+ "base_id": "uuid",
"file_name": "document.pdf",
"original_data_url": "s3://bucket/uploads/file",
- "external_id": "ext_123"
+ "subsector": "Legal",
+ "file_size": 13264
}
]
}
```
+**Response:**
+```json
+[
+ {
+ "id": "uuid",
+ "url": null,
+ "hash": "",
+ "original_data_url": "s3://bucket/uploads/file",
+ "path": "s3://bucket/uploads/file"
+ }
+]
+```
+
+**Note:** The `uploaded_by` field is automatically populated from the JWT cookie (user's `idCode`).
+
#### POST /ckb/source-file/get-upload-urls
Get presigned upload URLs.
diff --git a/DATABASE_SCHEMA.md b/DATABASE_SCHEMA.md
index 5a95540..b9d7be4 100644
--- a/DATABASE_SCHEMA.md
+++ b/DATABASE_SCHEMA.md
@@ -69,6 +69,8 @@ erDiagram
TEXT file_name "Original filename"
TEXT external_id "External system ID"
TEXT subsector "Data subsector"
+ BIGINT file_size "File size in bytes"
+ TEXT uploaded_by "User ID who uploaded (uploaded_file only)"
BOOLEAN is_excluded "Excluded from processing"
BOOLEAN is_deleted "Soft delete flag"
}
diff --git a/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars b/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars
new file mode 100644
index 0000000..88c8ed6
--- /dev/null
+++ b/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars
@@ -0,0 +1,12 @@
+[
+ {{#each files}}
+ {
+ "base_id": "{{this.base_id}}",
+ "file_name": "{{this.file_name}}",
+ "subsector": "{{this.subsector}}",
+ "original_data_url": "{{this.original_data_url}}",
+ "file_size": {{this.file_size}},
+ "uploaded_by": "{{@root.uploaded_by}}"
+ }{{#unless @last}},{{/unless}}
+ {{/each}}
+]
diff --git a/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql
new file mode 100644
index 0000000..8a857cc
--- /dev/null
+++ b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql
@@ -0,0 +1,10 @@
+-- liquibase formatted sql
+-- changeset ahmer-mt:20251021071904 ignore:true
+-- Add uploaded_by and file_size columns to source_file table
+
+ALTER TABLE data_collection.source_file
+ADD COLUMN uploaded_by TEXT,
+ADD COLUMN file_size BIGINT;
+
+COMMENT ON COLUMN data_collection.source_file.uploaded_by IS 'User/system that uploaded the file (only for uploaded_file type)';
+COMMENT ON COLUMN data_collection.source_file.file_size IS 'File size in bytes';
diff --git a/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml
new file mode 100644
index 0000000..949c9a6
--- /dev/null
+++ b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
diff --git a/DSL/Liquibase/changelog/20251021071904-rollback.sql b/DSL/Liquibase/changelog/20251021071904-rollback.sql
new file mode 100644
index 0000000..b2ec9d5
--- /dev/null
+++ b/DSL/Liquibase/changelog/20251021071904-rollback.sql
@@ -0,0 +1,2 @@
+-- liquibase formatted sql
+-- changeset ahmer-mt:20251021071904 ignore:true
diff --git a/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql b/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql
index c60f6ec..3982d72 100644
--- a/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql
+++ b/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql
@@ -16,7 +16,7 @@ declaration:
description: "Agency base ID"
- field: files
type: array
- description: "Array of file objects with base_id, file_name, subsector, original_data_url"
+ description: "Array of file objects with base_id, file_name, subsector, original_data_url, file_size, uploaded_by"
response:
fields:
- field: url
@@ -36,7 +36,7 @@ declaration:
description: "Path (same as original_data_url)"
*/
INSERT INTO data_collection.source_file (
- source_base_id, agency_base_id, base_id, file_name, subsector, original_data_url, type
+ source_base_id, agency_base_id, base_id, file_name, subsector, original_data_url, file_size, uploaded_by, type
)
SELECT
:source_id::UUID,
@@ -45,13 +45,17 @@ SELECT
file_data.file_name,
file_data.subsector,
file_data.original_data_url,
+ file_data.file_size::BIGINT,
+ file_data.uploaded_by,
'uploaded_file'::source_file_type
FROM (
SELECT
(SELECT value) ->> 'base_id' AS base_id,
(SELECT value) ->> 'file_name' AS file_name,
(SELECT value) ->> 'subsector' AS subsector,
- (SELECT value) ->> 'original_data_url' AS original_data_url
+ (SELECT value) ->> 'original_data_url' AS original_data_url,
+ (SELECT value) ->> 'file_size' AS file_size,
+ (SELECT value) ->> 'uploaded_by' AS uploaded_by
FROM JSON_ARRAY_ELEMENTS(ARRAY_TO_JSON(ARRAY[:files])) WITH ORDINALITY
) AS file_data
RETURNING NULL as url, base_id as id, '' as hash, original_data_url, original_data_url as path;
\ No newline at end of file
diff --git a/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml b/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml
index c1b6c90..ac9de62 100644
--- a/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml
+++ b/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml
@@ -17,12 +17,27 @@ declaration:
- field: files
type: array
description: "Array of uploaded file objects"
+ headers:
+ - field: cookie
+ type: string
+ description: "Cookie field"
+
+getUserInfo:
+ call: http.post
+ args:
+ url: "[#CKB_TIM]/jwt/custom-jwt-userinfo"
+ contentType: plaintext
+ headers:
+ cookie: ${incoming.headers.cookie}
+ plaintext: "customJwtCookie"
+ result: userInfoResult
extractRequestData:
assign:
agency_id: "${incoming.body.agencyId}"
source_id: "${incoming.body.sourceId}"
files: "${incoming.body.files}"
+ uploaded_by: "${userInfoResult.response.body.idCode}"
validateInput:
switch:
@@ -31,6 +46,17 @@ validateInput:
- condition: ${files === null || files.length === 0}
next: returnError
+enrichFilesWithUploadedBy:
+ call: http.post
+ args:
+ url: "[#CKB_DMAPPER_HBS]/enrich_files_with_uploaded_by"
+ headers:
+ type: json
+ body:
+ files: ${files}
+ uploaded_by: ${uploaded_by}
+ result: enrichedFilesResult
+
createUploadedSourceFiles:
call: http.post
args:
@@ -40,7 +66,7 @@ createUploadedSourceFiles:
body:
agency_id: ${agency_id}
source_id: ${source_id}
- files: ${files}
+ files: ${enrichedFilesResult.response.body}
result: createResult
transformToGetDownloadUrls:
diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index d2b9a9c..ea100b9 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -208,18 +208,15 @@ const Agency: FC = () => {
})),
}));
+ setUploadModal(false);
+ setFormData({ subsector: '', files: [] });
+
toast.open({
type: 'success',
title: t('global.notification'),
message: t('knowledgeBase.uploadSuccess'),
});
- // Close modal after a short delay to show success state
- setTimeout(() => {
- setUploadModal(false);
- setFormData({ subsector: '', files: [] });
- }, 1000);
-
queryClient.invalidateQueries(['sources']);
},
onError: (error: any) => {
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 1e4f1ec..15c6a18 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -229,18 +229,15 @@ const UploadedFiles: FC = () => {
})),
}));
+ setUploadModal(false);
+ setFormData({ search: '', files: [], subsector: '' });
+
toast.open({
type: 'success',
title: t('global.notification'),
message: t('knowledgeBase.uploadSuccess'),
});
- // Close modal after a short delay to show success state
- setTimeout(() => {
- setUploadModal(false);
- setFormData({ search: '', files: [], subsector: '' });
- }, 1000);
-
queryClient.invalidateQueries(['uploadedFiles']);
},
onError: (error: any) => {
diff --git a/GUI/src/services/sources.ts b/GUI/src/services/sources.ts
index 5188a54..f19a4a7 100644
--- a/GUI/src/services/sources.ts
+++ b/GUI/src/services/sources.ts
@@ -230,6 +230,7 @@ export const createSourceFile = async (
file_name: uploadInfo.uploadItem.fileName,
subsector: data.subsector,
original_data_url: uploadInfo.uploadItem.path,
+ file_size: uploadInfo.file.size,
}));
await registerUploadedFiles(
@@ -295,6 +296,7 @@ export const addFilesToExistingSource = async (
file_name: uploadInfo.uploadItem.fileName,
subsector: data.subsector,
original_data_url: uploadInfo.uploadItem.path,
+ file_size: uploadInfo.file.size,
}));
await registerUploadedFiles(
From d652c4166dccf2f7595eca5f2825ad28d11ace0f Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Tue, 21 Oct 2025 08:23:56 +0500
Subject: [PATCH 12/18] fix sorting: default list ordering to updated_at DESC NULLS LAST and drop stale GUI default sort params
---
.../ckb/GET/source/list_agency_sources.sql | 14 ++++++------
DSL/Resql/ckb/GET/source/list_api_sources.sql | 10 ++++-----
.../list_excluded_source_files_by_agency.sql | 22 ++++++++++++-------
.../source_file/list_scraped_source_files.sql | 2 +-
.../list_uploaded_source_files.sql | 6 ++---
.../source_run_page/list_source_run_pages.sql | 8 +++----
GUI/src/components/DataTable/index.tsx | 4 ++--
GUI/src/pages/API/ApiDetail.tsx | 2 +-
GUI/src/pages/API/index.tsx | 2 +-
GUI/src/pages/Agency/Agency.tsx | 4 ++--
GUI/src/pages/Agency/index.tsx | 2 +-
GUI/src/pages/Reports/Report.tsx | 2 +-
GUI/src/pages/Reports/index.tsx | 2 +-
GUI/src/pages/ScrapedFiles/index.tsx | 2 +-
GUI/src/pages/UploadedFiles/index.tsx | 2 +-
GUI/src/services/agencies.ts | 2 +-
GUI/src/services/files.ts | 4 ++--
GUI/src/services/reports.ts | 4 ++--
GUI/src/services/sources.ts | 6 ++---
19 files changed, 53 insertions(+), 47 deletions(-)
diff --git a/DSL/Resql/ckb/GET/source/list_agency_sources.sql b/DSL/Resql/ckb/GET/source/list_agency_sources.sql
index 0a75016..cc134e3 100644
--- a/DSL/Resql/ckb/GET/source/list_agency_sources.sql
+++ b/DSL/Resql/ckb/GET/source/list_agency_sources.sql
@@ -63,28 +63,28 @@ declaration:
description: "Total number of matching records"
*/
WITH latest_sources AS (
- SELECT DISTINCT ON (base_id)
- id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type, is_deleted
+ SELECT DISTINCT ON (base_id)
+ id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type, is_deleted, updated_at
FROM data_collection.source
WHERE agency_base_id = :agency_base_id::UUID
ORDER BY base_id, updated_at DESC
)
-SELECT
+SELECT
id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type,
:page as page,
CEIL(COUNT(*) OVER () / :page_size::DECIMAL) AS total_pages,
(COUNT(*) OVER ()) AS total
FROM latest_sources
WHERE is_deleted = FALSE
-ORDER BY
+ORDER BY
CASE WHEN :sorting = 'url asc' THEN url END ASC,
CASE WHEN :sorting = 'url desc' THEN url END DESC,
CASE WHEN :sorting = 'subsector asc' THEN subsector END ASC,
CASE WHEN :sorting = 'subsector desc' THEN subsector END DESC,
- CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC,
- CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC,
+ CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC NULLS LAST,
+ CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC NULLS LAST,
CASE WHEN :sorting = 'status asc' THEN status END ASC,
CASE WHEN :sorting = 'status desc' THEN status END DESC,
- last_scraped_at DESC NULLS LAST
+ updated_at DESC NULLS LAST
LIMIT :page_size::INTEGER
OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source/list_api_sources.sql b/DSL/Resql/ckb/GET/source/list_api_sources.sql
index 537452e..107d346 100644
--- a/DSL/Resql/ckb/GET/source/list_api_sources.sql
+++ b/DSL/Resql/ckb/GET/source/list_api_sources.sql
@@ -52,26 +52,26 @@ declaration:
description: "Total number of matching records"
*/
WITH latest_sources AS (
- SELECT DISTINCT ON (base_id)
- id, base_id, agency_base_id, url, status, last_scraped_at, type, is_deleted
+ SELECT DISTINCT ON (base_id)
+ id, base_id, agency_base_id, url, status, last_scraped_at, type, is_deleted, updated_at
FROM data_collection.source
WHERE type = 'api'::source_type
ORDER BY base_id, updated_at DESC
)
-SELECT
+SELECT
id, base_id, agency_base_id, url, status, last_scraped_at,
:page as page,
CEIL(COUNT(*) OVER () / :page_size::DECIMAL) AS total_pages,
(COUNT(*) OVER ()) AS total
FROM latest_sources
WHERE is_deleted = FALSE
-ORDER BY
+ORDER BY
CASE WHEN :sorting = 'url asc' THEN url END ASC,
CASE WHEN :sorting = 'url desc' THEN url END DESC,
CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC,
CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC,
CASE WHEN :sorting = 'status asc' THEN status END ASC,
CASE WHEN :sorting = 'status desc' THEN status END DESC,
- last_scraped_at DESC NULLS LAST
+ updated_at DESC NULLS LAST
LIMIT :page_size::INTEGER
OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql b/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql
index e74092f..a909a9d 100644
--- a/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql
+++ b/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql
@@ -25,13 +25,19 @@ declaration:
type: string
description: "Base ID of the source"
*/
+WITH latest_files AS (
+ SELECT DISTINCT ON (base_id)
+ id, base_id, agency_base_id, source_base_id, is_deleted, updated_at
+ FROM data_collection.source_file
+ WHERE agency_base_id = :agency_base_id::UUID
+ AND is_excluded = true
+ ORDER BY base_id, updated_at DESC
+)
SELECT
- id,
- base_id,
- agency_base_id,
+ id,
+ base_id,
+ agency_base_id,
source_base_id
-FROM data_collection.source_file
-WHERE
- agency_base_id = :agency_base_id::UUID
- AND is_excluded = true
- AND is_deleted = false;
\ No newline at end of file
+FROM latest_files
+WHERE is_deleted = false
+ORDER BY updated_at DESC NULLS LAST;
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql b/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql
index 0d171ff..c9d33fa 100644
--- a/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql
+++ b/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql
@@ -116,6 +116,6 @@ ORDER BY
CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC,
CASE WHEN :sorting = 'external_id asc' THEN external_id END ASC,
CASE WHEN :sorting = 'external_id desc' THEN external_id END DESC,
- last_scraped_at DESC NULLS LAST
+ updated_at DESC NULLS LAST
LIMIT :page_size::INTEGER
OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql b/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql
index 2f97fb9..d8245d1 100644
--- a/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql
+++ b/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql
@@ -101,8 +101,8 @@ ORDER BY
CASE WHEN :sorting = 'excluded desc' THEN is_excluded END DESC,
CASE WHEN :sorting = 'status asc' THEN status END ASC,
CASE WHEN :sorting = 'status desc' THEN status END DESC,
- CASE WHEN :sorting = 'created_at asc' THEN created_at END ASC,
- CASE WHEN :sorting = 'created_at desc' THEN created_at END DESC,
- created_at DESC NULLS LAST
+ CASE WHEN :sorting = 'last_scraped_at asc' THEN created_at END ASC,
+ CASE WHEN :sorting = 'last_scraped_at desc' THEN created_at END DESC,
+ updated_at DESC NULLS LAST
LIMIT :page_size::INTEGER
OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql b/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql
index 439d4b1..4ab8a9a 100644
--- a/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql
+++ b/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql
@@ -54,9 +54,9 @@ declaration:
description: "total number of agencies"
*/
WITH latest_run_pages AS (
- SELECT DISTINCT ON (base_id)
- id, base_id, source_run_report_base_id, url, error_type, error_message, scraped_at, is_deleted
- FROM monitoring.source_run_page
+ SELECT DISTINCT ON (base_id)
+ id, base_id, source_run_report_base_id, url, error_type, error_message, scraped_at, is_deleted, updated_at
+ FROM monitoring.source_run_page
WHERE (:source_run_report_base_id IS NULL OR source_run_report_base_id = :source_run_report_base_id::UUID)
ORDER BY base_id, updated_at DESC
)
@@ -76,6 +76,6 @@ ORDER BY
CASE WHEN :sorting = 'error_message desc' THEN error_message END DESC,
CASE WHEN :sorting = 'scraped_at asc' THEN scraped_at END ASC,
CASE WHEN :sorting = 'scraped_at desc' THEN scraped_at END DESC,
- scraped_at DESC NULLS LAST
+ updated_at DESC NULLS LAST
LIMIT :page_size::INTEGER
OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/GUI/src/components/DataTable/index.tsx b/GUI/src/components/DataTable/index.tsx
index 9d14356..36eaf6c 100644
--- a/GUI/src/components/DataTable/index.tsx
+++ b/GUI/src/components/DataTable/index.tsx
@@ -221,13 +221,13 @@ const DataTable: FC = ({
{{
asc: (
}
+ icon={}
size="medium"
/>
),
desc: (
}
+ icon={}
size="medium"
/>
),
diff --git a/GUI/src/pages/API/ApiDetail.tsx b/GUI/src/pages/API/ApiDetail.tsx
index 4da9311..3f36fcc 100644
--- a/GUI/src/pages/API/ApiDetail.tsx
+++ b/GUI/src/pages/API/ApiDetail.tsx
@@ -95,7 +95,7 @@ const ApiDetail: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'last_scraped_at desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
diff --git a/GUI/src/pages/API/index.tsx b/GUI/src/pages/API/index.tsx
index 0cd3f4f..7194579 100644
--- a/GUI/src/pages/API/index.tsx
+++ b/GUI/src/pages/API/index.tsx
@@ -35,7 +35,7 @@ const ApiList: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'last_scraped_at desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index ea100b9..30e3cfa 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -87,7 +87,7 @@ const Agency: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'last_scraped_at desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
@@ -96,7 +96,7 @@ const Agency: FC = () => {
const fieldMap: Record = {
url: 'url',
subsector: 'subsector',
- lastScraped: 'last_scraped_at',
+ lastScrapedAt: 'last_scraped_at',
status: 'status',
};
diff --git a/GUI/src/pages/Agency/index.tsx b/GUI/src/pages/Agency/index.tsx
index 32d0fb0..3bf439d 100644
--- a/GUI/src/pages/Agency/index.tsx
+++ b/GUI/src/pages/Agency/index.tsx
@@ -40,7 +40,7 @@ const AgencyComponent: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'updatedAt desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
diff --git a/GUI/src/pages/Reports/Report.tsx b/GUI/src/pages/Reports/Report.tsx
index 089c89a..d59864f 100644
--- a/GUI/src/pages/Reports/Report.tsx
+++ b/GUI/src/pages/Reports/Report.tsx
@@ -33,7 +33,7 @@ const Report: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'scraped_at desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
diff --git a/GUI/src/pages/Reports/index.tsx b/GUI/src/pages/Reports/index.tsx
index f8bfe0f..acd93b3 100644
--- a/GUI/src/pages/Reports/index.tsx
+++ b/GUI/src/pages/Reports/index.tsx
@@ -37,7 +37,7 @@ const Reports: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'scraping_started_at desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
diff --git a/GUI/src/pages/ScrapedFiles/index.tsx b/GUI/src/pages/ScrapedFiles/index.tsx
index 40f1ba3..87b7387 100644
--- a/GUI/src/pages/ScrapedFiles/index.tsx
+++ b/GUI/src/pages/ScrapedFiles/index.tsx
@@ -95,7 +95,7 @@ const ScrapedFiles: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'last_scraped_at desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 15c6a18..8dadf63 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -114,7 +114,7 @@ const UploadedFiles: FC = () => {
// Convert sorting state to API format
const getSortingParam = (sorting: SortingState): string => {
- if (sorting.length === 0) return 'last_scraped_at desc';
+ if (sorting.length === 0) return '';
const sort = sorting[0];
let field = sort.id;
diff --git a/GUI/src/services/agencies.ts b/GUI/src/services/agencies.ts
index e03b63b..41e1729 100644
--- a/GUI/src/services/agencies.ts
+++ b/GUI/src/services/agencies.ts
@@ -62,7 +62,7 @@ export const getAgencies = async (
params: {
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'updatedAt desc',
+ sorting: params.sorting || '',
},
});
diff --git a/GUI/src/services/files.ts b/GUI/src/services/files.ts
index 80df62b..8a6538e 100644
--- a/GUI/src/services/files.ts
+++ b/GUI/src/services/files.ts
@@ -97,7 +97,7 @@ export const getScrapedFiles = async (
isExcluded: params.isExcluded,
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'last_scraped_at desc',
+ sorting: params.sorting || '',
search: params.search,
},
});
@@ -128,7 +128,7 @@ export const getUploadedFiles = async (
isExcluded: params.isExcluded,
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'last_scraped_at desc',
+ sorting: params.sorting || '',
search: params.search,
},
});
diff --git a/GUI/src/services/reports.ts b/GUI/src/services/reports.ts
index bf28f98..ce2133e 100644
--- a/GUI/src/services/reports.ts
+++ b/GUI/src/services/reports.ts
@@ -40,7 +40,7 @@ export const getReports = async (
params: {
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'scraping_started_at desc',
+ sorting: params.sorting || '',
},
});
@@ -116,7 +116,7 @@ export const getReportPages = async (
source_run_report_base_id: params.source_run_report_base_id,
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'scraped_at desc',
+ sorting: params.sorting || '',
},
});
diff --git a/GUI/src/services/sources.ts b/GUI/src/services/sources.ts
index f19a4a7..793b877 100644
--- a/GUI/src/services/sources.ts
+++ b/GUI/src/services/sources.ts
@@ -148,7 +148,7 @@ export const getSources = async (
agencyBaseId: params.agencyBaseId,
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'last_scraped_at desc',
+ sorting: params.sorting || '',
},
});
@@ -177,7 +177,7 @@ export const getApiIntegrations = async (
params: {
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'last_scraped_at desc',
+ sorting: params.sorting || '',
},
});
@@ -433,7 +433,7 @@ export const getApiSourceFiles = async (
sourceId: params.sourceId,
page: params.page || 1,
pageSize: params.pageSize || 10,
- sorting: params.sorting || 'last_scraped_at desc',
+ sorting: params.sorting || '',
search: params.search,
type: params.type,
},
From 89ef0dedc03e99e69354b597a58516a260f94307 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Tue, 21 Oct 2025 08:36:30 +0500
Subject: [PATCH 13/18] fix pagination: reset to the last valid page after deleting the remaining items on the current page
---
GUI/src/pages/Agency/Agency.tsx | 24 ++++++++++++++++++++++--
GUI/src/pages/Agency/index.tsx | 24 ++++++++++++++++++++++--
GUI/src/pages/Reports/index.tsx | 24 ++++++++++++++++++++++--
GUI/src/pages/ScrapedFiles/index.tsx | 24 ++++++++++++++++++++++--
GUI/src/pages/UploadedFiles/index.tsx | 24 ++++++++++++++++++++++--
5 files changed, 110 insertions(+), 10 deletions(-)
diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index 30e3cfa..69d3a3c 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -295,14 +295,34 @@ const Agency: FC = () => {
// Delete source mutation
const deleteMutation = useMutation({
mutationFn: deleteSource,
- onSuccess: () => {
+ onSuccess: async () => {
toast.open({
type: 'success',
title: t('global.notification'),
message: t('knowledgeBase.deleteSuccess'),
});
setDeleteModal(null);
- queryClient.invalidateQueries(['sources']);
+
+ // Refetch to get updated data
+ await queryClient.invalidateQueries(['sources']);
+
+ // Check if current page is now out of bounds
+ const newTotal = (sourcesData?.total || 0) - 1;
+ const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+ // Reset to last valid page if current page is out of bounds
+ if (pagination.pageIndex >= maxPages && maxPages > 0) {
+ setPagination({
+ ...pagination,
+ pageIndex: maxPages - 1,
+ });
+ } else if (maxPages === 0) {
+ // If no data left, reset to page 0
+ setPagination({
+ ...pagination,
+ pageIndex: 0,
+ });
+ }
},
onError: (error: any) => {
toast.open({
diff --git a/GUI/src/pages/Agency/index.tsx b/GUI/src/pages/Agency/index.tsx
index 3bf439d..96575f8 100644
--- a/GUI/src/pages/Agency/index.tsx
+++ b/GUI/src/pages/Agency/index.tsx
@@ -81,14 +81,34 @@ const AgencyComponent: FC = () => {
// Delete agency mutation
const deleteAgencyMutation = useMutation({
mutationFn: deleteAgency,
- onSuccess: () => {
+ onSuccess: async () => {
toast.open({
type: 'success',
title: t('global.notification'),
message: t('knowledgeBase.deleteSuccess'),
});
setDeleteModal(null);
- queryClient.invalidateQueries(['agencies']);
+
+ // Refetch to get updated data
+ await queryClient.invalidateQueries(['agencies']);
+
+ // Check if current page is now out of bounds
+ const newTotal = (knowledgeBaseData.total || 0) - 1;
+ const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+ // Reset to last valid page if current page is out of bounds
+ if (pagination.pageIndex >= maxPages && maxPages > 0) {
+ setPagination({
+ ...pagination,
+ pageIndex: maxPages - 1,
+ });
+ } else if (maxPages === 0) {
+ // If no data left, reset to page 0
+ setPagination({
+ ...pagination,
+ pageIndex: 0,
+ });
+ }
},
onError: (error: any) => {
toast.open({
diff --git a/GUI/src/pages/Reports/index.tsx b/GUI/src/pages/Reports/index.tsx
index acd93b3..8b98cff 100644
--- a/GUI/src/pages/Reports/index.tsx
+++ b/GUI/src/pages/Reports/index.tsx
@@ -80,14 +80,34 @@ const Reports: FC = () => {
// Delete report mutation
const deleteReportMutation = useMutation({
mutationFn: deleteReport,
- onSuccess: () => {
+ onSuccess: async () => {
toast.open({
type: 'success',
title: t('global.notification'),
message: t('reports.deleteSuccess'),
});
setDeleteModal(null);
- queryClient.invalidateQueries(['reports']);
+
+ // Refetch to get updated data
+ await queryClient.invalidateQueries(['reports']);
+
+ // Check if current page is now out of bounds
+ const newTotal = (reportsData.total || 0) - 1;
+ const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+ // Reset to last valid page if current page is out of bounds
+ if (pagination.pageIndex >= maxPages && maxPages > 0) {
+ setPagination({
+ ...pagination,
+ pageIndex: maxPages - 1,
+ });
+ } else if (maxPages === 0) {
+ // If no data left, reset to page 0
+ setPagination({
+ ...pagination,
+ pageIndex: 0,
+ });
+ }
},
onError: (error: any) => {
toast.open({
diff --git a/GUI/src/pages/ScrapedFiles/index.tsx b/GUI/src/pages/ScrapedFiles/index.tsx
index 87b7387..bd41a67 100644
--- a/GUI/src/pages/ScrapedFiles/index.tsx
+++ b/GUI/src/pages/ScrapedFiles/index.tsx
@@ -160,14 +160,34 @@ const ScrapedFiles: FC = () => {
// Delete file mutation
const deleteMutation = useMutation({
mutationFn: deleteFile,
- onSuccess: () => {
+ onSuccess: async () => {
toast.open({
type: 'success',
title: t('global.notification'),
message: t('knowledgeBase.urlDeleteSuccess'),
});
setDeleteModal(null);
- queryClient.invalidateQueries(['scrapedFiles']);
+
+ // Refetch to get updated data
+ await queryClient.invalidateQueries(['scrapedFiles']);
+
+ // Check if current page is now out of bounds
+ const newTotal = (scrapedFilesData?.total || 0) - 1;
+ const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+ // Reset to last valid page if current page is out of bounds
+ if (pagination.pageIndex >= maxPages && maxPages > 0) {
+ setPagination({
+ ...pagination,
+ pageIndex: maxPages - 1,
+ });
+ } else if (maxPages === 0) {
+ // If no data left, reset to page 0
+ setPagination({
+ ...pagination,
+ pageIndex: 0,
+ });
+ }
},
onError: (error: any) => {
toast.open({
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 8dadf63..337470a 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -306,14 +306,34 @@ const UploadedFiles: FC = () => {
// Delete file mutation
const deleteMutation = useMutation({
mutationFn: deleteFile,
- onSuccess: () => {
+ onSuccess: async () => {
toast.open({
type: 'success',
title: t('global.notification'),
message: t('knowledgeBase.fileDeleteSuccess'),
});
setDeleteModal(null);
- queryClient.invalidateQueries(['uploadedFiles']);
+
+ // Refetch to get updated data
+ await queryClient.invalidateQueries(['uploadedFiles']);
+
+ // Check if current page is now out of bounds
+ const newTotal = (uploadedFilesData?.total || 0) - 1;
+ const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+ // Reset to last valid page if current page is out of bounds
+ if (pagination.pageIndex >= maxPages && maxPages > 0) {
+ setPagination({
+ ...pagination,
+ pageIndex: maxPages - 1,
+ });
+ } else if (maxPages === 0) {
+ // If no data left, reset to page 0
+ setPagination({
+ ...pagination,
+ pageIndex: 0,
+ });
+ }
},
onError: (error: any) => {
toast.open({
From e9aa1e174553eab0c312659a6508927bec4ea45c Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Tue, 21 Oct 2025 19:23:49 +0500
Subject: [PATCH 14/18]
https://github.com/buerokratt/Common-Knowledge/issues/88 fixes
---
cleaning/worker/tasks.py | 38 ++++++++++++-------
scrapper/scrapper/spiders/base_spider.py | 35 ++++++++++++++---
.../spiders/sitemap_collect_spider.py | 11 +++++-
3 files changed, 65 insertions(+), 19 deletions(-)
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 1bcabe2..7c5e967 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -30,19 +30,36 @@ def clean_html(entity: EntityToClean):
"""Clean HTML files using a multi-step approach for better content extraction."""
soup = BeautifulSoup(open(entity.file_path.as_posix(), 'r'), 'lxml')
+ # First, remove unwanted elements from the entire document
+ for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
+ element.decompose()
+
# Step 1: Check if there's a element and use only that
main_element = soup.find('main')
if main_element:
- logger.info(f'Found element, using only main content for {entity.file_path.as_posix()}')
- # Remove unwanted elements from main
- for element in main_element(['script', 'style', 'nav', 'aside', 'form']):
- element.decompose()
+ print(f'Found element, trying partition_html on main content for {entity.file_path.as_posix()}')
+
+ # Try partition_html on main element content first
+ main_html = str(main_element)
+ partitioned = partition_html(
+ text=main_html,
+ languages=settings.languages,
+ skip_headers_and_footers=True
+ )
+
+ if len(partitioned) > 0:
+ cleaned_text = '\n\n'.join([str(el) for el in partitioned])
+ print(f'partition_html extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
+ return cleaned_text
+
+ # If partition_html returns empty, fall back to BeautifulSoup on main
+ print(f'partition_html on returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
cleaned_text = main_element.get_text(separator='\n', strip=True)
- logger.info(f'Extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
+ print(f'BeautifulSoup extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
return cleaned_text
# Step 2: Try partition_html with skip_headers_and_footers flag
- logger.info(f'No element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
+ print(f'No element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
partitioned = partition_html(
filename=entity.file_path.as_posix(),
languages=settings.languages,
@@ -52,14 +69,9 @@ def clean_html(entity: EntityToClean):
# Step 3: If partition_html returns empty, fallback to BeautifulSoup
if len(partitioned) == 0:
- logger.warning(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
-
- # Remove unwanted elements (headers, footers, nav, scripts, styles)
- for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
- element.decompose()
-
+ print(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
cleaned_text = soup.get_text(separator='\n', strip=True)
- logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
+ print(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
return cleaned_text
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index 6595757..9d172d8 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -122,12 +122,29 @@ async def parse(self, response: Response, **kwargs):
# Check if Playwright page is available (might not be if direct HTTP download was used)
playwright_page = response.meta.get("playwright_page")
+ rendered_html = None
+
if playwright_page:
- # Use Playwright page for title extraction
+ # Use Playwright page for title extraction and get rendered HTML
async with self.close_page(response) as page:
page: Page
- if file_extension == '.html':
+
+ # Check if this is a sitemap or XML file - don't wait or render, use raw content
+ is_sitemap = 'sitemap' in response.url.lower() or file_extension == '.xml'
+
+ if file_extension == '.html' and not is_sitemap:
+ # Wait for content to actually render (check if body has meaningful content)
+ try:
+ self.logger.info('Waiting for dynamic content to render...')
+ await page.wait_for_load_state('networkidle')
+ self.logger.info('Content detected, proceeding...')
+ except Exception as e:
+ # If timeout or error, proceed anyway
+ self.logger.warning(f'Timeout waiting for content, proceeding anyway: {e}')
+
title = await page.title()
+ # Get the fully rendered HTML after JavaScript execution
+ rendered_html = await page.content()
else:
title = response.url
else:
@@ -140,18 +157,26 @@ async def parse(self, response: Response, **kwargs):
else:
title = response.url
+ # Use rendered HTML if available, otherwise use response.body
+ body_to_save = rendered_html.encode('utf-8') if rendered_html else response.body
+
if file_extension == '.html':
- soup = BeautifulSoup(response.body, 'lxml')
+ soup = BeautifulSoup(body_to_save, 'lxml')
text = soup.get_text()
hashed = hashlib.sha1(text.encode()).hexdigest()
else:
- hashed = hashlib.sha1(response.body).hexdigest()
+ hashed = hashlib.sha1(body_to_save).hexdigest()
- file_item = FileItem(body=response.body, source_url=response.url, extension=file_extension)
+ file_item = FileItem(body=body_to_save, source_url=response.url, extension=file_extension)
metadata_item = MetadataItem(
file_type=file_extension, metadata=Metadata(), source_url=response.url, page_title=title
)
scrapped_item = ScrappedItem(file=file_item, metadata=metadata_item, hash=hashed)
+
+ # Store rendered HTML in response meta for link extraction by subclasses
+ if rendered_html:
+ response.meta['rendered_html'] = rendered_html
+
yield scrapped_item
diff --git a/scrapper/scrapper/spiders/sitemap_collect_spider.py b/scrapper/scrapper/spiders/sitemap_collect_spider.py
index 5d6301c..27c76ff 100644
--- a/scrapper/scrapper/spiders/sitemap_collect_spider.py
+++ b/scrapper/scrapper/spiders/sitemap_collect_spider.py
@@ -136,7 +136,16 @@ async def parse(self, response: Response, **kwargs):
if scrapped_item.metadata.file_type != '.html':
continue
- for href in response.css("a::attr(href)").getall():
+ # Use rendered HTML for link extraction if available (for SPAs)
+ rendered_html = response.meta.get('rendered_html')
+ if rendered_html:
+ from bs4 import BeautifulSoup
+ soup = BeautifulSoup(rendered_html, 'lxml')
+ links = [a.get('href') for a in soup.find_all('a', href=True)]
+ else:
+ links = response.css("a::attr(href)").getall()
+
+ for href in links:
next_url = urljoin(response.url, href)
next_url = next_url.split('#')[0]
From 56dde937150933e011985851a8617c288db4fbe5 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Tue, 21 Oct 2025 19:28:14 +0500
Subject: [PATCH 15/18] fix logging
---
cleaning/worker/tasks.py | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 7c5e967..242c3a4 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -37,7 +37,7 @@ def clean_html(entity: EntityToClean):
# Step 1: Check if there's a element and use only that
main_element = soup.find('main')
if main_element:
- print(f'Found element, trying partition_html on main content for {entity.file_path.as_posix()}')
+ logger.info(f'Found element, trying partition_html on main content for {entity.file_path.as_posix()}')
# Try partition_html on main element content first
main_html = str(main_element)
@@ -49,17 +49,17 @@ def clean_html(entity: EntityToClean):
if len(partitioned) > 0:
cleaned_text = '\n\n'.join([str(el) for el in partitioned])
- print(f'partition_html extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
+ logger.info(f'partition_html extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
return cleaned_text
# If partition_html returns empty, fall back to BeautifulSoup on main
- print(f'partition_html on returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
+ logger.info(f'partition_html on returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
cleaned_text = main_element.get_text(separator='\n', strip=True)
- print(f'BeautifulSoup extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
+ logger.info(f'BeautifulSoup extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
return cleaned_text
# Step 2: Try partition_html with skip_headers_and_footers flag
- print(f'No element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
+ logger.info(f'No element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
partitioned = partition_html(
filename=entity.file_path.as_posix(),
languages=settings.languages,
@@ -69,9 +69,9 @@ def clean_html(entity: EntityToClean):
# Step 3: If partition_html returns empty, fallback to BeautifulSoup
if len(partitioned) == 0:
- print(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
+ logger.info(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
cleaned_text = soup.get_text(separator='\n', strip=True)
- print(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
+ logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
return cleaned_text
@@ -120,9 +120,9 @@ def clean_file_task(entity: EntityToClean):
if cleaned_text and len(cleaned_text.strip()) > 0:
try:
detected_language = detect(cleaned_text)
- print(f'Detected language: {detected_language} for {entity.file_path.as_posix()}')
+ logger.info(f'Detected language: {detected_language} for {entity.file_path.as_posix()}')
except LangDetectException as e:
- print(f'Language detection failed for {entity.file_path.as_posix()}: {e}')
+ logger.error(f'Language detection failed for {entity.file_path.as_posix()}: {e}')
cleaned_text_filename = entity.directory_path / 'cleaned.txt'
From 73fd97ea753f4c3ee0233b5757b7415d2c7227a7 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Thu, 23 Oct 2025 02:16:20 +0500
Subject: [PATCH 16/18] fix cleaning pipeline
---
cleaning/requirements.txt | 2 +-
cleaning/worker/tasks.py | 4 +--
scrapper/api/app.py | 4 +++
scrapper/api/models.py | 1 +
scrapper/scrapper/pipelines.py | 7 ++++-
scrapper/scrapper/spiders/base_spider.py | 4 +++
search-service/index.js | 36 +++++++++++++++++++++---
7 files changed, 50 insertions(+), 8 deletions(-)
diff --git a/cleaning/requirements.txt b/cleaning/requirements.txt
index 986160a..55271a8 100644
--- a/cleaning/requirements.txt
+++ b/cleaning/requirements.txt
@@ -1,6 +1,6 @@
fastapi==0.115.12
uvicorn==0.34.2
-unstructured[pdf,docx,doc]==0.18.2
+unstructured[pdf,docx,doc]==0.18.5
pydantic-settings==2.10.1
beautifulsoup4==4.13.4
langdetect==1.0.9
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 242c3a4..f5e6951 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -54,7 +54,7 @@ def clean_html(entity: EntityToClean):
# If partition_html returns empty, fall back to BeautifulSoup on main
logger.info(f'partition_html on returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
- cleaned_text = main_element.get_text(separator='\n', strip=True)
+ cleaned_text = main_element.get_text(separator='\n\n', strip=True)
logger.info(f'BeautifulSoup extracted {len(cleaned_text)} chars from element for {entity.file_path.as_posix()}')
return cleaned_text
@@ -70,7 +70,7 @@ def clean_html(entity: EntityToClean):
# Step 3: If partition_html returns empty, fallback to BeautifulSoup
if len(partitioned) == 0:
logger.info(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
- cleaned_text = soup.get_text(separator='\n', strip=True)
+ cleaned_text = soup.get_text(separator='\n\n', strip=True)
logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
return cleaned_text
diff --git a/scrapper/api/app.py b/scrapper/api/app.py
index 03c78b5..04540c0 100644
--- a/scrapper/api/app.py
+++ b/scrapper/api/app.py
@@ -23,6 +23,8 @@
@app.post('/specified-pages-scrapper-task')
def trigger_specified_pages_scrapper_task(task: SpecifiedLinksScrapeTask):
+ # Always ignore stopping for manual file refresh
+ task.ignore_stopping = True
specified_links_scrapper_task.delay(task.model_dump(mode='json'))
@@ -59,6 +61,8 @@ def trigger_eesti_scrapper_task(task: EestiScrapperTask):
@app.post('/specified-api-files-scrapper-task')
def trigger_specified_api_files_scrapper_task(task: SpecifiedApiFilesScrapeTask):
+ # Always ignore stopping for manual file refresh
+ task.ignore_stopping = True
specified_api_files_scrapper_task.delay(task.model_dump(mode='json'))
@app.post('/generate-edited-metadata')
diff --git a/scrapper/api/models.py b/scrapper/api/models.py
index a85c969..8ac26ce 100644
--- a/scrapper/api/models.py
+++ b/scrapper/api/models.py
@@ -4,6 +4,7 @@
class BaseObject(BaseModel):
agency_id: str
source_id: str
+ ignore_stopping: bool = False
class LinkToScrape(BaseModel):
diff --git a/scrapper/scrapper/pipelines.py b/scrapper/scrapper/pipelines.py
index e37bba8..bd1cd13 100644
--- a/scrapper/scrapper/pipelines.py
+++ b/scrapper/scrapper/pipelines.py
@@ -175,7 +175,7 @@ class ScrappingFinishedPipeline:
def close_spider(self, spider: Spider):
if not hasattr(spider, 'task'):
return
-
+
spider: BaseSpider
task: BaseObject = spider.task
@@ -194,6 +194,11 @@ def open_spider(self, spider: Spider):
spider: BaseSpider
task: BaseObject = spider.task
+ # Skip updating source status for manual file refresh (ignore_stopping flag)
+ # This prevents clearing is_stopping flag when refreshing individual files
+ if hasattr(task, 'ignore_stopping') and task.ignore_stopping:
+ return
+
requests.post(f'{spider.settings.get('RUUTER_INTERNAL')}/ckb/source/update-status', json={
'source_id': task.source_id,
'status': 'running',
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index 9d172d8..a9f6f7d 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -33,6 +33,10 @@ def __init__(self, *args, **kwargs):
self.task = kwargs['task']
def check_source_is_stopping(self):
+ # Skip check if this is a manual file refresh (ignore_stopping flag set)
+ if hasattr(self.task, 'ignore_stopping') and self.task.ignore_stopping:
+ return
+
try:
is_stopping = requests.get(
f'{self.settings.get('RUUTER_INTERNAL')}/ckb/source/get',
diff --git a/search-service/index.js b/search-service/index.js
index 3f658b1..cba470f 100644
--- a/search-service/index.js
+++ b/search-service/index.js
@@ -231,17 +231,31 @@ app.get("/search/:sourceId", async (req, res) => {
size: 1000, // Get more docs to find unique source_file_ids
query: {
bool: {
- must: q.trim()
+ should: q.trim()
? [
+ {
+ term: {
+ url: {
+ value: q.trim(),
+ boost: 100,
+ },
+ },
+ },
{
multi_match: {
query: q.trim(),
- fields: ["content^3", "page_title^2", "file_name^2"],
+ fields: [
+ "url^5",
+ "content^3",
+ "page_title^2",
+ "file_name^2",
+ ],
type: "best_fields",
},
},
]
: [{ match_all: {} }],
+ minimum_should_match: q.trim() ? 1 : 0,
},
},
_source: ["source_file_id"],
@@ -285,17 +299,31 @@ app.get("/search/:sourceId", async (req, res) => {
size: parseInt(size),
query: {
bool: {
- must: q.trim()
+ should: q.trim()
? [
+ {
+ term: {
+ url: {
+ value: q.trim(),
+ boost: 100,
+ },
+ },
+ },
{
multi_match: {
query: q.trim(),
- fields: ["content^3", "page_title^2", "file_name^2"],
+ fields: [
+ "url^5",
+ "content^3",
+ "page_title^2",
+ "file_name^2",
+ ],
type: "best_fields",
},
},
]
: [{ match_all: {} }],
+ minimum_should_match: q.trim() ? 1 : 0,
},
},
highlight: {
From 670d35cc373e93d772a169635ed116c9aea85682 Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Fri, 24 Oct 2025 12:28:16 +0500
Subject: [PATCH 17/18]
https://github.com/buerokratt/Common-Knowledge/issues/90 same batch
---
.../components/FileUploader/FileUploader.tsx | 38 ++++++++++++++++---
GUI/translations/en/common.json | 1 +
GUI/translations/et/common.json | 1 +
3 files changed, 34 insertions(+), 6 deletions(-)
diff --git a/GUI/src/components/FileUploader/FileUploader.tsx b/GUI/src/components/FileUploader/FileUploader.tsx
index 4de9583..eff6a26 100644
--- a/GUI/src/components/FileUploader/FileUploader.tsx
+++ b/GUI/src/components/FileUploader/FileUploader.tsx
@@ -96,9 +96,22 @@ const FileUploader: FC = ({
const selectedFiles = event.target.files;
if (!selectedFiles) return;
- const newFiles: FileItem[] = Array.from(selectedFiles).map((file) =>
- validateSingleFile(file)
- );
+ const newFiles: FileItem[] = Array.from(selectedFiles).map((file) => {
+ const validatedFile = validateSingleFile(file);
+
+ // Check for duplicate filename in existing files
+ const isDuplicate = files.some(existingFile => existingFile.name === file.name);
+
+ if (isDuplicate && validatedFile.status === 'pending') {
+ return {
+ ...validatedFile,
+ status: 'error' as const,
+ message: t('fileUpload.duplicateFile'),
+ };
+ }
+
+ return validatedFile;
+ });
const updatedFiles = [...files, ...newFiles];
onFilesChange(updatedFiles);
@@ -126,9 +139,22 @@ const FileUploader: FC = ({
const droppedFiles = e.dataTransfer.files;
if (!droppedFiles) return;
- const newFiles: FileItem[] = Array.from(droppedFiles).map((file) =>
- validateSingleFile(file)
- );
+ const newFiles: FileItem[] = Array.from(droppedFiles).map((file) => {
+ const validatedFile = validateSingleFile(file);
+
+ // Check for duplicate filename in existing files
+ const isDuplicate = files.some(existingFile => existingFile.name === file.name);
+
+ if (isDuplicate && validatedFile.status === 'pending') {
+ return {
+ ...validatedFile,
+ status: 'error' as const,
+ message: t('fileUpload.duplicateFile'),
+ };
+ }
+
+ return validatedFile;
+ });
const updatedFiles = [...files, ...newFiles];
onFilesChange(updatedFiles);
diff --git a/GUI/translations/en/common.json b/GUI/translations/en/common.json
index 2a8c810..e52bde3 100644
--- a/GUI/translations/en/common.json
+++ b/GUI/translations/en/common.json
@@ -601,6 +601,7 @@
"allowedFormats": "Allowed formats: ",
"maxSizeExceeded": "Maximum size exceeded.",
"fileAlreadyExists": "File already exists.",
+ "duplicateFile": "A file with the same name is already selected.",
"success": "Success",
"fileExists": "File exists",
"fileExceedsLimit": "File size exceeds the maximum limit",
diff --git a/GUI/translations/et/common.json b/GUI/translations/et/common.json
index 8ed8534..a440e4b 100644
--- a/GUI/translations/et/common.json
+++ b/GUI/translations/et/common.json
@@ -601,6 +601,7 @@
"allowedFormats": "Lubatud formaadid: ",
"maxSizeExceeded": "Maksimaalne suurus ületatud.",
"fileAlreadyExists": "Fail on juba olemas.",
+ "duplicateFile": "Sama nimega fail on juba valitud.",
"success": "Edukas",
"fileExists": "Fail eksisteerib",
"fileExceedsLimit": "Failisuurus ületab maksimaalset piiri",
From eb2fda2034b47c17f85284438bbc912f2bfbdefa Mon Sep 17 00:00:00 2001
From: ahmer-mt
Date: Fri, 24 Oct 2025 16:41:49 +0500
Subject: [PATCH 18/18] fix originalDataUrl issue
---
DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml b/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml
index 3ca9619..0d89c63 100644
--- a/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml
+++ b/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml
@@ -26,7 +26,7 @@ deleteFile:
url: "[#CKB_FILE_MANAGER]/delete-files"
body:
files:
- - s3_path: ${sourceFile.response.body[0].original_data_url}
+ - s3_path: ${sourceFile.response.body[0].originalDataUrl}
result: res
returnResult: