From a4346f799da2cb4a0533accd161f811373bbfb4a Mon Sep 17 00:00:00 2001 From: Artsiom Beida Date: Wed, 8 Oct 2025 10:38:49 +0200 Subject: [PATCH 01/18] UI fixes: fix estonian translation does not show report agency name and url in the report delete confirmation form. fix reports links to problematic urls --- GUI/src/pages/Reports/Report.tsx | 2 +- GUI/src/pages/Reports/index.tsx | 1 + GUI/translations/en/common.json | 2 +- GUI/translations/et/common.json | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/GUI/src/pages/Reports/Report.tsx b/GUI/src/pages/Reports/Report.tsx index 7f45aca..089c89a 100644 --- a/GUI/src/pages/Reports/Report.tsx +++ b/GUI/src/pages/Reports/Report.tsx @@ -148,7 +148,7 @@ const Report: FC = () => { cell: ({ row }) => ( { {t('reports.deleteConfirmation', { agency: deleteModal.agencyName, domain: deleteModal.url, + interpolation: { escapeValue: false } })}

diff --git a/GUI/translations/en/common.json b/GUI/translations/en/common.json index b5e25ea..2a8c810 100644 --- a/GUI/translations/en/common.json +++ b/GUI/translations/en/common.json @@ -615,7 +615,7 @@ "startedAt": "Started at", "finishedAt": "Finished at", "deleteTitle": "Delete Report", - "deleteConfirmation": "Are you sure you want to delete the report?", + "deleteConfirmation": "Are you sure you want to delete the report {{agency}} - {{domain}}?", "deleteSuccess": "Report deleted successfully", "deleteError": "Failed to delete report", "errorType": "Error type", diff --git a/GUI/translations/et/common.json b/GUI/translations/et/common.json index a2f92c7..8ed8534 100644 --- a/GUI/translations/et/common.json +++ b/GUI/translations/et/common.json @@ -615,7 +615,7 @@ "startedAt": "Alustatud", "finishedAt": "Lõpetatud", "deleteTitle": "Kustuta aruanne", - "deleteConfirmation": "Kas oled kindel, et soovid kustutada aruande asutusele {agency} - {domain}?", + "deleteConfirmation": "Kas oled kindel, et soovid kustutada aruande asutusele {{agency}} - {{domain}}?", "deleteSuccess": "Aruanne kustutatud edukalt", "deleteError": "Aruande kustutamine ebaõnnestus", "errorType": "Vea tüüp", From 78dbc7e92d42dc0c00e63549bfa89f1f55541dd0 Mon Sep 17 00:00:00 2001 From: Artsiom Beida Date: Wed, 8 Oct 2025 17:04:19 +0200 Subject: [PATCH 02/18] Increased timeout to 30s for scrapper to account for slow archive pages; --- scrapper/scrapper/settings.py | 6 +++--- scrapper/scrapper/spiders/base_spider.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrapper/scrapper/settings.py b/scrapper/scrapper/settings.py index ccdd372..a54195d 100644 --- a/scrapper/scrapper/settings.py +++ b/scrapper/scrapper/settings.py @@ -106,14 +106,14 @@ ALLOWED_FILETYPES = os.environ.get('SUPPORTED_TYPES', '.html,.docx,.doc,.pdf').split(',') SCRAPED_DIRECTORY = os.environ.get('SCRAPED_DIRECTORY', "/scrapped-data") RUUTER_INTERNAL = os.environ.get('RUUTER_INTERNAL', "http://ruuter-internal:8089") -DOWNLOAD_DELAY = 0.1 +DOWNLOAD_DELAY = 0.2 DOWNLOAD_HANDLERS = { "http": "scrapper.download_handler.DownloadHandler", "https": "scrapper.download_handler.DownloadHandler", } -PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 10_000 +PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 30_000 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" PLAYWRIGHT_MAX_CONTEXTS = 1 PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 1 -RETRY_TIMES = 3 +RETRY_TIMES = 10 diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py index ee2c609..dfcd2dc 100644 --- a/scrapper/scrapper/spiders/base_spider.py +++ b/scrapper/scrapper/spiders/base_spider.py @@ -48,7 +48,7 @@ def get_meta(self): 'playwright': True, 'playwright_include_page': True, 'playwright_page_goto_kwargs': { - 'timeout': 5_000, + 'timeout': 30_000, 'wait_until': 'load', }, "playwright_context_kwargs": { From 952f92c035dba7a01c600cd0b9b61580d98b6f29 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Thu, 16 Oct 2025 01:17:54 +0500 Subject: [PATCH 03/18] playwright download fix --- scrapper/scrapper/download_handler.py | 76 +++++++++++++++++-- scrapper/scrapper/spiders/base_spider.py | 28 +++++-- .../scrapper/spiders/uploaded_file_spider.py | 7 ++ 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/scrapper/scrapper/download_handler.py b/scrapper/scrapper/download_handler.py index 6870353..a124ba9 100644 --- a/scrapper/scrapper/download_handler.py +++ b/scrapper/scrapper/download_handler.py @@ -1,25 +1,87 @@ import asyncio from scrapy import Request, Spider from scrapy.http import Response +from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler +from playwright._impl._errors import Error as PlaywrightError +from twisted.internet import defer class DownloadHandler(ScrapyPlaywrightDownloadHandler): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.args = args - self.kwargs = kwargs + def __init__(self, crawler): + super().__init__(crawler) + self.crawler = crawler + # Initialize standard HTTP handler for non-Playwright requests + self._http_handler = HTTPDownloadHandler( + settings=crawler.settings, + crawler=crawler + ) + + @classmethod + def from_crawler(cls, crawler): + return cls(crawler) + + def download_request(self, request: Request, spider: Spider): + """ + Main entry point for downloading requests. + Check if Playwright is needed, otherwise use direct HTTP. + """ + # Check if Playwright is requested in meta + if not request.meta.get('playwright'): + # Use direct HTTP download for uploaded files + spider.logger.info(f'Direct HTTP download (no Playwright): {request.url}') + return self._http_handler.download_request(request, spider) + + # Use Playwright for regular web pages + spider.logger.info(f'Playwright download: {request.url}') + return super().download_request(request, spider) async def _download_request(self, request: Request, spider: Spider) -> Response: + """ + Internal async download method with fallback for download errors. + This is called by the parent's download_request when using Playwright. + """ try: - spider.logger.info(f'request started: {request.url}') + spider.logger.info(f'Playwright request started: {request.url}') async with asyncio.timeout(30): r = await super()._download_request(request, spider) - spider.logger.info(f'request finished: {request.url}') + spider.logger.info(f'Playwright request finished: {request.url}') return r + except Exception as e: + # Catch "Download is starting" and similar download errors as safety net + if "Download is starting" in str(e) or "net::ERR_ABORTED" in str(e): + spider.logger.info(f'Download error detected for {request.url}, falling back to direct HTTP download') + # Fall back to direct HTTP download using Scrapy's HTTP handler (consistent with Method 1) + try: + import requests + # Use requests for simplicity in async context + response = requests.get( + request.url, + headers=dict(request.headers.to_unicode_dict()), + timeout=30 + ) + response.raise_for_status() + + spider.logger.info(f'Direct HTTP download completed: {request.url} ({len(response.content)} bytes)') + + from scrapy.http import HtmlResponse + return HtmlResponse( + url=response.url, + status=response.status_code, + headers=dict(response.headers), + body=response.content, + encoding='utf-8', + request=request, + ) + except Exception as download_error: + spider.logger.error(f'Direct HTTP download failed for {request.url}: {download_error}') + raise + else: + # Other Playwright errors - re-raise + spider.logger.error(f'Playwright error for {request.url}: {str(e)}') + raise except TimeoutError: spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again') await self._close() - super().__init__(*self.args, **self.kwargs) await self._launch() return await self._download_request(request, spider) diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py index dfcd2dc..b6c2f15 100644 --- a/scrapper/scrapper/spiders/base_spider.py +++ b/scrapper/scrapper/spiders/base_spider.py @@ -109,17 +109,29 @@ async def errback(self, failure: Failure): await page.close() async def parse(self, response: Response, **kwargs): - async with self.close_page(response) as page: - self.check_source_is_stopping() + self.check_source_is_stopping() - page: Page - - file_extension = self.guess_file_extension( - response.headers.get(b'Content-Type', 'text/html').decode('utf-8') - ) + file_extension = self.guess_file_extension( + response.headers.get(b'Content-Type', 'text/html').decode('utf-8') + ) + # Check if Playwright page is available (might not be if direct HTTP download was used) + playwright_page = response.meta.get("playwright_page") + if playwright_page: + # Use Playwright page for title extraction + async with self.close_page(response) as page: + page: Page + if file_extension == '.html': + title = await page.title() + else: + title = response.url + else: + # Direct HTTP download (no Playwright page available) if file_extension == '.html': - title = await page.title() + # Extract title from HTML using BeautifulSoup + soup = BeautifulSoup(response.body, 'lxml') + title_tag = soup.find('title') + title = title_tag.get_text() if title_tag else response.url else: title = response.url diff --git a/scrapper/scrapper/spiders/uploaded_file_spider.py b/scrapper/scrapper/spiders/uploaded_file_spider.py index 86cb9ab..fce08cc 100644 --- a/scrapper/scrapper/spiders/uploaded_file_spider.py +++ b/scrapper/scrapper/spiders/uploaded_file_spider.py @@ -7,6 +7,13 @@ class UploadedFileSpider(SpecifiedPagesSpider): name = 'uploaded_file' + def get_meta(self): + """ + Override to disable Playwright for uploaded files. + Uploaded files from S3 should use direct HTTP download. + """ + return {} + async def parse(self, response: Response, **kwargs): base_id, _ = self.get_base_id_and_hash(response.request.url) From f9a5275f6de59d6933cf7aff2de5374bdf7ed0d7 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Thu, 16 Oct 2025 01:19:07 +0500 Subject: [PATCH 04/18] remove .txt from upload --- GUI/src/components/FileUploader/FileUploader.tsx | 2 +- GUI/src/pages/Agency/Agency.tsx | 2 +- GUI/src/pages/UploadedFiles/index.tsx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GUI/src/components/FileUploader/FileUploader.tsx b/GUI/src/components/FileUploader/FileUploader.tsx index 8c37fd0..4de9583 100644 --- a/GUI/src/components/FileUploader/FileUploader.tsx +++ b/GUI/src/components/FileUploader/FileUploader.tsx @@ -36,7 +36,7 @@ const FileUploader: FC = ({ onFilesChange, onFileDelete, maxFileSize = 30 * 1024 * 1024, // 30MB default - acceptedTypes = '.pdf,.doc,.docx,.txt,.html,.htm', + acceptedTypes = '.pdf,.doc,.docx,.html,.htm', multiple = true, className = '', uploadProgress, diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx index 0876918..d2b9a9c 100644 --- a/GUI/src/pages/Agency/Agency.tsx +++ b/GUI/src/pages/Agency/Agency.tsx @@ -684,7 +684,7 @@ const Agency: FC = () => { onFilesChange={handleFilesChange} onFileDelete={handleFileDelete} maxFileSize={30 * 1024 * 1024} // 30MB - acceptedTypes=".pdf,.doc,.docx,.txt,.html,.htm" + acceptedTypes=".pdf,.doc,.docx,.html,.htm" multiple={true} uploadProgress={uploadProgress} // Pass upload progress to FileUploader /> diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx index 6ed3aeb..bb0be3d 100644 --- a/GUI/src/pages/UploadedFiles/index.tsx +++ b/GUI/src/pages/UploadedFiles/index.tsx @@ -811,7 +811,7 @@ const UploadedFiles: FC = () => { onFilesChange={handleFilesChange} onFileDelete={handleFileDelete} maxFileSize={30 * 1024 * 1024} // 30MB - acceptedTypes=".pdf,.doc,.docx,.txt,.html,.htm" + acceptedTypes=".pdf,.doc,.docx,.html,.htm" multiple={true} uploadProgress={uploadProgress} /> From 7321342b2573b5bcba5ce6f1d8afd0773105314a Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Thu, 16 Oct 2025 21:34:34 +0500 Subject: [PATCH 05/18] Add Language to metadata --- cleaning/requirements.txt | 1 + cleaning/worker/tasks.py | 11 +++++++++++ file-processing/app/services/zip_service.py | 5 +++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cleaning/requirements.txt b/cleaning/requirements.txt index 00c3182..986160a 100644 --- a/cleaning/requirements.txt +++ b/cleaning/requirements.txt @@ -3,3 +3,4 @@ uvicorn==0.34.2 unstructured[pdf,docx,doc]==0.18.2 pydantic-settings==2.10.1 beautifulsoup4==4.13.4 +langdetect==1.0.9 diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py index f234036..974e6ae 100644 --- a/cleaning/worker/tasks.py +++ b/cleaning/worker/tasks.py @@ -2,6 +2,7 @@ import logging import requests +from langdetect import detect, LangDetectException from unstructured.partition.auto import partition from bs4 import BeautifulSoup @@ -55,6 +56,15 @@ def clean_file_task(entity: EntityToClean): cleaned_text = clean_any_file(entity) logger.info(f'Cleaned as unstructured file for {entity.file_path.as_posix()}') + # Detect language from cleaned text + detected_language = None + if cleaned_text and len(cleaned_text.strip()) > 0: + try: + detected_language = detect(cleaned_text) + print(f'Detected language: {detected_language} for {entity.file_path.as_posix()}') + except LangDetectException as e: + print(f'Language detection failed for {entity.file_path.as_posix()}: {e}') + cleaned_text_filename = entity.directory_path / 'cleaned.txt' with cleaned_text_filename.open("w") as f: @@ -74,6 +84,7 @@ def clean_file_task(entity: EntityToClean): cleaned_metadata_filename = entity.directory_path / "cleaned.meta.json" with cleaned_metadata_filename.open("w") as f: metadata['metadata']['cleaned'] = True + metadata['language'] = detected_language json.dump(metadata, f) r = requests.post( diff --git a/file-processing/app/services/zip_service.py b/file-processing/app/services/zip_service.py index bc18283..d842b4c 100644 --- a/file-processing/app/services/zip_service.py +++ b/file-processing/app/services/zip_service.py @@ -139,12 +139,13 @@ def exclusion_filter(relative_path: str) -> bool: # Create zip file in temp directory base_name = os.path.join(temp_dir, "folder_content") - - logger.info(f"Creating zip file {temp_zip_path} with {successful_count} files") + shutil.make_archive(base_name, 'zip', local_folder_path) temp_zip_path = base_name + ".zip" + logger.info(f"Creating zip file {temp_zip_path} with {successful_count} files") + # Get zip file size zip_size = os.path.getsize(temp_zip_path) From bbaa0b2f2a6fdf3489f79c5adc1d5ea351f5c350 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Thu, 16 Oct 2025 21:35:54 +0500 Subject: [PATCH 06/18] lang metadata --- scrapper/scrapper/items.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrapper/scrapper/items.py b/scrapper/scrapper/items.py index fc18309..fecb6a1 100644 --- a/scrapper/scrapper/items.py +++ b/scrapper/scrapper/items.py @@ -25,6 +25,7 @@ class MetadataItem: version: str = "1.0" created_at: str = field(default_factory=lambda: str(datetime.now())) edited_at: str | None = None + language: Optional[str] = None @dataclass From ef25df999d64a9f87c6fc3a0fcdf80c9b37b2de4 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Thu, 16 Oct 2025 21:36:10 +0500 Subject: [PATCH 07/18] fix callbacks --- scrapper/scrapper/download_handler.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scrapper/scrapper/download_handler.py b/scrapper/scrapper/download_handler.py index a124ba9..5b6f1ee 100644 --- a/scrapper/scrapper/download_handler.py +++ b/scrapper/scrapper/download_handler.py @@ -47,11 +47,17 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: r = await super()._download_request(request, spider) spider.logger.info(f'Playwright request finished: {request.url}') return r + except TimeoutError: + spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again') + await self._close() + super().__init__(self.crawler) # Re-initialize with the same crawler + await self._launch() + return await self._download_request(request, spider) except Exception as e: # Catch "Download is starting" and similar download errors as safety net if "Download is starting" in str(e) or "net::ERR_ABORTED" in str(e): spider.logger.info(f'Download error detected for {request.url}, falling back to direct HTTP download') - # Fall back to direct HTTP download using Scrapy's HTTP handler (consistent with Method 1) + # Fall back to direct HTTP download try: import requests # Use requests for simplicity in async context @@ -80,8 +86,3 @@ async def _download_request(self, request: Request, spider: Spider) -> Response: # Other Playwright errors - re-raise spider.logger.error(f'Playwright error for {request.url}: {str(e)}') raise - except TimeoutError: - spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again') - await self._close() - await self._launch() - return await self._download_request(request, spider) From 9c4a991b0e3168e42afee28307b1989d6271d973 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Sat, 18 Oct 2025 00:28:30 +0500 Subject: [PATCH 08/18] ignore archived urls --- .../app/services/download_service.py | 21 +++---- scrapper/scrapper/spiders/base_spider.py | 7 ++- .../spiders/sitemap_collect_spider.py | 6 ++ scrapper/scrapper/utils.py | 56 +++++++++++++++++++ 4 files changed, 79 insertions(+), 11 deletions(-) diff --git a/file-processing/app/services/download_service.py b/file-processing/app/services/download_service.py index d5c9702..0d9d46a 100644 --- a/file-processing/app/services/download_service.py +++ b/file-processing/app/services/download_service.py @@ -19,11 +19,12 @@ FileDeleteResult ) from app.services.blob_storage import storage_provider, BlobStorageException +from app.core.config import settings logger = logging.getLogger(__name__) -# Volume path configuration -VOLUME_PATH = '/app/data' +# Source path configuration from settings +SOURCE_PATH = settings.source_path # In-memory task store for download tasks _download_tasks: Dict[str, dict] = {} @@ -77,7 +78,7 @@ def process_single_file_download(file_item: FileDownloadItem) -> FileDownloadRes clean_s3_path = parts[0] # Use volume path configuration - local_path_str = f"{VOLUME_PATH}/{file_item.local_path}" + local_path_str = f"{SOURCE_PATH}/{file_item.local_path}" local_path = Path(local_path_str) # Ensure local directory exists @@ -109,7 +110,7 @@ def process_single_file_download(file_item: FileDownloadItem) -> FileDownloadRes error_msg = f"Unexpected error: {str(e)}" return FileDownloadResult( s3_path=file_item.s3_path, - local_path=f"{VOLUME_PATH}/{file_item.local_path}", + local_path=f"{SOURCE_PATH}/{file_item.local_path}", status="failed", error_message=error_msg, is_folder=False, @@ -131,7 +132,7 @@ def process_folder_download(file_item: FileDownloadItem) -> FileDownloadResult: clean_s3_path = parts[0] # Use volume path configuration - local_folder_path = f"{VOLUME_PATH}/{file_item.local_path}" + local_folder_path = f"{SOURCE_PATH}/{file_item.local_path}" # Ensure local directory exists Path(local_folder_path).mkdir(parents=True, exist_ok=True) @@ -176,7 +177,7 @@ def process_folder_download(file_item: FileDownloadItem) -> FileDownloadResult: error_msg = f"Folder download error: {str(e)}" return FileDownloadResult( s3_path=file_item.s3_path, - local_path=f"{VOLUME_PATH}/{file_item.local_path}", + local_path=f"{SOURCE_PATH}/{file_item.local_path}", status="failed", error_message=error_msg, is_folder=True, @@ -218,7 +219,7 @@ def process_download_task(task_id: str) -> None: error_msg = f"Unexpected error processing {file_item.s3_path}: {str(e)}" results.append(FileDownloadResult( s3_path=file_item.s3_path, - local_path=f"{VOLUME_PATH}/{file_item.local_path}", + local_path=f"{SOURCE_PATH}/{file_item.local_path}", status="failed", error_message=error_msg, is_folder=file_item.is_folder, @@ -328,7 +329,7 @@ def delete_files_from_volume(request: DeleteFromVolumeRequest) -> DeleteFromVolu for file_item in request.files: try: # Use volume path configuration - local_path_str = f"{VOLUME_PATH}/{file_item.local_path}" + local_path_str = f"{SOURCE_PATH}/{file_item.local_path}" local_path = Path(local_path_str) if local_path.exists(): @@ -369,7 +370,7 @@ def delete_files_from_volume(request: DeleteFromVolumeRequest) -> DeleteFromVolu except Exception as e: error_msg = f"Unexpected error: {str(e)}" results.append(FileDeleteResult( - local_path=f"{VOLUME_PATH}/{file_item.local_path}", + local_path=f"{SOURCE_PATH}/{file_item.local_path}", status="failed", error_message=error_msg )) @@ -445,7 +446,7 @@ def download_files_to_volume(request: DownloadToVolumeRequest) -> DownloadToVolu error_msg = f"Unexpected error processing {file_item.s3_path}: {str(e)}" results.append(FileDownloadResult( s3_path=file_item.s3_path, - local_path=f"{VOLUME_PATH}/{file_item.local_path}", + local_path=f"{SOURCE_PATH}/{file_item.local_path}", status="failed", error_message=error_msg, is_folder=file_item.is_folder, diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py index b6c2f15..6595757 100644 --- a/scrapper/scrapper/spiders/base_spider.py +++ b/scrapper/scrapper/spiders/base_spider.py @@ -16,7 +16,7 @@ from api.models import BaseObject from scrapper.items import FileItem, MetadataItem, Metadata, ScrappedItem -from scrapper.utils import send_error +from scrapper.utils import send_error, is_archive_url class BaseSpider(Spider): @@ -111,6 +111,11 @@ async def errback(self, failure: Failure): async def parse(self, response: Response, **kwargs): self.check_source_is_stopping() + # Check if URL is an archive page and skip if it is + if is_archive_url(response.url): + self.logger.info(f'Skipping archive URL: {response.url}') + return + file_extension = self.guess_file_extension( response.headers.get(b'Content-Type', 'text/html').decode('utf-8') ) diff --git a/scrapper/scrapper/spiders/sitemap_collect_spider.py b/scrapper/scrapper/spiders/sitemap_collect_spider.py index 89846b7..5d6301c 100644 --- a/scrapper/scrapper/spiders/sitemap_collect_spider.py +++ b/scrapper/scrapper/spiders/sitemap_collect_spider.py @@ -6,6 +6,7 @@ from api.models import SitemapCollectScrapperTask from scrapper.spiders.base_spider import BaseSpider +from scrapper.utils import is_archive_url class SitemapCollectSpider(BaseSpider): @@ -143,6 +144,11 @@ async def parse(self, response: Response, **kwargs): if self.get_pure_domain(next_url) not in self.pure_allowed_domains: continue + # Skip archive URLs + if is_archive_url(next_url): + self.logger.info(f'Skipping archive URL: {next_url}') + continue + if next_url not in self.visited_urls: self.visited_urls.add(next_url) self.logger.info(f'Schedule scrape for url: {next_url}') diff --git a/scrapper/scrapper/utils.py b/scrapper/scrapper/utils.py index 19f6e59..b082621 100644 --- a/scrapper/scrapper/utils.py +++ b/scrapper/scrapper/utils.py @@ -3,6 +3,7 @@ import functools import typing import requests +from urllib.parse import urlparse from scrapper.items import ScrappedItem @@ -66,3 +67,58 @@ def decorator(self, spider: BaseSpider): return r return decorator + + +# Archive URL detection keywords in multiple languages +ARCHIVE_KEYWORDS = [ + # Estonian + 'arhiiv', 'arhiivi', 'archive', + # English + 'archived', 'archives', + # Russian transliteration + 'arkhiv', 'arhiv', +] + + +def is_archive_url(url: str) -> bool: + """ + Check if a URL points to an archived/historical page. + + Detects archive pages by checking for archive-related keywords in: + - Subdomain (e.g., arhiiv.example.ee) + - Path segments (e.g., example.ee/arhiiv/2020/) + + Args: + url: The URL to check + + Returns: + True if the URL appears to be an archive page, False otherwise + + Examples: + >>> is_archive_url('https://arhiiv.lastekaitseliit.ee/et/2016/06/7203/') + True + >>> is_archive_url('https://example.com/arhiiv/old-content') + True + >>> is_archive_url('https://example.com/current-page') + False + """ + try: + parsed = urlparse(url.lower()) + + # Check subdomain for archive keywords + hostname_parts = parsed.hostname.split('.') if parsed.hostname else [] + for part in hostname_parts: + if any(keyword in part for keyword in ARCHIVE_KEYWORDS): + return True + + # Check path segments for archive keywords + path_parts = parsed.path.split('/') + for part in path_parts: + if any(keyword in part for keyword in ARCHIVE_KEYWORDS): + return True + + return False + + except Exception: + # If URL parsing fails, don't filter it out + return False From f6224a84487f1aa8c26e8db5ca129195e05e7a96 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Mon, 20 Oct 2025 22:52:53 +0500 Subject: [PATCH 09/18] https://github.com/buerokratt/Common-Knowledge/issues/88 --- .../ckb/TEMPLATES/pipeline/clean-file.yml | 10 ++++ cleaning/api/models.py | 5 +- cleaning/worker/tasks.py | 53 +++++++++++++++++-- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml b/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml index c6a5817..26f3b70 100644 --- a/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml +++ b/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml @@ -4,6 +4,11 @@ extractRequestData: meta_data_path: ${incoming.body.meta_data_path} directory_path: ${incoming.body.directory_path} source_file_id: ${incoming.body.source_file_id} + logs_path: ${incoming.body.logs_path} + url: ${incoming.body.url} + source_base_id: ${incoming.body.source_base_id} + agency_base_id: ${incoming.body.agency_base_id} + source_run_report_base_id: ${incoming.body.source_run_report_base_id} cleanData: @@ -15,6 +20,11 @@ cleanData: meta_data_path: ${meta_data_path} directory_path: ${directory_path} source_file_id: ${source_file_id} + logs_path: ${logs_path} + url: ${url} + source_base_id: ${source_base_id} + agency_base_id: ${agency_base_id} + source_run_report_base_id: ${source_run_report_base_id} result: cleanedResult diff --git a/cleaning/api/models.py b/cleaning/api/models.py index ff96d4e..833db05 100644 --- a/cleaning/api/models.py +++ b/cleaning/api/models.py @@ -7,4 +7,7 @@ class EntityToClean(BaseModel): directory_path: DirectoryPath source_file_id: str url: str - logs_path: FilePath \ No newline at end of file + logs_path: FilePath + source_base_id: str + agency_base_id: str + source_run_report_base_id: str \ No newline at end of file diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py index 974e6ae..1bcabe2 100644 --- a/cleaning/worker/tasks.py +++ b/cleaning/worker/tasks.py @@ -1,10 +1,12 @@ import json import logging +import re import requests from langdetect import detect, LangDetectException from unstructured.partition.auto import partition +from unstructured.partition.html import partition_html from bs4 import BeautifulSoup from api.config import settings @@ -14,11 +16,52 @@ logger = logging.getLogger(__name__) +def normalize_newlines(text: str) -> str: + """ + Normalize excessive newlines to maximum of 2 consecutive newlines. + Replace 3 or more consecutive newlines with exactly 2 newlines. + """ + # Replace 3 or more newlines with exactly 2 newlines + normalized_text = re.sub(r'\n{3,}', '\n\n', text) + return normalized_text + + def clean_html(entity: EntityToClean): - with entity.file_path.open('r') as f: - soup = BeautifulSoup(f.read(), 'lxml') + """Clean HTML files using a multi-step approach for better content extraction.""" + soup = BeautifulSoup(open(entity.file_path.as_posix(), 'r'), 'lxml') + + # Step 1: Check if there's a
element and use only that + main_element = soup.find('main') + if main_element: + logger.info(f'Found
element, using only main content for {entity.file_path.as_posix()}') + # Remove unwanted elements from main + for element in main_element(['script', 'style', 'nav', 'aside', 'form']): + element.decompose() + cleaned_text = main_element.get_text(separator='\n', strip=True) + logger.info(f'Extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') + return cleaned_text + + # Step 2: Try partition_html with skip_headers_and_footers flag + logger.info(f'No
element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}') + partitioned = partition_html( + filename=entity.file_path.as_posix(), + languages=settings.languages, + skip_headers_and_footers=True + ) + cleaned_text = '\n\n'.join([str(el) for el in partitioned]) + + # Step 3: If partition_html returns empty, fallback to BeautifulSoup + if len(partitioned) == 0: + logger.warning(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}') - return soup.get_text() + # Remove unwanted elements (headers, footers, nav, scripts, styles) + for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']): + element.decompose() + + cleaned_text = soup.get_text(separator='\n', strip=True) + logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}') + + return cleaned_text def clean_any_file(entity: EntityToClean): @@ -56,6 +99,10 @@ def clean_file_task(entity: EntityToClean): cleaned_text = clean_any_file(entity) logger.info(f'Cleaned as unstructured file for {entity.file_path.as_posix()}') + # Normalize excessive newlines (max 2 consecutive newlines) + cleaned_text = normalize_newlines(cleaned_text) + logger.info(f'Normalized newlines for {entity.file_path.as_posix()}') + # Detect language from cleaned text detected_language = None if cleaned_text and len(cleaned_text.strip()) > 0: From f588e979d22964e49040b2bddbee637e3396e68e Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Tue, 21 Oct 2025 00:23:06 +0500 Subject: [PATCH 10/18] https://github.com/buerokratt/Common-Knowledge/issues/90 --- .../ckb/hbs/extract_file_names.handlebars | 5 ++ .../check_duplicate_file_names.sql | 34 +++++++++++++ .../get-upload-urls-for-existing-source.yml | 38 ++++++++++++++ GUI/src/pages/UploadedFiles/index.tsx | 51 ++++++++++++++++--- GUI/src/services/api.ts | 17 +++++-- 5 files changed, 134 insertions(+), 11 deletions(-) create mode 100644 DSL/DMapper/ckb/hbs/extract_file_names.handlebars create mode 100644 DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql diff --git a/DSL/DMapper/ckb/hbs/extract_file_names.handlebars b/DSL/DMapper/ckb/hbs/extract_file_names.handlebars new file mode 100644 index 0000000..cb14b65 --- /dev/null +++ b/DSL/DMapper/ckb/hbs/extract_file_names.handlebars @@ -0,0 +1,5 @@ +[ + {{#each files}} + "{{this.name}}"{{#unless @last}},{{/unless}} + {{/each}} +] diff --git a/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql b/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql new file mode 100644 index 0000000..8532779 --- /dev/null +++ b/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql @@ -0,0 +1,34 @@ +/* +declaration: + version: 0.1 + description: "Check if file names already exist for a source (latest version, non-deleted)" + method: post + accepts: json + returns: json + namespace: source_file + allowlist: + body: + - field: source_id + type: string + description: "Source base ID" + - field: file_names + type: string + description: "Comma-separated file names to check" + response: + fields: + - field: file_name + type: string + description: "Duplicate file name found" +*/ +WITH latest_files AS ( + SELECT DISTINCT ON (base_id) + base_id, file_name, is_deleted + FROM data_collection.source_file + WHERE source_base_id = :source_id::UUID + AND type = 'uploaded_file' + ORDER BY base_id, updated_at DESC +) +SELECT DISTINCT file_name +FROM latest_files +WHERE is_deleted = FALSE + AND file_name = ANY(string_to_array(:fileNames, ',')); diff --git a/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml b/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml index 6206ba0..f637e7c 100644 --- a/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml +++ b/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml @@ -29,6 +29,32 @@ extractRequestData: files: "${incoming.body.files}" expires_in: "${incoming.body.expiresIn || 3600}" +extractFileNames: + call: http.post + args: + url: "[#CKB_DMAPPER_HBS]/extract_file_names" + headers: + type: json + body: + files: ${files} + result: fileNamesResult + +checkDuplicateFileNames: + call: http.post + args: + url: "[#CKB_RESQL]/source_file/check_duplicate_file_names" + headers: + type: json + body: + source_id: ${source_base_id} + fileNames: ${fileNamesResult.response.body?.join(',') ?? ''} + result: duplicatesResult + +validateNoDuplicates: + switch: + - condition: ${duplicatesResult.response.body !== null && duplicatesResult.response.body.length > 0} + next: returnDuplicateError + generateSourceFileIds: call: http.post args: @@ -83,3 +109,15 @@ prepareResponse: returnResponse: return: ${response} next: end + +returnDuplicateError: + assign: + errorResponse: + error: "Duplicate file names found" + duplicateFiles: ${duplicatesResult.response.body} + next: returnDuplicateErrorResponse + +returnDuplicateErrorResponse: + return: ${errorResponse} + status: 400 + next: end diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx index bb0be3d..1e4f1ec 100644 --- a/GUI/src/pages/UploadedFiles/index.tsx +++ b/GUI/src/pages/UploadedFiles/index.tsx @@ -252,21 +252,56 @@ const UploadedFiles: FC = () => { currentFileName: '', }); + // Extract error message from various formats + // Backend returns: {response: {error: "...", duplicateFiles: [...]}} + const responseData = + error.response?.data?.response || error.response?.data || {}; + + const errorMessage = + responseData.error || error.message || t('knowledgeBase.uploadError'); + + // Get duplicate file names if available + const duplicateFiles = responseData.duplicateFiles || []; + // Set failed files to error status setFormData((prev) => ({ ...prev, - files: prev.files.map((file) => ({ - ...file, - status: - file.status === 'uploading' ? ('error' as const) : file.status, - message: file.status === 'uploading' ? error.message : file.message, - })), + files: prev.files.map((file) => { + const isDuplicate = duplicateFiles.some( + (df: any) => + df.fileName === file.name || + df.file_name === file.name || + df === file.name + ); + + // Only mark duplicates as error, leave other files as-is + if (isDuplicate) { + return { + ...file, + status: 'error' as const, + message: errorMessage, + }; + } + + return file; + }), })); + // Build detailed error message + let displayMessage = errorMessage; + if (duplicateFiles.length > 0) { + const fileNames = duplicateFiles + .map((df: any) => + typeof df === 'string' ? df : df.fileName || df.file_name + ) + .join(', '); + displayMessage = `${errorMessage}: ${fileNames}`; + } + toast.open({ type: 'error', title: t('global.notificationError'), - message: error.message || t('knowledgeBase.uploadError'), + message: displayMessage, }); }, }); @@ -781,7 +816,7 @@ const UploadedFiles: FC = () => { uploadProgress.isUploading || !formData.subsector || formData.files.length === 0 || - formData.files.every((file) => file.status === 'error') + formData.files.some((file) => file.status === 'error') } > {uploadProgress.isUploading diff --git a/GUI/src/services/api.ts b/GUI/src/services/api.ts index c5a68b8..93fbfba 100644 --- a/GUI/src/services/api.ts +++ b/GUI/src/services/api.ts @@ -35,10 +35,21 @@ const AxiosInterceptor = ({ children }) => { const errInterceptor = (error: any) => { import.meta.env.DEBUG_ENABLED && console.debug(error); - let message = - error?.response?.data?.response || t('global.notificationErrorMsg'); + // Keep the original error structure for proper error handling + // If there's a response, attach it to a new error with proper message + if (error?.response?.data?.response) { + const responseData = error.response.data.response; + const errorMessage = typeof responseData === 'string' + ? responseData + : responseData.error || t('global.notificationErrorMsg'); - return Promise.reject(new Error(message)); + const newError = new Error(errorMessage); + // Preserve the original response for error handlers + (newError as any).response = error.response; + return Promise.reject(newError); + } + + return Promise.reject(new Error(error?.message || t('global.notificationErrorMsg'))); }; const apiInterceptor = api.interceptors.response.use( From d630a6d120967b3da901d5ab5aa5beaf120e77a5 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Tue, 21 Oct 2025 07:48:12 +0500 Subject: [PATCH 11/18] https://github.com/buerokratt/Common-Knowledge/issues/92 --- API_SPECIFICATION.md | 25 +++++++++++++++-- DATABASE_SCHEMA.md | 2 ++ .../enrich_files_with_uploaded_by.handlebars | 12 ++++++++ ...21071904-add-uploaded-by-and-file-size.sql | 10 +++++++ ...21071904-add-uploaded-by-and-file-size.xml | 12 ++++++++ .../changelog/20251021071904-rollback.sql | 2 ++ .../create_uploaded_source_files.sql | 10 +++++-- .../POST/source-file/add-uploaded-files.yml | 28 ++++++++++++++++++- GUI/src/pages/Agency/Agency.tsx | 9 ++---- GUI/src/pages/UploadedFiles/index.tsx | 9 ++---- GUI/src/services/sources.ts | 2 ++ 11 files changed, 103 insertions(+), 18 deletions(-) create mode 100644 DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars create mode 100644 DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql create mode 100644 DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml create mode 100644 DSL/Liquibase/changelog/20251021071904-rollback.sql diff --git a/API_SPECIFICATION.md b/API_SPECIFICATION.md index 7e0c5fc..9f09f5d 100644 --- a/API_SPECIFICATION.md +++ b/API_SPECIFICATION.md @@ -262,20 +262,41 @@ List source files. #### POST /ckb/source-file/add-uploaded-files Add uploaded files to a source. +**Request Headers:** +- `Cookie`: Contains JWT with user information for tracking uploader + **Request Body:** ```json { - "source_base_id": "uuid", + "agencyId": "uuid", + "sourceId": "uuid", "files": [ { + "base_id": "uuid", "file_name": "document.pdf", "original_data_url": "s3://bucket/uploads/file", - "external_id": "ext_123" + "subsector": "Legal", + "file_size": 13264 } ] } ``` +**Response:** +```json +[ + { + "id": "uuid", + "url": null, + "hash": "", + "original_data_url": "s3://bucket/uploads/file", + "path": "s3://bucket/uploads/file" + } +] +``` + +**Note:** The `uploaded_by` field is automatically populated from the JWT cookie (user's `idCode`). + #### POST /ckb/source-file/get-upload-urls Get presigned upload URLs. diff --git a/DATABASE_SCHEMA.md b/DATABASE_SCHEMA.md index 5a95540..b9d7be4 100644 --- a/DATABASE_SCHEMA.md +++ b/DATABASE_SCHEMA.md @@ -69,6 +69,8 @@ erDiagram TEXT file_name "Original filename" TEXT external_id "External system ID" TEXT subsector "Data subsector" + BIGINT file_size "File size in bytes" + TEXT uploaded_by "User ID who uploaded (uploaded_file only)" BOOLEAN is_excluded "Excluded from processing" BOOLEAN is_deleted "Soft delete flag" } diff --git a/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars b/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars new file mode 100644 index 0000000..88c8ed6 --- /dev/null +++ b/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars @@ -0,0 +1,12 @@ +[ + {{#each files}} + { + "base_id": "{{this.base_id}}", + "file_name": "{{this.file_name}}", + "subsector": "{{this.subsector}}", + "original_data_url": "{{this.original_data_url}}", + "file_size": {{this.file_size}}, + "uploaded_by": "{{@root.uploaded_by}}" + }{{#unless @last}},{{/unless}} + {{/each}} +] diff --git a/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql new file mode 100644 index 0000000..8a857cc --- /dev/null +++ b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql @@ -0,0 +1,10 @@ +-- liquibase formatted sql +-- changeset ahmer-mt:20251021071904 ignore:true +-- Add uploaded_by and file_size columns to source_file table + +ALTER TABLE data_collection.source_file +ADD COLUMN uploaded_by TEXT, +ADD COLUMN file_size BIGINT; + +COMMENT ON COLUMN data_collection.source_file.uploaded_by IS 'User/system that uploaded the file (only for uploaded_file type)'; +COMMENT ON COLUMN data_collection.source_file.file_size IS 'File size in bytes'; diff --git a/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml new file mode 100644 index 0000000..949c9a6 --- /dev/null +++ b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/DSL/Liquibase/changelog/20251021071904-rollback.sql b/DSL/Liquibase/changelog/20251021071904-rollback.sql new file mode 100644 index 0000000..b2ec9d5 --- /dev/null +++ b/DSL/Liquibase/changelog/20251021071904-rollback.sql @@ -0,0 +1,2 @@ +-- liquibase formatted sql +-- changeset ahmer-mt:20251021071904 ignore:true diff --git a/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql b/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql index c60f6ec..3982d72 100644 --- a/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql +++ b/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql @@ -16,7 +16,7 @@ declaration: description: "Agency base ID" - field: files type: array - description: "Array of file objects with base_id, file_name, subsector, original_data_url" + description: "Array of file objects with base_id, file_name, subsector, original_data_url, file_size, uploaded_by" response: fields: - field: url @@ -36,7 +36,7 @@ declaration: description: "Path (same as original_data_url)" */ INSERT INTO data_collection.source_file ( - source_base_id, agency_base_id, base_id, file_name, subsector, original_data_url, type + source_base_id, agency_base_id, base_id, file_name, subsector, original_data_url, file_size, uploaded_by, type ) SELECT :source_id::UUID, @@ -45,13 +45,17 @@ SELECT file_data.file_name, file_data.subsector, file_data.original_data_url, + file_data.file_size::BIGINT, + file_data.uploaded_by, 'uploaded_file'::source_file_type FROM ( SELECT (SELECT value) ->> 'base_id' AS base_id, (SELECT value) ->> 'file_name' AS file_name, (SELECT value) ->> 'subsector' AS subsector, - (SELECT value) ->> 'original_data_url' AS original_data_url + (SELECT value) ->> 'original_data_url' AS original_data_url, + (SELECT value) ->> 'file_size' AS file_size, + (SELECT value) ->> 'uploaded_by' AS uploaded_by FROM JSON_ARRAY_ELEMENTS(ARRAY_TO_JSON(ARRAY[:files])) WITH ORDINALITY ) AS file_data RETURNING NULL as url, base_id as id, '' as hash, original_data_url, original_data_url as path; \ No newline at end of file diff --git a/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml b/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml index c1b6c90..ac9de62 100644 --- a/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml +++ b/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml @@ -17,12 +17,27 @@ declaration: - field: files type: array description: "Array of uploaded file objects" + headers: + - field: cookie + type: string + description: "Cookie field" + +getUserInfo: + call: http.post + args: + url: "[#CKB_TIM]/jwt/custom-jwt-userinfo" + contentType: plaintext + headers: + cookie: ${incoming.headers.cookie} + plaintext: "customJwtCookie" + result: userInfoResult extractRequestData: assign: agency_id: "${incoming.body.agencyId}" source_id: "${incoming.body.sourceId}" files: "${incoming.body.files}" + uploaded_by: "${userInfoResult.response.body.idCode}" validateInput: switch: @@ -31,6 +46,17 @@ validateInput: - condition: ${files === null || files.length === 0} next: returnError +enrichFilesWithUploadedBy: + call: http.post + args: + url: "[#CKB_DMAPPER_HBS]/enrich_files_with_uploaded_by" + headers: + type: json + body: + files: ${files} + uploaded_by: ${uploaded_by} + result: enrichedFilesResult + createUploadedSourceFiles: call: http.post args: @@ -40,7 +66,7 @@ createUploadedSourceFiles: body: agency_id: ${agency_id} source_id: ${source_id} - files: ${files} + files: ${enrichedFilesResult.response.body} result: createResult transformToGetDownloadUrls: diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx index d2b9a9c..ea100b9 100644 --- a/GUI/src/pages/Agency/Agency.tsx +++ b/GUI/src/pages/Agency/Agency.tsx @@ -208,18 +208,15 @@ const Agency: FC = () => { })), })); + setUploadModal(false); + setFormData({ subsector: '', files: [] }); + toast.open({ type: 'success', title: t('global.notification'), message: t('knowledgeBase.uploadSuccess'), }); - // Close modal after a short delay to show success state - setTimeout(() => { - setUploadModal(false); - setFormData({ subsector: '', files: [] }); - }, 1000); - queryClient.invalidateQueries(['sources']); }, onError: (error: any) => { diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx index 1e4f1ec..15c6a18 100644 --- a/GUI/src/pages/UploadedFiles/index.tsx +++ b/GUI/src/pages/UploadedFiles/index.tsx @@ -229,18 +229,15 @@ const UploadedFiles: FC = () => { })), })); + setUploadModal(false); + setFormData({ search: '', files: [], subsector: '' }); + toast.open({ type: 'success', title: t('global.notification'), message: t('knowledgeBase.uploadSuccess'), }); - // Close modal after a short delay to show success state - setTimeout(() => { - setUploadModal(false); - setFormData({ search: '', files: [], subsector: '' }); - }, 1000); - queryClient.invalidateQueries(['uploadedFiles']); }, onError: (error: any) => { diff --git a/GUI/src/services/sources.ts b/GUI/src/services/sources.ts index 5188a54..f19a4a7 100644 --- a/GUI/src/services/sources.ts +++ b/GUI/src/services/sources.ts @@ -230,6 +230,7 @@ export const createSourceFile = async ( file_name: uploadInfo.uploadItem.fileName, subsector: data.subsector, original_data_url: uploadInfo.uploadItem.path, + file_size: uploadInfo.file.size, })); await registerUploadedFiles( @@ -295,6 +296,7 @@ export const addFilesToExistingSource = async ( file_name: uploadInfo.uploadItem.fileName, subsector: data.subsector, original_data_url: uploadInfo.uploadItem.path, + file_size: uploadInfo.file.size, })); await registerUploadedFiles( From d652c4166dccf2f7595eca5f2825ad28d11ace0f Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Tue, 21 Oct 2025 08:23:56 +0500 Subject: [PATCH 12/18] fix sorting --- .../ckb/GET/source/list_agency_sources.sql | 14 ++++++------ DSL/Resql/ckb/GET/source/list_api_sources.sql | 10 ++++----- .../list_excluded_source_files_by_agency.sql | 22 ++++++++++++------- .../source_file/list_scraped_source_files.sql | 2 +- .../list_uploaded_source_files.sql | 6 ++--- .../source_run_page/list_source_run_pages.sql | 8 +++---- GUI/src/components/DataTable/index.tsx | 4 ++-- GUI/src/pages/API/ApiDetail.tsx | 2 +- GUI/src/pages/API/index.tsx | 2 +- GUI/src/pages/Agency/Agency.tsx | 4 ++-- GUI/src/pages/Agency/index.tsx | 2 +- GUI/src/pages/Reports/Report.tsx | 2 +- GUI/src/pages/Reports/index.tsx | 2 +- GUI/src/pages/ScrapedFiles/index.tsx | 2 +- GUI/src/pages/UploadedFiles/index.tsx | 2 +- GUI/src/services/agencies.ts | 2 +- GUI/src/services/files.ts | 4 ++-- GUI/src/services/reports.ts | 4 ++-- GUI/src/services/sources.ts | 6 ++--- 19 files changed, 53 insertions(+), 47 deletions(-) diff --git a/DSL/Resql/ckb/GET/source/list_agency_sources.sql b/DSL/Resql/ckb/GET/source/list_agency_sources.sql index 0a75016..cc134e3 100644 --- a/DSL/Resql/ckb/GET/source/list_agency_sources.sql +++ b/DSL/Resql/ckb/GET/source/list_agency_sources.sql @@ -63,28 +63,28 @@ declaration: description: "Total number of matching records" */ WITH latest_sources AS ( - SELECT DISTINCT ON (base_id) - id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type, is_deleted + SELECT DISTINCT ON (base_id) + id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type, is_deleted, updated_at FROM data_collection.source WHERE agency_base_id = :agency_base_id::UUID ORDER BY base_id, updated_at DESC ) -SELECT +SELECT id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type, :page as page, CEIL(COUNT(*) OVER () / :page_size::DECIMAL) AS total_pages, (COUNT(*) OVER ()) AS total FROM latest_sources WHERE is_deleted = FALSE -ORDER BY +ORDER BY CASE WHEN :sorting = 'url asc' THEN url END ASC, CASE WHEN :sorting = 'url desc' THEN url END DESC, CASE WHEN :sorting = 'subsector asc' THEN subsector END ASC, CASE WHEN :sorting = 'subsector desc' THEN subsector END DESC, - CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC, - CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC, + CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC NULLS LAST, + CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC NULLS LAST, CASE WHEN :sorting = 'status asc' THEN status END ASC, CASE WHEN :sorting = 'status desc' THEN status END DESC, - last_scraped_at DESC NULLS LAST + updated_at DESC NULLS LAST LIMIT :page_size::INTEGER OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER); \ No newline at end of file diff --git a/DSL/Resql/ckb/GET/source/list_api_sources.sql b/DSL/Resql/ckb/GET/source/list_api_sources.sql index 537452e..107d346 100644 --- a/DSL/Resql/ckb/GET/source/list_api_sources.sql +++ b/DSL/Resql/ckb/GET/source/list_api_sources.sql @@ -52,26 +52,26 @@ declaration: description: "Total number of matching records" */ WITH latest_sources AS ( - SELECT DISTINCT ON (base_id) - id, base_id, agency_base_id, url, status, last_scraped_at, type, is_deleted + SELECT DISTINCT ON (base_id) + id, base_id, agency_base_id, url, status, last_scraped_at, type, is_deleted, updated_at FROM data_collection.source WHERE type = 'api'::source_type ORDER BY base_id, updated_at DESC ) -SELECT +SELECT id, base_id, agency_base_id, url, status, last_scraped_at, :page as page, CEIL(COUNT(*) OVER () / :page_size::DECIMAL) AS total_pages, (COUNT(*) OVER ()) AS total FROM latest_sources WHERE is_deleted = FALSE -ORDER BY +ORDER BY CASE WHEN :sorting = 'url asc' THEN url END ASC, CASE WHEN :sorting = 'url desc' THEN url END DESC, CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC, CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC, CASE WHEN :sorting = 'status asc' THEN status END ASC, CASE WHEN :sorting = 'status desc' THEN status END DESC, - last_scraped_at DESC NULLS LAST + updated_at DESC NULLS LAST LIMIT :page_size::INTEGER OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER); \ No newline at end of file diff --git a/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql b/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql index e74092f..a909a9d 100644 --- a/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql +++ b/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql @@ -25,13 +25,19 @@ declaration: type: string description: "Base ID of the source" */ +WITH latest_files AS ( + SELECT DISTINCT ON (base_id) + id, base_id, agency_base_id, source_base_id, is_deleted, updated_at + FROM data_collection.source_file + WHERE agency_base_id = :agency_base_id::UUID + AND is_excluded = true + ORDER BY base_id, updated_at DESC +) SELECT - id, - base_id, - agency_base_id, + id, + base_id, + agency_base_id, source_base_id -FROM data_collection.source_file -WHERE - agency_base_id = :agency_base_id::UUID - AND is_excluded = true - AND is_deleted = false; \ No newline at end of file +FROM latest_files +WHERE is_deleted = false +ORDER BY updated_at DESC NULLS LAST; \ No newline at end of file diff --git a/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql b/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql index 0d171ff..c9d33fa 100644 --- a/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql +++ b/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql @@ -116,6 +116,6 @@ ORDER BY CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC, CASE WHEN :sorting = 'external_id asc' THEN external_id END ASC, CASE WHEN :sorting = 'external_id desc' THEN external_id END DESC, - last_scraped_at DESC NULLS LAST + updated_at DESC NULLS LAST LIMIT :page_size::INTEGER OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER); \ No newline at end of file diff --git a/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql b/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql index 2f97fb9..d8245d1 100644 --- a/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql +++ b/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql @@ -101,8 +101,8 @@ ORDER BY CASE WHEN :sorting = 'excluded desc' THEN is_excluded END DESC, CASE WHEN :sorting = 'status asc' THEN status END ASC, CASE WHEN :sorting = 'status desc' THEN status END DESC, - CASE WHEN :sorting = 'created_at asc' THEN created_at END ASC, - CASE WHEN :sorting = 'created_at desc' THEN created_at END DESC, - created_at DESC NULLS LAST + CASE WHEN :sorting = 'last_scraped_at asc' THEN created_at END ASC, + CASE WHEN :sorting = 'last_scraped_at desc' THEN created_at END DESC, + updated_at DESC NULLS LAST LIMIT :page_size::INTEGER OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER); \ No newline at end of file diff --git a/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql b/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql index 439d4b1..4ab8a9a 100644 --- a/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql +++ b/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql @@ -54,9 +54,9 @@ declaration: description: "total number of agencies" */ WITH latest_run_pages AS ( - SELECT DISTINCT ON (base_id) - id, base_id, source_run_report_base_id, url, error_type, error_message, scraped_at, is_deleted - FROM monitoring.source_run_page + SELECT DISTINCT ON (base_id) + id, base_id, source_run_report_base_id, url, error_type, error_message, scraped_at, is_deleted, updated_at + FROM monitoring.source_run_page WHERE (:source_run_report_base_id IS NULL OR source_run_report_base_id = :source_run_report_base_id::UUID) ORDER BY base_id, updated_at DESC ) @@ -76,6 +76,6 @@ ORDER BY CASE WHEN :sorting = 'error_message desc' THEN error_message END DESC, CASE WHEN :sorting = 'scraped_at asc' THEN scraped_at END ASC, CASE WHEN :sorting = 'scraped_at desc' THEN scraped_at END DESC, - scraped_at DESC NULLS LAST + updated_at DESC NULLS LAST LIMIT :page_size::INTEGER OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER); \ No newline at end of file diff --git a/GUI/src/components/DataTable/index.tsx b/GUI/src/components/DataTable/index.tsx index 9d14356..36eaf6c 100644 --- a/GUI/src/components/DataTable/index.tsx +++ b/GUI/src/components/DataTable/index.tsx @@ -221,13 +221,13 @@ const DataTable: FC = ({ {{ asc: ( } + icon={} size="medium" /> ), desc: ( } + icon={} size="medium" /> ), diff --git a/GUI/src/pages/API/ApiDetail.tsx b/GUI/src/pages/API/ApiDetail.tsx index 4da9311..3f36fcc 100644 --- a/GUI/src/pages/API/ApiDetail.tsx +++ b/GUI/src/pages/API/ApiDetail.tsx @@ -95,7 +95,7 @@ const ApiDetail: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'last_scraped_at desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; diff --git a/GUI/src/pages/API/index.tsx b/GUI/src/pages/API/index.tsx index 0cd3f4f..7194579 100644 --- a/GUI/src/pages/API/index.tsx +++ b/GUI/src/pages/API/index.tsx @@ -35,7 +35,7 @@ const ApiList: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'last_scraped_at desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx index ea100b9..30e3cfa 100644 --- a/GUI/src/pages/Agency/Agency.tsx +++ b/GUI/src/pages/Agency/Agency.tsx @@ -87,7 +87,7 @@ const Agency: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'last_scraped_at desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; @@ -96,7 +96,7 @@ const Agency: FC = () => { const fieldMap: Record = { url: 'url', subsector: 'subsector', - lastScraped: 'last_scraped_at', + lastScrapedAt: 'last_scraped_at', status: 'status', }; diff --git a/GUI/src/pages/Agency/index.tsx b/GUI/src/pages/Agency/index.tsx index 32d0fb0..3bf439d 100644 --- a/GUI/src/pages/Agency/index.tsx +++ b/GUI/src/pages/Agency/index.tsx @@ -40,7 +40,7 @@ const AgencyComponent: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'updatedAt desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; diff --git a/GUI/src/pages/Reports/Report.tsx b/GUI/src/pages/Reports/Report.tsx index 089c89a..d59864f 100644 --- a/GUI/src/pages/Reports/Report.tsx +++ b/GUI/src/pages/Reports/Report.tsx @@ -33,7 +33,7 @@ const Report: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'scraped_at desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; diff --git a/GUI/src/pages/Reports/index.tsx b/GUI/src/pages/Reports/index.tsx index f8bfe0f..acd93b3 100644 --- a/GUI/src/pages/Reports/index.tsx +++ b/GUI/src/pages/Reports/index.tsx @@ -37,7 +37,7 @@ const Reports: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'scraping_started_at desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; diff --git a/GUI/src/pages/ScrapedFiles/index.tsx b/GUI/src/pages/ScrapedFiles/index.tsx index 40f1ba3..87b7387 100644 --- a/GUI/src/pages/ScrapedFiles/index.tsx +++ b/GUI/src/pages/ScrapedFiles/index.tsx @@ -95,7 +95,7 @@ const ScrapedFiles: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'last_scraped_at desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx index 15c6a18..8dadf63 100644 --- a/GUI/src/pages/UploadedFiles/index.tsx +++ b/GUI/src/pages/UploadedFiles/index.tsx @@ -114,7 +114,7 @@ const UploadedFiles: FC = () => { // Convert sorting state to API format const getSortingParam = (sorting: SortingState): string => { - if (sorting.length === 0) return 'last_scraped_at desc'; + if (sorting.length === 0) return ''; const sort = sorting[0]; let field = sort.id; diff --git a/GUI/src/services/agencies.ts b/GUI/src/services/agencies.ts index e03b63b..41e1729 100644 --- a/GUI/src/services/agencies.ts +++ b/GUI/src/services/agencies.ts @@ -62,7 +62,7 @@ export const getAgencies = async ( params: { page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'updatedAt desc', + sorting: params.sorting || '', }, }); diff --git a/GUI/src/services/files.ts b/GUI/src/services/files.ts index 80df62b..8a6538e 100644 --- a/GUI/src/services/files.ts +++ b/GUI/src/services/files.ts @@ -97,7 +97,7 @@ export const getScrapedFiles = async ( isExcluded: params.isExcluded, page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'last_scraped_at desc', + sorting: params.sorting || '', search: params.search, }, }); @@ -128,7 +128,7 @@ export const getUploadedFiles = async ( isExcluded: params.isExcluded, page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'last_scraped_at desc', + sorting: params.sorting || '', search: params.search, }, }); diff --git a/GUI/src/services/reports.ts b/GUI/src/services/reports.ts index bf28f98..ce2133e 100644 --- a/GUI/src/services/reports.ts +++ b/GUI/src/services/reports.ts @@ -40,7 +40,7 @@ export const getReports = async ( params: { page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'scraping_started_at desc', + sorting: params.sorting || '', }, }); @@ -116,7 +116,7 @@ export const getReportPages = async ( source_run_report_base_id: params.source_run_report_base_id, page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'scraped_at desc', + sorting: params.sorting || '', }, }); diff --git a/GUI/src/services/sources.ts b/GUI/src/services/sources.ts index f19a4a7..793b877 100644 --- a/GUI/src/services/sources.ts +++ b/GUI/src/services/sources.ts @@ -148,7 +148,7 @@ export const getSources = async ( agencyBaseId: params.agencyBaseId, page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'last_scraped_at desc', + sorting: params.sorting || '', }, }); @@ -177,7 +177,7 @@ export const getApiIntegrations = async ( params: { page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'last_scraped_at desc', + sorting: params.sorting || '', }, }); @@ -433,7 +433,7 @@ export const getApiSourceFiles = async ( sourceId: params.sourceId, page: params.page || 1, pageSize: params.pageSize || 10, - sorting: params.sorting || 'last_scraped_at desc', + sorting: params.sorting || '', search: params.search, type: params.type, }, From 89ef0dedc03e99e69354b597a58516a260f94307 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Tue, 21 Oct 2025 08:36:30 +0500 Subject: [PATCH 13/18] fix pagination last page delete items --- GUI/src/pages/Agency/Agency.tsx | 24 ++++++++++++++++++++++-- GUI/src/pages/Agency/index.tsx | 24 ++++++++++++++++++++++-- GUI/src/pages/Reports/index.tsx | 24 ++++++++++++++++++++++-- GUI/src/pages/ScrapedFiles/index.tsx | 24 ++++++++++++++++++++++-- GUI/src/pages/UploadedFiles/index.tsx | 24 ++++++++++++++++++++++-- 5 files changed, 110 insertions(+), 10 deletions(-) diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx index 30e3cfa..69d3a3c 100644 --- a/GUI/src/pages/Agency/Agency.tsx +++ b/GUI/src/pages/Agency/Agency.tsx @@ -295,14 +295,34 @@ const Agency: FC = () => { // Delete source mutation const deleteMutation = useMutation({ mutationFn: deleteSource, - onSuccess: () => { + onSuccess: async () => { toast.open({ type: 'success', title: t('global.notification'), message: t('knowledgeBase.deleteSuccess'), }); setDeleteModal(null); - queryClient.invalidateQueries(['sources']); + + // Refetch to get updated data + await queryClient.invalidateQueries(['sources']); + + // Check if current page is now out of bounds + const newTotal = (sourcesData?.total || 0) - 1; + const maxPages = Math.ceil(newTotal / pagination.pageSize); + + // Reset to last valid page if current page is out of bounds + if (pagination.pageIndex >= maxPages && maxPages > 0) { + setPagination({ + ...pagination, + pageIndex: maxPages - 1, + }); + } else if (maxPages === 0) { + // If no data left, reset to page 0 + setPagination({ + ...pagination, + pageIndex: 0, + }); + } }, onError: (error: any) => { toast.open({ diff --git a/GUI/src/pages/Agency/index.tsx b/GUI/src/pages/Agency/index.tsx index 3bf439d..96575f8 100644 --- a/GUI/src/pages/Agency/index.tsx +++ b/GUI/src/pages/Agency/index.tsx @@ -81,14 +81,34 @@ const AgencyComponent: FC = () => { // Delete agency mutation const deleteAgencyMutation = useMutation({ mutationFn: deleteAgency, - onSuccess: () => { + onSuccess: async () => { toast.open({ type: 'success', title: t('global.notification'), message: t('knowledgeBase.deleteSuccess'), }); setDeleteModal(null); - queryClient.invalidateQueries(['agencies']); + + // Refetch to get updated data + await queryClient.invalidateQueries(['agencies']); + + // Check if current page is now out of bounds + const newTotal = (knowledgeBaseData.total || 0) - 1; + const maxPages = Math.ceil(newTotal / pagination.pageSize); + + // Reset to last valid page if current page is out of bounds + if (pagination.pageIndex >= maxPages && maxPages > 0) { + setPagination({ + ...pagination, + pageIndex: maxPages - 1, + }); + } else if (maxPages === 0) { + // If no data left, reset to page 0 + setPagination({ + ...pagination, + pageIndex: 0, + }); + } }, onError: (error: any) => { toast.open({ diff --git a/GUI/src/pages/Reports/index.tsx b/GUI/src/pages/Reports/index.tsx index acd93b3..8b98cff 100644 --- a/GUI/src/pages/Reports/index.tsx +++ b/GUI/src/pages/Reports/index.tsx @@ -80,14 +80,34 @@ const Reports: FC = () => { // Delete report mutation const deleteReportMutation = useMutation({ mutationFn: deleteReport, - onSuccess: () => { + onSuccess: async () => { toast.open({ type: 'success', title: t('global.notification'), message: t('reports.deleteSuccess'), }); setDeleteModal(null); - queryClient.invalidateQueries(['reports']); + + // Refetch to get updated data + await queryClient.invalidateQueries(['reports']); + + // Check if current page is now out of bounds + const newTotal = (reportsData.total || 0) - 1; + const maxPages = Math.ceil(newTotal / pagination.pageSize); + + // Reset to last valid page if current page is out of bounds + if (pagination.pageIndex >= maxPages && maxPages > 0) { + setPagination({ + ...pagination, + pageIndex: maxPages - 1, + }); + } else if (maxPages === 0) { + // If no data left, reset to page 0 + setPagination({ + ...pagination, + pageIndex: 0, + }); + } }, onError: (error: any) => { toast.open({ diff --git a/GUI/src/pages/ScrapedFiles/index.tsx b/GUI/src/pages/ScrapedFiles/index.tsx index 87b7387..bd41a67 100644 --- a/GUI/src/pages/ScrapedFiles/index.tsx +++ b/GUI/src/pages/ScrapedFiles/index.tsx @@ -160,14 +160,34 @@ const ScrapedFiles: FC = () => { // Delete file mutation const deleteMutation = useMutation({ mutationFn: deleteFile, - onSuccess: () => { + onSuccess: async () => { toast.open({ type: 'success', title: t('global.notification'), message: t('knowledgeBase.urlDeleteSuccess'), }); setDeleteModal(null); - queryClient.invalidateQueries(['scrapedFiles']); + + // Refetch to get updated data + await queryClient.invalidateQueries(['scrapedFiles']); + + // Check if current page is now out of bounds + const newTotal = (scrapedFilesData?.total || 0) - 1; + const maxPages = Math.ceil(newTotal / pagination.pageSize); + + // Reset to last valid page if current page is out of bounds + if (pagination.pageIndex >= maxPages && maxPages > 0) { + setPagination({ + ...pagination, + pageIndex: maxPages - 1, + }); + } else if (maxPages === 0) { + // If no data left, reset to page 0 + setPagination({ + ...pagination, + pageIndex: 0, + }); + } }, onError: (error: any) => { toast.open({ diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx index 8dadf63..337470a 100644 --- a/GUI/src/pages/UploadedFiles/index.tsx +++ b/GUI/src/pages/UploadedFiles/index.tsx @@ -306,14 +306,34 @@ const UploadedFiles: FC = () => { // Delete file mutation const deleteMutation = useMutation({ mutationFn: deleteFile, - onSuccess: () => { + onSuccess: async () => { toast.open({ type: 'success', title: t('global.notification'), message: t('knowledgeBase.fileDeleteSuccess'), }); setDeleteModal(null); - queryClient.invalidateQueries(['uploadedFiles']); + + // Refetch to get updated data + await queryClient.invalidateQueries(['uploadedFiles']); + + // Check if current page is now out of bounds + const newTotal = (uploadedFilesData?.total || 0) - 1; + const maxPages = Math.ceil(newTotal / pagination.pageSize); + + // Reset to last valid page if current page is out of bounds + if (pagination.pageIndex >= maxPages && maxPages > 0) { + setPagination({ + ...pagination, + pageIndex: maxPages - 1, + }); + } else if (maxPages === 0) { + // If no data left, reset to page 0 + setPagination({ + ...pagination, + pageIndex: 0, + }); + } }, onError: (error: any) => { toast.open({ From e9aa1e174553eab0c312659a6508927bec4ea45c Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Tue, 21 Oct 2025 19:23:49 +0500 Subject: [PATCH 14/18] https://github.com/buerokratt/Common-Knowledge/issues/88 fixes --- cleaning/worker/tasks.py | 38 ++++++++++++------- scrapper/scrapper/spiders/base_spider.py | 35 ++++++++++++++--- .../spiders/sitemap_collect_spider.py | 11 +++++- 3 files changed, 65 insertions(+), 19 deletions(-) diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py index 1bcabe2..7c5e967 100644 --- a/cleaning/worker/tasks.py +++ b/cleaning/worker/tasks.py @@ -30,19 +30,36 @@ def clean_html(entity: EntityToClean): """Clean HTML files using a multi-step approach for better content extraction.""" soup = BeautifulSoup(open(entity.file_path.as_posix(), 'r'), 'lxml') + # First, remove unwanted elements from the entire document + for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']): + element.decompose() + # Step 1: Check if there's a
element and use only that main_element = soup.find('main') if main_element: - logger.info(f'Found
element, using only main content for {entity.file_path.as_posix()}') - # Remove unwanted elements from main - for element in main_element(['script', 'style', 'nav', 'aside', 'form']): - element.decompose() + print(f'Found
element, trying partition_html on main content for {entity.file_path.as_posix()}') + + # Try partition_html on main element content first + main_html = str(main_element) + partitioned = partition_html( + text=main_html, + languages=settings.languages, + skip_headers_and_footers=True + ) + + if len(partitioned) > 0: + cleaned_text = '\n\n'.join([str(el) for el in partitioned]) + print(f'partition_html extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') + return cleaned_text + + # If partition_html returns empty, fall back to BeautifulSoup on main + print(f'partition_html on
returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}') cleaned_text = main_element.get_text(separator='\n', strip=True) - logger.info(f'Extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') + print(f'BeautifulSoup extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') return cleaned_text # Step 2: Try partition_html with skip_headers_and_footers flag - logger.info(f'No
element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}') + print(f'No
element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}') partitioned = partition_html( filename=entity.file_path.as_posix(), languages=settings.languages, @@ -52,14 +69,9 @@ def clean_html(entity: EntityToClean): # Step 3: If partition_html returns empty, fallback to BeautifulSoup if len(partitioned) == 0: - logger.warning(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}') - - # Remove unwanted elements (headers, footers, nav, scripts, styles) - for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']): - element.decompose() - + print(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}') cleaned_text = soup.get_text(separator='\n', strip=True) - logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}') + print(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}') return cleaned_text diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py index 6595757..9d172d8 100644 --- a/scrapper/scrapper/spiders/base_spider.py +++ b/scrapper/scrapper/spiders/base_spider.py @@ -122,12 +122,29 @@ async def parse(self, response: Response, **kwargs): # Check if Playwright page is available (might not be if direct HTTP download was used) playwright_page = response.meta.get("playwright_page") + rendered_html = None + if playwright_page: - # Use Playwright page for title extraction + # Use Playwright page for title extraction and get rendered HTML async with self.close_page(response) as page: page: Page - if file_extension == '.html': + + # Check if this is a sitemap or XML file - don't wait or render, use raw content + is_sitemap = 'sitemap' in response.url.lower() or file_extension == '.xml' + + if file_extension == '.html' and not is_sitemap: + # Wait for content to actually render (check if body has meaningful content) + try: + self.logger.info('Waiting for dynamic content to render...') + await page.wait_for_load_state('networkidle') + self.logger.info('Content detected, proceeding...') + except Exception as e: + # If timeout or error, proceed anyway + self.logger.warning(f'Timeout waiting for content, proceeding anyway: {e}') + title = await page.title() + # Get the fully rendered HTML after JavaScript execution + rendered_html = await page.content() else: title = response.url else: @@ -140,18 +157,26 @@ async def parse(self, response: Response, **kwargs): else: title = response.url + # Use rendered HTML if available, otherwise use response.body + body_to_save = rendered_html.encode('utf-8') if rendered_html else response.body + if file_extension == '.html': - soup = BeautifulSoup(response.body, 'lxml') + soup = BeautifulSoup(body_to_save, 'lxml') text = soup.get_text() hashed = hashlib.sha1(text.encode()).hexdigest() else: - hashed = hashlib.sha1(response.body).hexdigest() + hashed = hashlib.sha1(body_to_save).hexdigest() - file_item = FileItem(body=response.body, source_url=response.url, extension=file_extension) + file_item = FileItem(body=body_to_save, source_url=response.url, extension=file_extension) metadata_item = MetadataItem( file_type=file_extension, metadata=Metadata(), source_url=response.url, page_title=title ) scrapped_item = ScrappedItem(file=file_item, metadata=metadata_item, hash=hashed) + + # Store rendered HTML in response meta for link extraction by subclasses + if rendered_html: + response.meta['rendered_html'] = rendered_html + yield scrapped_item diff --git a/scrapper/scrapper/spiders/sitemap_collect_spider.py b/scrapper/scrapper/spiders/sitemap_collect_spider.py index 5d6301c..27c76ff 100644 --- a/scrapper/scrapper/spiders/sitemap_collect_spider.py +++ b/scrapper/scrapper/spiders/sitemap_collect_spider.py @@ -136,7 +136,16 @@ async def parse(self, response: Response, **kwargs): if scrapped_item.metadata.file_type != '.html': continue - for href in response.css("a::attr(href)").getall(): + # Use rendered HTML for link extraction if available (for SPAs) + rendered_html = response.meta.get('rendered_html') + if rendered_html: + from bs4 import BeautifulSoup + soup = BeautifulSoup(rendered_html, 'lxml') + links = [a.get('href') for a in soup.find_all('a', href=True)] + else: + links = response.css("a::attr(href)").getall() + + for href in links: next_url = urljoin(response.url, href) next_url = next_url.split('#')[0] From 56dde937150933e011985851a8617c288db4fbe5 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Tue, 21 Oct 2025 19:28:14 +0500 Subject: [PATCH 15/18] fix logging --- cleaning/worker/tasks.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py index 7c5e967..242c3a4 100644 --- a/cleaning/worker/tasks.py +++ b/cleaning/worker/tasks.py @@ -37,7 +37,7 @@ def clean_html(entity: EntityToClean): # Step 1: Check if there's a
element and use only that main_element = soup.find('main') if main_element: - print(f'Found
element, trying partition_html on main content for {entity.file_path.as_posix()}') + logger.info(f'Found
element, trying partition_html on main content for {entity.file_path.as_posix()}') # Try partition_html on main element content first main_html = str(main_element) @@ -49,17 +49,17 @@ def clean_html(entity: EntityToClean): if len(partitioned) > 0: cleaned_text = '\n\n'.join([str(el) for el in partitioned]) - print(f'partition_html extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') + logger.info(f'partition_html extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') return cleaned_text # If partition_html returns empty, fall back to BeautifulSoup on main - print(f'partition_html on
returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}') + logger.info(f'partition_html on
returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}') cleaned_text = main_element.get_text(separator='\n', strip=True) - print(f'BeautifulSoup extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') + logger.info(f'BeautifulSoup extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') return cleaned_text # Step 2: Try partition_html with skip_headers_and_footers flag - print(f'No
element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}') + logger.info(f'No
element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}') partitioned = partition_html( filename=entity.file_path.as_posix(), languages=settings.languages, @@ -69,9 +69,9 @@ def clean_html(entity: EntityToClean): # Step 3: If partition_html returns empty, fallback to BeautifulSoup if len(partitioned) == 0: - print(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}') + logger.info(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}') cleaned_text = soup.get_text(separator='\n', strip=True) - print(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}') + logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}') return cleaned_text @@ -120,9 +120,9 @@ def clean_file_task(entity: EntityToClean): if cleaned_text and len(cleaned_text.strip()) > 0: try: detected_language = detect(cleaned_text) - print(f'Detected language: {detected_language} for {entity.file_path.as_posix()}') + logger.info(f'Detected language: {detected_language} for {entity.file_path.as_posix()}') except LangDetectException as e: - print(f'Language detection failed for {entity.file_path.as_posix()}: {e}') + logger.error(f'Language detection failed for {entity.file_path.as_posix()}: {e}') cleaned_text_filename = entity.directory_path / 'cleaned.txt' From 73fd97ea753f4c3ee0233b5757b7415d2c7227a7 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Thu, 23 Oct 2025 02:16:20 +0500 Subject: [PATCH 16/18] fix cleaning pipeline --- cleaning/requirements.txt | 2 +- cleaning/worker/tasks.py | 4 +-- scrapper/api/app.py | 4 +++ scrapper/api/models.py | 1 + scrapper/scrapper/pipelines.py | 7 ++++- scrapper/scrapper/spiders/base_spider.py | 4 +++ search-service/index.js | 36 +++++++++++++++++++++--- 7 files changed, 50 insertions(+), 8 deletions(-) diff --git a/cleaning/requirements.txt b/cleaning/requirements.txt index 986160a..55271a8 100644 --- a/cleaning/requirements.txt +++ b/cleaning/requirements.txt @@ -1,6 +1,6 @@ fastapi==0.115.12 uvicorn==0.34.2 -unstructured[pdf,docx,doc]==0.18.2 +unstructured[pdf,docx,doc]==0.18.5 pydantic-settings==2.10.1 beautifulsoup4==4.13.4 langdetect==1.0.9 diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py index 242c3a4..f5e6951 100644 --- a/cleaning/worker/tasks.py +++ b/cleaning/worker/tasks.py @@ -54,7 +54,7 @@ def clean_html(entity: EntityToClean): # If partition_html returns empty, fall back to BeautifulSoup on main logger.info(f'partition_html on
returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}') - cleaned_text = main_element.get_text(separator='\n', strip=True) + cleaned_text = main_element.get_text(separator='\n\n', strip=True) logger.info(f'BeautifulSoup extracted {len(cleaned_text)} chars from
element for {entity.file_path.as_posix()}') return cleaned_text @@ -70,7 +70,7 @@ def clean_html(entity: EntityToClean): # Step 3: If partition_html returns empty, fallback to BeautifulSoup if len(partitioned) == 0: logger.info(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}') - cleaned_text = soup.get_text(separator='\n', strip=True) + cleaned_text = soup.get_text(separator='\n\n', strip=True) logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}') return cleaned_text diff --git a/scrapper/api/app.py b/scrapper/api/app.py index 03c78b5..04540c0 100644 --- a/scrapper/api/app.py +++ b/scrapper/api/app.py @@ -23,6 +23,8 @@ @app.post('/specified-pages-scrapper-task') def trigger_specified_pages_scrapper_task(task: SpecifiedLinksScrapeTask): + # Always ignore stopping for manual file refresh + task.ignore_stopping = True specified_links_scrapper_task.delay(task.model_dump(mode='json')) @@ -59,6 +61,8 @@ def trigger_eesti_scrapper_task(task: EestiScrapperTask): @app.post('/specified-api-files-scrapper-task') def trigger_specified_api_files_scrapper_task(task: SpecifiedApiFilesScrapeTask): + # Always ignore stopping for manual file refresh + task.ignore_stopping = True specified_api_files_scrapper_task.delay(task.model_dump(mode='json')) @app.post('/generate-edited-metadata') diff --git a/scrapper/api/models.py b/scrapper/api/models.py index a85c969..8ac26ce 100644 --- a/scrapper/api/models.py +++ b/scrapper/api/models.py @@ -4,6 +4,7 @@ class BaseObject(BaseModel): agency_id: str source_id: str + ignore_stopping: bool = False class LinkToScrape(BaseModel): diff --git a/scrapper/scrapper/pipelines.py b/scrapper/scrapper/pipelines.py index e37bba8..bd1cd13 100644 --- a/scrapper/scrapper/pipelines.py +++ b/scrapper/scrapper/pipelines.py @@ -175,7 +175,7 @@ class ScrappingFinishedPipeline: def close_spider(self, spider: Spider): if not hasattr(spider, 'task'): return - + spider: BaseSpider task: BaseObject = spider.task @@ -194,6 +194,11 @@ def open_spider(self, spider: Spider): spider: BaseSpider task: BaseObject = spider.task + # Skip updating source status for manual file refresh (ignore_stopping flag) + # This prevents clearing is_stopping flag when refreshing individual files + if hasattr(task, 'ignore_stopping') and task.ignore_stopping: + return + requests.post(f'{spider.settings.get('RUUTER_INTERNAL')}/ckb/source/update-status', json={ 'source_id': task.source_id, 'status': 'running', diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py index 9d172d8..a9f6f7d 100644 --- a/scrapper/scrapper/spiders/base_spider.py +++ b/scrapper/scrapper/spiders/base_spider.py @@ -33,6 +33,10 @@ def __init__(self, *args, **kwargs): self.task = kwargs['task'] def check_source_is_stopping(self): + # Skip check if this is a manual file refresh (ignore_stopping flag set) + if hasattr(self.task, 'ignore_stopping') and self.task.ignore_stopping: + return + try: is_stopping = requests.get( f'{self.settings.get('RUUTER_INTERNAL')}/ckb/source/get', diff --git a/search-service/index.js b/search-service/index.js index 3f658b1..cba470f 100644 --- a/search-service/index.js +++ b/search-service/index.js @@ -231,17 +231,31 @@ app.get("/search/:sourceId", async (req, res) => { size: 1000, // Get more docs to find unique source_file_ids query: { bool: { - must: q.trim() + should: q.trim() ? [ + { + term: { + url: { + value: q.trim(), + boost: 100, + }, + }, + }, { multi_match: { query: q.trim(), - fields: ["content^3", "page_title^2", "file_name^2"], + fields: [ + "url^5", + "content^3", + "page_title^2", + "file_name^2", + ], type: "best_fields", }, }, ] : [{ match_all: {} }], + minimum_should_match: q.trim() ? 1 : 0, }, }, _source: ["source_file_id"], @@ -285,17 +299,31 @@ app.get("/search/:sourceId", async (req, res) => { size: parseInt(size), query: { bool: { - must: q.trim() + should: q.trim() ? [ + { + term: { + url: { + value: q.trim(), + boost: 100, + }, + }, + }, { multi_match: { query: q.trim(), - fields: ["content^3", "page_title^2", "file_name^2"], + fields: [ + "url^5", + "content^3", + "page_title^2", + "file_name^2", + ], type: "best_fields", }, }, ] : [{ match_all: {} }], + minimum_should_match: q.trim() ? 1 : 0, }, }, highlight: { From 670d35cc373e93d772a169635ed116c9aea85682 Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Fri, 24 Oct 2025 12:28:16 +0500 Subject: [PATCH 17/18] https://github.com/buerokratt/Common-Knowledge/issues/90 same batch --- .../components/FileUploader/FileUploader.tsx | 38 ++++++++++++++++--- GUI/translations/en/common.json | 1 + GUI/translations/et/common.json | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/GUI/src/components/FileUploader/FileUploader.tsx b/GUI/src/components/FileUploader/FileUploader.tsx index 4de9583..eff6a26 100644 --- a/GUI/src/components/FileUploader/FileUploader.tsx +++ b/GUI/src/components/FileUploader/FileUploader.tsx @@ -96,9 +96,22 @@ const FileUploader: FC = ({ const selectedFiles = event.target.files; if (!selectedFiles) return; - const newFiles: FileItem[] = Array.from(selectedFiles).map((file) => - validateSingleFile(file) - ); + const newFiles: FileItem[] = Array.from(selectedFiles).map((file) => { + const validatedFile = validateSingleFile(file); + + // Check for duplicate filename in existing files + const isDuplicate = files.some(existingFile => existingFile.name === file.name); + + if (isDuplicate && validatedFile.status === 'pending') { + return { + ...validatedFile, + status: 'error' as const, + message: t('fileUpload.duplicateFile'), + }; + } + + return validatedFile; + }); const updatedFiles = [...files, ...newFiles]; onFilesChange(updatedFiles); @@ -126,9 +139,22 @@ const FileUploader: FC = ({ const droppedFiles = e.dataTransfer.files; if (!droppedFiles) return; - const newFiles: FileItem[] = Array.from(droppedFiles).map((file) => - validateSingleFile(file) - ); + const newFiles: FileItem[] = Array.from(droppedFiles).map((file) => { + const validatedFile = validateSingleFile(file); + + // Check for duplicate filename in existing files + const isDuplicate = files.some(existingFile => existingFile.name === file.name); + + if (isDuplicate && validatedFile.status === 'pending') { + return { + ...validatedFile, + status: 'error' as const, + message: t('fileUpload.duplicateFile'), + }; + } + + return validatedFile; + }); const updatedFiles = [...files, ...newFiles]; onFilesChange(updatedFiles); diff --git a/GUI/translations/en/common.json b/GUI/translations/en/common.json index 2a8c810..e52bde3 100644 --- a/GUI/translations/en/common.json +++ b/GUI/translations/en/common.json @@ -601,6 +601,7 @@ "allowedFormats": "Allowed formats: ", "maxSizeExceeded": "Maximum size exceeded.", "fileAlreadyExists": "File already exists.", + "duplicateFile": "A file with the same name is already selected.", "success": "Success", "fileExists": "File exists", "fileExceedsLimit": "File size exceeds the maximum limit", diff --git a/GUI/translations/et/common.json b/GUI/translations/et/common.json index 8ed8534..a440e4b 100644 --- a/GUI/translations/et/common.json +++ b/GUI/translations/et/common.json @@ -601,6 +601,7 @@ "allowedFormats": "Lubatud formaadid: ", "maxSizeExceeded": "Maksimaalne suurus ületatud.", "fileAlreadyExists": "Fail on juba olemas.", + "duplicateFile": "Sama nimega fail on juba valitud.", "success": "Edukas", "fileExists": "Fail eksisteerib", "fileExceedsLimit": "Failisuurus ületab maksimaalset piiri", From eb2fda2034b47c17f85284438bbc912f2bfbdefa Mon Sep 17 00:00:00 2001 From: ahmer-mt Date: Fri, 24 Oct 2025 16:41:49 +0500 Subject: [PATCH 18/18] fix originalDataUrl issue --- DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml b/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml index 3ca9619..0d89c63 100644 --- a/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml +++ b/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml @@ -26,7 +26,7 @@ deleteFile: url: "[#CKB_FILE_MANAGER]/delete-files" body: files: - - s3_path: ${sourceFile.response.body[0].original_data_url} + - s3_path: ${sourceFile.response.body[0].originalDataUrl} result: res returnResult: