From a4346f799da2cb4a0533accd161f811373bbfb4a Mon Sep 17 00:00:00 2001
From: Artsiom Beida <artsiom.beida@mindtitan.com>
Date: Wed, 8 Oct 2025 10:38:49 +0200
Subject: [PATCH 01/18] UI fixes: show the report agency name and URL in the
 report delete confirmation dialog (the Estonian translation used the wrong
 placeholder syntax); fix report links to problematic URLs

---
 GUI/src/pages/Reports/Report.tsx | 2 +-
 GUI/src/pages/Reports/index.tsx  | 1 +
 GUI/translations/en/common.json  | 2 +-
 GUI/translations/et/common.json  | 2 +-
 4 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/GUI/src/pages/Reports/Report.tsx b/GUI/src/pages/Reports/Report.tsx
index 7f45aca..089c89a 100644
--- a/GUI/src/pages/Reports/Report.tsx
+++ b/GUI/src/pages/Reports/Report.tsx
@@ -148,7 +148,7 @@ const Report: FC = () => {
       cell: ({ row }) => (
         <Tooltip content={row.original.url}>
           <a
-            href={`https://${row.original.url}`}
+            href={row.original.url}
             target="_blank"
             rel="noopener noreferrer"
             style={{ textDecoration: 'underline', color: '#005AA3' }}
diff --git a/GUI/src/pages/Reports/index.tsx b/GUI/src/pages/Reports/index.tsx
index 88257c6..f8bfe0f 100644
--- a/GUI/src/pages/Reports/index.tsx
+++ b/GUI/src/pages/Reports/index.tsx
@@ -285,6 +285,7 @@ const Reports: FC = () => {
             {t('reports.deleteConfirmation', {
               agency: deleteModal.agencyName,
               domain: deleteModal.url,
+              interpolation: { escapeValue: false }
             })}
           </p>
         </Dialog>
diff --git a/GUI/translations/en/common.json b/GUI/translations/en/common.json
index b5e25ea..2a8c810 100644
--- a/GUI/translations/en/common.json
+++ b/GUI/translations/en/common.json
@@ -615,7 +615,7 @@
     "startedAt": "Started at",
     "finishedAt": "Finished at",
     "deleteTitle": "Delete Report",
-    "deleteConfirmation": "Are you sure you want to delete the report?",
+    "deleteConfirmation": "Are you sure you want to delete the report {{agency}} - {{domain}}?",
     "deleteSuccess": "Report deleted successfully",
     "deleteError": "Failed to delete report",
     "errorType": "Error type",
diff --git a/GUI/translations/et/common.json b/GUI/translations/et/common.json
index a2f92c7..8ed8534 100644
--- a/GUI/translations/et/common.json
+++ b/GUI/translations/et/common.json
@@ -615,7 +615,7 @@
     "startedAt": "Alustatud",
     "finishedAt": "Lõpetatud",
     "deleteTitle": "Kustuta aruanne",
-    "deleteConfirmation": "Kas oled kindel, et soovid kustutada aruande asutusele {agency} - {domain}?",
+    "deleteConfirmation": "Kas oled kindel, et soovid kustutada aruande asutusele {{agency}} - {{domain}}?",
     "deleteSuccess": "Aruanne kustutatud edukalt",
     "deleteError": "Aruande kustutamine ebaõnnestus",
     "errorType": "Vea tüüp",

From 78dbc7e92d42dc0c00e63549bfa89f1f55541dd0 Mon Sep 17 00:00:00 2001
From: Artsiom Beida <artsiom.beida@mindtitan.com>
Date: Wed, 8 Oct 2025 17:04:19 +0200
Subject: [PATCH 02/18] Increase scrapper timeout to 30s to account for slow
 archive pages

---
 scrapper/scrapper/settings.py            | 6 +++---
 scrapper/scrapper/spiders/base_spider.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scrapper/scrapper/settings.py b/scrapper/scrapper/settings.py
index ccdd372..a54195d 100644
--- a/scrapper/scrapper/settings.py
+++ b/scrapper/scrapper/settings.py
@@ -106,14 +106,14 @@
 ALLOWED_FILETYPES = os.environ.get('SUPPORTED_TYPES', '.html,.docx,.doc,.pdf').split(',')
 SCRAPED_DIRECTORY = os.environ.get('SCRAPED_DIRECTORY', "/scrapped-data")
 RUUTER_INTERNAL = os.environ.get('RUUTER_INTERNAL', "http://ruuter-internal:8089")
-DOWNLOAD_DELAY = 0.1
+DOWNLOAD_DELAY = 0.2
 
 DOWNLOAD_HANDLERS = {
     "http": "scrapper.download_handler.DownloadHandler",
     "https": "scrapper.download_handler.DownloadHandler",
 }
-PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 10_000
+PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 30_000
 TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 PLAYWRIGHT_MAX_CONTEXTS = 1
 PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 1
-RETRY_TIMES = 3
+RETRY_TIMES = 10
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index ee2c609..dfcd2dc 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -48,7 +48,7 @@ def get_meta(self):
             'playwright': True,
             'playwright_include_page': True,
             'playwright_page_goto_kwargs': {
-                'timeout': 5_000,
+                'timeout': 30_000,
                 'wait_until': 'load',
             },
             "playwright_context_kwargs": {

From 952f92c035dba7a01c600cd0b9b61580d98b6f29 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Thu, 16 Oct 2025 01:17:54 +0500
Subject: [PATCH 03/18] Fix Playwright download errors with a direct HTTP fallback

---
 scrapper/scrapper/download_handler.py         | 76 +++++++++++++++++--
 scrapper/scrapper/spiders/base_spider.py      | 28 +++++--
 .../scrapper/spiders/uploaded_file_spider.py  |  7 ++
 3 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/scrapper/scrapper/download_handler.py b/scrapper/scrapper/download_handler.py
index 6870353..a124ba9 100644
--- a/scrapper/scrapper/download_handler.py
+++ b/scrapper/scrapper/download_handler.py
@@ -1,25 +1,87 @@
 import asyncio
 from scrapy import Request, Spider
 from scrapy.http import Response
+from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler
+from playwright._impl._errors import Error as PlaywrightError
+from twisted.internet import defer
 
 
 class DownloadHandler(ScrapyPlaywrightDownloadHandler):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.args = args
-        self.kwargs = kwargs
+    def __init__(self, crawler):
+        super().__init__(crawler)
+        self.crawler = crawler
+        # Initialize standard HTTP handler for non-Playwright requests
+        self._http_handler = HTTPDownloadHandler(
+            settings=crawler.settings,
+            crawler=crawler
+        )
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def download_request(self, request: Request, spider: Spider):
+        """
+        Main entry point for downloading requests.
+        Check if Playwright is needed, otherwise use direct HTTP.
+        """
+        # Check if Playwright is requested in meta
+        if not request.meta.get('playwright'):
+            # Use direct HTTP download for uploaded files
+            spider.logger.info(f'Direct HTTP download (no Playwright): {request.url}')
+            return self._http_handler.download_request(request, spider)
+
+        # Use Playwright for regular web pages
+        spider.logger.info(f'Playwright download: {request.url}')
+        return super().download_request(request, spider)
 
     async def _download_request(self, request: Request, spider: Spider) -> Response:
+        """
+        Internal async download method with fallback for download errors.
+        This is called by the parent's download_request when using Playwright.
+        """
         try:
-            spider.logger.info(f'request started: {request.url}')
+            spider.logger.info(f'Playwright request started: {request.url}')
             async with asyncio.timeout(30):
                 r = await super()._download_request(request, spider)
-                spider.logger.info(f'request finished: {request.url}')
+                spider.logger.info(f'Playwright request finished: {request.url}')
                 return r
+        except Exception as e:
+            # Catch "Download is starting" and similar download errors as safety net
+            if "Download is starting" in str(e) or "net::ERR_ABORTED" in str(e):
+                spider.logger.info(f'Download error detected for {request.url}, falling back to direct HTTP download')
+                # Fall back to direct HTTP download using Scrapy's HTTP handler (consistent with Method 1)
+                try:
+                    import requests
+                    # Use requests for simplicity in async context
+                    response = requests.get(
+                        request.url,
+                        headers=dict(request.headers.to_unicode_dict()),
+                        timeout=30
+                    )
+                    response.raise_for_status()
+
+                    spider.logger.info(f'Direct HTTP download completed: {request.url} ({len(response.content)} bytes)')
+
+                    from scrapy.http import HtmlResponse
+                    return HtmlResponse(
+                        url=response.url,
+                        status=response.status_code,
+                        headers=dict(response.headers),
+                        body=response.content,
+                        encoding='utf-8',
+                        request=request,
+                    )
+                except Exception as download_error:
+                    spider.logger.error(f'Direct HTTP download failed for {request.url}: {download_error}')
+                    raise
+            else:
+                # Other Playwright errors - re-raise
+                spider.logger.error(f'Playwright error for {request.url}: {str(e)}')
+                raise
         except TimeoutError:
             spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again')
             await self._close()
-            super().__init__(*self.args, **self.kwargs)
             await self._launch()
             return await self._download_request(request, spider)
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index dfcd2dc..b6c2f15 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -109,17 +109,29 @@ async def errback(self, failure: Failure):
             await page.close()
 
     async def parse(self, response: Response, **kwargs):
-        async with self.close_page(response) as page:
-            self.check_source_is_stopping()
+        self.check_source_is_stopping()
 
-            page: Page
-
-            file_extension = self.guess_file_extension(
-                response.headers.get(b'Content-Type', 'text/html').decode('utf-8')
-            )
+        file_extension = self.guess_file_extension(
+            response.headers.get(b'Content-Type', 'text/html').decode('utf-8')
+        )
 
+        # Check if Playwright page is available (might not be if direct HTTP download was used)
+        playwright_page = response.meta.get("playwright_page")
+        if playwright_page:
+            # Use Playwright page for title extraction
+            async with self.close_page(response) as page:
+                page: Page
+                if file_extension == '.html':
+                    title = await page.title()
+                else:
+                    title = response.url
+        else:
+            # Direct HTTP download (no Playwright page available)
             if file_extension == '.html':
-                title = await page.title()
+                # Extract title from HTML using BeautifulSoup
+                soup = BeautifulSoup(response.body, 'lxml')
+                title_tag = soup.find('title')
+                title = title_tag.get_text() if title_tag else response.url
             else:
                 title = response.url
 
diff --git a/scrapper/scrapper/spiders/uploaded_file_spider.py b/scrapper/scrapper/spiders/uploaded_file_spider.py
index 86cb9ab..fce08cc 100644
--- a/scrapper/scrapper/spiders/uploaded_file_spider.py
+++ b/scrapper/scrapper/spiders/uploaded_file_spider.py
@@ -7,6 +7,13 @@
 class UploadedFileSpider(SpecifiedPagesSpider):
     name = 'uploaded_file'
 
+    def get_meta(self):
+        """
+        Override to disable Playwright for uploaded files.
+        Uploaded files from S3 should use direct HTTP download.
+        """
+        return {}
+
     async def parse(self, response: Response, **kwargs):
         base_id, _ = self.get_base_id_and_hash(response.request.url)
 

From f9a5275f6de59d6933cf7aff2de5374bdf7ed0d7 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Thu, 16 Oct 2025 01:19:07 +0500
Subject: [PATCH 04/18] remove .txt from upload

---
 GUI/src/components/FileUploader/FileUploader.tsx | 2 +-
 GUI/src/pages/Agency/Agency.tsx                  | 2 +-
 GUI/src/pages/UploadedFiles/index.tsx            | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/GUI/src/components/FileUploader/FileUploader.tsx b/GUI/src/components/FileUploader/FileUploader.tsx
index 8c37fd0..4de9583 100644
--- a/GUI/src/components/FileUploader/FileUploader.tsx
+++ b/GUI/src/components/FileUploader/FileUploader.tsx
@@ -36,7 +36,7 @@ const FileUploader: FC<FileUploaderProps> = ({
   onFilesChange,
   onFileDelete,
   maxFileSize = 30 * 1024 * 1024, // 30MB default
-  acceptedTypes = '.pdf,.doc,.docx,.txt,.html,.htm',
+  acceptedTypes = '.pdf,.doc,.docx,.html,.htm',
   multiple = true,
   className = '',
   uploadProgress,
diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index 0876918..d2b9a9c 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -684,7 +684,7 @@ const Agency: FC = () => {
               onFilesChange={handleFilesChange}
               onFileDelete={handleFileDelete}
               maxFileSize={30 * 1024 * 1024} // 30MB
-              acceptedTypes=".pdf,.doc,.docx,.txt,.html,.htm"
+              acceptedTypes=".pdf,.doc,.docx,.html,.htm"
               multiple={true}
               uploadProgress={uploadProgress} // Pass upload progress to FileUploader
             />
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 6ed3aeb..bb0be3d 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -811,7 +811,7 @@ const UploadedFiles: FC = () => {
                 onFilesChange={handleFilesChange}
                 onFileDelete={handleFileDelete}
                 maxFileSize={30 * 1024 * 1024} // 30MB
-                acceptedTypes=".pdf,.doc,.docx,.txt,.html,.htm"
+                acceptedTypes=".pdf,.doc,.docx,.html,.htm"
                 multiple={true}
                 uploadProgress={uploadProgress}
               />

From 7321342b2573b5bcba5ce6f1d8afd0773105314a Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Thu, 16 Oct 2025 21:34:34 +0500
Subject: [PATCH 05/18] Add Language to metadata

---
 cleaning/requirements.txt                   |  1 +
 cleaning/worker/tasks.py                    | 11 +++++++++++
 file-processing/app/services/zip_service.py |  5 +++--
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/cleaning/requirements.txt b/cleaning/requirements.txt
index 00c3182..986160a 100644
--- a/cleaning/requirements.txt
+++ b/cleaning/requirements.txt
@@ -3,3 +3,4 @@ uvicorn==0.34.2
 unstructured[pdf,docx,doc]==0.18.2
 pydantic-settings==2.10.1
 beautifulsoup4==4.13.4
+langdetect==1.0.9
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index f234036..974e6ae 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -2,6 +2,7 @@
 import logging
 
 import requests
+from langdetect import detect, LangDetectException
 
 from unstructured.partition.auto import partition
 from bs4 import BeautifulSoup
@@ -55,6 +56,15 @@ def clean_file_task(entity: EntityToClean):
             cleaned_text = clean_any_file(entity)
             logger.info(f'Cleaned as unstructured file for {entity.file_path.as_posix()}')
 
+        # Detect language from cleaned text
+        detected_language = None
+        if cleaned_text and len(cleaned_text.strip()) > 0:
+            try:
+                detected_language = detect(cleaned_text)
+                print(f'Detected language: {detected_language} for {entity.file_path.as_posix()}')
+            except LangDetectException as e:
+                print(f'Language detection failed for {entity.file_path.as_posix()}: {e}')
+
         cleaned_text_filename = entity.directory_path / 'cleaned.txt'
 
         with cleaned_text_filename.open("w") as f:
@@ -74,6 +84,7 @@ def clean_file_task(entity: EntityToClean):
         cleaned_metadata_filename = entity.directory_path / "cleaned.meta.json"
         with cleaned_metadata_filename.open("w") as f:
             metadata['metadata']['cleaned'] = True
+            metadata['language'] = detected_language
             json.dump(metadata, f)
 
         r = requests.post(
diff --git a/file-processing/app/services/zip_service.py b/file-processing/app/services/zip_service.py
index bc18283..d842b4c 100644
--- a/file-processing/app/services/zip_service.py
+++ b/file-processing/app/services/zip_service.py
@@ -139,12 +139,13 @@ def exclusion_filter(relative_path: str) -> bool:
         
         # Create zip file in temp directory
         base_name = os.path.join(temp_dir, "folder_content")
-        
-        logger.info(f"Creating zip file {temp_zip_path} with {successful_count} files")
+    
         
         shutil.make_archive(base_name, 'zip', local_folder_path)
         temp_zip_path = base_name + ".zip"
 
+        logger.info(f"Creating zip file {temp_zip_path} with {successful_count} files")
+        
         # Get zip file size
         zip_size = os.path.getsize(temp_zip_path)
 

From bbaa0b2f2a6fdf3489f79c5adc1d5ea351f5c350 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Thu, 16 Oct 2025 21:35:54 +0500
Subject: [PATCH 06/18] lang metadata

---
 scrapper/scrapper/items.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scrapper/scrapper/items.py b/scrapper/scrapper/items.py
index fc18309..fecb6a1 100644
--- a/scrapper/scrapper/items.py
+++ b/scrapper/scrapper/items.py
@@ -25,6 +25,7 @@ class MetadataItem:
     version: str = "1.0"
     created_at: str = field(default_factory=lambda: str(datetime.now()))
     edited_at: str | None = None
+    language: Optional[str] = None
 
 
 @dataclass

From ef25df999d64a9f87c6fc3a0fcdf80c9b37b2de4 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Thu, 16 Oct 2025 21:36:10 +0500
Subject: [PATCH 07/18] fix callbacks: handle TimeoutError before generic Exception

---
 scrapper/scrapper/download_handler.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/scrapper/scrapper/download_handler.py b/scrapper/scrapper/download_handler.py
index a124ba9..5b6f1ee 100644
--- a/scrapper/scrapper/download_handler.py
+++ b/scrapper/scrapper/download_handler.py
@@ -47,11 +47,17 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
                 r = await super()._download_request(request, spider)
                 spider.logger.info(f'Playwright request finished: {request.url}')
                 return r
+        except TimeoutError:
+            spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again')
+            await self._close()
+            super().__init__(self.crawler)  # Re-initialize with the same crawler
+            await self._launch()
+            return await self._download_request(request, spider)
         except Exception as e:
             # Catch "Download is starting" and similar download errors as safety net
             if "Download is starting" in str(e) or "net::ERR_ABORTED" in str(e):
                 spider.logger.info(f'Download error detected for {request.url}, falling back to direct HTTP download')
-                # Fall back to direct HTTP download using Scrapy's HTTP handler (consistent with Method 1)
+                # Fall back to direct HTTP download
                 try:
                     import requests
                     # Use requests for simplicity in async context
@@ -80,8 +86,3 @@ async def _download_request(self, request: Request, spider: Spider) -> Response:
                 # Other Playwright errors - re-raise
                 spider.logger.error(f'Playwright error for {request.url}: {str(e)}')
                 raise
-        except TimeoutError:
-            spider.logger.warning(f'request timed out due to playwright: {request.url}. Try again')
-            await self._close()
-            await self._launch()
-            return await self._download_request(request, spider)

From 9c4a991b0e3168e42afee28307b1989d6271d973 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Sat, 18 Oct 2025 00:28:30 +0500
Subject: [PATCH 08/18] ignore archived urls

---
 .../app/services/download_service.py          | 21 +++----
 scrapper/scrapper/spiders/base_spider.py      |  7 ++-
 .../spiders/sitemap_collect_spider.py         |  6 ++
 scrapper/scrapper/utils.py                    | 56 +++++++++++++++++++
 4 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/file-processing/app/services/download_service.py b/file-processing/app/services/download_service.py
index d5c9702..0d9d46a 100644
--- a/file-processing/app/services/download_service.py
+++ b/file-processing/app/services/download_service.py
@@ -19,11 +19,12 @@
     FileDeleteResult
 )
 from app.services.blob_storage import storage_provider, BlobStorageException
+from app.core.config import settings
 
 logger = logging.getLogger(__name__)
 
-# Volume path configuration
-VOLUME_PATH = '/app/data'
+# Source path configuration from settings
+SOURCE_PATH = settings.source_path
 
 # In-memory task store for download tasks
 _download_tasks: Dict[str, dict] = {}
@@ -77,7 +78,7 @@ def process_single_file_download(file_item: FileDownloadItem) -> FileDownloadRes
                 clean_s3_path = parts[0]
         
         # Use volume path configuration
-        local_path_str = f"{VOLUME_PATH}/{file_item.local_path}"
+        local_path_str = f"{SOURCE_PATH}/{file_item.local_path}"
         local_path = Path(local_path_str)
         
         # Ensure local directory exists
@@ -109,7 +110,7 @@ def process_single_file_download(file_item: FileDownloadItem) -> FileDownloadRes
         error_msg = f"Unexpected error: {str(e)}"
         return FileDownloadResult(
             s3_path=file_item.s3_path,
-            local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+            local_path=f"{SOURCE_PATH}/{file_item.local_path}",
             status="failed",
             error_message=error_msg,
             is_folder=False,
@@ -131,7 +132,7 @@ def process_folder_download(file_item: FileDownloadItem) -> FileDownloadResult:
                 clean_s3_path = parts[0]
         
         # Use volume path configuration
-        local_folder_path = f"{VOLUME_PATH}/{file_item.local_path}"
+        local_folder_path = f"{SOURCE_PATH}/{file_item.local_path}"
         
         # Ensure local directory exists
         Path(local_folder_path).mkdir(parents=True, exist_ok=True)
@@ -176,7 +177,7 @@ def process_folder_download(file_item: FileDownloadItem) -> FileDownloadResult:
         error_msg = f"Folder download error: {str(e)}"
         return FileDownloadResult(
             s3_path=file_item.s3_path,
-            local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+            local_path=f"{SOURCE_PATH}/{file_item.local_path}",
             status="failed",
             error_message=error_msg,
             is_folder=True,
@@ -218,7 +219,7 @@ def process_download_task(task_id: str) -> None:
                 error_msg = f"Unexpected error processing {file_item.s3_path}: {str(e)}"
                 results.append(FileDownloadResult(
                     s3_path=file_item.s3_path,
-                    local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+                    local_path=f"{SOURCE_PATH}/{file_item.local_path}",
                     status="failed",
                     error_message=error_msg,
                     is_folder=file_item.is_folder,
@@ -328,7 +329,7 @@ def delete_files_from_volume(request: DeleteFromVolumeRequest) -> DeleteFromVolu
     for file_item in request.files:
         try:
             # Use volume path configuration
-            local_path_str = f"{VOLUME_PATH}/{file_item.local_path}"
+            local_path_str = f"{SOURCE_PATH}/{file_item.local_path}"
             local_path = Path(local_path_str)
             
             if local_path.exists():
@@ -369,7 +370,7 @@ def delete_files_from_volume(request: DeleteFromVolumeRequest) -> DeleteFromVolu
         except Exception as e:
             error_msg = f"Unexpected error: {str(e)}"
             results.append(FileDeleteResult(
-                local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+                local_path=f"{SOURCE_PATH}/{file_item.local_path}",
                 status="failed",
                 error_message=error_msg
             ))
@@ -445,7 +446,7 @@ def download_files_to_volume(request: DownloadToVolumeRequest) -> DownloadToVolu
             error_msg = f"Unexpected error processing {file_item.s3_path}: {str(e)}"
             results.append(FileDownloadResult(
                 s3_path=file_item.s3_path,
-                local_path=f"{VOLUME_PATH}/{file_item.local_path}",
+                local_path=f"{SOURCE_PATH}/{file_item.local_path}",
                 status="failed",
                 error_message=error_msg,
                 is_folder=file_item.is_folder,
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index b6c2f15..6595757 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -16,7 +16,7 @@
 
 from api.models import BaseObject
 from scrapper.items import FileItem, MetadataItem, Metadata, ScrappedItem
-from scrapper.utils import send_error
+from scrapper.utils import send_error, is_archive_url
 
 
 class BaseSpider(Spider):
@@ -111,6 +111,11 @@ async def errback(self, failure: Failure):
     async def parse(self, response: Response, **kwargs):
         self.check_source_is_stopping()
 
+        # Check if URL is an archive page and skip if it is
+        if is_archive_url(response.url):
+            self.logger.info(f'Skipping archive URL: {response.url}')
+            return
+
         file_extension = self.guess_file_extension(
             response.headers.get(b'Content-Type', 'text/html').decode('utf-8')
         )
diff --git a/scrapper/scrapper/spiders/sitemap_collect_spider.py b/scrapper/scrapper/spiders/sitemap_collect_spider.py
index 89846b7..5d6301c 100644
--- a/scrapper/scrapper/spiders/sitemap_collect_spider.py
+++ b/scrapper/scrapper/spiders/sitemap_collect_spider.py
@@ -6,6 +6,7 @@
 
 from api.models import SitemapCollectScrapperTask
 from scrapper.spiders.base_spider import BaseSpider
+from scrapper.utils import is_archive_url
 
 
 class SitemapCollectSpider(BaseSpider):
@@ -143,6 +144,11 @@ async def parse(self, response: Response, **kwargs):
                 if self.get_pure_domain(next_url) not in self.pure_allowed_domains:
                     continue
 
+                # Skip archive URLs
+                if is_archive_url(next_url):
+                    self.logger.info(f'Skipping archive URL: {next_url}')
+                    continue
+
                 if next_url not in self.visited_urls:
                     self.visited_urls.add(next_url)
                     self.logger.info(f'Schedule scrape for url: {next_url}')
diff --git a/scrapper/scrapper/utils.py b/scrapper/scrapper/utils.py
index 19f6e59..b082621 100644
--- a/scrapper/scrapper/utils.py
+++ b/scrapper/scrapper/utils.py
@@ -3,6 +3,7 @@
 import functools
 import typing
 import requests
+from urllib.parse import urlparse
 
 from scrapper.items import ScrappedItem
 
@@ -66,3 +67,58 @@ def decorator(self, spider: BaseSpider):
         return r
 
     return decorator
+
+
+# Archive URL detection keywords in multiple languages
+ARCHIVE_KEYWORDS = [
+    # Estonian
+    'arhiiv', 'arhiivi', 'archive',
+    # English
+    'archived', 'archives',
+    # Russian transliteration
+    'arkhiv', 'arhiv',
+]
+
+
+def is_archive_url(url: str) -> bool:
+    """
+    Check if a URL points to an archived/historical page.
+
+    Detects archive pages by checking for archive-related keywords in:
+    - Subdomain (e.g., arhiiv.example.ee)
+    - Path segments (e.g., example.ee/arhiiv/2020/)
+
+    Args:
+        url: The URL to check
+
+    Returns:
+        True if the URL appears to be an archive page, False otherwise
+
+    Examples:
+        >>> is_archive_url('https://arhiiv.lastekaitseliit.ee/et/2016/06/7203/')
+        True
+        >>> is_archive_url('https://example.com/arhiiv/old-content')
+        True
+        >>> is_archive_url('https://example.com/current-page')
+        False
+    """
+    try:
+        parsed = urlparse(url.lower())
+
+        # Check subdomain for archive keywords
+        hostname_parts = parsed.hostname.split('.') if parsed.hostname else []
+        for part in hostname_parts:
+            if any(keyword in part for keyword in ARCHIVE_KEYWORDS):
+                return True
+
+        # Check path segments for archive keywords
+        path_parts = parsed.path.split('/')
+        for part in path_parts:
+            if any(keyword in part for keyword in ARCHIVE_KEYWORDS):
+                return True
+
+        return False
+
+    except Exception:
+        # If URL parsing fails, don't filter it out
+        return False

From f6224a84487f1aa8c26e8db5ca129195e05e7a96 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Mon, 20 Oct 2025 22:52:53 +0500
Subject: [PATCH 09/18] 
 https://github.com/buerokratt/Common-Knowledge/issues/88

---
 .../ckb/TEMPLATES/pipeline/clean-file.yml     | 10 ++++
 cleaning/api/models.py                        |  5 +-
 cleaning/worker/tasks.py                      | 53 +++++++++++++++++--
 3 files changed, 64 insertions(+), 4 deletions(-)

diff --git a/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml b/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml
index c6a5817..26f3b70 100644
--- a/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml
+++ b/DSL/Ruuter/ckb/TEMPLATES/pipeline/clean-file.yml
@@ -4,6 +4,11 @@ extractRequestData:
     meta_data_path: ${incoming.body.meta_data_path}
     directory_path: ${incoming.body.directory_path}
     source_file_id: ${incoming.body.source_file_id}
+    logs_path: ${incoming.body.logs_path}
+    url: ${incoming.body.url}
+    source_base_id: ${incoming.body.source_base_id}
+    agency_base_id: ${incoming.body.agency_base_id}
+    source_run_report_base_id: ${incoming.body.source_run_report_base_id}
 
 
 cleanData:
@@ -15,6 +20,11 @@ cleanData:
       meta_data_path: ${meta_data_path}
       directory_path: ${directory_path}
       source_file_id: ${source_file_id}
+      logs_path: ${logs_path}
+      url: ${url}
+      source_base_id: ${source_base_id}
+      agency_base_id: ${agency_base_id}
+      source_run_report_base_id: ${source_run_report_base_id}
   result: cleanedResult
 
 
diff --git a/cleaning/api/models.py b/cleaning/api/models.py
index ff96d4e..833db05 100644
--- a/cleaning/api/models.py
+++ b/cleaning/api/models.py
@@ -7,4 +7,7 @@ class EntityToClean(BaseModel):
     directory_path: DirectoryPath
     source_file_id: str
     url: str
-    logs_path: FilePath
\ No newline at end of file
+    logs_path: FilePath
+    source_base_id: str
+    agency_base_id: str
+    source_run_report_base_id: str
\ No newline at end of file
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 974e6ae..1bcabe2 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -1,10 +1,12 @@
 import json
 import logging
+import re
 
 import requests
 from langdetect import detect, LangDetectException
 
 from unstructured.partition.auto import partition
+from unstructured.partition.html import partition_html
 from bs4 import BeautifulSoup
 
 from api.config import settings
@@ -14,11 +16,52 @@
 logger = logging.getLogger(__name__)
 
 
+def normalize_newlines(text: str) -> str:
+    """
+    Normalize excessive newlines to maximum of 2 consecutive newlines.
+    Replace 3 or more consecutive newlines with exactly 2 newlines.
+    """
+    # Replace 3 or more newlines with exactly 2 newlines
+    normalized_text = re.sub(r'\n{3,}', '\n\n', text)
+    return normalized_text
+
+
 def clean_html(entity: EntityToClean):
-    with entity.file_path.open('r') as f:
-        soup = BeautifulSoup(f.read(), 'lxml')
+    """Clean HTML files using a multi-step approach for better content extraction."""
+    soup = BeautifulSoup(entity.file_path.read_text(), 'lxml')  # read_text() closes the file handle
+
+    # Step 1: Check if there's a <main> element and use only that
+    main_element = soup.find('main')
+    if main_element:
+        logger.info(f'Found <main> element, using only main content for {entity.file_path.as_posix()}')
+        # Remove unwanted elements from main
+        for element in main_element(['script', 'style', 'nav', 'aside', 'form']):
+            element.decompose()
+        cleaned_text = main_element.get_text(separator='\n', strip=True)
+        logger.info(f'Extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
+        return cleaned_text
+
+    # Step 2: Try partition_html with skip_headers_and_footers flag
+    logger.info(f'No <main> element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
+    partitioned = partition_html(
+        filename=entity.file_path.as_posix(),
+        languages=settings.languages,
+        skip_headers_and_footers=True
+    )
+    cleaned_text = '\n\n'.join([str(el) for el in partitioned])
+
+    # Step 3: If partition_html returns empty, fallback to BeautifulSoup
+    if len(partitioned) == 0:
+        logger.warning(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
 
-    return soup.get_text()
+        # Remove unwanted elements (headers, footers, nav, scripts, styles)
+        for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
+            element.decompose()
+
+        cleaned_text = soup.get_text(separator='\n', strip=True)
+        logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
+
+    return cleaned_text
 
 
 def clean_any_file(entity: EntityToClean):
@@ -56,6 +99,10 @@ def clean_file_task(entity: EntityToClean):
             cleaned_text = clean_any_file(entity)
             logger.info(f'Cleaned as unstructured file for {entity.file_path.as_posix()}')
 
+        # Normalize excessive newlines (max 2 consecutive newlines)
+        cleaned_text = normalize_newlines(cleaned_text)
+        logger.info(f'Normalized newlines for {entity.file_path.as_posix()}')
+
         # Detect language from cleaned text
         detected_language = None
         if cleaned_text and len(cleaned_text.strip()) > 0:

From f588e979d22964e49040b2bddbee637e3396e68e Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Tue, 21 Oct 2025 00:23:06 +0500
Subject: [PATCH 10/18] 
 https://github.com/buerokratt/Common-Knowledge/issues/90

---
 .../ckb/hbs/extract_file_names.handlebars     |  5 ++
 .../check_duplicate_file_names.sql            | 34 +++++++++++++
 .../get-upload-urls-for-existing-source.yml   | 38 ++++++++++++++
 GUI/src/pages/UploadedFiles/index.tsx         | 51 ++++++++++++++++---
 GUI/src/services/api.ts                       | 17 +++++--
 5 files changed, 134 insertions(+), 11 deletions(-)
 create mode 100644 DSL/DMapper/ckb/hbs/extract_file_names.handlebars
 create mode 100644 DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql

diff --git a/DSL/DMapper/ckb/hbs/extract_file_names.handlebars b/DSL/DMapper/ckb/hbs/extract_file_names.handlebars
new file mode 100644
index 0000000..cb14b65
--- /dev/null
+++ b/DSL/DMapper/ckb/hbs/extract_file_names.handlebars
@@ -0,0 +1,5 @@
+[
+    {{#each files}}
+    "{{this.name}}"{{#unless @last}},{{/unless}}
+    {{/each}}
+]
diff --git a/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql b/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql
new file mode 100644
index 0000000..8532779
--- /dev/null
+++ b/DSL/Resql/ckb/POST/source_file/check_duplicate_file_names.sql
@@ -0,0 +1,34 @@
+/*
+declaration:
+  version: 0.1
+  description: "Check if file names already exist for a source (latest version, non-deleted)"
+  method: post
+  accepts: json
+  returns: json
+  namespace: source_file
+  allowlist:
+    body:
+      - field: source_id
+        type: string
+        description: "Source base ID"
+      - field: fileNames
+        type: string
+        description: "Comma-separated file names to check"
+  response:
+    fields:
+      - field: file_name
+        type: string
+        description: "Duplicate file name found"
+*/
+WITH latest_files AS (
+    SELECT DISTINCT ON (base_id)
+        base_id, file_name, is_deleted
+    FROM data_collection.source_file
+    WHERE source_base_id = :source_id::UUID
+      AND type = 'uploaded_file'
+    ORDER BY base_id, updated_at DESC
+)
+SELECT DISTINCT file_name
+FROM latest_files
+WHERE is_deleted = FALSE
+  AND file_name = ANY(string_to_array(:fileNames, ','));
diff --git a/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml b/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml
index 6206ba0..f637e7c 100644
--- a/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml
+++ b/DSL/Ruuter/ckb/POST/source/file/get-upload-urls-for-existing-source.yml
@@ -29,6 +29,32 @@ extractRequestData:
     files: "${incoming.body.files}"
     expires_in: "${incoming.body.expiresIn || 3600}"
 
+extractFileNames:
+  call: http.post
+  args:
+    url: "[#CKB_DMAPPER_HBS]/extract_file_names"
+    headers:
+      type: json
+    body:
+      files: ${files}
+  result: fileNamesResult
+
+checkDuplicateFileNames:
+  call: http.post
+  args:
+    url: "[#CKB_RESQL]/source_file/check_duplicate_file_names"
+    headers:
+      type: json
+    body:
+      source_id: ${source_base_id}
+      fileNames: ${fileNamesResult.response.body?.join(',') ?? ''}
+  result: duplicatesResult
+
+validateNoDuplicates:
+  switch:
+    - condition: ${duplicatesResult.response.body !== null && duplicatesResult.response.body.length > 0}
+      next: returnDuplicateError
+
 generateSourceFileIds:
   call: http.post
   args:
@@ -83,3 +109,15 @@ prepareResponse:
 returnResponse:
   return: ${response}
   next: end
+
+returnDuplicateError:
+  assign:
+    errorResponse:
+      error: "Duplicate file names found"
+      duplicateFiles: ${duplicatesResult.response.body}
+  next: returnDuplicateErrorResponse
+
+returnDuplicateErrorResponse:
+  return: ${errorResponse}
+  status: 400
+  next: end
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index bb0be3d..1e4f1ec 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -252,21 +252,56 @@ const UploadedFiles: FC = () => {
         currentFileName: '',
       });
 
+      // Extract error message from various formats
+      // Backend returns: {response: {error: "...", duplicateFiles: [...]}}
+      const responseData =
+        error.response?.data?.response || error.response?.data || {};
+
+      const errorMessage =
+        responseData.error || error.message || t('knowledgeBase.uploadError');
+
+      // Get duplicate file names if available
+      const duplicateFiles = responseData.duplicateFiles || [];
+
       // Set failed files to error status
       setFormData((prev) => ({
         ...prev,
-        files: prev.files.map((file) => ({
-          ...file,
-          status:
-            file.status === 'uploading' ? ('error' as const) : file.status,
-          message: file.status === 'uploading' ? error.message : file.message,
-        })),
+        files: prev.files.map((file) => {
+          const isDuplicate = duplicateFiles.some(
+            (df: any) =>
+              df.fileName === file.name ||
+              df.file_name === file.name ||
+              df === file.name
+          );
+
+          // Only mark duplicates as error, leave other files as-is
+          if (isDuplicate) {
+            return {
+              ...file,
+              status: 'error' as const,
+              message: errorMessage,
+            };
+          }
+
+          return file;
+        }),
       }));
 
+      // Build detailed error message
+      let displayMessage = errorMessage;
+      if (duplicateFiles.length > 0) {
+        const fileNames = duplicateFiles
+          .map((df: any) =>
+            typeof df === 'string' ? df : df.fileName || df.file_name
+          )
+          .join(', ');
+        displayMessage = `${errorMessage}: ${fileNames}`;
+      }
+
       toast.open({
         type: 'error',
         title: t('global.notificationError'),
-        message: error.message || t('knowledgeBase.uploadError'),
+        message: displayMessage,
       });
     },
   });
@@ -781,7 +816,7 @@ const UploadedFiles: FC = () => {
                     uploadProgress.isUploading ||
                     !formData.subsector ||
                     formData.files.length === 0 ||
-                    formData.files.every((file) => file.status === 'error')
+                    formData.files.some((file) => file.status === 'error')
                   }
                 >
                   {uploadProgress.isUploading
diff --git a/GUI/src/services/api.ts b/GUI/src/services/api.ts
index c5a68b8..93fbfba 100644
--- a/GUI/src/services/api.ts
+++ b/GUI/src/services/api.ts
@@ -35,10 +35,21 @@ const AxiosInterceptor = ({ children }) => {
     const errInterceptor = (error: any) => {
       import.meta.env.DEBUG_ENABLED && console.debug(error);
 
-      let message =
-        error?.response?.data?.response || t('global.notificationErrorMsg');
+      // Keep the original error structure for proper error handling
+      // If there's a response, attach it to a new error with proper message
+      if (error?.response?.data?.response) {
+        const responseData = error.response.data.response;
+        const errorMessage = typeof responseData === 'string'
+          ? responseData
+          : responseData.error || t('global.notificationErrorMsg');
 
-      return Promise.reject(new Error(message));
+        const newError = new Error(errorMessage);
+        // Preserve the original response for error handlers
+        (newError as any).response = error.response;
+        return Promise.reject(newError);
+      }
+
+      return Promise.reject(new Error(error?.message || t('global.notificationErrorMsg')));
     };
 
     const apiInterceptor = api.interceptors.response.use(

From d630a6d120967b3da901d5ab5aa5beaf120e77a5 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Tue, 21 Oct 2025 07:48:12 +0500
Subject: [PATCH 11/18] 
 https://github.com/buerokratt/Common-Knowledge/issues/92

---
 API_SPECIFICATION.md                          | 25 +++++++++++++++--
 DATABASE_SCHEMA.md                            |  2 ++
 .../enrich_files_with_uploaded_by.handlebars  | 12 ++++++++
 ...21071904-add-uploaded-by-and-file-size.sql | 10 +++++++
 ...21071904-add-uploaded-by-and-file-size.xml | 12 ++++++++
 .../changelog/20251021071904-rollback.sql     |  2 ++
 .../create_uploaded_source_files.sql          | 10 +++++--
 .../POST/source-file/add-uploaded-files.yml   | 28 ++++++++++++++++++-
 GUI/src/pages/Agency/Agency.tsx               |  9 ++----
 GUI/src/pages/UploadedFiles/index.tsx         |  9 ++----
 GUI/src/services/sources.ts                   |  2 ++
 11 files changed, 103 insertions(+), 18 deletions(-)
 create mode 100644 DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars
 create mode 100644 DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql
 create mode 100644 DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml
 create mode 100644 DSL/Liquibase/changelog/20251021071904-rollback.sql

diff --git a/API_SPECIFICATION.md b/API_SPECIFICATION.md
index 7e0c5fc..9f09f5d 100644
--- a/API_SPECIFICATION.md
+++ b/API_SPECIFICATION.md
@@ -262,20 +262,41 @@ List source files.
 #### POST /ckb/source-file/add-uploaded-files
 Add uploaded files to a source.
 
+**Request Headers:**
+- `Cookie`: Contains JWT with user information for tracking uploader
+
 **Request Body:**
 ```json
 {
-  "source_base_id": "uuid",
+  "agencyId": "uuid",
+  "sourceId": "uuid",
   "files": [
     {
+      "base_id": "uuid",
       "file_name": "document.pdf",
       "original_data_url": "s3://bucket/uploads/file",
-      "external_id": "ext_123"
+      "subsector": "Legal",
+      "file_size": 13264
     }
   ]
 }
 ```
 
+**Response:**
+```json
+[
+  {
+    "id": "uuid",
+    "url": null,
+    "hash": "",
+    "original_data_url": "s3://bucket/uploads/file",
+    "path": "s3://bucket/uploads/file"
+  }
+]
+```
+
+**Note:** The `uploaded_by` field is automatically populated from the JWT cookie (user's `idCode`).
+
 #### POST /ckb/source-file/get-upload-urls
 Get presigned upload URLs.
 
diff --git a/DATABASE_SCHEMA.md b/DATABASE_SCHEMA.md
index 5a95540..b9d7be4 100644
--- a/DATABASE_SCHEMA.md
+++ b/DATABASE_SCHEMA.md
@@ -69,6 +69,8 @@ erDiagram
         TEXT file_name "Original filename"
         TEXT external_id "External system ID"
         TEXT subsector "Data subsector"
+        BIGINT file_size "File size in bytes"
+        TEXT uploaded_by "User ID who uploaded (uploaded_file only)"
         BOOLEAN is_excluded "Excluded from processing"
         BOOLEAN is_deleted "Soft delete flag"
     }
diff --git a/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars b/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars
new file mode 100644
index 0000000..88c8ed6
--- /dev/null
+++ b/DSL/DMapper/ckb/hbs/enrich_files_with_uploaded_by.handlebars
@@ -0,0 +1,12 @@
+[
+    {{#each files}}
+    {
+        "base_id": "{{this.base_id}}",
+        "file_name": "{{this.file_name}}",
+        "subsector": "{{this.subsector}}",
+        "original_data_url": "{{this.original_data_url}}",
+        "file_size": {{#if this.file_size includeZero=true}}{{this.file_size}}{{else}}null{{/if}},
+        "uploaded_by": "{{@root.uploaded_by}}"
+    }{{#unless @last}},{{/unless}}
+    {{/each}}
+]
diff --git a/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql
new file mode 100644
index 0000000..8a857cc
--- /dev/null
+++ b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.sql
@@ -0,0 +1,10 @@
+-- liquibase formatted sql
+-- changeset ahmer-mt:20251021071904 ignore:true
+-- Add uploaded_by and file_size columns to source_file table
+
+ALTER TABLE data_collection.source_file
+ADD COLUMN uploaded_by TEXT,
+ADD COLUMN file_size BIGINT;
+
+COMMENT ON COLUMN data_collection.source_file.uploaded_by IS 'User/system that uploaded the file (only for uploaded_file type)';
+COMMENT ON COLUMN data_collection.source_file.file_size IS 'File size in bytes';
diff --git a/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml
new file mode 100644
index 0000000..949c9a6
--- /dev/null
+++ b/DSL/Liquibase/changelog/20251021071904-add-uploaded-by-and-file-size.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<databaseChangeLog xmlns="http://www.liquibase.org/xml/ns/dbchangelog"
+                   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                   xsi:schemaLocation="http://www.liquibase.org/xml/ns/dbchangelog
+http://www.liquibase.org/xml/ns/dbchangelog/dbchangelog-4.1.xsd">
+    <changeSet id="20251021071904" author="ahmer-mt">
+        <sqlFile path="changelog/20251021071904-add-uploaded-by-and-file-size.sql" />
+        <rollback>
+            <sqlFile path="changelog/20251021071904-rollback.sql" />
+        </rollback>
+    </changeSet>
+</databaseChangeLog>
diff --git a/DSL/Liquibase/changelog/20251021071904-rollback.sql b/DSL/Liquibase/changelog/20251021071904-rollback.sql
new file mode 100644
index 0000000..b2ec9d5
--- /dev/null
+++ b/DSL/Liquibase/changelog/20251021071904-rollback.sql
@@ -0,0 +1,2 @@
+-- liquibase formatted sql
+-- changeset ahmer-mt:20251021071904 ignore:true
diff --git a/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql b/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql
index c60f6ec..3982d72 100644
--- a/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql
+++ b/DSL/Resql/ckb/POST/source_file/create_uploaded_source_files.sql
@@ -16,7 +16,7 @@ declaration:
         description: "Agency base ID"
       - field: files
         type: array
-        description: "Array of file objects with base_id, file_name, subsector, original_data_url"
+        description: "Array of file objects with base_id, file_name, subsector, original_data_url, file_size, uploaded_by"
   response:
     fields:
       - field: url
@@ -36,7 +36,7 @@ declaration:
         description: "Path (same as original_data_url)"
 */
 INSERT INTO data_collection.source_file (
-    source_base_id, agency_base_id, base_id, file_name, subsector, original_data_url, type
+    source_base_id, agency_base_id, base_id, file_name, subsector, original_data_url, file_size, uploaded_by, type
 )
 SELECT
     :source_id::UUID,
@@ -45,13 +45,17 @@ SELECT
     file_data.file_name,
     file_data.subsector,
     file_data.original_data_url,
+    file_data.file_size::BIGINT,
+    file_data.uploaded_by,
     'uploaded_file'::source_file_type
 FROM (
     SELECT
         (SELECT value) ->> 'base_id' AS base_id,
         (SELECT value) ->> 'file_name' AS file_name,
         (SELECT value) ->> 'subsector' AS subsector,
-        (SELECT value) ->> 'original_data_url' AS original_data_url
+        (SELECT value) ->> 'original_data_url' AS original_data_url,
+        (SELECT value) ->> 'file_size' AS file_size,
+        (SELECT value) ->> 'uploaded_by' AS uploaded_by
     FROM JSON_ARRAY_ELEMENTS(ARRAY_TO_JSON(ARRAY[:files])) WITH ORDINALITY
 ) AS file_data
 RETURNING NULL as url,  base_id as id, '' as hash, original_data_url, original_data_url as path;
\ No newline at end of file
diff --git a/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml b/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml
index c1b6c90..ac9de62 100644
--- a/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml
+++ b/DSL/Ruuter/ckb/POST/source-file/add-uploaded-files.yml
@@ -17,12 +17,27 @@ declaration:
       - field: files
         type: array
         description: "Array of uploaded file objects"
+    headers:
+      - field: cookie
+        type: string
+        description: "Cookie field"
+
+getUserInfo:
+  call: http.post
+  args:
+    url: "[#CKB_TIM]/jwt/custom-jwt-userinfo"
+    contentType: plaintext
+    headers:
+      cookie: ${incoming.headers.cookie}
+    plaintext: "customJwtCookie"
+  result: userInfoResult
 
 extractRequestData:
   assign:
     agency_id: "${incoming.body.agencyId}"
     source_id: "${incoming.body.sourceId}"
     files: "${incoming.body.files}"
+    uploaded_by: "${userInfoResult.response.body.idCode}"
 
 validateInput:
   switch:
@@ -31,6 +46,17 @@ validateInput:
     - condition: ${files === null || files.length === 0}
       next: returnError
 
+enrichFilesWithUploadedBy:
+  call: http.post
+  args:
+    url: "[#CKB_DMAPPER_HBS]/enrich_files_with_uploaded_by"
+    headers:
+      type: json
+    body:
+      files: ${files}
+      uploaded_by: ${uploaded_by}
+  result: enrichedFilesResult
+
 createUploadedSourceFiles:
   call: http.post
   args:
@@ -40,7 +66,7 @@ createUploadedSourceFiles:
     body:
       agency_id: ${agency_id}
       source_id: ${source_id}
-      files: ${files}
+      files: ${enrichedFilesResult.response.body}
   result: createResult
 
 transformToGetDownloadUrls:
diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index d2b9a9c..ea100b9 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -208,18 +208,15 @@ const Agency: FC = () => {
         })),
       }));
 
+      setUploadModal(false);
+      setFormData({ subsector: '', files: [] });
+
       toast.open({
         type: 'success',
         title: t('global.notification'),
         message: t('knowledgeBase.uploadSuccess'),
       });
 
-      // Close modal after a short delay to show success state
-      setTimeout(() => {
-        setUploadModal(false);
-        setFormData({ subsector: '', files: [] });
-      }, 1000);
-
       queryClient.invalidateQueries(['sources']);
     },
     onError: (error: any) => {
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 1e4f1ec..15c6a18 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -229,18 +229,15 @@ const UploadedFiles: FC = () => {
         })),
       }));
 
+      setUploadModal(false);
+      setFormData({ search: '', files: [], subsector: '' });
+
       toast.open({
         type: 'success',
         title: t('global.notification'),
         message: t('knowledgeBase.uploadSuccess'),
       });
 
-      // Close modal after a short delay to show success state
-      setTimeout(() => {
-        setUploadModal(false);
-        setFormData({ search: '', files: [], subsector: '' });
-      }, 1000);
-
       queryClient.invalidateQueries(['uploadedFiles']);
     },
     onError: (error: any) => {
diff --git a/GUI/src/services/sources.ts b/GUI/src/services/sources.ts
index 5188a54..f19a4a7 100644
--- a/GUI/src/services/sources.ts
+++ b/GUI/src/services/sources.ts
@@ -230,6 +230,7 @@ export const createSourceFile = async (
           file_name: uploadInfo.uploadItem.fileName,
           subsector: data.subsector,
           original_data_url: uploadInfo.uploadItem.path,
+          file_size: uploadInfo.file.size,
         }));
 
         await registerUploadedFiles(
@@ -295,6 +296,7 @@ export const addFilesToExistingSource = async (
           file_name: uploadInfo.uploadItem.fileName,
           subsector: data.subsector,
           original_data_url: uploadInfo.uploadItem.path,
+          file_size: uploadInfo.file.size,
         }));
 
         await registerUploadedFiles(

From d652c4166dccf2f7595eca5f2825ad28d11ace0f Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Tue, 21 Oct 2025 08:23:56 +0500
Subject: [PATCH 12/18] fix sorting

---
 .../ckb/GET/source/list_agency_sources.sql    | 14 ++++++------
 DSL/Resql/ckb/GET/source/list_api_sources.sql | 10 ++++-----
 .../list_excluded_source_files_by_agency.sql  | 22 ++++++++++++-------
 .../source_file/list_scraped_source_files.sql |  2 +-
 .../list_uploaded_source_files.sql            |  6 ++---
 .../source_run_page/list_source_run_pages.sql |  8 +++----
 GUI/src/components/DataTable/index.tsx        |  4 ++--
 GUI/src/pages/API/ApiDetail.tsx               |  2 +-
 GUI/src/pages/API/index.tsx                   |  2 +-
 GUI/src/pages/Agency/Agency.tsx               |  4 ++--
 GUI/src/pages/Agency/index.tsx                |  2 +-
 GUI/src/pages/Reports/Report.tsx              |  2 +-
 GUI/src/pages/Reports/index.tsx               |  2 +-
 GUI/src/pages/ScrapedFiles/index.tsx          |  2 +-
 GUI/src/pages/UploadedFiles/index.tsx         |  2 +-
 GUI/src/services/agencies.ts                  |  2 +-
 GUI/src/services/files.ts                     |  4 ++--
 GUI/src/services/reports.ts                   |  4 ++--
 GUI/src/services/sources.ts                   |  6 ++---
 19 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/DSL/Resql/ckb/GET/source/list_agency_sources.sql b/DSL/Resql/ckb/GET/source/list_agency_sources.sql
index 0a75016..cc134e3 100644
--- a/DSL/Resql/ckb/GET/source/list_agency_sources.sql
+++ b/DSL/Resql/ckb/GET/source/list_agency_sources.sql
@@ -63,28 +63,28 @@ declaration:
         description: "Total number of matching records"
 */
 WITH latest_sources AS (
-    SELECT DISTINCT ON (base_id) 
-        id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type, is_deleted
+    SELECT DISTINCT ON (base_id)
+        id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type, is_deleted, updated_at
     FROM data_collection.source
     WHERE agency_base_id = :agency_base_id::UUID
     ORDER BY base_id, updated_at DESC
 )
-SELECT 
+SELECT
     id, base_id, agency_base_id, url, subsector, status, last_scraped_at, type,
     :page as page,
     CEIL(COUNT(*) OVER () / :page_size::DECIMAL) AS total_pages,
     (COUNT(*) OVER ()) AS total
 FROM latest_sources
 WHERE is_deleted = FALSE
-ORDER BY 
+ORDER BY
     CASE WHEN :sorting = 'url asc' THEN url END ASC,
     CASE WHEN :sorting = 'url desc' THEN url END DESC,
     CASE WHEN :sorting = 'subsector asc' THEN subsector END ASC,
     CASE WHEN :sorting = 'subsector desc' THEN subsector END DESC,
-    CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC,
-    CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC,
+    CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC NULLS LAST,
+    CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC NULLS LAST,
     CASE WHEN :sorting = 'status asc' THEN status END ASC,
     CASE WHEN :sorting = 'status desc' THEN status END DESC,
-    last_scraped_at DESC NULLS LAST
+    updated_at DESC NULLS LAST
 LIMIT :page_size::INTEGER 
 OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source/list_api_sources.sql b/DSL/Resql/ckb/GET/source/list_api_sources.sql
index 537452e..107d346 100644
--- a/DSL/Resql/ckb/GET/source/list_api_sources.sql
+++ b/DSL/Resql/ckb/GET/source/list_api_sources.sql
@@ -52,26 +52,26 @@ declaration:
         description: "Total number of matching records"
 */
 WITH latest_sources AS (
-    SELECT DISTINCT ON (base_id) 
-        id, base_id, agency_base_id, url, status, last_scraped_at, type, is_deleted
+    SELECT DISTINCT ON (base_id)
+        id, base_id, agency_base_id, url, status, last_scraped_at, type, is_deleted, updated_at
     FROM data_collection.source
     WHERE type = 'api'::source_type
     ORDER BY base_id, updated_at DESC
 )
-SELECT 
+SELECT
     id, base_id, agency_base_id, url, status, last_scraped_at,
     :page as page,
     CEIL(COUNT(*) OVER () / :page_size::DECIMAL) AS total_pages,
     (COUNT(*) OVER ()) AS total
 FROM latest_sources
 WHERE is_deleted = FALSE
-ORDER BY 
+ORDER BY
     CASE WHEN :sorting = 'url asc' THEN url END ASC,
     CASE WHEN :sorting = 'url desc' THEN url END DESC,
     CASE WHEN :sorting = 'last_scraped_at asc' THEN last_scraped_at END ASC,
     CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC,
     CASE WHEN :sorting = 'status asc' THEN status END ASC,
     CASE WHEN :sorting = 'status desc' THEN status END DESC,
-    last_scraped_at DESC NULLS LAST
+    updated_at DESC NULLS LAST
 LIMIT :page_size::INTEGER 
 OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql b/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql
index e74092f..a909a9d 100644
--- a/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql
+++ b/DSL/Resql/ckb/GET/source_file/list_excluded_source_files_by_agency.sql
@@ -25,13 +25,19 @@ declaration:
         type: string
         description: "Base ID of the source"
 */
+WITH latest_files AS (
+    SELECT DISTINCT ON (base_id)
+        id, base_id, agency_base_id, source_base_id, is_deleted, updated_at
+    FROM data_collection.source_file
+    WHERE agency_base_id = :agency_base_id::UUID
+      AND is_excluded = true
+    ORDER BY base_id, updated_at DESC
+)
 SELECT
-    id, 
-    base_id, 
-    agency_base_id, 
+    id,
+    base_id,
+    agency_base_id,
     source_base_id
-FROM data_collection.source_file
-WHERE 
-    agency_base_id = :agency_base_id::UUID 
-    AND is_excluded = true 
-    AND is_deleted = false;
\ No newline at end of file
+FROM latest_files
+WHERE is_deleted = false
+ORDER BY updated_at DESC NULLS LAST;
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql b/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql
index 0d171ff..c9d33fa 100644
--- a/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql
+++ b/DSL/Resql/ckb/GET/source_file/list_scraped_source_files.sql
@@ -116,6 +116,6 @@ ORDER BY
     CASE WHEN :sorting = 'last_scraped_at desc' THEN last_scraped_at END DESC,
     CASE WHEN :sorting = 'external_id asc' THEN external_id END ASC,
     CASE WHEN :sorting = 'external_id desc' THEN external_id END DESC,
-    last_scraped_at DESC NULLS LAST
+    updated_at DESC NULLS LAST
 LIMIT :page_size::INTEGER 
 OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql b/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql
index 2f97fb9..d8245d1 100644
--- a/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql
+++ b/DSL/Resql/ckb/GET/source_file/list_uploaded_source_files.sql
@@ -101,8 +101,8 @@ ORDER BY
     CASE WHEN :sorting = 'excluded desc' THEN is_excluded END DESC,
     CASE WHEN :sorting = 'status asc' THEN status END ASC,
     CASE WHEN :sorting = 'status desc' THEN status END DESC,
-    CASE WHEN :sorting = 'created_at asc' THEN created_at END ASC,
-    CASE WHEN :sorting = 'created_at desc' THEN created_at END DESC,
-    created_at DESC NULLS LAST
+    CASE WHEN :sorting = 'last_scraped_at asc' THEN created_at END ASC,
+    CASE WHEN :sorting = 'last_scraped_at desc' THEN created_at END DESC,
+    updated_at DESC NULLS LAST
 LIMIT :page_size::INTEGER 
 OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql b/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql
index 439d4b1..4ab8a9a 100644
--- a/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql
+++ b/DSL/Resql/ckb/GET/source_run_page/list_source_run_pages.sql
@@ -54,9 +54,9 @@ declaration:
         description: "total number of agencies"
 */
 WITH latest_run_pages AS (
-    SELECT DISTINCT ON (base_id) 
-        id, base_id, source_run_report_base_id, url, error_type, error_message, scraped_at, is_deleted
-    FROM monitoring.source_run_page 
+    SELECT DISTINCT ON (base_id)
+        id, base_id, source_run_report_base_id, url, error_type, error_message, scraped_at, is_deleted, updated_at
+    FROM monitoring.source_run_page
     WHERE (:source_run_report_base_id IS NULL OR source_run_report_base_id = :source_run_report_base_id::UUID)
     ORDER BY base_id, updated_at DESC
 )
@@ -76,6 +76,6 @@ ORDER BY
     CASE WHEN :sorting = 'error_message desc' THEN error_message END DESC,
     CASE WHEN :sorting = 'scraped_at asc' THEN scraped_at END ASC,
     CASE WHEN :sorting = 'scraped_at desc' THEN scraped_at END DESC,
-    scraped_at DESC NULLS LAST
+    updated_at DESC NULLS LAST
 LIMIT :page_size::INTEGER 
 OFFSET ((GREATEST(:page::INTEGER, 1) - 1) * :page_size::INTEGER);
\ No newline at end of file
diff --git a/GUI/src/components/DataTable/index.tsx b/GUI/src/components/DataTable/index.tsx
index 9d14356..36eaf6c 100644
--- a/GUI/src/components/DataTable/index.tsx
+++ b/GUI/src/components/DataTable/index.tsx
@@ -221,13 +221,13 @@ const DataTable: FC<DataTableProps> = ({
                               {{
                                 asc: (
                                   <Icon
-                                    icon={<MdExpandMore fontSize={20} />}
+                                    icon={<MdExpandLess fontSize={20} />}
                                     size="medium"
                                   />
                                 ),
                                 desc: (
                                   <Icon
-                                    icon={<MdExpandLess fontSize={20} />}
+                                    icon={<MdExpandMore fontSize={20} />}
                                     size="medium"
                                   />
                                 ),
diff --git a/GUI/src/pages/API/ApiDetail.tsx b/GUI/src/pages/API/ApiDetail.tsx
index 4da9311..3f36fcc 100644
--- a/GUI/src/pages/API/ApiDetail.tsx
+++ b/GUI/src/pages/API/ApiDetail.tsx
@@ -95,7 +95,7 @@ const ApiDetail: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'last_scraped_at desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
diff --git a/GUI/src/pages/API/index.tsx b/GUI/src/pages/API/index.tsx
index 0cd3f4f..7194579 100644
--- a/GUI/src/pages/API/index.tsx
+++ b/GUI/src/pages/API/index.tsx
@@ -35,7 +35,7 @@ const ApiList: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'last_scraped_at desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index ea100b9..30e3cfa 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -87,7 +87,7 @@ const Agency: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'last_scraped_at desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
@@ -96,7 +96,7 @@ const Agency: FC = () => {
     const fieldMap: Record<string, string> = {
       url: 'url',
       subsector: 'subsector',
-      lastScraped: 'last_scraped_at',
+      lastScrapedAt: 'last_scraped_at',
       status: 'status',
     };
 
diff --git a/GUI/src/pages/Agency/index.tsx b/GUI/src/pages/Agency/index.tsx
index 32d0fb0..3bf439d 100644
--- a/GUI/src/pages/Agency/index.tsx
+++ b/GUI/src/pages/Agency/index.tsx
@@ -40,7 +40,7 @@ const AgencyComponent: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'updatedAt desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
diff --git a/GUI/src/pages/Reports/Report.tsx b/GUI/src/pages/Reports/Report.tsx
index 089c89a..d59864f 100644
--- a/GUI/src/pages/Reports/Report.tsx
+++ b/GUI/src/pages/Reports/Report.tsx
@@ -33,7 +33,7 @@ const Report: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'scraped_at desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
diff --git a/GUI/src/pages/Reports/index.tsx b/GUI/src/pages/Reports/index.tsx
index f8bfe0f..acd93b3 100644
--- a/GUI/src/pages/Reports/index.tsx
+++ b/GUI/src/pages/Reports/index.tsx
@@ -37,7 +37,7 @@ const Reports: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'scraping_started_at desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
diff --git a/GUI/src/pages/ScrapedFiles/index.tsx b/GUI/src/pages/ScrapedFiles/index.tsx
index 40f1ba3..87b7387 100644
--- a/GUI/src/pages/ScrapedFiles/index.tsx
+++ b/GUI/src/pages/ScrapedFiles/index.tsx
@@ -95,7 +95,7 @@ const ScrapedFiles: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'last_scraped_at desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 15c6a18..8dadf63 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -114,7 +114,7 @@ const UploadedFiles: FC = () => {
 
   // Convert sorting state to API format
   const getSortingParam = (sorting: SortingState): string => {
-    if (sorting.length === 0) return 'last_scraped_at desc';
+    if (sorting.length === 0) return '';
 
     const sort = sorting[0];
     let field = sort.id;
diff --git a/GUI/src/services/agencies.ts b/GUI/src/services/agencies.ts
index e03b63b..41e1729 100644
--- a/GUI/src/services/agencies.ts
+++ b/GUI/src/services/agencies.ts
@@ -62,7 +62,7 @@ export const getAgencies = async (
     params: {
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'updatedAt desc',
+      sorting: params.sorting || '',
     },
   });
 
diff --git a/GUI/src/services/files.ts b/GUI/src/services/files.ts
index 80df62b..8a6538e 100644
--- a/GUI/src/services/files.ts
+++ b/GUI/src/services/files.ts
@@ -97,7 +97,7 @@ export const getScrapedFiles = async (
       isExcluded: params.isExcluded,
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'last_scraped_at desc',
+      sorting: params.sorting || '',
       search: params.search,
     },
   });
@@ -128,7 +128,7 @@ export const getUploadedFiles = async (
       isExcluded: params.isExcluded,
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'last_scraped_at desc',
+      sorting: params.sorting || '',
       search: params.search,
     },
   });
diff --git a/GUI/src/services/reports.ts b/GUI/src/services/reports.ts
index bf28f98..ce2133e 100644
--- a/GUI/src/services/reports.ts
+++ b/GUI/src/services/reports.ts
@@ -40,7 +40,7 @@ export const getReports = async (
     params: {
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'scraping_started_at desc',
+      sorting: params.sorting || '',
     },
   });
 
@@ -116,7 +116,7 @@ export const getReportPages = async (
       source_run_report_base_id: params.source_run_report_base_id,
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'scraped_at desc',
+      sorting: params.sorting || '',
     },
   });
 
diff --git a/GUI/src/services/sources.ts b/GUI/src/services/sources.ts
index f19a4a7..793b877 100644
--- a/GUI/src/services/sources.ts
+++ b/GUI/src/services/sources.ts
@@ -148,7 +148,7 @@ export const getSources = async (
       agencyBaseId: params.agencyBaseId,
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'last_scraped_at desc',
+      sorting: params.sorting || '',
     },
   });
 
@@ -177,7 +177,7 @@ export const getApiIntegrations = async (
     params: {
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'last_scraped_at desc',
+      sorting: params.sorting || '',
     },
   });
 
@@ -433,7 +433,7 @@ export const getApiSourceFiles = async (
       sourceId: params.sourceId,
       page: params.page || 1,
       pageSize: params.pageSize || 10,
-      sorting: params.sorting || 'last_scraped_at desc',
+      sorting: params.sorting || '',
       search: params.search,
       type: params.type,
     },

From 89ef0dedc03e99e69354b597a58516a260f94307 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Tue, 21 Oct 2025 08:36:30 +0500
Subject: [PATCH 13/18] Fix pagination after deleting items on the last page

---
 GUI/src/pages/Agency/Agency.tsx       | 24 ++++++++++++++++++++++--
 GUI/src/pages/Agency/index.tsx        | 24 ++++++++++++++++++++++--
 GUI/src/pages/Reports/index.tsx       | 24 ++++++++++++++++++++++--
 GUI/src/pages/ScrapedFiles/index.tsx  | 24 ++++++++++++++++++++++--
 GUI/src/pages/UploadedFiles/index.tsx | 24 ++++++++++++++++++++++--
 5 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/GUI/src/pages/Agency/Agency.tsx b/GUI/src/pages/Agency/Agency.tsx
index 30e3cfa..69d3a3c 100644
--- a/GUI/src/pages/Agency/Agency.tsx
+++ b/GUI/src/pages/Agency/Agency.tsx
@@ -295,14 +295,34 @@ const Agency: FC = () => {
   // Delete source mutation
   const deleteMutation = useMutation({
     mutationFn: deleteSource,
-    onSuccess: () => {
+    onSuccess: async () => {
       toast.open({
         type: 'success',
         title: t('global.notification'),
         message: t('knowledgeBase.deleteSuccess'),
       });
       setDeleteModal(null);
-      queryClient.invalidateQueries(['sources']);
+
+      // Refetch to get updated data
+      await queryClient.invalidateQueries(['sources']);
+
+      // Check if current page is now out of bounds
+      const newTotal = (sourcesData?.total || 0) - 1;
+      const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+      // Reset to last valid page if current page is out of bounds
+      if (pagination.pageIndex >= maxPages && maxPages > 0) {
+        setPagination({
+          ...pagination,
+          pageIndex: maxPages - 1,
+        });
+      } else if (maxPages === 0) {
+        // If no data left, reset to page 0
+        setPagination({
+          ...pagination,
+          pageIndex: 0,
+        });
+      }
     },
     onError: (error: any) => {
       toast.open({
diff --git a/GUI/src/pages/Agency/index.tsx b/GUI/src/pages/Agency/index.tsx
index 3bf439d..96575f8 100644
--- a/GUI/src/pages/Agency/index.tsx
+++ b/GUI/src/pages/Agency/index.tsx
@@ -81,14 +81,34 @@ const AgencyComponent: FC = () => {
   // Delete agency mutation
   const deleteAgencyMutation = useMutation({
     mutationFn: deleteAgency,
-    onSuccess: () => {
+    onSuccess: async () => {
       toast.open({
         type: 'success',
         title: t('global.notification'),
         message: t('knowledgeBase.deleteSuccess'),
       });
       setDeleteModal(null);
-      queryClient.invalidateQueries(['agencies']);
+
+      // Refetch to get updated data
+      await queryClient.invalidateQueries(['agencies']);
+
+      // Check if current page is now out of bounds
+      const newTotal = (knowledgeBaseData.total || 0) - 1;
+      const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+      // Reset to last valid page if current page is out of bounds
+      if (pagination.pageIndex >= maxPages && maxPages > 0) {
+        setPagination({
+          ...pagination,
+          pageIndex: maxPages - 1,
+        });
+      } else if (maxPages === 0) {
+        // If no data left, reset to page 0
+        setPagination({
+          ...pagination,
+          pageIndex: 0,
+        });
+      }
     },
     onError: (error: any) => {
       toast.open({
diff --git a/GUI/src/pages/Reports/index.tsx b/GUI/src/pages/Reports/index.tsx
index acd93b3..8b98cff 100644
--- a/GUI/src/pages/Reports/index.tsx
+++ b/GUI/src/pages/Reports/index.tsx
@@ -80,14 +80,34 @@ const Reports: FC = () => {
   // Delete report mutation
   const deleteReportMutation = useMutation({
     mutationFn: deleteReport,
-    onSuccess: () => {
+    onSuccess: async () => {
       toast.open({
         type: 'success',
         title: t('global.notification'),
         message: t('reports.deleteSuccess'),
       });
       setDeleteModal(null);
-      queryClient.invalidateQueries(['reports']);
+
+      // Refetch to get updated data
+      await queryClient.invalidateQueries(['reports']);
+
+      // Check if current page is now out of bounds
+      const newTotal = (reportsData.total || 0) - 1;
+      const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+      // Reset to last valid page if current page is out of bounds
+      if (pagination.pageIndex >= maxPages && maxPages > 0) {
+        setPagination({
+          ...pagination,
+          pageIndex: maxPages - 1,
+        });
+      } else if (maxPages === 0) {
+        // If no data left, reset to page 0
+        setPagination({
+          ...pagination,
+          pageIndex: 0,
+        });
+      }
     },
     onError: (error: any) => {
       toast.open({
diff --git a/GUI/src/pages/ScrapedFiles/index.tsx b/GUI/src/pages/ScrapedFiles/index.tsx
index 87b7387..bd41a67 100644
--- a/GUI/src/pages/ScrapedFiles/index.tsx
+++ b/GUI/src/pages/ScrapedFiles/index.tsx
@@ -160,14 +160,34 @@ const ScrapedFiles: FC = () => {
   // Delete file mutation
   const deleteMutation = useMutation({
     mutationFn: deleteFile,
-    onSuccess: () => {
+    onSuccess: async () => {
       toast.open({
         type: 'success',
         title: t('global.notification'),
         message: t('knowledgeBase.urlDeleteSuccess'),
       });
       setDeleteModal(null);
-      queryClient.invalidateQueries(['scrapedFiles']);
+
+      // Refetch to get updated data
+      await queryClient.invalidateQueries(['scrapedFiles']);
+
+      // Check if current page is now out of bounds
+      const newTotal = (scrapedFilesData?.total || 0) - 1;
+      const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+      // Reset to last valid page if current page is out of bounds
+      if (pagination.pageIndex >= maxPages && maxPages > 0) {
+        setPagination({
+          ...pagination,
+          pageIndex: maxPages - 1,
+        });
+      } else if (maxPages === 0) {
+        // If no data left, reset to page 0
+        setPagination({
+          ...pagination,
+          pageIndex: 0,
+        });
+      }
     },
     onError: (error: any) => {
       toast.open({
diff --git a/GUI/src/pages/UploadedFiles/index.tsx b/GUI/src/pages/UploadedFiles/index.tsx
index 8dadf63..337470a 100644
--- a/GUI/src/pages/UploadedFiles/index.tsx
+++ b/GUI/src/pages/UploadedFiles/index.tsx
@@ -306,14 +306,34 @@ const UploadedFiles: FC = () => {
   // Delete file mutation
   const deleteMutation = useMutation({
     mutationFn: deleteFile,
-    onSuccess: () => {
+    onSuccess: async () => {
       toast.open({
         type: 'success',
         title: t('global.notification'),
         message: t('knowledgeBase.fileDeleteSuccess'),
       });
       setDeleteModal(null);
-      queryClient.invalidateQueries(['uploadedFiles']);
+
+      // Refetch to get updated data
+      await queryClient.invalidateQueries(['uploadedFiles']);
+
+      // Check if current page is now out of bounds
+      const newTotal = (uploadedFilesData?.total || 0) - 1;
+      const maxPages = Math.ceil(newTotal / pagination.pageSize);
+
+      // Reset to last valid page if current page is out of bounds
+      if (pagination.pageIndex >= maxPages && maxPages > 0) {
+        setPagination({
+          ...pagination,
+          pageIndex: maxPages - 1,
+        });
+      } else if (maxPages === 0) {
+        // If no data left, reset to page 0
+        setPagination({
+          ...pagination,
+          pageIndex: 0,
+        });
+      }
     },
     onError: (error: any) => {
       toast.open({

From e9aa1e174553eab0c312659a6508927bec4ea45c Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Tue, 21 Oct 2025 19:23:49 +0500
Subject: [PATCH 14/18] Fix issues reported in
 https://github.com/buerokratt/Common-Knowledge/issues/88

---
 cleaning/worker/tasks.py                      | 38 ++++++++++++-------
 scrapper/scrapper/spiders/base_spider.py      | 35 ++++++++++++++---
 .../spiders/sitemap_collect_spider.py         | 11 +++++-
 3 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 1bcabe2..7c5e967 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -30,19 +30,36 @@ def clean_html(entity: EntityToClean):
     """Clean HTML files using a multi-step approach for better content extraction."""
     soup = BeautifulSoup(open(entity.file_path.as_posix(), 'r'), 'lxml')
 
+    # First, remove unwanted elements from the entire document
+    for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
+        element.decompose()
+
     # Step 1: Check if there's a <main> element and use only that
     main_element = soup.find('main')
     if main_element:
-        logger.info(f'Found <main> element, using only main content for {entity.file_path.as_posix()}')
-        # Remove unwanted elements from main
-        for element in main_element(['script', 'style', 'nav', 'aside', 'form']):
-            element.decompose()
+        print(f'Found <main> element, trying partition_html on main content for {entity.file_path.as_posix()}')
+
+        # Try partition_html on main element content first
+        main_html = str(main_element)
+        partitioned = partition_html(
+            text=main_html,
+            languages=settings.languages,
+            skip_headers_and_footers=True
+        )
+
+        if len(partitioned) > 0:
+            cleaned_text = '\n\n'.join([str(el) for el in partitioned])
+            print(f'partition_html extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
+            return cleaned_text
+
+        # If partition_html returns empty, fall back to BeautifulSoup on main
+        print(f'partition_html on <main> returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
         cleaned_text = main_element.get_text(separator='\n', strip=True)
-        logger.info(f'Extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
+        print(f'BeautifulSoup extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
         return cleaned_text
 
     # Step 2: Try partition_html with skip_headers_and_footers flag
-    logger.info(f'No <main> element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
+    print(f'No <main> element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
     partitioned = partition_html(
         filename=entity.file_path.as_posix(),
         languages=settings.languages,
@@ -52,14 +69,9 @@ def clean_html(entity: EntityToClean):
 
     # Step 3: If partition_html returns empty, fallback to BeautifulSoup
     if len(partitioned) == 0:
-        logger.warning(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
-
-        # Remove unwanted elements (headers, footers, nav, scripts, styles)
-        for element in soup(['header', 'footer', 'nav', 'script', 'style', 'aside', 'form']):
-            element.decompose()
-
+        print(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
         cleaned_text = soup.get_text(separator='\n', strip=True)
-        logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
+        print(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
 
     return cleaned_text
 
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index 6595757..9d172d8 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -122,12 +122,29 @@ async def parse(self, response: Response, **kwargs):
 
         # Check if Playwright page is available (might not be if direct HTTP download was used)
         playwright_page = response.meta.get("playwright_page")
+        rendered_html = None
+
         if playwright_page:
-            # Use Playwright page for title extraction
+            # Use Playwright page for title extraction and get rendered HTML
             async with self.close_page(response) as page:
                 page: Page
-                if file_extension == '.html':
+
+                # Check if this is a sitemap or XML file - don't wait or render, use raw content
+                is_sitemap = 'sitemap' in response.url.lower() or file_extension == '.xml'
+
+                if file_extension == '.html' and not is_sitemap:
+                    # Wait until the page reaches the 'networkidle' load state so dynamically loaded content can render
+                    try:
+                        self.logger.info('Waiting for dynamic content to render...')
+                        await page.wait_for_load_state('networkidle')
+                        self.logger.info('Content detected, proceeding...')
+                    except Exception as e:
+                        # If timeout or error, proceed anyway
+                        self.logger.warning(f'Timeout waiting for content, proceeding anyway: {e}')
+
                     title = await page.title()
+                    # Get the fully rendered HTML after JavaScript execution
+                    rendered_html = await page.content()
                 else:
                     title = response.url
         else:
@@ -140,18 +157,26 @@ async def parse(self, response: Response, **kwargs):
             else:
                 title = response.url
 
+        # Use rendered HTML if available, otherwise use response.body
+        body_to_save = rendered_html.encode('utf-8') if rendered_html else response.body
+
         if file_extension == '.html':
-            soup = BeautifulSoup(response.body, 'lxml')
+            soup = BeautifulSoup(body_to_save, 'lxml')
             text = soup.get_text()
             hashed = hashlib.sha1(text.encode()).hexdigest()
         else:
-            hashed = hashlib.sha1(response.body).hexdigest()
+            hashed = hashlib.sha1(body_to_save).hexdigest()
 
-        file_item = FileItem(body=response.body, source_url=response.url, extension=file_extension)
+        file_item = FileItem(body=body_to_save, source_url=response.url, extension=file_extension)
 
         metadata_item = MetadataItem(
             file_type=file_extension, metadata=Metadata(), source_url=response.url, page_title=title
         )
 
         scrapped_item = ScrappedItem(file=file_item, metadata=metadata_item, hash=hashed)
+
+        # Store rendered HTML in response meta for link extraction by subclasses
+        if rendered_html:
+            response.meta['rendered_html'] = rendered_html
+
         yield scrapped_item
diff --git a/scrapper/scrapper/spiders/sitemap_collect_spider.py b/scrapper/scrapper/spiders/sitemap_collect_spider.py
index 5d6301c..27c76ff 100644
--- a/scrapper/scrapper/spiders/sitemap_collect_spider.py
+++ b/scrapper/scrapper/spiders/sitemap_collect_spider.py
@@ -136,7 +136,16 @@ async def parse(self, response: Response, **kwargs):
             if scrapped_item.metadata.file_type != '.html':
                 continue
 
-            for href in response.css("a::attr(href)").getall():
+            # Use rendered HTML for link extraction if available (for SPAs)
+            rendered_html = response.meta.get('rendered_html')
+            if rendered_html:
+                from bs4 import BeautifulSoup
+                soup = BeautifulSoup(rendered_html, 'lxml')
+                links = [a.get('href') for a in soup.find_all('a', href=True)]
+            else:
+                links = response.css("a::attr(href)").getall()
+
+            for href in links:
                 next_url = urljoin(response.url, href)
                 next_url = next_url.split('#')[0]
 

From 56dde937150933e011985851a8617c288db4fbe5 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Tue, 21 Oct 2025 19:28:14 +0500
Subject: [PATCH 15/18] fix logging

---
 cleaning/worker/tasks.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 7c5e967..242c3a4 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -37,7 +37,7 @@ def clean_html(entity: EntityToClean):
     # Step 1: Check if there's a <main> element and use only that
     main_element = soup.find('main')
     if main_element:
-        print(f'Found <main> element, trying partition_html on main content for {entity.file_path.as_posix()}')
+        logger.info(f'Found <main> element, trying partition_html on main content for {entity.file_path.as_posix()}')
 
         # Try partition_html on main element content first
         main_html = str(main_element)
@@ -49,17 +49,17 @@ def clean_html(entity: EntityToClean):
 
         if len(partitioned) > 0:
             cleaned_text = '\n\n'.join([str(el) for el in partitioned])
-            print(f'partition_html extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
+            logger.info(f'partition_html extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
             return cleaned_text
 
         # If partition_html returns empty, fall back to BeautifulSoup on main
-        print(f'partition_html on <main> returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
+        logger.info(f'partition_html on <main> returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
         cleaned_text = main_element.get_text(separator='\n', strip=True)
-        print(f'BeautifulSoup extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
+        logger.info(f'BeautifulSoup extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
         return cleaned_text
 
     # Step 2: Try partition_html with skip_headers_and_footers flag
-    print(f'No <main> element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
+    logger.info(f'No <main> element found, trying partition_html with skip_headers_and_footers for {entity.file_path.as_posix()}')
     partitioned = partition_html(
         filename=entity.file_path.as_posix(),
         languages=settings.languages,
@@ -69,9 +69,9 @@ def clean_html(entity: EntityToClean):
 
     # Step 3: If partition_html returns empty, fallback to BeautifulSoup
     if len(partitioned) == 0:
-        print(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
+        logger.info(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
         cleaned_text = soup.get_text(separator='\n', strip=True)
-        print(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
+        logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
 
     return cleaned_text
 
@@ -120,9 +120,9 @@ def clean_file_task(entity: EntityToClean):
         if cleaned_text and len(cleaned_text.strip()) > 0:
             try:
                 detected_language = detect(cleaned_text)
-                print(f'Detected language: {detected_language} for {entity.file_path.as_posix()}')
+                logger.info(f'Detected language: {detected_language} for {entity.file_path.as_posix()}')
             except LangDetectException as e:
-                print(f'Language detection failed for {entity.file_path.as_posix()}: {e}')
+                logger.error(f'Language detection failed for {entity.file_path.as_posix()}: {e}')
 
         cleaned_text_filename = entity.directory_path / 'cleaned.txt'
 

From 73fd97ea753f4c3ee0233b5757b7415d2c7227a7 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Thu, 23 Oct 2025 02:16:20 +0500
Subject: [PATCH 16/18] fix cleaning pipeline

---
 cleaning/requirements.txt                |  2 +-
 cleaning/worker/tasks.py                 |  4 +--
 scrapper/api/app.py                      |  4 +++
 scrapper/api/models.py                   |  1 +
 scrapper/scrapper/pipelines.py           |  7 ++++-
 scrapper/scrapper/spiders/base_spider.py |  4 +++
 search-service/index.js                  | 36 +++++++++++++++++++++---
 7 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/cleaning/requirements.txt b/cleaning/requirements.txt
index 986160a..55271a8 100644
--- a/cleaning/requirements.txt
+++ b/cleaning/requirements.txt
@@ -1,6 +1,6 @@
 fastapi==0.115.12
 uvicorn==0.34.2
-unstructured[pdf,docx,doc]==0.18.2
+unstructured[pdf,docx,doc]==0.18.5
 pydantic-settings==2.10.1
 beautifulsoup4==4.13.4
 langdetect==1.0.9
diff --git a/cleaning/worker/tasks.py b/cleaning/worker/tasks.py
index 242c3a4..f5e6951 100644
--- a/cleaning/worker/tasks.py
+++ b/cleaning/worker/tasks.py
@@ -54,7 +54,7 @@ def clean_html(entity: EntityToClean):
 
         # If partition_html returns empty, fall back to BeautifulSoup on main
         logger.info(f'partition_html on <main> returned empty, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
-        cleaned_text = main_element.get_text(separator='\n', strip=True)
+        cleaned_text = main_element.get_text(separator='\n\n', strip=True)
         logger.info(f'BeautifulSoup extracted {len(cleaned_text)} chars from <main> element for {entity.file_path.as_posix()}')
         return cleaned_text
 
@@ -70,7 +70,7 @@ def clean_html(entity: EntityToClean):
     # Step 3: If partition_html returns empty, fallback to BeautifulSoup
     if len(partitioned) == 0:
         logger.info(f'partition_html returned empty content, using BeautifulSoup fallback for {entity.file_path.as_posix()}')
-        cleaned_text = soup.get_text(separator='\n', strip=True)
+        cleaned_text = soup.get_text(separator='\n\n', strip=True)
         logger.info(f'BeautifulSoup fallback extracted {len(cleaned_text)} chars for {entity.file_path.as_posix()}')
 
     return cleaned_text
diff --git a/scrapper/api/app.py b/scrapper/api/app.py
index 03c78b5..04540c0 100644
--- a/scrapper/api/app.py
+++ b/scrapper/api/app.py
@@ -23,6 +23,8 @@
 
 @app.post('/specified-pages-scrapper-task')
 def trigger_specified_pages_scrapper_task(task: SpecifiedLinksScrapeTask):
+    # Always ignore stopping for manual file refresh
+    task.ignore_stopping = True
     specified_links_scrapper_task.delay(task.model_dump(mode='json'))
 
 
@@ -59,6 +61,8 @@ def trigger_eesti_scrapper_task(task: EestiScrapperTask):
 
 @app.post('/specified-api-files-scrapper-task')
 def trigger_specified_api_files_scrapper_task(task: SpecifiedApiFilesScrapeTask):
+    # Always ignore stopping for manual file refresh
+    task.ignore_stopping = True
     specified_api_files_scrapper_task.delay(task.model_dump(mode='json'))
 
 @app.post('/generate-edited-metadata')
diff --git a/scrapper/api/models.py b/scrapper/api/models.py
index a85c969..8ac26ce 100644
--- a/scrapper/api/models.py
+++ b/scrapper/api/models.py
@@ -4,6 +4,7 @@
 class BaseObject(BaseModel):
     agency_id: str
     source_id: str
+    ignore_stopping: bool = False
 
 
 class LinkToScrape(BaseModel):
diff --git a/scrapper/scrapper/pipelines.py b/scrapper/scrapper/pipelines.py
index e37bba8..bd1cd13 100644
--- a/scrapper/scrapper/pipelines.py
+++ b/scrapper/scrapper/pipelines.py
@@ -175,7 +175,7 @@ class ScrappingFinishedPipeline:
     def close_spider(self, spider: Spider):
         if not hasattr(spider, 'task'):
             return
-    
+
         spider: BaseSpider
         task: BaseObject = spider.task
 
@@ -194,6 +194,11 @@ def open_spider(self, spider: Spider):
         spider: BaseSpider
         task: BaseObject = spider.task
 
+        # Skip updating source status for manual file refresh (ignore_stopping flag)
+        # This prevents clearing is_stopping flag when refreshing individual files
+        if hasattr(task, 'ignore_stopping') and task.ignore_stopping:
+            return
+
         requests.post(f'{spider.settings.get('RUUTER_INTERNAL')}/ckb/source/update-status', json={
             'source_id': task.source_id,
             'status': 'running',
diff --git a/scrapper/scrapper/spiders/base_spider.py b/scrapper/scrapper/spiders/base_spider.py
index 9d172d8..a9f6f7d 100644
--- a/scrapper/scrapper/spiders/base_spider.py
+++ b/scrapper/scrapper/spiders/base_spider.py
@@ -33,6 +33,10 @@ def __init__(self, *args, **kwargs):
             self.task = kwargs['task']
 
     def check_source_is_stopping(self):
+        # Skip check if this is a manual file refresh (ignore_stopping flag set)
+        if hasattr(self.task, 'ignore_stopping') and self.task.ignore_stopping:
+            return
+
         try:
             is_stopping = requests.get(
                 f'{self.settings.get('RUUTER_INTERNAL')}/ckb/source/get',
diff --git a/search-service/index.js b/search-service/index.js
index 3f658b1..cba470f 100644
--- a/search-service/index.js
+++ b/search-service/index.js
@@ -231,17 +231,31 @@ app.get("/search/:sourceId", async (req, res) => {
           size: 1000, // Get more docs to find unique source_file_ids
           query: {
             bool: {
-              must: q.trim()
+              should: q.trim()
                 ? [
+                    {
+                      term: {
+                        url: {
+                          value: q.trim(),
+                          boost: 100,
+                        },
+                      },
+                    },
                     {
                       multi_match: {
                         query: q.trim(),
-                        fields: ["content^3", "page_title^2", "file_name^2"],
+                        fields: [
+                          "url^5",
+                          "content^3",
+                          "page_title^2",
+                          "file_name^2",
+                        ],
                         type: "best_fields",
                       },
                     },
                   ]
                 : [{ match_all: {} }],
+              minimum_should_match: q.trim() ? 1 : 0,
             },
           },
           _source: ["source_file_id"],
@@ -285,17 +299,31 @@ app.get("/search/:sourceId", async (req, res) => {
           size: parseInt(size),
           query: {
             bool: {
-              must: q.trim()
+              should: q.trim()
                 ? [
+                    {
+                      term: {
+                        url: {
+                          value: q.trim(),
+                          boost: 100,
+                        },
+                      },
+                    },
                     {
                       multi_match: {
                         query: q.trim(),
-                        fields: ["content^3", "page_title^2", "file_name^2"],
+                        fields: [
+                          "url^5",
+                          "content^3",
+                          "page_title^2",
+                          "file_name^2",
+                        ],
                         type: "best_fields",
                       },
                     },
                   ]
                 : [{ match_all: {} }],
+              minimum_should_match: q.trim() ? 1 : 0,
             },
           },
           highlight: {

From 670d35cc373e93d772a169635ed116c9aea85682 Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Fri, 24 Oct 2025 12:28:16 +0500
Subject: [PATCH 17/18] FileUploader: reject duplicate file names in the same
 batch (https://github.com/buerokratt/Common-Knowledge/issues/90)

---
 .../components/FileUploader/FileUploader.tsx  | 38 ++++++++++++++++---
 GUI/translations/en/common.json               |  1 +
 GUI/translations/et/common.json               |  1 +
 3 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/GUI/src/components/FileUploader/FileUploader.tsx b/GUI/src/components/FileUploader/FileUploader.tsx
index 4de9583..eff6a26 100644
--- a/GUI/src/components/FileUploader/FileUploader.tsx
+++ b/GUI/src/components/FileUploader/FileUploader.tsx
@@ -96,9 +96,22 @@ const FileUploader: FC<FileUploaderProps> = ({
     const selectedFiles = event.target.files;
     if (!selectedFiles) return;
 
-    const newFiles: FileItem[] = Array.from(selectedFiles).map((file) =>
-      validateSingleFile(file)
-    );
+    const newFiles: FileItem[] = Array.from(selectedFiles).map((file) => {
+      const validatedFile = validateSingleFile(file);
+
+      // Check for duplicate filename in existing files
+      const isDuplicate = files.some(existingFile => existingFile.name === file.name);
+
+      if (isDuplicate && validatedFile.status === 'pending') {
+        return {
+          ...validatedFile,
+          status: 'error' as const,
+          message: t('fileUpload.duplicateFile'),
+        };
+      }
+
+      return validatedFile;
+    });
 
     const updatedFiles = [...files, ...newFiles];
     onFilesChange(updatedFiles);
@@ -126,9 +139,22 @@ const FileUploader: FC<FileUploaderProps> = ({
     const droppedFiles = e.dataTransfer.files;
     if (!droppedFiles) return;
 
-    const newFiles: FileItem[] = Array.from(droppedFiles).map((file) =>
-      validateSingleFile(file)
-    );
+    const newFiles: FileItem[] = Array.from(droppedFiles).map((file) => {
+      const validatedFile = validateSingleFile(file);
+
+      // Check for duplicate filename in existing files
+      const isDuplicate = files.some(existingFile => existingFile.name === file.name);
+
+      if (isDuplicate && validatedFile.status === 'pending') {
+        return {
+          ...validatedFile,
+          status: 'error' as const,
+          message: t('fileUpload.duplicateFile'),
+        };
+      }
+
+      return validatedFile;
+    });
 
     const updatedFiles = [...files, ...newFiles];
     onFilesChange(updatedFiles);
diff --git a/GUI/translations/en/common.json b/GUI/translations/en/common.json
index 2a8c810..e52bde3 100644
--- a/GUI/translations/en/common.json
+++ b/GUI/translations/en/common.json
@@ -601,6 +601,7 @@
     "allowedFormats": "Allowed formats: ",
     "maxSizeExceeded": "Maximum size exceeded.",
     "fileAlreadyExists": "File already exists.",
+    "duplicateFile": "A file with the same name is already selected.",
     "success": "Success",
     "fileExists": "File exists",
     "fileExceedsLimit": "File size exceeds the maximum limit",
diff --git a/GUI/translations/et/common.json b/GUI/translations/et/common.json
index 8ed8534..a440e4b 100644
--- a/GUI/translations/et/common.json
+++ b/GUI/translations/et/common.json
@@ -601,6 +601,7 @@
     "allowedFormats": "Lubatud formaadid: ",
     "maxSizeExceeded": "Maksimaalne suurus ületatud.",
     "fileAlreadyExists": "Fail on juba olemas.",
+    "duplicateFile": "Sama nimega fail on juba valitud.",
     "success": "Edukas",
     "fileExists": "Fail eksisteerib",
     "fileExceedsLimit": "Failisuurus ületab maksimaalset piiri",

From eb2fda2034b47c17f85284438bbc912f2bfbdefa Mon Sep 17 00:00:00 2001
From: ahmer-mt <ahmer.ali@mindtitan.com>
Date: Fri, 24 Oct 2025 16:41:49 +0500
Subject: [PATCH 18/18] delete-file-sync: read s3_path from originalDataUrl instead of original_data_url

---
 DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml b/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml
index 3ca9619..0d89c63 100644
--- a/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml
+++ b/DSL/Ruuter.internal/ckb/POST/pipeline/delete-file-sync.yml
@@ -26,7 +26,7 @@ deleteFile:
     url: "[#CKB_FILE_MANAGER]/delete-files"
     body:
       files:
-        - s3_path: ${sourceFile.response.body[0].original_data_url}
+        - s3_path: ${sourceFile.response.body[0].originalDataUrl}
   result: res
 
 returnResult: