17 changes: 17 additions & 0 deletions examples/agents/cms-comp-ops.md
@@ -12,3 +12,20 @@ tools:
You are the CMS Comp Ops assistant. You help with operational questions, troubleshooting,
and documentation lookups. Use tools when needed, cite evidence from retrieved sources,
and keep responses concise and actionable.

## Tool guidance for ELOG

- For questions about a specific person's activity (e.g. "what did huangch report?"), use
`search_metadata_index` with `tech:<username>`. ELOG entries store the technician in
the `tech` metadata field.
- For questions about recent ELOG incidents, combine `search_metadata_index` (to find
entries by author/category/node) with `search_vectorstore_hybrid` (for full-text content).
- Use `list_metadata_schema` first if unsure which metadata keys are available.
- When citing ELOG entries, always use the `url` field from the search result metadata as
the link. Never construct a URL manually from a hash or document ID — those are internal
Archi identifiers, not ELOG entry numbers.
- Search results are numbered `[1]`, `[2]`, `[3]`… — these are result indices, not ELOG
entry numbers. Always extract the actual entry number from the `url` field (the last
path segment, e.g. `/elog/dCache/847` → entry 847).
- `search_metadata_index` returns at most 5 results. If the user asks for "all entries"
from a person or category, note that only the top 5 matches are shown.
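
The entry-number rule in the bullets above can be sketched as a small helper (illustrative; `elog_entry_number` is not a function in the codebase):

```python
from urllib.parse import urlparse

def elog_entry_number(url: str) -> int:
    """Extract the ELOG entry number from the last path segment of a result's url field."""
    # e.g. "https://www-enstore.fnal.gov/elog/dCache/847" -> 847
    return int(urlparse(url).path.rstrip("/").rsplit("/", 1)[-1])
```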
2 changes: 2 additions & 0 deletions examples/deployments/basic-ollama-fnal/agent.prompt
@@ -0,0 +1,2 @@
You are a conversational agent named Archi who helps users with their issues.
You have been given access to several tools for tickets, documentation, and other useful resources to help you answer their questions. If the answer is not found there, or you don't know, ask the user for more details, and always answer truthfully.
12 changes: 12 additions & 0 deletions examples/deployments/basic-ollama-fnal/condense.prompt
@@ -0,0 +1,12 @@
# Prompt used to condense a chat history and a follow-up question into a standalone question.
# This is a very general prompt for condensing histories, so for base installs it will not need to be modified
#
# All condensing prompts must have the following tags in them, which will be filled with the appropriate information:
# {history}
# {question}
#
Given the following conversation between you (the AI named archi), a human user who needs help, and an expert, and a follow-up question, rephrase the follow-up question to be a standalone question, in its original language.

Chat History: {history}
Follow Up Input: {question}
Standalone question:
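
How the `{history}` and `{question}` tags get filled can be sketched with plain `str.format` (the deployment may use a prompt-template class instead; the sample history is hypothetical):

```python
# Minimal sketch of filling the condense prompt's tags; abbreviated template.
CONDENSE_PROMPT = (
    "Given the following conversation and a follow-up question, rephrase the "
    "follow-up question to be a standalone question, in its original language.\n\n"
    "Chat History: {history}\n"
    "Follow Up Input: {question}\n"
    "Standalone question:"
)

filled = CONDENSE_PROMPT.format(
    history="User: dCache is down.\nAI: Which node?",
    question="Is it back?",
)
```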
54 changes: 54 additions & 0 deletions examples/deployments/basic-ollama-fnal/config.yaml
@@ -0,0 +1,54 @@
# Basic configuration file for an Archi deployment
# with a chat app interface and PostgreSQL with pgvector for document storage.
# The LLM is used through an existing Ollama server.
#
# run with:
# archi create --name my-archi-ollama --config examples/deployments/basic-ollama-fnal/config.yaml --env-file examples/deployments/basic-ollama-fnal/secrets.env --services chatbot --podman --hostmode --force -v 4

name: my_archi

services:
chat_app:
agent_class: CMSCompOpsAgent
agents_dir: examples/agents
default_provider: local
default_model: "Qwen3-coder:latest" # pipeline default model
providers:
local:
enabled: true
base_url: https://ollama.fnal.gov # make sure this matches your ollama server URL!
mode: ollama # uses LangChain's ChatOllama class; the other option, openai_compat, uses ChatOpenAI
default_model: "Qwen3-coder:latest" # default model shown in the GUI "Model Provider" dropdown; must match a model you have downloaded locally with ollama
models: # fallback options for the GUI "Model" dropdown if dynamic fetch fails (GET https://ollama.fnal.gov/api/tags)
- "gpt-oss:120b"
- "qwen3:32b"
trained_on: "My data"
port: 7868
external_port: 7868
vectorstore:
backend: postgres # PostgreSQL with pgvector (only supported backend)
data_manager:
port: 7899
external_port: 7899
auth:
enabled: false # set to true and provide DM_API_TOKEN in .env for production

data_manager:
sources:
elog:
max_entries: 500 # scrape the 500 most recent entries; remove to scrape all
verify_ssl: false
jira:
max_tickets: 10
url: https://its.cern.ch/jira/
projects:
- "CMSPROD"
links:
input_lists:
- examples/deployments/basic-ollama-fnal/miscellanea.list
- examples/deployments/basic-ollama-fnal/dcache-elog.list
redmine:
url: https://cleo.mit.edu
projects:
- emails-to-ticket
embedding_name: HuggingFaceEmbeddings
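
The `models` list above only serves as a fallback when the dynamic model fetch fails; that fetch can be sketched with the standard library (the real app may use a different HTTP client, and `fetch_ollama_models` is a hypothetical name):

```python
import json
from urllib.request import urlopen
from urllib.error import URLError

def fetch_ollama_models(base_url: str = "https://ollama.fnal.gov") -> list[str]:
    """Return model names from Ollama's /api/tags endpoint; on any failure,
    the static `models` list in config.yaml serves as the fallback."""
    try:
        with urlopen(f"{base_url}/api/tags", timeout=10) as r:
            payload = json.load(r)
        return [m["name"] for m in payload.get("models", [])]
    except (URLError, OSError, ValueError):
        return []
```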
1 change: 1 addition & 0 deletions examples/deployments/basic-ollama-fnal/dcache-elog.list
@@ -0,0 +1 @@
elog-https://www-enstore.fnal.gov/elog/dCache/
6 changes: 6 additions & 0 deletions examples/deployments/basic-ollama-fnal/miscellanea.list
@@ -0,0 +1,6 @@
# PPC
# https://ppc.mit.edu/
# A2
# https://ppc.mit.edu/a2/
# git
# git-https://github.com/archi-physics/archi.git
7 changes: 7 additions & 0 deletions src/cli/templates/base-config.yaml
@@ -248,6 +248,13 @@ data_manager:
visible: {{ data_manager.sources.redmine.visible | default(false, true) }}
schedule: '{{ data_manager.sources.redmine.schedule | default("", true) }}'
anonymize_data: {{ data_manager.sources.redmine.anonymize_data | default(true, true) }}
elog:
enabled: {{ data_manager.sources.elog.enabled | default(false, true) }}
visible: {{ data_manager.sources.elog.visible | default(false, true) }}
url: '{{ data_manager.sources.elog.url | default("", true) }}'
max_entries: {{ data_manager.sources.elog.max_entries | default(null, true) }}
verify_ssl: {{ data_manager.sources.elog.verify_ssl | default(false, true) }}
schedule: '{{ data_manager.sources.elog.schedule | default("", true) }}'
utils:
anonymizer:
nlp_model: {{ data_manager.utils.anonymizer.nlp_model | default('en_core_web_sm', true) }}
156 changes: 156 additions & 0 deletions src/data_manager/collectors/scrapers/integrations/elog_scraper.py
@@ -0,0 +1,156 @@
"""
ELOG scraper for electronic logbooks (https://elog.sourceforge.net/).

Automatically discovers all entries by walking index pages sequentially,
then fetches each individual entry as a ScrapedResource.

Config (under data_manager.sources.elog):
url: Base URL of the logbook, e.g. https://www-enstore.fnal.gov/elog/dCache/
max_entries: Optional cap on total entries to fetch (default: unlimited)
verify_ssl: Whether to verify SSL certificates (default: True)
"""

import re
import requests
from typing import Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from src.data_manager.collectors.scrapers.scraped_resource import ScrapedResource
from src.utils.logging import get_logger

logger = get_logger(__name__)

_ENTRY_PATH = re.compile(r"/\d+$")


class ElogScraper:
"""Crawls an ELOG logbook index (walking pages sequentially) and yields each entry."""

def __init__(self, config: dict) -> None:
self.base_url = config.get("url", "").rstrip("/") + "/"
self.max_entries: Optional[int] = config.get("max_entries")
self.verify_ssl = config.get("verify_ssl", True)
self._session = requests.Session()
if not self.verify_ssl:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ------------------------------------------------------------------
# Public API
# ------------------------------------------------------------------

def iter_entries(self) -> Iterator[ScrapedResource]:
"""Yield one ScrapedResource per logbook entry, newest first."""
entry_urls = self._discover_entry_urls()
fetched = 0
for url in entry_urls:
if self.max_entries is not None and fetched >= self.max_entries:
logger.info(f"Reached max_entries={self.max_entries}; stopping.")
break
resource = self._fetch_entry(url)
if resource is not None:
yield resource
fetched += 1
logger.info(f"ElogScraper: fetched {fetched} entries from {self.base_url}")

# ------------------------------------------------------------------
# Internal helpers
# ------------------------------------------------------------------

def _discover_entry_urls(self) -> List[str]:
"""Return entry URLs newest-first by walking index pages sequentially until max_entries is reached."""
seen: Set[str] = set()
result: List[str] = []
page = 1

while True:
page_url = self.base_url if page == 1 else f"{self.base_url}page{page}"
new_urls = [u for u in self._get_entry_urls_from_page(page_url) if u not in seen]
if not new_urls:
logger.info(f"ElogScraper: no new entries on {page_url}, stopping.")
break
seen.update(new_urls)
result.extend(new_urls)
logger.debug(f"ElogScraper: page {page} added {len(new_urls)} entries ({len(result)} total)")
if self.max_entries is not None and len(result) >= self.max_entries:
break
page += 1

result.sort(key=lambda u: int(u.rstrip("/").rsplit("/", 1)[-1]), reverse=True)
logger.info(f"ElogScraper: discovered {len(result)} unique entries")
return result

def _get_entry_urls_from_page(self, page_url: str) -> List[str]:
"""Return all entry URLs found on a single index/listing page."""
html = self._fetch_html(page_url)
if html is None:
return []
soup = BeautifulSoup(html, "html.parser")
base_host = urlparse(self.base_url).netloc
entries: Set[str] = set()
for a in soup.find_all("a", href=True):
full = urljoin(page_url, a["href"])
parsed = urlparse(full)
if parsed.netloc == base_host and _ENTRY_PATH.search(parsed.path):
# Strip query/fragment so we get the canonical entry URL
entries.add(parsed._replace(query="", fragment="").geturl())
return list(entries)

def _fetch_entry(self, url: str) -> Optional[ScrapedResource]:
"""Fetch a single entry page, extract structured text, and return a ScrapedResource."""
html = self._fetch_html(url)
if html is None:
return None
text, metadata = self._parse_entry(html, url)
return ScrapedResource(
url=url,
content=text,
suffix="txt",
source_type="web",
metadata=metadata,
)

def _parse_entry(self, html: str, url: str) -> Tuple[str, Dict]:
"""Parse an ELOG entry page into clean text and structured metadata."""
soup = BeautifulSoup(html, "html.parser")
meta: dict = {"url": url, "elog_entry": True}

# Extract entry ID from URL
entry_id = url.rstrip("/").rsplit("/", 1)[-1]
meta["entry_id"] = entry_id

# Extract attribute rows (Incident Date, Tech, Node, Inst, Category, Fix Action)
for row in soup.select("table.listframe tr td table tr"):
cells = row.find_all("td")
if len(cells) == 2:
key = cells[0].get_text(strip=True).rstrip(":")
value = cells[1].get_text(strip=True)
if key and value:
meta[key.lower().replace(" ", "_")] = value

# Main message body
body = ""
pre = soup.find("pre", class_="messagepre")
if pre:
body = pre.get_text()

# Build clean plain-text document
lines = [f"ELOG Entry {entry_id} — {self.base_url}"]
for k, v in meta.items():
if k not in ("url", "elog_entry", "entry_id"):
lines.append(f"{k.replace('_', ' ').title()}: {v}")
lines.append("")
lines.append(body.strip())

return "\n".join(lines), meta

def _fetch_html(self, url: str) -> Optional[str]:
try:
r = self._session.get(url, timeout=15, verify=self.verify_ssl)
r.raise_for_status()
return r.text
except Exception as exc:
logger.warning(f"ElogScraper: could not fetch {url}: {exc}")
return None
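
The link filtering and newest-first ordering in `_get_entry_urls_from_page` and `_discover_entry_urls` can be exercised offline with a standalone sketch (the helper name and hrefs below are illustrative, operating on already-extracted hrefs rather than parsed HTML):

```python
import re
from urllib.parse import urljoin, urlparse

_ENTRY_PATH = re.compile(r"/\d+$")

def filter_entry_urls(hrefs: list[str], page_url: str) -> list[str]:
    """Mirror of the link filtering: same-host links whose path ends in a
    number are entries; query/fragment stripped; sorted newest first."""
    base_host = urlparse(page_url).netloc
    found = set()
    for href in hrefs:
        parsed = urlparse(urljoin(page_url, href))
        if parsed.netloc == base_host and _ENTRY_PATH.search(parsed.path):
            found.add(parsed._replace(query="", fragment="").geturl())
    return sorted(found, key=lambda u: int(u.rsplit("/", 1)[-1]), reverse=True)
```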
78 changes: 72 additions & 6 deletions src/data_manager/collectors/scrapers/scraper_manager.py
@@ -54,6 +54,10 @@ def __init__(self, dm_config: Optional[Dict[str, Any]] = None) -> None:

self.sso_enabled = bool(sso_config.get("enabled", False))

elog_config = sources_config.get("elog", {}) if isinstance(sources_config, dict) else {}
self.elog_config = elog_config if isinstance(elog_config, dict) else {}
self.elog_enabled = bool(self.elog_config.get("url"))

self.data_path = Path(global_config["DATA_PATH"])
self.input_lists = links_config.get("input_lists", [])
self.git_dir = self.data_path / "git"
@@ -70,7 +74,7 @@ def collect_all_from_config(
self, persistence: PersistenceService
) -> None:
"""Run the configured scrapers and persist their output."""
link_urls, git_urls, sso_urls, elog_urls = self._collect_urls_from_lists_by_type(self.input_lists)

if git_urls:
self.git_enabled = True
@@ -81,6 +85,7 @@
self.collect_links(persistence, link_urls=link_urls)
self.collect_sso(persistence, sso_urls=sso_urls)
self.collect_git(persistence, git_urls=git_urls)
self.collect_elog(persistence, extra_urls=elog_urls)

logger.info("Web scraping was completed successfully")

@@ -152,6 +157,51 @@ def schedule_collect_sso(self, persistence: PersistenceService, last_run: Option
catalog_urls = [m[1].get("url", "") for m in metadata]
self.collect_sso(persistence, sso_urls=catalog_urls)

def schedule_collect_elog(self, persistence: PersistenceService, last_run: Optional[str] = None) -> None:
metadata = persistence.catalog.get_metadata_by_filter("source_type", source_type="elog", metadata_keys=["url"])
catalog_urls = [m[1].get("url", "") for m in metadata]
self.collect_elog(persistence, extra_urls=catalog_urls)

def collect_elog(self, persistence: PersistenceService, extra_urls: Optional[List[str]] = None) -> int:
"""Collect all entries from configured ELOG logbooks.

Sources:
- dedicated ``elog:`` config section (url key)
- URLs auto-detected as ELOG from input_lists (passed via extra_urls)
"""
from src.data_manager.collectors.scrapers.integrations.elog_scraper import ElogScraper
elog_dir = persistence.data_path / "websites"
elog_dir.mkdir(parents=True, exist_ok=True)

urls_to_scrape: List[str] = list(extra_urls) if extra_urls else []
if self.elog_enabled:
urls_to_scrape.append(self.elog_config.get("url"))

# Normalize and deduplicate URLs while preserving order
normalized_urls: List[str] = []
seen = set()
for raw_url in urls_to_scrape:
if not raw_url:
continue
url = raw_url.rstrip("/")
if url and url not in seen:
seen.add(url)
normalized_urls.append(url)
urls_to_scrape = normalized_urls

if not urls_to_scrape:
return 0

total = 0
for url in urls_to_scrape:
cfg = {**self.elog_config, "url": url}
scraper = ElogScraper(cfg)
for resource in scraper.iter_entries():
persistence.persist_resource(resource, elog_dir)
total += 1
logger.info(f"ELOG scraping complete: {total} entries collected")
return total

def _collect_links_from_urls(
self,
urls: List[str],
@@ -261,20 +311,36 @@ def _collect_urls_from_lists(self, input_lists) -> List[str]:

return urls

def _collect_urls_from_lists_by_type(self, input_lists: List[str]) -> tuple[List[str], List[str], List[str], List[str]]:
"""All types of URLs are in the same input lists, separate them via prefixes or auto-detection."""
link_urls: List[str] = []
git_urls: List[str] = []
sso_urls: List[str] = []
elog_urls: List[str] = []
for raw_url in self._collect_urls_from_lists(input_lists):
if raw_url.startswith("git-"):
git_urls.append(raw_url.split("git-", 1)[1])
continue
if raw_url.startswith("sso-"):
sso_urls.append(raw_url.split("sso-", 1)[1])
continue
if raw_url.startswith("elog-"):
elog_urls.append(raw_url.split("elog-", 1)[1])
continue
if self._is_elog_url(raw_url):
elog_urls.append(raw_url)
continue
link_urls.append(raw_url)
return link_urls, git_urls, sso_urls, elog_urls

@staticmethod
def _is_elog_url(url: str) -> bool:
"""Return True if the URL looks like an ELOG logbook index (fallback heuristic).
Prefer the explicit 'elog-' prefix in input lists over this auto-detection.
"""
from urllib.parse import urlparse
path = urlparse(url).path.lower()
return "/elog/" in path or "/elogs/" in path

def _resolve_scraper(self):
class_name = self.selenium_config.get("selenium_class")
class_map = self.selenium_config.get("selenium_class_map", {})
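
The prefix routing in `_collect_urls_from_lists_by_type`, with the `_is_elog_url` heuristic inlined, can be sketched end to end (the helper name and input URLs are illustrative):

```python
from urllib.parse import urlparse

def split_by_type(raw_urls: list[str]):
    """Route raw input-list URLs to link/git/sso/elog buckets by prefix,
    falling back to the /elog/ path heuristic for unprefixed URLs."""
    link_urls, git_urls, sso_urls, elog_urls = [], [], [], []
    for raw in raw_urls:
        if raw.startswith("git-"):
            git_urls.append(raw.split("git-", 1)[1])
        elif raw.startswith("sso-"):
            sso_urls.append(raw.split("sso-", 1)[1])
        elif raw.startswith("elog-"):
            elog_urls.append(raw.split("elog-", 1)[1])
        else:
            path = urlparse(raw).path.lower()
            if "/elog/" in path or "/elogs/" in path:  # fallback heuristic
                elog_urls.append(raw)
            else:
                link_urls.append(raw)
    return link_urls, git_urls, sso_urls, elog_urls
```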