-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkiwix_tool.py
More file actions
172 lines (137 loc) · 6.75 KB
/
kiwix_tool.py
File metadata and controls
172 lines (137 loc) · 6.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
title: Kiwix Knowledge Retrieval
author: Civilization Node Operator
description: Search offline ZIM archives (Wikipedia, StackOverflow, iFixit) and return ACTUAL CONTENT to the LLM.
"""
import requests
from bs4 import BeautifulSoup
import urllib.parse
import re
class Tools:
    """Open WebUI tool: search offline Kiwix ZIM archives and return article text."""

    def __init__(self):
        # Base URL of the kiwix-serve instance on the local network.
        self.kiwix_host = "http://civ_library:8080"
        self.valves = self.Valves()

    class Valves:
        # Library keyword used when the requested context cannot be resolved.
        default_zim: str = "wikipedia"

    def search_knowledge_base(self, query: str, context: str = "general") -> str:
        """
        Search for a topic in the offline library.

        :param query: The specific search terms (e.g. "Python list comprehension").
                      Supports multiple queries separated by semicolons
                      (e.g. "radio freq; antenna types").
        :param context: Choose one of: "general" (Wikipedia), "code" (StackOverflow),
                        "repair" (iFixit), "medical" (WikiMed), "chemistry", "books".
        :return: The content of the article(s) or an error message.
        """
        if not query or not query.strip():
            return "Error: Empty query."
        # Each ';'-separated fragment is searched independently.
        sub_queries = [q.strip() for q in query.split(';') if q.strip()]
        results = [self._perform_single_search(q, context) for q in sub_queries]
        # BUG FIX: the "=" banner must be part of the join separator. The old code
        # was `"\n\n" + ("="*20) + "\n\n".join(results)`, which prepended the banner
        # once and joined multiple results with only "\n\n".
        separator = "\n\n" + "=" * 20 + "\n\n"
        return separator.join(results)

    def _perform_single_search(self, query: str, context: str) -> str:
        """Search one query string in the ZIM matching *context* and return article text."""
        # Map a context keyword to a substring expected in the ZIM book title.
        zim_map = {
            "general": "wikipedia",
            "code": "stack overflow",
            "repair": "ifixit",
            "medical": "medical",
            "linux": "arch",
            "science": "phet",
            "books": "gutenberg",
        }
        # Unlisted contexts (e.g. "chemistry") are used verbatim as the title keyword.
        search_keyword = zim_map.get(context, context)

        # Resolve the keyword to the exact book ID known to the server.
        target_id = _resolve_book_id(self.kiwix_host, search_keyword)

        # Fallback: if the requested library is absent, default to Wikipedia.
        if not target_id and search_keyword != "wikipedia":
            print(f"DEBUG: Library for '{search_keyword}' not found, falling back to Wikipedia.")
            target_id = _resolve_book_id(self.kiwix_host, "wikipedia")
        if not target_id:
            available = _get_available_books(self.kiwix_host)
            return f"Error: No matching ZIM found for '{context}'. Available: {available}"

        try:
            # 1. Run the full-text search against the resolved book.
            print(f"DEBUG: Searching for '{query}' in {target_id}")
            search_url = (
                f"{self.kiwix_host}/search?content={target_id}"
                f"&pattern={urllib.parse.quote(query)}"
            )
            search_resp = requests.get(search_url, timeout=5)

            # 2. Collect candidate article links and score them for relevance.
            soup = BeautifulSoup(search_resp.content, 'html.parser')
            candidates = []
            query_terms = query.lower().split()
            for link in soup.find_all('a', href=True):
                href = link['href']
                # Skip chrome: stylesheets, skin assets, and the search page itself.
                if target_id not in href or "search?" in href or "skin/" in href or ".css" in href:
                    continue
                href_lower = href.lower()
                score = 0
                # Criterion A: +10 per query term appearing in the URL.
                score += 10 * sum(1 for term in query_terms if term in href_lower)
                # Criterion B: for repair lookups prefer step-by-step guides.
                if context == "repair":
                    if "/Guide/" in href:
                        score += 50
                    if "Replacement" in href:
                        score += 20
                    if "/Device/" in href:
                        score -= 5
                # Criterion C: mild length penalty so exact/short titles rank first.
                score -= len(href) * 0.1
                candidates.append((score, href))

            if not candidates:
                return f"No articles found for '{query}' in {target_id}."
            # Highest score wins. (Dead `if not first_result` check removed: empty
            # hrefs never survive the `target_id not in href` filter above.)
            candidates.sort(key=lambda c: c[0], reverse=True)
            best = candidates[0][1]

            # 3. Fetch the winning article.
            article_url = (
                f"{self.kiwix_host}{best}" if best.startswith("/")
                else f"{self.kiwix_host}/{best}"
            )
            article_resp = requests.get(article_url, timeout=10)
            article_soup = BeautifulSoup(article_resp.content, 'html.parser')

            # 4. Strip non-content elements before extracting plain text.
            for tag in article_soup(["script", "style", "nav", "footer", "header", "form"]):
                tag.decompose()
            text = article_soup.get_text(separator=' ', strip=True)

            # 5. Truncate to keep the LLM context window small.
            return f"### QUERY: {query}\n<source id=\"{target_id}\">\n{text[:6000]}...\n</source>"
        except Exception as e:
            return f"System Error processing '{query}': {e}"
def _resolve_book_id(host: str, partial_name: str) -> str:
print(f"DEBUG: Resolving book ID for '{partial_name}'")
try:
# Kiwix-serve returns an OPDS Atom feed (XML), not JSON
r = requests.get(f"{host}/catalog/v2/entries", timeout=2)
if r.status_code != 200:
return None
# Parse XML
soup = BeautifulSoup(r.content, 'xml')
for entry in soup.find_all('entry'):
# Title often contains readable name: "Wikipedia English"
title = entry.find('title').text if entry.find('title') else ""
if partial_name.lower() in title.lower():
# Extract ID from the <link type="text/html" href="/content/ID">
# We search for the link that points to the content root
link = entry.find('link', type="text/html")
if link and link.get('href'):
# href is like "/content/wikipedia_en_all_nopic_2025-12"
# We need just the ID part
raw_id = link['href'].split('/content/')[-1]
return raw_id
return None
except:
return None
def _get_available_books(host: str):
try:
r = requests.get(f"{host}/catalog/v2/entries", timeout=1)
soup = BeautifulSoup(r.content, 'xml')
titles = [e.find('title').text for e in soup.find_all('entry') if e.find('title')]
return titles
except:
return "Unable to list (XML Parse Error)."