AminSS99 · AminSS99 · Mar 4, 2026 · gemini-code-assist · Mar 4, 2026 · gemini-code-assist
diff --git a/.jules/sentinel.md b/.jules/sentinel.md
@@ -0,0 +1,4 @@
+## 2025-02-27 - [Fix XXE Vulnerability in Scrapers]
+**Vulnerability:** Found `xml.etree.ElementTree.fromstring` parsing untrusted XML data (RSS feeds) in `functions/scrapers/theverge.py` and `functions/scrapers/producthunt.py`. Parsing untrusted feeds with the standard-library parser exposes the scrapers to XML entity-expansion denial-of-service attacks (Billion Laughs, quadratic blowup) and to XXE-style entity attacks.
+**Learning:** Python's standard-library XML parsers are not hardened against maliciously constructed data by default, so untrusted input must not be passed to them directly.
+**Prevention:** Always use `defusedxml.ElementTree.fromstring` as a drop-in replacement for `xml.etree.ElementTree.fromstring` when parsing XML from external or untrusted sources.
diff --git a/functions/requirements.txt b/functions/requirements.txt
@@ -7,3 +7,4 @@ beautifulsoup4==4.*
 feedparser==6.*
 openai==1.*
 tzdata
+defusedxml
diff --git a/functions/scrapers/producthunt.py b/functions/scrapers/producthunt.py
@@ -6,6 +6,7 @@
 import httpx
 from typing import List, Dict, Any
 from datetime import datetime
+from defusedxml.ElementTree import fromstring
 import xml.etree.ElementTree as ET
-from defusedxml.ElementTree import fromstring
-import xml.etree.ElementTree as ET
+from defusedxml import ElementTree as ET
-from defusedxml.ElementTree import fromstring
-import xml.etree.ElementTree as ET
+from defusedxml import ElementTree as ET
 from bs4 import BeautifulSoup
 
@@ -80,8 +81,8 @@ def fetch_producthunt(limit: int = 10) -> List[Dict[str, Any]]:
                 print(f"Product Hunt RSS returned {response.status_code}")
                 return []
 
-            # Parse RSS feed
-            root = ET.fromstring(response.content)
+            # Parse RSS feed securely
+            root = fromstring(response.content)
 
             products = []
             items = root.findall('.//item')

diff --git a/functions/scrapers/theverge.py b/functions/scrapers/theverge.py
@@ -6,7 +6,7 @@
 import httpx
 from typing import List, Dict, Any
 from datetime import datetime
-import xml.etree.ElementTree as ET
+from defusedxml.ElementTree import fromstring
 
 try:
     from ..resilience import retry_with_backoff
@@ -39,8 +39,8 @@ def fetch_theverge(limit: int = 10) -> List[Dict[str, Any]]:
             response = client.get(VERGE_RSS_URL, headers=headers)
             response.raise_for_status()
 
-            # Parse XML/Atom feed
-            root = ET.fromstring(response.content)
+            # Parse XML/Atom feed securely
+            root = fromstring(response.content)
 
             # The Verge uses Atom format
             namespace = {'atom': 'http://www.w3.org/2005/Atom'}