diff --git a/.jules/sentinel.md b/.jules/sentinel.md new file mode 100644 index 0000000..f334e0d --- /dev/null +++ b/.jules/sentinel.md @@ -0,0 +1,4 @@ +## 2025-02-27 - [Fix Unsafe XML Parsing in Scrapers] +**Vulnerability:** Found `xml.etree.ElementTree.fromstring` parsing untrusted XML data (RSS feeds) in `functions/scrapers/theverge.py` and `functions/scrapers/producthunt.py`. This is vulnerable to XML entity-expansion denial-of-service attacks (Billion Laughs, quadratic blowup). Note that `xml.etree.ElementTree` does not expand external entities, so classic XXE file disclosure does not apply to it, but `defusedxml` rejects such payloads explicitly. +**Learning:** Standard library XML parsers in Python are not secure by default. Untrusted input must not be passed to them. +**Prevention:** Always use `defusedxml.ElementTree.fromstring` as a drop-in replacement when parsing XML from external or untrusted sources. diff --git a/functions/requirements.txt b/functions/requirements.txt index 9354334..b206a2b 100644 --- a/functions/requirements.txt +++ b/functions/requirements.txt @@ -7,3 +7,4 @@ beautifulsoup4==4.* feedparser==6.* openai==1.* tzdata +defusedxml diff --git a/functions/scrapers/producthunt.py b/functions/scrapers/producthunt.py index c64f6fd..c6bf26b 100644 --- a/functions/scrapers/producthunt.py +++ b/functions/scrapers/producthunt.py @@ -6,6 +6,7 @@ import httpx from typing import List, Dict, Any from datetime import datetime +from defusedxml.ElementTree import fromstring import xml.etree.ElementTree as ET from bs4 import BeautifulSoup @@ -80,8 +81,8 @@ def fetch_producthunt(limit: int = 10) -> List[Dict[str, Any]]: print(f"Product Hunt RSS returned {response.status_code}") return [] - # Parse RSS feed - root = ET.fromstring(response.content) + # Parse RSS feed securely + root = fromstring(response.content) products = [] items = root.findall('.//item') diff --git a/functions/scrapers/theverge.py b/functions/scrapers/theverge.py index 5746806..d5b2eeb 100644 --- a/functions/scrapers/theverge.py +++ b/functions/scrapers/theverge.py @@ -6,7 +6,7 @@ import httpx from typing import List, Dict, Any from datetime import datetime -import 
xml.etree.ElementTree as ET +from defusedxml.ElementTree import fromstring try: from ..resilience import retry_with_backoff @@ -39,8 +39,8 @@ def fetch_theverge(limit: int = 10) -> List[Dict[str, Any]]: response = client.get(VERGE_RSS_URL, headers=headers) response.raise_for_status() - # Parse XML/Atom feed - root = ET.fromstring(response.content) + # Parse XML/Atom feed securely + root = fromstring(response.content) # The Verge uses Atom format namespace = {'atom': 'http://www.w3.org/2005/Atom'}