diff --git a/.jules/sentinel.md b/.jules/sentinel.md new file mode 100644 index 0000000..724a1bf --- /dev/null +++ b/.jules/sentinel.md @@ -0,0 +1,4 @@ +## 2025-03-05 - [XML External Entity (XXE) Prevention] +**Vulnerability:** Found `xml.etree.ElementTree` being used to parse external untrusted RSS feeds in scrapers (`theverge.py` and `producthunt.py`). This standard-library module is not secure against maliciously constructed XML, including attacks such as XXE and Billion Laughs. +**Learning:** External feeds must always be treated as untrusted data. Standard XML parsers often do not protect against recursive entities or external entity resolution. +**Prevention:** Always use `defusedxml.ElementTree` instead of the standard library `xml.etree.ElementTree` when parsing untrusted XML/RSS feeds to prevent XML-based attacks. diff --git a/functions/requirements.txt b/functions/requirements.txt index 9354334..94eae01 100644 --- a/functions/requirements.txt +++ b/functions/requirements.txt @@ -7,3 +7,4 @@ beautifulsoup4==4.* feedparser==6.* openai==1.* tzdata +defusedxml==0.7.* diff --git a/functions/scrapers/producthunt.py b/functions/scrapers/producthunt.py index c64f6fd..e0b5417 100644 --- a/functions/scrapers/producthunt.py +++ b/functions/scrapers/producthunt.py @@ -6,7 +6,7 @@ import httpx from typing import List, Dict, Any from datetime import datetime -import xml.etree.ElementTree as ET +import defusedxml.ElementTree as ET from bs4 import BeautifulSoup diff --git a/functions/scrapers/theverge.py b/functions/scrapers/theverge.py index 5746806..78af6aa 100644 --- a/functions/scrapers/theverge.py +++ b/functions/scrapers/theverge.py @@ -6,7 +6,7 @@ import httpx from typing import List, Dict, Any from datetime import datetime -import xml.etree.ElementTree as ET +import defusedxml.ElementTree as ET try: from ..resilience import retry_with_backoff