From c3226011e47ff416e674eb746161b953be9c07e8 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Sat, 7 Mar 2026 23:51:13 +0000 Subject: [PATCH] =?UTF-8?q?=F0=9F=9B=A1=EF=B8=8F=20Sentinel:=20[HIGH]=20Fi?= =?UTF-8?q?x=20XXE=20vulnerabilities=20in=20XML=20parsing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces `xml.etree.ElementTree` with `defusedxml.ElementTree` to securely parse XML/RSS feeds from external sources, mitigating XML External Entity (XXE) and Billion Laughs vulnerabilities. Also updates `requirements.txt` to include `defusedxml` and logs learning to `.jules/sentinel.md`. In `producthunt.py`, the `or`-chained `entry.find(...)` fallbacks are also replaced with explicit `is None` checks, because an `Element` with no child elements is falsy and `or` would skip a summary/published element that is actually present. Co-authored-by: AminSS99 <139346033+AminSS99@users.noreply.github.com> --- .jules/sentinel.md | 4 ++++ functions/requirements.txt | 1 + functions/scrapers/producthunt.py | 10 +++++++--- functions/scrapers/theverge.py | 2 +- 4 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 .jules/sentinel.md diff --git a/.jules/sentinel.md b/.jules/sentinel.md new file mode 100644 index 0000000..365c84f --- /dev/null +++ b/.jules/sentinel.md @@ -0,0 +1,4 @@ +## 2026-03-07 - Prevent XML External Entity (XXE) and Billion Laughs Vulnerabilities +**Vulnerability:** Use of standard library `xml.etree.ElementTree` to parse untrusted XML/RSS feeds in `theverge.py` and `producthunt.py` scrapers. +**Learning:** The built-in `xml.etree` module in Python is not secure against maliciously constructed XML: depending on the Python and Expat versions it can be vulnerable to entity-expansion attacks such as Billion Laughs, whereas `defusedxml` rejects entity declarations and external entity (XXE) references by default. Parsing feeds from external, untrusted sources without defensive measures creates severe security risks (DoS or data exfiltration). +**Prevention:** Always use `defusedxml.ElementTree` instead of `xml.etree.ElementTree` when parsing any XML data from an untrusted source or network request. Ensure `defusedxml` is included in project dependencies. 
diff --git a/functions/requirements.txt b/functions/requirements.txt index 9354334..3d6e032 100644 --- a/functions/requirements.txt +++ b/functions/requirements.txt @@ -7,3 +7,4 @@ beautifulsoup4==4.* feedparser==6.* openai==1.* tzdata +defusedxml==0.* diff --git a/functions/scrapers/producthunt.py b/functions/scrapers/producthunt.py index c64f6fd..5bab284 100644 --- a/functions/scrapers/producthunt.py +++ b/functions/scrapers/producthunt.py @@ -6,7 +6,7 @@ import httpx from typing import List, Dict, Any from datetime import datetime -import xml.etree.ElementTree as ET +import defusedxml.ElementTree as ET from bs4 import BeautifulSoup @@ -114,8 +114,12 @@ def fetch_producthunt(limit: int = 10) -> List[Dict[str, Any]]: for entry in entries[:limit]: title = entry.find('atom:title', atom_ns) link = entry.find('atom:link', atom_ns) - summary = entry.find('atom:summary', atom_ns) or entry.find('atom:content', atom_ns) - published = entry.find('atom:published', atom_ns) or entry.find('atom:updated', atom_ns) + summary = entry.find('atom:summary', atom_ns) + if summary is None: + summary = entry.find('atom:content', atom_ns) + published = entry.find('atom:published', atom_ns) + if published is None: + published = entry.find('atom:updated', atom_ns) if title is None or link is None: continue diff --git a/functions/scrapers/theverge.py b/functions/scrapers/theverge.py index 5746806..78af6aa 100644 --- a/functions/scrapers/theverge.py +++ b/functions/scrapers/theverge.py @@ -6,7 +6,7 @@ import httpx from typing import List, Dict, Any from datetime import datetime -import xml.etree.ElementTree as ET +import defusedxml.ElementTree as ET try: from ..resilience import retry_with_backoff