From c3226011e47ff416e674eb746161b953be9c07e8 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Sat, 7 Mar 2026 23:51:13 +0000
Subject: [PATCH] =?UTF-8?q?=F0=9F=9B=A1=EF=B8=8F=20Sentinel:=20[HIGH]=20Fi?=
 =?UTF-8?q?x=20XXE=20vulnerabilities=20in=20XML=20parsing?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

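Replaces `xml.etree.ElementTree` with `defusedxml.ElementTree` to securely parse XML/RSS feeds from external sources, mitigating XML External Entity (XXE) and Billion Laughs vulnerabilities. Also replaces the `find(...) or find(...)` fallbacks in `producthunt.py` with explicit `is None` checks, because an `Element` without child elements evaluates as false and a successfully matched `atom:summary` or `atom:published` element would otherwise be discarded in favor of the fallback lookup. Finally, updates `requirements.txt` to include `defusedxml` and logs the learning in `.jules/sentinel.md`.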

Co-authored-by: AminSS99 <139346033+AminSS99@users.noreply.github.com>
---
 .jules/sentinel.md                |  4 ++++
 functions/requirements.txt        |  1 +
 functions/scrapers/producthunt.py | 10 +++++++---
 functions/scrapers/theverge.py    |  2 +-
 4 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100644 .jules/sentinel.md

diff --git a/.jules/sentinel.md b/.jules/sentinel.md
new file mode 100644
index 0000000..365c84f
--- /dev/null
+++ b/.jules/sentinel.md
@@ -0,0 +1,4 @@
+## 2026-03-07 - Prevent XML External Entity (XXE) and Billion Laughs Vulnerabilities
+**Vulnerability:** Use of standard library `xml.etree.ElementTree` to parse untrusted XML/RSS feeds in `theverge.py` and `producthunt.py` scrapers.
+**Learning:** Python's documentation warns that the built-in `xml.etree` module is not secure against maliciously constructed data, such as XML External Entity (XXE) and Billion Laughs (entity-expansion) payloads. Parsing feeds from external, untrusted sources without defensive measures creates severe security risks (denial of service or data exfiltration).
+**Prevention:** Always use `defusedxml.ElementTree` instead of `xml.etree.ElementTree` when parsing any XML data from an untrusted source or network request. Ensure `defusedxml` is included in project dependencies.
diff --git a/functions/requirements.txt b/functions/requirements.txt
index 9354334..3d6e032 100644
--- a/functions/requirements.txt
+++ b/functions/requirements.txt
@@ -7,3 +7,4 @@ beautifulsoup4==4.*
 feedparser==6.*
 openai==1.*
 tzdata
+defusedxml==0.*
diff --git a/functions/scrapers/producthunt.py b/functions/scrapers/producthunt.py
index c64f6fd..5bab284 100644
--- a/functions/scrapers/producthunt.py
+++ b/functions/scrapers/producthunt.py
@@ -6,7 +6,7 @@
 import httpx
 from typing import List, Dict, Any
 from datetime import datetime
-import xml.etree.ElementTree as ET
+import defusedxml.ElementTree as ET
 from bs4 import BeautifulSoup
 
 
@@ -114,8 +114,12 @@ def fetch_producthunt(limit: int = 10) -> List[Dict[str, Any]]:
                 for entry in entries[:limit]:
                     title = entry.find('atom:title', atom_ns)
                     link = entry.find('atom:link', atom_ns)
-                    summary = entry.find('atom:summary', atom_ns) or entry.find('atom:content', atom_ns)
-                    published = entry.find('atom:published', atom_ns) or entry.find('atom:updated', atom_ns)
+                    summary = entry.find('atom:summary', atom_ns)
+                    if summary is None:
+                        summary = entry.find('atom:content', atom_ns)
+                    published = entry.find('atom:published', atom_ns)
+                    if published is None:
+                        published = entry.find('atom:updated', atom_ns)
 
                     if title is None or link is None:
                         continue
diff --git a/functions/scrapers/theverge.py b/functions/scrapers/theverge.py
index 5746806..78af6aa 100644
--- a/functions/scrapers/theverge.py
+++ b/functions/scrapers/theverge.py
@@ -6,7 +6,7 @@
 import httpx
 from typing import List, Dict, Any
 from datetime import datetime
-import xml.etree.ElementTree as ET
+import defusedxml.ElementTree as ET
 
 try:
     from ..resilience import retry_with_backoff