From 4640e1dac2928b612a660c592eb1762678e58223 Mon Sep 17 00:00:00 2001 From: Leo Torres Date: Thu, 26 Mar 2026 10:50:22 +0100 Subject: [PATCH] Add researcher-friendly error messages for zip upload validation Translate technical ZipValidator errors into plain language that non-developer researchers can understand and act on. Groups related errors (e.g. multiple forbidden files), strips jargon (no "symlink", "MIME type", "path traversal"), and adds non-blocking warnings for skipped files, large images, and high file counts. Co-Authored-By: Claude Opus 4.6 (1M context) --- app/templates/partials/upload_form.html | 6 + app/upload/__init__.py | 10 +- app/upload/zip_errors.py | 376 ++++++++++++++++++++++++ static/css/main.css | 42 ++- tests/test_zip_errors.py | 370 +++++++++++++++++++++++ 5 files changed, 802 insertions(+), 2 deletions(-) create mode 100644 app/upload/zip_errors.py create mode 100644 tests/test_zip_errors.py diff --git a/app/templates/partials/upload_form.html b/app/templates/partials/upload_form.html index cbd1fa2..69498bf 100644 --- a/app/templates/partials/upload_form.html +++ b/app/templates/partials/upload_form.html @@ -36,6 +36,12 @@

Upload New Scroll

{% endif %} +{% if warnings %} +
+ {{ warnings|safe }} +
+{% endif %} +
{{ form_input("Title", "title", "text", form_data.title if form_data else "", required=true, help_text="A clear, descriptive title for your research scroll") }} diff --git a/app/upload/__init__.py b/app/upload/__init__.py index 6b5b892..8829fa6 100644 --- a/app/upload/__init__.py +++ b/app/upload/__init__.py @@ -2,5 +2,13 @@ from .processors import HTMLProcessor from .validators import FileValidator +from .zip_errors import ZipUploadResult, translate_zip_errors +from .zip_validator import ZipValidator -__all__ = ["HTMLProcessor", "FileValidator"] +__all__ = [ + "HTMLProcessor", + "FileValidator", + "ZipUploadResult", + "ZipValidator", + "translate_zip_errors", +] diff --git a/app/upload/zip_errors.py b/app/upload/zip_errors.py new file mode 100644 index 0000000..fc570d7 --- /dev/null +++ b/app/upload/zip_errors.py @@ -0,0 +1,376 @@ +"""Researcher-friendly error messages for zip upload validation failures. + +Translates technical ZipValidator errors into plain-language messages that +researchers (who may not be web developers) can understand and act on. +""" + +from dataclasses import dataclass, field +import html +import re + +# Threshold for "many files" warning +HIGH_FILE_COUNT_THRESHOLD = 200 + +# Threshold for "large image" warning (10MB) +LARGE_IMAGE_THRESHOLD = 10 * 1024 * 1024 + + +@dataclass +class ZipUploadResult: + """Structured result of zip upload validation with friendly messages.""" + + errors: list[str] = field(default_factory=list) + warnings: list[str] = field(default_factory=list) + skipped_files: list[str] = field(default_factory=list) + + @property + def has_errors(self) -> bool: + return len(self.errors) > 0 + + @property + def has_warnings(self) -> bool: + return len(self.warnings) > 0 + + def format_errors_html(self) -> str: + if not self.errors: + return "" + escaped = [html.escape(e) for e in self.errors] + items = "".join(f"
  • {e}
  • " for e in escaped) + return ( + f"We found some problems with your archive:" + f'
      {items}
    ' + ) + + def format_warnings_html(self) -> str: + if not self.warnings: + return "" + escaped = [html.escape(w) for w in self.warnings] + items = "".join(f"
  • {w}
  • " for w in escaped) + return f'
      {items}
    ' + + +# --- Pattern matchers for raw error classification --- + +_ARCHIVE_SIZE_RE = re.compile(r"Archive size \((.+?)\) exceeds the (\d+)MB limit") +_UNCOMPRESSED_SIZE_RE = re.compile(r"Total uncompressed size \((.+?)\) exceeds the (\d+)MB limit") +_FILE_COUNT_RE = re.compile(r"Archive contains (\d+) files, maximum is (\d+)") +_DISALLOWED_EXT_RE = re.compile(r"File '(.+?)' has disallowed extension '(.+?)'") +_NESTED_ARCHIVE_RE = re.compile(r"File '(.+?)' is a nested archive") +_COMPRESSION_RATIO_RE = re.compile(r"File '(.+?)' has suspicious compression ratio") +_SVG_FORBIDDEN_TAG_RE = re.compile(r"SVG '(.+?)' contains forbidden <(.+?)> element") +_SVG_EVENT_HANDLER_RE = re.compile( + r"SVG '(.+?)' contains forbidden event handler attribute '(.+?)'" +) +_SVG_DANGEROUS_HREF_RE = re.compile(r"SVG '(.+?)' contains dangerous href") +_HTML_VALIDATION_RE = re.compile(r"HTML validation error in '(.+?)': (.+)") +_CONTENT_VALIDATION_RE = re.compile(r"Content validation error in '(.+?)': (.+)") +_MIME_SPOOFING_RE = re.compile(r"File '(.+?)' has extension '.+?' but MIME type is '.+?'") +_FILE_SIZE_LIMIT_RE = re.compile(r"File '(.+?)' \((.+?)\) exceeds the (\d+)MB size limit") +_PATH_TRAVERSAL_RE = re.compile(r"Path traversal detected") +_ABSOLUTE_PATH_RE = re.compile(r"Absolute path detected") +_NULL_BYTE_RE = re.compile(r"Null byte detected") +_BACKSLASH_RE = re.compile(r"Backslash detected in path") +_PATH_RESOLVES_RE = re.compile(r"resolves outside extraction directory") +_SYMLINK_RE = re.compile(r"Symlink detected") +_FILENAME_CHARS_RE = re.compile(r"Filename '.+?' contains invalid characters") +_FILENAME_LENGTH_RE = re.compile(r"Filename component '.+?' exceeds") +_RESERVED_NAME_RE = re.compile(r"uses reserved name") +_UTF8_RE = re.compile(r"File '(.+?)' is not valid UTF-8") +_HTML_UTF8_RE = re.compile(r"HTML file '(.+?)' is not valid UTF-8") +_NO_HTML_RE = re.compile(r"Archive must contain at least one .html file") +_CORRUPT_RE = re.compile(r"not a valid zip archive|Could not open archive") +_NOT_FOUND_RE = re.compile(r"Archive file not found") +_SVG_PARSE_RE = re.compile(r"SVG file '(.+?)' could not be parsed") + + +def translate_zip_errors( + raw_errors: list[str], + *, + skipped_files: list[str] | None = None, + large_files: list[tuple[str, int]] | None = None, + file_count: int | None = None, +) -> ZipUploadResult: + """Translate raw ZipValidator errors into researcher-friendly messages. + + Groups related errors (e.g. multiple forbidden file types) into single messages + so the researcher gets a clear, actionable summary. + + Args: + raw_errors: Error strings from ZipValidator.validate() + skipped_files: Hidden/metadata files that were skipped (for warning) + large_files: List of (filename, size_bytes) for large image files (for warning) + file_count: Total file count in archive (for warning if high) + """ + result = ZipUploadResult() + + # Buckets for grouping related errors + forbidden_files: list[str] = [] + nested_archives: list[str] = [] + path_issues: list[str] = [] + svg_issues: list[tuple[str, str]] = [] # (filename, description) + html_issues: list[tuple[str, str]] = [] # (filename, detail) + content_issues: list[tuple[str, str]] = [] + file_size_issues: list[tuple[str, str]] = [] # (filename, category) + mime_issues: list[str] = [] # filenames + utf8_issues: list[str] = [] # filenames + + for err in raw_errors: + # Archive-level: corrupt + if _CORRUPT_RE.search(err): + result.errors.append( + "We couldn't read your archive. Please try creating a new zip file " + "from your project folder." + ) + continue + + # Archive-level: not found + if _NOT_FOUND_RE.search(err): + result.errors.append("We couldn't find the uploaded file. Please try uploading again.") + continue + + # Archive size + m = _ARCHIVE_SIZE_RE.search(err) + if m: + size, limit = m.group(1), m.group(2) + result.errors.append( + f"Your archive is {size}, which exceeds our {limit} MB limit. " + "Large files are usually images or data -- try compressing images " + "or hosting large datasets externally." + ) + continue + + # Uncompressed size + m = _UNCOMPRESSED_SIZE_RE.search(err) + if m: + size, limit = m.group(1), m.group(2) + result.errors.append( + f"Your archive is too large when extracted ({size}, limit {limit} MB). " + "Try compressing images or removing unnecessary files." + ) + continue + + # File count + m = _FILE_COUNT_RE.search(err) + if m: + count, maximum = m.group(1), m.group(2) + result.errors.append( + f"Your archive contains {count} files, which is more than our limit " + f"of {maximum}. Please reduce the number of files or remove " + "unnecessary ones." + ) + continue + + # No HTML + if _NO_HTML_RE.search(err): + result.errors.append( + "Your archive doesn't contain an HTML file. Press publishes " + "HTML research papers -- please include your paper as an .html file." + ) + continue + + # Disallowed extension -> group + m = _DISALLOWED_EXT_RE.search(err) + if m: + forbidden_files.append(m.group(1)) + continue + + # Nested archive -> group + m = _NESTED_ARCHIVE_RE.search(err) + if m: + nested_archives.append(m.group(1)) + continue + + # Compression ratio (zip bomb) + if _COMPRESSION_RATIO_RE.search(err): + result.errors.append( + "Your archive has an unusual compression pattern. " + "Please create a standard zip file from your project folder." + ) + continue + + # SVG forbidden tag + m = _SVG_FORBIDDEN_TAG_RE.search(err) + if m: + svg_issues.append((m.group(1), f"contains a <{m.group(2)}> element")) + continue + + # SVG event handler + m = _SVG_EVENT_HANDLER_RE.search(err) + if m: + svg_issues.append((m.group(1), "contains interactive code")) + continue + + # SVG dangerous href + m = _SVG_DANGEROUS_HREF_RE.search(err) + if m: + svg_issues.append((m.group(1), "contains interactive code")) + continue + + # SVG parse error + m = _SVG_PARSE_RE.search(err) + if m: + svg_issues.append((m.group(1), "could not be read")) + continue + + # HTML validation error + m = _HTML_VALIDATION_RE.search(err) + if m: + html_issues.append((m.group(1), m.group(2))) + continue + + # Content validation error + m = _CONTENT_VALIDATION_RE.search(err) + if m: + content_issues.append((m.group(1), m.group(2))) + continue + + # MIME type spoofing + m = _MIME_SPOOFING_RE.search(err) + if m: + mime_issues.append(m.group(1)) + continue + + # Per-file size limit + m = _FILE_SIZE_LIMIT_RE.search(err) + if m: + file_size_issues.append((m.group(1), m.group(2))) + continue + + # Path safety issues -> group + if any( + p.search(err) + for p in [ + _PATH_TRAVERSAL_RE, + _ABSOLUTE_PATH_RE, + _NULL_BYTE_RE, + _BACKSLASH_RE, + _PATH_RESOLVES_RE, + ] + ): + path_issues.append(err) + continue + + # Symlinks -> path issue + if _SYMLINK_RE.search(err): + path_issues.append(err) + continue + + # Filename issues -> path issue + if any( + p.search(err) for p in [_FILENAME_CHARS_RE, _FILENAME_LENGTH_RE, _RESERVED_NAME_RE] + ): + path_issues.append(err) + continue + + # UTF-8 issues + m = _UTF8_RE.search(err) or _HTML_UTF8_RE.search(err) + if m: + utf8_issues.append(m.group(1)) + continue + + # Fallback: pass through unknown errors + result.errors.append(err) + + # Emit grouped messages + + if forbidden_files: + file_list = ", ".join(forbidden_files[:5]) + extra = f" (and {len(forbidden_files) - 5} more)" if len(forbidden_files) > 5 else "" + result.errors.append( + f"Your archive contains files we can't host: {file_list}{extra}. " + "Please remove executable or archive files and re-upload. " + "HTML, CSS, JavaScript, images, fonts, and data files are all fine." + ) + + if nested_archives: + file_list = ", ".join(nested_archives[:5]) + extra = f" (and {len(nested_archives) - 5} more)" if len(nested_archives) > 5 else "" + result.errors.append( + f"Your archive contains other archives: {file_list}{extra}. " + "Please zip your project folder directly -- don't zip a zip." + ) + + if path_issues: + result.errors.append( + "Some files in your archive have unusual paths. " + "Please re-create the zip from your project folder directly." + ) + + if svg_issues: + # Group by filename + seen = {} + for filename, desc in svg_issues: + if filename not in seen: + seen[filename] = desc + for filename, desc in seen.items(): + result.errors.append( + f"One of your SVG images ({filename}) {desc}, which we can't " + "allow for security reasons. Please use a static SVG or convert to PNG." + ) + + if html_issues: + # Group by file + by_file: dict[str, list[str]] = {} + for filename, detail in html_issues: + by_file.setdefault(filename, []).append(detail) + for filename, details in by_file.items(): + if len(details) == 1: + result.errors.append( + f"Your HTML file ({filename}) has content we can't accept: {details[0]}" + ) + else: + summary = "; ".join(details[:3]) + extra = f" (and {len(details) - 3} more issues)" if len(details) > 3 else "" + result.errors.append( + f"Your HTML file ({filename}) has content we can't accept: {summary}{extra}" + ) + + if content_issues: + for filename, detail in content_issues: + result.errors.append(f"Your HTML file ({filename}) has a content issue: {detail}") + + if mime_issues: + file_list = ", ".join(mime_issues[:3]) + result.errors.append( + f"Some files don't match their extension ({file_list}). " + "Please make sure files have the correct extension for their type." + ) + + if file_size_issues: + for filename, category in file_size_issues: + result.errors.append( + f"The file '{filename}' is too large for its type ({category}). " + "Please reduce the file size or remove it." + ) + + if utf8_issues: + file_list = ", ".join(utf8_issues[:3]) + result.errors.append( + f"Some text files are not UTF-8 encoded ({file_list}). " + "Please save them with UTF-8 encoding." + ) + + # --- Warnings --- + + if skipped_files: + count = len(skipped_files) + result.skipped_files = skipped_files + result.warnings.append( + f"{count} file{'s were' if count != 1 else ' was'} skipped " + "(macOS metadata, hidden files) -- this is normal." + ) + + if large_files: + names = ", ".join(name for name, _ in large_files[:3]) + extra = f" (and {len(large_files) - 3} more)" if len(large_files) > 3 else "" + result.warnings.append( + f"Some image files are large (over 10 MB): {names}{extra}. " + "This may affect loading times for readers." + ) + + if file_count is not None and file_count >= HIGH_FILE_COUNT_THRESHOLD: + result.warnings.append( + f"Your archive has {file_count} files. Large archives may take longer to load." + ) + + return result diff --git a/static/css/main.css b/static/css/main.css index eeac00b..9ea48ee 100644 --- a/static/css/main.css +++ b/static/css/main.css @@ -28,6 +28,9 @@ --error-border: #fecaca; --highlight-bg: #fef3c7; --highlight-text: #92400e; + --warning-bg: #fffbeb; + --warning-border: #fde68a; + --warning-text: #92400e; } /* Dark theme colors */ @@ -57,6 +60,9 @@ --error-border: #991b1b; --highlight-bg: #92400e; --highlight-text: #fcd34d; + --warning-bg: #451a03; + --warning-border: #78350f; + --warning-text: #fcd34d; } /* Auto dark mode based on browser preference */ @@ -87,6 +93,9 @@ --error-border: #991b1b; --highlight-bg: #92400e; --highlight-text: #fcd34d; + --warning-bg: #451a03; + --warning-border: #78350f; + --warning-text: #fcd34d; } } @@ -582,6 +591,26 @@ button.btn, a.btn { .form-errors li:last-child { margin-bottom: 0; } +.form-warnings { + background: var(--warning-bg); + border: 1px solid var(--warning-border); + border-radius: var(--border-radius); + padding: var(--space-md); + margin-bottom: var(--space-lg); +} +.form-warnings ul { + list-style: none; + margin: 0; + padding: 0; +} +.form-warnings li { + color: var(--warning-text); + font-size: var(--text-sm); + margin-bottom: var(--space-xs); +} +.form-warnings li:last-child { + margin-bottom: 0; +} .success-message { background: var(--success-bg); border: 1px solid var(--success-border); @@ -695,7 +724,18 @@ button.btn, a.btn { font-size: var(--mobile-text-sm); margin-bottom: var(--mobile-space-xs); } - + + .form-warnings { + padding: var(--mobile-space-lg); + margin-bottom: var(--mobile-space-lg); + border-radius: var(--border-radius); + } + + .form-warnings li { + font-size: var(--mobile-text-sm); + margin-bottom: var(--mobile-space-xs); + } + .success-message { padding: var(--mobile-space-xl); border-radius: var(--border-radius); diff --git a/tests/test_zip_errors.py b/tests/test_zip_errors.py new file mode 100644 index 0000000..71baee1 --- /dev/null +++ b/tests/test_zip_errors.py @@ -0,0 +1,370 @@ +"""Tests for researcher-friendly zip upload error messages.""" + +import io +import stat +import struct +import zipfile +import zlib + +from app.upload.zip_errors import ( + ZipUploadResult, + translate_zip_errors, +) +from app.upload.zip_validator import ZipValidator + + +def _make_zip( + files: dict[str, bytes | str], *, symlinks: list[tuple[str, str]] | None = None +) -> bytes: + buf = io.BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + for name, content in files.items(): + if isinstance(content, str): + content = content.encode("utf-8") + zf.writestr(name, content) + if symlinks: + for link_name, target in symlinks: + info = zipfile.ZipInfo(link_name) + info.external_attr = (stat.S_IFLNK | 0o777) << 16 + zf.writestr(info, target.encode("utf-8")) + return buf.getvalue() + + +def _minimal_png() -> bytes: + signature = b"\x89PNG\r\n\x1a\n" + + def _chunk(chunk_type: bytes, data: bytes) -> bytes: + c = chunk_type + data + return struct.pack(">I", len(data)) + c + struct.pack(">I", zlib.crc32(c) & 0xFFFFFFFF) + + ihdr_data = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0) + ihdr = _chunk(b"IHDR", ihdr_data) + raw_row = b"\x00\xff\xff\xff" + idat = _chunk(b"IDAT", zlib.compress(raw_row)) + iend = _chunk(b"IEND", b"") + return signature + ihdr + idat + iend + + +MINIMAL_PNG = _minimal_png() + +MINIMAL_HTML = """ +Test +

    This is a test academic paper with enough words to pass content validation. +We need at least one hundred words so let us keep writing some more filler text here. +The quick brown fox jumps over the lazy dog multiple times in this paragraph. +Research shows that academic papers need structure and content to be meaningful. +This document serves as a minimal valid HTML file for testing the zip validator. +Additional sentences help us reach the word count threshold for validation. +More text follows to ensure we have sufficient content for the validator. +The final sentence in this paragraph completes our minimal test document.

    """ + + +class TestTranslateZipErrors: + """Test that technical errors become researcher-friendly messages.""" + + def test_no_html_found(self): + raw_errors = ["Archive must contain at least one .html file."] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert "HTML" in result.errors[0] + assert "include" in result.errors[0].lower() or ".html" in result.errors[0] + + def test_archive_too_large(self): + raw_errors = ["Archive size (55.3MB) exceeds the 50MB limit."] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert "55.3 MB" in result.errors[0] or "55.3MB" in result.errors[0] + assert "50 MB" in result.errors[0] or "50MB" in result.errors[0] + + def test_uncompressed_too_large(self): + raw_errors = ["Total uncompressed size (250MB) exceeds the 200MB limit."] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert ( + "extracted" in result.errors[0].lower() or "uncompressed" in result.errors[0].lower() + ) + + def test_corrupt_archive(self): + raw_errors = ["File is not a valid zip archive (corrupt or wrong format)."] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert "read" in result.errors[0].lower() or "new zip" in result.errors[0].lower() + + def test_forbidden_files_grouped(self): + raw_errors = [ + "File 'payload.exe' has disallowed extension '.exe'. Allowed: ...", + "File 'script.bat' has disallowed extension '.bat'. Allowed: ...", + "File 'run.sh' has disallowed extension '.sh'. Allowed: ...", + ] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert "payload.exe" in result.errors[0] + assert "script.bat" in result.errors[0] + assert "run.sh" in result.errors[0] + + def test_nested_archives_grouped(self): + raw_errors = [ + "File 'inner.zip' is a nested archive. Nested archives are not allowed.", + "File 'data.tar.gz' is a nested archive. Nested archives are not allowed.", + ] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert "inner.zip" in result.errors[0] + assert "data.tar.gz" in result.errors[0] + + def test_path_issues(self): + raw_errors = [ + "Path traversal detected in '../../../etc/passwd'. Paths containing '..' are not allowed." + ] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + # Should not expose the path traversal attempt detail + assert ( + "unusual paths" in result.errors[0].lower() or "re-create" in result.errors[0].lower() + ) + + def test_too_many_files(self): + raw_errors = ["Archive contains 501 files, maximum is 500."] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert "500" in result.errors[0] + + def test_zip_bomb_detected(self): + raw_errors = [ + "File 'bomb.html' has suspicious compression ratio (150:1). Maximum allowed is 100:1." + ] + result = translate_zip_errors(raw_errors) + assert len(result.errors) == 1 + assert ( + "compression" in result.errors[0].lower() or "standard zip" in result.errors[0].lower() + ) + + def test_svg_security(self): + raw_errors = ["SVG 'image.svg' contains forbidden "], + warnings=[], + ) + html = result.format_errors_html() + assert "