382 changes: 382 additions & 0 deletions dev/generate_srs_registry.py
@@ -0,0 +1,382 @@
#!/usr/bin/env python3

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Generate the Spatial Reference System (SRS) registry for Apache Spark.

Downloads CRS definitions from the PROJ coordinate transformation library's
GitHub repository and generates a CSV file used by Spark for geospatial
types on both the JVM and Python sides.

PROJ is a C/C++ library (https://proj.org/) that bundles the authoritative
EPSG and ESRI CRS databases in its source tree. This script extracts SRID
metadata from PROJ's SQL source files, which store the SRS information as
easily parseable plain text.

The script produces entries from the following PROJ SQL files:
- geodetic_crs.sql (EPSG geodetic CRS: geographic, geocentric, etc.)
- projected_crs.sql (EPSG projected CRS)
- compound_crs.sql (EPSG compound CRS)
- vertical_crs.sql (EPSG vertical CRS)
- engineering_crs.sql (EPSG engineering CRS)
- esri.sql (ESRI geodetic, projected, compound, vertical, engineering CRS)
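
For illustration, a typical geodetic_crs INSERT line (abridged; the real
rows carry more trailing columns) looks like:

    INSERT INTO "geodetic_crs" VALUES('EPSG','4326','WGS 84',NULL,'geographic 2D',...);

The parsers below rely only on the authority (field 0), the code (field 1),
and, for geodetic CRS, the type (field 4).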

Prerequisites:
Python 3.9+ (no third-party packages required)

Usage:
# Generate from the default PROJ version:
python dev/generate_srs_registry.py

# Generate from a specific PROJ version:
python dev/generate_srs_registry.py --proj-version 9.7.1

# Verify the generated files:
wc -l sql/api/src/main/resources/org/apache/spark/sql/srs_registry.csv
wc -l python/pyspark/sql/srs_registry.csv

Upgrade workflow:
1. Update `DEFAULT_PROJ_VERSION` to the new PROJ release tag.
2. Run this script using `python dev/generate_srs_registry.py`.
3. Review the diff to see which SRIDs were added or removed.
"""

import argparse
import csv
import io
import os
import re
import sys
import urllib.error
import urllib.request

# Default PROJ version to download SQL files from.
# PLEASE ENSURE THIS IS UPDATED TO A VALID PROJ VERSION TAG WHEN UPGRADING!
DEFAULT_PROJ_VERSION = "9.7.1"

# Default timeout (in seconds) for downloading SQL files from GitHub.
DEFAULT_DOWNLOAD_TIMEOUT_SECS = 30

# URL template for raw SQL files from the PROJ GitHub repository.
PROJ_RAW_URL = "https://raw.githubusercontent.com/OSGeo/PROJ/{version}/data/sql/{filename}"
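# For example, version "9.7.1" and filename "geodetic_crs.sql" expand to:
#   https://raw.githubusercontent.com/OSGeo/PROJ/9.7.1/data/sql/geodetic_crs.sql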

# PROJ SQL files to download. EPSG CRS definitions are spread across
# dedicated files, while ESRI definitions are all in a single file.
PROJ_SQL_FILES = [
"geodetic_crs.sql",
"projected_crs.sql",
"compound_crs.sql",
"vertical_crs.sql",
"engineering_crs.sql",
"esri.sql",
]

# Output paths for the generated CSV, relative to the Spark repo root.
JAVA_RESOURCE_PATH = os.path.join(
"sql", "api", "src", "main", "resources", "org", "apache", "spark", "sql", "srs_registry.csv"
)
PYTHON_RESOURCE_PATH = os.path.join("python", "pyspark", "sql", "srs_registry.csv")


def download_sql(version, filename, timeout=DEFAULT_DOWNLOAD_TIMEOUT_SECS):
"""Download a SQL file from the PROJ GitHub repository at a pinned version tag."""
url = PROJ_RAW_URL.format(version=version, filename=filename)
print(f" Downloading {url}")
try:
with urllib.request.urlopen(url, timeout=timeout) as response:
return response.read().decode("utf-8")
except urllib.error.URLError as e:
print(f"ERROR: Failed to download {url}: {e}", file=sys.stderr)
if "CERTIFICATE_VERIFY_FAILED" in str(e):
print(
"Hint: Run 'Install Certificates.command' from your Python "
"installation, or set the SSL_CERT_FILE environment variable.",
file=sys.stderr,
)
print(f"Check that PROJ version '{version}' exists as a GitHub tag.", file=sys.stderr)
sys.exit(1)


def parse_sql_values(values_str):
"""
Parse the comma-separated fields inside a SQL VALUES(...) clause.

Handles SQL-quoted strings (single quotes with '' escape for literal
apostrophes) and unquoted NULL / integer literals.

Returns a list of string values, with NULL represented as None.
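
    Illustrative example (input string modeled on a PROJ row, not verbatim):

        >>> parse_sql_values("'EPSG','4326','WGS 84',NULL,'geographic 2D'")
        ['EPSG', '4326', 'WGS 84', None, 'geographic 2D']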
"""
fields = []
i = 0
n = len(values_str)
while i < n:
if values_str[i] in (" ", "\t"):
i += 1
continue
if values_str[i] == "'":
# Quoted string: scan until closing quote ('' is an escaped quote).
i += 1
buf = []
while i < n:
if values_str[i] == "'" and i + 1 < n and values_str[i + 1] == "'":
buf.append("'")
i += 2
elif values_str[i] == "'":
i += 1
break
else:
buf.append(values_str[i])
i += 1
fields.append("".join(buf))
elif values_str[i : i + 4].upper() == "NULL":
fields.append(None)
i += 4
else:
# Unquoted literal (integer, etc.)
j = i
while j < n and values_str[j] not in (",", ")"):
j += 1
fields.append(values_str[i:j].strip())
i = j
# Skip comma separator.
while i < n and values_str[i] in (",", " ", "\t"):
if values_str[i] == ",":
i += 1
break
i += 1
return fields


def parse_geodetic_crs(sql_content):
"""
Parse geodetic_crs INSERT statements from SQL content.

The `type` field (position 4) determines whether the CRS is geographic:
'geographic 2D', 'geographic 3D' -> geographic
'geocentric', 'other' -> non-geographic

Deprecated entries are included (to match Databricks Runtime behavior).
Entries with non-numeric codes are skipped.

Returns a list of (srid, string_id, is_geographic) tuples.
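
    For example, an abridged geocentric entry such as

        INSERT INTO "geodetic_crs" VALUES('EPSG','4978','WGS 84',NULL,'geocentric',...);

    yields the tuple (4978, 'EPSG:4978', False), while a 'geographic 2D' or
    'geographic 3D' type yields is_geographic=True.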
"""
results = []
pattern = re.compile(r'INSERT INTO "geodetic_crs" VALUES\((.+)\);', re.IGNORECASE)
for line in sql_content.splitlines():
match = pattern.search(line)
if not match:
continue
fields = parse_sql_values(match.group(1))
if len(fields) < 5:
continue
auth_name = fields[0]
code = fields[1]
crs_type = fields[4]
try:
srid = int(code)
except (ValueError, TypeError):
continue
is_geographic = crs_type is not None and crs_type.startswith("geographic")
string_id = f"{auth_name}:{code}"
results.append((srid, string_id, is_geographic))
return results


def parse_simple_crs(sql_content, table_name):
"""
Parse INSERT statements for CRS tables that are always non-geographic.

Works for projected_crs, compound_crs, vertical_crs, and engineering_crs tables.
Fields used: auth_name (0), code (1).

Deprecated entries are included (to match Databricks Runtime behavior).
Entries with non-numeric codes are skipped.

Returns a list of (srid, string_id, is_geographic=False) tuples.
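
    For example, an abridged line such as

        INSERT INTO "projected_crs" VALUES('EPSG','32633','WGS 84 / UTM zone 33N',...);

    yields the tuple (32633, 'EPSG:32633', False).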
"""
results = []
pattern = re.compile(rf'INSERT INTO "{table_name}" VALUES\((.+)\);', re.IGNORECASE)
for line in sql_content.splitlines():
match = pattern.search(line)
if not match:
continue
fields = parse_sql_values(match.group(1))
if len(fields) < 2:
continue
auth_name = fields[0]
code = fields[1]
try:
srid = int(code)
except (ValueError, TypeError):
continue
string_id = f"{auth_name}:{code}"
results.append((srid, string_id, False))
return results


def parse_all_crs_from_sql(sql_content):
"""
Parse all CRS types (geodetic, projected, compound, vertical, engineering)
from a single SQL file. Used for multi-table files like esri.sql.
"""
entries = []
entries.extend(parse_geodetic_crs(sql_content))
for table in ["projected_crs", "compound_crs", "vertical_crs", "engineering_crs"]:
entries.extend(parse_simple_crs(sql_content, table))
return entries


def write_csv(entries, proj_version, output_path):
"""Write SRS entries to a CSV file with a metadata header."""
os.makedirs(os.path.dirname(output_path), exist_ok=True)
buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(["srid", "string_id", "is_geographic"])
for srid, string_id, is_geographic in sorted(entries):
writer.writerow([srid, string_id, str(is_geographic).lower()])
csv_data = buf.getvalue().replace("\r\n", "\n")
with open(output_path, "w", encoding="utf-8") as f:
f.write(
f"# Generated by dev/generate_srs_registry.py from PROJ {proj_version}\n"
f"# Source: https://github.com/OSGeo/PROJ/tree/{proj_version}/data/sql\n"
f"# Do not edit manually. Re-run the script to regenerate.\n"
)
f.write(csv_data)


def main():
parser = argparse.ArgumentParser(
description="Generate the SRS registry for Apache Spark from PROJ data."
)
parser.add_argument(
"--proj-version",
default=DEFAULT_PROJ_VERSION,
help=f"PROJ release tag to download from (default: {DEFAULT_PROJ_VERSION})",
)
parser.add_argument(
"--repo-root",
default=None,
help="Path to the Spark repository root (auto-detected if not set)",
)
parser.add_argument(
"--timeout",
type=int,
default=DEFAULT_DOWNLOAD_TIMEOUT_SECS,
help=f"Download timeout in seconds (default: {DEFAULT_DOWNLOAD_TIMEOUT_SECS})",
)
args = parser.parse_args()

# Auto-detect repo root: this script lives in dev/ under the repo root.
if args.repo_root:
repo_root = args.repo_root
else:
repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

print(f"Spark repo root: {repo_root}")
print(f"PROJ version: {args.proj_version}")
print()

# Download PROJ SQL files.
print("Downloading PROJ SQL files...")
sql_files = {}
for filename in PROJ_SQL_FILES:
sql_files[filename] = download_sql(args.proj_version, filename, args.timeout)
print()

# Parse CRS entries from EPSG-specific files.
print("Parsing CRS entries...")
all_entries = []

geodetic = parse_geodetic_crs(sql_files["geodetic_crs.sql"])
print(f" geodetic_crs.sql: {len(geodetic)} entries")
all_entries.extend(geodetic)

projected = parse_simple_crs(sql_files["projected_crs.sql"], "projected_crs")
print(f" projected_crs.sql: {len(projected)} entries")
all_entries.extend(projected)

compound = parse_simple_crs(sql_files["compound_crs.sql"], "compound_crs")
print(f" compound_crs.sql: {len(compound)} entries")
all_entries.extend(compound)

vertical = parse_simple_crs(sql_files["vertical_crs.sql"], "vertical_crs")
print(f" vertical_crs.sql: {len(vertical)} entries")
all_entries.extend(vertical)

engineering = parse_simple_crs(sql_files["engineering_crs.sql"], "engineering_crs")
print(f" engineering_crs.sql: {len(engineering)} entries")
all_entries.extend(engineering)

# Parse ESRI entries from the combined esri.sql file.
esri = parse_all_crs_from_sql(sql_files["esri.sql"])
print(f" esri.sql: {len(esri)} entries")
all_entries.extend(esri)

print()

# Deduplicate: when the same SRID appears in multiple tables or authorities,
# keep the first occurrence. Since EPSG files are parsed before ESRI, this
# gives EPSG precedence over ESRI for conflicting SRIDs.
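    # (Illustratively: if some code were present both as EPSG:<code> and as an
    # ESRI entry with the same numeric code, only the EPSG tuple would survive.)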
seen = set()
deduped = []
duplicates = 0
for entry in all_entries:
if entry[0] not in seen:
deduped.append(entry)
seen.add(entry[0])
else:
duplicates += 1
if duplicates:
print(f" Removed {duplicates} duplicate SRID(s)")
all_entries = deduped

# Count entries by authority.
authority_counts = {}
for _, string_id, _ in all_entries:
auth = string_id.split(":")[0]
authority_counts[auth] = authority_counts.get(auth, 0) + 1

n_geographic = sum(1 for _, _, g in all_entries if g)
n_nongeographic = len(all_entries) - n_geographic
print()
print(
f" Total: {len(all_entries)} entries "
f"({n_geographic} geographic, {n_nongeographic} non-geographic)"
)
print(" Breakdown by authority:")
for auth in sorted(authority_counts):
print(f" {auth}: {authority_counts[auth]}")
print()

# Write CSV to both Java and Python resource directories.
java_path = os.path.join(repo_root, JAVA_RESOURCE_PATH)
python_path = os.path.join(repo_root, PYTHON_RESOURCE_PATH)

print("Writing CSV files...")
write_csv(all_entries, args.proj_version, java_path)
print(f" {java_path}")
write_csv(all_entries, args.proj_version, python_path)
print(f" {python_path}")
print()

print("Done. Verify with:")
print(f" wc -l {JAVA_RESOURCE_PATH}")
print(f" git diff {JAVA_RESOURCE_PATH}")


if __name__ == "__main__":
main()