Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/fix-python-cli-flags.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix Python interface CLI flags: --data-dir/--clean-frs-base/--clean-frs → --data, --frs-raw → --frs, matching the binary's current flag names. Fixes all economy-wide simulations failing with "unexpected argument '--clean-frs-base' found".
1 change: 1 addition & 0 deletions changelog.d/gcs-lcfs-spi-was.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add GCS download support for LCFS, SPI, and WAS datasets. New ensure_dataset(dataset, year) and updated download_all() support all four datasets (frs, lcfs, spi, was) from gs://policyengine-uk-microdata.
114 changes: 68 additions & 46 deletions interfaces/python/policyengine_uk_compiled/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,20 +60,23 @@ def _get_credentials() -> tuple[str, str]:
return token.split(":", 1)


def ensure_year(year: int) -> Path:
"""Ensure FRS data for a specific year is available locally, downloading if needed.
# Dataset names supported for download/caching; each corresponds to a top-level
# prefix in the GCS bucket and a subdirectory of the local cache (see ensure_dataset_year).
DATASETS = ("frs", "lcfs", "spi", "was")


def ensure_dataset_year(dataset: str, year: int) -> Path:
"""Ensure clean CSVs for a dataset/year are available locally, downloading if needed.

Returns the path to the year directory (e.g. ~/.policyengine-uk-data/frs/2023/).
Returns the path to the year directory (e.g. ~/.policyengine-uk-data/frs/2026/).
"""
year_dir = LOCAL_CACHE / "frs" / str(year)
year_dir = LOCAL_CACHE / dataset / str(year)
expected_files = ["persons.csv", "benunits.csv", "households.csv"]
if all((year_dir / f).exists() for f in expected_files):
return year_dir

access_key, secret_key = _get_credentials()
year_dir.mkdir(parents=True, exist_ok=True)
for f in expected_files:
key = f"frs/{year}/{f}"
key = f"{dataset}/{year}/{f}"
dest = year_dir / f
if dest.exists():
continue
Expand All @@ -84,16 +87,13 @@ def ensure_year(year: int) -> Path:
return year_dir


def ensure_frs(year: int, clean_frs_base: str | None = None) -> str:
"""Return a path to FRS data base dir, downloading the needed year if missing.
# Keep old name for backwards compatibility
def ensure_year(year: int) -> Path:
    """Backwards-compatible alias for ensure_dataset_year("frs", year).

    Returns the local FRS year directory, downloading it if needed.
    """
    return ensure_dataset_year("frs", year)

Args:
year: The fiscal year to ensure data for.
clean_frs_base: Explicit path. If it exists with data for this year, returned as-is.

Returns:
Path string to the FRS base directory (containing year subdirs).
"""
def ensure_frs(year: int, clean_frs_base: str | None = None) -> str:
"""Return a path to FRS data base dir, downloading the needed year if missing."""
if clean_frs_base:
year_dir = Path(clean_frs_base) / str(year)
if year_dir.is_dir():
Expand All @@ -111,42 +111,64 @@ def ensure_frs(year: int, clean_frs_base: str | None = None) -> str:
f"a directory with a {year}/ subdirectory, or set {ENV_TOKEN} to "
f"auto-download from GCS."
)
ensure_year(year)
ensure_dataset_year("frs", year)
return str(local_base)


def ensure_dataset(dataset: str, year: int) -> str:
    """Return a path to a dataset base dir, downloading the needed year if missing.

    Supports: frs, lcfs, spi, was.

    Args:
        dataset: One of the names in DATASETS.
        year: The year whose clean CSVs must be present locally.

    Returns:
        Path string to the dataset base directory (containing year subdirs).

    Raises:
        ValueError: If ``dataset`` is not a known dataset name.
        FileNotFoundError: If data is missing locally and the download token
            environment variable is not set.
    """
    if dataset not in DATASETS:
        raise ValueError(f"Unknown dataset {dataset!r}. Choose from: {DATASETS}")

    base = LOCAL_CACHE / dataset
    target = base / str(year)
    # Fast path: all three clean CSVs are already cached for this year.
    required = ("persons.csv", "benunits.csv", "households.csv")
    if all((target / name).exists() for name in required):
        return str(base)

    # Without credentials we cannot download, so fail with guidance.
    if not os.environ.get(ENV_TOKEN):
        raise FileNotFoundError(
            f"No {dataset.upper()} data found for {year}. Set {ENV_TOKEN} to auto-download."
        )
    ensure_dataset_year(dataset, year)
    return str(base)


def download_all(force: bool = False) -> Path:
"""Download all available FRS years. Returns the base frs directory."""
def download_all(force: bool = False, datasets: tuple = DATASETS) -> None:
"""Download all available years for the given datasets (default: all)."""
import re
access_key, secret_key = _get_credentials()

# List all objects to discover years
keys = []
marker = ""
while True:
path = f"/?prefix=frs/&marker={marker}"
headers = _sign_request("GET", "/", access_key, secret_key)
url = f"https://{GCS_HOST}/{GCS_BUCKET}{path}"
req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req) as resp:
body = resp.read().decode()
found = re.findall(r"<Key>([^<]+)</Key>", body)
if not found:
break
keys.extend(found)
if "<IsTruncated>true</IsTruncated>" not in body:
break
marker = found[-1]

total = len(keys)
for i, key in enumerate(keys, 1):
rel = key[len("frs/"):]
if not rel:
continue
dest = LOCAL_CACHE / "frs" / rel
if dest.exists() and not force:
continue
_download_object(key, dest, access_key, secret_key)
print(f"\r Downloading frs: {i}/{total}", end="", flush=True)
print()
return LOCAL_CACHE / "frs"
for dataset in datasets:
keys = []
marker = ""
while True:
path = f"/?prefix={dataset}/&marker={marker}"
headers = _sign_request("GET", "/", access_key, secret_key)
url = f"https://{GCS_HOST}/{GCS_BUCKET}{path}"
req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req) as resp:
body = resp.read().decode()
found = re.findall(r"<Key>([^<]+)</Key>", body)
if not found:
break
keys.extend(found)
if "<IsTruncated>true</IsTruncated>" not in body:
break
marker = found[-1]

total = len(keys)
for i, key in enumerate(keys, 1):
rel = key[len(f"{dataset}/"):]
if not rel:
continue
dest = LOCAL_CACHE / dataset / rel
if dest.exists() and not force:
continue
_download_object(key, dest, access_key, secret_key)
print(f"\r Downloading {dataset}: {i}/{total}", end="", flush=True)
if keys:
print()
10 changes: 5 additions & 5 deletions interfaces/python/policyengine_uk_compiled/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,18 +200,18 @@ def _build_cmd(self, policy: Optional[Parameters] = None, extra_args: Optional[l
if self._stdin_payload is not None:
cmd.append("--stdin-data")
elif self._data_dir:
cmd += ["--data-dir", self._data_dir]
cmd += ["--data", self._data_dir]
elif self._clean_frs_base:
cmd += ["--clean-frs-base", self._clean_frs_base]
cmd += ["--data", self._clean_frs_base]
elif self._clean_frs:
cmd += ["--clean-frs", self._clean_frs]
cmd += ["--data", self._clean_frs]
elif self._frs_raw:
cmd += ["--frs-raw", self._frs_raw]
cmd += ["--frs", self._frs_raw]
else:
# No data source specified — try auto-resolving FRS data
from policyengine_uk_compiled.data import ensure_frs
frs_path = ensure_frs(self.year)
cmd += ["--clean-frs-base", frs_path]
cmd += ["--data", frs_path]

if policy:
overlay = policy.model_dump(exclude_none=True)
Expand Down
Loading