diff --git a/changelog.d/fix-python-cli-flags.fixed b/changelog.d/fix-python-cli-flags.fixed new file mode 100644 index 0000000..4055d4e --- /dev/null +++ b/changelog.d/fix-python-cli-flags.fixed @@ -0,0 +1 @@ +Fix Python interface CLI flags: --data-dir/--clean-frs-base/--clean-frs → --data, --frs-raw → --frs, matching the binary's current flag names. Fixes all economy-wide simulations failing with "unexpected argument '--clean-frs-base' found". \ No newline at end of file diff --git a/changelog.d/gcs-lcfs-spi-was.added b/changelog.d/gcs-lcfs-spi-was.added new file mode 100644 index 0000000..9c703be --- /dev/null +++ b/changelog.d/gcs-lcfs-spi-was.added @@ -0,0 +1 @@ +Add GCS download support for LCFS, SPI, and WAS datasets. New ensure_dataset(dataset, year) and updated download_all() support all four datasets (frs, lcfs, spi, was) from gs://policyengine-uk-microdata. \ No newline at end of file diff --git a/interfaces/python/policyengine_uk_compiled/data.py b/interfaces/python/policyengine_uk_compiled/data.py index 0cef657..91c14ee 100644 --- a/interfaces/python/policyengine_uk_compiled/data.py +++ b/interfaces/python/policyengine_uk_compiled/data.py @@ -60,12 +60,15 @@ def _get_credentials() -> tuple[str, str]: return token.split(":", 1) -def ensure_year(year: int) -> Path: - """Ensure FRS data for a specific year is available locally, downloading if needed. +DATASETS = ("frs", "lcfs", "spi", "was") + + +def ensure_dataset_year(dataset: str, year: int) -> Path: + """Ensure clean CSVs for a dataset/year are available locally, downloading if needed. - Returns the path to the year directory (e.g. ~/.policyengine-uk-data/frs/2023/). + Returns the path to the year directory (e.g. ~/.policyengine-uk-data/frs/2026/). 
""" - year_dir = LOCAL_CACHE / "frs" / str(year) + year_dir = LOCAL_CACHE / dataset / str(year) expected_files = ["persons.csv", "benunits.csv", "households.csv"] if all((year_dir / f).exists() for f in expected_files): return year_dir @@ -73,7 +76,7 @@ def ensure_year(year: int) -> Path: access_key, secret_key = _get_credentials() year_dir.mkdir(parents=True, exist_ok=True) for f in expected_files: - key = f"frs/{year}/{f}" + key = f"{dataset}/{year}/{f}" dest = year_dir / f if dest.exists(): continue @@ -84,16 +87,13 @@ def ensure_year(year: int) -> Path: return year_dir -def ensure_frs(year: int, clean_frs_base: str | None = None) -> str: - """Return a path to FRS data base dir, downloading the needed year if missing. +# Keep old name for backwards compatibility +def ensure_year(year: int) -> Path: + return ensure_dataset_year("frs", year) - Args: - year: The fiscal year to ensure data for. - clean_frs_base: Explicit path. If it exists with data for this year, returned as-is. - Returns: - Path string to the FRS base directory (containing year subdirs). - """ +def ensure_frs(year: int, clean_frs_base: str | None = None) -> str: + """Return a path to FRS data base dir, downloading the needed year if missing.""" if clean_frs_base: year_dir = Path(clean_frs_base) / str(year) if year_dir.is_dir(): @@ -111,42 +111,64 @@ def ensure_frs(year: int, clean_frs_base: str | None = None) -> str: f"a directory with a {year}/ subdirectory, or set {ENV_TOKEN} to " f"auto-download from GCS." ) - ensure_year(year) + ensure_dataset_year("frs", year) + return str(local_base) + + +def ensure_dataset(dataset: str, year: int) -> str: + """Return a path to a dataset base dir, downloading the needed year if missing. + + Supports: frs, lcfs, spi, was. + """ + if dataset not in DATASETS: + raise ValueError(f"Unknown dataset {dataset!r}. 
Choose from: {DATASETS}") + + local_base = LOCAL_CACHE / dataset + year_dir = local_base / str(year) + expected = ["persons.csv", "benunits.csv", "households.csv"] + if all((year_dir / f).exists() for f in expected): + return str(local_base) + + if not os.environ.get(ENV_TOKEN): + raise FileNotFoundError( + f"No {dataset.upper()} data found for {year}. Set {ENV_TOKEN} to auto-download." + ) + ensure_dataset_year(dataset, year) return str(local_base) -def download_all(force: bool = False) -> Path: - """Download all available FRS years. Returns the base frs directory.""" +def download_all(force: bool = False, datasets: tuple = DATASETS) -> None: + """Download all available years for the given datasets (default: all).""" import re access_key, secret_key = _get_credentials() - # List all objects to discover years - keys = [] - marker = "" - while True: - path = f"/?prefix=frs/&marker={marker}" - headers = _sign_request("GET", "/", access_key, secret_key) - url = f"https://{GCS_HOST}/{GCS_BUCKET}{path}" - req = urllib.request.Request(url, headers=headers) - with urllib.request.urlopen(req) as resp: - body = resp.read().decode() - found = re.findall(r"<Key>([^<]+)</Key>", body) - if not found: - break - keys.extend(found) - if "<IsTruncated>true</IsTruncated>" not in body: - break - marker = found[-1] - - total = len(keys) - for i, key in enumerate(keys, 1): - rel = key[len("frs/"):] - if not rel: - continue - dest = LOCAL_CACHE / "frs" / rel - if dest.exists() and not force: - continue - _download_object(key, dest, access_key, secret_key) - print(f"\r Downloading frs: {i}/{total}", end="", flush=True) - print() - return LOCAL_CACHE / "frs" + for dataset in datasets: + keys = [] + marker = "" + while True: + path = f"/?prefix={dataset}/&marker={marker}" + headers = _sign_request("GET", "/", access_key, secret_key) + url = f"https://{GCS_HOST}/{GCS_BUCKET}{path}" + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req) as resp: + body = resp.read().decode() + found = 
re.findall(r"<Key>([^<]+)</Key>", body) + if not found: + break + keys.extend(found) + if "<IsTruncated>true</IsTruncated>" not in body: + break + marker = found[-1] + + total = len(keys) + for i, key in enumerate(keys, 1): + rel = key[len(f"{dataset}/"):] + if not rel: + continue + dest = LOCAL_CACHE / dataset / rel + if dest.exists() and not force: + continue + _download_object(key, dest, access_key, secret_key) + print(f"\r Downloading {dataset}: {i}/{total}", end="", flush=True) + if keys: + print() diff --git a/interfaces/python/policyengine_uk_compiled/engine.py b/interfaces/python/policyengine_uk_compiled/engine.py index 372997a..998023c 100644 --- a/interfaces/python/policyengine_uk_compiled/engine.py +++ b/interfaces/python/policyengine_uk_compiled/engine.py @@ -200,18 +200,18 @@ def _build_cmd(self, policy: Optional[Parameters] = None, extra_args: Optional[l if self._stdin_payload is not None: cmd.append("--stdin-data") elif self._data_dir: - cmd += ["--data-dir", self._data_dir] + cmd += ["--data", self._data_dir] elif self._clean_frs_base: - cmd += ["--clean-frs-base", self._clean_frs_base] + cmd += ["--data", self._clean_frs_base] elif self._clean_frs: - cmd += ["--clean-frs", self._clean_frs] + cmd += ["--data", self._clean_frs] elif self._frs_raw: - cmd += ["--frs-raw", self._frs_raw] + cmd += ["--frs", self._frs_raw] else: # No data source specified — try auto-resolving FRS data from policyengine_uk_compiled.data import ensure_frs frs_path = ensure_frs(self.year) - cmd += ["--clean-frs-base", frs_path] + cmd += ["--data", frs_path] if policy: overlay = policy.model_dump(exclude_none=True)