Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,14 @@ dist
**/.ipynb_checkpoints
.idea
.DS_Store
.firebase/*.cache
.firebase/
.firebase/*.cache
**/.DS_Store
test-results/
lib/
.env
.idea/
.vscode/
task-launcher/.venv/
task-launcher/data/
task-launcher/firestore-debug.log
32 changes: 32 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,38 @@ npm run dev
You can now locally run tasks e.g. TROG `http://localhost:8080/?task=trog`.
Task parameters are documented here (TODO linkme).

### Location Selection Save Debug (Emulator)

For testing location saves against the Firebase emulators:

1. Start the emulators from the functions repo (auth `9290`, functions `5005`, firestore `8185`, UI `4002`).
2. Open the task with debug save enabled:
`http://localhost:8080/?task=locationselection&locationSaveDebug=true`
3. Click **Save** and confirm a `locations` doc appears in the Emulator UI.

### Kontur Population Cache

The population lookup uses a local Kontur cache if available, otherwise it falls back to WorldPop.
You can point the dev server at a compressed, sparse Kontur cache stored elsewhere (e.g. GCS) by
setting one of these environment variables before starting `npm run dev`. The cache is sharded
by R5 parent cell, so the URL/path should be a *folder* containing `{r5CellId}.json.gz` files:

- `KONTUR_H3_CACHE_URL` (base URL; supports `.gz` shards)
- `KONTUR_H3_CACHE_PATH` (base folder for local shards)

#### Build the R5 shard cache

We provide a repeatable script to download the Kontur dataset and build R5 shards:

```bash
cd task-launcher
pip install h3 pyarrow
python scripts/build_kontur_r5_shards.py --download --gzip --output data/kontur-h3-r5
```

This downloads the 400m Kontur population dataset (2023-11-01 snapshot) from Kontur's public
bucket and requires `ogr2ogr` (GDAL) to convert the GeoPackage into Parquet/CSV for streaming.

Task details:

1. [Matrix Reasoning](https://hs-levante-assessment-dev.web.app/?task=matrix-reasoning) [George]
Expand Down
22 changes: 0 additions & 22 deletions task-launcher/.firebase/hosting.ZGlzdA.cache

This file was deleted.

3 changes: 3 additions & 0 deletions task-launcher/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
.venv/
data/
firestore-debug.log
node_modules
dist
**/.Rhistory
Expand Down
6 changes: 3 additions & 3 deletions task-launcher/firebase.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
"emulators": {
"auth": {
"host": "127.0.0.1",
"port": 9199
"port": 9290
},
"firestore": {
"host": "127.0.0.1",
"port": 8180
"port": 8185
},
"functions": {
"host": "127.0.0.1",
"port": 5002
"port": 5005
},
"ui": {
"host": "127.0.0.1",
Expand Down
35 changes: 12 additions & 23 deletions task-launcher/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions task-launcher/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
"@sentry/browser": "^8.7.0",
"cypress-real-events": "^1.13.0",
"fscreen": "^1.2.0",
"h3-js": "^4.4.0",
"i18next": "^22.4.15",
"i18next-browser-languagedetector": "^7.0.1",
"jspsych": "^7.2.1",
Expand Down
229 changes: 229 additions & 0 deletions task-launcher/scripts/build_kontur_r5_shards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""Build an R5-sharded Kontur H3 population cache.

Streams (h3, population) rows from a Kontur dataset (Parquet or CSV),
aggregates population at several H3 resolutions, and writes one JSON
shard per resolution-5 parent cell into an output folder.
"""
import argparse
import csv
import gzip
import json
import os
import shutil
import subprocess
import sys
import urllib.request
from collections import OrderedDict

# h3 is required for all parent-cell arithmetic; fail fast with an install hint.
try:
    import h3
except ImportError as exc:
    raise SystemExit("Missing dependency: pip install h3") from exc

# pyarrow is optional: only needed to read Parquet input. When unavailable,
# `ds` stays None and the CSV code path is used instead.
try:
    import pyarrow.dataset as ds
except ImportError:
    ds = None


# Gzipped GeoPackage of the Kontur population dataset (400 m hexagons,
# 2023-11-01 snapshot) on Kontur's public S3 bucket.
DEFAULT_DATASET_URL = (
    "https://geodata-eu-central-1-kontur-public.s3.eu-central-1.amazonaws.com/"
    "kontur_datasets/kontur_population_20231101.gpkg.gz"
)


def download_file(url: str, dest_path: str) -> None:
    """Stream *url* to *dest_path*, creating parent directories as needed.

    Args:
        url: Source URL (any scheme urllib supports).
        dest_path: Destination file path; parent folders are created.
    """
    # os.path.dirname() is "" for a bare filename, and makedirs("") raises
    # FileNotFoundError — only create directories when there is a parent.
    parent = os.path.dirname(dest_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with urllib.request.urlopen(url) as response, open(dest_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)


def gunzip_file(src_path: str, dest_path: str) -> None:
    """Decompress the gzip file at *src_path* into *dest_path*.

    Args:
        src_path: Path to an existing ``.gz`` file.
        dest_path: Output path; parent folders are created as needed.
    """
    # Guard against dirname "" for bare filenames; makedirs("") raises.
    parent = os.path.dirname(dest_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with gzip.open(src_path, "rb") as src, open(dest_path, "wb") as dest:
        shutil.copyfileobj(src, dest)


def ensure_tabular_from_gpkg(gpkg_path: str, parquet_path: str, csv_path: str) -> tuple[str, str]:
    """Return ``(format, path)`` for a tabular copy of *gpkg_path*.

    Reuses an existing Parquet or CSV conversion when one is already on disk.
    Otherwise shells out to ``ogr2ogr`` (GDAL), preferring Parquet when
    pyarrow is importable (so we can read it back) and falling back to CSV
    if the Parquet conversion fails (e.g. GDAL built without that driver).

    Args:
        gpkg_path: Source GeoPackage path.
        parquet_path: Desired Parquet output path.
        csv_path: Desired CSV output path.

    Returns:
        ("parquet", parquet_path) or ("csv", csv_path).

    Raises:
        SystemExit: if conversion is needed but ogr2ogr is not on PATH.
    """
    if os.path.exists(parquet_path):
        return "parquet", parquet_path
    if os.path.exists(csv_path):
        return "csv", csv_path
    ogr2ogr = shutil.which("ogr2ogr")
    if not ogr2ogr:
        raise SystemExit("ogr2ogr not found; install GDAL to convert GPKG.")
    # dirname is "" for bare filenames, and makedirs("") raises.
    parent = os.path.dirname(parquet_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    def _convert(fmt: str, out_path: str) -> None:
        # Select only the two columns the shard builder needs.
        subprocess.check_call(
            [ogr2ogr, "-f", fmt, out_path, gpkg_path, "-select", "h3,population"]
        )

    # Parquet output is only useful if pyarrow is available to read it back.
    if ds is not None:
        try:
            _convert("Parquet", parquet_path)
            return "parquet", parquet_path
        except subprocess.CalledProcessError:
            pass  # fall through to the CSV conversion below
    _convert("CSV", csv_path)
    return "csv", csv_path


def iter_rows_from_parquet(parquet_path: str):
    """Yield ``(h3_id, population)`` pairs from a two-column Parquet file.

    Raises:
        SystemExit: if pyarrow (module-level ``ds``) is not installed.
    """
    if ds is None:
        raise SystemExit("pyarrow not available for parquet parsing.")
    source = ds.dataset(parquet_path, format="parquet")
    for record_batch in source.to_batches(columns=["h3", "population"]):
        cells = record_batch.column(0).to_pylist()
        populations = record_batch.column(1).to_pylist()
        yield from zip(cells, populations)


def iter_rows_from_csv(csv_path: str):
    """Yield ``(h3, population)`` string pairs from a headed CSV file.

    Missing columns yield ``None`` in the corresponding position.
    """
    with open(csv_path, "r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            yield record.get("h3"), record.get("population")


def merge_into(existing: dict, incoming: dict) -> dict:
    """Accumulate *incoming*'s per-cell populations into *existing*.

    Mutates *existing* in place and returns it; cells not yet present
    start from zero.
    """
    for resolution, cell_map in incoming.items():
        bucket = existing.setdefault(resolution, {})
        for cell, population in cell_map.items():
            if cell in bucket:
                bucket[cell] += population
            else:
                bucket[cell] = population
    return existing


def flush_shard(output_dir: str, r5_cell_id: str, shard_data: dict, gzip_output: bool) -> None:
    """Persist *shard_data* to ``{r5_cell_id}.json[.gz]`` in *output_dir*.

    If a shard file already exists, its populations are merged additively
    with the in-memory data before writing, so repeated flushes of the same
    shard accumulate rather than overwrite.
    """
    os.makedirs(output_dir, exist_ok=True)
    suffix = ".json.gz" if gzip_output else ".json"
    output_path = os.path.join(output_dir, r5_cell_id + suffix)
    # gzip.open and the builtin open share the "rt"/"wt" text-mode API.
    opener = gzip.open if gzip_output else open

    existing = {}
    if os.path.exists(output_path):
        with opener(output_path, "rt", encoding="utf-8") as fh:
            existing = json.load(fh)

    merged = merge_into(existing.get("resolutions", {}), shard_data)
    with opener(output_path, "wt", encoding="utf-8") as fh:
        json.dump({"resolutions": merged}, fh, separators=(",", ":"))


def build_shards(
    input_path: str,
    input_format: str,
    output_dir: str,
    resolutions: list[int],
    max_shards: int,
    gzip_output: bool,
) -> None:
    """Aggregate per-cell populations into JSON shards keyed by R5 parent.

    Streams ``(h3_id, population)`` rows from *input_path* (``"parquet"``
    or anything else → CSV), sums populations at each requested resolution,
    and groups results under each cell's resolution-5 ancestor. At most
    *max_shards* shards are buffered in memory; the least-recently-used
    shard is flushed to disk and merged back in if more rows for it arrive.

    NOTE(review): flushes merge additively into existing shard files, so
    re-running against a non-empty *output_dir* double-counts population.
    """
    # LRU buffer of in-progress shards: r5 cell id -> {str(res): {cell: pop}}.
    shard_cache: OrderedDict[str, dict] = OrderedDict()

    def get_shard(r5_cell_id: str) -> dict:
        # Cache hit: mark as most recently used.
        if r5_cell_id in shard_cache:
            shard_cache.move_to_end(r5_cell_id)
            return shard_cache[r5_cell_id]
        # Cache miss at capacity: flush the least-recently-used shard.
        if len(shard_cache) >= max_shards:
            oldest_r5, oldest_data = shard_cache.popitem(last=False)
            flush_shard(output_dir, oldest_r5, oldest_data, gzip_output)
        shard_cache[r5_cell_id] = {str(res): {} for res in resolutions}
        return shard_cache[r5_cell_id]

    if input_format == "parquet":
        row_iter = iter_rows_from_parquet(input_path)
    else:
        row_iter = iter_rows_from_csv(input_path)

    for h3_id, pop in row_iter:
        # Skip rows with missing or non-numeric values.
        if h3_id is None or pop is None:
            continue
        try:
            pop_val = float(pop)
        except (TypeError, ValueError):
            continue
        if pop_val <= 0:
            # Zero/negative population contributes nothing to any shard.
            continue
        try:
            base_resolution = h3.get_resolution(h3_id)
        except Exception:
            # Malformed H3 index: skip the row.
            continue
        try:
            r5_cell = h3.cell_to_parent(h3_id, 5)
        except Exception:
            # e.g. cell is coarser than resolution 5 — cannot shard it.
            continue

        shard = get_shard(r5_cell)
        for res in resolutions:
            if res > base_resolution:
                # Cannot aggregate to a resolution finer than the source cell.
                continue
            try:
                parent = h3.cell_to_parent(h3_id, res)
            except Exception:
                continue
            res_map = shard[str(res)]
            res_map[parent] = res_map.get(parent, 0) + pop_val

    # Flush whatever is still buffered in memory.
    for r5_cell, data in shard_cache.items():
        flush_shard(output_dir, r5_cell, data, gzip_output)


def main() -> None:
    """CLI entry point: resolve the input dataset, then build the shard cache.

    Either ``--download`` fetches and converts the default Kontur dataset,
    or ``--input`` points at an existing Parquet/CSV file (format inferred
    from the file extension).
    """
    parser = argparse.ArgumentParser(description="Build R5-sharded Kontur H3 population cache.")
    # Fix: CSV is accepted too — the old help text claimed Parquet-only.
    parser.add_argument("--input", help="Input Parquet or CSV file with h3,population columns.")
    parser.add_argument("--output", default="data/kontur-h3-r5", help="Output shard directory.")
    parser.add_argument("--resolutions", default="5,6,7", help="Comma-separated resolutions to build.")
    parser.add_argument("--max-shards", type=int, default=64, help="Max in-memory shard count.")
    parser.add_argument("--download", action="store_true", help="Download and convert dataset.")
    parser.add_argument("--gzip", action="store_true", help="Write .json.gz shards.")
    args = parser.parse_args()

    if args.download:
        # Download + extract the Kontur GeoPackage, then convert it into a
        # streamable tabular format (Parquet preferred, CSV fallback).
        raw_dir = os.path.join("data", "kontur", "raw")
        os.makedirs(raw_dir, exist_ok=True)
        gz_path = os.path.join(raw_dir, "kontur_population_20231101.gpkg.gz")
        gpkg_path = os.path.join(raw_dir, "kontur_population_20231101.gpkg")
        parquet_path = os.path.join(raw_dir, "kontur_population_20231101.parquet")
        csv_path = os.path.join(raw_dir, "kontur_population_20231101.csv")
        if not os.path.exists(gz_path):
            print(f"Downloading {DEFAULT_DATASET_URL} ...")
            download_file(DEFAULT_DATASET_URL, gz_path)
        if not os.path.exists(gpkg_path):
            print("Extracting .gpkg.gz ...")
            gunzip_file(gz_path, gpkg_path)
        input_format, input_path = ensure_tabular_from_gpkg(gpkg_path, parquet_path, csv_path)
    else:
        input_path = args.input
        # Infer format from the extension; anything non-Parquet is read as CSV.
        input_format = "parquet" if input_path and input_path.endswith(".parquet") else "csv"

    if not input_path or not os.path.exists(input_path):
        # Fix: the builder accepts Parquet or CSV, so do not claim Parquet-only.
        raise SystemExit("Input file not found. Use --input or --download.")

    resolutions = [int(x.strip()) for x in args.resolutions.split(",") if x.strip()]
    build_shards(
        input_path=input_path,
        input_format=input_format,
        output_dir=args.output,
        resolutions=resolutions,
        max_shards=max(args.max_shards, 1),  # guard against non-positive values
        gzip_output=args.gzip,
    )
    print(f"Shards written to {args.output}")


if __name__ == "__main__":
    main()
Loading