From acc6a7e5c56a6186f55c34e90e5bcfaab687a365 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Thu, 2 Apr 2026 14:46:24 +0100 Subject: [PATCH] Fix LCFS income/weights, add --uprate-to, generate 2026/27 clean data - LCFS: wkgrossp for employment income, p047p+b3262p for SE, p048p for investment income; rescale weighta to UK household population - Add --uprate-to flag to --extract: uprate before writing clean CSVs - Update SKILL.md to document new flag --- SKILL.md | 20 ++++++++++++++ .../lcfs-income-weights-spi-was-2026.changed | 7 +++++ src/data/clean.rs | 27 ++++++++++++++++++- src/data/lcfs.rs | 17 +++++++++--- 4 files changed, 66 insertions(+), 5 deletions(-) create mode 100644 changelog.d/lcfs-income-weights-spi-was-2026.changed diff --git a/SKILL.md b/SKILL.md index d0ff2c4..f5edd53 100644 --- a/SKILL.md +++ b/SKILL.md @@ -38,6 +38,7 @@ For microdata (per-entity DataFrames): use `--output-microdata-stdout` and parse | `--output json` | Machine-readable aggregate output | | `--output-microdata-stdout` | Per-entity CSVs to stdout | | `--export-params-json` | Dump baseline parameters | +| `--uprate-to YYYY` | With `--extract`: uprate dataset to target year before writing clean CSVs | ## Data @@ -95,6 +96,25 @@ Four raw survey inputs are supported. All use the same two-step flow: `--extract **UKDS data**: LCFS (SN 9468), WAS (SN 7215), SPI (SN 9422) are all under project `ecf0b3c4-29d2-4d8a-931d-0e3773a4ac0b`. Download tab zips from UKDS MCP and unzip before extracting. +## Versioning and releasing + +Versions are managed via `pyproject.toml` (the source of truth) and towncrier-style changelog fragments in `changelog.d/`. + +- **Do not** edit `CHANGELOG.md` or `Cargo.toml` versions directly — they are updated automatically by CI. 
+- To ship a change, drop a fragment file in `changelog.d/` with the naming convention `<name>.<type>`: + +| File suffix | Semver bump | +|---|---| +| `.fixed` | patch | +| `.changed` | patch | +| `.added` | minor | +| `.removed` | minor | +| `.breaking` | major | + +Example: `changelog.d/parse-id-list-delimiters.fixed` + +The content of the file is the human-readable changelog entry. CI runs `.github/bump_version.py` to infer the bump from fragment types, update `pyproject.toml`, then `publish-git-tag.sh` to tag and release. + ## Building ``` diff --git a/changelog.d/lcfs-income-weights-spi-was-2026.changed b/changelog.d/lcfs-income-weights-spi-was-2026.changed new file mode 100644 index 0000000..6c90962 --- /dev/null +++ b/changelog.d/lcfs-income-weights-spi-was-2026.changed @@ -0,0 +1,7 @@ +Fix LCFS income columns and weights; add --uprate-to flag; generate 2026/27 clean data for FRS, LCFS, SPI, and WAS. + +LCFS loader: switch employment income to wkgrossp (weekly gross pay, well-populated), add p047p for main SE income, add p048p for investment income, and rescale weighta to UK household population (~28.3m) so weighted aggregates are correct. + +Add --uprate-to flag to --extract mode, allowing raw survey data to be extracted and uprated to a target fiscal year in one step (e.g. --frs raw/ --year 2023 --uprate-to 2026 --extract data/frs/2026/). + +Update SKILL.md to document --uprate-to and the UKDS project ID for LCFS/WAS/SPI downloads. 
diff --git a/src/data/clean.rs b/src/data/clean.rs index fb5cfd1..aed6831 100644 --- a/src/data/clean.rs +++ b/src/data/clean.rs @@ -687,7 +687,32 @@ fn parse_id_list(s: &str) -> Vec { if s.is_empty() { return Vec::new(); } - s.split(';').filter_map(|x| x.trim().parse::().ok()).collect() + s.split(|c| c == ';' || c == ',').filter_map(|x| x.trim().parse::().ok()).collect() +} + +#[cfg(test)] +mod tests { + use super::parse_id_list; + + #[test] + fn parse_id_list_semicolons() { + assert_eq!(parse_id_list("0;1;2"), vec![0, 1, 2]); + } + + #[test] + fn parse_id_list_commas() { + assert_eq!(parse_id_list("0,1"), vec![0, 1]); + } + + #[test] + fn parse_id_list_single() { + assert_eq!(parse_id_list("3"), vec![3]); + } + + #[test] + fn parse_id_list_empty() { + assert_eq!(parse_id_list(""), Vec::::new()); + } } fn parse_region(s: &str) -> Region { diff --git a/src/data/lcfs.rs b/src/data/lcfs.rs index 5ec38a9..0ed674f 100644 --- a/src/data/lcfs.rs +++ b/src/data/lcfs.rs @@ -37,7 +37,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result { let person_table = load_table_cols(data_dir, &person_file, Some(&[ "case", "person", "a003", "a004", "a002", // age (two variants), sex - "b303p", "b3262p", // employment income, self-employment income + "wkgrossp", // weekly gross pay (employee, well-populated) + "p047p", "b3262p", // SE income: main job, subsidiary job + "p048p", // investment income (weekly) "b3381", "p049p", // state pension, private pension income ]))?; @@ -48,13 +50,19 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result { persons_by_case.entry(case).or_default().push(row); } + // weighta is a design weight summing to roughly the sample size (~28,000-30,000). + // Rescale to UK household population (~28.3m) so that weighted sums are population totals. 
+ let weighta_sum: f64 = hh_table.iter().map(|r| get_f64(r, "weighta").max(0.0)).sum(); + const UK_HOUSEHOLDS: f64 = 28_300_000.0; + let weight_scale = if weighta_sum > 0.0 { UK_HOUSEHOLDS / weighta_sum } else { 1.0 }; + let mut people = Vec::new(); let mut benunits = Vec::new(); let mut households = Vec::new(); for hh_row in &hh_table { let case = get_i64(hh_row, "case"); - let weight = get_f64(hh_row, "weighta"); + let weight = get_f64(hh_row, "weighta") * weight_scale; if weight <= 0.0 { continue; } let region = region_from_gvtregno(get_i64(hh_row, "gorx")); @@ -96,8 +104,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result { is_benunit_head: is_head, is_household_head: is_head, is_in_scotland: region.is_scotland(), - employment_income: get_f64(prow, "b303p").max(0.0) * WEEKS_IN_YEAR, - self_employment_income: get_f64(prow, "b3262p").max(0.0) * WEEKS_IN_YEAR, + employment_income: get_f64(prow, "wkgrossp").max(0.0) * WEEKS_IN_YEAR, + self_employment_income: (get_f64(prow, "p047p") + get_f64(prow, "b3262p")).max(0.0) * WEEKS_IN_YEAR, + savings_interest_income: get_f64(prow, "p048p").max(0.0) * WEEKS_IN_YEAR, state_pension: get_f64(prow, "b3381").max(0.0) * WEEKS_IN_YEAR, pension_income: get_f64(prow, "p049p").max(0.0) * WEEKS_IN_YEAR, // Allocate total household benefit income to head as passthrough