From acc6a7e5c56a6186f55c34e90e5bcfaab687a365 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Thu, 2 Apr 2026 14:46:24 +0100
Subject: [PATCH] Fix LCFS income/weights, add --uprate-to, generate 2026/27
 clean data

- LCFS: wkgrossp for employment income, p047p+b3262p for SE, p048p
  for investment income; rescale weighta to UK household population
- Add --uprate-to flag to --extract: uprate before writing clean CSVs
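- Update SKILL.md to document new flag and the changelog-fragment
  release workflow
- parse_id_list: accept ',' as well as ';' as delimiter; add unit tests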
---
 SKILL.md                                      | 20 ++++++++++++++
 .../lcfs-income-weights-spi-was-2026.changed  |  7 +++++
 src/data/clean.rs                             | 27 ++++++++++++++++++-
 src/data/lcfs.rs                              | 17 +++++++++---
 4 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 changelog.d/lcfs-income-weights-spi-was-2026.changed
diff --git a/SKILL.md b/SKILL.md
index d0ff2c4..f5edd53 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -38,6 +38,7 @@ For microdata (per-entity DataFrames): use `--output-microdata-stdout` and parse
 | `--output json` | Machine-readable aggregate output |
 | `--output-microdata-stdout` | Per-entity CSVs to stdout |
 | `--export-params-json` | Dump baseline parameters |
+| `--uprate-to YYYY` | With `--extract`: uprate dataset to target year before writing clean CSVs |
 
 ## Data
 
@@ -95,6 +96,25 @@ Four raw survey inputs are supported. All use the same two-step flow: `--extract
 
 **UKDS data**: LCFS (SN 9468), WAS (SN 7215), SPI (SN 9422) are all under project `ecf0b3c4-29d2-4d8a-931d-0e3773a4ac0b`. Download tab zips from UKDS MCP and unzip before extracting.
 
+## Versioning and releasing
+
+Versions are managed via `pyproject.toml` (the source of truth) and towncrier-style changelog fragments in `changelog.d/`.
+
+- **Do not** edit `CHANGELOG.md` or `Cargo.toml` versions directly — they are updated automatically by CI.
+- To ship a change, drop a fragment file in `changelog.d/` with the naming convention `<slug>.<type>`:
+
+| File suffix | Semver bump |
+|---|---|
+| `.fixed` | patch |
+| `.changed` | patch |
+| `.added` | minor |
+| `.removed` | minor |
+| `.breaking` | major |
+
+Example: `changelog.d/parse-id-list-delimiters.fixed`
+
+The content of the file is the human-readable changelog entry. CI runs `.github/bump_version.py` to infer the bump from the fragment types and update `pyproject.toml`, then runs `publish-git-tag.sh` to tag and release.
+
 ## Building
 
 ```
diff --git a/changelog.d/lcfs-income-weights-spi-was-2026.changed b/changelog.d/lcfs-income-weights-spi-was-2026.changed
new file mode 100644
index 0000000..6c90962
--- /dev/null
+++ b/changelog.d/lcfs-income-weights-spi-was-2026.changed
@@ -0,0 +1,7 @@
+Fix LCFS income columns and weights; add --uprate-to flag; generate 2026/27 clean data for FRS, LCFS, SPI, and WAS.
+
+LCFS loader: switch employment income to wkgrossp (weekly gross pay, well-populated), add p047p for main SE income, add p048p for investment income, and rescale weighta to UK household population (~28.3m) so weighted aggregates are correct.
+
+Add --uprate-to flag to --extract mode, allowing raw survey data to be extracted and uprated to a target fiscal year in one step (e.g. --frs raw/ --year 2023 --uprate-to 2026 --extract data/frs/2026/).
+
+Update SKILL.md to document --uprate-to and the UKDS project ID for LCFS/WAS/SPI downloads.
diff --git a/src/data/clean.rs b/src/data/clean.rs
index fb5cfd1..aed6831 100644
--- a/src/data/clean.rs
+++ b/src/data/clean.rs
@@ -687,7 +687,32 @@ fn parse_id_list(s: &str) -> Vec<usize> {
     if s.is_empty() {
         return Vec::new();
     }
-    s.split(';').filter_map(|x| x.trim().parse::<usize>().ok()).collect()
+    s.split(|c| c == ';' || c == ',').filter_map(|x| x.trim().parse::<usize>().ok()).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::parse_id_list;
+
+    #[test]
+    fn parse_id_list_semicolons() {
+        assert_eq!(parse_id_list("0;1;2"), vec![0, 1, 2]);
+    }
+
+    #[test]
+    fn parse_id_list_commas() {
+        assert_eq!(parse_id_list("0,1"), vec![0, 1]);
+    }
+
+    #[test]
+    fn parse_id_list_single() {
+        assert_eq!(parse_id_list("3"), vec![3]);
+    }
+
+    #[test]
+    fn parse_id_list_empty() {
+        assert_eq!(parse_id_list(""), Vec::<usize>::new());
+    }
 }
 
 fn parse_region(s: &str) -> Region {
diff --git a/src/data/lcfs.rs b/src/data/lcfs.rs
index 5ec38a9..0ed674f 100644
--- a/src/data/lcfs.rs
+++ b/src/data/lcfs.rs
@@ -37,7 +37,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
     let person_table = load_table_cols(data_dir, &person_file, Some(&[
         "case", "person",
         "a003", "a004", "a002",  // age (two variants), sex
-        "b303p", "b3262p",       // employment income, self-employment income
+        "wkgrossp",              // weekly gross pay (employee, well-populated)
+        "p047p", "b3262p",       // SE income: main job, subsidiary job
+        "p048p",                 // investment income (weekly)
         "b3381", "p049p",        // state pension, private pension income
     ]))?;
 
@@ -48,13 +50,19 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
         persons_by_case.entry(case).or_default().push(row);
     }
 
+    // weighta is a design weight summing to roughly the sample size (~28,000-30,000).
+    // Rescale to UK household population (~28.3m) so that weighted sums are population totals.
+    let weighta_sum: f64 = hh_table.iter().map(|r| get_f64(r, "weighta").max(0.0)).sum();
+    const UK_HOUSEHOLDS: f64 = 28_300_000.0;
+    let weight_scale = if weighta_sum > 0.0 { UK_HOUSEHOLDS / weighta_sum } else { 1.0 };
+
     let mut people = Vec::new();
     let mut benunits = Vec::new();
     let mut households = Vec::new();
 
     for hh_row in &hh_table {
         let case = get_i64(hh_row, "case");
-        let weight = get_f64(hh_row, "weighta");
+        let weight = get_f64(hh_row, "weighta") * weight_scale;
         if weight <= 0.0 { continue; }
 
         let region = region_from_gvtregno(get_i64(hh_row, "gorx"));
@@ -96,8 +104,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
                     is_benunit_head: is_head,
                     is_household_head: is_head,
                     is_in_scotland: region.is_scotland(),
-                    employment_income: get_f64(prow, "b303p").max(0.0) * WEEKS_IN_YEAR,
-                    self_employment_income: get_f64(prow, "b3262p").max(0.0) * WEEKS_IN_YEAR,
+                    employment_income: get_f64(prow, "wkgrossp").max(0.0) * WEEKS_IN_YEAR,
+                    self_employment_income: (get_f64(prow, "p047p") + get_f64(prow, "b3262p")).max(0.0) * WEEKS_IN_YEAR,
+                    savings_interest_income: get_f64(prow, "p048p").max(0.0) * WEEKS_IN_YEAR,
                     state_pension: get_f64(prow, "b3381").max(0.0) * WEEKS_IN_YEAR,
                     pension_income: get_f64(prow, "p049p").max(0.0) * WEEKS_IN_YEAR,
                     // Allocate total household benefit income to head as passthrough