Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ For microdata (per-entity DataFrames): use `--output-microdata-stdout` and parse
| `--output json` | Machine-readable aggregate output |
| `--output-microdata-stdout` | Per-entity CSVs to stdout |
| `--export-params-json` | Dump baseline parameters |
| `--uprate-to YYYY` | With `--extract`: uprate dataset to target year before writing clean CSVs |

## Data

Expand Down Expand Up @@ -95,6 +96,25 @@ Four raw survey inputs are supported. All use the same two-step flow: `--extract

**UKDS data**: LCFS (SN 9468), WAS (SN 7215), SPI (SN 9422) are all under project `ecf0b3c4-29d2-4d8a-931d-0e3773a4ac0b`. Download tab zips from UKDS MCP and unzip before extracting.

## Versioning and releasing

Versions are managed via `pyproject.toml` (the source of truth) and towncrier-style changelog fragments in `changelog.d/`.

- **Do not** edit `CHANGELOG.md` or `Cargo.toml` versions directly — they are updated automatically by CI.
- To ship a change, drop a fragment file in `changelog.d/` with the naming convention `<slug>.<type>`:

| File suffix | Semver bump |
|---|---|
| `.fixed` | patch |
| `.changed` | patch |
| `.added` | minor |
| `.removed` | minor |
| `.breaking` | major |

Example: `changelog.d/parse-id-list-delimiters.fixed`

The content of the file is the human-readable changelog entry. CI runs `.github/bump_version.py` to infer the semver bump from the fragment types and update `pyproject.toml`, then runs `publish-git-tag.sh` to tag and release.

## Building

```
Expand Down
7 changes: 7 additions & 0 deletions changelog.d/lcfs-income-weights-spi-was-2026.changed
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Fix LCFS income columns and weights; add --uprate-to flag; generate 2026/27 clean data for FRS, LCFS, SPI, and WAS.

LCFS loader: switch employment income to wkgrossp (weekly gross pay, well-populated), add p047p for main SE income, add p048p for investment income, and rescale weighta to UK household population (~28.3m) so weighted aggregates are correct.

Add --uprate-to flag to --extract mode, allowing raw survey data to be extracted and uprated to a target fiscal year in one step (e.g. --frs raw/ --year 2023 --uprate-to 2026 --extract data/frs/2026/).

Update SKILL.md to document --uprate-to and the UKDS project ID for LCFS/WAS/SPI downloads.
27 changes: 26 additions & 1 deletion src/data/clean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,32 @@ fn parse_id_list(s: &str) -> Vec<usize> {
if s.is_empty() {
return Vec::new();
}
s.split(';').filter_map(|x| x.trim().parse::<usize>().ok()).collect()
s.split(|c| c == ';' || c == ',').filter_map(|x| x.trim().parse::<usize>().ok()).collect()
}

#[cfg(test)]
mod tests {
    use super::parse_id_list;

    /// Semicolon-delimited lists (the original format) still parse.
    #[test]
    fn parse_id_list_semicolons() {
        assert_eq!(parse_id_list("0;1;2"), vec![0, 1, 2]);
    }

    /// Comma-delimited lists are accepted as of the delimiter fix.
    #[test]
    fn parse_id_list_commas() {
        assert_eq!(parse_id_list("0,1"), vec![0, 1]);
    }

    /// Both delimiters may appear in the same string — the split closure
    /// treats ';' and ',' identically, so mixing them is valid input.
    #[test]
    fn parse_id_list_mixed_delimiters() {
        assert_eq!(parse_id_list("0;1,2"), vec![0, 1, 2]);
    }

    /// Whitespace around tokens is trimmed before parsing.
    #[test]
    fn parse_id_list_trims_whitespace() {
        assert_eq!(parse_id_list(" 4 ; 5 "), vec![4, 5]);
    }

    /// Unparseable tokens are silently skipped (filter_map + .ok()),
    /// not treated as errors — pin that contract so it can't regress silently.
    #[test]
    fn parse_id_list_skips_invalid_tokens() {
        assert_eq!(parse_id_list("1;x;2"), vec![1, 2]);
    }

    /// A single bare id needs no delimiter at all.
    #[test]
    fn parse_id_list_single() {
        assert_eq!(parse_id_list("3"), vec![3]);
    }

    /// Empty input is the documented "no ids" case and yields an empty Vec.
    #[test]
    fn parse_id_list_empty() {
        assert_eq!(parse_id_list(""), Vec::<usize>::new());
    }
}

fn parse_region(s: &str) -> Region {
Expand Down
17 changes: 13 additions & 4 deletions src/data/lcfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
let person_table = load_table_cols(data_dir, &person_file, Some(&[
"case", "person",
"a003", "a004", "a002", // age (two variants), sex
"b303p", "b3262p", // employment income, self-employment income
"wkgrossp", // weekly gross pay (employee, well-populated)
"p047p", "b3262p", // SE income: main job, subsidiary job
"p048p", // investment income (weekly)
"b3381", "p049p", // state pension, private pension income
]))?;

Expand All @@ -48,13 +50,19 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
persons_by_case.entry(case).or_default().push(row);
}

// weighta is a design weight summing to roughly the sample size (~28,000-30,000).
// Rescale to UK household population (~28.3m) so that weighted sums are population totals.
let weighta_sum: f64 = hh_table.iter().map(|r| get_f64(r, "weighta").max(0.0)).sum();
const UK_HOUSEHOLDS: f64 = 28_300_000.0;
let weight_scale = if weighta_sum > 0.0 { UK_HOUSEHOLDS / weighta_sum } else { 1.0 };

let mut people = Vec::new();
let mut benunits = Vec::new();
let mut households = Vec::new();

for hh_row in &hh_table {
let case = get_i64(hh_row, "case");
let weight = get_f64(hh_row, "weighta");
let weight = get_f64(hh_row, "weighta") * weight_scale;
if weight <= 0.0 { continue; }

let region = region_from_gvtregno(get_i64(hh_row, "gorx"));
Expand Down Expand Up @@ -96,8 +104,9 @@ pub fn load_lcfs(data_dir: &Path, fiscal_year: u32) -> anyhow::Result<Dataset> {
is_benunit_head: is_head,
is_household_head: is_head,
is_in_scotland: region.is_scotland(),
employment_income: get_f64(prow, "b303p").max(0.0) * WEEKS_IN_YEAR,
self_employment_income: get_f64(prow, "b3262p").max(0.0) * WEEKS_IN_YEAR,
employment_income: get_f64(prow, "wkgrossp").max(0.0) * WEEKS_IN_YEAR,
self_employment_income: (get_f64(prow, "p047p") + get_f64(prow, "b3262p")).max(0.0) * WEEKS_IN_YEAR,
savings_interest_income: get_f64(prow, "p048p").max(0.0) * WEEKS_IN_YEAR,
state_pension: get_f64(prow, "b3381").max(0.0) * WEEKS_IN_YEAR,
pension_income: get_f64(prow, "p049p").max(0.0) * WEEKS_IN_YEAR,
// Allocate total household benefit income to head as passthrough
Expand Down
Loading