Skip to content

Commit 7258fec

Browse files
committed
Add codes analysis comparison with both default codes and current codes
1 parent 5ae1cb6 commit 7258fec

File tree

1 file changed

+62
-12
lines changed

1 file changed

+62
-12
lines changed

cli/src/analyze/codes.rs

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ where
7878
let mut stats = Default::default();
7979
let has_ef = std::fs::metadata(args.src.with_extension("ef")).is_ok_and(|x| x.is_file());
8080

81+
// Load the compression flags from the properties file so we can compare them
82+
let (_, _, comp_flags) = parse_properties::<E>(args.src.with_extension(PROPERTIES_EXTENSION))?;
83+
8184
if has_ef {
8285
log::info!(
8386
"Analyzing codes in parallel using {} threads",
@@ -163,8 +166,41 @@ where
163166
});
164167
}
165168

169+
println!("Default codes");
170+
compare_codes(&stats, CompFlags::default(), args.top_k);
171+
172+
print!("\n\n\n");
173+
174+
println!("Current codes");
175+
compare_codes(&stats, comp_flags, args.top_k);
176+
177+
Ok(())
178+
}
179+
180+
/// Get the size in bits used by a given code.
181+
/// This should go in dsi-bitstream eventually.
182+
fn get_size_by_code(stats: &CodesStats, code: Codes) -> Option<u64> {
183+
match code {
184+
Codes::Unary => Some(stats.unary),
185+
Codes::Gamma => Some(stats.gamma),
186+
Codes::Delta => Some(stats.delta),
187+
Codes::Omega => Some(stats.omega),
188+
Codes::VByteBe | Codes::VByteLe => Some(stats.vbyte),
189+
Codes::Zeta(k) => stats.zeta.get(k - 1).copied(),
190+
Codes::Golomb(b) => stats.golomb.get(b as usize - 1).copied(),
191+
Codes::ExpGolomb(k) => stats.exp_golomb.get(k).copied(),
192+
Codes::Rice(k) => stats.rice.get(k).copied(),
193+
Codes::Pi(0) => Some(stats.gamma), // Pi(0) is Gamma
194+
Codes::Pi(1) => Some(stats.zeta[1]), // Pi(1) is Zeta(2)
195+
Codes::Pi(k) => stats.pi.get(k - 2).copied(),
196+
_ => unreachable!("Code {:?} not supported", code),
197+
}
198+
}
199+
200+
/// Print the statistics of how much the optimal codes improve over the reference ones.
201+
pub fn compare_codes(stats: &DecoderStats, reference: CompFlags, top_k: usize) {
166202
macro_rules! impl_best_code {
167-
($new_bits:expr, $old_bits:expr, $stats:expr, $($code:ident - $old:expr),*) => {
203+
($new_bits:expr, $old_bits:expr, $stats:expr, $($code:ident -> $old:expr),*) => {
168204
println!("{:>17} {:>20} {:>12} {:>10} {:>10} {:>16}",
169205
"Type", "Code", "Improvement", "Weight", "Bytes", "Bits",
170206
);
@@ -189,7 +225,7 @@ where
189225
normalize(best_size as f64 / 8.0),
190226
best_size,
191227
);
192-
for i in 1..args.top_k.min(codes.len()).max(1) {
228+
for i in 1..top_k.min(codes.len()).max(1) {
193229
let (code, size) = codes[i];
194230
let improvement = 100.0 * ($old as f64 - size as f64) / $old as f64;
195231
println!("{:>17} {:>20} {:>12.3}% {:>10.3} {:>10} {:>16}",
@@ -206,21 +242,36 @@ where
206242
};
207243
}
208244

245+
println!("Code optimization results against:");
246+
for (name, code) in [
247+
("outdegrees", reference.outdegrees),
248+
("reference offsets", reference.references),
249+
("block counts", reference.blocks),
250+
("blocks", reference.blocks),
251+
("interval counts", reference.intervals),
252+
("interval starts", reference.intervals),
253+
("interval lengths", reference.intervals),
254+
("first residuals", reference.residuals),
255+
("residuals", reference.residuals),
256+
] {
257+
println!("\t{:>18} : {:?}", name, code);
258+
}
259+
209260
let mut new_bits = 0;
210261
let mut old_bits = 0;
211262
impl_best_code!(
212263
new_bits,
213264
old_bits,
214265
stats,
215-
outdegrees - stats.outdegrees.gamma,
216-
reference_offsets - stats.reference_offsets.unary,
217-
block_counts - stats.block_counts.gamma,
218-
blocks - stats.blocks.gamma,
219-
interval_counts - stats.interval_counts.gamma,
220-
interval_starts - stats.interval_starts.gamma,
221-
interval_lens - stats.interval_lens.gamma,
222-
first_residuals - stats.first_residuals.zeta[2],
223-
residuals - stats.residuals.zeta[2]
266+
outdegrees -> get_size_by_code(&stats.outdegrees, reference.outdegrees).unwrap(),
267+
reference_offsets -> get_size_by_code(&stats.reference_offsets, reference.references).unwrap(),
268+
block_counts -> get_size_by_code(&stats.block_counts, reference.blocks).unwrap(),
269+
blocks -> get_size_by_code(&stats.blocks, reference.blocks).unwrap(),
270+
interval_counts -> get_size_by_code(&stats.interval_counts, reference.intervals).unwrap(),
271+
interval_starts -> get_size_by_code(&stats.interval_starts, reference.intervals).unwrap(),
272+
interval_lens -> get_size_by_code(&stats.interval_lens, reference.intervals).unwrap(),
273+
first_residuals -> get_size_by_code(&stats.first_residuals, reference.residuals).unwrap(),
274+
residuals -> get_size_by_code(&stats.residuals, reference.residuals).unwrap()
224275
);
225276

226277
println!();
@@ -239,7 +290,6 @@ where
239290
" Improvement: {:>15.3}%",
240291
100.0 * (old_bits - new_bits) as f64 / old_bits as f64
241292
);
242-
Ok(())
243293
}
244294

245295
fn normalize(mut value: f64) -> String {

0 commit comments

Comments
 (0)