From 63a2195ffdb931fa6e63cbc7930c0a16b20dd8e3 Mon Sep 17 00:00:00 2001 From: JSCU-CNI <121175071+JSCU-CNI@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:54:53 +0100 Subject: [PATCH] Add Chromium DiskCache and SimpleDiskCache implementation --- dissect/database/chromium/__init__.py | 8 + dissect/database/chromium/cache/__init__.py | 13 ++ dissect/database/chromium/cache/c_cache.py | 154 ++++++++++++ dissect/database/chromium/cache/c_cache.pyi | 170 ++++++++++++++ dissect/database/chromium/cache/c_simple.py | 87 +++++++ dissect/database/chromium/cache/c_simple.pyi | 149 ++++++++++++ dissect/database/chromium/cache/cache.py | 233 +++++++++++++++++++ dissect/database/chromium/cache/simple.py | 194 +++++++++++++++ tests/chromium/__init__.py | 0 9 files changed, 1008 insertions(+) create mode 100644 dissect/database/chromium/__init__.py create mode 100644 dissect/database/chromium/cache/__init__.py create mode 100644 dissect/database/chromium/cache/c_cache.py create mode 100644 dissect/database/chromium/cache/c_cache.pyi create mode 100644 dissect/database/chromium/cache/c_simple.py create mode 100644 dissect/database/chromium/cache/c_simple.pyi create mode 100644 dissect/database/chromium/cache/cache.py create mode 100644 dissect/database/chromium/cache/simple.py create mode 100644 tests/chromium/__init__.py diff --git a/dissect/database/chromium/__init__.py b/dissect/database/chromium/__init__.py new file mode 100644 index 0000000..fdaf54a --- /dev/null +++ b/dissect/database/chromium/__init__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +from dissect.database.chromium.cache import DiskCache, SimpleDiskCache + +__all__ = [ + "DiskCache", + "SimpleDiskCache", +] diff --git a/dissect/database/chromium/cache/__init__.py b/dissect/database/chromium/cache/__init__.py new file mode 100644 index 0000000..5fbbd3c --- /dev/null +++ b/dissect/database/chromium/cache/__init__.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from 
dissect.database.chromium.cache.c_cache import c_cache +from dissect.database.chromium.cache.c_simple import c_simple +from dissect.database.chromium.cache.cache import DiskCache +from dissect.database.chromium.cache.simple import SimpleDiskCache + +__all__ = [ + "DiskCache", + "SimpleDiskCache", + "c_cache", + "c_simple", +] diff --git a/dissect/database/chromium/cache/c_cache.py b/dissect/database/chromium/cache/c_cache.py new file mode 100644 index 0000000..6d22157 --- /dev/null +++ b/dissect/database/chromium/cache/c_cache.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from dissect.cstruct import cstruct + +# References: +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/addr.h +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/disk_format_base.h +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/disk_format.h +cache_def = """ + +/* Cache Address format. */ + +enum FileType { + EXTERNAL = 0, + RANKINGS = 1, + BLOCK_256 = 2, + BLOCK_1K = 3, + BLOCK_4K = 4, + BLOCK_FILES = 5, + BLOCK_ENTRIES = 6, + BLOCK_EVICTED = 7 +}; + +// int kMaxBlockSize = 4096 * 4; +// int16_t kMaxBlockFile = 255; +// int kMaxNumBlocks = 4; +// int16_t kFirstAdditionalBlockFile = 4; + +#define kInitializedMask 0x80000000 +#define kFileTypeMask 0x70000000 +#define kFileTypeOffset 28 +#define kReservedBitsMask 0x0c000000 +#define kNumBlocksMask 0x03000000 +#define kNumBlocksOffset 24 +#define kFileSelectorMask 0x00ff0000 +#define kFileSelectorOffset 16 +#define kStartBlockMask 0x0000FFFF +#define kFileNameMask 0x0FFFFFFF + +/* Cache types. */ + +/* Index file format. */ +typedef uint32_t CacheAddr; + +struct LruData { + int32 padding_1[2]; + int32 filled; // Flag to tell when we filled the cache. + int32 sizes[5]; + CacheAddr heads[5]; + CacheAddr tails[5]; + CacheAddr transaction; // In-flight operation target. + int32 operation; // Actual in-flight operation. 
+ int32 operation_list; // In-flight operation list. + int32 padding_2[7]; +}; + +struct IndexHeader { + uint32 magic; // 0xc3ca03c1 + uint32 version; + int32 num_entries; + int32 num_bytes_legacy; + int32 last_file; // f_###### + int32 dirty_flag; + CacheAddr stats; + int32 table_len; + int32 crash_flag; + int32 experiment_flag; + uint64 create_time; + int64 num_bytes; + int32 corruption_flag; + int32 padding[49]; + LruData lru_data; + // CacheAddr table[table_len]; // max is kIndexTablesize (0x10000) +}; + +/* Data Block File Format. */ +#define kBlockHeaderSize 8192 + +struct BlockFileHeader { + uint32 magic; // 0xc3ca04c1 + uint32 version; + int16 this_file; // Index of this file (data_#). + int16 next_file; // Next file when this one is full (data_#). + int32 entry_size; // Size of the blocks of this file. + int32 num_entries; // Number of stored entries. + int32 max_entries; // Current maximum number of entries. + int32 empty[4]; + int32 hints[4]; + int32 updating; + int32 user[5]; + // char allocation_map[4 * 2028]; + // total header should be exactly kBlockHeaderSize bytes long (8192). +}; + +/* Cache Entry Format. */ + +enum EntryState { + ENTRY_NORMAL = 0, + ENTRY_EVICTED, // The entry was recently evicted from the cache. + ENTRY_DOOMED // The entry was doomed. +}; + +enum EntryFlags { + PARENT_ENTRY = 1, // This entry has children (sparse) entries. + CHILD_ENTRY = 1 << 1 // Child entry that stores sparse data. +}; + +struct EntryStore { + uint32 hash; // Full hash of the key. + CacheAddr next; // Next entry with the same hash or bucket. + CacheAddr rankings_node; // Rankings node for this entry. + int32 reuse_count; // How often is this entry used. + int32 refetch_count; // How often is this fetched from the net. + int32 state; // Current state. + uint64 creation_time; + int32 key_len; + CacheAddr long_key; // Optional address of a long key. + + int32 data_size[4]; // We can store up to 4 data streams for + CacheAddr data_addr[4]; // each entry. 
+ + uint32 flags; // Any combination of EntryFlags. + int32 padding[4]; + uint32 self_hash; // The hash of EntryStore up to this point. + char key[256 - 24 * 4]; // null terminated +}; +""" + +c_cache = cstruct(endian="<").load(cache_def) + + +def BlockSizeForFileType(file_type: int) -> int: + if file_type == 1: # RANKINGS + return 36 + + if file_type == 2: # BLOCK_256 + return 256 + + if file_type == 3: # BLOCK_1K + return 1024 + + if file_type == 4: # BLOCK_4K + return 4096 + + if file_type == 5: # BLOCK_FILES + return 8 + + if file_type == 6: # BLOCK_ENTRIES + return 104 + + if file_type == 7: # BLOCK_EVICTED + return 48 + + raise ValueError(f"Unknown file_type {file_type!r}") diff --git a/dissect/database/chromium/cache/c_cache.pyi b/dissect/database/chromium/cache/c_cache.pyi new file mode 100644 index 0000000..1c69fbd --- /dev/null +++ b/dissect/database/chromium/cache/c_cache.pyi @@ -0,0 +1,170 @@ +# Generated by cstruct-stubgen +from typing import BinaryIO, Literal, TypeAlias, overload + +import dissect.cstruct as __cs__ + +class _c_cache(__cs__.cstruct): + kInitializedMask: Literal[2147483648] = ... + kFileTypeMask: Literal[1879048192] = ... + kFileTypeOffset: Literal[28] = ... + kReservedBitsMask: Literal[201326592] = ... + kNumBlocksMask: Literal[50331648] = ... + kNumBlocksOffset: Literal[24] = ... + kFileSelectorMask: Literal[16711680] = ... + kFileSelectorOffset: Literal[16] = ... + kStartBlockMask: Literal[65535] = ... + kFileNameMask: Literal[268435455] = ... + class FileType(__cs__.Enum): + EXTERNAL = ... + RANKINGS = ... + BLOCK_256 = ... + BLOCK_1K = ... + BLOCK_4K = ... + BLOCK_FILES = ... + BLOCK_ENTRIES = ... + BLOCK_EVICTED = ...
+ + CacheAddr: TypeAlias = _c_cache.uint32 + class LruData(__cs__.Structure): + padding_1: __cs__.Array[_c_cache.int32] + filled: _c_cache.int32 + sizes: __cs__.Array[_c_cache.int32] + heads: __cs__.Array[_c_cache.uint32] + tails: __cs__.Array[_c_cache.uint32] + transaction: _c_cache.uint32 + operation: _c_cache.int32 + operation_list: _c_cache.int32 + padding_2: __cs__.Array[_c_cache.int32] + @overload + def __init__( + self, + padding_1: __cs__.Array[_c_cache.int32] | None = ..., + filled: _c_cache.int32 | None = ..., + sizes: __cs__.Array[_c_cache.int32] | None = ..., + heads: __cs__.Array[_c_cache.uint32] | None = ..., + tails: __cs__.Array[_c_cache.uint32] | None = ..., + transaction: _c_cache.uint32 | None = ..., + operation: _c_cache.int32 | None = ..., + operation_list: _c_cache.int32 | None = ..., + padding_2: __cs__.Array[_c_cache.int32] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class IndexHeader(__cs__.Structure): + magic: _c_cache.uint32 + version: _c_cache.uint32 + num_entries: _c_cache.int32 + num_bytes_legacy: _c_cache.int32 + last_file: _c_cache.int32 + dirty_flag: _c_cache.int32 + stats: _c_cache.uint32 + table_len: _c_cache.int32 + crash_flag: _c_cache.int32 + experiment_flag: _c_cache.int32 + create_time: _c_cache.uint64 + num_bytes: _c_cache.int64 + corruption_flag: _c_cache.int32 + padding: __cs__.Array[_c_cache.int32] + lru_data: _c_cache.LruData + @overload + def __init__( + self, + magic: _c_cache.uint32 | None = ..., + version: _c_cache.uint32 | None = ..., + num_entries: _c_cache.int32 | None = ..., + num_bytes_legacy: _c_cache.int32 | None = ..., + last_file: _c_cache.int32 | None = ..., + dirty_flag: _c_cache.int32 | None = ..., + stats: _c_cache.uint32 | None = ..., + table_len: _c_cache.int32 | None = ..., + crash_flag: _c_cache.int32 | None = ..., + experiment_flag: _c_cache.int32 | None = ..., + create_time: _c_cache.uint64 | None = ..., + num_bytes: 
_c_cache.int64 | None = ..., + corruption_flag: _c_cache.int32 | None = ..., + padding: __cs__.Array[_c_cache.int32] | None = ..., + lru_data: _c_cache.LruData | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class BlockFileHeader(__cs__.Structure): + magic: _c_cache.uint32 + version: _c_cache.uint32 + this_file: _c_cache.int16 + next_file: _c_cache.int16 + entry_size: _c_cache.int32 + num_entries: _c_cache.int32 + max_entries: _c_cache.int32 + empty: __cs__.Array[_c_cache.int32] + hints: __cs__.Array[_c_cache.int32] + updating: _c_cache.int32 + user: __cs__.Array[_c_cache.int32] + @overload + def __init__( + self, + magic: _c_cache.uint32 | None = ..., + version: _c_cache.uint32 | None = ..., + this_file: _c_cache.int16 | None = ..., + next_file: _c_cache.int16 | None = ..., + entry_size: _c_cache.int32 | None = ..., + num_entries: _c_cache.int32 | None = ..., + max_entries: _c_cache.int32 | None = ..., + empty: __cs__.Array[_c_cache.int32] | None = ..., + hints: __cs__.Array[_c_cache.int32] | None = ..., + updating: _c_cache.int32 | None = ..., + user: __cs__.Array[_c_cache.int32] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class EntryState(__cs__.Enum): + ENTRY_NORMAL = ... + ENTRY_EVICTED = ... + ENTRY_DOOMED = ... + + class EntryFlags(__cs__.Enum): + PARENT_ENTRY = ... + CHILD_ENTRY = ... 
+ + class EntryStore(__cs__.Structure): + hash: _c_cache.uint32 + next: _c_cache.uint32 + rankings_node: _c_cache.uint32 + reuse_count: _c_cache.int32 + refetch_count: _c_cache.int32 + state: _c_cache.int32 + creation_time: _c_cache.uint64 + key_len: _c_cache.int32 + long_key: _c_cache.uint32 + data_size: __cs__.Array[_c_cache.int32] + data_addr: __cs__.Array[_c_cache.uint32] + flags: _c_cache.uint32 + padding: __cs__.Array[_c_cache.int32] + self_hash: _c_cache.uint32 + key: __cs__.CharArray + @overload + def __init__( + self, + hash: _c_cache.uint32 | None = ..., + next: _c_cache.uint32 | None = ..., + rankings_node: _c_cache.uint32 | None = ..., + reuse_count: _c_cache.int32 | None = ..., + refetch_count: _c_cache.int32 | None = ..., + state: _c_cache.int32 | None = ..., + creation_time: _c_cache.uint64 | None = ..., + key_len: _c_cache.int32 | None = ..., + long_key: _c_cache.uint32 | None = ..., + data_size: __cs__.Array[_c_cache.int32] | None = ..., + data_addr: __cs__.Array[_c_cache.uint32] | None = ..., + flags: _c_cache.uint32 | None = ..., + padding: __cs__.Array[_c_cache.int32] | None = ..., + self_hash: _c_cache.uint32 | None = ..., + key: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ +# Technically `c_cache` is an instance of `_c_cache`, but then we can't use it in type hints +c_cache: TypeAlias = _c_cache diff --git a/dissect/database/chromium/cache/c_simple.py b/dissect/database/chromium/cache/c_simple.py new file mode 100644 index 0000000..5ffbedf --- /dev/null +++ b/dissect/database/chromium/cache/c_simple.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from dissect.cstruct import cstruct + +# References: +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/simple/simple_index_file.h +# - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/simple/simple_entry_format.h +simple_def = """ +/* Simple Indexes */ + +#define kSimpleIndexMagicNumber 0x656e74657220796f + +struct FakeIndexHeader { + uint64 magic; // kSimpleIndexMagicNumber + uint32 version; + int32 padding[2]; +}; + +struct IndexTableEntry { + uint64 hash; + int64 last_used; + int64 size; +}; + +struct RealIndexHeader { + uint32 size; + uint32 crc32; + uint64 magic; // kSimpleIndexMagicNumber + uint32 version; + int64 num_entries; + int64 cache_size; + int32 unknown; + IndexTableEntry entries[num_entries]; +}; + +/* Simple File Headers. */ + +#define kSimpleInitialMagicNumber 0xfcfb6d1ba7725c30 +#define kSimpleFinalMagicNumber 0xf4fa6f45970d41d8 + +struct SimpleFileHeader { + uint64 magic; // kSimpleInitialMagicNumber + uint32 version; + uint32 key_length; + uint32 key_hash; // md5 + uint32 unused_padding; + char key[key_length]; + + // followed by SimpleFileStream_* +}; + +#define kSimpleEOFSize 24 + +struct SimpleFileEOF { + uint64 magic; // kSimpleFinalMagicNumber + uint32 flags; // hash type: 0 = ?, 1 = crc32, 2 = sha256, 3 = 1 + 2 + uint32 crc32; + int32 stream_size; // only used in the EOF record for stream 0. 
+}; + +struct SimpleFileStream_0_1 { + // preceded by SimpleFileHeader + // char data_stream_1[]; + // SimpleFileEOF + // char data_stream_0[]; + // SHA256 if flags = 2 or 3 + // SimpleFileEOF +}; + +struct SimpleFileStream_2 { + // preceded by SimpleFileHeader + // char data_stream_2[]; + // SimpleFileEOF +}; + +#define kSimpleSparseRangeMagicNumber 0xeb97bf016553676b + +struct SimpleFileSparseRangeHeader { + uint64 magic; // kSimpleSparseRangeMagicNumber + int64 offset; + int64 length; + uint32 crc32; + // char data[length]; +}; +""" + +c_simple = cstruct(endian="<").load(simple_def) diff --git a/dissect/database/chromium/cache/c_simple.pyi b/dissect/database/chromium/cache/c_simple.pyi new file mode 100644 index 0000000..6caaf71 --- /dev/null +++ b/dissect/database/chromium/cache/c_simple.pyi @@ -0,0 +1,149 @@ +# Generated by cstruct-stubgen +from typing import BinaryIO, Literal, TypeAlias, overload + +import dissect.cstruct as __cs__ + +class _c_simple(__cs__.cstruct): + kSimpleIndexMagicNumber: Literal[7308907224324143471] = ... + kSimpleInitialMagicNumber: Literal[18229283882253048880] = ... + kSimpleFinalMagicNumber: Literal[17652544034109735384] = ... + kSimpleEOFSize: Literal[24] = ... + kSimpleSparseRangeMagicNumber: Literal[16976247333112211307] = ... + class FakeIndexHeader(__cs__.Structure): + magic: _c_simple.uint64 + version: _c_simple.uint32 + padding: __cs__.Array[_c_simple.int32] + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + version: _c_simple.uint32 | None = ..., + padding: __cs__.Array[_c_simple.int32] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class IndexTableEntry(__cs__.Structure): + hash: _c_simple.uint64 + last_used: _c_simple.int64 + size: _c_simple.int64 + @overload + def __init__( + self, + hash: _c_simple.uint64 | None = ..., + last_used: _c_simple.int64 | None = ..., + size: _c_simple.int64 | None = ..., + ): ... 
+ @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class RealIndexHeader(__cs__.Structure): + size: _c_simple.uint32 + crc32: _c_simple.uint32 + magic: _c_simple.uint64 + version: _c_simple.uint32 + num_entries: _c_simple.int64 + cache_size: _c_simple.int64 + unknown: _c_simple.int32 + class IndexTableEntry(__cs__.Structure): + hash: _c_simple.uint64 + last_used: _c_simple.int64 + size: _c_simple.int64 + @overload + def __init__( + self, + hash: _c_simple.uint64 | None = ..., + last_used: _c_simple.int64 | None = ..., + size: _c_simple.int64 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + entries: __cs__.Array[IndexTableEntry] + @overload + def __init__( + self, + size: _c_simple.uint32 | None = ..., + crc32: _c_simple.uint32 | None = ..., + magic: _c_simple.uint64 | None = ..., + version: _c_simple.uint32 | None = ..., + num_entries: _c_simple.int64 | None = ..., + cache_size: _c_simple.int64 | None = ..., + unknown: _c_simple.int32 | None = ..., + entries: __cs__.Array[IndexTableEntry] | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileHeader(__cs__.Structure): + magic: _c_simple.uint64 + version: _c_simple.uint32 + key_length: _c_simple.uint32 + key_hash: _c_simple.uint32 + unused_padding: _c_simple.uint32 + key: __cs__.CharArray + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + version: _c_simple.uint32 | None = ..., + key_length: _c_simple.uint32 | None = ..., + key_hash: _c_simple.uint32 | None = ..., + unused_padding: _c_simple.uint32 | None = ..., + key: __cs__.CharArray | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ + class SimpleFileEOF(__cs__.Structure): + magic: _c_simple.uint64 + flags: _c_simple.uint32 + crc32: _c_simple.uint32 + stream_size: _c_simple.int32 + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + flags: _c_simple.uint32 | None = ..., + crc32: _c_simple.uint32 | None = ..., + stream_size: _c_simple.int32 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileStream_0_1(__cs__.Structure): + @overload + def __init__(self): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileStream_2(__cs__.Structure): + @overload + def __init__(self): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileSparseRangeHeader(__cs__.Structure): + magic: _c_simple.uint64 + offset: _c_simple.int64 + length: _c_simple.int64 + crc32: _c_simple.uint32 + @overload + def __init__( + self, + magic: _c_simple.uint64 | None = ..., + offset: _c_simple.int64 | None = ..., + length: _c_simple.int64 | None = ..., + crc32: _c_simple.uint32 | None = ..., + ): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... + + class SimpleFileStreamSparse(__cs__.Structure): + @overload + def __init__(self): ... + @overload + def __init__(self, fh: bytes | memoryview | bytearray | BinaryIO, /): ... 
+ +# Technically `c_simple` is an instance of `_c_simple`, but then we can't use it in type hints +c_simple: TypeAlias = _c_simple diff --git a/dissect/database/chromium/cache/cache.py b/dissect/database/chromium/cache/cache.py new file mode 100644 index 0000000..465b911 --- /dev/null +++ b/dissect/database/chromium/cache/cache.py @@ -0,0 +1,233 @@ +from __future__ import annotations + +import gzip +import zlib +from typing import TYPE_CHECKING + +from cramjam import brotli +from dissect.cstruct.utils import u32 +from dissect.util.stream import RangeStream +from dissect.util.ts import webkittimestamp + +from dissect.database.chromium.cache.c_cache import BlockSizeForFileType, c_cache + +if TYPE_CHECKING: + from collections.abc import Iterator + from io import BufferedReader + from pathlib import Path + + +class DiskCache: + """Chromium Disk (Block File) Cache implementation. + + References: + - https://www.chromium.org/developers/design-documents/network-stack/disk-cache/ + - https://github.com/libyal/dtformats/blob/main/documentation/Chrome%20Cache%20file%20format.asciidoc + """ + + def __init__(self, path: Path): + if not path.exists(): + raise ValueError(f"Provided path does not exist: {path!r}") + + if not path.is_dir(): + raise ValueError(f"Provided path is not a directory: {path!r}") + + # Sanity check for expected directory structure. 
+ files = {"index", "data_0", "data_1", "data_2", "data_3"} + self.children = set(path.iterdir()) + if not files.issubset({file.name for file in self.children}): + raise ValueError(f"Provided directory does not contain expected disk cache files: {path!r}") + + self.path = path + self.index = CacheIndexFile(self, path.joinpath("index")) + + if self.index.header.magic != 0xC103CAC3: + raise ValueError(f"Provided directory contains invalid index file: {path!r}") + + if self.index.header.version != 0x30000: + raise ValueError(f"Unsupported Disk Cache index version {self.index.header.version!r} in {path!r}") + + self.create_time = webkittimestamp(self.index.header.create_time) + self.num_entries = self.index.header.num_entries + + self.block_files = [ + CacheBlockFile(self, path.joinpath(name)) for name in ("data_0", "data_1", "data_2", "data_3") + ] + + def __repr__(self) -> str: + return f"<DiskCache path='{self.path!s}' create_time={self.create_time!s} num_entries={self.num_entries!r}>" + + def block_file(self, id: int) -> CacheBlockFile | None: + for block_file in self.block_files: + if block_file.id == id: + return block_file + return None + + @property + def entries(self) -> Iterator[CacheEntryStore]: + for address in self.index.addresses: + while address.is_initialized: + entry = CacheEntryStore(self, address) + yield entry + + # An EntryStore can point to a next address for another EntryStore + if entry.next != 0: + address = CacheAddress(self.index, entry.next) + else: + break + + +class CacheIndexFile: + """Chromium Disk Cache Index file.
+ + References: + - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/disk_format.h + """ + + def __init__(self, disk_cache: DiskCache, path: Path): + self.disk_cache = disk_cache + self.path = path + + self.fh = path.open("rb") + self.header = c_cache.IndexHeader(self.fh) + + def __repr__(self) -> str: + return f"<CacheIndexFile path='{self.path!s}' table_len={self.header.table_len!r}>" + + @property + def addresses(self) -> Iterator[CacheAddress]: + """Yield :class:`CacheAddress` from the index table.""" + + if not hasattr(self, "_addresses"): + # Parse the whole table in one pass, starting at its fixed offset behind the header. + # Caching entries while yielding them left a truncated list behind whenever a + # consumer stopped iterating early, and every later iteration then saw only that part. + self.fh.seek(len(c_cache.IndexHeader)) + size = 4 * max(self.header.table_len, 0) + table = self.fh.read(size) + self._addresses = [CacheAddress(self, u32(table[i : i + 4])) for i in range(0, size, 4)] + + yield from self._addresses + + # TODO: get(address)? + + +class CacheBlockFile: + """Chromium Disk Cache Data Block file. + + References: + - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/disk_format.h + """ + + def __init__(self, disk_cache: DiskCache, path: Path): + self.disk_cache = disk_cache + self.path = path + + self.fh = path.open("rb") + self.header = c_cache.BlockFileHeader(self.fh) + + self.id = self.header.this_file + self.entry_size = self.header.entry_size + self.num_entries = self.header.num_entries + + def __repr__(self) -> str: + return f"<CacheBlockFile path='{self.path!s}' id={self.id!r} entry_size={self.entry_size!r} num_entries={self.num_entries!r}>" # noqa: E501 + + def read(self, addr: CacheAddress) -> RangeStream: + offset = c_cache.kBlockHeaderSize + (self.entry_size * addr.start_block) + size = self.entry_size * addr.num_blocks + return RangeStream(self.fh, offset, size) + + +class CacheAddress: + """Chromium Disk Cache Address.
+ + References: + - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/blockfile/addr.h + """ + + def __init__(self, index: CacheIndexFile, addr: int): + self.index = index + self.address = addr + + self.is_initialized = addr & c_cache.kInitializedMask != 0 + self.file_type = c_cache.FileType((addr & c_cache.kFileTypeMask) >> c_cache.kFileTypeOffset) + self.is_separate_file = (addr & c_cache.kFileTypeMask) == 0 + self.is_block_file = not self.is_separate_file + + if self.is_separate_file: + self.file_number = addr & c_cache.kFileNameMask + self.block_size = None + self.num_blocks = None + self.start_block = None + else: + self.file_number = (addr & c_cache.kFileSelectorMask) >> c_cache.kFileSelectorOffset + self.block_size = BlockSizeForFileType(self.file_type.value) + self.num_blocks = 1 + ((addr & c_cache.kNumBlocksMask) >> c_cache.kNumBlocksOffset) + self.start_block = addr & c_cache.kStartBlockMask + + def __repr__(self) -> str: + return f"<CacheAddress address={self.address:#010x} is_initialized={self.is_initialized!r} file_type={self.file_type!s} file_number={self.file_number!r} num_blocks={self.num_blocks!r} start_block={self.start_block!r}>" # noqa: E501 + + @property + def data(self) -> BufferedReader | RangeStream: + if not self.is_initialized: + raise ValueError("Cannot read data from non initialized address") + + if self.file_type == c_cache.FileType.EXTERNAL: + file_name = f"f_{self.file_number:06x}" + path = self.index.disk_cache.path.joinpath(file_name) + return path.open("rb") + + if self.file_type in (c_cache.FileType.BLOCK_256, c_cache.FileType.BLOCK_1K, c_cache.FileType.BLOCK_4K): + block_file = self.index.disk_cache.block_file(self.file_number) + if not block_file: + raise ValueError(f"Requested block file {self.file_number!r} does not exist") + return block_file.read(self) + + raise ValueError(f"No data for file type {self.file_type!r}") + + +class CacheEntryStore: + """Represents a Cache EntryStore object.""" + + def __init__(self, disk_cache: DiskCache, addr: CacheAddress): + self.disk_cache = disk_cache + self.address = addr + + self.header = c_cache.EntryStore(self.address.data) + self.state =
c_cache.EntryState(self.header.state) + self.creation_time = webkittimestamp(self.header.creation_time) + self.next = self.header.next + + if self.header.long_key: + key_addr = CacheAddress(disk_cache.index, self.header.long_key) + self.key = key_addr.data.read(self.header.key_len).decode() + else: + self.key = self.header.key.decode().strip("\x00") + + def __repr__(self) -> str: + return f"<CacheEntryStore key={self.key!r} state={self.state!s} creation_time={self.creation_time!s}>" # noqa: E501 + + @property + def meta(self) -> bytes: + addr = CacheAddress(self.disk_cache.index, self.header.data_addr[0]) + # TODO: Properly unpickle, contains a treasure of data. + return addr.data.read() + + @property + def data(self) -> bytes: + addr = CacheAddress(self.disk_cache.index, self.header.data_addr[1]) + payload = addr.data.read() # Read once: each ``addr.data`` access opens a new stream. + + if payload[0:2] == b"\x1f\x8b": + return gzip.decompress(payload) + + meta = self.meta + if b"content-encoding:br" in meta: + return brotli.decompress(payload).read() + + if b"content-encoding:deflate" in meta: + return zlib.decompress(payload, -zlib.MAX_WBITS) + + return payload diff --git a/dissect/database/chromium/cache/simple.py b/dissect/database/chromium/cache/simple.py new file mode 100644 index 0000000..32cb10b --- /dev/null +++ b/dissect/database/chromium/cache/simple.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +import gzip +import os +import zlib +from enum import IntEnum +from typing import TYPE_CHECKING + +from cramjam import brotli +from dissect.util.ts import webkittimestamp + +from dissect.database.chromium.cache.c_simple import c_simple + +if TYPE_CHECKING: + from pathlib import Path + + +class SimpleDiskCache: + """Chromium Very Simple Disk Cache Backend implementation.
+ + References: + - https://www.chromium.org/developers/design-documents/network-stack/disk-cache/very-simple-backend/ + - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/simple/ + """ + + def __init__(self, path: Path): + if not path.exists(): + raise ValueError(f"Provided path does not exist: {path!r}") + + if not path.is_dir(): + raise ValueError(f"Provided path is not a directory: {path!r}") + + # Sanity check for expected directory structure. + files = {"index-dir", "index"} + self.children = set(path.iterdir()) + if not files.issubset({file.name for file in self.children}): + raise ValueError(f"Provided directory does not contain expected disk cache files: {path!r}") + + self.path = path + self.index = SimpleIndexFile(self, path.joinpath("index-dir/the-real-index")) + self.cache_files = [ + SimpleCacheFile(self, child) for child in self.children if len(child.name) == 18 and "_" in child.name + ] + + def __repr__(self) -> str: + return f"<SimpleDiskCache path='{self.path!s}' cache_files={len(self.cache_files)!r}>" + + def get(self, key: str) -> SimpleCacheFile | None: + """Return the first matching :class:`SimpleCacheFile` for the given key identifier.""" + for cache_file in self.cache_files: + if cache_file.key == key: + return cache_file + return None + + +class SimpleIndexFile: + """Represents a Chromium Very Simple Disk Cache Backend index file.""" + + def __init__(self, disk_cache: SimpleDiskCache, path: Path): + self.disk_cache = disk_cache + self.path = path + + self.fh = path.open("rb") + self.header = c_simple.RealIndexHeader(self.fh) + + if self.header.magic != c_simple.kSimpleIndexMagicNumber: + raise ValueError(f"Unexpected magic header for {path!s}: {self.header.magic!r}") + + self.entries = self.header.entries + + if len(self.entries) != self.header.num_entries: + raise ValueError(f"Mismatch in amount of expected entries for {path!s}") + + self.last_used = webkittimestamp(self.entries[-1].last_used) if self.entries else None # An empty index has no entry to take a timestamp from. + + def __repr__(self) -> str: + return f"<SimpleIndexFile path='{self.path!s}' num_entries={self.header.num_entries!r} last_used={self.last_used!s}>" + + +class SimpleCacheFile: + """Represents a Chromium Very
Simple Disk Cache Backend cache file. + + References: + - https://chromium.googlesource.com/chromium/src/+/HEAD/net/disk_cache/simple/simple_entry_format.h + - https://github.com/schorlet/simplecache + """ + + def __init__(self, disk_cache: SimpleDiskCache, path: Path): + self.disk_cache = disk_cache + self.path = path + + self.fh = path.open("rb") + self.header = c_simple.SimpleFileHeader(self.fh) + self.header_size = len(self.header.dumps()) + self.type = infer_file_type(self.path.name) + self.key = self.header.key.decode("latin1") + + def __repr__(self) -> str: + return f"<SimpleCacheFile path='{self.path!s}' type={self.type.name} key={self.key!r}>" + + def _streams(self) -> None: + """Parse the stream(s) of this Simple Cache File.""" + + if self.type == SimpleFileType.STREAM_0_1: + # We read backwards in the file handle (stream 0 is positioned after stream 1). + + # Stream 0 + self.fh.seek(-c_simple.kSimpleEOFSize, os.SEEK_END) + eof = c_simple.SimpleFileEOF(self.fh) + offset = -c_simple.kSimpleEOFSize - eof.stream_size + if eof.flags in (2, 3): + offset -= 32 + self.fh.seek(offset, os.SEEK_END) + self._meta = self.fh.read(eof.stream_size) + + # Stream 1 + self.fh.seek(-(c_simple.kSimpleEOFSize * 2) - eof.stream_size, os.SEEK_END) + if eof.flags in (2, 3): + self.fh.seek(-32, os.SEEK_CUR) + eof2 = c_simple.SimpleFileEOF(self.fh) + self.fh.seek(self.header_size) + self._data = self.fh.read(eof2.stream_size) + + elif self.type == SimpleFileType.STREAM_2: + # Should be simple + raise NotImplementedError + + elif self.type == SimpleFileType.STREAM_SPARSE: + ranges = [] + while True: + try: + range_header = c_simple.SimpleFileSparseRangeHeader(self.fh) + except EOFError: + break + + if range_header.magic != c_simple.kSimpleSparseRangeMagicNumber: + break + + offset = self.fh.tell() + ranges.append((range_header, offset)) + self.fh.seek(offset + range_header.length) + + if len(ranges) > 1: + raise ValueError("Did not expect another range in sparse stream") + + for range_header, offset in ranges: + self.fh.seek(offset) + self._meta = b"" +
self._data = self.fh.read(range_header.length) + + @property + def meta(self) -> bytes: + if not hasattr(self, "_meta"): + self._streams() + return self._meta + + @property + def data(self) -> bytes: + if not hasattr(self, "_data"): + self._streams() + + if self._data[0:2] == b"\x1f\x8b": + return gzip.decompress(self._data) + + if b"content-encoding:br" in self.meta: + return brotli.decompress(self._data).read() + + if b"content-encoding:deflate" in self.meta: + return zlib.decompress(self._data, -zlib.MAX_WBITS) + + return self._data + + +class SimpleFileType(IntEnum): + """SimpleFileType enum.""" + + STREAM_0_1 = 0 + STREAM_2 = 1 + STREAM_SPARSE = 2 + + +def infer_file_type(file_name: str) -> SimpleFileType: + """Infer the :class:`SimpleFileType` based on the name of the :class:`SimpleCacheFile`.""" + + if file_name.endswith("_0"): + return SimpleFileType.STREAM_0_1 + + if file_name.endswith("_1"): + return SimpleFileType.STREAM_2 + + if file_name.endswith("_s"): + return SimpleFileType.STREAM_SPARSE + + raise ValueError(f"Unknown SimpleFileType for filename {file_name!r}") diff --git a/tests/chromium/__init__.py b/tests/chromium/__init__.py new file mode 100644 index 0000000..e69de29