diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000..ce7eb0eb16 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,28 @@ +## About + + + + +## Checklist (for Author) + +- [ ] [libvirt-tests](https://github.com/cyberus-technology/libvirt-tests) + pipeline succeeded (currently this must be done manually locally) +- [ ] PR associated with + [ticket](https://github.com/cobaltcore-dev/cobaltcore/issues?q=is%3Aissue%20state%3Aopen%20label%3Acyberus%2Ccyberus-maybe) + +## Hints for Reviewers + + + + +## Steps to Undraft (if draft) + + + diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml deleted file mode 100644 index 2e44b9af40..0000000000 --- a/.github/workflows/audit.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: Cloud Hypervisor Dependency Audit -on: - pull_request: - paths: - - '**/Cargo.toml' - - '**/Cargo.lock' - -jobs: - security_audit: - name: Audit - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions-rust-lang/audit@v1 - with: - token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 070650ba6e..3778f6b9d8 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -13,21 +13,16 @@ jobs: matrix: rust: - stable - - beta - nightly - - "1.83.0" + - "1.88.0" target: - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl steps: - name: Code checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - - name: Install musl-gcc - run: sudo apt install -y musl-tools - - name: Install Rust toolchain (${{ matrix.rust }}) uses: dtolnay/rust-toolchain@stable with: @@ -35,34 +30,40 @@ jobs: target: ${{ matrix.target }} - name: Build (default features) - run: cargo rustc --locked --bin cloud-hypervisor -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor -- -D warnings - name: Build (kvm) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "kvm" -- -D warnings - name: Build (default features + tdx) - run: cargo rustc --locked --bin cloud-hypervisor --features "tdx" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --features "tdx" -- -D warnings - name: Build (default features + dbus_api) - run: cargo rustc --locked --bin cloud-hypervisor --features "dbus_api" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --features "dbus_api" -- -D warnings - name: Build (default features + guest_debug) - run: cargo rustc --locked --bin cloud-hypervisor --features "guest_debug" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --features "guest_debug" -- -D warnings - name: Build (default features + pvmemcontrol) - run: cargo rustc --locked --bin cloud-hypervisor --features "pvmemcontrol" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --features "pvmemcontrol" -- -D warnings + + - name: Build (default features + 
fw_cfg) + run: cargo rustc --locked --bin cloud-hypervisor --features "fw_cfg" -- -D warnings + + - name: Build (default features + ivshmem) + run: cargo rustc --locked --bin cloud-hypervisor --features "ivshmem" -- -D warnings - name: Build (mshv) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "mshv" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "mshv" -- -D warnings - name: Build (sev_snp) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "sev_snp" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "sev_snp" -- -D warnings - name: Build (igvm) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "igvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "igvm" -- -D warnings - name: Build (mshv + kvm) - run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "mshv,kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + run: cargo rustc --locked --bin cloud-hypervisor --no-default-features --features "mshv,kvm" -- -D warnings - name: Release Build (default features) run: cargo build --locked --all --release --target=${{ matrix.target }} diff --git a/.github/workflows/commit-lint.yml b/.github/workflows/commit-lint.yml new file mode 100644 index 0000000000..ec2dfec7ac --- /dev/null +++ b/.github/workflows/commit-lint.yml @@ -0,0 +1,23 @@ +name: Commit Lint +on: [ pull_request ] +jobs: + gitlint: + name: Check commit messages + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + fetch-depth: 0 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install --upgrade gitlint + - name: Lint git commit messages + run: | + gitlint --commits origin/$GITHUB_BASE_REF.. 
diff --git a/.github/workflows/dco.yaml b/.github/workflows/dco.yaml deleted file mode 100644 index 888b685820..0000000000 --- a/.github/workflows/dco.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: DCO -on: [pull_request, merge_group] - -jobs: - check: - name: DCO Check ("Signed-Off-By") - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.x - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - name: Check DCO - if: ${{ github.event_name == 'pull_request' }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - pip3 install -U dco-check - dco-check -e "49699333+dependabot[bot]@users.noreply.github.com" diff --git a/.github/workflows/docker-image.yaml b/.github/workflows/docker-image.yaml deleted file mode 100644 index 6891d60997..0000000000 --- a/.github/workflows/docker-image.yaml +++ /dev/null @@ -1,65 +0,0 @@ -name: Cloud Hypervisor's Docker image update -on: - push: - branches: main - paths: resources/Dockerfile - pull_request: - paths: resources/Dockerfile -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -jobs: - main: - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v4 - - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to ghcr - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - # generate Docker tags based on the following events/attributes - tags: | - type=raw,value=20250412-0 - type=sha - - - name: Build and push - if: ${{ github.event_name == 'push' }} - uses: docker/build-push-action@v6 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: ${{ steps.meta.outputs.tags }} - - - name: Build only - if: ${{ github.event_name == 'pull_request' }} - uses: docker/build-push-action@v6 - with: - file: ./resources/Dockerfile - platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/formatting.yaml b/.github/workflows/formatting.yaml index b6dd6cafc1..37a0b3e6b5 100644 --- a/.github/workflows/formatting.yaml +++ b/.github/workflows/formatting.yaml @@ -14,12 +14,11 @@ jobs: - nightly target: - x86_64-unknown-linux-gnu - - aarch64-unknown-linux-musl env: RUSTFLAGS: -D warnings steps: - name: Code checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install Rust toolchain (${{ matrix.rust }}) uses: dtolnay/rust-toolchain@stable with: diff --git a/.github/workflows/fuzz-build.yaml b/.github/workflows/fuzz-build.yaml deleted file mode 100644 index db868de2be..0000000000 --- a/.github/workflows/fuzz-build.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: Cloud Hypervisor Cargo Fuzz Build -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo Fuzz Build - runs-on: ubuntu-latest - strategy: - matrix: - rust: - - nightly - target: - - x86_64-unknown-linux-gnu - env: - RUSTFLAGS: -D warnings - steps: - - name: Code checkout - uses: actions/checkout@v4 - - name: Install Rust toolchain (${{ matrix.rust }}) - 
uses: dtolnay/rust-toolchain@stable - with: - toolchain: ${{ matrix.rust }} - target: ${{ matrix.target }} - - name: Install Cargo fuzz - run: cargo install cargo-fuzz - - name: Fuzz Build - run: cargo fuzz build - - name: Fuzz Check - run: cargo fuzz check diff --git a/.github/workflows/gitlint.yaml b/.github/workflows/gitlint.yaml deleted file mode 100644 index 11ebf707a4..0000000000 --- a/.github/workflows/gitlint.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: Commit messages check -on: - pull_request: - -jobs: - gitlint: - name: Check commit messages - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - fetch-depth: 0 - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install --upgrade gitlint - - name: Lint git commit messages - run: | - gitlint --commits origin/$GITHUB_BASE_REF.. diff --git a/.github/workflows/hadolint.yaml b/.github/workflows/hadolint.yaml deleted file mode 100644 index 31b8910984..0000000000 --- a/.github/workflows/hadolint.yaml +++ /dev/null @@ -1,25 +0,0 @@ -name: Lint Dockerfile -on: - push: - paths: - - resources/Dockerfile - pull_request: - paths: - - resources/Dockerfile - -jobs: - hadolint: - name: Run Hadolint Dockerfile Linter - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Lint Dockerfile - uses: hadolint/hadolint-action@master - with: - dockerfile: ./resources/Dockerfile - format: tty - no-fail: false - verbose: true - failure-threshold: info diff --git a/.github/workflows/integration-arm64.yaml b/.github/workflows/integration-arm64.yaml deleted file mode 100644 index d580a991cc..0000000000 --- a/.github/workflows/integration-arm64.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: Cloud Hypervisor Tests (ARM64) -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - timeout-minutes: 120 - name: Tests (ARM64) - runs-on: bookworm-arm64 - steps: - - name: Fix workspace permissions - run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - - name: Code checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Run unit tests (musl) - run: scripts/dev_cli.sh tests --unit --libc musl - - name: Load openvswitch module - run: sudo modprobe openvswitch - - name: Run integration tests (musl) - timeout-minutes: 60 - run: scripts/dev_cli.sh tests --integration --libc musl - - name: Install Azure CLI - if: ${{ github.event_name != 'pull_request' }} - run: | - sudo apt install -y ca-certificates curl apt-transport-https lsb-release gnupg - curl -sL https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/microsoft.gpg > /dev/null - echo "deb [arch=arm64] https://packages.microsoft.com/repos/azure-cli/ bookworm main" | sudo tee /etc/apt/sources.list.d/azure-cli.list - sudo apt update - sudo apt install -y azure-cli - - name: Download Windows image - if: ${{ github.event_name != 'pull_request' }} - shell: bash - run: | - IMG_BASENAME=windows-11-iot-enterprise-aarch64.raw - IMG_PATH=$HOME/workloads/$IMG_BASENAME - IMG_GZ_PATH=$HOME/workloads/$IMG_BASENAME.gz - IMG_GZ_BLOB_NAME=windows-11-iot-enterprise-aarch64-9-min.raw.gz - cp "scripts/$IMG_BASENAME.sha1" "$HOME/workloads/" - pushd "$HOME/workloads" - if sha1sum "$IMG_BASENAME.sha1" --check; then - exit - fi - popd - mkdir 
-p "$HOME/workloads" - az storage blob download --container-name private-images --file "$IMG_GZ_PATH" --name "$IMG_GZ_BLOB_NAME" --connection-string "${{ secrets.CH_PRIVATE_IMAGES }}" - gzip -d $IMG_GZ_PATH - - name: Run Windows guest integration tests - if: ${{ github.event_name != 'pull_request' }} - timeout-minutes: 30 - run: scripts/dev_cli.sh tests --integration-windows --libc musl diff --git a/.github/workflows/integration-metrics.yaml b/.github/workflows/integration-metrics.yaml index 440e9ad850..e8dd72ea84 100644 --- a/.github/workflows/integration-metrics.yaml +++ b/.github/workflows/integration-metrics.yaml @@ -12,7 +12,7 @@ jobs: METRICS_PUBLISH_KEY: ${{ secrets.METRICS_PUBLISH_KEY }} steps: - name: Code checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Run metrics tests diff --git a/.github/workflows/integration-rate-limiter.yaml b/.github/workflows/integration-rate-limiter.yaml index 5700bfe46f..91682f77f8 100644 --- a/.github/workflows/integration-rate-limiter.yaml +++ b/.github/workflows/integration-rate-limiter.yaml @@ -13,7 +13,7 @@ jobs: steps: - name: Code checkout if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Run rate-limiter integration tests diff --git a/.github/workflows/integration-vfio.yaml b/.github/workflows/integration-vfio.yaml index 3549ace272..edd7399b15 100644 --- a/.github/workflows/integration-vfio.yaml +++ b/.github/workflows/integration-vfio.yaml @@ -16,7 +16,7 @@ jobs: run: sudo chown -R runner:runner ${GITHUB_WORKSPACE} - name: Code checkout if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Run VFIO integration tests diff --git a/.github/workflows/integration-windows.yaml b/.github/workflows/integration-windows.yaml index 29aa04a78f..0769789a9d 100644 --- a/.github/workflows/integration-windows.yaml +++ b/.github/workflows/integration-windows.yaml @@ -11,7 +11,7 @@ jobs: steps: - name: Code checkout if: ${{ github.event_name != 'pull_request' }} - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Install Docker diff --git a/.github/workflows/integration-x86-64.yaml b/.github/workflows/integration-x86-64.yaml index 80690512f5..b5b8ec9071 100644 --- a/.github/workflows/integration-x86-64.yaml +++ b/.github/workflows/integration-x86-64.yaml @@ -10,14 +10,13 @@ jobs: strategy: fail-fast: false matrix: - runner: ['garm-jammy', "garm-jammy-amd"] - libc: ["musl", 'gnu'] + libc: ['gnu'] name: Tests (x86-64) runs-on: ${{ github.event_name == 'pull_request' && !(matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') && 'ubuntu-latest' || format('{0}-16', matrix.runner) }} steps: - name: Code checkout if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 - name: Install Docker @@ -36,17 +35,6 @@ jobs: - name: Run unit tests if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} run: scripts/dev_cli.sh tests --unit --libc ${{ matrix.libc }} - - name: Load openvswitch module - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - run: sudo modprobe openvswitch - - name: Run integration tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') 
}} - timeout-minutes: 40 - run: scripts/dev_cli.sh tests --integration --libc ${{ matrix.libc }} - - name: Run live-migration integration tests - if: ${{ github.event_name != 'pull_request' || (matrix.runner == 'garm-jammy' && matrix.libc == 'gnu') }} - timeout-minutes: 20 - run: scripts/dev_cli.sh tests --integration-live-migration --libc ${{ matrix.libc }} - name: Skipping build for PR if: ${{ github.event_name == 'pull_request' && matrix.runner != 'garm-jammy' && matrix.libc != 'gnu' }} run: echo "Skipping build for PR" diff --git a/.github/workflows/lychee.yaml b/.github/workflows/lychee.yaml deleted file mode 100644 index dd3a372dc8..0000000000 --- a/.github/workflows/lychee.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: Link Check (lychee) -on: - pull_request - -jobs: - link_check: - name: Link Check - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v4 - - - name: Link Availability Check - uses: lycheeverse/lychee-action@master - with: - args: --verbose --config .lychee.toml . diff --git a/.github/workflows/openapi.yaml b/.github/workflows/openapi.yaml index 0cd5b848cc..9c1266e4d7 100644 --- a/.github/workflows/openapi.yaml +++ b/.github/workflows/openapi.yaml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest container: openapitools/openapi-generator-cli steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Validate OpenAPI env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/package-consistency.yaml b/.github/workflows/package-consistency.yaml deleted file mode 100644 index 0c57baa6c0..0000000000 --- a/.github/workflows/package-consistency.yaml +++ /dev/null @@ -1,32 +0,0 @@ -name: Cloud Hypervisor Consistency -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Rust VMM Consistency Check - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install dependencies - run: sudo apt install -y python3 - - - name: Install Rust toolchain stable - uses: dtolnay/rust-toolchain@stable - with: - toolchain: stable - - - name: Check Rust VMM Package Consistency of root Workspace - run: python3 scripts/package-consistency-check.py github.com/rust-vmm - - - name: Check Rust VMM Package Consistency of fuzz Workspace - run: | - pushd fuzz - python3 ../scripts/package-consistency-check.py github.com/rust-vmm - popd diff --git a/.github/workflows/preview-riscv64-modules.yaml b/.github/workflows/preview-riscv64-modules.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.github/workflows/preview-riscv64.yaml b/.github/workflows/preview-riscv64.yaml deleted file mode 100644 index 84435402a8..0000000000 --- a/.github/workflows/preview-riscv64.yaml +++ /dev/null @@ -1,39 +0,0 @@ -name: Cloud Hypervisor RISC-V 64-bit Preview -on: [pull_request, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build: - name: Cargo - runs-on: riscv64-qemu-host - strategy: - fail-fast: false - matrix: - module: - - hypervisor - - arch - - vm-allocator - - devices - - steps: - - name: Code checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install Rust toolchain - run: /opt/scripts/exec-in-qemu.sh rustup default 1.83.0 - - - name: Build ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo rustc --locked -p ${{ matrix.module }} --no-default-features --features "kvm" -- 
-D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states - - - name: Clippy ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo clippy --locked -p ${{ matrix.module }} --no-default-features --features "kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states - - - name: Test ${{ matrix.module }} Module (kvm) - run: /opt/scripts/exec-in-qemu.sh cargo test --locked -p ${{ matrix.module }} --no-default-features --features "kvm" - - - name: Check no files were modified - run: test -z "$(git status --porcelain)" diff --git a/.github/workflows/quality.yaml b/.github/workflows/quality.yaml index 19a4981a4d..f7b5206e6e 100644 --- a/.github/workflows/quality.yaml +++ b/.github/workflows/quality.yaml @@ -13,23 +13,17 @@ jobs: fail-fast: false matrix: rust: - - beta - stable target: - - aarch64-unknown-linux-gnu - - aarch64-unknown-linux-musl - x86_64-unknown-linux-gnu - - x86_64-unknown-linux-musl include: - - rust: beta - experimental: true - rust: stable experimental: false steps: - name: Code checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 @@ -50,88 +44,112 @@ jobs: git checkout ${{ github.sha }} - name: Clippy (kvm) - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: - use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features --tests --examples --features "kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "kvm" -- -D warnings - name: Clippy (mshv) - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: - use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features --tests --examples --features "mshv" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv" -- -D warnings - name: Clippy (mshv + kvm) - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: - use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm" -- -D warnings - name: Clippy (default features) - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: - use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --tests --examples -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 
3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples -- -D warnings - name: Clippy (default features + guest_debug) - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: - use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --tests --examples --features "guest_debug" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples --features "guest_debug" -- -D warnings - name: Clippy (default features + pvmemcontrol) - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: - use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --tests --examples --features "pvmemcontrol" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples --features "pvmemcontrol" -- -D warnings - name: Clippy (default features + tracing) - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: - use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --tests --examples --features "tracing" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states - - - name: Clippy (mshv) - if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples --features "tracing" -- -D warnings + - name: Clippy (default features + fw_cfg) uses: actions-rs/cargo@v1 with: + use-cross: ${{ matrix.target != 'x86_64-unknown-linux-gnu' }} command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features --tests --examples --features "mshv" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + args: --target=${{ matrix.target }} --locked --all --all-targets --tests --examples --features "fw_cfg" -- -D warnings - - name: Clippy (mshv + kvm) - if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} - uses: actions-rs/cargo@v1 + - name: Clippy (default features + ivshmem) + uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features --tests --examples --features "mshv,kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --tests --examples --features "ivshmem" -- -D warnings - name: Clippy (sev_snp) if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features 
--tests --examples --features "sev_snp" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "sev_snp" -- -D warnings - name: Clippy (igvm) if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features --tests --examples --features "igvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "igvm" -- -D warnings - name: Clippy (kvm + tdx) if: ${{ matrix.target == 'x86_64-unknown-linux-gnu' }} - uses: actions-rs/cargo@v1 + uses: houseabsolute/actions-rust-cross@v1 with: command: clippy - args: --target=${{ matrix.target }} --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings -D clippy::undocumented_unsafe_blocks -W clippy::assertions_on_result_states + cross-version: 3e0957637b49b1bbced23ad909170650c5b70635 + toolchain: ${{ matrix.rust }} + target: ${{ matrix.target }} + args: --locked --all --all-targets --no-default-features --tests --examples --features "tdx,kvm" -- -D warnings - name: Check build did not modify any files run: test -z "$(git status --porcelain)" @@ -141,6 +159,6 @@ jobs: name: Typos / Spellcheck runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 # Executes "typos ." - - uses: crate-ci/typos@v1.34.0 + - uses: crate-ci/typos@v1.36.2 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml deleted file mode 100644 index 01d4d6d810..0000000000 --- a/.github/workflows/release.yaml +++ /dev/null @@ -1,95 +0,0 @@ -name: Cloud Hypervisor Release -on: [create, merge_group] -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} - cancel-in-progress: true -env: - GITHUB_TOKEN: ${{ github.token }} - -jobs: - release: - if: (github.event_name == 'create' && github.event.ref_type == 'tag') || github.event_name == 'merge_group' - name: Release ${{ matrix.platform.target }} - strategy: - fail-fast: false - matrix: - platform: - - target: x86_64-unknown-linux-gnu - args: --all --release --features mshv - name_ch: cloud-hypervisor - name_ch_remote: ch-remote - - target: x86_64-unknown-linux-musl - args: --all --release --features mshv - name_ch: cloud-hypervisor-static - name_ch_remote: ch-remote-static - - target: aarch64-unknown-linux-musl - args: --all --release - name_ch: cloud-hypervisor-static-aarch64 - name_ch_remote: ch-remote-static-aarch64 - runs-on: ubuntu-latest - steps: - - name: Code checkout - uses: actions/checkout@v4 - - name: Install musl-gcc - if: contains(matrix.platform.target, 'musl') - run: sudo apt install -y musl-tools - - name: Create release directory - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: rsync -rv --exclude=.git . 
../cloud-hypervisor-${{ github.event.ref }} - - name: Build ${{ matrix.platform.target }} - uses: houseabsolute/actions-rust-cross@v1 - with: - command: build - target: ${{ matrix.platform.target }} - args: ${{ matrix.platform.args }} - strip: true - toolchain: "1.83.0" - - name: Copy Release Binaries - if: github.event_name == 'create' && github.event.ref_type == 'tag' - shell: bash - run: | - cp target/${{ matrix.platform.target }}/release/cloud-hypervisor ./${{ matrix.platform.name_ch }} - cp target/${{ matrix.platform.target }}/release/ch-remote ./${{ matrix.platform.name_ch_remote }} - - name: Upload Release Artifacts - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: actions/upload-artifact@v4 - with: - name: Artifacts for ${{ matrix.platform.target }} - path: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - - name: Vendor - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - working-directory: ../cloud-hypervisor-${{ github.event.ref }} - run: | - mkdir ../vendor-cargo-home - export CARGO_HOME=$(realpath ../vendor-cargo-home) - mkdir .cargo - cargo vendor > .cargo/config.toml - - name: Create vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - run: tar cJf cloud-hypervisor-${{ github.event.ref }}.tar.xz ../cloud-hypervisor-${{ github.event.ref }} - - name: Upload cloud-hypervisor vendored source archive - if: | - github.event_name == 'create' && github.event.ref_type == 'tag' && - matrix.platform.target == 'x86_64-unknown-linux-gnu' - id: upload-release-cloud-hypervisor-vendored-sources - uses: actions/upload-artifact@v4 - with: - path: cloud-hypervisor-${{ github.event.ref }}.tar.xz - name: cloud-hypervisor-${{ github.event.ref }}.tar.xz - - name: Create GitHub Release - if: github.event_name == 'create' && github.event.ref_type == 'tag' - uses: softprops/action-gh-release@v2 - with: - draft: true - files: | - ./${{ matrix.platform.name_ch }} - ./${{ matrix.platform.name_ch_remote }} - ./cloud-hypervisor-${{ github.event.ref }}.tar.xz diff --git a/.github/workflows/reuse.yaml b/.github/workflows/reuse.yaml index a2161c2818..3a463eedcd 100644 --- a/.github/workflows/reuse.yaml +++ b/.github/workflows/reuse.yaml @@ -7,6 +7,6 @@ jobs: name: REUSE Compliance Check runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: REUSE Compliance Check uses: fsfe/reuse-action@v5 diff --git a/.github/workflows/shlint.yaml b/.github/workflows/shlint.yaml index 9089964f06..b9208f3f20 100644 --- a/.github/workflows/shlint.yaml +++ b/.github/workflows/shlint.yaml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Run the shell script checkers uses: luizm/action-sh-checker@master env: diff --git a/.github/workflows/taplo.yaml b/.github/workflows/taplo.yaml index 2b1e618984..75b61d9236 100644 --- a/.github/workflows/taplo.yaml +++ b/.github/workflows/taplo.yaml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Code checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable - name: Install build dependencies diff --git a/.lychee.toml b/.lychee.toml index 9eb8f9fdf7..44517a7819 100644 --- a/.lychee.toml +++ b/.lychee.toml @@ -2,8 +2,6 @@ verbose = 
"info" exclude = [ # Availability of links below should be manually verified. - # Page for intel SGX support, returns 403 while querying. - '^https://www.intel.com/content/www/us/en/developer/tools/software-guard-extensions/linux-overview.html', # Page for intel TDX support, returns 403 while querying. '^https://www.intel.com/content/www/us/en/developer/tools/trust-domain-extensions/overview.html', # Page for TPM, returns 403 while querying. @@ -14,8 +12,16 @@ exclude = [ # OSDev has added bot protection and accesses my result in 403 Forbidden. '^https://wiki.osdev.org', + # Exclude all pages with $ in the URL since $XXX is a variable + "\\$.*", + # Exclude local files + "file://.*", ] +# Exclude loopback addresses +exclude_loopback = true + + max_retries = 3 retry_wait_time = 5 diff --git a/.rustfmt.toml b/.rustfmt.toml index 754d7badfd..394a1065be 100644 --- a/.rustfmt.toml +++ b/.rustfmt.toml @@ -1,4 +1,4 @@ -edition = "2021" +edition = "2024" group_imports="StdExternalCrate" imports_granularity="Module" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cdd75e31a1..c77d3e36da 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -114,3 +114,13 @@ Signed-off-by: Sebastien Boeuf Then, after the corresponding PR is merged, GitHub will automatically close that issue when parsing the [commit message](https://help.github.com/articles/closing-issues-via-commit-messages/). + +## AI Generated Code + +Our policy is to decline any contributions known to contain contents +generated or derived from using Large Language Models (LLMs). This +includes ChatGPT, Gemini, Claude, Copilot and similar tools. + +The goal is to avoid ambiguity in license compliance and optimize the +use of limited project resources, especially for code review and +maintenance. This policy can be revisited as LLMs evolve and mature. 
diff --git a/Cargo.lock b/Cargo.lock index afa7538716..b96aead2cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,9 +36,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.15" +version = "0.6.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" +checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" dependencies = [ "anstyle", "anstyle-parse", @@ -51,15 +51,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.8" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] @@ -85,9 +85,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.94" +version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" +checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" [[package]] name = "api_client" @@ -114,12 +114,13 @@ dependencies = [ "libc", "linux-loader", "log", + "proptest", "serde", + "serde_json", "thiserror 2.0.12", "uuid", "vm-fdt", "vm-memory", - "vm-migration", "vmm-sys-util", ] @@ -162,9 +163,9 @@ dependencies = [ [[package]] name = "async-io" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1237c0ae75a0f3765f58910ff9cdd0a12eeb39ab2f4c7de23262f337f0aacbb3" +checksum = "19634d6336019ef220f09fd31168ce5c184b295cbf80345437cc36094ef223ca" dependencies = [ "async-lock", "cfg-if", @@ -175,8 +176,7 @@ dependencies = [ "polling", "rustix 1.0.7", "slab", - "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -192,9 +192,9 @@ dependencies = [ [[package]] name = "async-process" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63255f1dc2381611000436537bbedfe83183faa303a5a0edaf191edef06526bb" +checksum = "65daa13722ad51e6ab1a1b9c01299142bc75135b337923cfa10e79bbbd669f00" dependencies = [ "async-channel", "async-io", @@ -205,8 +205,7 @@ dependencies = [ "cfg-if", "event-listener", "futures-lite", - "rustix 0.38.44", - "tracing", + "rustix 1.0.7", ] [[package]] @@ -222,9 +221,9 @@ dependencies = [ [[package]] name = "async-signal" -version = "0.2.10" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "637e00349800c0bdf8bfc21ebbc0b6524abea702b0da4168ac00d070d0c0b9f3" +checksum = "f567af260ef69e1d52c2b560ce0ea230763e6fbb9214a85d768760a920e3e3c1" dependencies = [ "async-io", "async-lock", @@ -232,10 +231,10 @@ dependencies = [ "cfg-if", "futures-core", "futures-io", - "rustix 0.38.44", + "rustix 1.0.7", "signal-hook-registry", "slab", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -246,9 +245,9 @@ checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" [[package]] name = "async-trait" -version = "0.1.86" +version = "0.1.89" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "644dd749086bf3771a2fbc5f256fdb982d53f011c7d5d560304eafeecebce79d" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", @@ -263,9 +262,32 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.4.0" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "aws-lc-rs" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879b6c89592deb404ba4dc0ae6b58ffd1795c78991cbb5b8bc441c48a070440d" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +checksum = "107a4e9d9cab9963e04e84bb8dee0e25f2a987f9a8bad5ed054abd439caa8f8c" +dependencies = [ + "bindgen", + "cc", + "cmake", + "dunce", + "fs_extra", +] [[package]] name = "backtrace" @@ -282,6 +304,41 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.9.4", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.1", + "shlex", + "syn", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitfield-struct" version = "0.10.1" @@ -301,9 +358,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.0" +version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" [[package]] name = "block" @@ -353,13 +410,24 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.2.27" +version = "1.2.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" +checksum = "42bc4aea80032b7bf409b0bc7ccad88853858911b7713a8062fdc0623867bedc" dependencies = [ + "jobserver", + "libc", "shlex", ] +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -372,20 +440,31 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading", +] + [[package]] name = "clap" -version = "4.5.13" +version = "4.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fbb260a053428790f3de475e304ff84cdbc4face759ea7a3e64c1edd938a7fc" +checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.13" +version = "4.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64b17d7ea74e9f833c7dbf2cbe4fb12ff26783eda4782a8975b72f895c9b4d99" +checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6" dependencies = [ "anstream", "anstyle", @@ -402,13 +481,14 @@ checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cloud-hypervisor" -version = "46.0.0" +version = "48.0.0" dependencies = [ "anyhow", "api_client", "clap", "dhat", "dirs", + "env_logger", "epoll", "event_monitor", "hypervisor", @@ -416,6 +496,7 @@ dependencies = [ "log", "net_util", "option_parser", + "rustls", "seccompiler", "serde_json", "signal-hook", @@ -430,6 +511,15 @@ dependencies = [ "zbus", ] +[[package]] +name = "cmake" +version = "0.1.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7caa3f9de89ddbe2c607f4101924c5abec803763ae9534e4f4d7d8f84aa81f0" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.3" @@ -527,11 +617,13 @@ dependencies = [ "acpi_tables", "anyhow", "arch", - "bitflags 2.9.0", + "bitfield-struct", + "bitflags 2.9.4", "byteorder", "event_monitor", "hypervisor", "libc", + "linux-loader", "log", "num_enum", "pci", @@ -543,6 +635,7 @@ dependencies = [ "vm-memory", "vm-migration", "vmm-sys-util", + "zerocopy 0.8.26", ] [[package]] @@ -554,8 +647,8 @@ dependencies = [ "backtrace", "lazy_static", "mintex", - "parking_lot 0.12.1", - "rustc-hash", + "parking_lot", + "rustc-hash 1.1.0", "serde", "serde_json", "thousands", @@ -579,9 +672,21 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "endi" version = "1.1.0" @@ -590,9 +695,9 @@ checksum = "a3d8a32ae18130a3c84dd492d4215c3d913c3b07c6b63c2eb3eb7ff1101ab7bf" [[package]] name = "enumflags2" -version = "0.7.10" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d232db7f5956f3f14313dc2f87985c58bd2c695ce124c8cdd984e08e15ac133d" +checksum = "1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef" dependencies = [ "enumflags2_derive", "serde", @@ -600,9 +705,9 @@ dependencies = [ [[package]] name = "enumflags2_derive" -version = "0.7.10" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0d48a183585823424a4ce1aa132d174a6a81bd540895822eb4c8373a8e49e8" +checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827" dependencies = [ "proc-macro2", "quote", @@ -621,14 +726,14 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.3" 
+version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" +checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" dependencies = [ "anstream", "anstyle", "env_filter", - "humantime", + "jiff", "log", ] @@ -638,7 +743,7 @@ version = "4.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74351c3392ea1ff6cd2628e0042d268ac2371cb613252ff383b6dfa50d22fa79" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "libc", ] @@ -650,19 +755,19 @@ checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "errno" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" +checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "event-listener" -version = "5.4.0" +version = "5.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3492acde4c3fc54c845eaab3eed8bd00c7a7d881f78bfc801e43a93dec1331ae" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" dependencies = [ "concurrent-queue", "parking", @@ -719,6 +824,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "futures" version = "0.3.31" @@ -823,11 +934,11 @@ dependencies = [ [[package]] name = "gdbstub" -version = "0.7.1" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6341b3480afbb34eaefc7f92713bc92f2d83e338aaa1c44192f9c2956f4a4903" +checksum = "71d66e32caf5dd59f561be0143e413e01d651bd8498eb9aa0be8c482c81c8d31" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "cfg-if", "log", "managed", @@ -837,9 +948,9 @@ dependencies = [ [[package]] name = "gdbstub_arch" -version = "0.3.0" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3b1357bd3203fc09a6601327ae0ab38865d14231d0b65d3143f5762cc7977d" +checksum = "22dde0e1b68787036ccedd0b1ff6f953527a0e807e571fbe898975203027278f" dependencies = [ "gdbstub", "num-traits", @@ -900,12 +1011,6 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" -[[package]] -name = "humantime" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - [[package]] name = "hypervisor" version = "0.1.0" @@ -954,8 +1059,8 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "igvm" -version = "0.3.4" -source = "git+https://github.com/microsoft/igvm?branch=main#01daa631a596459cb4de58505881007dd13d4410" +version = "0.4.0" +source = "git+https://github.com/microsoft/igvm?branch=main#dff4ebc9c5bd16707ff75de26ccabe2d4dfdbcd8" dependencies = [ "bitfield-struct", "crc32fast", @@ -971,8 +1076,8 @@ dependencies = [ [[package]] name = "igvm_defs" -version = "0.3.4" -source = 
"git+https://github.com/microsoft/igvm?branch=main#01daa631a596459cb4de58505881007dd13d4410" +version = "0.4.0" +source = "git+https://github.com/microsoft/igvm?branch=main#dff4ebc9c5bd16707ff75de26ccabe2d4dfdbcd8" dependencies = [ "bitfield-struct", "open-enum", @@ -990,22 +1095,14 @@ dependencies = [ "hashbrown", ] -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", -] - [[package]] name = "io-uring" -version = "0.6.4" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "595a0399f411a508feb2ec1e970a4a30c249351e30208960d58298de8660b0e5" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.9.4", + "cfg-if", "libc", ] @@ -1024,12 +1121,64 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jiff" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.3", + "libc", +] + [[package]] name = "js-sys" version = "0.3.77" @@ -1042,22 +1191,22 @@ dependencies = [ [[package]] name = "kvm-bindings" -version = "0.10.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4933174d0cc4b77b958578cd45784071cc5ae212c2d78fbd755aaaa6dfa71a" +checksum = "9a537873e15e8daabb416667e606d9b0abc2a8fb9a45bd5853b888ae0ead82f9" dependencies = [ "serde", "vmm-sys-util", - "zerocopy 0.7.35", + "zerocopy 0.8.26", ] [[package]] name = "kvm-ioctls" -version = "0.19.1" +version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e013ae7fcd2c6a8f384104d16afe7ea02969301ea2bb2a56e44b011ebc907cab" +checksum = "0c8f7370330b4f57981e300fa39b02088f2f2a5c2d0f1f994e8090589619c56d" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "kvm-bindings", "libc", "vmm-sys-util", @@ -1065,13 +1214,13 @@ dependencies = [ [[package]] name = "landlock" -version = "0.4.0" +version = "0.4.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "dafb8a4afee64f167eb2b52d32f0eea002e41a7a6450e68c799c8ec3a81a634c" +checksum = "b3d2ef408b88e913bfc6594f5e693d57676f6463ded7d8bf994175364320c706" dependencies = [ "enumflags2", "libc", - "thiserror 1.0.62", + "thiserror 2.0.12", ] [[package]] @@ -1086,21 +1235,31 @@ version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libredox" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "libc", ] [[package]] name = "libssh2-sys" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dc8a030b787e2119a731f1951d6a773e2280c660f8ec4b0f5e1505a386e71ee" +checksum = "220e4f05ad4a218192533b300327f5150e809b54c4ec83b5a1d91833601811b9" dependencies = [ "cc", "libc", @@ -1125,8 +1284,7 @@ dependencies = [ [[package]] name = "linux-loader" version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "870c3814345f050991f99869417779f6062542bcf4ed81db7a1b926ad1306638" +source = "git+https://github.com/rust-vmm/linux-loader?branch=main#5fdaed87ddafc89d6abf0b50195a12d19133000d" dependencies = [ "vm-memory", ] @@ -1183,32 +1341,38 @@ dependencies = [ [[package]] name = "micro_http" version = "0.1.0" -source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#4f621532e81ee2ad096a9c9592fdacc40d19de48" +source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#bf5098916006912f8dd35aaa6daa5579c6c297b2" dependencies = [ "libc", "vmm-sys-util", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", ] [[package]] name = "mintex" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bec4598fddb13cc7b528819e697852653252b760f1228b7642679bf2ff2cd07" +checksum = "c505b3e17ed6b70a7ed2e67fbb2c560ee327353556120d6e72f5232b6880d536" [[package]] name = "mshv-bindings" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "909de5fd4a5a3347a6c62872f6816e6279efd8615a753f10a3bc4daaef8a72ef" +checksum = "805cf329582f770f62cc612716a04c14815276ae266b6298375a672d3c5a5184" dependencies = [ "libc", "num_enum", @@ -1220,9 +1384,9 @@ dependencies = [ [[package]] name = "mshv-ioctls" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7d94972588d562bd349b916de6a43f2ee268e6e9c91cfb5b30549ed4ea2751" +checksum = 
"aefaab4c067cf5226a917227640d835327b25b71a8d465f815f74f490344e10a" dependencies = [ "libc", "mshv-bindings", @@ -1274,7 +1438,7 @@ version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "cfg-if", "cfg_aliases", "libc", @@ -1287,6 +1451,16 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43794a0ace135be66a25d3ae77d41b91615fb68ae937f904090203e81f755b65" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1403,17 +1577,6 @@ version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" -[[package]] -name = "parking_lot" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.6", -] - [[package]] name = "parking_lot" version = "0.12.1" @@ -1421,21 +1584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core 0.9.9", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall 0.2.16", - "smallvec", - "winapi", + "parking_lot_core", ] [[package]] @@ -1446,7 +1595,7 @@ checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.4.1", + "redox_syscall", "smallvec", "windows-targets 0.48.5", ] @@ -1488,7 +1637,6 @@ dependencies = [ "serde_json", "test_infra", "thiserror 2.0.12", - "wait-timeout", ] [[package]] @@ -1626,6 +1774,21 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "ppv-lite86" version = "0.2.20" @@ -1635,24 +1798,59 @@ dependencies = [ "zerocopy 0.7.35", ] +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-crate" -version = "3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" +checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35" dependencies = [ "toml_edit", ] [[package]] name = "proc-macro2" -version = 
"1.0.95" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" dependencies = [ "unicode-ident", ] +[[package]] +name = "proptest" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee689443a2bd0a16ab0348b52ee43e3b2d1b1f931c8aa5c9f8de4c86fbe8c40" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.9.4", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.40" @@ -1664,15 +1862,15 @@ dependencies = [ [[package]] name = "r-efi" -version = "5.2.0" +version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] name = "rand" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha", "rand_core", @@ -1697,6 +1895,15 @@ dependencies = [ "getrandom 0.3.3", ] +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + [[package]] name = "range_map_vec" version = "0.2.0" @@ -1714,15 +1921,6 @@ dependencies = [ "vmm-sys-util", ] -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -1734,9 +1932,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.5.0" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.15", "libredox", @@ -1783,11 +1981,25 @@ dependencies = [ "syn", ] +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.15", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" [[package]] name = "rustc-hash" @@ -1795,13 +2007,19 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 
+[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustix" version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "errno", "libc", "linux-raw-sys 0.4.15", @@ -1814,18 +2032,66 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "errno", "libc", "linux-raw-sys 0.9.4", "windows-sys 0.59.0", ] +[[package]] +name = "rustls" +version = "0.23.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a9586e9ee2b4f8fab52a0048ca7334d7024eef48e2cb9407e3497bb7cab7fa7" +dependencies = [ + "aws-lc-rs", + "log", + "once_cell", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10b3f4191e8a80e6b43eebabfac91e5dcecebb27a71f04e820c47ec41d314bf" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] [[package]] name = "ryu" @@ -1870,11 +2136,12 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.120" +version = "1.0.143" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -1892,9 +2159,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857" +checksum = "f2c45cd61fefa9db6f254525d46e392b852e0e61d9a1fd36e5bd183450a556d5" dependencies = [ "serde", "serde_derive", @@ -1903,9 +2170,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350" +checksum = "de90945e6565ce0d9a25098082ed4ee4002e047cb59892c318d66821e14bb30f" dependencies = [ "darling", "proc-macro2", @@ -1935,27 +2202,24 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.2" +version = "1.4.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" dependencies = [ "libc", ] [[package]] name = "slab" -version = "0.4.9" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "smallvec" -version = "1.13.2" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "spin" @@ -1968,14 +2232,14 @@ dependencies = [ [[package]] name = "ssh2" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7fe461910559f6d5604c3731d00d2aafc4a83d1665922e280f42f9a168d5455" +checksum = "2f84d13b3b8a0d4e91a2629911e951db1bb8671512f5c09d7d4ba34500ba68c8" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.9.4", "libc", "libssh2-sys", - "parking_lot 0.11.2", + "parking_lot", ] [[package]] @@ -1990,11 +2254,17 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" -version = "2.0.104" +version = "2.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" dependencies = [ "proc-macro2", "quote", @@ -2016,12 +2286,12 @@ dependencies = [ [[package]] name = "terminal_size" -version = "0.3.0" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0" dependencies = [ - "rustix 0.38.44", - "windows-sys 0.48.0", + "rustix 1.0.7", + "windows-sys 0.60.2", ] [[package]] @@ -2031,7 +2301,6 @@ dependencies = [ "dirs", "epoll", "libc", - "serde", "serde_json", "ssh2", "thiserror 2.0.12", @@ -2087,9 +2356,9 @@ checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" [[package]] name = "toml_datetime" -version = "0.6.8" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" [[package]] name = "toml_edit" @@ -2107,7 +2376,6 @@ name = "tpm" version = "0.1.0" dependencies = [ "anyhow", - "byteorder", "libc", "log", "net_gen", @@ -2167,12 +2435,24 @@ dependencies = [ "winapi", ] +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "utf8parse" version = "0.2.2" @@ -2181,28 +2461,16 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.17.0" +version = "1.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" +checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ "getrandom 0.3.3", "js-sys", "rand", - "uuid-macro-internal", "wasm-bindgen", ] -[[package]] -name = "uuid-macro-internal" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b682e8c381995ea03130e381928e0e005b7c9eb483c6c8682f50e07b33c2b7" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "vcpkg" version = "0.2.15" @@ -2211,16 +2479,18 @@ checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" [[package]] name = "vfio-bindings" -version = "0.4.0" -source = "git+https://github.com/rust-vmm/vfio?branch=main#3d158a14460cac7ca3c99c2effa0a46880935cb0" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "698c66a4522a31ab407a410a59c9660da036178e4fe3f371825cd6aad7d46837" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.4.0" -source = "git+https://github.com/rust-vmm/vfio?branch=main#3d158a14460cac7ca3c99c2effa0a46880935cb0" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7af7e8d49719333e5eb52209417f26695c9ab2b117a82596a63a44947f97c5d6" dependencies = [ "byteorder", "kvm-bindings", @@ -2229,7 +2499,7 @@ dependencies = [ "log", "mshv-bindings", "mshv-ioctls", - "thiserror 1.0.62", + "thiserror 2.0.12", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -2237,16 +2507,17 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.0" -source = "git+https://github.com/rust-vmm/vfio-user?branch=main#3febcdd3fa2531623865663ca1721e1962ed9979" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8db5bc783aad75202ad4cbcdc5e893cff1dd8fa24a1bcdb4de8998d3c4d169a" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.9.4", "libc", "log", "serde", "serde_derive", "serde_json", - "thiserror 1.0.62", + "thiserror 2.0.12", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -2254,10 +2525,11 @@ dependencies = [ [[package]] name = "vhost" -version = "0.12.1" -source = "git+https://github.com/rust-vmm/vhost?rev=d983ae0#d983ae07f78663b7d24059667376992460b571a2" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a4dcad85a129d97d5d4b2f3c47a4affdeedd76bdcd02094bcb5d9b76cac2d05" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "libc", "uuid", "vm-memory", @@ -2266,8 +2538,9 @@ dependencies = [ [[package]] name = "vhost-user-backend" -version = "0.16.1" -source = "git+https://github.com/rust-vmm/vhost?rev=d983ae0#d983ae07f78663b7d24059667376992460b571a2" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e183205a9ba7cb9c47fcb0fc0a07fc295a110efbb11ab78ad0d793b0a38a7bde" dependencies = [ "libc", "log", @@ -2285,7 +2558,6 @@ 
dependencies = [ "block", "clap", "env_logger", - "epoll", "libc", "log", "option_parser", @@ -2319,16 +2591,15 @@ dependencies = [ [[package]] name = "virtio-bindings" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1711e61c00f8cb450bd15368152a1e37a12ef195008ddc7d0f4812f9e2b30a68" +checksum = "804f498a26d5a63be7bbb8bdcd3869c3f286c4c4a17108905276454da0caf8cb" [[package]] name = "virtio-devices" version = "0.1.0" dependencies = [ "anyhow", - "arc-swap", "block", "byteorder", "epoll", @@ -2336,13 +2607,11 @@ dependencies = [ "libc", "log", "mshv-ioctls", - "net_gen", "net_util", "pci", "rate_limiter", "seccompiler", "serde", - "serde_json", "serde_with", "serial_buffer", "thiserror 2.0.12", @@ -2359,9 +2628,9 @@ dependencies = [ [[package]] name = "virtio-queue" -version = "0.14.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872e2f3fbd70a7e6f01689720cce3d5c2c5efe52b484dd07b674246ada0e9a8d" +checksum = "fb0479158f863e59323771a1f684d843962f76960b86fecfec2bfa9c8f0f9180" dependencies = [ "log", "virtio-bindings", @@ -2382,7 +2651,6 @@ dependencies = [ name = "vm-device" version = "0.1.0" dependencies = [ - "anyhow", "hypervisor", "serde", "thiserror 2.0.12", @@ -2398,9 +2666,9 @@ source = "git+https://github.com/rust-vmm/vm-fdt?branch=main#ef5bd734f5f66fb0772 [[package]] name = "vm-memory" -version = "0.16.1" +version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1720e7240cdc739f935456eb77f370d7e9b2a3909204da1e2b47bef1137a013" +checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" dependencies = [ "arc-swap", "libc", @@ -2413,6 +2681,8 @@ name = "vm-migration" version = "0.1.0" dependencies = [ "anyhow", + "itertools 0.14.0", + "rustls", "serde", "serde_json", "thiserror 2.0.12", @@ -2423,7 +2693,6 @@ dependencies = [ name = "vm-virtio" version = "0.1.0" dependencies = [ - "log", "virtio-queue", "vm-memory", ] @@ -2434,9 +2703,8 @@ version = "0.1.0" dependencies = [ "acpi_tables", "anyhow", - "arc-swap", "arch", - "bitflags 2.9.0", + "bitflags 2.9.4", "block", "blocking", "cfg-if", @@ -2453,6 +2721,7 @@ dependencies = [ "hypervisor", "igvm", "igvm_defs", + "kvm-bindings", "landlock", "libc", "linux-loader", @@ -2474,9 +2743,9 @@ dependencies = [ "uuid", "vfio-ioctls", "vfio_user", + "vhost", "virtio-bindings", "virtio-devices", - "virtio-queue", "vm-allocator", "vm-device", "vm-memory", @@ -2489,9 +2758,9 @@ dependencies = [ [[package]] name = "vmm-sys-util" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1435039746e20da4f8d507a72ee1b916f7b4b05af7a91c093d2c6561934ede" +checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789" dependencies = [ "bitflags 1.3.2", "libc", @@ -2501,9 +2770,9 @@ dependencies = [ [[package]] name = "wait-timeout" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" dependencies = [ "libc", ] @@ -2604,13 +2873,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "windows-sys" -version = "0.48.0" +name = "windows-link" +version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", -] +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" [[package]] name = "windows-sys" @@ -2630,6 +2896,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -2654,13 +2929,29 @@ dependencies = [ "windows_aarch64_gnullvm 0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -2673,6 +2964,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -2685,6 +2982,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -2697,12 +3000,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -2715,6 +3030,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = 
"0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -2727,6 +3048,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -2739,6 +3066,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -2751,6 +3084,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.2" @@ -2766,7 +3105,7 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", ] [[package]] @@ -2870,11 +3209,17 @@ dependencies = [ "syn", ] +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + [[package]] name = "zvariant" -version = "5.5.3" +version = "5.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d30786f75e393ee63a21de4f9074d4c038d52c5b1bb4471f955db249f9dffb1" +checksum = "999dd3be73c52b1fccd109a4a81e4fcd20fab1d3599c8121b38d04e1419498db" dependencies = [ "endi", "enumflags2", @@ -2886,9 +3231,9 @@ dependencies = [ [[package]] name = "zvariant_derive" -version = "5.5.3" +version = "5.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75fda702cd42d735ccd48117b1630432219c0e9616bf6cb0f8350844ee4d9580" +checksum = "6643fd0b26a46d226bd90d3f07c1b5321fe9bb7f04673cb37ac6d6883885b68e" dependencies = [ "proc-macro-crate", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index fefbd227e4..b9599a045e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,11 +3,11 @@ authors = ["The Cloud Hypervisor Authors"] build = "build.rs" default-run = "cloud-hypervisor" description = "Open source Virtual Machine Monitor (VMM) that runs on top of KVM & MSHV" -edition = "2021" +edition = "2024" homepage = "https://github.com/cloud-hypervisor/cloud-hypervisor" license = "Apache-2.0 AND BSD-3-Clause" name = "cloud-hypervisor" -version = "46.0.0" +version = "48.0.0" # Minimum buildable version: # Keep in sync with version in .github/workflows/build.yaml # Policy on MSRV (see #4318): @@ -15,13 +15,26 @@ version = "46.0.0" # a.) A dependency requires it, # b.) 
If we want to use a new feature and that MSRV is at least 6 months old, # c.) There is a security issue that is addressed by the toolchain update. -rust-version = "1.83.0" +rust-version = "1.88.0" [profile.release] codegen-units = 1 lto = true opt-level = "s" -strip = true + +# Tradeof between performance and fast compilation times for local testing and +# development with frequent rebuilds. +[profile.optimized-dev] +codegen-units = 16 +inherits = "release" +lto = false +opt-level = 2 +strip = false + +# Optimize more for dependencies: They don't require frequent rebuilds. +[profile.optimized-dev.package."*"] +codegen-units = 1 +opt-level = 3 [profile.profiling] debug = true @@ -29,19 +42,21 @@ inherits = "release" strip = false [dependencies] -anyhow = "1.0.94" +anyhow = { workspace = true } api_client = { path = "api_client" } -clap = { version = "4.5.13", features = ["string"] } -dhat = { version = "0.3.3", optional = true } -epoll = "4.3.3" +clap = { workspace = true, features = ["string"] } +dhat = { workspace = true, optional = true } +env_logger = { workspace = true } +epoll = { workspace = true } event_monitor = { path = "event_monitor" } hypervisor = { path = "hypervisor" } -libc = "0.2.167" -log = { version = "0.4.22", features = ["std"] } +libc = { workspace = true } +log = { workspace = true, features = ["std"] } option_parser = { path = "option_parser" } +rustls = { workspace = true } seccompiler = { workspace = true } serde_json = { workspace = true } -signal-hook = "0.3.18" +signal-hook = { workspace = true } thiserror = { workspace = true } tpm = { path = "tpm" } tracer = { path = "tracer" } @@ -51,11 +66,11 @@ vmm-sys-util = { workspace = true } zbus = { version = "5.7.1", optional = true } [dev-dependencies] -dirs = "6.0.0" +dirs = { workspace = true } net_util = { path = "net_util" } serde_json = { workspace = true } test_infra = { path = "test_infra" } -wait-timeout = "0.2.0" +wait-timeout = { workspace = true } # Please adjust `vmm::feature_list()` accordingly when changing the # feature list below @@ -63,9 +78,11 @@ wait-timeout = "0.2.0" dbus_api = ["vmm/dbus_api", "zbus"] default = ["io_uring", "kvm"] dhat-heap = ["dhat", "vmm/dhat-heap"] # For heap profiling +fw_cfg = ["vmm/fw_cfg"] guest_debug = ["vmm/guest_debug"] igvm = ["mshv", "vmm/igvm"] io_uring = ["vmm/io_uring"] +ivshmem = ["vmm/ivshmem"] kvm = ["vmm/kvm"] mshv = ["vmm/mshv"] pvmemcontrol = ["vmm/pvmemcontrol"] @@ -73,6 +90,9 @@ sev_snp = ["igvm", "mshv", "vmm/sev_snp"] tdx = ["vmm/tdx"] tracing = ["tracer/tracing", "vmm/tracing"] +[lints] +workspace = true + [workspace] members = [ "api_client", @@ -99,26 +119,28 @@ members = [ "vm-virtio", "vmm", ] +package.edition = "2024" [workspace.dependencies] # rust-vmm crates acpi_tables = { git = "https://github.com/rust-vmm/acpi_tables", branch = "main" } -kvm-bindings = "0.10.0" -kvm-ioctls = "0.19.1" -linux-loader = "0.13.0" -mshv-bindings = "0.5.1" -mshv-ioctls = "0.5.1" +kvm-bindings = "0.12.1" +kvm-ioctls = "0.22.1" +# TODO: update to 0.13.1+ +linux-loader = { git = "https://github.com/rust-vmm/linux-loader", branch = "main" } +mshv-bindings = "0.6.0" +mshv-ioctls = "0.6.0" seccompiler = "0.5.0" -vfio-bindings = { git = "https://github.com/rust-vmm/vfio", branch = "main" } -vfio-ioctls = { git = "https://github.com/rust-vmm/vfio", branch = "main", default-features = false } -vfio_user = { git = "https://github.com/rust-vmm/vfio-user", branch = "main" } -vhost = { git = "https://github.com/rust-vmm/vhost", rev = "d983ae0" } -vhost-user-backend = { git = 
"https://github.com/rust-vmm/vhost", rev = "d983ae0" } -virtio-bindings = "0.2.4" -virtio-queue = "0.14.0" +vfio-bindings = { version = "0.6.0", default-features = false } +vfio-ioctls = { version = "0.5.1", default-features = false } +vfio_user = { version = "0.1.1", default-features = false } +vhost = { version = "0.14.0", default-features = false } +vhost-user-backend = { version = "0.20.0", default-features = false } +virtio-bindings = "0.2.6" +virtio-queue = "0.16.0" vm-fdt = { git = "https://github.com/rust-vmm/vm-fdt", branch = "main" } vm-memory = "0.16.1" -vmm-sys-util = "0.12.1" +vmm-sys-util = "0.14.0" # igvm crates # TODO: bump to 0.3.5 release @@ -126,9 +148,47 @@ igvm = { git = "https://github.com/microsoft/igvm", branch = "main" } igvm_defs = { git = "https://github.com/microsoft/igvm", branch = "main" } # serde crates -serde_json = "1.0.120" +serde = "1.0.208" +serde_json = "1.0.143" +serde_with = { version = "3.14.0", default-features = false } # other crates +anyhow = "1.0.99" +bitflags = "2.9.4" +byteorder = "1.5.0" +cfg-if = "1.0.0" +clap = "4.5.47" +dhat = "0.3.3" +dirs = "6.0.0" +env_logger = "0.11.8" +epoll = "4.3.3" +flume = "0.11.1" +itertools = "0.14.0" +libc = "0.2.167" +log = "0.4.22" +rustls = "0.23.34" +signal-hook = "0.3.18" thiserror = "2.0.12" -uuid = { version = "1.17.0" } +uuid = { version = "1.18.1" } +wait-timeout = "0.2.1" zerocopy = { version = "0.8.26", default-features = false } + +[workspace.lints.rust] +# `level = warn` is irrelevant here but mandatory for rustc/cargo +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(devcli_testenv)'] } + +[workspace.lints.clippy] +# Any clippy lint (group) in alphabetical order: +# https://rust-lang.github.io/rust-clippy/master/index.html + +# Groups +all = "deny" # shorthand for the other groups but here for compleness +complexity = "deny" +correctness = "deny" +perf = "deny" +style = "deny" +suspicious = "deny" + +# Individual Lints +assertions_on_result_states = "deny" +undocumented_unsafe_blocks = "deny" diff --git a/README.md b/README.md index 4609903d0f..fdb18255f0 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ interface will be enabled as per `network-config` details. $ sudo setcap cap_net_admin+ep ./cloud-hypervisor $ ./create-cloud-init.sh $ ./cloud-hypervisor \ - --kernel ./hypervisor-fw \ + --firmware ./hypervisor-fw \ --disk path=focal-server-cloudimg-amd64.raw path=/tmp/ubuntu-cloudinit.img \ --cpus boot=4 \ --memory size=1024M \ @@ -175,6 +175,18 @@ $ ./cloud-hypervisor \ --console off ``` +## Booting: `--firmware` vs `--kernel` + +The following scenarios are supported by Cloud Hypervisor to bootstrap a VM, i.e., +to load a payload/bootitem(s): + +- Provide firmware +- Provide kernel \[+ cmdline\]\ [+ initrd\] + +Please note that our Cloud Hypervisor firmware (`hypervisor-fw`) has a Xen PVH +boot entry, therefore it can also be booted via the `--kernel` parameter, as +seen in some examples. 
+ ### Custom Kernel and Disk Image #### Building your Kernel diff --git a/api_client/Cargo.toml b/api_client/Cargo.toml index 630f1b4c44..b8791dfc3d 100644 --- a/api_client/Cargo.toml +++ b/api_client/Cargo.toml @@ -1,9 +1,12 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "api_client" version = "0.1.0" [dependencies] thiserror = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/api_client/src/lib.rs b/api_client/src/lib.rs index 52e85a3367..0ee7fa1d0b 100644 --- a/api_client/src/lib.rs +++ b/api_client/src/lib.rs @@ -118,12 +118,11 @@ fn parse_http_response(socket: &mut dyn Read) -> Result, Error> { } } - if let Some(body_offset) = body_offset { - if let Some(content_length) = content_length { - if res.len() >= content_length + body_offset { - break; - } - } + if let Some(body_offset) = body_offset + && let Some(content_length) = content_length + && res.len() >= content_length + body_offset + { + break; } } let body_string = content_length.and(body_offset.map(|o| String::from(&res[o..]))); diff --git a/arch/Cargo.toml b/arch/Cargo.toml index 4c068d131f..5cc8b0fd71 100644 --- a/arch/Cargo.toml +++ b/arch/Cargo.toml @@ -1,29 +1,39 @@ [package] authors = ["The Chromium OS Authors"] -edition = "2021" +edition.workspace = true name = "arch" version = "0.1.0" [features] default = [] +fw_cfg = [] kvm = ["hypervisor/kvm"] sev_snp = [] tdx = [] [dependencies] -anyhow = "1.0.94" -byteorder = "1.5.0" +anyhow = { workspace = true } +byteorder = { workspace = true } hypervisor = { path = "../hypervisor" } -libc = "0.2.167" +libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } -log = "0.4.22" -serde = { version = "1.0.208", features = ["derive", "rc"] } +log = { workspace = true } +serde = { workspace = true, features = ["derive", "rc"] } +# We currently use this for (de-)serializing CPU profile data +serde_json = { workspace = true } thiserror = { workspace = true } uuid = { workspace = true } vm-memory = { workspace = true, features = ["backend-bitmap", "backend-mmap"] } -vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true, features = ["with-serde"] } [target.'cfg(any(target_arch = "aarch64", target_arch = "riscv64"))'.dependencies] fdt_parser = { version = "0.1.5", package = "fdt" } vm-fdt = { workspace = true } + +# Use this to test our custom serialization logic +[dev-dependencies] +proptest = "1.0.0" +serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/arch/src/aarch64/fdt.rs b/arch/src/aarch64/fdt.rs index 23df4d805a..2755012440 100644 --- a/arch/src/aarch64/fdt.rs +++ b/arch/src/aarch64/fdt.rs @@ -110,11 +110,8 @@ pub fn get_cache_size(cache_level: CacheLevel) -> u32 { let file_path = Path::new(&file_directory); if !file_path.exists() { - warn!("File: {} does not exist.", file_directory); 0 } else { - info!("File: {} exist.", file_directory); - let src = fs::read_to_string(file_directory).expect("File not exists or file corrupted."); // The content of the file is as simple as a size, like: "32K" let src = src.trim(); @@ -144,11 +141,8 @@ pub fn get_cache_coherency_line_size(cache_level: CacheLevel) -> u32 { let file_path = Path::new(&file_directory); if !file_path.exists() { - warn!("File: {} does not exist.", file_directory); 0 } else { - info!("File: {} exist.", file_directory); - let src = fs::read_to_string(file_directory).expect("File not exists or file corrupted."); 
src.trim().parse::().unwrap() } @@ -167,11 +161,8 @@ pub fn get_cache_number_of_sets(cache_level: CacheLevel) -> u32 { let file_path = Path::new(&file_directory); if !file_path.exists() { - warn!("File: {} does not exist.", file_directory); 0 } else { - info!("File: {} exist.", file_directory); - let src = fs::read_to_string(file_directory).expect("File not exists or file corrupted."); src.trim().parse::().unwrap() } @@ -195,11 +186,8 @@ pub fn get_cache_shared(cache_level: CacheLevel) -> bool { let file_path = Path::new(&file_directory); if !file_path.exists() { - warn!("File: {} does not exist.", file_directory); result = false; } else { - info!("File: {} exist.", file_directory); - let src = fs::read_to_string(file_directory).expect("File not exists or file corrupted."); let src = src.trim(); if src.is_empty() { @@ -218,7 +206,7 @@ pub fn create_fdt, - vcpu_topology: Option<(u8, u8, u8)>, + vcpu_topology: Option<(u16, u16, u16, u16)>, device_info: &HashMap<(DeviceType, String), T, S>, gic_device: &Arc>, initrd: &Option, @@ -231,8 +219,8 @@ pub fn create_fdt, guest_mem: &GuestMemoryMmap) -> R fn create_cpu_nodes( fdt: &mut FdtWriter, vcpu_mpidr: &[u64], - vcpu_topology: Option<(u8, u8, u8)>, + vcpu_topology: Option<(u16, u16, u16, u16)>, numa_nodes: &NumaNodes, ) -> FdtWriterResult<()> { // See https://github.com/torvalds/linux/blob/master/Documentation/devicetree/bindings/arm/cpus.yaml. @@ -289,8 +277,11 @@ fn create_cpu_nodes( fdt.property_u32("#size-cells", 0x0)?; let num_cpus = vcpu_mpidr.len(); - let (threads_per_core, cores_per_package, packages) = vcpu_topology.unwrap_or((1, 1, 1)); - let max_cpus: u32 = (threads_per_core * cores_per_package * packages).into(); + let (threads_per_core, cores_per_die, dies_per_package, packages) = + vcpu_topology.unwrap_or((1, 1, 1, 1)); + let cores_per_package = cores_per_die * dies_per_package; + let max_cpus: u32 = + threads_per_core as u32 * cores_per_die as u32 * dies_per_package as u32 * packages as u32; // Add cache info. // L1 Data Cache Info. @@ -322,7 +313,6 @@ fn create_cpu_nodes( if !cache_exist { warn!("cache sysfs system does not exist."); } else { - info!("cache sysfs system exists."); // L1 Data Cache Info. l1_d_cache_size = get_cache_size(CacheLevel::L1D); l1_d_cache_line_size = get_cache_coherency_line_size(CacheLevel::L1D); @@ -370,7 +360,7 @@ fn create_cpu_nodes( if numa_nodes.len() > 1 { for numa_node_idx in 0..numa_nodes.len() { let numa_node = numa_nodes.get(&(numa_node_idx as u32)); - if numa_node.unwrap().cpus.contains(&(cpu_id as u8)) { + if numa_node.unwrap().cpus.contains(&(cpu_id as u32)) { fdt.property_u32("numa-node-id", numa_node_idx as u32)?; } } @@ -423,9 +413,6 @@ fn create_cpu_nodes( fdt.end_node(l2_cache_node)?; } - if l2_cache_size != 0 && l2_cache_shared { - warn!("L2 cache shared with other cpus"); - } } fdt.end_node(cpu_node)?; @@ -462,7 +449,8 @@ fn create_cpu_nodes( } if let Some(topology) = vcpu_topology { - let (threads_per_core, cores_per_package, packages) = topology; + let (threads_per_core, cores_per_die, dies_per_package, packages) = topology; + let cores_per_package = cores_per_die * dies_per_package; let cpu_map_node = fdt.begin_node("cpu-map")?; // Create device tree nodes with regard of above mapping. 
@@ -850,6 +838,21 @@ fn create_gpio_node( Ok(()) } +// https://www.kernel.org/doc/Documentation/devicetree/bindings/arm/fw-cfg.txt +#[cfg(feature = "fw_cfg")] +fn create_fw_cfg_node( + fdt: &mut FdtWriter, + dev_info: &T, +) -> FdtWriterResult<()> { + // FwCfg node + let fw_cfg_node = fdt.begin_node(&format!("fw-cfg@{:x}", dev_info.addr()))?; + fdt.property("compatible", b"qemu,fw-cfg-mmio\0")?; + fdt.property_array_u64("reg", &[dev_info.addr(), dev_info.length()])?; + fdt.end_node(fw_cfg_node)?; + + Ok(()) +} + fn create_devices_node( fdt: &mut FdtWriter, dev_info: &HashMap<(DeviceType, String), T, S>, @@ -865,6 +868,8 @@ fn create_devices_node { ordered_virtio_device.push(info); } + #[cfg(feature = "fw_cfg")] + DeviceType::FwCfg => create_fw_cfg_node(fdt, info)?, } } @@ -994,39 +999,39 @@ fn create_pci_nodes( fdt.property_array_u32("msi-map", &msi_map)?; fdt.property_u32("msi-parent", MSI_PHANDLE)?; - if pci_device_info_elem.pci_segment_id == 0 { - if let Some(virtio_iommu_bdf) = virtio_iommu_bdf { - // See kernel document Documentation/devicetree/bindings/pci/pci-iommu.txt - // for 'iommu-map' attribute setting. - let iommu_map = [ - 0_u32, - VIRTIO_IOMMU_PHANDLE, - 0_u32, - virtio_iommu_bdf, - virtio_iommu_bdf + 1, - VIRTIO_IOMMU_PHANDLE, - virtio_iommu_bdf + 1, - 0xffff - virtio_iommu_bdf, - ]; - fdt.property_array_u32("iommu-map", &iommu_map)?; - - // See kernel document Documentation/devicetree/bindings/virtio/iommu.txt - // for virtio-iommu node settings. - let virtio_iommu_node_name = format!("virtio_iommu@{virtio_iommu_bdf:x}"); - let virtio_iommu_node = fdt.begin_node(&virtio_iommu_node_name)?; - fdt.property_u32("#iommu-cells", 1)?; - fdt.property_string("compatible", "virtio,pci-iommu")?; - - // 'reg' is a five-cell address encoded as - // (phys.hi phys.mid phys.lo size.hi size.lo). phys.hi should contain the - // device's BDF as 0b00000000 bbbbbbbb dddddfff 00000000. The other cells - // should be zero. - let reg = [virtio_iommu_bdf << 8, 0_u32, 0_u32, 0_u32, 0_u32]; - fdt.property_array_u32("reg", ®)?; - fdt.property_u32("phandle", VIRTIO_IOMMU_PHANDLE)?; - - fdt.end_node(virtio_iommu_node)?; - } + if pci_device_info_elem.pci_segment_id == 0 + && let Some(virtio_iommu_bdf) = virtio_iommu_bdf + { + // See kernel document Documentation/devicetree/bindings/pci/pci-iommu.txt + // for 'iommu-map' attribute setting. + let iommu_map = [ + 0_u32, + VIRTIO_IOMMU_PHANDLE, + 0_u32, + virtio_iommu_bdf, + virtio_iommu_bdf + 1, + VIRTIO_IOMMU_PHANDLE, + virtio_iommu_bdf + 1, + 0xffff - virtio_iommu_bdf, + ]; + fdt.property_array_u32("iommu-map", &iommu_map)?; + + // See kernel document Documentation/devicetree/bindings/virtio/iommu.txt + // for virtio-iommu node settings. + let virtio_iommu_node_name = format!("virtio_iommu@{virtio_iommu_bdf:x}"); + let virtio_iommu_node = fdt.begin_node(&virtio_iommu_node_name)?; + fdt.property_u32("#iommu-cells", 1)?; + fdt.property_string("compatible", "virtio,pci-iommu")?; + + // 'reg' is a five-cell address encoded as + // (phys.hi phys.mid phys.lo size.hi size.lo). phys.hi should contain the + // device's BDF as 0b00000000 bbbbbbbb dddddfff 00000000. The other cells + // should be zero. 
+ let reg = [virtio_iommu_bdf << 8, 0_u32, 0_u32, 0_u32, 0_u32]; + fdt.property_array_u32("reg", ®)?; + fdt.property_u32("phandle", VIRTIO_IOMMU_PHANDLE)?; + + fdt.end_node(virtio_iommu_node)?; } fdt.end_node(pci_node)?; diff --git a/arch/src/aarch64/mod.rs b/arch/src/aarch64/mod.rs index 51f51ccaf6..f98942b83a 100644 --- a/arch/src/aarch64/mod.rs +++ b/arch/src/aarch64/mod.rs @@ -15,7 +15,7 @@ use std::sync::{Arc, Mutex}; use hypervisor::arch::aarch64::gic::Vgic; use hypervisor::arch::aarch64::regs::MPIDR_EL1; -use log::{log_enabled, Level}; +use log::{Level, log_enabled}; use thiserror::Error; use vm_memory::{Address, GuestAddress, GuestMemory, GuestMemoryAtomic}; @@ -67,7 +67,7 @@ pub struct EntryPoint { /// Configure the specified VCPU, and return its MPIDR. pub fn configure_vcpu( vcpu: &Arc, - id: u8, + id: u32, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, ) -> super::Result { if let Some((kernel_entry_point, _guest_memory)) = boot_setup { @@ -126,7 +126,7 @@ pub fn configure_system, - vcpu_topology: Option<(u8, u8, u8)>, + vcpu_topology: Option<(u16, u16, u16, u16)>, device_info: &HashMap<(DeviceType, String), T, S>, initrd: &Option, pci_space_info: &[PciSpaceInfo], diff --git a/arch/src/lib.rs b/arch/src/lib.rs index 333a65d9c4..2a298d0ba3 100644 --- a/arch/src/lib.rs +++ b/arch/src/lib.rs @@ -12,14 +12,16 @@ extern crate log; use std::collections::BTreeMap; +use std::str::FromStr; use std::sync::Arc; use std::{fmt, result}; +use serde::de::IntoDeserializer; use serde::{Deserialize, Serialize}; use thiserror::Error; #[cfg(target_arch = "x86_64")] -use crate::x86_64::SgxEpcSection; +pub use crate::x86_64::cpu_profile::CpuProfile; type GuestMemoryMmap = vm_memory::GuestMemoryMmap; type GuestRegionMmap = vm_memory::GuestRegionMmap; @@ -59,6 +61,31 @@ pub enum Error { /// Type for returning public functions outcome. pub type Result = result::Result; +// If the target_arch is x86_64 we import CpuProfile from the x86_64 module, otherwise we +// declare it here. +#[cfg(not(target_arch = "x86_64"))] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +/// A [`CpuProfile`] is a mechanism for ensuring live migration compatibility +/// between host's with potentially different CPU models. +pub enum CpuProfile { + #[default] + Host, +} + +impl FromStr for CpuProfile { + type Err = serde::de::value::Error; + fn from_str(s: &str) -> result::Result { + // Should accept both plain strings, and strings surrounded by `"`. + let normalized = s + .strip_prefix('"') + .unwrap_or(s) + .strip_suffix('"') + .unwrap_or(s); + Self::deserialize(normalized.into_deserializer()) + } +} + /// Type for memory region types. #[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)] pub enum RegionType { @@ -84,9 +111,9 @@ pub mod aarch64; #[cfg(target_arch = "aarch64")] pub use aarch64::{ - arch_memory_regions, configure_system, configure_vcpu, fdt::DeviceInfoForFdt, - get_host_cpu_phys_bits, initramfs_load_addr, layout, layout::CMDLINE_MAX_SIZE, - layout::IRQ_BASE, uefi, EntryPoint, _NSIG, + _NSIG, EntryPoint, arch_memory_regions, configure_system, configure_vcpu, + fdt::DeviceInfoForFdt, get_host_cpu_phys_bits, initramfs_load_addr, layout, + layout::CMDLINE_MAX_SIZE, layout::IRQ_BASE, uefi, }; /// Module for riscv64 related functionality. 
@@ -95,9 +122,9 @@ pub mod riscv64; #[cfg(target_arch = "riscv64")] pub use riscv64::{ - arch_memory_regions, configure_system, configure_vcpu, fdt::DeviceInfoForFdt, - get_host_cpu_phys_bits, initramfs_load_addr, layout, layout::CMDLINE_MAX_SIZE, - layout::IRQ_BASE, EntryPoint, _NSIG, + _NSIG, EntryPoint, arch_memory_regions, configure_system, configure_vcpu, + fdt::DeviceInfoForFdt, get_host_cpu_phys_bits, initramfs_load_addr, layout, + layout::CMDLINE_MAX_SIZE, layout::IRQ_BASE, uefi, }; #[cfg(target_arch = "x86_64")] @@ -105,10 +132,9 @@ pub mod x86_64; #[cfg(target_arch = "x86_64")] pub use x86_64::{ - arch_memory_regions, configure_system, configure_vcpu, generate_common_cpuid, - generate_ram_ranges, get_host_cpu_phys_bits, initramfs_load_addr, layout, - layout::CMDLINE_MAX_SIZE, layout::CMDLINE_START, regs, CpuidConfig, CpuidFeatureEntry, - EntryPoint, _NSIG, + _NSIG, CpuidConfig, CpuidFeatureEntry, EntryPoint, arch_memory_regions, configure_system, + configure_vcpu, generate_common_cpuid, generate_ram_ranges, get_host_cpu_phys_bits, + initramfs_load_addr, layout, layout::CMDLINE_MAX_SIZE, layout::CMDLINE_START, regs, }; /// Safe wrapper for `sysconf(_SC_PAGESIZE)`. @@ -123,12 +149,10 @@ fn pagesize() -> usize { pub struct NumaNode { pub memory_regions: Vec>, pub hotplug_regions: Vec>, - pub cpus: Vec, + pub cpus: Vec, pub pci_segments: Vec, pub distances: BTreeMap, pub memory_zones: Vec, - #[cfg(target_arch = "x86_64")] - pub sgx_epc_sections: Vec, } pub type NumaNodes = BTreeMap; @@ -155,6 +179,9 @@ pub enum DeviceType { /// Device Type: GPIO. #[cfg(target_arch = "aarch64")] Gpio, + /// Device Type: fw_cfg. + #[cfg(feature = "fw_cfg")] + FwCfg, } /// Default (smallest) memory page size for the supported architectures. diff --git a/arch/src/riscv64/fdt.rs b/arch/src/riscv64/fdt.rs index 1a7e2e5f46..ee453eb2fc 100644 --- a/arch/src/riscv64/fdt.rs +++ b/arch/src/riscv64/fdt.rs @@ -119,7 +119,7 @@ fn create_cpu_nodes(fdt: &mut FdtWriter, num_cpus: u32) -> FdtWriterResult<()> { fdt.property_u32("timebase-frequency", timebase_frequency)?; for cpu_index in 0..num_cpus { - let cpu = fdt.begin_node(&format!("cpu@{:x}", cpu_index))?; + let cpu = fdt.begin_node(&format!("cpu@{cpu_index:x}"))?; fdt.property_string("device_type", "cpu")?; fdt.property_string("compatible", "riscv")?; fdt.property_string("mmu-type", "sv48")?; @@ -184,7 +184,7 @@ fn create_memory_node(fdt: &mut FdtWriter, guest_mem: &GuestMemoryMmap) -> FdtWr } let ram_start = super::layout::RAM_START.raw_value(); - let memory_node_name = format!("memory@{:x}", ram_start); + let memory_node_name = format!("memory@{ram_start:x}"); let memory_node = fdt.begin_node(&memory_node_name)?; fdt.property_string("device_type", "memory")?; fdt.property_array_u64("reg", &mem_reg_property)?; @@ -448,10 +448,7 @@ fn print_node(node: fdt_parser::node::FdtNode<'_, '_>, n_spaces: usize) { // - At first, try to convert it to CStr and print, // - If failed, print it as u32 array. 
let value_result = match CStr::from_bytes_with_nul(value) { - Ok(value_cstr) => match value_cstr.to_str() { - Ok(value_str) => Some(value_str), - Err(_e) => None, - }, + Ok(value_cstr) => value_cstr.to_str().ok(), Err(_e) => None, }; diff --git a/arch/src/riscv64/layout.rs b/arch/src/riscv64/layout.rs index 40583301c1..3ef7eddf2c 100644 --- a/arch/src/riscv64/layout.rs +++ b/arch/src/riscv64/layout.rs @@ -44,16 +44,23 @@ // | | // | APLICs | // | | +// 4 MB +---------------------------------------------------------------+ +// | UEFI flash | // 0 GB +---------------------------------------------------------------+ // // use vm_memory::GuestAddress; +/// 0x0 ~ 0x40_0000 (4 MiB) is reserved to UEFI +/// UEFI binary size is required less than 3 MiB, reserving 4 MiB is enough. +pub const UEFI_START: GuestAddress = GuestAddress(0); +pub const UEFI_SIZE: u64 = 0x040_0000; + /// AIA related devices /// See https://elixir.bootlin.com/linux/v6.10/source/arch/riscv/include/uapi/asm/kvm.h -/// 0x0 ~ 0x0400_0000 (64 MiB) resides APLICs -pub const APLIC_START: GuestAddress = GuestAddress(0); +/// 0x40_0000 ~ 0x0400_0000 (64 MiB) resides APLICs +pub const APLIC_START: GuestAddress = GuestAddress(0x40_0000); pub const APLIC_SIZE: u64 = 0x4000; /// 0x0400_0000 ~ 0x0800_0000 (64 MiB) resides IMSICs diff --git a/arch/src/riscv64/mod.rs b/arch/src/riscv64/mod.rs index a04cf9471f..6a0342b3cd 100644 --- a/arch/src/riscv64/mod.rs +++ b/arch/src/riscv64/mod.rs @@ -7,13 +7,15 @@ pub mod fdt; /// Layout for this riscv64 system. pub mod layout; +/// Module for loading UEFI binary. +pub mod uefi; use std::collections::HashMap; use std::fmt::Debug; use std::sync::{Arc, Mutex}; use hypervisor::arch::riscv64::aia::Vaia; -use log::{log_enabled, Level}; +use log::{Level, log_enabled}; use thiserror::Error; use vm_memory::{Address, GuestAddress, GuestMemory, GuestMemoryAtomic}; @@ -57,7 +59,7 @@ pub struct EntryPoint { /// Configure the specified VCPU, and return its MPIDR. pub fn configure_vcpu( vcpu: &Arc, - id: u8, + id: u32, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, ) -> super::Result<()> { if let Some((kernel_entry_point, _guest_memory)) = boot_setup { diff --git a/arch/src/riscv64/uefi.rs b/arch/src/riscv64/uefi.rs new file mode 100644 index 0000000000..bd40e36ff0 --- /dev/null +++ b/arch/src/riscv64/uefi.rs @@ -0,0 +1,50 @@ +// Copyright 2020 Arm Limited (or its affiliates). All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +use std::io::{Read, Seek, SeekFrom}; +use std::os::fd::AsFd; +use std::result; + +use thiserror::Error; +use vm_memory::{GuestAddress, GuestMemory}; + +/// Errors thrown while loading UEFI binary +#[derive(Debug, Error)] +pub enum Error { + /// Unable to seek to UEFI image start. + #[error("Unable to seek to UEFI image start")] + SeekUefiStart, + /// Unable to seek to UEFI image end. + #[error("Unable to seek to UEFI image end")] + SeekUefiEnd, + /// UEFI image too big. + #[error("UEFI image too big")] + UefiTooBig, + /// Unable to read UEFI image + #[error("Unable to read UEFI image")] + ReadUefiImage, +} +type Result = result::Result; + +pub fn load_uefi( + guest_mem: &M, + guest_addr: GuestAddress, + uefi_image: &mut F, +) -> Result<()> +where + F: Read + Seek + AsFd, +{ + let uefi_size = uefi_image + .seek(SeekFrom::End(0)) + .map_err(|_| Error::SeekUefiEnd)? 
as usize; + + // edk2 image on virtual platform is smaller than 3M + if uefi_size > 0x300000 { + return Err(Error::UefiTooBig); + } + uefi_image.rewind().map_err(|_| Error::SeekUefiStart)?; + guest_mem + .read_exact_volatile_from(guest_addr, &mut uefi_image.as_fd(), uefi_size) + .map_err(|_| Error::ReadUefiImage) +} diff --git a/arch/src/x86_64/cpu_profile.rs b/arch/src/x86_64/cpu_profile.rs new file mode 100644 index 0000000000..36c8a62e2d --- /dev/null +++ b/arch/src/x86_64/cpu_profile.rs @@ -0,0 +1,258 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use hypervisor::arch::x86::CpuIdEntry; +use hypervisor::{CpuVendor, HypervisorType}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::{Parameters, deserialize_from_hex, serialize_as_hex}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "kebab-case")] +/// A [`CpuProfile`] is a mechanism for ensuring live migration compatibility +/// between host's with potentially different CPU models. +pub enum CpuProfile { + #[default] + Host, + #[cfg(feature = "kvm")] + Skylake, + #[cfg(feature = "kvm")] + SapphireRapids, +} + +impl CpuProfile { + /// Loads pre-generated data associated with a CPU profile. + /// + /// If the `amx` flag is false then the AMX tile state components will be + /// zeroed out from the associated profile data. This is necessary because + /// they will then not be present in the vector of [`CpuidEntry`] values + /// obtained from the hypervisor. + // + // We can only generate CPU profiles for the KVM hypervisor for the time being. + #[cfg(feature = "kvm")] + pub(in crate::x86_64) fn data(&self, amx: bool) -> Option { + let mut data: CpuProfileData = match self { + Self::Host => None, + Self::Skylake => Some( + serde_json::from_slice(include_bytes!("cpu_profiles/skylake.json")) + .inspect_err(|e| { + error!("BUG: could not deserialize CPU profile. Got error: {:?}", e) + }) + .expect("should be able to deserialize pre-generated data"), + ), + Self::SapphireRapids => Some( + serde_json::from_slice(include_bytes!("cpu_profiles/sapphire-rapids.json")) + .inspect_err(|e| { + error!("BUG: could not deserialize CPU profile. Got error: {:?}", e) + }) + .expect("should be able to deserialize pre-generated data"), + ), + }?; + + if !amx { + // In this case we will need to wipe out the AMX tile state components (if they are included in the profile) + for adj in data.adjustments.iter_mut() { + if adj.0.sub_leaf.start() != adj.0.sub_leaf.end() { + continue; + } + let sub_leaf = *adj.0.sub_leaf.start(); + let leaf = adj.0.leaf; + if (leaf == 0xd) && (sub_leaf == 0) && (adj.0.register == CpuidReg::EAX) { + adj.1.replacements &= !((1 << 17) | (1 << 18)); + } + + if (leaf == 0xd) && (sub_leaf == 1) && (adj.0.register == CpuidReg::ECX) { + adj.1.replacements &= !((1 << 17) | (1 << 18)); + } + + if (leaf == 0xd) && ((sub_leaf == 17) | (sub_leaf == 18)) { + adj.1.replacements = 0; + } + } + } + + Some(data) + } + + #[cfg(not(feature = "kvm"))] + pub(in crate::x86_64) fn data(&self, _amx: bool) -> Option { + if matches!(*self, Self::Host) { + return None; + } + // This will need to be addressed before upstreaming. + // We will probably need one profile per hypervisor. + unreachable!() + } +} + +/// Every [`CpuProfile`] different from `Host` has associated [`CpuProfileData`]. 
+/// +/// New instances of this struct may only be generated through the CHV CLI (when built from source with +/// the `cpu-profile-generation` feature); other hosts may then load them in order to +/// increase the likelihood of successful live migrations among all hosts that opted in to the given +/// CPU profile. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[allow(dead_code)] +pub struct CpuProfileData { + /// The hypervisor used when generating this CPU profile. + pub(in crate::x86_64) hypervisor: HypervisorType, + /// The vendor of the CPU belonging to the host that generated this CPU profile. + pub(in crate::x86_64) cpu_vendor: CpuVendor, + /// Adjustments necessary to become compatible with the desired target. + pub(in crate::x86_64) adjustments: Vec<(Parameters, CpuidOutputRegisterAdjustments)>, +} + +/* TODO: The [`CpuProfile`] struct will likely need a few more iterations. The following +section should explain why: + +# MSR restrictions + +CPU profiles also need to restrict which MSRs may be manipulated by the guest, as various physical CPUs +can have differing supported MSRs. + +The CPU profile will thus necessarily need to contain some data related to MSR restrictions. That will +be taken care of in a follow-up MR. + +*/ + +/// Used for adjusting an entire cpuid output register (EAX, EBX, ECX or EDX) +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub(super) struct CpuidOutputRegisterAdjustments { + #[serde(serialize_with = "serialize_as_hex")] + #[serde(deserialize_with = "deserialize_from_hex")] + pub(in crate::x86_64) replacements: u32, + /// Used to zero out the area `replacements` occupies. This mask is not necessarily !replacements, as replacements may pack values of different types (i.e. conceptually it is wrong to think of it as a bitset). + #[serde(serialize_with = "serialize_as_hex")] + #[serde(deserialize_with = "deserialize_from_hex")] + pub(in crate::x86_64) mask: u32, +} +impl CpuidOutputRegisterAdjustments { + pub(in crate::x86_64) fn adjust(self, cpuid_output_register: &mut u32) { + let temp_register_copy = *cpuid_output_register; + let replacements_area_masked_in_temp_copy = temp_register_copy & self.mask; + *cpuid_output_register = replacements_area_masked_in_temp_copy | self.replacements; + } + + pub(in crate::x86_64) fn adjust_cpuid_entries( + mut cpuid: Vec<CpuIdEntry>, + adjustments: &[(Parameters, Self)], + ) -> Result<Vec<CpuIdEntry>, MissingCpuidEntriesError> { + for entry in &mut cpuid { + for (reg, reg_value) in [ + (CpuidReg::EAX, &mut entry.eax), + (CpuidReg::EBX, &mut entry.ebx), + (CpuidReg::ECX, &mut entry.ecx), + (CpuidReg::EDX, &mut entry.edx), + ] { + // Get the adjustment corresponding to the entry's function/leaf and index/sub-leaf for each of the registers. If no such + // adjustment is found we use the trivial adjustment (leading to the register being zeroed out entirely). + let adjustment = adjustments + .iter() + .find_map(|(param, adjustment)| { + ((param.leaf == entry.function) + & param.sub_leaf.contains(&entry.index) + & (param.register == reg)) + .then_some(*adjustment) + }) + .unwrap_or(CpuidOutputRegisterAdjustments { + mask: 0, + replacements: 0, + }); + adjustment.adjust(reg_value); + } + } + + Self::expected_entries_found(&cpuid, adjustments).map(|_| cpuid) + } + + /// Check that we found every value that was supposed to be replaced with something other than 0 + /// + /// IMPORTANT: This function assumes that the given `cpuid` has already been adjusted with the + /// provided `adjustments`.
+ fn expected_entries_found( + cpuid: &[CpuIdEntry], + adjustments: &[(Parameters, Self)], + ) -> Result<(), MissingCpuidEntriesError> { + let mut missing_entry = false; + + // Invalid state components can be ignored. The next few lines obtain the relevant entries to + // check for this. + let eax_0xd_0 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 0)) + .map(|entry| entry.eax) + .unwrap_or(0); + let ecx_0xd_1 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 1)) + .map(|entry| entry.ecx) + .unwrap_or(0); + + let edx_0xd_0 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 0)) + .map(|entry| entry.edx) + .unwrap_or(0); + let edx_0xd_1 = cpuid + .iter() + .find(|entry| (entry.function == 0xd) && (entry.index == 1)) + .map(|entry| entry.edx) + .unwrap_or(0); + + for (param, adjustment) in adjustments { + if adjustment.replacements == 0 { + continue; + } + let sub_start = *param.sub_leaf.start(); + let sub_end = *param.sub_leaf.end(); + + let can_skip_lo = if (param.leaf == 0xd) && (2..32).contains(&sub_start) { + let start = sub_start; + let end = std::cmp::min(sub_end, 31); + let mask = (start..=end).fold(0, |acc, next| acc | (1 << next)); + ((mask & eax_0xd_0) == 0) & ((mask & ecx_0xd_1) == 0) + } else { + false + }; + + let can_skip_hi = if (param.leaf == 0xd) && (32..64).contains(&sub_end) { + let start = std::cmp::max(32, sub_start); + let end = sub_end; + let mask = (start..=end) + .map(|val| val - 32) + .fold(0, |acc, next| acc | (1 << next)); + ((mask & edx_0xd_0) == 0) & ((mask & edx_0xd_1) == 0) + } else { + false + }; + + if can_skip_lo && can_skip_hi { + // This means that all state components referred to by the specified sub-leaf range are not valid + // and may be skipped. + continue; + } + if !cpuid.iter().any(|entry| { + (entry.function == param.leaf) && (param.sub_leaf.contains(&entry.index)) + }) { + error!( + "cannot adjust CPU profile. 
No entry found matching the required parameters: {:?}", + param + ); + missing_entry = true; + } + } + if missing_entry { + Err(MissingCpuidEntriesError) + } else { + Ok(()) + } + } +} + +#[derive(Debug, Error)] +#[error("Required CPUID entries not found")] +pub struct MissingCpuidEntriesError; diff --git a/arch/src/x86_64/cpu_profiles/sapphire-rapids.json b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json new file mode 100644 index 0000000000..0ea90aa979 --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/sapphire-rapids.json @@ -0,0 +1,3436 @@ +{ + "hypervisor": "Kvm", + "cpu_vendor": "Intel", + "adjustments": [ + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000020", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x756e6547", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x6c65746e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x49656e69", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x000806f8", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ff00" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x76fa3223", + "mask": "0x80000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x078bfbff", + "mask": "0x08000000" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": 
"0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + 
"replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000004", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0xf1bf07ab", + "mask": "0x00002040" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x1b415f6e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0xa7c04010", + "mask": "0x18000400" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00001c30", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000017", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000009", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 
4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x000602e7", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x0000001f", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + 
], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "EAX" + }, + { + "replacements": "0x00002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000240", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "EBX" + }, + { + "replacements": "0x00000ac0", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "EBX" + }, + { + "replacements": "0x00000b00", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "ECX" + }, + { + 
"replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "ECX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "ECX" + }, + { + "replacements": "0x00000006", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "EAX" + }, + { + "replacements": "0x00002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + 
"start": 6, + "end": 6 + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "EBX" + }, + { + "replacements": "0x00000ac0", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "EBX" + }, + { + "replacements": "0x00000b00", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 16 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 17, + "end": 17 + }, + "register": "ECX" + }, + { + "replacements": "0x00000002", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 18, + "end": 18 + }, + "register": "ECX" + }, + { + "replacements": "0x00000006", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 19, + "end": 63 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + 
], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": 
"0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000017", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000001", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x04002000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, 
+ "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00080040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000010", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00004010", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": 
"0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x80000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0fff3fff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xf000ffff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000121", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x2c100800", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + 
"replacements": "0x65746e49", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x6153206c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x69687070", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x72206572", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x64697061", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000073", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00ffffff" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0103feff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 
+ }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000001" + } + ] + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpu_profiles/skylake.json b/arch/src/x86_64/cpu_profiles/skylake.json new file mode 100644 index 0000000000..84ae4d99ee --- /dev/null +++ b/arch/src/x86_64/cpu_profiles/skylake.json @@ -0,0 +1,3184 @@ +{ + "hypervisor": "Kvm", + "cpu_vendor": "Intel", + "adjustments": [ + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000016", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x756e6547", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x6c65746e", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x49656e69", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00050654", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ff00" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x76fa3223", + "mask": "0x80000000" + } + ], + [ + { + "leaf": "0x00000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x078bfbff", + "mask": "0x08000000" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffc3ff" + } + ], + [ + { + "leaf": "0x00000004", + 
"sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x7fffffff" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000004", + "sub_leaf": { + "start": 5, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000007" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + 
"mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000005", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000004", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0xd19f07ab", + "mask": "0x00002040" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x0000000c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0xa4000000", + "mask": "0x18000400" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000007", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000009", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000a", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": 
"EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000b", + "sub_leaf": { + "start": 1, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x000002e7", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x0000000f", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": 
"0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000240", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EBX" + }, + { + "replacements": "0x000003c0", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + 
"mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000040", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EAX" + }, + { + "replacements": "0x00000200", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EAX" + }, + { + "replacements": "0x00000400", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EAX" + }, + { + "replacements": "0x00000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000440", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "EBX" + }, + { + "replacements": "0x00000480", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "EBX" + }, + { + "replacements": "0x00000680", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "EBX" + }, + { + "replacements": "0x00000a80", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 6, + "end": 6 + }, + "register": "ECX" + 
}, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 7, + "end": 7 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 8, + "end": 8 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 9, + "end": 9 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000d", + "sub_leaf": { + "start": 10, + "end": 63 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000000f", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, 
+ "end": 3 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000010", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000014", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000015", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000016", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x00000017", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffff070f" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000018", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x03ffc1ff" + } + 
], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001c", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001d", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001e", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000001f" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x0000ffff" + } + ], + [ + { + "leaf": "0x0000001f", + "sub_leaf": { + "start": 0, + "end": 4294967295 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000020", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000021", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": 
"EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 1, + "end": 1 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 2, + "end": 2 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 3, + "end": 3 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 4, + "end": 4 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000023", + "sub_leaf": { + "start": 5, + "end": 5 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x00000024", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x80000008", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0fff3fff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xf000ffff" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000121", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000001", + "sub_leaf": { + 
"start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x2c100800", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x65746e49", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x6b53206c", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x6b616c79", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000002", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000065", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000003", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000004", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000006", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x80000007", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000100", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x00ffffff" + } + ], + [ + { + "leaf": "0x80000008", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000000" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EBX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "ECX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + "leaf": "0x40000000", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0xffffffff" + } + ], + [ + { + 
"leaf": "0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EAX" + }, + { + "replacements": "0x00000000", + "mask": "0x0103feff" + } + ], + [ + { + "leaf": "0x40000001", + "sub_leaf": { + "start": 0, + "end": 0 + }, + "register": "EDX" + }, + { + "replacements": "0x00000000", + "mask": "0x00000001" + } + ] + ] +} \ No newline at end of file diff --git a/arch/src/x86_64/cpuid_definitions/intel.rs b/arch/src/x86_64/cpuid_definitions/intel.rs new file mode 100644 index 0000000000..cf43b63a8e --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/intel.rs @@ -0,0 +1,4814 @@ +//! This module contains CPUID definitions for Intel CPUs. +use std::ops::RangeInclusive; + +use super::{ + CpuidDefinitions, CpuidReg, Parameters, ProfilePolicy, ValueDefinition, ValueDefinitions, +}; + +/// Contains CPUID definitions described in "Intel Architecture Instruction Set Extensions and Future Features" +/// +/// ## Missing leaves +/// +/// The following known CPUID leaves are left out of this table: +/// - 0x3 (Only relevant for Intel Pentium III), +/// - 0x12 (Only relevant for SGX which is deprecated), +/// - 0x19 (Key locker leaf. These features are not in scope for CPU profiles for the time being) +/// - 0x1a (Native Model ID Enumeration leaf), +/// - 0x1b (PCONFIG Information Sub-leaf. This is not in scope for CPU profiles for the time being), +/// - 0x27 (L3 Cache Intel RDT Monitoring Capability Asymmetric Enumeration), +/// - 0x28 (Intel Resource Director Technology Allocation Asymmetric Enumeration), +/// - 0x21 (Only relevant for Intel TDX which is not in scope fore CPU profiles for the time being), +/// - 0x40000000 - 0x4FFFFFFF (Reserved for hypervisors), +/// +/// ### How we produced this table +/// +/// We first ran the [`cpuidgen` tool](https://gitlab.com/x86-cpuid.org/x86-cpuid-db), whose +/// output is licensed under the SPDX Creative Commons Zero 1.0 Universal License. We then wrote a +/// throw-away Rust script to modify the output into something more similar to Rust code. Following +/// this we used macros and other functionality in the [Helix editor](https://helix-editor.com/) to +/// get actual Rust code. +/// +/// We then read through the CPUID section (1.4) of the Intel Architecture Instruction Set +/// Extensions and Future Features manual and manually inserted several leaf definitions that +/// we noticed were missing from the table we had produced. During this process we also changed +/// a few of the short names and descriptions to be more inline with what is written in the +/// aforementioned Intel manual. Finally we decided on a [`ProfilePolicy`] to be set for every +/// single [`ValueDefinition`] and manually appended those. 
+pub static INTEL_CPUID_DEFINITIONS: CpuidDefinitions<154> = const { + CpuidDefinitions([ + // ========================================================================================= + // Basic CPUID Information + // ========================================================================================= + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_std_leaf", + description: "Maximum Input value for Basic CPUID Information", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_0", + description: "CPU vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_2", + description: "CPU vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x0, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_1", + description: "CPU vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // TODO: Do we really want to inherit these values from the corresponding CPU, or should we zero it out or set something else here? + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "stepping", + description: "Stepping ID", + bits_range: (0, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "base_model", + description: "Base CPU model ID", + bits_range: (4, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "base_family_id", + description: "Base CPU family ID", + bits_range: (8, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cpu_type", + description: "CPU type", + bits_range: (12, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ext_model", + description: "Extended CPU model ID", + bits_range: (16, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ext_family", + description: "Extended CPU family ID", + bits_range: (20, 27), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "brand_id", + description: "Brand index", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "clflush_size", + description: "CLFLUSH instruction cache line size", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + // This is set by cloud hypervisor + ValueDefinition { + short: "n_logical_cpu", + description: "Logical CPU count", + bits_range: (16, 23), + policy: ProfilePolicy::Static(0), + }, + // This is set by cloud hypervisor + ValueDefinition { + short: "local_apic_id", + description: "Initial local APIC physical ID", + bits_range: (24, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + 
ValueDefinitions::new(&[ + ValueDefinition { + short: "sse3", + description: "Streaming SIMD Extensions 3 (SSE3)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pclmulqdq", + description: "PCLMULQDQ instruction support", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "dtes64", + description: "64-bit DS save area", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "monitor", + description: "MONITOR/MWAIT support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ds_cpl", + description: "CPL Qualified Debug Store", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + // TODO: Ideally configurable by the user (host must have this otherwise CHV will not run) + ValueDefinition { + short: "vmx", + description: "Virtual Machine Extensions", + bits_range: (5, 5), + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "smx", + description: "Safer Mode Extensions", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "est", + description: "Enhanced Intel SpeedStep", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tm2", + description: "Thermal Monitor 2", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ssse3", + description: "Supplemental SSE3", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "cnxt_id", + description: "L1 Context ID", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "sdbg", + description: "Silicon Debug", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fma", + description: "FMA extensions using YMM state", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cx16", + description: "CMPXCHG16B instruction support", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "xtpr", + description: "xTPR Update Control", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pdcm", + description: "Perfmon and Debug Capability", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pcid", + description: "Process-context identifiers", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "dca", + description: "Direct Cache Access", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse4_1", + description: "SSE4.1", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse4_2", + description: "SSE4.2", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + // Set by Cloud hypervisor + ValueDefinition { + short: "x2apic", + description: "X2APIC support", + bits_range: (21, 21), + policy: ProfilePolicy::Static(1), + }, + ValueDefinition { + short: "movbe", + description: "MOVBE instruction support", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "popcnt", + description: "POPCNT instruction support", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + // Set by Cloud hypervisor + ValueDefinition { + short: "tsc_deadline_timer", + description: "APIC timer 
one-shot operation", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aes", + description: "AES instructions", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xsave", + description: "XSAVE (and related instructions) support", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "osxsave", + description: "XSAVE (and related instructions) are enabled by OS", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx", + description: "AVX instructions support", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "f16c", + description: "Half-precision floating-point conversion support", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rdrand", + description: "RDRAND instruction support", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + // TODO: If set by CHV set to 0 and write comment + ValueDefinition { + short: "guest_status", + description: "System is running as guest; (para-)virtualized system", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x1, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "fpu", + description: "Floating-Point Unit on-chip (x87)", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vme", + description: "Virtual-8086 Mode Extensions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "de", + description: "Debugging Extensions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pse", + description: "Page Size Extension", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tsc", + description: "Time Stamp Counter", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "msr", + description: "Model-Specific Registers (RDMSR and WRMSR support)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pae", + description: "Physical Address Extensions", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mce", + description: "Machine Check Exception", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cx8", + description: "CMPXCHG8B instruction", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "apic", + description: "APIC on-chip", + bits_range: (9, 9), + policy: ProfilePolicy::Static(1), + }, + // MSR related (maybe not necessary to look into which ones) + ValueDefinition { + short: "sep", + description: "SYSENTER, SYSEXIT, and associated MSRs", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mtrr", + description: "Memory Type Range Registers", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pge", + description: "Page Global Extensions", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mca", + description: "Machine Check Architecture", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cmov", + description: "Conditional Move Instruction", + bits_range: (15, 15), 
+ policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pat", + description: "Page Attribute Table", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pse36", + description: "Page Size Extension (36-bit)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "psn", + description: "Processor Serial Number", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "clfsh", + description: "CLFLUSH instruction", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ds", + description: "Debug Store", + bits_range: (21, 21), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "acpi", + description: "Thermal monitor and clock control", + bits_range: (22, 22), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mmx", + description: "MMX instructions", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fxsr", + description: "FXSAVE and FXRSTOR instructions", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse", + description: "SSE instructions", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sse2", + description: "SSE2 instructions", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ss", + description: "Self Snoop", + bits_range: (27, 27), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "htt", + description: "Hyper-threading", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tm", + description: "Thermal Monitor", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + // TODO: Not really sure what the default should be for PBE. It seems like it is something that needs to be enabled via the IA32_MISC_ENABLE MSR hence perhaps this should be set via CPU features? 
+ // MSR related + ValueDefinition { + short: "pbe", + description: "Pending Break Enable", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // ========================================================================================= + // Cache and TLB Information + // ========================================================================================= + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "iteration_count", + description: "Number of times this leaf must be queried", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc1", + description: "Descriptor #1", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc2", + description: "Descriptor #2", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc3", + description: "Descriptor #3", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "eax_invalid", + description: "Descriptors 1-3 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc4", + description: "Descriptor #4", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc5", + description: "Descriptor #5", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc6", + description: "Descriptor #6", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc7", + description: "Descriptor #7", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "ebx_invalid", + description: "Descriptors 4-7 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc8", + description: "Descriptor #8", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc9", + description: "Descriptor #9", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc10", + description: "Descriptor #10", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc11", + description: "Descriptor #11", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "ecx_invalid", + description: "Descriptors 8-11 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x2, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "desc12", + description: "Descriptor #12", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc13", + description: "Descriptor #13", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "desc14", + description: "Descriptor #14", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: 
"desc15", + description: "Descriptor #15", + bits_range: (24, 30), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "edx_invalid", + description: "Descriptors 12-15 are invalid if set", + bits_range: (31, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // ========================================================================================= + // Deterministic Cache Parameters + // ========================================================================================= + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cache_type", + description: "Cache type field", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_level", + description: "Cache level (1-based)", + bits_range: (5, 7), + policy: ProfilePolicy::Passthrough, + }, + // TODO: Could there be a problem migrating from a CPU with self-initializing cache to one without? + ValueDefinition { + short: "cache_self_init", + description: "Self-initializing cache level", + bits_range: (8, 8), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "fully_associative", + description: "Fully-associative cache", + bits_range: (9, 9), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "num_threads_sharing", + description: "Number logical CPUs sharing this cache", + bits_range: (14, 25), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "num_cores_on_die", + description: "Number of cores in the physical package", + bits_range: (26, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cache_linesize", + description: "System coherency line size (0-based)", + bits_range: (0, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_npartitions", + description: "Physical line partitions (0-based)", + bits_range: (12, 21), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "cache_nways", + description: "Ways of associativity (0-based)", + bits_range: (22, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cache_nsets", + description: "Cache number of sets (0-based)", + bits_range: (0, 30), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "wbinvd_rll_no_guarantee", + description: "WBINVD/INVD not guaranteed for Remote Lower-Level caches", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "ll_inclusive", + description: "Cache is inclusive of Lower-Level caches", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "complex_indexing", + description: "Not a direct-mapped cache (complex function)", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // ========================================================================================= + // MONITOR/MWAIT + // 
========================================================================================= + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "min_mon_size", + description: "Smallest monitor-line size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_mon_size", + description: "Largest monitor-line size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "mwait_ext", + description: "Enumeration of MONITOR/MWAIT extensions is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mwait_irq_break", + description: "Interrupts as a break-event for MWAIT is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x5, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "n_c0_substates", + description: "Number of C0 sub C-states supported using MWAIT", + bits_range: (0, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c1_substates", + description: "Number of C1 sub C-states supported using MWAIT", + bits_range: (4, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c2_substates", + description: "Number of C2 sub C-states supported using MWAIT", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c3_substates", + description: "Number of C3 sub C-states supported using MWAIT", + bits_range: (12, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c4_substates", + description: "Number of C4 sub C-states supported using MWAIT", + bits_range: (16, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c5_substates", + description: "Number of C5 sub C-states supported using MWAIT", + bits_range: (20, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c6_substates", + description: "Number of C6 sub C-states supported using MWAIT", + bits_range: (24, 27), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "n_c7_substates", + description: "Number of C7 sub C-states supported using MWAIT", + bits_range: (28, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // ========================================================================================= + // Thermal and Power Management + // ========================================================================================= + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "dtherm", + description: "Digital temperature sensor", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "turbo_boost", + description: "Intel Turbo Boost", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "arat", + description: "Always-Running APIC Timer (not affected by p-state)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, 
+ ValueDefinition { + short: "pln", + description: "Power Limit Notification (PLN) event", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ecmd", + description: "Clock modulation duty cycle extension", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pts", + description: "Package thermal management", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp", + description: "HWP (Hardware P-states) base registers are supported", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_notify", + description: "HWP notification (IA32_HWP_INTERRUPT MSR)", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_act_window", + description: "HWP activity window (IA32_HWP_REQUEST[bits 41:32]) supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_epp", + description: "HWP Energy Performance Preference", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_pkg_req", + description: "HWP Package Level Request", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hdc_base_regs", + description: "HDC base registers are supported", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "turbo_boost_3_0", + description: "Intel Turbo Boost Max 3.0", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_capabilities", + description: "HWP Highest Performance change", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_peci_override", + description: "HWP PECI override", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_flexible", + description: "Flexible HWP", + bits_range: (17, 17), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_fast", + description: "IA32_HWP_REQUEST MSR fast access mode", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hfi", + description: "HW_FEEDBACK MSRs supported", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "hwp_ignore_idle", + description: "Ignoring idle logical CPU HWP req is supported", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "thread_director", + description: "Intel thread director support", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "therm_interrupt_bit25", + description: "IA32_THERM_INTERRUPT MSR bit 25 is supported", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "n_therm_thresholds", + description: "Digital thermometer thresholds", + bits_range: (0, 3), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "aperfmperf", + description: "MPERF/APERF MSRs (effective frequency interface)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition 
{ + short: "epb", + description: "IA32_ENERGY_PERF_BIAS MSR support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "thrd_director_nclasses", + description: "Number of classes, Intel thread director", + bits_range: (8, 15), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x6, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "perfcap_reporting", + description: "Performance capability reporting", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "encap_reporting", + description: "Energy efficiency capability reporting", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "feedback_sz", + description: "Feedback interface structure size, in 4K pages", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "this_lcpu_hwfdbk_idx", + description: "This logical CPU hardware feedback interface index", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "leaf7_n_subleaves", + description: "Number of leaf 0x7 subleaves", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "fsgsbase", + description: "FSBASE/GSBASE read/write support", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "tsc_adjust", + description: "IA32_TSC_ADJUST MSR supported", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // SGX is deprecated so we disable it unconditionally for all CPU profiles + ValueDefinition { + short: "sgx", + description: "Intel SGX (Software Guard Extensions)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "bmi1", + description: "Bit manipulation extensions group 1", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + // TSX related which is riddled with CVEs. Consider two profiles, or making it opt-in/out. QEMU always has a CPU model with and without TSX. + ValueDefinition { + short: "hle", + description: "Hardware Lock Elision", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx2", + description: "AVX2 instruction set", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + /*The KVM docs recommend always setting this (https://docs.kernel.org/virt/kvm/x86/errata.html#kvm-get-supported-cpuid-issues). + + Keep in mind however that in my limited understanding this isn't about enabling or disabling a feature, but it describes critical behaviour. + Hence I am wondering whether it should be a hard error if the host does not have this bit set, but the desired CPU profile does? 
+ + TODO: Check what KVM_GET_SUPPORTED_CPUID actually gives here (on the Skylake server) + */ + ValueDefinition { + short: "fdp_excptn_only", + description: "FPU Data Pointer updated only on x87 exceptions", + bits_range: (6, 6), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "smep", + description: "Supervisor Mode Execution Protection", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "bmi2", + description: "Bit manipulation extensions group 2", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "erms", + description: "Enhanced REP MOVSB/STOSB", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + /* + The instruction enabled by this seems rather powerful. Are we sure that doesn't have security implications? + I included this because it seems like QEMU does (to the best of my understanding). + */ + ValueDefinition { + short: "invpcid", + description: "INVPCID instruction (Invalidate Processor Context ID)", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + // This is TSX related. TSX is riddled with CVEs: Consider two profiles (one with it disabled) or an opt-in/out feature. + ValueDefinition { + short: "rtm", + description: "Intel restricted transactional memory", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "rdt_m", + description: "Supports Intel Resource Director Technology Monitoring Capability if 1", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + // The KVM docs recommend always setting this (https://docs.kernel.org/virt/kvm/x86/errata.html#kvm-get-supported-cpuid-issues). TODO: Is it OK to just set this to 1? + ValueDefinition { + short: "zero_fcs_fds", + description: "Deprecates FPU CS and FPU DS values if 1", + bits_range: (13, 13), + policy: ProfilePolicy::Passthrough, + }, + // This has been deprecated + ValueDefinition { + short: "mpx", + description: "Intel memory protection extensions", + bits_range: (14, 14), + policy: ProfilePolicy::Static(0), + }, + // This might be useful for certain high performance applications, but it also seems like a rather niche and advanced feature. QEMU does also not automatically enable this from what we can tell. + // TODO: Should we make this OPT-IN? + ValueDefinition { + short: "rdt_a", + description: "Intel RDT-A. Supports Intel Resource Director Technology Allocation Capability if 1", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + // TODO: Do the wider avx512 zmm registers work out of the box when the hardware supports it? 
+ ValueDefinition { + short: "avx512f", + description: "AVX-512 foundation instructions", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512dq", + description: "AVX-512 double/quadword instructions", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rdseed", + description: "RDSEED instruction", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "adx", + description: "ADCX/ADOX instructions", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "smap", + description: "Supervisor mode access prevention", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512ifma", + description: "AVX-512 integer fused multiply add", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "clflushopt", + description: "CLFLUSHOPT instruction", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "clwb", + description: "CLWB instruction", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "intel_pt", + description: "Intel processor trace", + bits_range: (25, 25), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512pf", + description: "AVX-512 prefetch instructions", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512er", + description: "AVX-512 exponent/reciprocal instructions", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512cd", + description: "AVX-512 conflict detection instructions", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sha_ni", + description: "SHA/SHA256 instructions", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512bw", + description: "AVX-512 byte/word instructions", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512vl", + description: "AVX-512 VL (128/256 vector length) extensions", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "prefetchwt1", + description: "PREFETCHWT1 (Intel Xeon Phi only)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512vbmi", + description: "AVX-512 Vector byte manipulation instructions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // Also set by QEMU for CPU models from what we can tell + ValueDefinition { + short: "umip", + description: "User mode instruction protection", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // Also set by QEMU for CPU models from what we can tell + ValueDefinition { + short: "pku", + description: "Protection keys for user-space", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "ospke", + description: "OS protection keys enable", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "waitpkg", + description: "WAITPKG instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_vbmi2", + description: "AVX-512 vector byte manipulation instructions group 2", + 
bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cet_ss", + description: "CET shadow stack features", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "gfni", + description: "Galois field new instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vaes", + description: "Vector AES instructions", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "vpclmulqdq", + description: "VPCLMULQDQ 256-bit instruction support", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_vnni", + description: "Vector neural network instructions", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_bitalg", + description: "AVX-512 bitwise algorithms", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + // Seems to be TDX related which is experimental in CHV. We disable this for CPU profiles for now, but could potentially add it as an opt-in feature eventually. + ValueDefinition { + short: "tme", + description: "Intel total memory encryption", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_vpopcntdq", + description: "AVX-512: POPCNT for vectors of DWORD/QWORD", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "la57", + description: "57-bit linear addresses (five-level paging)", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mawau_val_lm", + description: "BNDLDX/BNDSTX MAWAU value in 64-bit mode", + bits_range: (17, 21), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "rdpid", + description: "RDPID instruction", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + // We leave key locker support out for CPU profiles for the time being. 
We may want this to be opt-in in the future though + ValueDefinition { + short: "key_locker", + description: "Intel key locker support", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "bus_lock_detect", + description: "OS bus-lock detection", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cldemote", + description: "CLDEMOTE instruction", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movdiri", + description: "MOVDIRI instruction", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movdir64b", + description: "MOVDIR64B instruction", + bits_range: (28, 28), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "enqcmd", + description: "Enqueue stores supported (ENQCMD{,S})", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + // SGX support is deprecated so we disable it unconditionally for CPU profiles + ValueDefinition { + short: "sgx_lc", + description: "Intel SGX launch configuration", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pks", + description: "Protection keys for supervisor-mode pages", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // SGX is deprecated + ValueDefinition { + short: "sgx_keys", + description: "Intel SGX attestation services", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_4vnniw", + description: "AVX-512 neural network instructions (Intel Xeon Phi only?)", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_4fmaps", + description: "AVX-512 multiply accumulation single precision (Intel Xeon Phi only?)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fsrm", + description: "Fast short REP MOV", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "uintr", + description: "CPU supports user interrupts", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx512_vp2intersect", + description: "VP2INTERSECT{D,Q} instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "srdbs_ctrl", + description: "SRBDS mitigation MSR available: If 1, enumerates support for the IA32_MCU_OPT_CTRL MSR and indicates that its bit 0 (RNGDS_MITG_DIS) is also supported.", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "md_clear", + description: "VERW MD_CLEAR microcode support", + bits_range: (10, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "rtm_always_abort", + description: "XBEGIN (RTM transaction) always aborts", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tsx_force_abort", + description: "MSR TSX_FORCE_ABORT, RTM_ABORT bit, supported", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "serialize", + description: "SERIALIZE instruction", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "hybrid_cpu", + description: "The CPU is identified as a 'hybrid part'", + bits_range: (15, 15), + 
policy: ProfilePolicy::Inherit, + }, + // TODO: This is TSX related which is riddled with CVEs. We could consider an additional profile enabling TSX in the future, but we leave it out for now. + ValueDefinition { + short: "tsxldtrk", + description: "TSX suspend/resume load address tracking", + bits_range: (16, 16), + policy: ProfilePolicy::Static(0), + }, + // Might be relevant for confidential computing + ValueDefinition { + short: "pconfig", + description: "PCONFIG instruction", + bits_range: (18, 18), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "arch_lbr", + description: "Intel architectural LBRs", + bits_range: (19, 19), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ibt", + description: "CET indirect branch tracking", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_bf16", + description: "AMX-BF16: tile bfloat16 support", + bits_range: (22, 22), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_fp16", + description: "AVX-512 FP16 instructions", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tile", + description: "AMX-TILE: tile architecture support", + bits_range: (24, 24), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_int8", + description: "AMX-INT8: tile 8-bit integer support", + bits_range: (25, 25), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "spec_ctrl", + description: "Speculation Control (IBRS/IBPB: indirect branch restrictions)", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "intel_stibp", + description: "Single thread indirect branch predictors", + bits_range: (27, 27), + policy: ProfilePolicy::Passthrough, + }, + // MSR related + ValueDefinition { + short: "flush_l1d", + description: "FLUSH L1D cache: IA32_FLUSH_CMD MSR", + bits_range: (28, 28), + policy: ProfilePolicy::Passthrough, + }, + // MSR related + ValueDefinition { + short: "arch_capabilities", + description: "Intel IA32_ARCH_CAPABILITIES MSR", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "core_capabilities", + description: "IA32_CORE_CAPABILITIES MSR", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "spec_ctrl_ssbd", + description: "Speculative store bypass disable", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Sub-Leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "sha512", + description: "SHA-512 extensions", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sm3", + description: "SM3 instructions", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "sm4", + description: "SM4 instructions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // RAO-INT is deprecated and removed from most compilers as far as we are aware + ValueDefinition { + short: 
"RAO-INT", + description: "RAO-INT instructions", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx_vnni", + description: "AVX-VNNI instructions", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx512_bf16", + description: "AVX-512 bfloat16 instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + /* + Not set in QEMU from what we can tell, but according seems to be fine to expose this to guests + if we understood https://www.phoronix.com/news/Intel-Linux-LASS-KVM correctly. It is also + our understanding that this feature can enable guests opting in to more security (possibly at the cost of some performance). + */ + ValueDefinition { + short: "lass", + description: "Linear address space separation", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "cmpccxadd", + description: "CMPccXADD instructions", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "arch_perfmon_ext", + description: "ArchPerfmonExt: leaf 0x23 is supported", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "fzrm", + description: "Fast zero-length REP MOVSB", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fsrs", + description: "Fast short REP STOSB", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fsrc", + description: "Fast Short REP CMPSB/SCASB", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "fred", + description: "FRED: Flexible return and event delivery transitions", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lkgs", + description: "LKGS: Load 'kernel' (userspace) GS", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "wrmsrns", + description: "WRMSRNS instruction (WRMSR-non-serializing)", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "nmi_src", + description: "NMI-source reporting with FRED event data", + bits_range: (20, 20), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "amx_fp16", + description: "AMX-FP16: FP16 tile operations", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "hreset", + description: "History reset support", + bits_range: (22, 22), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "avx_ifma", + description: "Integer fused multiply add", + bits_range: (23, 23), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lam", + description: "Linear address masking", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rd_wr_msrlist", + description: "RDMSRLIST/WRMSRLIST instructions", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "invd_disable_post_bios_done", + description: "If 1, supports INVD execution prevention after BIOS Done", + bits_range: (30, 30), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "movrs", + description: "MOVRS", + bits_range: (31, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "intel_ppin", + description: 
"Protected processor inventory number (PPIN{,_CTL} MSRs)", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "pbndkb", + description: "PBNDKB instruction supported and enumerates the existence of the IA32_TSE_CAPABILITY MSR", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // TODO: Missing entry for (0x7, 1, ECX) + // Make the whole register zero though + // + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "avx_vnni_int8", + description: "AVX-VNNI-INT8 instructions", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_ne_convert", + description: "AVX-NE-CONVERT instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles as the value will be zeroed out if the user has not opted in for "amx" via CpuFeatures. + ValueDefinition { + short: "amx_complex", + description: "AMX-COMPLEX instructions (starting from Granite Rapids)", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_vnni_int16", + description: "AVX-VNNI-INT16 instructions", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "utmr", + description: "If 1, supports user-timer events", + bits_range: (13, 13), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "prefetchit_0_1", + description: "PREFETCHIT0/1 instructions", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "user_msr", + description: "If 1, supports the URDMSR and UWRMSR instructions", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "uiret_uif", + description: "If 1, UIRET sets UIF to the value of bit 1 of the RFLAGS image loaded from the stack", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cet_sss", + description: "CET supervisor shadow stacks safe to use", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx10", + description: "If 1, supports the Intel AVX10 instructions and indicates the presence of leaf 0x24", + bits_range: (19, 19), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "apx_f", + description: "If 1, the processor provides foundational support for Intel Advanced Performance Extensions", + bits_range: (21, 21), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "mwait", + description: "If 1, MWAIT is supported even if (0x1 ECX bit 3 (monitor) is enumerated as 0)", + bits_range: (23, 23), + policy: ProfilePolicy::Static(0), + }, + // MSR related + ValueDefinition { + short: "slsm", + description: "If 1, indicates bit 0 of the IA32_INTEGRITY_STATUS MSR is supported. 
Bit 0 of this MSR indicates whether static lockstep is active on this logical processor", + bits_range: (24, 24), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Structured Extended Feature Flags Enumeration Sub-Leaf 2 + // =================================================================================================================== + ( + Parameters { + leaf: 0x7, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "intel_psfd", + description: "If 1, indicates bit 7 of the IA32_SPEC_CTRL_MSR is supported. Bit 7 of this MSR disables fast store forwarding predictor without disabling speculative store bypass", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "ipred_ctrl", + description: "MSR bits IA32_SPEC_CTRL.IPRED_DIS_{U,S}", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "rrsba_ctrl", + description: "MSR bits IA32_SPEC_CTRL.RRSBA_DIS_{U,S}", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "ddp_ctrl", + description: "MSR bit IA32_SPEC_CTRL.DDPD_U", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "bhi_ctrl", + description: "MSR bit IA32_SPEC_CTRL.BHI_DIS_S", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "mcdt_no", + description: "MCDT mitigation not needed", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "uclock_disable", + description: "UC-lock disable is supported", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // Direct Cache Access Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x9, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + // MSR related + ValueDefinition { + short: "dca_cap_msr_value", + description: "Value of bits [31:0] of IA32_PLATFORM_DCA_CAP MSR (address 1f8H)", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring + // =================================================================================================================== + // We will just zero out everything to do with PMU for CPU profiles + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "pmu_version", + description: "Performance monitoring unit version ID", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_n_gcounters", + description: "Number of general PMU counters per logical CPU", + bits_range: (8, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_gcounters_nbits", + description: "Bitwidth of PMU general counters", + bits_range: (16, 23), + policy: 
ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_cpuid_ebx_bits", + description: "Length of leaf 0xa EBX bit vector", + bits_range: (24, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "no_core_cycle_evt", + description: "Core cycle event not available", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_insn_retired_evt", + description: "Instruction retired event not available", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_refcycle_evt", + description: "Reference cycles event not available", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_llc_ref_evt", + description: "LLC-reference event not available", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_llc_miss_evt", + description: "LLC-misses event not available", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_br_insn_ret_evt", + description: "Branch instruction retired event not available", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_br_mispredict_evt", + description: "Branch mispredict retired event not available", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "no_td_slots_evt", + description: "Topdown slots event not available", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_fcounters_bitmap", + description: "Fixed-function PMU counters support bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xa, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "pmu_n_fcounters", + description: "Number of fixed PMU counters", + bits_range: (0, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "pmu_fcounters_nbits", + description: "Bitwidth of PMU fixed counters", + bits_range: (5, 12), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "anythread_depr", + description: "AnyThread deprecation", + bits_range: (15, 15), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Extended Topology Enumeration + // =================================================================================================================== + + // Leaf 0xB must be set by CHV itself (and do all necessary checks) + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id_shift", + description: "Bit width of this level (previous levels inclusive)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "domain_lcpus_count", + description: "Logical CPUs count 
across all instances of this domain", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "domain_nr", + description: "This domain level (subleaf ID)", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "domain_type", + description: "This domain type", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // Set by VMM/user provided config + ( + Parameters { + leaf: 0xb, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id", + description: "x2APIC ID of current logical CPU", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Main Leaf + // =================================================================================================================== + // TODO: Implement CPUID compatibility checks in CHV for this leaf + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "xcr0_x87", + description: "XCR0.X87 (bit 0) supported", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_sse", + description: "XCR0.SSE (bit 1) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx", + description: "XCR0.AVX (bit 2) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // MPX is deprecated + ValueDefinition { + short: "xcr0_mpx_bndregs", + description: "XCR0.BNDREGS (bit 3) supported (MPX BND0-BND3 registers)", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + // MPX is deprecated + ValueDefinition { + short: "xcr0_mpx_bndcsr", + description: "XCR0.BNDCSR (bit 4) supported (MPX BNDCFGU/BNDSTATUS registers)", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xcr0_avx512_opmask", + description: "XCR0.OPMASK (bit 5) supported (AVX-512 k0-k7 registers)", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx512_zmm_hi256", + description: "XCR0.ZMM_Hi256 (bit 6) supported (AVX-512 ZMM0->ZMM7/15 registers)", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_avx512_hi16_zmm", + description: "XCR0.HI16_ZMM (bit 7) supported (AVX-512 ZMM16->ZMM31 registers)", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + // MSR related + ValueDefinition { + short: "xcr0_ia32_xss", + description: "XCR0.IA32_XSS (bit 8) used for IA32_XSS", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_pkru", + description: "XCR0.PKRU (bit 9) supported (XSAVE PKRU registers)", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_ia32_xss_bits", + description: "XCR0.IA32_XSS (bit 10 - 16) used for IA32_XSS", + bits_range: (10, 16), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. + ValueDefinition { + short: "xcr0_tileconfig", + description: "XCR0.TILECONFIG (bit 17) supported (AMX can manage TILECONFIG)", + bits_range: (17, 17), + policy: ProfilePolicy::Inherit, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles and modify this value at runtime if AMX is not enabled by the user. + ValueDefinition { + short: "xcr0_tiledata", + description: "XCR0.TILEDATA (bit 18) supported (AMX can manage TILEDATA)", + bits_range: (18, 18), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + // This value can be changed by the OS and must thus be passthrough + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz_xcr0_enabled", + description: "XSAVE/XRSTOR area byte size, for XCR0 enabled features", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + // This may be passthrough because we restrict each individual state component + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz_max", + description: "XSAVE/XRSTOR area max byte size, all CPU features", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + // TODO: Do we know of any state components corresponding to the upper bits in XCR0? Perhaps it would be + // better to have `ProfilePolicy::Static(0)` here? + ValueDefinitions::new(&[ValueDefinition { + short: "xcr0_upper_bits", + description: "Reports the valid bit fields of the upper 32 bits of the XCR0 register", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "xsaveopt", + description: "XSAVEOPT instruction", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xsavec", + description: "XSAVEC instruction", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xgetbv1", + description: "XGETBV instruction with ECX = 1", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + // TODO: Can this have security implications in terms of supervisor state getting exposed? + ValueDefinition { + short: "xsaves", + description: "XSAVES/XRSTORS instructions (and XSS MSR)", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd", + description: "Extended feature disable support", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + /*NOTE: This will depend on which CPU features (in CHV) are enabled and pre-computation can potentially lead to a combinatorial explosion. 
Luckily we can deal with each component (and its size) separately, hence we can just passthrough whatever we get from the host here.*/ + ValueDefinition { + short: "xsave_sz_xcr0_xmms_enabled", + description: "XSAVE area size, all XCR0 and IA32_XSS features enabled", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + /* Reports the supported bits of the lower IA32_XSS MSR. IA32_XSS[n] can be set to 1 only if ECX[n] = 1*/ + ValueDefinitions::new(&[ + ValueDefinition { + short: "xcr0_7bits", + description: "Used for XCR0", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_pt", + description: "PT state, supported", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_bit9", + description: "Used for XCR0", + bits_range: (9, 9), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_pasid", + description: "PASID state, supported", + bits_range: (10, 10), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_cet_u", + description: "CET user state, supported", + bits_range: (11, 11), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_cet_p", + description: "CET supervisor state, supported", + bits_range: (12, 12), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_hdc", + description: "HDC state, supported", + bits_range: (13, 13), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_uintr", + description: "UINTR state, supported", + bits_range: (14, 14), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_lbr", + description: "LBR state, supported", + bits_range: (15, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xss_hwp", + description: "HWP state, supported", + bits_range: (16, 16), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xcr0_bits", + description: "Used for XCR0", + bits_range: (17, 18), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + /* Reports the supported bits of the upper 32 bits of the IA32_XSS MSR. IA32_XSS[n + 32 ] can be set to 1 only if EDX[n] = 1*/ + ValueDefinitions::new(&[ValueDefinition { + short: "ia32_xss_upper", + description: " Reports the supported bits of the upper 32 bits of the IA32_XSS MSR. IA32_XSS[n + 32 ] can be set to 1 only if EDX[n] = 1", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Processor Extended State Enumeration Sub-leaves + // =================================================================================================================== + + /* LEAF 0xd sub-leaf n >=2 : + If ECX contains an invalid sub-leaf index, EAX/EBX/ECX/EDX return 0. Sub-leaf n (0 ≤ n ≤ 31) is + invalid if sub-leaf 0 returns 0 in EAX[n] and sub-leaf 1 returns 0 in ECX[n]. Sub-leaf n (32 ≤ n ≤ 63) + is invalid if sub-leaf 0 returns 0 in EDX[n-32] and sub-leaf 1 returns 0 in EDX[n-32]. 
+ */ + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 63), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 63), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(2, 63), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // Intel MPX is deprecated hence we zero out these sub-leaves + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-eax-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-ebx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-ecx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(3, 4), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "0xd-3-4-edx-mpx-zero", + description: "This leaf has been zeroed out because MPX state components are disabled", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // NOTE: Sub-leaves 17 & 18 are AMX related and we will alter the adjustments corresponding to + // the policy declared here at runtime for those values. 
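+            // A minimal illustrative sketch (not part of the table above, names are hypothetical):
+            // the sub-leaf validity rule quoted before the sub-leaf 2..63 entries could be checked
+            // roughly like this, where the parameters are the register values of sub-leaves 0 and 1:
+            //
+            //     fn xsave_subleaf_is_valid(n: u32, leaf0_eax: u32, leaf0_edx: u32, leaf1_ecx: u32, leaf1_edx: u32) -> bool {
+            //         if n <= 31 {
+            //             ((leaf0_eax >> n) & 1) == 1 || ((leaf1_ecx >> n) & 1) == 1
+            //         } else {
+            //             ((leaf0_edx >> (n - 32)) & 1) == 1 || ((leaf1_edx >> (n - 32)) & 1) == 1
+            //         }
+            //     }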
+ ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 63), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_sz", + description: "Size of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 63), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "xsave_offset", + description: "Offset of save area for subleaf-N feature, in bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0xd, + sub_leaf: RangeInclusive::new(5, 63), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "is_xss_bit", + description: "Subleaf N describes an XSS bit, otherwise XCR0 bit", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "compacted_xsave_64byte_aligned", + description: "When compacted, subleaf-N feature XSAVE area is 64-byte aligned", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "xfd_faulting", + description: "Indicates support for xfd faulting", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Monitoring Enumeration + // =================================================================================================================== + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "core_rmid_max", + description: "RMID max, within this core, all types (0-based)", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3-cache-rdt-monitoring", + description: "Supports L3 Cache Intel RDT Monitoring if 1", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Monitoring Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "l3c_qm_bitwidth", + description: "L3 QoS-monitoring counter bitwidth (24-based)", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_overflow_bit", + description: "QM_CTR MSR bit 61 is an overflow bit", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_qm_non_cpu_agent", + description: "If 1, indicates the presence of non-CPU agent Intel RDT CMT support", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "l3c_mbm_non_cpu_agent", + description: "If 1, indicates the presence of non-CPU agent Intel RDT MBM support", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + 
ValueDefinitions::new(&[ValueDefinition { + short: "l3c_qm_conversion_factor", + description: "QM_CTR MSR conversion factor to bytes", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "l3c_qm_rmid_max", + description: "L3 QoS-monitoring max RMID", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0xf, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cqm_occup_llc", + description: "L3 QoS occupancy monitoring supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cqm_mbm_total", + description: "L3 QoS total bandwidth monitoring supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cqm_mbm_local", + description: "L3 QoS local bandwidth monitoring supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + //TODO: These features may be good for increased performance. Perhaps there needs to be some mechanism to opt-in for non-host CPU profiles? + ValueDefinitions::new(&[ + ValueDefinition { + short: "cat_l3", + description: "L3 Cache Allocation Technology supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_l2", + description: "L2 Cache Allocation Technology supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mba", + description: "Memory Bandwidth Allocation supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 1) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cbm_len", + description: "L3_CAT capacity bitmask length, minus-one notation", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_units_bitmap", + description: "L3_CAT bitmap of allocation units", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + //TODO: These features may be good for increased performance. Perhaps there needs to be some mechanism to opt-in for non-host CPU profiles? 
+ ValueDefinitions::new(&[ + ValueDefinition { + short: "l3_cat_non_cpu_agents", + description: "L3_CAT for non-CPU agent is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cdp_l3", + description: "L3/L2_CAT CDP (Code and Data Prioritization)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_sparse_1s", + description: "L3/L2_CAT non-contiguous 1s value supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EDX, + }, + // TODO: We might need some way to opt in to use Intel cache allocation technology in guests with non-host CPU profiles. + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cos_max", + description: "Highest COS number supported for this ResID", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 2) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cbm_len", + description: "L2_CAT capacity bitmask length, minus-one notation", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_units_bitmap", + description: "L2_CAT bitmap of allocation units", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cat_cos_max", + description: "Highest COS number supported for this ResID", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::ECX, + }, + // TODO: We might need some way to opt in to use Intel cache allocation technology in guests with non-host CPU profiles. + ValueDefinitions::new(&[ + ValueDefinition { + short: "cdp_l2", + description: "L2_CAT CDP (Code and Data Prioritization)", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cat_sparse_1s", + description: "L2_CAT non-contiguous 1s value supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 3) + // =================================================================================================================== + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + // TODO: We might need some way to opt in to use Intel MBA technology in guests with non-host CPU profiles. 
+ ValueDefinition { + short: "mba_max_delay", + description: "Max MBA throttling value; minus-one notation", + bits_range: (0, 11), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "per_thread_mba", + description: "Per-thread MBA controls are supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mba_delay_linear", + description: "Delay values are linear", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "mba_cos_max", + description: "MBA max Class of Service supported", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Intel Resource Director Technology Allocation Enumeration Sub-leaf (ECX = ResID = 5) + // =================================================================================================================== + // + // TODO: We may want to have some way to opt-in to use Intel RDT for guests with non-host CPU profiles. + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "core_max_throttle", + description: "Max Core throttling level supported by the corresponding ResID", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "core_scope", + description: "If 1, indicates the logical processor scope of the IA32_QoS_Core_BW_Thrtl_n MSRs. 
Other values are reserved", + bits_range: (8, 11), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cba_delay_linear", + description: "The response of the bandwidth control is approximately linear", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x10, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "core_cos_max", + description: "Core max Class of Service supported", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }]), + ), + // SGX is already disabled and deprecated so we don't need to worry about leaf 0x12 and its subleaves + + // =================================================================================================================== + // Intel Processor Trace Enumeration Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pt_max_subleaf", + description: "Maximum leaf 0x14 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cr3_filtering", + description: "IA32_RTIT_CR3_MATCH is accessible", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_cyc", + description: "Configurable PSB and cycle-accurate mode", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ip_filtering", + description: "IP/TraceStop filtering; Warm-reset PT MSRs preservation", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mtc_timing", + description: "MTC timing packet; COFI-based packets suppression", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ptwrite", + description: "PTWRITE support", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "power_event_trace", + description: "Power Event Trace support", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_pmi_preserve", + description: "PSB and PMI preservation support", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "event_trace", + description: "Event Trace packet generation through IA32_RTIT_CTL.EventEn", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "tnt_disable", + description: "TNT packet generation disable through IA32_RTIT_CTL.DisTNT", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "topa_output", + description: "ToPA output scheme support", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "topa_multiple_entries", + description: "ToPA tables can hold multiple entries", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: 
"single_range_output", + description: "Single-range output scheme supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "trance_transport_output", + description: "Trace Transport subsystem output support", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ip_payloads_lip", + description: "IP payloads have LIP values (CS base included)", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Intel Processor Trace Enumeration Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "num_address_ranges", + description: "Filtering number of configurable Address Ranges", + bits_range: (0, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "mtc_periods_bmp", + description: "Bitmap of supported MTC period encodings", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x14, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "cycle_thresholds_bmp", + description: "Bitmap of supported Cycle Threshold encodings", + bits_range: (0, 15), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "psb_periods_bmp", + description: "Bitmap of supported Configurable PSB frequency encodings", + bits_range: (16, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Time Stamp Counter and Core Crystal Clock Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tsc_denominator", + description: "Denominator of the TSC/'core crystal clock' ratio", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tsc_numerator", + description: "Numerator of the TSC/'core crystal clock' ratio", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x15, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_crystal_hz", + description: "Core crystal clock nominal frequency, in Hz", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor Frequency Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_base_mhz", + description: "Processor base frequency, in MHz", + 
bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_max_mhz", + description: "Processor max frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x16, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "bus_mhz", + description: "Bus reference frequency, in MHz", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // System-On-Chip Vendor Attribute Enumeration Main Leaf + // =================================================================================================================== + + // System-On-Chip should probably not be supported for CPU profiles for the foreseeable feature. + ( + Parameters { + leaf: 0x17, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "soc_max_subleaf", + description: "Maximum leaf 0x17 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Deterministic Address Translation Parameters + // =================================================================================================================== + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tlb_max_subleaf", + description: "Maximum leaf 0x18 subleaf", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tlb_4k_page", + description: "TLB 4KB-page entries supported", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_2m_page", + description: "TLB 2MB-page entries supported", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_4m_page", + description: "TLB 4MB-page entries supported", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_1g_page", + description: "TLB 1GB-page entries supported", + bits_range: (3, 3), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "hard_partitioning", + description: "(Hard/Soft) partitioning between logical CPUs sharing this structure", + bits_range: (8, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "n_way_associative", + description: "Ways of associativity", + bits_range: (16, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "n_sets", + description: "Number of sets", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x18, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tlb_type", + description: 
"Translation cache type (TLB type)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_cache_level", + description: "Translation cache level (1-based)", + bits_range: (5, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "is_fully_associative", + description: "Fully-associative structure", + bits_range: (8, 8), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "tlb_max_addressable_ids", + description: "Max number of addressable IDs for logical CPUs sharing this TLB - 1", + bits_range: (14, 25), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // We don't support key locker for now (leaf 0x19): Hence we zero out leaf 0x19 for CPU profiles We zero LEAF + // 0x1A (Native Model ID Enumeration) out for CPU profiles LEAF 0x1B (PCONFIG) is zeroed out for CPU profiles + // for now + + // =================================================================================================================== + // Last Branch Records Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_depth_8", + description: "Max stack depth (number of LBR entries) = 8", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_16", + description: "Max stack depth (number of LBR entries) = 16", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_24", + description: "Max stack depth (number of LBR entries) = 24", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_32", + description: "Max stack depth (number of LBR entries) = 32", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_40", + description: "Max stack depth (number of LBR entries) = 40", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_48", + description: "Max stack depth (number of LBR entries) = 48", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_56", + description: "Max stack depth (number of LBR entries) = 56", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_depth_64", + description: "Max stack depth (number of LBR entries) = 64", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_deep_c_reset", + description: "LBRs maybe cleared on MWAIT C-state > C1", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_ip_is_lip", + description: "LBR IP contain Last IP, otherwise effective IP", + bits_range: (31, 31), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_cpl", + description: "CPL filtering (non-zero IA32_LBR_CTL[2:1]) supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_branch_filter", + description: "Branch filtering (non-zero IA32_LBR_CTL[22:16]) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: 
"lbr_call_stack", + description: "Call-stack mode (IA32_LBR_CTL[3] = 1) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x1c, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lbr_mispredict", + description: "Branch misprediction bit supported (IA32_LBR_x_INFO[63])", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_timed_lbr", + description: "Timed LBRs (CPU cycles since last LBR entry) supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_branch_type", + description: "Branch type field (IA32_LBR_INFO_x[59:56]) supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_events_gpc_bmp", + description: "LBR PMU-events logging support; bitmap for first 4 GP (general-purpose) Counters", + bits_range: (16, 19), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Tile Information Main Leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "amx_max_palette", + description: "Highest palette ID / subleaf ID", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // Tile Palette 1 Sub-leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. 
+ ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_palette_size", + description: "AMX palette total tiles size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tile_size", + description: "AMX single tile's size, in bytes", + bits_range: (16, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_tile_row_size", + description: "AMX tile single row's size, in bytes", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_palette_nr_tiles", + description: "AMX palette number of tiles", + bits_range: (16, 31), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x1d, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "amx_tile_nr_rows", + description: "AMX tile max number of rows", + bits_range: (0, 15), + policy: ProfilePolicy::Inherit, + }]), + ), + // =================================================================================================================== + // TMUL Information Main Leaf + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tmul_info_max", + description: "Reports the maximum number of sub-leaves that are supported in leaf 0x1e", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "tmul_maxk", + description: "TMUL unit maximum height, K (rows or columns)", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "tmul_maxn", + description: "TMUL unit maximum SIMD dimension, N (column bytes)", + bits_range: (8, 23), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // TMUL Information Sub-leaf 1 + // =================================================================================================================== + // NOTE: AMX is opt-in, but there are no problems with inheriting these values. The CHV will take care of zeroing out the bits userspace applications should check for if the user did not opt-in to amx. + ( + Parameters { + leaf: 0x1e, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + // NOTE: AMX currently requires opt-in, even for the host CPU profile. We still inherit this value for profiles as the relevant feature bits that userspace applications must check will be zeroed out if the user has not opted in for "amx" via CpuFeatures. 
+ ValueDefinitions::new(&[ + ValueDefinition { + short: "amx_int8", + description: "If 1, the processor supports tile computational operations on 8-bit integers", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_bf16", + description: "If 1, the processor supports tile computational operations on bfloat16 numbers", + bits_range: (1, 1), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_complex", + description: "If 1, the processor supports the AMX-COMPLEX instructions", + bits_range: (2, 2), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_fp16", + description: "If 1, the processor supports tile computational operations on FP16 numbers", + bits_range: (3, 3), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_fp8", + description: "If 1, the processor supports tile computational operations on FP8 numbers", + bits_range: (4, 4), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_transpose", + description: "If 1, the processor supports the AMX-TRANSPOSE instructions", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_tf32", + description: "If 1, the processor supports the AMX-TF32 (FP19) instructions", + bits_range: (6, 6), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_avx512", + description: "If 1, the processor supports the AMX-AVX512 instructions", + bits_range: (7, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "amx_movrs", + description: "If 1, the processor supports the AMX-MOVRS instructions", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // =================================================================================================================== + // V2 Extended Topology Enumeration + // =================================================================================================================== + + // The values in leaf 0x1f must be set by CHV itself. 
+ ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id_shift", + description: "Bit width of this level (previous levels inclusive)", + bits_range: (0, 4), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "domain_lcpus_count", + description: "Logical CPUs count across all instances of this domain", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "domain_level", + description: "This domain level (subleaf ID)", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "domain_type", + description: "This domain type", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x1f, + sub_leaf: RangeInclusive::new(0, u32::MAX), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "x2apic_id", + description: "x2APIC ID of current logical CPU", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + // =================================================================================================================== + // Processor History Reset + // =================================================================================================================== + ( + Parameters { + leaf: 0x20, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hreset_nr_subleaves", + description: "CPUID 0x20 max subleaf + 1", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x20, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hreset_thread_director", + description: "HRESET of Intel thread director is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // TDX + // =================================================================================================================== + + // TDX is not supported by CPU profiles for now. We just zero out this leaf for CPU profiles for the time being. 
+ ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_0", + description: "TDX vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_2", + description: "TDX vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x21, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "tdx_vendorid_1", + description: "TDX vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "subleaf_0", + description: "If 1, subleaf 0 exists", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_1", + description: "If 1, subleaf 1 exists", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_2", + description: "If 1, subleaf 2 exists", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_3", + description: "If 1, subleaf 3 exists", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_4", + description: "If 1, subleaf 4 exists", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "subleaf_5", + description: "If 1, subleaf 5 exists. The processor supports Architectural PEBS. The IA32_PEBS_BASE and IA32_PEBS_INDEX MSRs exist", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "unitmask2", + description: "IA32_PERFEVTSELx MSRs UnitMask2 is supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "eq_bit", + description: "Equal flag in the IA32_PERFEVTSELx MSR is supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "RDPMC_USR_DISABLE", + description: "RDPMC_USR_DISABLE", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "num_slots_per_cycle", + description: "Number of slots per cycle.
This number can be multiplied by the number of cycles (from CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.CORE or IA32_FIXED_CTR1) to determine the total number of slots", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 1 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_gp_counters_bitmap", + description: "General-purpose PMU counters bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(1, 1), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_f_counters_bitmap", + description: "Fixed PMU counters bitmap", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 2 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(2, 2), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pmu_acr_bitmap", + description: "Bitmap of Auto Counter Reload (ACR) general-purpose counters that can be reloaded", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 3 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(3, 3), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "core_cycles_evt", + description: "Core cycles event supported", + bits_range: (0, 0), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "insn_retired_evt", + description: "Instructions retired event supported", + bits_range: (1, 1), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "ref_cycles_evt", + description: "Reference cycles event supported", + bits_range: (2, 2), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "llc_refs_evt", + description: "Last-level cache references event supported", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "llc_misses_evt", + description: "Last-level cache misses event supported", + bits_range: (4, 4), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "br_insn_ret_evt", + description: "Branch instruction retired event supported", + bits_range: (5, 5), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "br_mispr_evt", + description: "Branch mispredict retired event supported", + bits_range: (6, 6), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_slots_evt", + description: "Topdown slots event supported", + bits_range: (7, 7), + policy: ProfilePolicy::Static(0), + }, + 
ValueDefinition { + short: "td_backend_bound_evt", + description: "Topdown backend bound event supported", + bits_range: (8, 8), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_bad_spec_evt", + description: "Topdown bad speculation event supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_frontend_bound_evt", + description: "Topdown frontend bound event supported", + bits_range: (10, 10), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "td_retiring_evt", + description: "Topdown retiring event support", + bits_range: (11, 11), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr_inserts", + description: "LBR support", + bits_range: (12, 12), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 4 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(4, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "allow_in_record", + description: "If 1, indicates that the ALLOW_IN_RECORD bit is available in the IA32_PMC_GPn_CFG_C and IA32_PMC_FXm_CFG_C MSRs", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cntr", + description: "Counters group sub-groups general-purpose counters, fixed-function counters, and performance metrics are available", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr", + description: "LBR group and both bits [41:40] are available", + bits_range: (8, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xer", + description: "These bits correspond to XER group bits [55:49]", + bits_range: (17, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "grp", + description: "If 1, the GRP group is available", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aux", + description: "If 1, the AUX group is available", + bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(4, 4), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "allow_in_record", + description: "If 1, indicates that the ALLOW_IN_RECORD bit is available in the IA32_PMC_GPn_CFG_C and IA32_PMC_FXm_CFG_C MSRs", + bits_range: (3, 3), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "cntr", + description: "Counters group sub-groups general-purpose counters, fixed-function counters, and performance metrics are available", + bits_range: (0, 7), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "lbr", + description: "LBR group and both bits [41:40] are available", + bits_range: (8, 9), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "xer", + description: "These bits correspond to XER group bits [55:49]", + bits_range: (17, 23), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "grp", + description: "If 1, the GRP group is available", + bits_range: (29, 29), + policy: ProfilePolicy::Static(0), + }, + ValueDefinition { + short: "aux", + description: "If 1, the AUX group is available", 
+ bits_range: (30, 30), + policy: ProfilePolicy::Static(0), + }, + ]), + ), + // =================================================================================================================== + // Architectural Performance Monitoring Extended Sub-leaf 5 + // =================================================================================================================== + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "architectural_pebs_counters", + description: "General-purpose counters support Architectural PEBS. Bit vector of general-purpose counters for which the Architectural PEBS mechanism is available", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_pdist_counters", + description: "General-purpose counters for which PEBS support PDIST", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_fixed_function_counters", + description: "Fixed-function counters support Architectural PEBS. Bit vector of fixed-function counters for which the Architectural PEBS mechanism is available. If ECX[x] == 1, then the IA32_PMC_FXm_CFG_C MSR is available, and PEBS is supported", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + ( + Parameters { + leaf: 0x23, + sub_leaf: RangeInclusive::new(5, 5), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "pebs_fixed_function_pdist_counters", + description: "Fixed-function counters for which PEBS supports PDIST", + bits_range: (0, 31), + policy: ProfilePolicy::Static(0), + }]), + ), + // =================================================================================================================== + // Converged Vector ISA Main Leaf + // =================================================================================================================== + ( + Parameters { + leaf: 0x24, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "converged_vector_isa_max_sub_leaves", + description: "Reports the maximum number of sub-leaves that are supported in leaf 0x24", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x24, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "avx_10_version", + description: "Reports the intel AVX10 Converged Vector ISA version", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "avx_10_lengths", + description: "Reserved at 111", + bits_range: (0, 7), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // Hypervisor reserved CPUID leaves are set elsewhere + + // =================================================================================================================== + // Extended Function CPUID Information + // =================================================================================================================== + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + 
ValueDefinitions::new(&[ValueDefinition { + short: "max_ext_leaf", + description: "Maximum extended CPUID leaf supported", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_0", + description: "Vendor ID string bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_2", + description: "Vendor ID string bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x80000000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_vendorid_1", + description: "Vendor ID string bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + // TODO: Would inherit be better than passthrough? Currently CHV manually copies these over from the host ... + ValueDefinitions::new(&[ + ValueDefinition { + short: "e_stepping_id", + description: "Stepping ID", + bits_range: (0, 3), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "e_base_model", + description: "Base processor model", + bits_range: (4, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "e_base_family", + description: "Base processor family", + bits_range: (8, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "e_base_type", + description: "Base processor type (Transmeta)", + bits_range: (12, 13), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "e_ext_model", + description: "Extended processor model", + bits_range: (16, 19), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "e_ext_family", + description: "Extended processor family", + bits_range: (20, 27), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "brand_id", + description: "Brand ID", + bits_range: (0, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "pkg_type", + description: "Package type", + bits_range: (28, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "lahf_lm", + description: "LAHF and SAHF in 64-bit mode", + bits_range: (0, 0), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lzcnt", + description: "LZCNT advanced bit manipulation", + bits_range: (5, 5), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "prefetchw", + description: "3DNow PREFETCH/PREFETCHW support", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "syscall", + description: "SYSCALL and SYSRET instructions", + bits_range: (11, 11), + policy: 
ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "nx", + description: "Execute Disable Bit available", + bits_range: (20, 20), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "pdpe1gb", + description: "1-GB large page support", + bits_range: (26, 26), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "rdtscp", + description: "RDTSCP instruction and IA32_TSC_AUX are available", + bits_range: (27, 27), + policy: ProfilePolicy::Inherit, + }, + ValueDefinition { + short: "lm", + description: "Long mode (x86-64, 64-bit support)", + bits_range: (29, 29), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + // The profile generation tool will actually modify the brand id string before + // acting on the policy set here. + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_0", + description: "CPU brand ID string, bytes 0 - 3", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_1", + description: "CPU brand ID string, bytes 4 - 7", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_2", + description: "CPU brand ID string, bytes 8 - 11", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000002, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_3", + description: "CPU brand ID string, bytes 12 - 15", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_4", + description: "CPU brand ID string bytes, 16 - 19", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_5", + description: "CPU brand ID string bytes, 20 - 23", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_6", + description: "CPU brand ID string bytes, 24 - 27", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000003, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_7", + description: "CPU brand ID string bytes, 28 - 31", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_8", + description: "CPU brand ID string, bytes 32 - 35", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: 
RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_9", + description: "CPU brand ID string, bytes 36 - 39", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_10", + description: "CPU brand ID string, bytes 40 - 43", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000004, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "cpu_brandid_11", + description: "CPU brand ID string, bytes 44 - 47", + bits_range: (0, 31), + policy: ProfilePolicy::Inherit, + }]), + ), + ( + Parameters { + leaf: 0x80000006, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "l2_line_size", + description: "L2 cache line size, in bytes", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_nlines", + description: "L2 cache number of lines per tag", + bits_range: (8, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_assoc", + description: "L2 cache associativity", + bits_range: (12, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "l2_size_kb", + description: "L2 cache size, in KB", + bits_range: (16, 31), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + // EAX, EBX and ECX of 0x8000_0007 are all reserved (=0) on Intel + ( + Parameters { + leaf: 0x80000007, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ + // TODO: We may want some mechanism to let users opt-in to using an invariant TSC provided by the hardware (when available). + // TODO: Probably unconditionally set by CHV + ValueDefinition { + short: "constant_tsc", + description: "TSC ticks at constant rate across all P and C states", + bits_range: (8, 8), + policy: ProfilePolicy::Inherit, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000008, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "phys_addr_bits", + description: "Max physical address bits", + bits_range: (0, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "virt_addr_bits", + description: "Max virtual address bits", + bits_range: (8, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "guest_phys_addr_bits", + description: "Max nested-paging guest physical address bits", + bits_range: (16, 23), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x80000008, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "wbnoinvd", + description: "WBNOINVD supported", + bits_range: (9, 9), + policy: ProfilePolicy::Static(0), + }]), + ), + ]) +}; diff --git a/arch/src/x86_64/cpuid_definitions/kvm.rs b/arch/src/x86_64/cpuid_definitions/kvm.rs new file mode 100644 index 0000000000..89285b2aa4 --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/kvm.rs @@ -0,0 +1,204 @@ +//! This module contains CPUID definitions for the KVM hypervisor. 
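For orientation, here is a minimal sketch (not part of this change) of how a guest typically consumes the signature leaf that the first four definitions below describe; the function name is illustrative only and it uses the standard `__cpuid` intrinsic.

```rust
// Illustrative sketch, not part of this patch: a guest probing the KVM
// signature leaf (0x4000_0000) whose registers are described below.
use std::arch::x86_64::__cpuid;

fn kvm_signature_present() -> bool {
    // Leaf 0x4000_0000: EAX reports the maximum hypervisor leaf, while
    // EBX/ECX/EDX spell the hypervisor signature ("KVMKVMKVM\0\0\0" for KVM).
    // SAFETY: CPUID is unconditionally available on x86_64.
    let regs = unsafe { __cpuid(0x4000_0000) };
    let mut signature = [0u8; 12];
    signature[0..4].copy_from_slice(&regs.ebx.to_le_bytes());
    signature[4..8].copy_from_slice(&regs.ecx.to_le_bytes());
    signature[8..12].copy_from_slice(&regs.edx.to_le_bytes());
    &signature == b"KVMKVMKVM\0\0\0"
}
```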
+ +use std::ops::RangeInclusive; + +use crate::x86_64::CpuidReg; +use crate::x86_64::cpuid_definitions::{ + CpuidDefinitions, Parameters, ProfilePolicy, ValueDefinition, ValueDefinitions, +}; + +/// CPUID features defined for the KVM hypervisor. +/// +/// See https://www.kernel.org/doc/html/latest/virt/kvm/x86/cpuid.html +pub const KVM_CPUID_DEFINITIONS: CpuidDefinitions<6> = const { + CpuidDefinitions([ + //===================================================================== + // KVM CPUID Signature + // =================================================================== + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "max_hypervisor_leaf", + description: "The maximum valid leaf between 0x4000_0000 and 0x4FFF_FFF", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EBX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_ebx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::ECX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_ecx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + ( + Parameters { + leaf: 0x4000_0000, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "hypervisor_string_edx", + description: "Part of the hypervisor string", + bits_range: (0, 31), + policy: ProfilePolicy::Passthrough, + }]), + ), + //===================================================================== + // KVM CPUID Features + // =================================================================== + ( + Parameters { + leaf: 0x4000_0001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EAX, + }, + ValueDefinitions::new(&[ + ValueDefinition { + short: "kvm_feature_clocksource", + description: "kvmclock available at MSRs 0x11 and 0x12", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_nop_io_delay", + description: "Not necessary to perform delays on PIO operations", + bits_range: (1, 1), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_mmu_op", + description: "Deprecated", + bits_range: (2, 2), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_clocksource2", + description: "kvmclock available at MSRs 0x4b564d00 and 0x4b564d01", + bits_range: (3, 3), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf", + description: "async pf can be enabled by writing to MSR 0x4b564d02", + bits_range: (4, 4), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_steal_time", + description: "steal time can be enabled by writing to msr 0x4b564d03", + bits_range: (5, 5), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_eoi", + description: "paravirtualized end of interrupt handler can be enabled by writing to msr 0x4b564d04", + bits_range: (6, 6), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_unhalt", + description: "guest checks this 
feature bit before enabling paravirtualized spinlock support", + bits_range: (7, 7), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_tlb_flush", + description: "guest checks this feature bit before enabling paravirtualized tlb flush", + bits_range: (9, 9), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf_vmexit", + description: "paravirtualized async PF VM EXIT can be enabled by setting bit 2 when writing to msr 0x4b564d02", + bits_range: (10, 10), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_send_ipi", + description: "guest checks this feature bit before enabling paravirtualized send IPIs", + bits_range: (11, 11), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_poll_control", + description: "host-side polling on HLT can be disabled by writing to msr 0x4b564d05.", + bits_range: (12, 12), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_pv_sched_yield", + description: "guest checks this feature bit before using paravirtualized sched yield.", + bits_range: (13, 13), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_async_pf_int", + description: "guest checks this feature bit before using the second async pf control msr 0x4b564d06 and async pf acknowledgment msr 0x4b564d07.", + bits_range: (14, 14), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_msi_ext_dest_id", + description: "guest checks this feature bit before using extended destination ID bits in MSI address bits 11-5.", + bits_range: (15, 15), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_hc_map_gpa_range", + description: "guest checks this feature bit before using the map gpa range hypercall to notify the page state change", + bits_range: (16, 16), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_migration_control", + description: "guest checks this feature bit before using MSR_KVM_MIGRATION_CONTROL", + bits_range: (17, 17), + policy: ProfilePolicy::Passthrough, + }, + ValueDefinition { + short: "kvm_feature_clocksource_stable_bit", + description: "host will warn if no guest-side per-cpu warps are expected in kvmclock", + bits_range: (24, 24), + policy: ProfilePolicy::Passthrough, + }, + ]), + ), + ( + Parameters { + leaf: 0x4000_0001, + sub_leaf: RangeInclusive::new(0, 0), + register: CpuidReg::EDX, + }, + ValueDefinitions::new(&[ValueDefinition { + short: "kvm_hints_realtime", + description: "guest checks this feature bit to determine that vCPUs are never preempted for an unlimited time allowing optimizations", + bits_range: (0, 0), + policy: ProfilePolicy::Passthrough, + }]), + ), + ]) +}; diff --git a/arch/src/x86_64/cpuid_definitions/mod.rs b/arch/src/x86_64/cpuid_definitions/mod.rs new file mode 100644 index 0000000000..ee62550d80 --- /dev/null +++ b/arch/src/x86_64/cpuid_definitions/mod.rs @@ -0,0 +1,211 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::io::Write; +use std::ops::RangeInclusive; + +use serde::{Deserialize, Deserializer, Serialize, Serializer}; + +use crate::x86_64::CpuidReg; + +pub mod intel; +#[cfg(feature = "kvm")] +pub mod kvm; + +pub(in crate::x86_64) fn serialize_as_hex<S: Serializer>( + input: &u32, + serializer: S, +) -> Result<S::Ok, S::Error> { + // two bytes for "0x" prefix and eight for the hex encoded number + let mut buffer =
[0_u8; 10]; + let _ = write!(&mut buffer[..], "{:#010x}", input); + let str = core::str::from_utf8(&buffer[..]) + .expect("the buffer should be filled with valid UTF-8 bytes"); + serializer.serialize_str(str) +} + +pub(in crate::x86_64) fn deserialize_from_hex<'de, D: Deserializer<'de>>( + deserializer: D, +) -> Result<u32, D::Error> { + let hex = <&'de str as Deserialize>::deserialize(deserializer)?; + u32::from_str_radix(hex.strip_prefix("0x").unwrap_or(""), 16).map_err(|_| { + <D::Error as serde::de::Error>::custom(format!("{hex} is not a hex encoded 32 bit integer")) + }) +} + +/// Parameters for inspecting CPUID definitions. +#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize)] +pub struct Parameters { + // The leaf (EAX) parameter used with the CPUID instruction + #[serde(serialize_with = "serialize_as_hex")] + #[serde(deserialize_with = "deserialize_from_hex")] + pub leaf: u32, + // The sub-leaf (ECX) parameter used with the CPUID instruction + pub sub_leaf: RangeInclusive<u32>, + // The register we are interested in inspecting which gets filled by the CPUID instruction + pub register: CpuidReg, +} + +/// Describes a policy for how the corresponding CPUID data should be considered when building +/// a CPU profile. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum ProfilePolicy { + /// Store the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will be set into the modified + /// CPUID instruction(s). + Inherit, + /// Ignore the corresponding data when building the CPU profile. + /// + /// When the CPU profile gets utilized the corresponding data will then instead get + /// extracted from the host. + /// + /// This variant is typically set for data that has no effect on migration compatibility, + /// but there may be some exceptions such as data which is necessary to run the VM at all, + /// but must coincide with whatever is on the host. + Passthrough, + /// Set the following hardcoded value in the CPU profile. + /// + /// This variant is typically used for features/values that don't work well with live migration (even when using the exact same physical CPU model). + Static(u32), +} + +/// A description of a range of bits in a register populated by the CPUID instruction with specific parameters. +#[derive(Clone, Copy, Debug)] +pub struct ValueDefinition { + /// A short name for the value obtainable through CPUID + pub short: &'static str, + /// A description of the value obtainable through CPUID + pub description: &'static str, + /// The range of bits in the output register corresponding to this feature or value. + /// + /// This is not a `RangeInclusive` because that type does unfortunately not implement `Copy`. + pub bits_range: (u8, u8), + /// The policy corresponding to this value when building CPU profiles. + pub policy: ProfilePolicy, +} + +/// Describes values within a register populated by the CPUID instruction with specific parameters. +/// +/// NOTE: The only way to interact with this value (beyond this crate) is via the const [`Self::as_slice()`](Self::as_slice) method. +pub struct ValueDefinitions(&'static [ValueDefinition]); +impl ValueDefinitions { + /// Constructor permitting at most 32 entries. + const fn new(cpuid_descriptions: &'static [ValueDefinition]) -> Self { + // Note that this function is only called within this module, at compile time, hence it is fine to have some + // additional sanity checks such as the following assert.
+ assert!(cpuid_descriptions.len() <= 32); + Self(cpuid_descriptions) + } + /// Converts this into a slice representation. This is the only way to read values of this type. + pub const fn as_slice(&self) -> &'static [ValueDefinition] { + self.0 + } +} + +/// Describes multiple CPUID outputs. +/// +/// Each wrapped [`ValueDefinitions`] corresponds to the given [`Parameters`] in the same tuple. +/// +pub struct CpuidDefinitions( + [(Parameters, ValueDefinitions); NUM_PARAMETERS], +); + +impl CpuidDefinitions { + pub const fn as_slice(&self) -> &[(Parameters, ValueDefinitions); NUM_PARAMETERS] { + &self.0 + } +} + +#[cfg(test)] +mod tests { + use proptest::prelude::*; + use serde::Deserialize; + + use super::{Parameters, deserialize_from_hex, serialize_as_hex}; + use crate::x86_64::CpuidReg; + + /* + Check that the leaves get the string representation we expect. + This does not really matter from a functionality point of view, but we want + to read it in the expected format when manually viewing the generated CPU + profile files. + + Also assert that deserialization gives the original value back + */ + #[test] + fn hex_serialization() { + for (leaf, expected) in [ + 0x0_u32, 0x7, 0xd, 0x1e, 0x40000000, 0x4fffffff, 0x80000000, 0x8fffffff, + ] + .into_iter() + .zip([ + "0x00000000", + "0x00000007", + "0x0000000d", + "0x0000001e", + "0x40000000", + "0x4fffffff", + "0x80000000", + "0x8fffffff", + ]) { + let mut v = Vec::new(); + let mut serializer = serde_json::Serializer::new(&mut v); + serialize_as_hex(&leaf, &mut serializer).unwrap(); + let serialized = str::from_utf8(&v[..]).unwrap(); + // JSON Strings have surrounding "" hence we trim that + let serialized_trimmed = serialized + .strip_prefix('"') + .unwrap() + .strip_suffix('"') + .unwrap(); + dbg!(serialized_trimmed); + assert_eq!(serialized_trimmed, expected); + // Also check that we can deserialize this back to the original value + let mut deserializer = serde_json::Deserializer::from_str(serialized); + let deserialized = deserialize_from_hex(&mut deserializer).unwrap(); + assert_eq!(deserialized, leaf); + } + } + + // Check that serializing and then deserializing a value of type `Parameter` results in the + // same value we started with. + proptest! { + #[test] + fn parameter_serialization_roundtrip_works(leaf in 0u32..u32::MAX, x1 in 0u32..100, x2 in 0u32..100, reg in 0..4) { + let sub_leaf_range_start = std::cmp::min(x1, x2); + let sub_leaf_range_end = std::cmp::max(x1,x2); + let sub_leaf = sub_leaf_range_start..=sub_leaf_range_end; + let register = match reg { + 0 => CpuidReg::EAX, + 1 => CpuidReg::EBX, + 2 => CpuidReg::ECX, + 3 => CpuidReg::EDX, + _ => unreachable!() + }; + let cpuid_parameters = Parameters { + leaf, + sub_leaf, + register + }; + let serialized = serde_json::to_string(&cpuid_parameters).unwrap(); + let deserialized: Parameters = serde_json::from_str(&serialized).unwrap(); + prop_assert_eq!(&deserialized, &cpuid_parameters); + } + } + + // Check that `deserialize_from_hex` does not succeed if the stringified u32 does not start with 0x + proptest! 
{ + #[test] + fn hex_deserialization_requires_prefix(leaf in any::().prop_map(|leaf| std::iter::once('"').chain(leaf.to_string().chars()).chain(std::iter::once('"')).collect::())) { + let mut deserializer = serde_json::Deserializer::from_str(leaf.as_str()); + // Check that standard deserialization works + let result = ::deserialize(&mut deserializer); + prop_assert!(result.is_ok()); + let mut deserializer = serde_json::Deserializer::from_str(leaf.as_str()); + prop_assert!(deserialize_from_hex(&mut deserializer).is_err()); + } + } +} diff --git a/arch/src/x86_64/mod.rs b/arch/src/x86_64/mod.rs index baa984c94b..a2e14c4b1d 100644 --- a/arch/src/x86_64/mod.rs +++ b/arch/src/x86_64/mod.rs @@ -7,32 +7,39 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE-BSD-3-Clause file. use std::sync::Arc; +pub mod cpu_profile; +pub mod cpuid_definitions; pub mod interrupts; pub mod layout; mod mpspec; mod mptable; pub mod regs; -use std::collections::BTreeMap; use std::mem; -use hypervisor::arch::x86::{CpuIdEntry, CPUID_FLAG_VALID_INDEX}; +use hypervisor::arch::x86::{CPUID_FLAG_VALID_INDEX, CpuIdEntry}; use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError}; use linux_loader::loader::bootparam::{boot_params, setup_header}; use linux_loader::loader::elf::start_info::{ hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info, }; +use serde::{Deserialize, Serialize}; use thiserror::Error; use vm_memory::{ Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, - GuestMemoryRegion, GuestUsize, + GuestMemoryRegion, }; -use crate::{GuestMemoryMmap, InitramfsConfig, RegionType}; +use crate::x86_64::cpu_profile::CpuidOutputRegisterAdjustments; +use crate::{CpuProfile, GuestMemoryMmap, InitramfsConfig, RegionType}; mod smbios; use std::arch::x86_64; #[cfg(feature = "tdx")] pub mod tdx; +// While modern architectures support more than 255 CPUs via x2APIC, +// legacy devices such as mptable support at most 254 CPUs. +pub const MAX_SUPPORTED_CPUS_LEGACY: u32 = 254; + // CPUID feature bits #[cfg(feature = "kvm")] const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24; // tsc deadline timer ecx bit. 
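A rough illustration of the hex serde helpers in the new cpuid_definitions module (the leaf value is chosen arbitrarily): a Parameters value round-trips through serde_json with its leaf rendered as a 0x-prefixed hex string, and plain decimal strings are rejected on the way back in.

    let p = Parameters {
        leaf: 0x4000_0001,
        sub_leaf: RangeInclusive::new(0, 0),
        register: CpuidReg::EAX,
    };
    let json = serde_json::to_string(&p).unwrap(); // "leaf" appears as "0x40000001"
    let back: Parameters = serde_json::from_str(&json).unwrap();
    assert_eq!(back, p);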
@@ -57,6 +64,8 @@ const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10; #[cfg(feature = "tdx")] const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5; +const KVM_FEATURE_MSI_EXT_DEST_ID: u8 = 15; + pub const _NSIG: i32 = 65; #[derive(Debug, Copy, Clone)] @@ -73,60 +82,13 @@ pub struct EntryPoint { const E820_RAM: u32 = 1; const E820_RESERVED: u32 = 2; -#[derive(Clone)] -pub struct SgxEpcSection { - start: GuestAddress, - size: GuestUsize, -} - -impl SgxEpcSection { - pub fn new(start: GuestAddress, size: GuestUsize) -> Self { - SgxEpcSection { start, size } - } - pub fn start(&self) -> GuestAddress { - self.start - } - pub fn size(&self) -> GuestUsize { - self.size - } -} - -#[derive(Clone)] -pub struct SgxEpcRegion { - start: GuestAddress, - size: GuestUsize, - epc_sections: BTreeMap, -} - -impl SgxEpcRegion { - pub fn new(start: GuestAddress, size: GuestUsize) -> Self { - SgxEpcRegion { - start, - size, - epc_sections: BTreeMap::new(), - } - } - pub fn start(&self) -> GuestAddress { - self.start - } - pub fn size(&self) -> GuestUsize { - self.size - } - pub fn epc_sections(&self) -> &BTreeMap { - &self.epc_sections - } - pub fn insert(&mut self, id: String, epc_section: SgxEpcSection) { - self.epc_sections.insert(id, epc_section); - } -} - pub struct CpuidConfig { - pub sgx_epc_sections: Option>, pub phys_bits: u8, pub kvm_hyperv: bool, #[cfg(feature = "tdx")] pub tdx: bool, pub amx: bool, + pub profile: CpuProfile, } #[derive(Debug, Error)] @@ -163,22 +125,30 @@ pub enum Error { #[error("Error setting up SMBIOS table")] SmbiosSetup(#[source] smbios::Error), - /// Could not find any SGX EPC section - #[error("Could not find any SGX EPC section")] - NoSgxEpcSection, - - /// Missing SGX CPU feature - #[error("Missing SGX CPU feature")] - MissingSgxFeature, - - /// Missing SGX_LC CPU feature - #[error("Missing SGX_LC CPU feature")] - MissingSgxLaunchControlFeature, - /// Error getting supported CPUID through the hypervisor (kvm/mshv) API #[error("Error getting supported CPUID through the hypervisor API")] CpuidGetSupported(#[source] HypervisorError), + #[error( + "The selected CPU profile cannot be utilized because the host's CPUID entries are not compatible with the profile" + )] + CpuProfileCpuidIncompatibility, + /// Error because TDX cannot be enabled when a custom (non host) CPU profile has been selected + #[error("TDX cannot be enabled when a custom CPU profile has been selected")] + CpuProfileTdxIncompatibility, + #[error( + "The selected CPU profile cannot be utilized because a necessary CPUID entry was not found" + )] + /// Error when trying to apply a CPU profile because a necessary CPUID entry was not found + MissingExpectedCpuidEntry(#[source] cpu_profile::MissingCpuidEntriesError), + /// Error when trying to apply a CPU profile because the host has a CPU from a different vendor + #[error( + "The selected CPU profile cannot be utilized because the host has a CPU from a different vendor" + )] + CpuProfileVendorIncompatibility { + cpu_vendor_profile: CpuVendor, + cpu_vendor_host: CpuVendor, + }, /// Error populating CPUID with KVM HyperV emulation details #[error("Error populating CPUID with KVM HyperV emulation details")] CpuidKvmHyperV(#[source] vmm_sys_util::fam::Error), @@ -209,11 +179,11 @@ pub enum Error { E820Configuration, } -pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 { +pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u16, u16, u16, u16)>) -> u32 { if let Some(t) = topology { - let thread_mask_width = u8::BITS - (t.0 - 1).leading_zeros(); - let 
core_mask_width = u8::BITS - (t.1 - 1).leading_zeros(); - let die_mask_width = u8::BITS - (t.2 - 1).leading_zeros(); + let thread_mask_width = u16::BITS - (t.0 - 1).leading_zeros(); + let core_mask_width = u16::BITS - (t.1 - 1).leading_zeros(); + let die_mask_width = u16::BITS - (t.2 - 1).leading_zeros(); let thread_id = cpu_id % (t.0 as u32); let core_id = cpu_id / (t.0 as u32) % (t.1 as u32); @@ -229,7 +199,14 @@ pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 { cpu_id } -#[derive(Copy, Clone, Debug)] +pub fn get_max_x2apic_id(topology: (u16, u16, u16, u16)) -> u32 { + get_x2apic_id( + (topology.0 as u32 * topology.1 as u32 * topology.2 as u32 * topology.3 as u32) - 1, + Some(topology), + ) +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum CpuidReg { EAX, EBX, @@ -327,7 +304,7 @@ impl CpuidPatch { } } - pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: Vec) { + pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: &[CpuidPatch]) { for entry in cpuid { for patch in patches.iter() { if entry.function == patch.function && entry.index == patch.index { @@ -454,7 +431,7 @@ impl CpuidFeatureEntry { feature_reg: CpuidReg::EDX, compatible_check: CpuidCompatibleCheck::BitwiseSubset, }, - // KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html + // KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/x86/cpuid.html // Leaf 0x4000_0000, EAX/EBX/ECX/EDX, KVM CPUID SIGNATURE CpuidFeatureEntry { function: 0x4000_0000, @@ -551,8 +528,60 @@ impl CpuidFeatureEntry { let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list); let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list); - // Loop on feature bit and check if the 'source vm' feature is a subset - // of those of the 'destination vm' feature + // If both processors are Intel then we can use the existing Intel CPUID definitions to log more + // precise information about potential errors + let both_intel = { + // Check if the vendor string is "GenuineIntel". This assumes that `leaf_0` is the entry + // corresponding to CPUID leaf 0. 
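+            // The magic numbers below are the ASCII bytes of the vendor string "GenuineIntel"
+            // as CPUID leaf 0 reports them: EBX = "Genu", EDX = "ineI", ECX = "ntel".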
+ let is_intel = |leaf_0: &CpuIdEntry| { + leaf_0.ebx == 0x756e_6547 && leaf_0.ecx == 0x6c65_746e && leaf_0.edx == 0x4965_6e69 + }; + let src_0 = src_vm_cpuid + .iter() + .find(|entry| (entry.function == 0x0) & (entry.index == 0x0)); + let dest_0 = dest_vm_cpuid + .iter() + .find(|entry| (entry.function == 0x0) & (entry.index == 0x0)); + src_0 + .zip(dest_0) + .is_some_and(|(src, dest)| is_intel(src) & is_intel(dest)) + }; + let extra_reporting = |entry: &CpuidFeatureEntry, src_reg: u32, dest_reg: u32| { + if let Some((_, defs)) = cpuid_definitions::intel::INTEL_CPUID_DEFINITIONS + .as_slice() + .iter() + .find(|(param, _)| { + (param.leaf == entry.function) && (param.sub_leaf.contains(&entry.index)) + }) + { + for def in defs.as_slice() { + let mask = (def.bits_range.0..=def.bits_range.1) + .fold(0, |acc, next| acc | (1 << next)); + + let src_val = src_reg & mask; + let dest_val = dest_reg & mask; + + let is_compatible = match entry.compatible_check { + CpuidCompatibleCheck::BitwiseSubset => (src_val & (!dest_val)) == 0, + CpuidCompatibleCheck::NumNotGreater => src_val <= dest_val, + CpuidCompatibleCheck::Equal => src_val == dest_val, + }; + if !is_compatible { + info!( + "CPUID incompatibility for value definition='{:?}' detected in leaf={:#02x}, sub-leaf={:#02x}, register={:?}, compatibility_check={:?}, source VM value='{:#04x}' destination VM value='{:#04x}'", + def, + entry.function, + entry.index, + entry.feature_reg, + entry.compatible_check, + src_val, + dest_val + ); + } + } + } + }; + let mut compatible = true; for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features .iter() @@ -573,10 +602,16 @@ impl CpuidFeatureEntry { error!( "Detected incompatible CPUID entry: leaf={:#02x} (subleaf={:#02x}), register='{:?}', \ compatible_check='{:?}', source VM feature='{:#04x}', destination VM feature'{:#04x}'.", - entry.function, entry.index, entry.feature_reg, - entry.compatible_check, src_vm_feature, dest_vm_feature - ); - + entry.function, + entry.index, + entry.feature_reg, + entry.compatible_check, + src_vm_feature, + dest_vm_feature + ); + if both_intel { + extra_reporting(entry, *src_vm_feature, *dest_vm_feature); + } compatible = false; } } @@ -590,10 +625,15 @@ impl CpuidFeatureEntry { } } +/// This function generates the CPUID entries to be set for all CPUs. 
+/// +/// If the `config` has a CPU profile set (other than host) then the profile +/// will be applied pub fn generate_common_cpuid( hypervisor: &Arc, config: &CpuidConfig, ) -> super::Result> { + info!("calling generate_common_cpuid"); // SAFETY: cpuid called with valid leaves if unsafe { x86_64::__cpuid(1) }.ecx & (1 << HYPERVISOR_ECX_BIT) == 1 << HYPERVISOR_ECX_BIT { // SAFETY: cpuid called with valid leaves @@ -655,179 +695,237 @@ pub fn generate_common_cpuid( }); } - // Supported CPUID - let mut cpuid = hypervisor + // Supported CPUID according to the host and hypervisor + let mut host_cpuid = hypervisor .get_supported_cpuid() .map_err(Error::CpuidGetSupported)?; - CpuidPatch::patch_cpuid(&mut cpuid, cpuid_patches); - - if let Some(sgx_epc_sections) = &config.sgx_epc_sections { - update_cpuid_sgx(&mut cpuid, sgx_epc_sections)?; + let use_custom_profile = config.profile != CpuProfile::Host; + // Obtain cpuid entries that are adjusted to the specified CPU profile and the cpuid entries of the compatibility target + // TODO: Try to write this in a clearer way + let (host_adjusted_to_profile, profile_cpu_vendor) = { + config + .profile + .data(config.amx) + .map(|profile_data| { + ( + CpuidOutputRegisterAdjustments::adjust_cpuid_entries( + host_cpuid.clone(), + &profile_data.adjustments, + ) + .map(Some), + Some(profile_data.cpu_vendor), + ) + }) + .unwrap_or((Ok(None), None)) + }; + let mut host_adjusted_to_profile = + host_adjusted_to_profile.map_err(Error::MissingExpectedCpuidEntry)?; + + // There should be relatively few cases where live migration can succeed between hosts from different + // CPU vendors and making our checks account for that possibility would complicate things substantially. + // We thus require that the host's cpu vendor matches the one used to generate the CPU profile. + if let Some(cpu_vendor_profile) = profile_cpu_vendor + && let cpu_vendor_host = hypervisor.get_cpu_vendor() + && cpu_vendor_profile != cpu_vendor_host + { + return Err(Error::CpuProfileVendorIncompatibility { + cpu_vendor_profile, + cpu_vendor_host, + } + .into()); } + // We now make the modifications according to the config parameters to each of the cpuid entries + // declared above and then perform a compatibility check. 
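+    // The first element is always the plain host CPUID; the second is only present when a
+    // custom (non-host) CPU profile was selected and holds the host CPUID adjusted to that profile.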
+ for cpuid_optiion in [Some(&mut host_cpuid), host_adjusted_to_profile.as_mut()] { + let Some(cpuid) = cpuid_optiion else { + break; + }; + CpuidPatch::patch_cpuid(cpuid, &cpuid_patches); - #[cfg(feature = "tdx")] - let tdx_capabilities = if config.tdx { - let caps = hypervisor - .tdx_capabilities() - .map_err(Error::TdxCapabilities)?; - info!("TDX capabilities {:#?}", caps); - Some(caps) - } else { - None - }; + #[cfg(feature = "tdx")] + let tdx_capabilities = if config.tdx { + if use_custom_profile { + return Err(Error::CpuProfileTdxIncompatibility.into()); + } + let caps = hypervisor + .tdx_capabilities() + .map_err(Error::TdxCapabilities)?; + info!("TDX capabilities {:#?}", caps); + Some(caps) + } else { + None + }; - // Update some existing CPUID - for entry in cpuid.as_mut_slice().iter_mut() { - match entry.function { - // Clear AMX related bits if the AMX feature is not enabled - 0x7 => { - if !config.amx && entry.index == 0 { - entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)) + // Update some existing CPUID + for entry in cpuid.as_mut_slice().iter_mut() { + match entry.function { + // Clear AMX related bits if the AMX feature is not enabled + 0x7 => { + if !config.amx && entry.index == 0 { + entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)) + } } - } - 0xd => - { - #[cfg(feature = "tdx")] - if let Some(caps) = &tdx_capabilities { - let xcr0_mask: u64 = 0x82ff; - let xss_mask: u64 = !xcr0_mask; - if entry.index == 0 { - entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); - entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); - entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; - entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; - } else if entry.index == 1 { - entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); - entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); - entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; - entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; + 0xd => + { + #[cfg(feature = "tdx")] + if let Some(caps) = &tdx_capabilities { + let xcr0_mask: u64 = 0x82ff; + let xss_mask: u64 = !xcr0_mask; + if entry.index == 0 { + entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); + entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); + entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; + entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; + } else if entry.index == 1 { + entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); + entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); + entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; + entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; + } } } - } - // Copy host L1 cache details if not populated by KVM - 0x8000_0005 => { - if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { - // SAFETY: cpuid called with valid leaves - if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 { + // Copy host L1 cache details if not populated by KVM + 0x8000_0005 => { + if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; + if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 { + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; + entry.eax = 
leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; + } } } - } - // Copy host L2 cache details if not populated by KVM - 0x8000_0006 => { - if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { - // SAFETY: cpuid called with valid leaves - if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 { + // Copy host L2 cache details if not populated by KVM + 0x8000_0006 => { + if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { // SAFETY: cpuid called with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; - entry.eax = leaf.eax; - entry.ebx = leaf.ebx; - entry.ecx = leaf.ecx; - entry.edx = leaf.edx; + if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 { + // SAFETY: cpuid called with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; + entry.eax = leaf.eax; + entry.ebx = leaf.ebx; + entry.ecx = leaf.ecx; + entry.edx = leaf.edx; + } } } - } - // Set CPU physical bits - 0x8000_0008 => { - entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); - } - 0x4000_0001 => { - // These features are not supported by TDX - #[cfg(feature = "tdx")] - if config.tdx { - entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT) - | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT) - | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) - | (1 << KVM_FEATURE_ASYNC_PF_BIT) - | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT) - | (1 << KVM_FEATURE_STEAL_TIME_BIT)) + // Set CPU physical bits + 0x8000_0008 => { + entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); + } + 0x4000_0001 => { + // Enable KVM_FEATURE_MSI_EXT_DEST_ID. This allows the guest to target + // device interrupts to cpus with APIC IDs > 254 without interrupt remapping. + entry.eax |= 1 << KVM_FEATURE_MSI_EXT_DEST_ID; + + // These features are not supported by TDX + #[cfg(feature = "tdx")] + if config.tdx { + entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT) + | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT) + | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) + | (1 << KVM_FEATURE_ASYNC_PF_BIT) + | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT) + | (1 << KVM_FEATURE_STEAL_TIME_BIT)) + } } + _ => {} } - _ => {} } - } - // Copy CPU identification string - for i in 0x8000_0002..=0x8000_0004 { - cpuid.retain(|c| c.function != i); - // SAFETY: call cpuid with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid(i) }; - cpuid.push(CpuIdEntry { - function: i, - eax: leaf.eax, - ebx: leaf.ebx, - ecx: leaf.ecx, - edx: leaf.edx, - ..Default::default() - }); - } + // Copy CPU identification string + /* + TODO: Do we want to do this in the case of CPU profiles? 
+ */ + for i in 0x8000_0002..=0x8000_0004 { + cpuid.retain(|c| c.function != i); + // SAFETY: call cpuid with valid leaves + let leaf = unsafe { std::arch::x86_64::__cpuid(i) }; + cpuid.push(CpuIdEntry { + function: i, + eax: leaf.eax, + ebx: leaf.ebx, + ecx: leaf.ecx, + edx: leaf.edx, + ..Default::default() + }); + } - if config.kvm_hyperv { - // Remove conflicting entries - cpuid.retain(|c| c.function != 0x4000_0000); - cpuid.retain(|c| c.function != 0x4000_0001); - // See "Hypervisor Top Level Functional Specification" for details - // Compliance with "Hv#1" requires leaves up to 0x4000_000a - cpuid.push(CpuIdEntry { - function: 0x40000000, - eax: 0x4000000a, // Maximum cpuid leaf - ebx: 0x756e694c, // "Linu" - ecx: 0x564b2078, // "x KV" - edx: 0x7648204d, // "M Hv" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x40000001, - eax: 0x31237648, // "Hv#1" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x40000002, - eax: 0x3839, // "Build number" - ebx: 0xa0000, // "Version" - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x4000_0003, - eax: (1 << 1) // AccessPartitionReferenceCounter + if config.kvm_hyperv { + // Remove conflicting entries + cpuid.retain(|c| c.function != 0x4000_0000); + cpuid.retain(|c| c.function != 0x4000_0001); + // See "Hypervisor Top Level Functional Specification" for details + // Compliance with "Hv#1" requires leaves up to 0x4000_000a + cpuid.push(CpuIdEntry { + function: 0x40000000, + eax: 0x4000000a, // Maximum cpuid leaf + ebx: 0x756e694c, // "Linu" + ecx: 0x564b2078, // "x KV" + edx: 0x7648204d, // "M Hv" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x40000001, + eax: 0x31237648, // "Hv#1" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x40000002, + eax: 0x3839, // "Build number" + ebx: 0xa0000, // "Version" + ..Default::default() + }); + cpuid.push(CpuIdEntry { + function: 0x4000_0003, + eax: (1 << 1) // AccessPartitionReferenceCounter | (1 << 2) // AccessSynicRegs | (1 << 3) // AccessSyntheticTimerRegs | (1 << 9), // AccessPartitionReferenceTsc - edx: 1 << 3, // CPU dynamic partitioning - ..Default::default() - }); - cpuid.push(CpuIdEntry { - function: 0x4000_0004, - eax: 1 << 5, // Recommend relaxed timing - ..Default::default() - }); - for i in 0x4000_0005..=0x4000_000a { + edx: 1 << 3, // CPU dynamic partitioning + ..Default::default() + }); cpuid.push(CpuIdEntry { - function: i, + function: 0x4000_0004, + eax: 1 << 5, // Recommend relaxed timing ..Default::default() }); + for i in 0x4000_0005..=0x4000_000a { + cpuid.push(CpuIdEntry { + function: i, + ..Default::default() + }); + } } } - - Ok(cpuid) + if !use_custom_profile { + Ok(host_cpuid) + } else { + // Final compatibility checks to ensure that the CPUID values we return are compatible both with the CPU profile and the host we are currently running on. + let host_adjusted_to_profile = host_adjusted_to_profile.expect("The profile adjusted cpuid entries should exist as we checked that we have a custom CPU profile"); + + // Check that the host's cpuid is indeed compatible with the adjusted profile. This is not by construction. 
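+        // The adjustment step cannot guarantee this on its own: the profile may expect features
+        // this host does not offer, in which case we return an error rather than expose a bogus CPUID.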
+ info!("checking compatibility between host adjusted to profile and the host itself"); + CpuidFeatureEntry::check_cpuid_compatibility(&host_adjusted_to_profile, &host_cpuid) + .map_err(|_| Error::CpuProfileCpuidIncompatibility)?; + Ok(host_adjusted_to_profile) + } } pub fn configure_vcpu( vcpu: &Arc, - id: u8, + id: u32, boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, cpuid: Vec, kvm_hyperv: bool, cpu_vendor: CpuVendor, - topology: Option<(u8, u8, u8)>, + topology: (u16, u16, u16, u16), ) -> super::Result<()> { - let x2apic_id = get_x2apic_id(id as u32, topology); + let x2apic_id = get_x2apic_id(id, Some(topology)); // Per vCPU CPUID changes; common are handled via generate_common_cpuid() let mut cpuid = cpuid; @@ -849,36 +947,27 @@ pub fn configure_vcpu( } assert!(apic_id_patched); - if let Some(t) = topology { - update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, cpu_vendor, id); - } + update_cpuid_topology( + &mut cpuid, topology.0, topology.1, topology.2, topology.3, cpu_vendor, id, + ); // The TSC frequency CPUID leaf should not be included when running with HyperV emulation - if !kvm_hyperv { - if let Some(tsc_khz) = vcpu.tsc_khz().map_err(Error::GetTscFrequency)? { - // Need to check that the TSC doesn't vary with dynamic frequency - // SAFETY: cpuid called with valid leaves - if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx - & (1u32 << INVARIANT_TSC_EDX_BIT) - > 0 - { - CpuidPatch::set_cpuid_reg( - &mut cpuid, - 0x4000_0000, - None, - CpuidReg::EAX, - 0x4000_0010, - ); - cpuid.retain(|c| c.function != 0x4000_0010); - cpuid.push(CpuIdEntry { - function: 0x4000_0010, - eax: tsc_khz, - ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's - * APIC_BUS_CYCLE_NS */ - ..Default::default() - }); - }; - } + if !kvm_hyperv && let Some(tsc_khz) = vcpu.tsc_khz().map_err(Error::GetTscFrequency)? { + // Need to check that the TSC doesn't vary with dynamic frequency + // SAFETY: cpuid called with valid leaves + if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx & (1u32 << INVARIANT_TSC_EDX_BIT) + > 0 + { + CpuidPatch::set_cpuid_reg(&mut cpuid, 0x4000_0000, None, CpuidReg::EAX, 0x4000_0010); + cpuid.retain(|c| c.function != 0x4000_0010); + cpuid.push(CpuIdEntry { + function: 0x4000_0010, + eax: tsc_khz, + ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's + * APIC_BUS_CYCLE_NS */ + ..Default::default() + }); + }; } for c in &cpuid { @@ -896,7 +985,15 @@ pub fn configure_vcpu( if let Some((kernel_entry_point, guest_memory)) = boot_setup { regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?; - regs::setup_sregs(&guest_memory.memory(), vcpu).map_err(Error::SregsConfiguration)?; + + // CPUs are required (by Intel sdm spec) to boot in x2apic mode if any + // of the apic IDs is larger than 255. Experimentally, the Linux kernel + // does not recognize the last vCPU if x2apic is not enabled when + // there are 256 vCPUs in a flat hierarchy (i.e. max x2apic ID is 255), + // so we need to enable x2apic in this case as well. 
+ let enable_x2_apic_mode = get_max_x2apic_id(topology) > MAX_SUPPORTED_CPUS_LEGACY; + regs::setup_sregs(&guest_memory.memory(), vcpu, enable_x2_apic_mode) + .map_err(Error::SregsConfiguration)?; } interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?; Ok(()) @@ -946,14 +1043,13 @@ pub fn configure_system( cmdline_addr: GuestAddress, cmdline_size: usize, initramfs: &Option, - _num_cpus: u8, + _num_cpus: u32, setup_header: Option, rsdp_addr: Option, - sgx_epc_region: Option, serial_number: Option<&str>, uuid: Option<&str>, oem_strings: Option<&[&str]>, - topology: Option<(u8, u8, u8)>, + topology: Option<(u16, u16, u16, u16)>, ) -> super::Result<()> { // Write EBDA address to location where ACPICA expects to find it guest_mem @@ -969,10 +1065,10 @@ pub fn configure_system( mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?; // Check that the RAM is not smaller than the RSDP start address - if let Some(rsdp_addr) = rsdp_addr { - if rsdp_addr.0 > guest_mem.last_addr().0 { - return Err(super::Error::RsdpPastRamEnd); - } + if let Some(rsdp_addr) = rsdp_addr + && rsdp_addr.0 > guest_mem.last_addr().0 + { + return Err(super::Error::RsdpPastRamEnd); } match setup_header { @@ -983,15 +1079,8 @@ pub fn configure_system( initramfs, hdr, rsdp_addr, - sgx_epc_region, - ), - None => configure_pvh( - guest_mem, - cmdline_addr, - initramfs, - rsdp_addr, - sgx_epc_region, ), + None => configure_pvh(guest_mem, cmdline_addr, initramfs, rsdp_addr), } } @@ -1083,7 +1172,6 @@ fn configure_pvh( cmdline_addr: GuestAddress, initramfs: &Option, rsdp_addr: Option, - sgx_epc_region: Option, ) -> super::Result<()> { const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336ec578; @@ -1149,15 +1237,6 @@ fn configure_pvh( E820_RESERVED, ); - if let Some(sgx_epc_region) = sgx_epc_region { - add_memmap_entry( - &mut memmap, - sgx_epc_region.start().raw_value(), - sgx_epc_region.size(), - E820_RESERVED, - ); - } - start_info.memmap_entries = memmap.len() as u32; // Copy the vector with the memmap table to the MEMMAP_START address @@ -1204,7 +1283,6 @@ fn configure_32bit_entry( initramfs: &Option, setup_hdr: setup_header, rsdp_addr: Option, - sgx_epc_region: Option, ) -> super::Result<()> { const KERNEL_LOADER_OTHER: u8 = 0xff; @@ -1260,15 +1338,6 @@ fn configure_32bit_entry( E820_RESERVED, )?; - if let Some(sgx_epc_region) = sgx_epc_region { - add_e820_entry( - &mut params, - sgx_epc_region.start().raw_value(), - sgx_epc_region.size(), - E820_RESERVED, - )?; - } - if let Some(rsdp_addr) = rsdp_addr { params.acpi_rsdp_addr = rsdp_addr.0; } @@ -1361,21 +1430,24 @@ pub fn get_host_cpu_phys_bits(hypervisor: &Arc) -> u fn update_cpuid_topology( cpuid: &mut Vec, - threads_per_core: u8, - cores_per_die: u8, - dies_per_package: u8, + threads_per_core: u16, + cores_per_die: u16, + dies_per_package: u16, + packages: u16, cpu_vendor: CpuVendor, - id: u8, + id: u32, ) { let x2apic_id = get_x2apic_id( - id as u32, - Some((threads_per_core, cores_per_die, dies_per_package)), + id, + Some((threads_per_core, cores_per_die, dies_per_package, packages)), ); - let thread_width = 8 - (threads_per_core - 1).leading_zeros(); - let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width; - let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width; + // Note: the topology defined here is per "package" (~NUMA node). 
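+    // Example: threads_per_core = 2, cores_per_die = 3, dies_per_package = 1 gives
+    // thread_width = 1, core_width = 3, die_width = 3; each width also covers the bits
+    // of the levels below it.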
+ let thread_width = u16::BITS - (threads_per_core - 1).leading_zeros(); + let core_width = u16::BITS - (cores_per_die - 1).leading_zeros() + thread_width; + let die_width = u16::BITS - (dies_per_package - 1).leading_zeros() + core_width; + // The very old way: a flat number of logical CPUs per package: CPUID.1H:EBX[23:16] bits. let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0); cpu_ebx |= ((dies_per_package as u32) * (cores_per_die as u32) * (threads_per_core as u32)) & (0xff << 16); @@ -1385,6 +1457,7 @@ fn update_cpuid_topology( cpu_edx |= 1 << 28; CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx); + // The legacy way: threads+cores per package. // CPU Topology leaf 0xb CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width); CpuidPatch::set_cpuid_reg( @@ -1407,6 +1480,7 @@ fn update_cpuid_topology( CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8); CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EDX, x2apic_id); + // The modern way: many-level hierarchy (but we here only support four levels). // CPU Topology leaf 0x1f CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width); CpuidPatch::set_cpuid_reg( @@ -1483,7 +1557,7 @@ fn update_cpuid_topology( edx_bit: Some(28), }, ]; - CpuidPatch::patch_cpuid(cpuid, cpuid_patches); + CpuidPatch::patch_cpuid(cpuid, &cpuid_patches); CpuidPatch::set_cpuid_reg( cpuid, 0x8000_0008, @@ -1497,57 +1571,6 @@ fn update_cpuid_topology( } } } - -// The goal is to update the CPUID sub-leaves to reflect the number of EPC -// sections exposed to the guest. -fn update_cpuid_sgx( - cpuid: &mut Vec, - epc_sections: &[SgxEpcSection], -) -> Result<(), Error> { - // Something's wrong if there's no EPC section. - if epc_sections.is_empty() { - return Err(Error::NoSgxEpcSection); - } - // We can't go further if the hypervisor does not support SGX feature. - if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::EBX, 2) { - return Err(Error::MissingSgxFeature); - } - // We can't go further if the hypervisor does not support SGX_LC feature. - if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::ECX, 30) { - return Err(Error::MissingSgxLaunchControlFeature); - } - - // Get host CPUID for leaf 0x12, subleaf 0x2. This is to retrieve EPC - // properties such as confidentiality and integrity. 
- // SAFETY: call cpuid with valid leaves - let leaf = unsafe { std::arch::x86_64::__cpuid_count(0x12, 0x2) }; - - for (i, epc_section) in epc_sections.iter().enumerate() { - let subleaf_idx = i + 2; - let start = epc_section.start().raw_value(); - let size = epc_section.size(); - let eax = (start & 0xffff_f000) as u32 | 0x1; - let ebx = (start >> 32) as u32; - let ecx = (size & 0xffff_f000) as u32 | (leaf.ecx & 0xf); - let edx = (size >> 32) as u32; - // CPU Topology leaf 0x12 - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, eax); - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, ebx); - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, ecx); - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, edx); - } - - // Add one NULL entry to terminate the dynamic list - let subleaf_idx = epc_sections.len() + 2; - // CPU Topology leaf 0x12 - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, 0); - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, 0); - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, 0); - CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, 0); - - Ok(()) -} - #[cfg(test)] mod tests { use linux_loader::loader::bootparam::boot_e820_entry; @@ -1578,7 +1601,6 @@ mod tests { None, None, None, - None, ); config_err.unwrap_err(); @@ -1603,7 +1625,6 @@ mod tests { None, None, None, - None, ) .unwrap(); @@ -1633,7 +1654,6 @@ mod tests { None, None, None, - None, ) .unwrap(); @@ -1649,7 +1669,6 @@ mod tests { None, None, None, - None, ) .unwrap(); } @@ -1721,22 +1740,27 @@ mod tests { #[test] fn test_get_x2apic_id() { - let x2apic_id = get_x2apic_id(0, Some((2, 3, 1))); + let x2apic_id = get_x2apic_id(0, Some((2, 3, 1, 1))); assert_eq!(x2apic_id, 0); - let x2apic_id = get_x2apic_id(1, Some((2, 3, 1))); + let x2apic_id = get_x2apic_id(1, Some((2, 3, 1, 1))); assert_eq!(x2apic_id, 1); - let x2apic_id = get_x2apic_id(2, Some((2, 3, 1))); + let x2apic_id = get_x2apic_id(2, Some((2, 3, 1, 1))); assert_eq!(x2apic_id, 2); - let x2apic_id = get_x2apic_id(6, Some((2, 3, 1))); + let x2apic_id = get_x2apic_id(6, Some((2, 3, 1, 1))); assert_eq!(x2apic_id, 8); - let x2apic_id = get_x2apic_id(7, Some((2, 3, 1))); + let x2apic_id = get_x2apic_id(7, Some((2, 3, 1, 1))); assert_eq!(x2apic_id, 9); - let x2apic_id = get_x2apic_id(8, Some((2, 3, 1))); + let x2apic_id = get_x2apic_id(8, Some((2, 3, 1, 1))); assert_eq!(x2apic_id, 10); + + let x2apic_id = get_x2apic_id(257, Some((1, 312, 1, 1))); + assert_eq!(x2apic_id, 257); + + assert_eq!(255, get_max_x2apic_id((1, 256, 1, 1))); } } diff --git a/arch/src/x86_64/mptable.rs b/arch/src/x86_64/mptable.rs index aaf6f1ddd7..2e2669b38a 100644 --- a/arch/src/x86_64/mptable.rs +++ b/arch/src/x86_64/mptable.rs @@ -11,9 +11,10 @@ use libc::c_uchar; use thiserror::Error; use vm_memory::{Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError}; +use super::MAX_SUPPORTED_CPUS_LEGACY; +use crate::GuestMemoryMmap; use crate::layout::{APIC_START, HIGH_RAM_START, IOAPIC_START}; use crate::x86_64::{get_x2apic_id, mpspec}; -use crate::GuestMemoryMmap; // This is a workaround to the Rust enforcement specifying that any implementation of a foreign // trait (in this case `ByteValued`) where: @@ -61,9 +62,6 @@ pub enum Error { /// Failure while zeroing out the memory for the MP table. 
#[error("Failure while zeroing out the memory for the MP table")] Clear(#[source] GuestMemoryError), - /// Number of CPUs exceeds the maximum supported CPUs - #[error("Number of CPUs exceeds the maximum supported CPUs")] - TooManyCpus, /// Failure to write the MP floating pointer. #[error("Failure to write the MP floating pointer")] WriteMpfIntel(#[source] GuestMemoryError), @@ -89,11 +87,6 @@ pub enum Error { pub type Result = result::Result; -// With APIC/xAPIC, there are only 255 APIC IDs available. And IOAPIC occupies -// one APIC ID, so only 254 CPUs at maximum may be supported. Actually it's -// a large number for FC usecases. -pub const MAX_SUPPORTED_CPUS: u32 = 254; - // Most of these variables are sourced from the Intel MP Spec 1.4. const SMP_MAGIC_IDENT: &[c_uchar; 4] = b"_MP_"; const MPC_SIGNATURE: &[c_uchar; 4] = b"PCMP"; @@ -121,7 +114,7 @@ fn mpf_intel_compute_checksum(v: &mpspec::mpf_intel) -> u8 { (!checksum).wrapping_add(1) } -fn compute_mp_size(num_cpus: u8) -> usize { +fn compute_mp_size(num_cpus: u32) -> usize { mem::size_of::() + mem::size_of::() + mem::size_of::() * (num_cpus as usize) @@ -135,14 +128,15 @@ fn compute_mp_size(num_cpus: u8) -> usize { pub fn setup_mptable( offset: GuestAddress, mem: &GuestMemoryMmap, - num_cpus: u8, - topology: Option<(u8, u8, u8)>, + num_cpus: u32, + topology: Option<(u16, u16, u16, u16)>, ) -> Result<()> { if num_cpus > 0 { let cpu_id_max = num_cpus - 1; - let x2apic_id_max = get_x2apic_id(cpu_id_max.into(), topology); - if x2apic_id_max >= MAX_SUPPORTED_CPUS { - return Err(Error::TooManyCpus); + let x2apic_id_max = get_x2apic_id(cpu_id_max, topology); + if x2apic_id_max >= MAX_SUPPORTED_CPUS_LEGACY { + info!("Skipping mptable creation due to too many CPUs"); + return Ok(()); } } @@ -157,7 +151,7 @@ pub fn setup_mptable( } let mut checksum: u8 = 0; - let ioapicid: u8 = MAX_SUPPORTED_CPUS as u8 + 1; + let ioapicid: u8 = MAX_SUPPORTED_CPUS_LEGACY as u8 + 1; // The checked_add here ensures the all of the following base_mp.unchecked_add's will be without // overflow. 
@@ -195,7 +189,7 @@ pub fn setup_mptable( for cpu_id in 0..num_cpus { let mut mpc_cpu = MpcCpuWrapper(mpspec::mpc_cpu::default()); mpc_cpu.0.type_ = mpspec::MP_PROCESSOR as u8; - mpc_cpu.0.apicid = get_x2apic_id(cpu_id as u32, topology) as u8; + mpc_cpu.0.apicid = get_x2apic_id(cpu_id, topology) as u8; mpc_cpu.0.apicver = APIC_VERSION; mpc_cpu.0.cpuflag = mpspec::CPU_ENABLED as u8 | if cpu_id == 0 { @@ -394,11 +388,11 @@ mod tests { fn cpu_entry_count() { let mem = GuestMemoryMmap::from_ranges(&[( MPTABLE_START, - compute_mp_size(MAX_SUPPORTED_CPUS as u8), + compute_mp_size(MAX_SUPPORTED_CPUS_LEGACY), )]) .unwrap(); - for i in 0..MAX_SUPPORTED_CPUS as u8 { + for i in 0..MAX_SUPPORTED_CPUS_LEGACY { setup_mptable(MPTABLE_START, &mem, i, None).unwrap(); let mpf_intel: MpfIntelWrapper = mem.read_obj(MPTABLE_START).unwrap(); @@ -428,11 +422,9 @@ mod tests { #[test] fn cpu_entry_count_max() { - let cpus = MAX_SUPPORTED_CPUS + 1; - let mem = - GuestMemoryMmap::from_ranges(&[(MPTABLE_START, compute_mp_size(cpus as u8))]).unwrap(); + let cpus = MAX_SUPPORTED_CPUS_LEGACY + 1; + let mem = GuestMemoryMmap::from_ranges(&[(MPTABLE_START, compute_mp_size(cpus))]).unwrap(); - let result = setup_mptable(MPTABLE_START, &mem, cpus as u8, None); - result.unwrap_err(); + setup_mptable(MPTABLE_START, &mem, cpus, None).unwrap(); } } diff --git a/arch/src/x86_64/regs.rs b/arch/src/x86_64/regs.rs index 3826fdb6ce..706dcd0622 100644 --- a/arch/src/x86_64/regs.rs +++ b/arch/src/x86_64/regs.rs @@ -119,9 +119,13 @@ pub fn setup_regs(vcpu: &Arc, entry_point: EntryPoint) -> /// /// * `mem` - The memory that will be passed to the guest. /// * `vcpu` - Structure for the VCPU that holds the VCPU's fd. -pub fn setup_sregs(mem: &GuestMemoryMmap, vcpu: &Arc) -> Result<()> { +pub fn setup_sregs( + mem: &GuestMemoryMmap, + vcpu: &Arc, + enable_x2_apic_mode: bool, +) -> Result<()> { let mut sregs: SpecialRegisters = vcpu.get_sregs().map_err(Error::GetStatusRegisters)?; - configure_segments_and_sregs(mem, &mut sregs)?; + configure_segments_and_sregs(mem, &mut sregs, enable_x2_apic_mode)?; vcpu.set_sregs(&sregs).map_err(Error::SetStatusRegisters) } @@ -148,6 +152,7 @@ fn write_idt_value(val: u64, guest_mem: &GuestMemoryMmap) -> Result<()> { pub fn configure_segments_and_sregs( mem: &GuestMemoryMmap, sregs: &mut SpecialRegisters, + enable_x2_apic_mode: bool, ) -> Result<()> { let gdt_table: [u64; BOOT_GDT_MAX] = { // Configure GDT entries as specified by PVH boot protocol @@ -183,6 +188,11 @@ pub fn configure_segments_and_sregs( sregs.cr0 = CR0_PE; sregs.cr4 = 0; + if enable_x2_apic_mode { + const X2APIC_ENABLE_BIT: u64 = 1 << 10; + sregs.apic_base |= X2APIC_ENABLE_BIT; + } + Ok(()) } @@ -204,7 +214,7 @@ mod tests { fn segments_and_sregs() { let mut sregs: SpecialRegisters = Default::default(); let gm = create_guest_mem(); - configure_segments_and_sregs(&gm, &mut sregs).unwrap(); + configure_segments_and_sregs(&gm, &mut sregs, false).unwrap(); assert_eq!(0x0, read_u64(&gm, BOOT_GDT_START)); assert_eq!( 0xcf9b000000ffff, diff --git a/arch/src/x86_64/smbios.rs b/arch/src/x86_64/smbios.rs index 55a7df1e72..7d867a43c6 100644 --- a/arch/src/x86_64/smbios.rs +++ b/arch/src/x86_64/smbios.rs @@ -12,8 +12,8 @@ use thiserror::Error; use uuid::Uuid; use vm_memory::{Address, ByteValued, Bytes, GuestAddress}; -use crate::layout::SMBIOS_START; use crate::GuestMemoryMmap; +use crate::layout::SMBIOS_START; #[derive(Debug, Error)] pub enum Error { diff --git a/block/Cargo.toml b/block/Cargo.toml index 02bf37eb03..9823c1f818 100644 --- 
a/block/Cargo.toml +++ b/block/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["The Chromium OS Authors", "The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "block" version = "0.1.0" @@ -9,17 +9,17 @@ default = [] io_uring = ["dep:io-uring"] [dependencies] -byteorder = "1.5.0" +byteorder = { workspace = true } crc-any = "2.5.0" -io-uring = { version = "0.6.4", optional = true } -libc = "0.2.167" -log = "0.4.22" +io-uring = { version = "0.7.10", optional = true } +libc = { workspace = true } +log = { workspace = true } remain = "0.2.15" -serde = { version = "1.0.208", features = ["derive"] } -smallvec = "1.13.2" +serde = { workspace = true, features = ["derive"] } +smallvec = "1.15.1" thiserror = { workspace = true } uuid = { workspace = true, features = ["v4"] } -virtio-bindings = { workspace = true, features = ["virtio-v5_0_0"] } +virtio-bindings = { workspace = true } virtio-queue = { workspace = true } vm-memory = { workspace = true, features = [ "backend-atomic", @@ -28,3 +28,6 @@ vm-memory = { workspace = true, features = [ ] } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/block/src/async_io.rs b/block/src/async_io.rs index 3f37bd6e34..7d44a8c361 100644 --- a/block/src/async_io.rs +++ b/block/src/async_io.rs @@ -8,7 +8,7 @@ use std::os::fd::{AsRawFd, OwnedFd, RawFd}; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; -use crate::DiskTopology; +use crate::{BatchRequest, DiskTopology}; #[derive(Error, Debug)] pub enum DiskFileError { @@ -18,6 +18,14 @@ pub enum DiskFileError { /// Failed creating a new AsyncIo. #[error("Failed creating a new AsyncIo")] NewAsyncIo(#[source] std::io::Error), + + /// Unsupported operation. + #[error("Unsupported operation")] + Unsupported, + + /// Resize failed + #[error("Resize failed")] + ResizeError, } pub type DiskFileResult = std::result::Result; @@ -61,6 +69,8 @@ pub trait DiskFile: Send { fn topology(&mut self) -> DiskTopology { DiskTopology::default() } + fn resize(&mut self, size: u64) -> DiskFileResult<()>; + /// Returns the file descriptor of the underlying disk image file. /// /// The file descriptor is supposed to be used for `fcntl()` calls but no @@ -79,6 +89,9 @@ pub enum AsyncIoError { /// Failed synchronizing file. #[error("Failed synchronizing file")] Fsync(#[source] std::io::Error), + /// Failed submitting batch requests. + #[error("Failed submitting batch requests: {0}")] + SubmitBatchRequests(#[source] std::io::Error), } pub type AsyncIoResult = std::result::Result; @@ -99,4 +112,10 @@ pub trait AsyncIo: Send { ) -> AsyncIoResult<()>; fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()>; fn next_completed_request(&mut self) -> Option<(u64, i32)>; + fn batch_requests_enabled(&self) -> bool { + false + } + fn submit_batch_requests(&mut self, _batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + Ok(()) + } } diff --git a/block/src/fcntl.rs b/block/src/fcntl.rs index 2e34de1d6a..3687288a6b 100644 --- a/block/src/fcntl.rs +++ b/block/src/fcntl.rs @@ -101,13 +101,52 @@ impl LockState { } } +/// The granularity of the advisory lock. +/// +/// The granularity has significant implications in typical cloud deployments +/// with network storage. The Linux kernel will sync advisory locks to network +/// file systems, but these backends may have different policies and handle +/// locks differently. 
For example, Netapp speaks a NFS API but will treat +/// advisory OFD locks for the whole file as mandatory locks, whereas byte-range +/// locks for the whole file will remain advisory [0]. +/// +/// As it is a valid use case to prevent multiple CHV instances from accessing +/// the same disk but disk management software (e.g., Cinder in OpenStack) +/// should be able to snapshot disks while VMs are running, we need special +/// control over the lock granularity. Therefore, it is a valid use case to lock +/// the whole byte range of a disk image without technically locking the whole +/// file - to get the best of both worlds. +/// +/// [0] https://kb.netapp.com/on-prem/ontap/da/NAS/NAS-KBs/How_is_Mandatory_Locking_supported_for_NFSv4_on_ONTAP_9 +#[derive(Clone, Copy, Debug)] +pub enum LockGranularity { + WholeFile, + ByteRange(u64 /* from, inclusive */, u64 /* len */), +} + +impl LockGranularity { + const fn l_start(self) -> u64 { + match self { + LockGranularity::WholeFile => 0, + LockGranularity::ByteRange(start, _) => start, + } + } + + const fn l_len(self) -> u64 { + match self { + LockGranularity::WholeFile => 0, /* EOF */ + LockGranularity::ByteRange(_, len) => len, + } + } +} + /// Returns a [`struct@libc::flock`] structure for the whole file. -const fn get_flock(lock_type: LockType) -> libc::flock { +const fn get_flock(lock_type: LockType, granularity: LockGranularity) -> libc::flock { libc::flock { l_type: lock_type.to_libc_val() as libc::c_short, l_whence: libc::SEEK_SET as libc::c_short, - l_start: 0, - l_len: 0, /* EOF */ + l_start: granularity.l_start() as libc::c_long, + l_len: granularity.l_len() as libc::c_long, l_pid: 0, /* filled by callee */ } } @@ -122,8 +161,13 @@ const fn get_flock(lock_type: LockType) -> libc::flock { /// - `file`: The file to acquire a lock for [`LockType`]. The file's state will /// be logically mutated, but not technically. /// - `lock_type`: The [`LockType`] -pub fn try_acquire_lock(file: Fd, lock_type: LockType) -> Result<(), LockError> { - let flock = get_flock(lock_type); +/// - `granularity`: The [`LockGranularity`]. +pub fn try_acquire_lock( + file: Fd, + lock_type: LockType, + granularity: LockGranularity, +) -> Result<(), LockError> { + let flock = get_flock(lock_type, granularity); let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_SETLK(&flock)); match res { @@ -146,8 +190,9 @@ pub fn try_acquire_lock(file: Fd, lock_type: LockType) -> Result<() /// /// # Parameters /// - `file`: The file to clear all locks for [`LockType`]. -pub fn clear_lock(file: Fd) -> Result<(), LockError> { - try_acquire_lock(file, LockType::Unlock) +/// - `granularity`: The [`LockGranularity`]. +pub fn clear_lock(file: Fd, granularity: LockGranularity) -> Result<(), LockError> { + try_acquire_lock(file, LockType::Unlock, granularity) } /// Returns the current lock state using [`fcntl`] with respect to the given @@ -155,8 +200,12 @@ pub fn clear_lock(file: Fd) -> Result<(), LockError> { /// /// # Parameters /// - `file`: The file for which to get the lock state. -pub fn get_lock_state(file: Fd) -> Result { - let mut flock = get_flock(LockType::Write); +/// - `granularity`: The [`LockGranularity`]. 
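+/// For instance, querying with `LockGranularity::WholeFile` reports the state with respect
+/// to a hypothetical write lock spanning the entire file, while a `ByteRange` query only
+/// probes the given range.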
+pub fn get_lock_state( + file: Fd, + granularity: LockGranularity, +) -> Result { + let mut flock = get_flock(LockType::Write, granularity); let res = fcntl(file.as_raw_fd(), FcntlArg::F_OFD_GETLK(&mut flock)); match res { 0 => { diff --git a/block/src/fixed_vhd.rs b/block/src/fixed_vhd.rs index 22ef4dd80d..379005ae28 100644 --- a/block/src/fixed_vhd.rs +++ b/block/src/fixed_vhd.rs @@ -6,8 +6,8 @@ use std::fs::File; use std::io::{Read, Seek, SeekFrom, Write}; use std::os::unix::io::{AsRawFd, RawFd}; -use crate::vhd::VhdFooter; use crate::BlockBackend; +use crate::vhd::VhdFooter; #[derive(Debug)] pub struct FixedVhd { diff --git a/block/src/fixed_vhd_async.rs b/block/src/fixed_vhd_async.rs index 6b51d070f8..07ad258c4a 100644 --- a/block/src/fixed_vhd_async.rs +++ b/block/src/fixed_vhd_async.rs @@ -12,7 +12,7 @@ use crate::async_io::{ }; use crate::fixed_vhd::FixedVhd; use crate::raw_async::RawFileAsync; -use crate::BlockBackend; +use crate::{BatchRequest, BlockBackend}; pub struct FixedVhdDiskAsync(FixedVhd); @@ -34,6 +34,10 @@ impl DiskFile for FixedVhdDiskAsync { ) as Box) } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.0.as_raw_fd()) } @@ -106,4 +110,12 @@ impl AsyncIo for FixedVhdAsync { fn next_completed_request(&mut self) -> Option<(u64, i32)> { self.raw_file_async.next_completed_request() } + + fn batch_requests_enabled(&self) -> bool { + true + } + + fn submit_batch_requests(&mut self, batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + self.raw_file_async.submit_batch_requests(batch_request) + } } diff --git a/block/src/fixed_vhd_sync.rs b/block/src/fixed_vhd_sync.rs index b1f2118f19..0f05c66ad7 100644 --- a/block/src/fixed_vhd_sync.rs +++ b/block/src/fixed_vhd_sync.rs @@ -7,12 +7,12 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::eventfd::EventFd; +use crate::BlockBackend; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; use crate::fixed_vhd::FixedVhd; use crate::raw_sync::RawFileSync; -use crate::BlockBackend; pub struct FixedVhdDiskSync(FixedVhd); @@ -34,6 +34,10 @@ impl DiskFile for FixedVhdDiskSync { ) as Box) } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.0.as_raw_fd()) } diff --git a/block/src/lib.rs b/block/src/lib.rs index 1424848ba3..5599258e3e 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -31,7 +31,7 @@ pub mod vhd; pub mod vhdx; pub mod vhdx_sync; -use std::alloc::{alloc_zeroed, dealloc, Layout}; +use std::alloc::{Layout, alloc_zeroed, dealloc}; use std::collections::VecDeque; use std::fmt::Debug; use std::fs::File; @@ -39,13 +39,13 @@ use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write}; use std::os::linux::fs::MetadataExt; use std::os::unix::io::AsRawFd; use std::path::Path; -use std::sync::{Arc, MutexGuard}; +use std::sync::Arc; use std::time::Instant; use std::{cmp, result}; #[cfg(feature = "io_uring")] -use io_uring::{opcode, IoUring, Probe}; -use libc::{ioctl, S_IFBLK, S_IFMT}; +use io_uring::{IoUring, Probe, opcode}; +use libc::{S_IFBLK, S_IFMT, ioctl}; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; use thiserror::Error; @@ -57,7 +57,7 @@ use vm_memory::{ }; use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; -use vmm_sys_util::{aio, ioctl_io_nr, ioctl_ioc_nr}; +use 
vmm_sys_util::{aio, ioctl_io_nr}; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult}; use crate::vhdx::VhdxError; @@ -139,6 +139,8 @@ pub enum ExecuteError { Read(#[source] GuestMemoryError), #[error("Failed to read_exact")] ReadExact(#[source] io::Error), + #[error("Can't execute an operation other than `read` on a read-only device")] + ReadOnly, #[error("Failed to seek")] Seek(#[source] io::Error), #[error("Failed to write")] @@ -168,6 +170,7 @@ impl ExecuteError { ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR, ExecuteError::ReadExact(_) => VIRTIO_BLK_S_IOERR, + ExecuteError::ReadOnly => VIRTIO_BLK_S_IOERR, ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR, ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR, ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR, @@ -229,6 +232,20 @@ pub struct AlignedOperation { layout: Layout, } +pub struct BatchRequest { + pub offset: libc::off_t, + pub iovecs: SmallVec<[libc::iovec; DEFAULT_DESCRIPTOR_VEC_SIZE]>, + pub user_data: u64, + pub request_type: RequestType, +} + +pub struct ExecuteAsync { + // `true` if the execution will complete asynchronously + pub async_complete: bool, + // request need to be batched for submission if any + pub batch_request: Option, +} + #[derive(Debug)] pub struct Request { pub request_type: RequestType, @@ -394,7 +411,7 @@ impl Request { disk_image: &mut dyn AsyncIo, serial: &[u8], user_data: u64, - ) -> result::Result { + ) -> result::Result { let sector = self.sector; let request_type = self.request_type; let offset = (sector << SECTOR_SHIFT) as libc::off_t; @@ -431,7 +448,7 @@ impl Request { // In case it's not properly aligned, an intermediate buffer is // created with the correct alignment, and a copy from/to the // origin buffer is performed, depending on the type of operation. - let iov_base = if (origin_ptr.as_ptr() as u64) % SECTOR_SIZE != 0 { + let iov_base = if !(origin_ptr.as_ptr() as u64).is_multiple_of(SECTOR_SIZE) { let layout = Layout::from_size_align(data_len, SECTOR_SIZE as usize).unwrap(); // SAFETY: layout has non-zero size let aligned_ptr = unsafe { alloc_zeroed(layout) }; @@ -470,6 +487,10 @@ impl Request { iovecs.push(iovec); } + let mut ret = ExecuteAsync { + async_complete: true, + batch_request: None, + }; // Queue operations expected to be submitted. 
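+        // When the backend supports batching, read and write requests are not submitted here;
+        // their iovecs are handed back in `ExecuteAsync::batch_request` so the caller can pass
+        // them to `submit_batch_requests()` in one go.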
match request_type { RequestType::In => { @@ -479,14 +500,32 @@ impl Request { .bitmap() .mark_dirty(0, *data_len as usize); } - disk_image - .read_vectored(offset, &iovecs, user_data) - .map_err(ExecuteError::AsyncRead)?; + if disk_image.batch_requests_enabled() { + ret.batch_request = Some(BatchRequest { + offset, + iovecs, + user_data, + request_type, + }); + } else { + disk_image + .read_vectored(offset, &iovecs, user_data) + .map_err(ExecuteError::AsyncRead)?; + } } RequestType::Out => { - disk_image - .write_vectored(offset, &iovecs, user_data) - .map_err(ExecuteError::AsyncWrite)?; + if disk_image.batch_requests_enabled() { + ret.batch_request = Some(BatchRequest { + offset, + iovecs, + user_data, + request_type, + }); + } else { + disk_image + .write_vectored(offset, &iovecs, user_data) + .map_err(ExecuteError::AsyncWrite)?; + } } RequestType::Flush => { disk_image @@ -504,12 +543,13 @@ impl Request { } mem.write_slice(serial, data_addr) .map_err(ExecuteError::Write)?; - return Ok(false); + ret.async_complete = false; + return Ok(ret); } RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)), } - Ok(true) + Ok(ret) } pub fn complete_async(&mut self) -> result::Result<(), Error> { @@ -646,10 +686,7 @@ pub fn block_io_uring_is_supported() -> bool { } } -pub trait AsyncAdaptor -where - F: Read + Write + Seek, -{ +pub trait AsyncAdaptor { fn read_vectored_sync( &mut self, offset: libc::off_t, @@ -657,7 +694,10 @@ where user_data: u64, eventfd: &EventFd, completion_list: &mut VecDeque<(u64, i32)>, - ) -> AsyncIoResult<()> { + ) -> AsyncIoResult<()> + where + Self: Read + Seek, + { // Convert libc::iovec into IoSliceMut let mut slices: SmallVec<[IoSliceMut; DEFAULT_DESCRIPTOR_VEC_SIZE]> = SmallVec::with_capacity(iovecs.len()); @@ -669,15 +709,13 @@ where } let result = { - let mut file = self.file(); - // Move the cursor to the right offset - file.seek(SeekFrom::Start(offset as u64)) + self.seek(SeekFrom::Start(offset as u64)) .map_err(AsyncIoError::ReadVectored)?; let mut r = 0; for b in slices.iter_mut() { - r += file.read(b).map_err(AsyncIoError::ReadVectored)?; + r += self.read(b).map_err(AsyncIoError::ReadVectored)?; } r }; @@ -695,7 +733,10 @@ where user_data: u64, eventfd: &EventFd, completion_list: &mut VecDeque<(u64, i32)>, - ) -> AsyncIoResult<()> { + ) -> AsyncIoResult<()> + where + Self: Write + Seek, + { // Convert libc::iovec into IoSlice let mut slices: SmallVec<[IoSlice; DEFAULT_DESCRIPTOR_VEC_SIZE]> = SmallVec::with_capacity(iovecs.len()); @@ -707,15 +748,13 @@ where } let result = { - let mut file = self.file(); - // Move the cursor to the right offset - file.seek(SeekFrom::Start(offset as u64)) + self.seek(SeekFrom::Start(offset as u64)) .map_err(AsyncIoError::WriteVectored)?; let mut r = 0; for b in slices.iter() { - r += file.write(b).map_err(AsyncIoError::WriteVectored)?; + r += self.write(b).map_err(AsyncIoError::WriteVectored)?; } r }; @@ -731,12 +770,13 @@ where user_data: Option, eventfd: &EventFd, completion_list: &mut VecDeque<(u64, i32)>, - ) -> AsyncIoResult<()> { + ) -> AsyncIoResult<()> + where + Self: Write, + { let result: i32 = { - let mut file = self.file(); - // Flush - file.flush().map_err(AsyncIoError::Fsync)?; + self.flush().map_err(AsyncIoError::Fsync)?; 0 }; @@ -748,8 +788,6 @@ where Ok(()) } - - fn file(&mut self) -> MutexGuard<'_, F>; } pub enum ImageType { diff --git a/block/src/qcow/mod.rs b/block/src/qcow/mod.rs index 6d74232ddf..14deafc856 100644 --- a/block/src/qcow/mod.rs +++ b/block/src/qcow/mod.rs @@ -24,11 +24,11 @@ 
use vmm_sys_util::file_traits::{FileSetLen, FileSync}; use vmm_sys_util::seek_hole::SeekHole; use vmm_sys_util::write_zeroes::{PunchHole, WriteZeroesAt}; +use crate::BlockBackend; use crate::qcow::qcow_raw_file::QcowRawFile; pub use crate::qcow::raw_file::RawFile; use crate::qcow::refcount::RefCount; use crate::qcow::vec_cache::{CacheMap, Cacheable, VecCache}; -use crate::BlockBackend; /// Nesting depth limit for disk formats that can open other disk files. const MAX_NESTING_DEPTH: u32 = 10; @@ -287,11 +287,12 @@ impl QcowHeader { let cluster_bits: u32 = DEFAULT_CLUSTER_BITS; let cluster_size: u32 = 0x01 << cluster_bits; let max_length: usize = (cluster_size - header_size) as usize; - if let Some(path) = backing_file { - if path.len() > max_length { - return Err(Error::BackingFileTooLong(path.len() - max_length)); - } + if let Some(path) = backing_file + && path.len() > max_length + { + return Err(Error::BackingFileTooLong(path.len() - max_length)); } + // L2 blocks are always one cluster long. They contain cluster_size/sizeof(u64) addresses. let entries_per_cluster: u32 = cluster_size / size_of::() as u32; let num_clusters: u32 = div_round_up_u64(size, u64::from(cluster_size)) as u32; @@ -425,7 +426,7 @@ fn max_refcount_clusters(refcount_order: u32, cluster_size: u32, num_clusters: u /// # Ok(()) /// # } /// ``` -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct QcowFile { raw_file: QcowRawFile, header: QcowHeader, @@ -589,14 +590,12 @@ impl QcowFile { // Check for compressed blocks for l2_addr_disk in l1_table.get_values() { - if *l2_addr_disk != 0 { - if let Err(e) = Self::read_l2_cluster(&mut raw_file, *l2_addr_disk) { - if let Some(os_error) = e.raw_os_error() { - if os_error == ENOTSUP { - return Err(Error::CompressedBlocksNotSupported); - } - } - } + if *l2_addr_disk != 0 + && let Err(e) = Self::read_l2_cluster(&mut raw_file, *l2_addr_disk) + && let Some(os_error) = e.raw_os_error() + && os_error == ENOTSUP + { + return Err(Error::CompressedBlocksNotSupported); } } @@ -1584,11 +1583,11 @@ impl Seek for QcowFile { } }; - if let Some(o) = new_offset { - if o <= self.virtual_size() { - self.current_offset = o; - return Ok(o); - } + if let Some(o) = new_offset + && o <= self.virtual_size() + { + self.current_offset = o; + return Ok(o); } Err(std::io::Error::from_raw_os_error(EINVAL)) } @@ -1705,12 +1704,12 @@ fn offset_is_cluster_boundary(offset: u64, cluster_bits: u32) -> Result<()> { // Ceiling of the division of `dividend`/`divisor`. fn div_round_up_u64(dividend: u64, divisor: u64) -> u64 { - dividend / divisor + u64::from(dividend % divisor != 0) + dividend / divisor + u64::from(!dividend.is_multiple_of(divisor)) } // Ceiling of the division of `dividend`/`divisor`. 
fn div_round_up_u32(dividend: u32, divisor: u32) -> u32 { - dividend / divisor + u32::from(dividend % divisor != 0) + dividend / divisor + u32::from(!dividend.is_multiple_of(divisor)) } fn convert_copy(reader: &mut R, writer: &mut W, offset: u64, size: u64) -> Result<()> diff --git a/block/src/qcow/raw_file.rs b/block/src/qcow/raw_file.rs index cb96376015..f0eff54df3 100644 --- a/block/src/qcow/raw_file.rs +++ b/block/src/qcow/raw_file.rs @@ -8,7 +8,7 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use std::alloc::{alloc_zeroed, dealloc, Layout}; +use std::alloc::{Layout, alloc_zeroed, dealloc}; use std::fs::{File, Metadata}; use std::io::{self, Read, Seek, SeekFrom, Write}; use std::os::unix::io::{AsRawFd, RawFd}; @@ -89,9 +89,9 @@ impl RawFile { let align64: u64 = self.alignment.try_into().unwrap(); - (self.position % align64 == 0) - && ((buf.as_ptr() as usize) % self.alignment == 0) - && (buf.len() % self.alignment == 0) + self.position.is_multiple_of(align64) + && (buf.as_ptr() as usize).is_multiple_of(self.alignment) + && buf.len().is_multiple_of(self.alignment) } pub fn set_len(&self, size: u64) -> std::io::Result<()> { diff --git a/block/src/qcow/vec_cache.rs b/block/src/qcow/vec_cache.rs index 67068fdded..4b18518b0e 100644 --- a/block/src/qcow/vec_cache.rs +++ b/block/src/qcow/vec_cache.rs @@ -4,8 +4,8 @@ // // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause -use std::collections::hash_map::IterMut; use std::collections::HashMap; +use std::collections::hash_map::IterMut; use std::io; use std::ops::{Index, IndexMut}; use std::slice::SliceIndex; @@ -123,10 +123,10 @@ impl CacheMap { if self.map.len() == self.capacity { // TODO(dgreid) - smarter eviction strategy. let to_evict = *self.map.iter().next().unwrap().0; - if let Some(evicted) = self.map.remove(&to_evict) { - if evicted.dirty() { - write_callback(to_evict, evicted)?; - } + if let Some(evicted) = self.map.remove(&to_evict) + && evicted.dirty() + { + write_callback(to_evict, evicted)?; } } self.map.insert(index, block); diff --git a/block/src/qcow_sync.rs b/block/src/qcow_sync.rs index f07e245e01..36b82e81cc 100644 --- a/block/src/qcow_sync.rs +++ b/block/src/qcow_sync.rs @@ -6,33 +6,32 @@ use std::collections::VecDeque; use std::fs::File; use std::io::{Seek, SeekFrom}; use std::os::fd::AsRawFd; -use std::sync::{Arc, Mutex, MutexGuard}; use vmm_sys_util::eventfd::EventFd; +use crate::AsyncAdaptor; use crate::async_io::{ AsyncIo, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; use crate::qcow::{QcowFile, RawFile, Result as QcowResult}; -use crate::AsyncAdaptor; pub struct QcowDiskSync { - qcow_file: Arc>, + qcow_file: QcowFile, } impl QcowDiskSync { pub fn new(file: File, direct_io: bool) -> QcowResult { Ok(QcowDiskSync { - qcow_file: Arc::new(Mutex::new(QcowFile::from(RawFile::new(file, direct_io))?)), + qcow_file: QcowFile::from(RawFile::new(file, direct_io))?, }) } } impl DiskFile for QcowDiskSync { fn size(&mut self) -> DiskFileResult { - let mut file = self.qcow_file.lock().unwrap(); - - file.seek(SeekFrom::End(0)).map_err(DiskFileError::Size) + self.qcow_file + .seek(SeekFrom::End(0)) + .map_err(DiskFileError::Size) } fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { @@ -40,19 +39,21 @@ impl DiskFile for QcowDiskSync { } fn fd(&mut self) -> BorrowedDiskFd<'_> { - let lock = self.qcow_file.lock().unwrap(); - BorrowedDiskFd::new(lock.as_raw_fd()) + BorrowedDiskFd::new(self.qcow_file.as_raw_fd()) + } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + 
Err(DiskFileError::Unsupported) } } pub struct QcowSync { - qcow_file: Arc>, + qcow_file: QcowFile, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, } impl QcowSync { - pub fn new(qcow_file: Arc>) -> Self { + pub fn new(qcow_file: QcowFile) -> Self { QcowSync { qcow_file, eventfd: EventFd::new(libc::EFD_NONBLOCK) @@ -62,11 +63,7 @@ impl QcowSync { } } -impl AsyncAdaptor for Arc> { - fn file(&mut self) -> MutexGuard<'_, QcowFile> { - self.lock().unwrap() - } -} +impl AsyncAdaptor for QcowFile {} impl AsyncIo for QcowSync { fn notifier(&self) -> &EventFd { diff --git a/block/src/raw_async.rs b/block/src/raw_async.rs index 496445c6ad..a982623a59 100644 --- a/block/src/raw_async.rs +++ b/block/src/raw_async.rs @@ -6,13 +6,13 @@ use std::fs::File; use std::io::{Error, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; -use io_uring::{opcode, types, IoUring}; +use io_uring::{IoUring, opcode, types}; use vmm_sys_util::eventfd::EventFd; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::DiskTopology; +use crate::{BatchRequest, DiskTopology, RequestType}; pub struct RawFileDisk { file: File, @@ -47,6 +47,19 @@ impl DiskFile for RawFileDisk { } } + fn resize(&mut self, size: u64) -> DiskFileResult<()> { + let borrowed_fd = self.fd(); + let raw_fd = borrowed_fd.as_raw_fd(); + + // SAFETY: FFI call into libc, trivially safe + let rc = unsafe { libc::ftruncate(raw_fd, size as libc::off_t) }; + if rc == 0 { + Ok(()) + } else { + Err(DiskFileError::ResizeError) + } + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.file.as_raw_fd()) } @@ -168,4 +181,77 @@ impl AsyncIo for RawFileAsync { .next() .map(|entry| (entry.user_data(), entry.result())) } + + fn batch_requests_enabled(&self) -> bool { + true + } + + fn submit_batch_requests(&mut self, batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + if !self.batch_requests_enabled() { + return Ok(()); + } + + let (submitter, mut sq, _) = self.io_uring.split(); + let mut submitted = false; + + for req in batch_request { + match req.request_type { + RequestType::In => { + // SAFETY: we know the file descriptor is valid and we + // relied on vm-memory to provide the buffer address. + unsafe { + sq.push( + &opcode::Readv::new( + types::Fd(self.fd), + req.iovecs.as_ptr(), + req.iovecs.len() as u32, + ) + .offset(req.offset as u64) + .build() + .user_data(req.user_data), + ) + .map_err(|_| { + AsyncIoError::ReadVectored(Error::other("Submission queue is full")) + })? + }; + submitted = true; + } + RequestType::Out => { + // SAFETY: we know the file descriptor is valid and we + // relied on vm-memory to provide the buffer address. + unsafe { + sq.push( + &opcode::Writev::new( + types::Fd(self.fd), + req.iovecs.as_ptr(), + req.iovecs.len() as u32, + ) + .offset(req.offset as u64) + .build() + .user_data(req.user_data), + ) + .map_err(|_| { + AsyncIoError::WriteVectored(Error::other("Submission queue is full")) + })? + }; + submitted = true; + } + _ => { + unreachable!("Unexpected batch request type: {:?}", req.request_type) + } + } + } + + // Only submit if we actually queued something + if submitted { + // Update the submission queue and submit new operations to the + // io_uring instance. 
+ sq.sync(); + submitter + .submit() + .map_err(AsyncIoError::SubmitBatchRequests)?; + } + + Ok(()) + } } diff --git a/block/src/raw_async_aio.rs b/block/src/raw_async_aio.rs index 9ef0c62619..7404e81c81 100644 --- a/block/src/raw_async_aio.rs +++ b/block/src/raw_async_aio.rs @@ -12,10 +12,10 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::aio; use vmm_sys_util::eventfd::EventFd; +use crate::DiskTopology; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::DiskTopology; pub struct RawFileDiskAio { file: File, @@ -50,6 +50,10 @@ impl DiskFile for RawFileDiskAio { } } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.file.as_raw_fd()) } diff --git a/block/src/raw_sync.rs b/block/src/raw_sync.rs index 54ba1acca6..43a9a5b3f0 100644 --- a/block/src/raw_sync.rs +++ b/block/src/raw_sync.rs @@ -9,10 +9,10 @@ use std::os::unix::io::{AsRawFd, RawFd}; use vmm_sys_util::eventfd::EventFd; +use crate::DiskTopology; use crate::async_io::{ AsyncIo, AsyncIoError, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; -use crate::DiskTopology; pub struct RawFileDiskSync { file: File, @@ -47,6 +47,10 @@ impl DiskFile for RawFileDiskSync { fn fd(&mut self) -> BorrowedDiskFd<'_> { BorrowedDiskFd::new(self.file.as_raw_fd()) } + + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } } pub struct RawFileSync { diff --git a/block/src/vhd.rs b/block/src/vhd.rs index 2cc65ca0b9..6659ed0385 100644 --- a/block/src/vhd.rs +++ b/block/src/vhd.rs @@ -5,7 +5,7 @@ use std::fs::File; use std::io::{Seek, SeekFrom}; -use crate::{read_aligned_block_size, DiskTopology}; +use crate::{DiskTopology, read_aligned_block_size}; #[derive(Clone, Copy)] pub struct VhdFooter { @@ -123,7 +123,7 @@ mod tests { use vmm_sys_util::tempfile::TempFile; - use super::{is_fixed_vhd, VhdFooter}; + use super::{VhdFooter, is_fixed_vhd}; fn valid_fixed_vhd_footer() -> Vec { vec![ diff --git a/block/src/vhdx/mod.rs b/block/src/vhdx/mod.rs index 45974c5a3f..141c43c6d5 100644 --- a/block/src/vhdx/mod.rs +++ b/block/src/vhdx/mod.rs @@ -12,11 +12,11 @@ use remain::sorted; use thiserror::Error; use uuid::Uuid; +use crate::BlockBackend; use crate::vhdx::vhdx_bat::{BatEntry, VhdxBatError}; use crate::vhdx::vhdx_header::{RegionInfo, RegionTableEntry, VhdxHeader, VhdxHeaderError}; use crate::vhdx::vhdx_io::VhdxIoError; use crate::vhdx::vhdx_metadata::{DiskSpec, VhdxMetadataError}; -use crate::BlockBackend; mod vhdx_bat; mod vhdx_header; @@ -187,11 +187,11 @@ impl Seek for Vhdx { } }; - if let Some(o) = new_offset { - if o <= self.virtual_disk_size() { - self.current_offset = o; - return Ok(o); - } + if let Some(o) = new_offset + && o <= self.virtual_disk_size() + { + self.current_offset = o; + return Ok(o); } Err(std::io::Error::new( diff --git a/block/src/vhdx/vhdx_io.rs b/block/src/vhdx/vhdx_io.rs index 30e3837876..14feac8d9d 100644 --- a/block/src/vhdx/vhdx_io.rs +++ b/block/src/vhdx/vhdx_io.rs @@ -35,9 +35,7 @@ pub enum VhdxIoError { pub type Result = std::result::Result; macro_rules! 
align { - ($n:expr, $align:expr) => {{ - $n.div_ceil($align) * $align - }}; + ($n:expr, $align:expr) => {{ $n.div_ceil($align) * $align }}; } #[derive(Default)] diff --git a/block/src/vhdx_sync.rs b/block/src/vhdx_sync.rs index d832f5e3cc..0028672d36 100644 --- a/block/src/vhdx_sync.rs +++ b/block/src/vhdx_sync.rs @@ -5,31 +5,30 @@ use std::collections::VecDeque; use std::fs::File; use std::os::fd::AsRawFd; -use std::sync::{Arc, Mutex, MutexGuard}; use vmm_sys_util::eventfd::EventFd; +use crate::AsyncAdaptor; use crate::async_io::{ AsyncIo, AsyncIoResult, BorrowedDiskFd, DiskFile, DiskFileError, DiskFileResult, }; use crate::vhdx::{Result as VhdxResult, Vhdx}; -use crate::AsyncAdaptor; pub struct VhdxDiskSync { - vhdx_file: Arc>, + vhdx_file: Vhdx, } impl VhdxDiskSync { pub fn new(f: File) -> VhdxResult { Ok(VhdxDiskSync { - vhdx_file: Arc::new(Mutex::new(Vhdx::new(f)?)), + vhdx_file: Vhdx::new(f)?, }) } } impl DiskFile for VhdxDiskSync { fn size(&mut self) -> DiskFileResult { - Ok(self.vhdx_file.lock().unwrap().virtual_disk_size()) + Ok(self.vhdx_file.virtual_disk_size()) } fn new_async_io(&self, _ring_depth: u32) -> DiskFileResult> { @@ -39,20 +38,23 @@ impl DiskFile for VhdxDiskSync { ) } + fn resize(&mut self, _size: u64) -> DiskFileResult<()> { + Err(DiskFileError::Unsupported) + } + fn fd(&mut self) -> BorrowedDiskFd<'_> { - let lock = self.vhdx_file.lock().unwrap(); - BorrowedDiskFd::new(lock.as_raw_fd()) + BorrowedDiskFd::new(self.vhdx_file.as_raw_fd()) } } pub struct VhdxSync { - vhdx_file: Arc>, + vhdx_file: Vhdx, eventfd: EventFd, completion_list: VecDeque<(u64, i32)>, } impl VhdxSync { - pub fn new(vhdx_file: Arc>) -> std::io::Result { + pub fn new(vhdx_file: Vhdx) -> std::io::Result { Ok(VhdxSync { vhdx_file, eventfd: EventFd::new(libc::EFD_NONBLOCK)?, @@ -61,11 +63,7 @@ impl VhdxSync { } } -impl AsyncAdaptor for Arc> { - fn file(&mut self) -> MutexGuard<'_, Vhdx> { - self.lock().unwrap() - } -} +impl AsyncAdaptor for Vhdx {} impl AsyncIo for VhdxSync { fn notifier(&self) -> &EventFd { diff --git a/build.rs b/build.rs index 37a5ffd9fa..080c625599 100644 --- a/build.rs +++ b/build.rs @@ -9,14 +9,13 @@ use std::process::Command; fn main() { let mut version = "v".to_owned() + env!("CARGO_PKG_VERSION"); - if let Ok(git_out) = Command::new("git").args(["describe", "--dirty"]).output() { - if git_out.status.success() { - if let Ok(git_out_str) = String::from_utf8(git_out.stdout) { - version = git_out_str; - // Pop the trailing newline. - version.pop(); - } - } + if let Ok(git_out) = Command::new("git").args(["describe", "--dirty"]).output() + && git_out.status.success() + && let Ok(git_out_str) = String::from_utf8(git_out.stdout) + { + version = git_out_str; + // Pop the trailing newline. + version.pop(); } // Append CH_EXTRA_VERSION to version if it is set. 
diff --git a/devices/Cargo.toml b/devices/Cargo.toml index 334ec0e310..08aa7fd60d 100644 --- a/devices/Cargo.toml +++ b/devices/Cargo.toml @@ -1,22 +1,28 @@ [package] authors = ["The Chromium OS Authors"] -edition = "2021" +edition.workspace = true name = "devices" version = "0.1.0" [dependencies] acpi_tables = { workspace = true } -anyhow = "1.0.94" +anyhow = { workspace = true } arch = { path = "../arch" } -bitflags = "2.9.0" -byteorder = "1.5.0" +bitfield-struct = { version = "0.10.1", optional = true } +bitflags = { workspace = true } +byteorder = { workspace = true } event_monitor = { path = "../event_monitor" } hypervisor = { path = "../hypervisor" } -libc = "0.2.167" -log = "0.4.22" +libc = { workspace = true } +linux-loader = { workspace = true, features = [ + "bzimage", + "elf", + "pe", +], optional = true } +log = { workspace = true } num_enum = "0.7.2" pci = { path = "../pci" } -serde = { version = "1.0.208", features = ["derive"] } +serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } tpm = { path = "../tpm" } vm-allocator = { path = "../vm-allocator" } @@ -28,11 +34,20 @@ vm-memory = { workspace = true, features = [ ] } vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true } +zerocopy = { version = "0.8.26", features = [ + "alloc", + "derive", +], optional = true } [target.'cfg(any(target_arch = "aarch64", target_arch = "riscv64"))'.dependencies] arch = { path = "../arch" } [features] default = [] +fw_cfg = ["arch/fw_cfg", "bitfield-struct", "linux-loader", "zerocopy"] +ivshmem = [] kvm = ["arch/kvm"] pvmemcontrol = [] + +[lints] +workspace = true diff --git a/devices/src/acpi.rs b/devices/src/acpi.rs index 2a38f5974c..229b67be54 100644 --- a/devices/src/acpi.rs +++ b/devices/src/acpi.rs @@ -8,9 +8,9 @@ use std::sync::{Arc, Barrier}; use std::thread; use std::time::Instant; -use acpi_tables::{aml, Aml, AmlSink}; -use vm_device::interrupt::InterruptSourceGroup; +use acpi_tables::{Aml, AmlSink, aml}; use vm_device::BusDevice; +use vm_device::interrupt::InterruptSourceGroup; use vm_memory::GuestAddress; use vmm_sys_util::eventfd::EventFd; diff --git a/devices/src/aia.rs b/devices/src/aia.rs index 83ed1585f4..f3956727ac 100644 --- a/devices/src/aia.rs +++ b/devices/src/aia.rs @@ -40,7 +40,7 @@ pub struct Aia { impl Aia { pub fn new( - vcpu_count: u8, + vcpu_count: u32, interrupt_manager: Arc>, vm: Arc, ) -> Result { diff --git a/devices/src/gic.rs b/devices/src/gic.rs index afa5814a16..a157c3f25d 100644 --- a/devices/src/gic.rs +++ b/devices/src/gic.rs @@ -9,8 +9,8 @@ use std::sync::{Arc, Mutex}; use anyhow::anyhow; use arch::layout; -use hypervisor::arch::aarch64::gic::{GicState, Vgic, VgicConfig}; use hypervisor::CpuState; +use hypervisor::arch::aarch64::gic::{GicState, Vgic, VgicConfig}; use vm_device::interrupt::{ InterruptIndex, InterruptManager, InterruptSourceConfig, InterruptSourceGroup, LegacyIrqSourceConfig, MsiIrqGroupConfig, @@ -39,7 +39,7 @@ pub struct Gic { impl Gic { pub fn new( - vcpu_count: u8, + vcpu_count: u32, interrupt_manager: Arc>, vm: Arc, ) -> Result { diff --git a/devices/src/ioapic.rs b/devices/src/ioapic.rs index 7adbe4f66c..f8214e96c5 100644 --- a/devices/src/ioapic.rs +++ b/devices/src/ioapic.rs @@ -14,11 +14,11 @@ use std::sync::{Arc, Barrier}; use byteorder::{ByteOrder, LittleEndian}; use serde::{Deserialize, Serialize}; +use vm_device::BusDevice; use vm_device::interrupt::{ InterruptIndex, InterruptManager, InterruptSourceConfig, InterruptSourceGroup, MsiIrqGroupConfig, MsiIrqSourceConfig, }; 
-use vm_device::BusDevice; use vm_memory::GuestAddress; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vmm_sys_util::eventfd::EventFd; @@ -171,7 +171,7 @@ impl BusDevice for Ioapic { return None; } - debug!("IOAPIC_W @ offset 0x{:x}", offset); + trace!("IOAPIC_W @ offset 0x{:x}", offset); let value = LittleEndian::read_u32(data); @@ -249,7 +249,7 @@ impl Ioapic { } fn ioapic_write(&mut self, val: u32) { - debug!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); + trace!("IOAPIC_W reg 0x{:x}, val 0x{:x}", self.reg_sel, val); match self.reg_sel as u8 { IOAPIC_REG_VERSION => { diff --git a/devices/src/ivshmem.rs b/devices/src/ivshmem.rs new file mode 100644 index 0000000000..50c056edf3 --- /dev/null +++ b/devices/src/ivshmem.rs @@ -0,0 +1,420 @@ +// Copyright © 2024 Tencent Corporation. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 +// + +use std::any::Any; +use std::path::PathBuf; +use std::result; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Barrier, Mutex}; + +use anyhow::anyhow; +use byteorder::{ByteOrder, LittleEndian}; +use pci::{ + BarReprogrammingParams, PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, + PciBarRegionType, PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciHeaderType, + PciSubclass, +}; +use serde::{Deserialize, Serialize}; +use thiserror::Error; +use vm_allocator::{AddressAllocator, SystemAllocator}; +use vm_device::{BusDevice, Resource, UserspaceMapping}; +use vm_memory::bitmap::AtomicBitmap; +use vm_memory::{Address, GuestAddress}; +use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; + +const IVSHMEM_BAR0_IDX: usize = 0; +const IVSHMEM_BAR1_IDX: usize = 1; +const IVSHMEM_BAR2_IDX: usize = 2; + +const IVSHMEM_VENDOR_ID: u16 = 0x1af4; +const IVSHMEM_DEVICE_ID: u16 = 0x1110; + +const IVSHMEM_REG_BAR_SIZE: u64 = 0x100; + +type GuestRegionMmap = vm_memory::GuestRegionMmap<AtomicBitmap>; + +#[derive(Debug, Error)] +pub enum IvshmemError { + #[error("Failed to retrieve PciConfigurationState: {0}")] + RetrievePciConfigurationState(#[source] anyhow::Error), + #[error("Failed to retrieve IvshmemDeviceState: {0}")] + RetrieveIvshmemDeviceStateState(#[source] anyhow::Error), + #[error("Failed to remove user memory region")] + RemoveUserMemoryRegion, + #[error("Failed to create user memory region.")] + CreateUserMemoryRegion, + #[error("Failed to create userspace mapping.")] + CreateUserspaceMapping, + #[error("Failed to remove old userspace mapping.")] + RemoveUserspaceMapping, +} + +#[derive(Copy, Clone)] +pub enum IvshmemSubclass { + Other = 0x00, +} + +impl PciSubclass for IvshmemSubclass { + fn get_register_value(&self) -> u8 { + *self as u8 + } +} + +pub trait IvshmemOps: Send + Sync { + fn map_ram_region( + &mut self, + start_addr: u64, + size: usize, + backing_file: Option<PathBuf>, + ) -> Result<(Arc<GuestRegionMmap>, UserspaceMapping), IvshmemError>; + + fn unmap_ram_region(&mut self, mapping: UserspaceMapping) -> Result<(), IvshmemError>; +} + +/// Inter-VM Shared Memory Device (ivshmem device) +/// +/// This device can share memory between host and guest (ivshmem-plain) +/// and between guests (ivshmem-doorbell). +/// Only ivshmem-plain is supported for now; ivshmem-doorbell is not supported yet.
+pub struct IvshmemDevice { + id: String, + + // ivshmem device registers + // (only used for ivshmem-doorbell, ivshmem-doorbell don't support yet) + _interrupt_mask: u32, + _interrupt_status: Arc, + _iv_position: u32, + _doorbell: u32, + + // PCI configuration registers. + configuration: PciConfiguration, + bar_regions: Vec, + + region_size: u64, + ivshmem_ops: Arc>, + backend_file: Option, + region: Option>, + userspace_mapping: Option, +} + +#[derive(Serialize, Deserialize, Default, Clone)] +pub struct IvshmemDeviceState { + interrupt_mask: u32, + interrupt_status: u32, + iv_position: u32, + doorbell: u32, +} + +impl IvshmemDevice { + pub fn new( + id: String, + region_size: u64, + backend_file: Option, + ivshmem_ops: Arc>, + snapshot: Option, + ) -> Result { + let pci_configuration_state = + vm_migration::state_from_id(snapshot.as_ref(), PCI_CONFIGURATION_ID).map_err(|e| { + IvshmemError::RetrievePciConfigurationState(anyhow!( + "Failed to get PciConfigurationState from Snapshot: {e}", + )) + })?; + + let state: Option = snapshot + .as_ref() + .map(|s| s.to_state()) + .transpose() + .map_err(|e| { + IvshmemError::RetrieveIvshmemDeviceStateState(anyhow!( + "Failed to get IvshmemDeviceState from Snapshot: {e}", + )) + })?; + + let configuration = PciConfiguration::new( + IVSHMEM_VENDOR_ID, + IVSHMEM_DEVICE_ID, + 0x1, + PciClassCode::MemoryController, + &IvshmemSubclass::Other, + None, + PciHeaderType::Device, + 0, + 0, + None, + pci_configuration_state, + ); + + let device = if let Some(s) = state { + IvshmemDevice { + id, + configuration, + bar_regions: vec![], + _interrupt_mask: s.interrupt_mask, + _interrupt_status: Arc::new(AtomicU32::new(s.interrupt_status)), + _iv_position: s.iv_position, + _doorbell: s.doorbell, + region_size, + ivshmem_ops, + region: None, + userspace_mapping: None, + backend_file, + } + } else { + IvshmemDevice { + id, + configuration, + bar_regions: vec![], + _interrupt_mask: 0, + _interrupt_status: Arc::new(AtomicU32::new(0)), + _iv_position: 0, + _doorbell: 0, + region_size, + ivshmem_ops, + region: None, + userspace_mapping: None, + backend_file, + } + }; + Ok(device) + } + + pub fn set_region( + &mut self, + region: Arc, + userspace_mapping: UserspaceMapping, + ) { + self.region = Some(region); + self.userspace_mapping = Some(userspace_mapping); + } + + pub fn config_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(IVSHMEM_BAR0_IDX) + } + + pub fn data_bar_addr(&self) -> u64 { + self.configuration.get_bar_addr(IVSHMEM_BAR2_IDX) + } + + fn state(&self) -> IvshmemDeviceState { + IvshmemDeviceState { + interrupt_mask: self._interrupt_mask, + interrupt_status: self._interrupt_status.load(Ordering::SeqCst), + iv_position: self._iv_position, + doorbell: self._doorbell, + } + } +} + +impl BusDevice for IvshmemDevice { + fn read(&mut self, base: u64, offset: u64, data: &mut [u8]) { + self.read_bar(base, offset, data) + } + + fn write(&mut self, base: u64, offset: u64, data: &[u8]) -> Option> { + self.write_bar(base, offset, data) + } +} + +impl PciDevice for IvshmemDevice { + fn allocate_bars( + &mut self, + _allocator: &Arc>, + mmio32_allocator: &mut AddressAllocator, + mmio64_allocator: &mut AddressAllocator, + resources: Option>, + ) -> std::result::Result, PciDeviceError> { + let mut bars = Vec::new(); + let mut bar0_addr = None; + let mut bar2_addr = None; + + let restoring = resources.is_some(); + if let Some(resources) = resources { + for resource in resources { + match resource { + Resource::PciBar { index, base, .. 
} => { + match index { + IVSHMEM_BAR0_IDX => { + bar0_addr = Some(GuestAddress(base)); + } + IVSHMEM_BAR1_IDX => {} + IVSHMEM_BAR2_IDX => { + bar2_addr = Some(GuestAddress(base)); + } + _ => { + error!("Unexpected pci bar index {index}"); + } + }; + } + _ => { + error!("Unexpected resource {resource:?}"); + } + } + } + if bar0_addr.is_none() || bar2_addr.is_none() { + return Err(PciDeviceError::MissingResource); + } + } + + // BAR0 holds device registers (256 Byte MMIO) + let bar0_addr = mmio32_allocator + .allocate(bar0_addr, IVSHMEM_REG_BAR_SIZE, None) + .ok_or(PciDeviceError::IoAllocationFailed(IVSHMEM_REG_BAR_SIZE))?; + debug!("ivshmem bar0 address 0x{:x}", bar0_addr.0); + + let bar0 = PciBarConfiguration::default() + .set_index(IVSHMEM_BAR0_IDX) + .set_address(bar0_addr.raw_value()) + .set_size(IVSHMEM_REG_BAR_SIZE) + .set_region_type(PciBarRegionType::Memory32BitRegion) + .set_prefetchable(PciBarPrefetchable::NotPrefetchable); + + // BAR1 holds MSI-X table and PBA (only ivshmem-doorbell). + + // BAR2 maps the shared memory object + let bar2_size = self.region_size; + let bar2_addr = mmio64_allocator + .allocate(bar2_addr, bar2_size, None) + .ok_or(PciDeviceError::IoAllocationFailed(bar2_size))?; + debug!("ivshmem bar2 address 0x{:x}", bar2_addr.0); + + let bar2 = PciBarConfiguration::default() + .set_index(IVSHMEM_BAR2_IDX) + .set_address(bar2_addr.raw_value()) + .set_size(bar2_size) + .set_region_type(PciBarRegionType::Memory64BitRegion) + .set_prefetchable(PciBarPrefetchable::Prefetchable); + + if !restoring { + self.configuration + .add_pci_bar(&bar0) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar0_addr.raw_value(), e))?; + self.configuration + .add_pci_bar(&bar2) + .map_err(|e| PciDeviceError::IoRegistrationFailed(bar2_addr.raw_value(), e))?; + } + + bars.push(bar0); + bars.push(bar2); + self.bar_regions = bars.clone(); + + Ok(bars) + } + + fn free_bars( + &mut self, + _allocator: &mut SystemAllocator, + _mmio32_allocator: &mut AddressAllocator, + _mmio64_allocator: &mut AddressAllocator, + ) -> std::result::Result<(), PciDeviceError> { + unimplemented!("Device hotplug and remove are not supported for ivshmem"); + } + + fn write_config_register( + &mut self, + reg_idx: usize, + offset: u64, + data: &[u8], + ) -> (Vec, Option>) { + ( + self.configuration + .write_config_register(reg_idx, offset, data), + None, + ) + } + + fn read_config_register(&mut self, reg_idx: usize) -> u32 { + self.configuration.read_reg(reg_idx) + } + + fn read_bar(&mut self, base: u64, offset: u64, data: &mut [u8]) { + debug!("read base {base:x} offset {offset}"); + + let mut bar_idx = 0; + for (idx, bar) in self.bar_regions.iter().enumerate() { + if bar.addr() == base { + bar_idx = idx; + } + } + match bar_idx { + // bar 0 + 0 => { + // ivshmem don't use interrupt, we return zero now. 
+ LittleEndian::write_u32(data, 0); + } + // bar 2 + 1 => warn!("Unexpected read ivshmem memory idx: {offset}"), + _ => { + warn!("Invalid bar_idx: {bar_idx}"); + } + }; + } + + fn write_bar(&mut self, base: u64, offset: u64, _data: &[u8]) -> Option> { + debug!("write base {base:x} offset {offset}"); + warn!("Unexpected write ivshmem memory idx: {offset}"); + None + } + + fn move_bar(&mut self, old_base: u64, new_base: u64) -> result::Result<(), std::io::Error> { + if new_base == self.data_bar_addr() { + if let Some(old_mapping) = self.userspace_mapping.take() { + self.ivshmem_ops + .lock() + .unwrap() + .unmap_ram_region(old_mapping) + .map_err(std::io::Error::other)?; + } + let (region, new_mapping) = self + .ivshmem_ops + .lock() + .unwrap() + .map_ram_region( + new_base, + self.region_size as usize, + self.backend_file.clone(), + ) + .map_err(std::io::Error::other)?; + self.set_region(region, new_mapping); + } + for bar in self.bar_regions.iter_mut() { + if bar.addr() == old_base { + *bar = bar.set_address(new_base); + } + } + + Ok(()) + } + + fn as_any_mut(&mut self) -> &mut dyn Any { + self + } + + fn id(&self) -> Option { + Some(self.id.clone()) + } +} + +impl Pausable for IvshmemDevice {} + +impl Snapshottable for IvshmemDevice { + fn id(&self) -> String { + self.id.clone() + } + + // The snapshot/restore (also live migration) support only work for ivshmem-plain mode. + // Additional work is needed for supporting ivshmem-doorbell. + fn snapshot(&mut self) -> std::result::Result { + let mut snapshot = Snapshot::new_from_state(&self.state())?; + + // Snapshot PciConfiguration + snapshot.add_snapshot(self.configuration.id(), self.configuration.snapshot()?); + + Ok(snapshot) + } +} + +impl Transportable for IvshmemDevice {} + +impl Migratable for IvshmemDevice {} diff --git a/devices/src/legacy/cmos.rs b/devices/src/legacy/cmos.rs index 386281c67f..b0e140acc3 100644 --- a/devices/src/legacy/cmos.rs +++ b/devices/src/legacy/cmos.rs @@ -12,7 +12,7 @@ use std::{mem, thread}; // https://github.com/rust-lang/libc/issues/1848 #[cfg_attr(target_env = "musl", allow(deprecated))] use libc::time_t; -use libc::{clock_gettime, gmtime_r, timespec, tm, CLOCK_REALTIME}; +use libc::{CLOCK_REALTIME, clock_gettime, gmtime_r, timespec, tm}; use vm_device::BusDevice; use vmm_sys_util::eventfd::EventFd; diff --git a/devices/src/legacy/fw_cfg.rs b/devices/src/legacy/fw_cfg.rs new file mode 100644 index 0000000000..00b5bd7450 --- /dev/null +++ b/devices/src/legacy/fw_cfg.rs @@ -0,0 +1,954 @@ +// Copyright 2025 Google LLC. 
+// +// SPDX-License-Identifier: Apache-2.0 +// + +/// Cloud Hypervisor implementation of Qemu's fw_cfg spec +/// https://www.qemu.org/docs/master/specs/fw_cfg.html +/// Linux kernel fw_cfg driver header +/// https://github.com/torvalds/linux/blob/master/include/uapi/linux/qemu_fw_cfg.h +/// Uploading files to the guest via fw_cfg is supported for all kernels 4.6+ w/ CONFIG_FW_CFG_SYSFS enabled +/// https://cateee.net/lkddb/web-lkddb/FW_CFG_SYSFS.html +/// No kernel requirement if above functionality is not required, +/// only firmware must implement mechanism to interact with this fw_cfg device +use std::{ + fs::File, + io::{ErrorKind, Read, Result, Seek, SeekFrom}, + mem::offset_of, + os::unix::fs::FileExt, + sync::{Arc, Barrier}, +}; + +use acpi_tables::rsdp::Rsdp; +use arch::RegionType; +#[cfg(target_arch = "aarch64")] +use arch::aarch64::layout::{ + MEM_32BIT_DEVICES_START, MEM_32BIT_RESERVED_START, RAM_64BIT_START, RAM_START as HIGH_RAM_START, +}; +#[cfg(target_arch = "x86_64")] +use arch::layout::{ + EBDA_START, HIGH_RAM_START, MEM_32BIT_DEVICES_SIZE, MEM_32BIT_DEVICES_START, + MEM_32BIT_RESERVED_START, PCI_MMCONFIG_SIZE, PCI_MMCONFIG_START, RAM_64BIT_START, +}; +use bitfield_struct::bitfield; +#[cfg(target_arch = "x86_64")] +use linux_loader::bootparam::boot_params; +#[cfg(target_arch = "aarch64")] +use linux_loader::loader::pe::arm64_image_header as boot_params; +use vm_device::BusDevice; +use vm_memory::bitmap::AtomicBitmap; +use vm_memory::{ + ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, GuestMemoryMmap, +}; +use vmm_sys_util::sock_ctrl_msg::IntoIovec; +use zerocopy::{FromBytes, FromZeros, Immutable, IntoBytes}; + +#[cfg(target_arch = "x86_64")] +// https://github.com/project-oak/oak/tree/main/stage0_bin#memory-layout +const STAGE0_START_ADDRESS: GuestAddress = GuestAddress(0xfffe_0000); +#[cfg(target_arch = "x86_64")] +const STAGE0_SIZE: usize = 0x2_0000; +const E820_RAM: u32 = 1; +const E820_RESERVED: u32 = 2; + +#[cfg(target_arch = "x86_64")] +const PORT_FW_CFG_SELECTOR: u64 = 0x510; +#[cfg(target_arch = "x86_64")] +const PORT_FW_CFG_DATA: u64 = 0x511; +#[cfg(target_arch = "x86_64")] +const PORT_FW_CFG_DMA_HI: u64 = 0x514; +#[cfg(target_arch = "x86_64")] +const PORT_FW_CFG_DMA_LO: u64 = 0x518; +#[cfg(target_arch = "x86_64")] +pub const PORT_FW_CFG_BASE: u64 = 0x510; +#[cfg(target_arch = "x86_64")] +pub const PORT_FW_CFG_WIDTH: u64 = 0xc; +#[cfg(target_arch = "aarch64")] +const PORT_FW_CFG_SELECTOR: u64 = 0x9030008; +#[cfg(target_arch = "aarch64")] +const PORT_FW_CFG_DATA: u64 = 0x9030000; +#[cfg(target_arch = "aarch64")] +const PORT_FW_CFG_DMA_HI: u64 = 0x9030010; +#[cfg(target_arch = "aarch64")] +const PORT_FW_CFG_DMA_LO: u64 = 0x9030014; +#[cfg(target_arch = "aarch64")] +pub const PORT_FW_CFG_BASE: u64 = 0x9030000; +#[cfg(target_arch = "aarch64")] +pub const PORT_FW_CFG_WIDTH: u64 = 0x10; + +const FW_CFG_SIGNATURE: u16 = 0x00; +const FW_CFG_ID: u16 = 0x01; +const FW_CFG_KERNEL_SIZE: u16 = 0x08; +const FW_CFG_INITRD_SIZE: u16 = 0x0b; +const FW_CFG_KERNEL_DATA: u16 = 0x11; +const FW_CFG_INITRD_DATA: u16 = 0x12; +const FW_CFG_CMDLINE_SIZE: u16 = 0x14; +const FW_CFG_CMDLINE_DATA: u16 = 0x15; +const FW_CFG_SETUP_SIZE: u16 = 0x17; +const FW_CFG_SETUP_DATA: u16 = 0x18; +const FW_CFG_FILE_DIR: u16 = 0x19; +const FW_CFG_KNOWN_ITEMS: usize = 0x20; + +pub const FW_CFG_FILE_FIRST: u16 = 0x20; +pub const FW_CFG_DMA_SIGNATURE: [u8; 8] = *b"QEMU CFG"; +// https://github.com/torvalds/linux/blob/master/include/uapi/linux/qemu_fw_cfg.h +pub const FW_CFG_ACPI_ID: 
&str = "QEMU0002"; +// Reserved (must be enabled) +const FW_CFG_F_RESERVED: u8 = 1 << 0; +// DMA Toggle Bit (enabled by default) +const FW_CFG_F_DMA: u8 = 1 << 1; +pub const FW_CFG_FEATURE: [u8; 4] = [FW_CFG_F_RESERVED | FW_CFG_F_DMA, 0, 0, 0]; + +const COMMAND_ALLOCATE: u32 = 0x1; +const COMMAND_ADD_POINTER: u32 = 0x2; +const COMMAND_ADD_CHECKSUM: u32 = 0x3; + +const ALLOC_ZONE_HIGH: u8 = 0x1; +const ALLOC_ZONE_FSEG: u8 = 0x2; + +const FW_CFG_FILENAME_TABLE_LOADER: &str = "etc/table-loader"; +const FW_CFG_FILENAME_RSDP: &str = "acpi/rsdp"; +const FW_CFG_FILENAME_ACPI_TABLES: &str = "acpi/tables"; + +#[derive(Debug)] +pub enum FwCfgContent { + Bytes(Vec), + Slice(&'static [u8]), + File(u64, File), + U32(u32), +} + +struct FwCfgContentAccess<'a> { + content: &'a FwCfgContent, + offset: u32, +} + +impl Read for FwCfgContentAccess<'_> { + fn read(&mut self, buf: &mut [u8]) -> Result { + match self.content { + FwCfgContent::File(offset, f) => { + Seek::seek(&mut (&*f), SeekFrom::Start(offset + self.offset as u64))?; + Read::read(&mut (&*f), buf) + } + FwCfgContent::Bytes(b) => match b.get(self.offset as usize..) { + Some(mut s) => s.read(buf), + None => Err(ErrorKind::UnexpectedEof)?, + }, + FwCfgContent::Slice(b) => match b.get(self.offset as usize..) { + Some(mut s) => s.read(buf), + None => Err(ErrorKind::UnexpectedEof)?, + }, + FwCfgContent::U32(n) => match n.to_le_bytes().get(self.offset as usize..) { + Some(mut s) => s.read(buf), + None => Err(ErrorKind::UnexpectedEof)?, + }, + } + } +} + +impl Default for FwCfgContent { + fn default() -> Self { + FwCfgContent::Slice(&[]) + } +} + +impl FwCfgContent { + fn size(&self) -> Result { + let ret = match self { + FwCfgContent::Bytes(v) => v.len(), + FwCfgContent::File(offset, f) => (f.metadata()?.len() - offset) as usize, + FwCfgContent::Slice(s) => s.len(), + FwCfgContent::U32(n) => size_of_val(n), + }; + u32::try_from(ret).map_err(|_| std::io::ErrorKind::InvalidInput.into()) + } + fn access(&self, offset: u32) -> FwCfgContentAccess<'_> { + FwCfgContentAccess { + content: self, + offset, + } + } +} + +#[derive(Debug, Default)] +pub struct FwCfgItem { + pub name: String, + pub content: FwCfgContent, +} + +/// https://www.qemu.org/docs/master/specs/fw_cfg.html +#[derive(Debug)] +pub struct FwCfg { + selector: u16, + data_offset: u32, + dma_address: u64, + items: Vec, // 0x20 and above + known_items: [FwCfgContent; FW_CFG_KNOWN_ITEMS], // 0x0 to 0x19 + memory: GuestMemoryAtomic>, +} + +#[repr(C)] +#[derive(Debug, IntoBytes, FromBytes)] +struct FwCfgDmaAccess { + control_be: u32, + length_be: u32, + address_be: u64, +} + +// https://github.com/torvalds/linux/blob/master/include/uapi/linux/qemu_fw_cfg.h#L67 +#[bitfield(u32)] +struct AccessControl { + // FW_CFG_DMA_CTL_ERROR = 0x01 + error: bool, + // FW_CFG_DMA_CTL_READ = 0x02 + read: bool, + #[bits(1)] + _unused2: u8, + // FW_CFG_DMA_CTL_SKIP = 0x04 + skip: bool, + #[bits(3)] + _unused3: u8, + // FW_CFG_DMA_CTL_ERROR = 0x08 + select: bool, + #[bits(7)] + _unused4: u8, + // FW_CFG_DMA_CTL_WRITE = 0x10 + write: bool, + #[bits(16)] + _unused: u32, +} + +#[repr(C)] +#[derive(Debug, IntoBytes, FromBytes)] +struct FwCfgFilesHeader { + count_be: u32, +} + +pub const FILE_NAME_SIZE: usize = 56; + +pub fn create_file_name(name: &str) -> [u8; FILE_NAME_SIZE] { + let mut c_name = [0u8; FILE_NAME_SIZE]; + let c_len = std::cmp::min(FILE_NAME_SIZE - 1, name.len()); + c_name[0..c_len].copy_from_slice(&name.as_bytes()[0..c_len]); + c_name +} + +#[allow(dead_code)] +#[repr(C, packed)] +#[derive(Debug, IntoBytes, 
FromBytes, Clone, Copy)] +struct BootE820Entry { + addr: u64, + size: u64, + type_: u32, +} + +#[repr(C)] +#[derive(Debug, IntoBytes, FromBytes)] +struct FwCfgFile { + size_be: u32, + select_be: u16, + _reserved: u16, + name: [u8; FILE_NAME_SIZE], +} + +#[repr(C, align(4))] +#[derive(Debug, IntoBytes, Immutable)] +struct Allocate { + command: u32, + file: [u8; FILE_NAME_SIZE], + align: u32, + zone: u8, + _pad: [u8; 63], +} + +#[repr(C, align(4))] +#[derive(Debug, IntoBytes, Immutable)] +struct AddPointer { + command: u32, + dst: [u8; FILE_NAME_SIZE], + src: [u8; FILE_NAME_SIZE], + offset: u32, + size: u8, + _pad: [u8; 7], +} + +#[repr(C, align(4))] +#[derive(Debug, IntoBytes, Immutable)] +struct AddChecksum { + command: u32, + file: [u8; FILE_NAME_SIZE], + offset: u32, + start: u32, + len: u32, + _pad: [u8; 56], +} + +fn create_intra_pointer(name: &str, offset: usize, size: u8) -> AddPointer { + AddPointer { + command: COMMAND_ADD_POINTER, + dst: create_file_name(name), + src: create_file_name(name), + offset: offset as u32, + size, + _pad: [0; 7], + } +} + +fn create_acpi_table_checksum(offset: usize, len: usize) -> AddChecksum { + AddChecksum { + command: COMMAND_ADD_CHECKSUM, + file: create_file_name(FW_CFG_FILENAME_ACPI_TABLES), + offset: (offset + offset_of!(AcpiTableHeader, checksum)) as u32, + start: offset as u32, + len: len as u32, + _pad: [0; 56], + } +} + +#[repr(C, align(4))] +#[derive(Debug, Clone, Default, FromBytes, IntoBytes)] +struct AcpiTableHeader { + signature: [u8; 4], + length: u32, + revision: u8, + checksum: u8, + oem_id: [u8; 6], + oem_table_id: [u8; 8], + oem_revision: u32, + asl_compiler_id: [u8; 4], + asl_compiler_revision: u32, +} + +struct AcpiTable { + rsdp: Rsdp, + tables: Vec, + table_pointers: Vec, + table_checksums: Vec<(usize, usize)>, +} + +impl AcpiTable { + fn pointers(&self) -> &[usize] { + &self.table_pointers + } + + fn checksums(&self) -> &[(usize, usize)] { + &self.table_checksums + } + + fn take(self) -> (Rsdp, Vec) { + (self.rsdp, self.tables) + } +} + +// Creates fw_cfg items used by firmware to load and verify Acpi tables +// https://github.com/qemu/qemu/blob/master/hw/acpi/bios-linker-loader.c +fn create_acpi_loader(acpi_table: AcpiTable) -> [FwCfgItem; 3] { + let mut table_loader_bytes: Vec = Vec::new(); + let allocate_rsdp = Allocate { + command: COMMAND_ALLOCATE, + file: create_file_name(FW_CFG_FILENAME_RSDP), + align: 4, + zone: ALLOC_ZONE_FSEG, + _pad: [0; 63], + }; + table_loader_bytes.extend(allocate_rsdp.as_bytes()); + + let allocate_tables = Allocate { + command: COMMAND_ALLOCATE, + file: create_file_name(FW_CFG_FILENAME_ACPI_TABLES), + align: 4, + zone: ALLOC_ZONE_HIGH, + _pad: [0; 63], + }; + table_loader_bytes.extend(allocate_tables.as_bytes()); + + for pointer_offset in acpi_table.pointers().iter() { + let pointer = create_intra_pointer(FW_CFG_FILENAME_ACPI_TABLES, *pointer_offset, 8); + table_loader_bytes.extend(pointer.as_bytes()); + } + for (offset, len) in acpi_table.checksums().iter() { + let checksum = create_acpi_table_checksum(*offset, *len); + table_loader_bytes.extend(checksum.as_bytes()); + } + let pointer_rsdp_to_xsdt = AddPointer { + command: COMMAND_ADD_POINTER, + dst: create_file_name(FW_CFG_FILENAME_RSDP), + src: create_file_name(FW_CFG_FILENAME_ACPI_TABLES), + offset: offset_of!(Rsdp, xsdt_addr) as u32, + size: 8, + _pad: [0; 7], + }; + table_loader_bytes.extend(pointer_rsdp_to_xsdt.as_bytes()); + let checksum_rsdp = AddChecksum { + command: COMMAND_ADD_CHECKSUM, + file: create_file_name(FW_CFG_FILENAME_RSDP), + 
offset: offset_of!(Rsdp, checksum) as u32, + start: 0, + len: offset_of!(Rsdp, length) as u32, + _pad: [0; 56], + }; + let checksum_rsdp_ext = AddChecksum { + command: COMMAND_ADD_CHECKSUM, + file: create_file_name(FW_CFG_FILENAME_RSDP), + offset: offset_of!(Rsdp, extended_checksum) as u32, + start: 0, + len: size_of::() as u32, + _pad: [0; 56], + }; + table_loader_bytes.extend(checksum_rsdp.as_bytes()); + table_loader_bytes.extend(checksum_rsdp_ext.as_bytes()); + + let table_loader = FwCfgItem { + name: FW_CFG_FILENAME_TABLE_LOADER.to_owned(), + content: FwCfgContent::Bytes(table_loader_bytes), + }; + let (rsdp, tables) = acpi_table.take(); + let acpi_rsdp = FwCfgItem { + name: FW_CFG_FILENAME_RSDP.to_owned(), + content: FwCfgContent::Bytes(rsdp.as_bytes().to_owned()), + }; + let apci_tables = FwCfgItem { + name: FW_CFG_FILENAME_ACPI_TABLES.to_owned(), + content: FwCfgContent::Bytes(tables), + }; + [table_loader, acpi_rsdp, apci_tables] +} + +impl FwCfg { + pub fn new(memory: GuestMemoryAtomic>) -> FwCfg { + const DEFAULT_ITEM: FwCfgContent = FwCfgContent::Slice(&[]); + let mut known_items = [DEFAULT_ITEM; FW_CFG_KNOWN_ITEMS]; + known_items[FW_CFG_SIGNATURE as usize] = FwCfgContent::Slice(&FW_CFG_DMA_SIGNATURE); + known_items[FW_CFG_ID as usize] = FwCfgContent::Slice(&FW_CFG_FEATURE); + let file_buf = Vec::from(FwCfgFilesHeader { count_be: 0 }.as_mut_bytes()); + known_items[FW_CFG_FILE_DIR as usize] = FwCfgContent::Bytes(file_buf); + + FwCfg { + selector: 0, + data_offset: 0, + dma_address: 0, + items: vec![], + known_items, + memory, + } + } + + pub fn populate_fw_cfg( + &mut self, + mem_size: Option, + kernel: Option, + initramfs: Option, + cmdline: Option, + fw_cfg_item_list: Option>, + ) -> Result<()> { + if let Some(mem_size) = mem_size { + self.add_e820(mem_size)? + } + if let Some(kernel) = kernel { + self.add_kernel_data(&kernel)?; + } + if let Some(cmdline) = cmdline { + self.add_kernel_cmdline(cmdline); + } + if let Some(initramfs) = initramfs { + self.add_initramfs_data(&initramfs)? 
+ } + if let Some(fw_cfg_item_list) = fw_cfg_item_list { + for item in fw_cfg_item_list { + self.add_item(item)?; + } + } + Ok(()) + } + + pub fn add_e820(&mut self, mem_size: usize) -> Result<()> { + #[cfg(target_arch = "x86_64")] + let mut mem_regions = vec![ + (GuestAddress(0), EBDA_START.0 as usize, RegionType::Ram), + ( + MEM_32BIT_DEVICES_START, + MEM_32BIT_DEVICES_SIZE as usize, + RegionType::Reserved, + ), + ( + PCI_MMCONFIG_START, + PCI_MMCONFIG_SIZE as usize, + RegionType::Reserved, + ), + (STAGE0_START_ADDRESS, STAGE0_SIZE, RegionType::Reserved), + ]; + #[cfg(target_arch = "aarch64")] + let mut mem_regions = arch::aarch64::arch_memory_regions(); + if mem_size < MEM_32BIT_DEVICES_START.0 as usize { + mem_regions.push(( + HIGH_RAM_START, + mem_size - HIGH_RAM_START.0 as usize, + RegionType::Ram, + )); + } else { + mem_regions.push(( + HIGH_RAM_START, + MEM_32BIT_RESERVED_START.0 as usize - HIGH_RAM_START.0 as usize, + RegionType::Ram, + )); + mem_regions.push(( + RAM_64BIT_START, + mem_size - (MEM_32BIT_DEVICES_START.0 as usize), + RegionType::Ram, + )); + } + let mut bytes = vec![]; + for (addr, size, region) in mem_regions.iter() { + let type_ = match region { + RegionType::Ram => E820_RAM, + RegionType::Reserved => E820_RESERVED, + RegionType::SubRegion => continue, + }; + let mut entry = BootE820Entry { + addr: addr.0, + size: *size as u64, + type_, + }; + bytes.extend_from_slice(entry.as_mut_bytes()); + } + let item = FwCfgItem { + name: "etc/e820".to_owned(), + content: FwCfgContent::Bytes(bytes), + }; + self.add_item(item) + } + + fn file_dir_mut(&mut self) -> &mut Vec { + let FwCfgContent::Bytes(file_buf) = &mut self.known_items[FW_CFG_FILE_DIR as usize] else { + unreachable!("fw_cfg: selector {FW_CFG_FILE_DIR:#x} should be FwCfgContent::Byte!") + }; + file_buf + } + + fn update_count(&mut self) { + let mut header = FwCfgFilesHeader { + count_be: (self.items.len() as u32).to_be(), + }; + self.file_dir_mut()[0..4].copy_from_slice(header.as_mut_bytes()); + } + + pub fn add_item(&mut self, item: FwCfgItem) -> Result<()> { + let index = self.items.len(); + let c_name = create_file_name(&item.name); + let size = item.content.size()?; + let mut cfg_file = FwCfgFile { + size_be: size.to_be(), + select_be: (FW_CFG_FILE_FIRST + index as u16).to_be(), + _reserved: 0, + name: c_name, + }; + self.file_dir_mut() + .extend_from_slice(cfg_file.as_mut_bytes()); + self.items.push(item); + self.update_count(); + Ok(()) + } + + fn dma_read_content( + &self, + content: &FwCfgContent, + offset: u32, + len: u32, + address: u64, + ) -> Result { + let content_size = content.size()?.saturating_sub(offset); + let op_size = std::cmp::min(content_size, len); + let mut access = content.access(offset); + let mut buf = vec![0u8; op_size as usize]; + access.read_exact(buf.as_mut_bytes())?; + let r = self + .memory + .memory() + .write(buf.as_bytes(), GuestAddress(address)); + match r { + Err(e) => { + error!("fw_cfg: dma read error: {e:x?}"); + Err(ErrorKind::InvalidInput.into()) + } + Ok(size) => Ok(size as u32), + } + } + + fn dma_read(&mut self, selector: u16, len: u32, address: u64) -> Result<()> { + let op_size = if let Some(content) = self.known_items.get(selector as usize) { + self.dma_read_content(content, self.data_offset, len, address) + } else if let Some(item) = self.items.get((selector - FW_CFG_FILE_FIRST) as usize) { + self.dma_read_content(&item.content, self.data_offset, len, address) + } else { + error!("fw_cfg: selector {selector:#x} does not exist."); + Err(ErrorKind::NotFound.into()) 
+ }?; + self.data_offset += op_size; + Ok(()) + } + + fn do_dma(&mut self) { + let dma_address = self.dma_address; + let mut access = FwCfgDmaAccess::new_zeroed(); + let dma_access = match self + .memory + .memory() + .read(access.as_mut_bytes(), GuestAddress(dma_address)) + { + Ok(_) => access, + Err(e) => { + error!("fw_cfg: invalid address of dma access {dma_address:#x}: {e:?}"); + return; + } + }; + let control = AccessControl(u32::from_be(dma_access.control_be)); + if control.select() { + self.selector = control.select() as u16; + } + let len = u32::from_be(dma_access.length_be); + let addr = u64::from_be(dma_access.address_be); + let ret = if control.read() { + self.dma_read(self.selector, len, addr) + } else if control.write() { + Err(ErrorKind::InvalidInput.into()) + } else if control.skip() { + self.data_offset += len; + Ok(()) + } else { + Err(ErrorKind::InvalidData.into()) + }; + let mut access_resp = AccessControl(0); + if let Err(e) = ret { + error!("fw_cfg: dma operation {dma_access:x?}: {e:x?}"); + access_resp.set_error(true); + } + if let Err(e) = self.memory.memory().write( + &access_resp.0.to_be_bytes(), + GuestAddress(dma_address + core::mem::offset_of!(FwCfgDmaAccess, control_be) as u64), + ) { + error!("fw_cfg: finishing dma: {e:?}") + } + } + + pub fn add_kernel_data(&mut self, file: &File) -> Result<()> { + let mut buffer = vec![0u8; size_of::()]; + file.read_exact_at(&mut buffer, 0)?; + let bp = boot_params::from_mut_slice(&mut buffer).unwrap(); + #[cfg(target_arch = "x86_64")] + { + // must set to 4 for backwards compatibility + // https://docs.kernel.org/arch/x86/boot.html#the-real-mode-kernel-header + if bp.hdr.setup_sects == 0 { + bp.hdr.setup_sects = 4; + } + // wildcard boot loader type + bp.hdr.type_of_loader = 0xff; + } + #[cfg(target_arch = "aarch64")] + let kernel_start = bp.text_offset; + #[cfg(target_arch = "x86_64")] + let kernel_start = (bp.hdr.setup_sects as usize + 1) * 512; + self.known_items[FW_CFG_SETUP_SIZE as usize] = FwCfgContent::U32(buffer.len() as u32); + self.known_items[FW_CFG_SETUP_DATA as usize] = FwCfgContent::Bytes(buffer); + self.known_items[FW_CFG_KERNEL_SIZE as usize] = + FwCfgContent::U32(file.metadata()?.len() as u32 - kernel_start as u32); + self.known_items[FW_CFG_KERNEL_DATA as usize] = + FwCfgContent::File(kernel_start as u64, file.try_clone()?); + Ok(()) + } + + pub fn add_kernel_cmdline(&mut self, s: std::ffi::CString) { + let bytes = s.into_bytes_with_nul(); + self.known_items[FW_CFG_CMDLINE_SIZE as usize] = FwCfgContent::U32(bytes.len() as u32); + self.known_items[FW_CFG_CMDLINE_DATA as usize] = FwCfgContent::Bytes(bytes); + } + + pub fn add_acpi( + &mut self, + rsdp: Rsdp, + tables: Vec, + table_checksums: Vec<(usize, usize)>, + table_pointers: Vec, + ) -> Result<()> { + let acpi_table = AcpiTable { + rsdp, + tables, + table_checksums, + table_pointers, + }; + let [table_loader, acpi_rsdp, apci_tables] = create_acpi_loader(acpi_table); + self.add_item(table_loader)?; + self.add_item(acpi_rsdp)?; + self.add_item(apci_tables) + } + + pub fn add_initramfs_data(&mut self, file: &File) -> Result<()> { + let initramfs_size = file.metadata()?.len(); + self.known_items[FW_CFG_INITRD_SIZE as usize] = FwCfgContent::U32(initramfs_size as _); + self.known_items[FW_CFG_INITRD_DATA as usize] = FwCfgContent::File(0, file.try_clone()?); + Ok(()) + } + + fn read_content(content: &FwCfgContent, offset: u32, data: &mut [u8], size: u32) -> Option { + let start = offset as usize; + let end = start + size as usize; + match content { + 
FwCfgContent::Bytes(b) => { + if b.len() >= size as usize { + data.copy_from_slice(&b[start..end]); + } + } + FwCfgContent::Slice(s) => { + if s.len() >= size as usize { + data.copy_from_slice(&s[start..end]); + } + } + FwCfgContent::File(o, f) => { + f.read_exact_at(data, o + offset as u64).ok()?; + } + FwCfgContent::U32(n) => { + let bytes = n.to_le_bytes(); + data.copy_from_slice(&bytes[start..end]); + } + }; + Some(size as u8) + } + + fn read_data(&mut self, data: &mut [u8], size: u32) -> u8 { + let ret = if let Some(content) = self.known_items.get(self.selector as usize) { + Self::read_content(content, self.data_offset, data, size) + } else if let Some(item) = self.items.get((self.selector - FW_CFG_FILE_FIRST) as usize) { + Self::read_content(&item.content, self.data_offset, data, size) + } else { + error!("fw_cfg: selector {:#x} does not exist.", self.selector); + None + }; + if let Some(val) = ret { + self.data_offset += size; + val + } else { + 0 + } + } +} + +impl BusDevice for FwCfg { + fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { + let port = offset + PORT_FW_CFG_BASE; + let size = data.len(); + match (port, size) { + (PORT_FW_CFG_SELECTOR, _) => { + error!("fw_cfg: selector register is write-only."); + } + (PORT_FW_CFG_DATA, _) => _ = self.read_data(data, size as u32), + (PORT_FW_CFG_DMA_HI, 4) => { + let addr = self.dma_address; + let addr_hi = (addr >> 32) as u32; + data.copy_from_slice(&addr_hi.to_be_bytes()); + } + (PORT_FW_CFG_DMA_LO, 4) => { + let addr = self.dma_address; + let addr_lo = (addr & 0xffff_ffff) as u32; + data.copy_from_slice(&addr_lo.to_be_bytes()); + } + _ => { + debug!( + "fw_cfg: read from unknown port {port:#x}: {size:#x} bytes and offset {offset:#x}." + ); + } + }; + } + + fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { + let port = offset + PORT_FW_CFG_BASE; + let size = data.size(); + match (port, size) { + (PORT_FW_CFG_SELECTOR, 2) => { + let mut buf = [0u8; 2]; + buf[..size].copy_from_slice(&data[..size]); + #[cfg(target_arch = "x86_64")] + let val = u16::from_le_bytes(buf); + #[cfg(target_arch = "aarch64")] + let val = u16::from_be_bytes(buf); + self.selector = val; + self.data_offset = 0; + } + (PORT_FW_CFG_DATA, 1) => error!("fw_cfg: data register is read-only."), + (PORT_FW_CFG_DMA_HI, 4) => { + let mut buf = [0u8; 4]; + buf[..size].copy_from_slice(&data[..size]); + let val = u32::from_be_bytes(buf); + self.dma_address &= 0xffff_ffff; + self.dma_address |= (val as u64) << 32; + } + (PORT_FW_CFG_DMA_LO, 4) => { + let mut buf = [0u8; 4]; + buf[..size].copy_from_slice(&data[..size]); + let val = u32::from_be_bytes(buf); + self.dma_address &= !0xffff_ffff; + self.dma_address |= val as u64; + self.do_dma(); + } + _ => debug!( + "fw_cfg: write to unknown port {port:#x}: {size:#x} bytes and offset {offset:#x} ." 
+ ), + }; + None + } +} + +#[cfg(test)] +mod tests { + use std::ffi::CString; + use std::io::Write; + + use vmm_sys_util::tempfile::TempFile; + + use super::*; + + #[cfg(target_arch = "x86_64")] + const SELECTOR_OFFSET: u64 = 0; + #[cfg(target_arch = "aarch64")] + const SELECTOR_OFFSET: u64 = 8; + #[cfg(target_arch = "x86_64")] + const DATA_OFFSET: u64 = 1; + #[cfg(target_arch = "aarch64")] + const DATA_OFFSET: u64 = 0; + #[cfg(target_arch = "x86_64")] + const DMA_OFFSET: u64 = 4; + #[cfg(target_arch = "aarch64")] + const DMA_OFFSET: u64 = 16; + + #[test] + fn test_signature() { + let gm = GuestMemoryAtomic::new( + GuestMemoryMmap::from_ranges(&[(GuestAddress(0), RAM_64BIT_START.0 as usize)]).unwrap(), + ); + + let mut fw_cfg = FwCfg::new(gm); + + let mut data = vec![0u8]; + + let mut sig_iter = FW_CFG_DMA_SIGNATURE.into_iter(); + fw_cfg.write(0, SELECTOR_OFFSET, &[FW_CFG_SIGNATURE as u8, 0]); + loop { + if let Some(char) = sig_iter.next() { + fw_cfg.read(0, DATA_OFFSET, &mut data); + assert_eq!(data[0], char); + } else { + return; + } + } + } + #[test] + fn test_kernel_cmdline() { + let gm = GuestMemoryAtomic::new( + GuestMemoryMmap::from_ranges(&[(GuestAddress(0), RAM_64BIT_START.0 as usize)]).unwrap(), + ); + + let mut fw_cfg = FwCfg::new(gm); + + let cmdline = *b"cmdline\0"; + + fw_cfg.add_kernel_cmdline(CString::from_vec_with_nul(cmdline.to_vec()).unwrap()); + + let mut data = vec![0u8]; + + let mut cmdline_iter = cmdline.into_iter(); + fw_cfg.write(0, SELECTOR_OFFSET, &[FW_CFG_CMDLINE_DATA as u8, 0]); + loop { + if let Some(char) = cmdline_iter.next() { + fw_cfg.read(0, DATA_OFFSET, &mut data); + assert_eq!(data[0], char); + } else { + return; + } + } + } + + #[test] + fn test_initram_fs() { + let gm = GuestMemoryAtomic::new( + GuestMemoryMmap::from_ranges(&[(GuestAddress(0), RAM_64BIT_START.0 as usize)]).unwrap(), + ); + + let mut fw_cfg = FwCfg::new(gm); + + let temp = TempFile::new().unwrap(); + let mut temp_file = temp.as_file(); + + let initram_content = b"this is the initramfs"; + let written = temp_file.write(initram_content); + assert_eq!(written.unwrap(), 21); + let _ = fw_cfg.add_initramfs_data(temp_file); + + let mut data = vec![0u8]; + + let mut initram_iter = (*initram_content).into_iter(); + fw_cfg.write(0, SELECTOR_OFFSET, &[FW_CFG_INITRD_DATA as u8, 0]); + loop { + if let Some(char) = initram_iter.next() { + fw_cfg.read(0, DATA_OFFSET, &mut data); + assert_eq!(data[0], char); + } else { + return; + } + } + } + + #[test] + fn test_dma() { + let code = [ + 0xba, 0xf8, 0x03, 0x00, 0xd8, 0x04, b'0', 0xee, 0xb0, b'\n', 0xee, 0xf4, + ]; + + let content = FwCfgContent::Bytes(code.to_vec()); + + let mem_size = 0x1000; + let load_addr = GuestAddress(0x1000); + let mem: GuestMemoryMmap = + GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); + + // Note: In firmware we would just allocate FwCfgDmaAccess struct + // and use address of struct (&) as dma address + let mut access_control = AccessControl(0); + // bit 1 = read access + access_control.set_read(true); + // length of data to access + let length_be = (code.len() as u32).to_be(); + // guest address for data + let code_address = 0x1900_u64; + let address_be = code_address.to_be(); + let mut access = FwCfgDmaAccess { + control_be: access_control.0.to_be(), // bit(1) = read bit + length_be, + address_be, + }; + // access address is where to put the code + let access_address = GuestAddress(load_addr.0); + let address_bytes = access_address.0.to_be_bytes(); + let dma_lo: [u8; 4] = 
address_bytes[0..4].try_into().unwrap(); + let dma_hi: [u8; 4] = address_bytes[4..8].try_into().unwrap(); + + // writing the FwCfgDmaAccess to mem (this would just be self.dma_access.as_ref() in guest) + let _ = mem.write(access.as_mut_bytes(), access_address); + let mem_m = GuestMemoryAtomic::new(mem.clone()); + let mut fw_cfg = FwCfg::new(mem_m); + let cfg_item = FwCfgItem { + name: "code".to_string(), + content, + }; + let _ = fw_cfg.add_item(cfg_item); + + let mut data = [0u8; 12]; + + let _ = mem.read(&mut data, GuestAddress(code_address)); + assert_ne!(data, code); + + fw_cfg.write(0, SELECTOR_OFFSET, &[FW_CFG_FILE_FIRST as u8, 0]); + fw_cfg.write(0, DMA_OFFSET, &dma_lo); + fw_cfg.write(0, DMA_OFFSET + 4, &dma_hi); + let _ = mem.read(&mut data, GuestAddress(code_address)); + assert_eq!(data, code); + } +} diff --git a/devices/src/legacy/gpio_pl061.rs b/devices/src/legacy/gpio_pl061.rs index c7c66341a7..3a61238dd2 100644 --- a/devices/src/legacy/gpio_pl061.rs +++ b/devices/src/legacy/gpio_pl061.rs @@ -12,8 +12,8 @@ use std::{io, result}; use serde::{Deserialize, Serialize}; use thiserror::Error; -use vm_device::interrupt::InterruptSourceGroup; use vm_device::BusDevice; +use vm_device::interrupt::InterruptSourceGroup; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use crate::{read_le_u32, write_le_u32}; @@ -28,10 +28,10 @@ const GPIORIE: u64 = 0x414; // Raw Interrupt Status Register const GPIOMIS: u64 = 0x418; // Masked Interrupt Status Register const GPIOIC: u64 = 0x41c; // Interrupt Clear Register const GPIOAFSEL: u64 = 0x420; // Mode Control Select Register - // From 0x424 to 0xFDC => reserved space. - // From 0xFE0 to 0xFFC => Peripheral and PrimeCell Identification Registers which are Read Only registers. - // These registers can conceptually be treated as a 32-bit register, and PartNumber[11:0] is used to identify the peripheral. - // We are putting the expected values (look at 'Reset value' column from above mentioned document) in an array. +// From 0x424 to 0xFDC => reserved space. +// From 0xFE0 to 0xFFC => Peripheral and PrimeCell Identification Registers which are Read Only registers. +// These registers can conceptually be treated as a 32-bit register, and PartNumber[11:0] is used to identify the peripheral. +// We are putting the expected values (look at 'Reset value' column from above mentioned document) in an array. const GPIO_ID: [u8; 8] = [0x61, 0x10, 0x14, 0x00, 0x0d, 0xf0, 0x05, 0xb1]; // ID Margins const GPIO_ID_LOW: u64 = 0xfe0; diff --git a/devices/src/legacy/mod.rs b/devices/src/legacy/mod.rs index 3f58e5c842..1087d3d27d 100644 --- a/devices/src/legacy/mod.rs +++ b/devices/src/legacy/mod.rs @@ -8,6 +8,8 @@ mod cmos; #[cfg(target_arch = "x86_64")] mod debug_port; +#[cfg(feature = "fw_cfg")] +pub mod fw_cfg; #[cfg(target_arch = "x86_64")] mod fwdebug; #[cfg(target_arch = "aarch64")] @@ -22,6 +24,8 @@ mod uart_pl011; pub use self::cmos::Cmos; #[cfg(target_arch = "x86_64")] pub use self::debug_port::DebugPort; +#[cfg(feature = "fw_cfg")] +pub use self::fw_cfg::FwCfg; #[cfg(target_arch = "x86_64")] pub use self::fwdebug::FwDebugDevice; #[cfg(target_arch = "aarch64")] diff --git a/devices/src/legacy/rtc_pl031.rs b/devices/src/legacy/rtc_pl031.rs index 39c7911eed..9ff0c8c1b1 100644 --- a/devices/src/legacy/rtc_pl031.rs +++ b/devices/src/legacy/rtc_pl031.rs @@ -4,16 +4,18 @@ //! ARM PL031 Real Time Clock //! -//! This module implements a PL031 Real Time Clock (RTC) that provides to provides long time base counter. -//! 
This is achieved by generating an interrupt signal after counting for a programmed number of cycles of -//! a real-time clock input. +//! This module implements part of a PL031 Real Time Clock (RTC): +//! * provide a clock value via RTCDR +//! * no alarm is implemented through the match register +//! * no interrupt is generated +//! * RTC cannot be disabled via RTCCR +//! * no test registers //! +use std::result; use std::sync::{Arc, Barrier}; use std::time::Instant; -use std::{io, result}; use thiserror::Error; -use vm_device::interrupt::InterruptSourceGroup; use vm_device::BusDevice; use crate::{read_le_u32, write_le_u32}; @@ -29,11 +31,11 @@ const RTCIMSC: u64 = 0x10; // Interrupt Mask Set or Clear Register. const RTCRIS: u64 = 0x14; // Raw Interrupt Status. const RTCMIS: u64 = 0x18; // Masked Interrupt Status. const RTCICR: u64 = 0x1c; // Interrupt Clear Register. - // From 0x020 to 0xFDC => reserved space. - // From 0xFE0 to 0x1000 => Peripheral and PrimeCell Identification Registers which are Read Only registers. - // AMBA standard devices have CIDs (Cell IDs) and PIDs (Peripheral IDs). The linux kernel will look for these in order to assert the identity - // of these devices (i.e look at the `amba_device_try_add` function). - // We are putting the expected values (look at 'Reset value' column from above mentioned document) in an array. +// From 0x020 to 0xFDC => reserved space. +// From 0xFE0 to 0x1000 => Peripheral and PrimeCell Identification Registers which are Read Only registers. +// AMBA standard devices have CIDs (Cell IDs) and PIDs (Peripheral IDs). The linux kernel will look for these in order to assert the identity +// of these devices (i.e look at the `amba_device_try_add` function). +// We are putting the expected values (look at 'Reset value' column from above mentioned document) in an array. const PL031_ID: [u8; 8] = [0x31, 0x10, 0x14, 0x00, 0x0d, 0xf0, 0x05, 0xb1]; // We are only interested in the margins. const AMBA_ID_LOW: u64 = 0xFE0; @@ -45,8 +47,6 @@ pub const NANOS_PER_SECOND: u64 = 1_000_000_000; pub enum Error { #[error("Bad Write Offset: {0}")] BadWriteOffset(u64), - #[error("Failed to trigger interrupt")] - InterruptFailure(#[source] io::Error), } type Result = result::Result; @@ -107,31 +107,20 @@ pub struct Rtc { match_value: u32, // Writes to this register load an update value into the RTC. load: u32, - imsc: u32, - ris: u32, - interrupt: Arc, } impl Rtc { /// Constructs an AMBA PL031 RTC device. - pub fn new(interrupt: Arc) -> Self { + pub fn new() -> Self { Self { // This is used only for duration measuring purposes. previous_now: Instant::now(), tick_offset: get_time(ClockType::Real) as i64, match_value: 0, load: 0, - imsc: 0, - ris: 0, - interrupt, } } - fn trigger_interrupt(&mut self) -> Result<()> { - self.interrupt.trigger(0).map_err(Error::InterruptFailure)?; - Ok(()) - } - fn get_time(&self) -> u32 { let ts = (self.tick_offset as i128) + (Instant::now().duration_since(self.previous_now).as_nanos() as i128); @@ -155,16 +144,8 @@ impl Rtc { // we want to terminate the execution of the process. self.tick_offset = seconds_to_nanoseconds(i64::from(val)).unwrap(); } - RTCIMSC => { - self.imsc = val & 1; - self.trigger_interrupt()?; - } - RTCICR => { - // As per above mentioned doc, the interrupt is cleared by writing any data value to - // the Interrupt Clear Register. - self.ris = 0; - self.trigger_interrupt()?; - } + RTCIMSC => (), + RTCICR => (), RTCCR => (), // ignore attempts to turn off the timer. 
o => { return Err(Error::BadWriteOffset(o)); @@ -174,6 +155,12 @@ impl Rtc { } } +impl Default for Rtc { + fn default() -> Self { + Self::new() + } +} + impl BusDevice for Rtc { fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { let mut read_ok = true; @@ -189,10 +176,10 @@ impl BusDevice for Rtc { self.match_value } RTCLR => self.load, - RTCCR => 1, // RTC is always enabled. - RTCIMSC => self.imsc, - RTCRIS => self.ris, - RTCMIS => self.ris & self.imsc, + RTCCR => 1, // RTC is always enabled. + RTCIMSC => 0, // Interrupt is always disabled. + RTCRIS => 0, + RTCMIS => 0, _ => { read_ok = false; 0 @@ -230,9 +217,6 @@ impl BusDevice for Rtc { #[cfg(test)] mod tests { - use vm_device::interrupt::{InterruptIndex, InterruptSourceConfig}; - use vmm_sys_util::eventfd::EventFd; - use super::*; use crate::{ read_be_u16, read_be_u32, read_le_i32, read_le_u16, read_le_u64, write_be_u16, @@ -366,45 +350,9 @@ mod tests { assert!(seconds_to_nanoseconds(9_223_372_037).is_none()); } - struct TestInterrupt { - event_fd: EventFd, - } - - impl InterruptSourceGroup for TestInterrupt { - fn trigger(&self, _index: InterruptIndex) -> result::Result<(), std::io::Error> { - self.event_fd.write(1) - } - - fn update( - &self, - _index: InterruptIndex, - _config: InterruptSourceConfig, - _masked: bool, - _set_gsi: bool, - ) -> result::Result<(), std::io::Error> { - Ok(()) - } - - fn set_gsi(&self) -> result::Result<(), std::io::Error> { - Ok(()) - } - - fn notifier(&self, _index: InterruptIndex) -> Option { - Some(self.event_fd.try_clone().unwrap()) - } - } - - impl TestInterrupt { - fn new(event_fd: EventFd) -> Self { - TestInterrupt { event_fd } - } - } - #[test] fn test_rtc_read_write_and_event() { - let intr_evt = EventFd::new(libc::EFD_NONBLOCK).unwrap(); - - let mut rtc = Rtc::new(Arc::new(TestInterrupt::new(intr_evt.try_clone().unwrap()))); + let mut rtc = Rtc::new(); let mut data = [0; 4]; // Read and write to the MR register. @@ -427,15 +375,13 @@ mod tests { assert_eq!((v / NANOS_PER_SECOND) as u32, v_read); // Read and write to IMSC register. - // Test with non zero value. + // Test with non zero value. Our device ignores the write. let non_zero = 1; write_le_u32(&mut data, non_zero); rtc.write(LEGACY_RTC_MAPPED_IO_START, RTCIMSC, &data); - // The interrupt line should be on. - assert!(rtc.interrupt.notifier(0).unwrap().read().unwrap() == 1); rtc.read(LEGACY_RTC_MAPPED_IO_START, RTCIMSC, &mut data); let v = read_le_u32(&data); - assert_eq!(non_zero & 1, v); + assert_eq!(0, v); // Now test with 0. write_le_u32(&mut data, 0); @@ -447,8 +393,6 @@ mod tests { // Read and write to the ICR register. write_le_u32(&mut data, 1); rtc.write(LEGACY_RTC_MAPPED_IO_START, RTCICR, &data); - // The interrupt line should be on. 
- assert!(rtc.interrupt.notifier(0).unwrap().read().unwrap() > 1); let v_before = read_le_u32(&data); rtc.read(LEGACY_RTC_MAPPED_IO_START, RTCICR, &mut data); diff --git a/devices/src/legacy/serial.rs b/devices/src/legacy/serial.rs index 973c96b0c5..cbfb2c10b9 100644 --- a/devices/src/legacy/serial.rs +++ b/devices/src/legacy/serial.rs @@ -10,8 +10,8 @@ use std::sync::{Arc, Barrier}; use std::{io, result}; use serde::{Deserialize, Serialize}; -use vm_device::interrupt::InterruptSourceGroup; use vm_device::BusDevice; +use vm_device::interrupt::InterruptSourceGroup; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vmm_sys_util::errno::Result; diff --git a/devices/src/legacy/uart_pl011.rs b/devices/src/legacy/uart_pl011.rs index b5603808bf..364dd59278 100644 --- a/devices/src/legacy/uart_pl011.rs +++ b/devices/src/legacy/uart_pl011.rs @@ -13,8 +13,8 @@ use std::{io, result}; use serde::{Deserialize, Serialize}; use thiserror::Error; -use vm_device::interrupt::InterruptSourceGroup; use vm_device::BusDevice; +use vm_device::interrupt::InterruptSourceGroup; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use crate::{read_le_u32, write_le_u32}; diff --git a/devices/src/lib.rs b/devices/src/lib.rs index 6ea4bc70bb..0c4bfb8ca4 100644 --- a/devices/src/lib.rs +++ b/devices/src/lib.rs @@ -24,6 +24,8 @@ pub mod gic; pub mod interrupt_controller; #[cfg(target_arch = "x86_64")] pub mod ioapic; +#[cfg(feature = "ivshmem")] +pub mod ivshmem; pub mod legacy; #[cfg(feature = "pvmemcontrol")] pub mod pvmemcontrol; @@ -33,7 +35,9 @@ pub mod pvpanic; pub mod tpm; pub use self::acpi::{AcpiGedDevice, AcpiPmTimerDevice, AcpiShutdownDevice}; -pub use self::pvpanic::{PvPanicDevice, PVPANIC_DEVICE_MMIO_SIZE}; +#[cfg(feature = "ivshmem")] +pub use self::ivshmem::IvshmemDevice; +pub use self::pvpanic::{PVPANIC_DEVICE_MMIO_SIZE, PvPanicDevice}; bitflags! { pub struct AcpiNotificationFlags: u8 { diff --git a/devices/src/pvmemcontrol.rs b/devices/src/pvmemcontrol.rs index d119a21a1a..2977a9a527 100644 --- a/devices/src/pvmemcontrol.rs +++ b/devices/src/pvmemcontrol.rs @@ -137,7 +137,8 @@ impl PvmemcontrolTransport { } unsafe fn as_register(self) -> PvmemcontrolTransportRegister { - self.payload.register + // SAFETY: We access initialized data. 
+ unsafe { self.payload.register } } } @@ -519,7 +520,7 @@ impl PvmemcontrolBusDevice { ret_value: get_page_size().into(), arg0: MAJOR_VERSION.into(), arg1: MINOR_VERSION.into(), - }) + }); } FunctionCode::Dontneed => self.madvise(addr, length, libc::MADV_DONTNEED), FunctionCode::Remove => self.madvise(addr, length, libc::MADV_REMOVE), diff --git a/devices/src/pvpanic.rs b/devices/src/pvpanic.rs index 98e7bfa9cd..4fd61188b0 100644 --- a/devices/src/pvpanic.rs +++ b/devices/src/pvpanic.rs @@ -9,9 +9,9 @@ use std::sync::{Arc, Barrier, Mutex}; use anyhow::anyhow; use pci::{ - BarReprogrammingParams, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, - PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciHeaderType, PciSubclass, - PCI_CONFIGURATION_ID, + BarReprogrammingParams, PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, + PciBarRegionType, PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciHeaderType, + PciSubclass, }; use serde::{Deserialize, Serialize}; use thiserror::Error; diff --git a/devices/src/tpm.rs b/devices/src/tpm.rs index c6ed5ce0a9..72fef3e539 100644 --- a/devices/src/tpm.rs +++ b/devices/src/tpm.rs @@ -12,8 +12,8 @@ use arch::aarch64::layout::{TPM_SIZE, TPM_START}; #[cfg(target_arch = "x86_64")] use arch::x86_64::layout::{TPM_SIZE, TPM_START}; use thiserror::Error; -use tpm::emulator::{BackendCmd, Emulator}; use tpm::TPM_CRB_BUFFER_MAX; +use tpm::emulator::{BackendCmd, Emulator}; use vm_device::BusDevice; #[derive(Error, Debug)] @@ -458,10 +458,9 @@ impl BusDevice for Tpm { CRB_CTRL_CANCEL => { if v == CRB_CANCEL_INVOKE && (self.regs[CRB_CTRL_START as usize] & CRB_START_INVOKE != 0) + && let Err(e) = self.emulator.cancel_cmd() { - if let Err(e) = self.emulator.cancel_cmd() { - error!("Failed to run cancel command. Error: {:?}", e); - } + error!("Failed to run cancel command. Error: {:?}", e); } } CRB_CTRL_START => { diff --git a/docs/fw_cfg.md b/docs/fw_cfg.md new file mode 100644 index 0000000000..73f10a7808 --- /dev/null +++ b/docs/fw_cfg.md @@ -0,0 +1,77 @@ +# Firmware Configuration (fw_cfg) Device + +The `fw_cfg` device is a QEMU-compatible device that allows the hypervisor to pass configuration and data to the guest operating system. This is particularly useful for firmware to access information like ACPI tables, kernel images, initramfs, kernel command lines, and other arbitrary data blobs. + +Cloud Hypervisor implements the `fw_cfg` device with DMA-enabled access. + +## Purpose + +The `fw_cfg` device serves as a generic information channel between the VMM and the guest. It can be used to: + +* Load the kernel, initramfs, and kernel command line for direct kernel boot with firmware. +* Provide ACPI tables to the guest firmware or OS. +* Pass custom configuration files or data blobs (e.g., attestation data, SEV-SNP launch secrets) to the guest. +* Supply an E820 memory map to the guest. + +## Enabling `fw_cfg` + +The `fw_cfg` device is enabled via the `fw_cfg` feature flag when building Cloud Hypervisor: + +```bash +cargo build --features fw_cfg +``` + +## Guest Kernel Configuration + +For the guest Linux kernel to recognize and use the `fw_cfg` device via sysfs, the following kernel configuration option must be enabled: + +* `CONFIG_FW_CFG_SYSFS=y` + +This option allows the kernel to expose `fw_cfg` entries under `/sys/firmware/qemu_fw_cfg/by_name/`. + +## Command Line Options + +The `fw_cfg` device is configured using the `--fw-cfg-config` command-line option. 
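For orientation, the option value is a comma-separated list of `key=value` pairs. The following is only a sketch that spells out the defaults documented below (and is therefore equivalent to omitting the option entirely); the kernel path is a placeholder:

```bash
cloud-hypervisor \
    --kernel /path/to/vmlinux \
    --fw-cfg-config e820=on,kernel=on,cmdline=on,initramfs=on,acpi_table=on \
    ...
```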
+ +**Parameters:** +* `e820=on|off`: (Default: `on`) Whether to add an E820 memory map entry to `fw_cfg`. +* `kernel=on|off`: (Default: `on`) Whether to add the kernel image (specified by `--kernel`) to `fw_cfg`. +* `cmdline=on|off`: (Default: `on`) Whether to add the kernel command line (specified by `--cmdline`) to `fw_cfg`. +* `initramfs=on|off`: (Default: `on`) Whether to add the initramfs image (specified by `--initramfs`) to `fw_cfg`. +* `acpi_table=on|off`: (Default: `on`) Whether to add generated ACPI tables to `fw_cfg`. +* `items=[... : ...]`: A list of custom key-value pairs to be exposed via `fw_cfg`. + * `name=`: The path under which the item will appear in the guest's sysfs (e.g., `opt/org.example/my-data`). + * `file=`: The path to the file on the host whose content will be provided to the guest for this item. + +**Example Usage:** + +1. **Direct kernel boot with custom `fw_cfg` entries:** + + ```bash + cloud-hypervisor \ + --kernel /path/to/vmlinux \ + --cmdline "console=hvc0 root=/dev/vda1" \ + --disk path=/path/to/rootfs.img \ + --fw-cfg-config initramfs=off,items=[name=opt/org.mycorp/setup_info,file=/tmp/guest_setup.txt] \ + ... + ``` + In the guest, `/tmp/guest_setup.txt` from the host will be accessible at `/sys/firmware/qemu_fw_cfg/by_name/opt/org.mycorp/setup_info/raw`. + +2. **Disabling `fw_cfg` explicitly:** + + ```bash + cloud-hypervisor \ + --fw-cfg-config disable \ + ... + ``` + +## Accessing `fw_cfg` Items in the Guest + +If `CONFIG_FW_CFG_SYSFS` is enabled in the guest kernel, items added to `fw_cfg` can be accessed via sysfs. + +For example, an item added with `name=opt/org.example/my-data` will be available at: +`/sys/firmware/qemu_fw_cfg/by_name/opt/org.example/my-data/raw` + +The `raw` file contains the binary content of the host file provided. + +Standard items like kernel, initramfs, cmdline, and ACPI tables also have predefined names (e.g., `etc/kernel`, `etc/cmdline`) if they are enabled to be passed via `fw_cfg`. diff --git a/docs/intel_sgx.md b/docs/intel_sgx.md deleted file mode 100644 index 9f2ca76bdc..0000000000 --- a/docs/intel_sgx.md +++ /dev/null @@ -1,54 +0,0 @@ -# Intel SGX - -Intel® Software Guard Extensions (Intel® SGX) is an Intel technology designed -to increase the security of application code and data. Cloud Hypervisor supports -SGX virtualization through KVM. Because SGX is built on hardware features that -cannot be emulated in software, virtualizing SGX requires support in KVM and in -the host kernel. The required Linux and KVM changes can be found in Linux 5.13+. - -Utilizing SGX in the guest requires a kernel/OS with SGX support, e.g. a kernel -since release 5.11, see -[here](https://www.intel.com/content/www/us/en/developer/tools/software-guard-extensions/linux-overview.html). -Running Linux 5.13+ as the guest kernel allows nested virtualization of SGX. - -For more information about SGX, please refer to the [SGX Homepage](https://www.intel.com/content/www/us/en/developer/tools/software-guard-extensions/linux-overview.html). - -For more information about SGX SDK and how to test SGX, please refer to the -following [instructions](https://github.com/intel/linux-sgx). - -## Cloud Hypervisor support - -Assuming the host exposes `/dev/sgx_vepc`, we can pass SGX enclaves through -the guest. - -In order to use SGX enclaves within a Cloud Hypervisor VM, we must define one -or several Enclave Page Cache (EPC) sections. 
Here is an example of a VM being -created with 2 EPC sections, the first one being 64MiB with pre-allocated -memory, the second one being 32MiB with no pre-allocated memory. - -```bash -./cloud-hypervisor \ - --cpus boot=1 \ - --memory size=1G \ - --disk path=focal-server-cloudimg-amd64.raw \ - --kernel vmlinux \ - --cmdline "console=ttyS0 console=hvc0 root=/dev/vda1 rw" \ - --sgx-epc id=epc0,size=64M,prefault=on id=epc1,size=32M,prefault=off -``` - -Once booted, and assuming your guest kernel contains the patches from the -[KVM SGX Tree](https://github.com/intel/kvm-sgx), you can validate SGX devices -have been correctly created under `/dev/sgx`: - -```bash -ls /dev/sgx* -/dev/sgx_enclave /dev/sgx_provision /dev/sgx_vepc -``` - -From this point, it is possible to run any SGX application from the guest, as -it will access `/dev/sgx_enclave` device to create dedicated SGX enclaves. - -Note: There is only one contiguous SGX EPC region, which contains all SGX EPC -sections. This region is exposed through ACPI and marked as reserved through -the e820 table. It is treated as yet another device, which means it should -appear at the end of the guest address space. diff --git a/docs/ivshmem.md b/docs/ivshmem.md new file mode 100644 index 0000000000..3a7913c3f6 --- /dev/null +++ b/docs/ivshmem.md @@ -0,0 +1,51 @@ +# Inter-VM shared memory device + +The Inter-VM shared memory device (ivshmem) is designed to share a memory +region between a guest and the host. In order for all guests to be able to +pick up the shared memory area, it is modeled as a PCI device exposing said +memory to the guest as a PCI BAR. + +The device specification is available +at https://www.qemu.org/docs/master/specs/ivshmem-spec.html. + +Cloud Hypervisor currently supports backing the shared region with a file in +order to share data between the host and the guest. In other words, only +ivshmem-plain is supported; ivshmem-doorbell is not supported yet. + +## Usage + +`--ivshmem`, an optional argument, can be passed to enable the ivshmem device. +This argument takes the backend file as its `path` value and the size of the +shared region as its `size` value. + +``` +--ivshmem device backend file "path=<path>,size=<size>" +``` + +## Example + +Create a file that is at least as large as the `size` passed to `cloud-hypervisor`: + +``` +truncate -s 1M /tmp/ivshmem.data +``` + +Start Cloud Hypervisor, which will mmap the file data into a guest memory region: + +``` +./cloud-hypervisor \ + --api-socket /tmp/cloud-hypervisor.sock \ + --kernel vmlinux \ + --disk path=focal-server-cloudimg-amd64.raw \ + --cpus boot=4 \ + --memory size=1024M \ + --ivshmem path=/tmp/ivshmem.data,size=1M +``` + +Load (insmod) an ivshmem device driver in the guest to enable the device. The +file data is mapped to PCI `bar2` of the ivshmem device, so the guest can read +and write the data by accessing this memory. + +A simple example of an ivshmem driver is available at: +https://github.com/lisongqian/clh-linux/commits/ch-6.12.8-ivshmem + +The host process can read and write the same data by mmap-ing +`/tmp/ivshmem.data`. diff --git a/docs/live_migration.md b/docs/live_migration.md index 94c9afc236..5c77d2625f 100644 --- a/docs/live_migration.md +++ b/docs/live_migration.md @@ -171,7 +171,13 @@ After a few seconds the VM should be up and you can interact with it.
Initiate the Migration over TCP: ```console -src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} +``` + +With migration parameters: + +```console +src $ ch-remote --api-socket=/tmp/api send-migration tcp:{dst}:{port} --migration-timeout 60 --downtime 5000 ``` > Replace {dst}:{port} with the actual IP address and port of your destination host. @@ -180,3 +186,24 @@ After completing the above commands, the source VM will be migrated to the destination host and continue running there. The source VM instance will terminate normally. All ongoing processes and connections within the VM should remain intact after the migration. + +#### Migration Parameters + +Cloud Hypervisor supports additional parameters to control the +migration process: + +- `migration-timeout <seconds>` +Sets the maximum time (in seconds) allowed for the migration process. +If the migration takes longer than this timeout, it will be aborted. A +value of 0 means no timeout limit. +- `downtime <milliseconds>` +Sets the maximum acceptable downtime (in milliseconds) during the +migration. This parameter helps control the trade-off between migration +time and VM downtime. + +> On top of the configured limit, the actual downtime also includes the cost of +serializing and deserializing vCPU and device state. Therefore, the actual +downtime is always somewhat longer than the configured one. + +These parameters can be used with the `send-migration` command to +fine-tune the migration behavior according to your requirements. \ No newline at end of file diff --git a/docs/memory.md b/docs/memory.md index 46569449c8..a429ff1b78 100644 --- a/docs/memory.md +++ b/docs/memory.md @@ -437,12 +437,11 @@ struct NumaConfig { cpus: Option>, distances: Option>, memory_zones: Option>, - sgx_epc_sections: Option>, } ``` ``` ---numa Settings related to a given NUMA node "guest_numa_id=,cpus=,distances=,memory_zones=,sgx_epc_sections=" +--numa Settings related to a given NUMA node "guest_numa_id=,cpus=,distances=,memory_zones=" ``` ### `guest_numa_id` @@ -550,26 +549,6 @@ _Example_ --numa guest_numa_id=0,memory_zones=[mem0,mem2] guest_numa_id=1,memory_zones=mem1 ``` -### `sgx_epc_sections` - -List of SGX EPC sections attached to the guest NUMA node identified by the -`guest_numa_id` option. This allows for describing a list of SGX EPC sections -which must be seen by the guest as belonging to the NUMA node `guest_numa_id`. - -Multiple values can be provided to define the list. Each value is a string -referring to an existing SGX EPC section identifier. Values are separated from -each other with the `,` separator. - -As soon as one tries to describe a list of values, `[` and `]` must be used to -demarcate the list. - -_Example_ - -``` ---sgx-epc id=epc0,size=32M id=epc1,size=64M id=epc2,size=32M ---numa guest_numa_id=0,sgx_epc_sections=epc1 guest_numa_id=1,sgx_epc_sections=[epc0,epc2] -``` - ### PCI bus Cloud Hypervisor supports guests with one or more PCI segments. The default PCI segment always diff --git a/docs/snapshot_restore.md b/docs/snapshot_restore.md index 67f29ce6dc..df7248805e 100644 --- a/docs/snapshot_restore.md +++ b/docs/snapshot_restore.md @@ -110,4 +110,4 @@ from the restored VM. ## Limitations -VFIO devices and Intel SGX are out of scope. +VFIO devices are out of scope.
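For context, the limitation above applies to the usual snapshot/restore flow. A simplified sketch of that flow is shown below; the API socket paths and the snapshot location are illustrative only:

```bash
# Source VM: pause execution and write a snapshot to a local directory.
ch-remote --api-socket=/tmp/api pause
ch-remote --api-socket=/tmp/api snapshot file:///tmp/snapshot

# New VM: restore from the snapshot and resume execution.
./cloud-hypervisor --api-socket /tmp/api-restore --restore source_url=file:///tmp/snapshot
ch-remote --api-socket=/tmp/api-restore resume
```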
diff --git a/docs/windows.md b/docs/windows.md index e4945d939b..ebb902fbfa 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -4,10 +4,10 @@ Starting with the release version [0.10.0](https://github.com/cloud-hypervisor/c __Requirements__ -- Host with KVM enabled +- Host with KVM enabled - [UEFI](uefi.md) capable Windows guest image with Virtio drivers integrated -Any modern Windows Server version is compatible. Cloud Hypervisor has been successfully tested with Windows Server 2019 and Windows Server Core 2004. +Any modern Windows Server version is compatible, as well as Windows 11. Cloud Hypervisor has been successfully tested with Windows Server 2019, Windows Server Core 2004 and Windows 11 IoT Enterprise LTSC 2024. At the current stage, only UEFI capable Windows images are supported. This implies the presence of the OVMF firmware during the Windows installation and in any subsequent usage. BIOS boot is not supported. @@ -20,10 +20,15 @@ The subsequent sections will tell, in detail, how to prepare an appropriate Wind __Prerequisites__ - QEMU, version >=5.0.0 is recommended. -- Windows installation ISO. Obtained through MSDN, Visual Studio subscription, evaluation center, etc. +- Windows installation ISO. Obtained through MSDN, Visual Studio subscription, evaluation center, etc. - [VirtIO driver ISO](https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/stable-virtio/) + - Please use the [VirtIO Windows 11 attestation file](https://fedorapeople.org/groups/virt/virtio-win/direct-downloads/upstream-virtio/virtio-win11-attestation-0.1-258.zip) + for Windows 11 - Suitable firmware for Cloud Hypervisor (`CLOUDHV.fd`) and for QEMU (`OVMF.fd`) -- With the suggested image size of 30G, there should be enough free disk space to hold the installation ISO and any other necessary files +- With the suggested image size of 30G for Windows Server, there should be enough free disk space to hold the installation ISO and any other necessary files + - For Windows 11, increasing this image size to 64GB is recommended (see [minimal requirements](https://support.microsoft.com/en-us/windows/windows-11-system-requirements-86c11283-ea52-4782-9efd-7674389a7ba3)) +- Windows 11 only: TPM 2.0 support +- Windows 11 only: 2 or more cores This step currently requires QEMU to install Windows onto the guest. QEMU is only used at the preparation stage, the resulting image is then fully functional with Cloud Hypervisor. @@ -37,11 +42,13 @@ OVMF_DIR=./FV ``` Create an empty image file, `raw` is supported. + ```shell qemu-img create -f raw $IMG_FILE 30G ``` -Begin the Windows installation process under QEMU +Begin the Windows installation process under QEMU for Windows Server: + ```shell qemu-system-x86_64 \ -machine q35,accel=kvm \ @@ -57,13 +64,56 @@ qemu-system-x86_64 \ -vga std ``` -Before the installation can proceed, point the Windows installation program to the VirtIO disk and install the necessary storage controller drivers. After that, the attached hard drive will become visible and the actual installation can commence. 
+For Windows 11 you can use `swtpm` to fulfill the TPM 2.0 requirement: + +```shell +# Create directory to store state +mkdir -p /tmp/mytpm1 +# Start swtpm daemon for TPM 2.0 support +swtpm socket \ + --tpm2 \ + --ctrl type=unixio,path=/tmp/swtpm-sock \ + --tpmstate dir=/tmp/mytpm1 \ + --flags startup-clear \ + --log level=20 \ + --log file=/tmp/swtpm.log \ + --daemon +``` + +Begin the Windows 11 installation process under QEMU like this: + +```shell +qemu-system-x86_64 \ + -machine q35,accel=kvm \ + -cpu host \ + -m 4G \ + -bios ./$OVMF_DIR/OVMF.fd \ + -cdrom ./$WIN_ISO_FILE \ + -drive file=./$VIRTIO_ISO_FILE,index=0,media=cdrom \ + -drive if=none,id=root,file=./$IMG_FILE \ + -device virtio-blk-pci,drive=root,disable-legacy=on \ + -device virtio-net-pci,netdev=mynet0,disable-legacy=on \ + -netdev user,id=mynet0 \ + -vga std \ + -smp 4 \ + -chardev socket,id=chrtpm,path=/tmp/swtpm-sock \ + -tpmdev emulator,id=tpm0,chardev=chrtpm \ + -device tpm-tis,tpmdev=tpm0 +``` + +This command needs at least `-smp 2` (2 cores), as well as the last three lines (TPM 2.0), to support Windows 11 minimal requirements. Additionally, using `OVMF_CODE.fd` leads to the following error: `qemu: could not load PC BIOS '././FV/OVMF_CODE.fd'`. Switching to `OVMF.fd` is therefore necessary. + +For more details about TPM specifically, please continue with the [TPM documentation](./tpm.md). + +Before the installation can proceed, point the Windows installation program to the VirtIO disk and install the necessary storage controller drivers. For Windows 11 with the attestation drivers, you need to navigate to the `viostor` directory to be able to see and install it. After that, the attached hard drive will become visible and the actual installation can commence. -After the installation has completed, proceed further to the configuration section. QEMU will be needed at least once more to enable the Windows Special Administration Console (SAC) and to possibly install extra device drivers. +Do not install network drivers for Windows 11 just yet, if you don't want to be forced to log-in to/create a Microsoft account. Simply select `I don't have internet` for now. + +After the installation has completed, proceed further to the [configuration section](#image-configuration). QEMU will be needed at least once more to enable/install the Windows Special Administration Console (SAC) and to possibly install extra device drivers. ## Image Usage -The basic command to boot a Windows image. The configuration section should be checked before executing it for the first time. +The basic command to boot a Windows image is shown in the next code snippet. The [configuration section](#image-configuration), as well as the [Getting Started section](../README.md#2-getting-started) should be checked before executing it for the first time. Please especially read the documentation for giving the cloud-hypervisor binary the correct capabilities for it to set TAP interfaces up on the host, otherwise the command below will fail: ```shell cloud-hypervisor \ @@ -85,19 +135,25 @@ In cases where the host processor supports address space > 39 bits, it might be To daemonize the Cloud Hypervisor process, `nohup` can be used. Some STDIO redirections might need to be done. In a simple case it is sufficient to just redirect all the output to `/dev/null`. +Be aware, currently, running the Windows 11 VM on Cloud Hypervisor with TPM 2.0 was not proven successful: `thread 'vcpu0' panicked`. Running the VM without TPM is a valid option though. 
Therefore the command as shown above is also valid for a Windows 11 VM. + ## Image Configuration ### Device Drivers After the Windows installation has finished under QEMU, there might be still devices with no drivers installed. This might happen for example, when a device was not used during the installation. In particular it is important to ensure that the VirtIO network device is setup correctly because further steps for the configuration and the usage require network in most case. -Boot once more under QEMU and use the [Device Manager](https://support.microsoft.com/en-in/help/4028443/windows-10-update-drivers), to ensure all the device drivers, and especially the network card, are installed correctly. Also, as Cloud Hypervisor can introduce new devices, it is advisable to repeat the procedure while booted under Cloud Hypervisor, when the RDP access to the image is functional. +Boot once more under QEMU and use the [Device Manager](https://support.microsoft.com/en-in/help/4028443/windows-10-update-drivers), to ensure all the device drivers, and especially the network card, are installed correctly. If not, right click on the unknown network device, choose `Update driver` and browse to the `NetKvm` directory on the CD. + +Also, as Cloud Hypervisor can introduce new devices, it is advisable to repeat the procedure while booted under Cloud Hypervisor, when the [RDP](#remote-desktop-protocol-rdp-enablement) access to the image is functional. ### Windows Special Administration Console (SAC) enablement SAC provides a text based console access to the Windows guest. As Cloud Hypervisor doesn't implement a VGA adaptor, SAC is an important instrument for the Windows guest management. -Boot the Windows image under QEMU and execute the below commands to permanently enable SAC +Boot the Windows image under QEMU. For all non-server Windows versions, the SAC needs to be downloaded and enabled first in the `Optional features` menu of Windows. + +Execute the below commands to permanently enable SAC. You might need admin privileges. ```cmd bcdedit /emssettings emsport:1 emsbaudrate:115200 @@ -105,15 +161,14 @@ bcdedit /ems on bcdedit /bootems on ``` -Once SAC is enabled, the image can be booted under Cloud Hypervisor. The SAC prompt will show up +Once SAC is enabled, the image can be booted under Cloud Hypervisor. The SAC prompt will show up
-Computer is booting, SAC started and initialized.                               
-                                                                                
-Use the "ch -?" command for information about using channels.                   
-Use the "?" command for general help.                                           
-                                                                                
-                                                                                
+Computer is booting, SAC started and initialized.
+
+Use the "ch -?" command for information about using channels.
+Use the "?" command for general help.
+
 SAC>
 
@@ -139,7 +194,7 @@ As the simplest option, using `--net tap=` in the Cloud Hypervisor command line
 SAC>i 10 192.168.249.2 255.255.255.0 192.168.249.1
-
+ Where `10` is the device index as shown by the `i` command. @@ -149,26 +204,38 @@ Additional steps are necessary to provide the guest with internet access. - On the guest, add the DNS server either by using `netsh` or by opening `Network and Connectivity Center` and editing the adapter properties. - On the host, configure the traffic forwarding. Replace the `NET_DEV` with the name of your network device. + ```shell NET_DEV=wlp3s0 sysctl -w net.ipv4.ip_forward=1 iptables -t nat -A POSTROUTING -o $NET_DEV -j MASQUERADE ``` +If needed, you can also allow ICMP from host to guest via the following command executed on the guest: + +```shell +netsh advfirewall firewall add rule name="Allow ICMPv4" protocol=icmpv4:8,any dir=in action=allow +``` + +This will enable simple `ping` requests from your host to the guest. + ### Remote Desktop Protocol (RDP) enablement #### Using QEMU - - Execute `SystemPropertiesRemote` - - In the properties window, choose "Allow remote connections to this computer" - - Click "Select Users" and add some user to the allow list + +- Execute `SystemPropertiesRemote` +- In the properties window, choose "Allow remote connections to this computer" +- Click "Select Users" and add some user to the allow list + #### Using powershell + ```powershell Set-ItemProperty "HKLM:\SYSTEM\CurrentControlSet\Control\Terminal Server\" -Name "fDenyTSConnections" -Value 0 Enable-NetFirewallRule -DisplayGroup "Remote Desktop" Add-LocalGroupMember -Group "Remote Desktop Users" -Member someuser ``` - -Administrators can always RDP, non administrator users have to be explicitly enabled. + +Administrators can always RDP, non administrator users have to be explicitly enabled. Once the configuration is set, RDP clients can connect to `192.168.249.2`. @@ -182,7 +249,15 @@ Start-Service sshd Set-Service -Name sshd -StartupType ‘Automatic’ ``` -This allows for SSH login from a remote machine, for example through the `administrator` user: `ssh administrator@192.168.249.2`. For a more detailed OpenSSH guide, please follow the MSDN article from the [links](#links) section. +This allows for SSH login from a remote machine, for example through the `administrator` user: `ssh administrator@192.168.249.2`. + +On Windows 11, opening the firewall was needed as well: + +```powershell +New-NetFirewallRule -Name sshd -DisplayName "OpenSSH Server" -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22 +``` + +For a more detailed OpenSSH guide, please follow the MSDN article from the [links](#links) section. ## Hotplug capability @@ -196,6 +271,8 @@ Disk hotplug and hot-remove are supported. After the device has been hotplugged, ## Debugging +Disclaimer: This chapter was not verified on Windows 11 yet. Proceed with care. + The Windows guest debugging process relies heavily on QEMU and [socat](http://www.dest-unreach.org/socat/). The procedure requires two Windows VMs: - A debugger VM running under QEMU. @@ -203,7 +280,7 @@ The Windows guest debugging process relies heavily on QEMU and [socat](http://ww The connection between both guests happens over TCP, whereby on the guest side it is automatically translated to a COM port. Because the VMs are connected through TCP, the debugging infrastructure can be distributed over the network. The serial port, while slowly transferring data, is common enough to support a wide range of cases and tools. -In this exercise, [WinDbg](https://docs.microsoft.com/en-us/windows-hardware/drivers/debugger/) is used. 
Any other debugger of choice with the ability to use serial connection can be used instead. +In this exercise, [WinDbg](https://docs.microsoft.com/en-us/windows-hardware/drivers/debugger/) is used. Any other debugger of choice with the ability to use serial connection can be used instead. ### Debugger and Debuggee @@ -220,7 +297,7 @@ qemu-system-x86_64 \ -smp 1 \ -m 4G \ -cdrom ./$WIN_ISO_FILE \ - -drive file=./$VIRTIO_ISO_FILE,index=0,media=cdrom + -drive file=./$VIRTIO_ISO_FILE,index=0,media=cdrom \ -drive if=none,id=root,file=./windbg-disk.raw \ -device virtio-blk-pci,drive=root,disable-legacy=on \ -device virtio-net-pci,netdev=mynet0,disable-legacy=on \ @@ -256,7 +333,7 @@ bcdedit /debug on bcdedit /bootdebug on ``` -##### Turn on boot manager debug +##### Turn on boot manager debug ```cmd bcdedit /set {bootmgr} bootdebug on @@ -308,6 +385,7 @@ Once started, WinDbg will wait for an incoming connection which is going to be i ##### Under QEMU Essentially it would be the command like depicted in the guest preparation sections, with a few modifications: + ```shell qemu-system-x86_64 \ -machine q35,accel=kvm \ @@ -315,7 +393,7 @@ qemu-system-x86_64 \ -m 4G \ -bios ./$OVMF_DIR/OVMF_CODE.fd \ -cdrom ./$WIN_ISO_FILE \ - -drive file=./$VIRTIO_ISO_FILE,index=0,media=cdrom + -drive file=./$VIRTIO_ISO_FILE,index=0,media=cdrom \ -drive if=none,id=root,file=./$IMG_FILE \ -device virtio-blk-pci,drive=root,disable-legacy=on \ -device virtio-net-pci,netdev=mynet0,disable-legacy=on \ diff --git a/event_monitor/Cargo.toml b/event_monitor/Cargo.toml index 764cc6218e..41d3102807 100644 --- a/event_monitor/Cargo.toml +++ b/event_monitor/Cargo.toml @@ -1,11 +1,14 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "event_monitor" version = "0.1.0" [dependencies] -flume = "0.11.1" -libc = "0.2.167" -serde = { version = "1.0.208", features = ["derive", "rc"] } +flume = { workspace = true } +libc = { workspace = true } +serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/fuzz/Cargo.lock b/fuzz/Cargo.lock index 967c057442..1750a54b58 100644 --- a/fuzz/Cargo.lock +++ b/fuzz/Cargo.lock @@ -7,7 +7,7 @@ name = "acpi_tables" version = "0.1.0" source = "git+https://github.com/rust-vmm/acpi_tables?branch=main#e08a3f0b0a59b98859dbf59f5aa7fd4d2eb4018a" dependencies = [ - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -61,9 +61,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.95" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = "arbitrary" @@ -93,7 +93,6 @@ dependencies = [ "uuid", "vm-fdt", "vm-memory", - "vm-migration", "vmm-sys-util", ] @@ -122,9 +121,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.0" +version = "2.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" +checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" [[package]] name = "block" @@ -177,18 +176,18 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.5.13" +version = "4.5.47" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fbb260a053428790f3de475e304ff84cdbc4face759ea7a3e64c1edd938a7fc" +checksum = "7eac00902d9d136acd712710d71823fb8ac8004ca445a89e73a41d45aa712931" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.13" +version = "4.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64b17d7ea74e9f833c7dbf2cbe4fb12ff26783eda4782a8975b72f895c9b4d99" +checksum = "2ad9bbf750e73b5884fb8a211a9424a1906c1e156724260fdae972f31d70e1d6" dependencies = [ "anstream", "anstyle", @@ -198,9 +197,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.2" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "cloud-hypervisor-fuzz" @@ -301,7 +300,7 @@ dependencies = [ "acpi_tables", "anyhow", "arch", - "bitflags 2.9.0", + "bitflags 2.9.4", "byteorder", "event_monitor", "hypervisor", @@ -345,7 +344,7 @@ version = "4.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74351c3392ea1ff6cd2628e0042d268ac2371cb613252ff383b6dfa50d22fa79" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "libc", ] @@ -403,11 +402,11 @@ checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "gdbstub" -version = "0.7.2" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbcc892208d6998fb57e7c3e05883def66f8130924bba066beb0cfe71566a9f6" +checksum = "71d66e32caf5dd59f561be0143e413e01d651bd8498eb9aa0be8c482c81c8d31" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "cfg-if", "log", "managed", @@ -417,9 +416,9 @@ dependencies = [ [[package]] name = "gdbstub_arch" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "328a9e9425db13770d0d11de6332a608854266e44c53d12776be7b4aa427e3de" +checksum = "22dde0e1b68787036ccedd0b1ff6f953527a0e807e571fbe898975203027278f" dependencies = [ "gdbstub", "num-traits", @@ -480,7 +479,7 @@ dependencies = [ "vfio-ioctls", "vm-memory", "vmm-sys-util", - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -531,31 +530,32 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ + "once_cell", "wasm-bindgen", ] [[package]] name = "kvm-bindings" -version = "0.10.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4933174d0cc4b77b958578cd45784071cc5ae212c2d78fbd755aaaa6dfa71a" +checksum = "d4b153a59bb3ca930ff8148655b2ef68c34259a623ae08cf2fb9b570b2e45363" dependencies = [ "serde", "vmm-sys-util", - "zerocopy 0.7.35", + "zerocopy 0.8.26", ] [[package]] name = "kvm-ioctls" -version = "0.19.1" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e013ae7fcd2c6a8f384104d16afe7ea02969301ea2bb2a56e44b011ebc907cab" +checksum = "b702df98508cb63ad89dd9beb9f6409761b30edca10d48e57941d3f11513a006" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "kvm-bindings", "libc", "vmm-sys-util", @@ -563,13 +563,13 @@ dependencies = [ [[package]] 
name = "landlock" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18738c5d4c7fae6727a96adb94722ef7ce82f3eafea0a11777e258a93816537e" +checksum = "b3d2ef408b88e913bfc6594f5e693d57676f6463ded7d8bf994175364320c706" dependencies = [ "enumflags2", "libc", - "thiserror 1.0.64", + "thiserror 2.0.12", ] [[package]] @@ -598,8 +598,7 @@ dependencies = [ [[package]] name = "linux-loader" version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "870c3814345f050991f99869417779f6062542bcf4ed81db7a1b926ad1306638" +source = "git+https://github.com/rust-vmm/linux-loader?branch=main#5fdaed87ddafc89d6abf0b50195a12d19133000d" dependencies = [ "vm-memory", ] @@ -635,7 +634,7 @@ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "micro_http" version = "0.1.0" -source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#4f621532e81ee2ad096a9c9592fdacc40d19de48" +source = "git+https://github.com/firecracker-microvm/micro-http?branch=main#bf5098916006912f8dd35aaa6daa5579c6c297b2" dependencies = [ "libc", "vmm-sys-util", @@ -643,16 +642,16 @@ dependencies = [ [[package]] name = "mshv-bindings" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "909de5fd4a5a3347a6c62872f6816e6279efd8615a753f10a3bc4daaef8a72ef" +checksum = "805cf329582f770f62cc612716a04c14815276ae266b6298375a672d3c5a5184" dependencies = [ "libc", "num_enum", "serde", "serde_derive", "vmm-sys-util", - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -830,7 +829,7 @@ checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" dependencies = [ "rand_chacha", "rand_core", - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] @@ -865,15 +864,21 @@ dependencies = [ [[package]] name = "remain" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46aef80f842736de545ada6ec65b81ee91504efd6853f4b96de7414c42ae7443" +checksum = "d7ef12e84481ab4006cb942f8682bba28ece7270743e649442027c5db87df126" dependencies = [ "proc-macro2", "quote", "syn", ] +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + [[package]] name = "ryu" version = "1.0.18" @@ -917,9 +922,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.128" +version = "1.0.143" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" dependencies = [ "itoa", "memchr", @@ -929,9 +934,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857" +checksum = "f2c45cd61fefa9db6f254525d46e392b852e0e61d9a1fd36e5bd183450a556d5" dependencies = [ "serde", "serde_derive", @@ -940,9 +945,9 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8fee4991ef4f274617a51ad4af30519438dacb2f56ac773b08a1922ff743350" +checksum = "de90945e6565ce0d9a25098082ed4ee4002e047cb59892c318d66821e14bb30f" 
dependencies = [ "darling", "proc-macro2", @@ -981,9 +986,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "spin" @@ -1073,7 +1078,6 @@ name = "tpm" version = "0.1.0" dependencies = [ "anyhow", - "byteorder", "libc", "log", "net_gen", @@ -1105,45 +1109,37 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.15.1" +version = "1.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" +checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ "getrandom 0.3.3", + "js-sys", "rand", - "uuid-macro-internal", -] - -[[package]] -name = "uuid-macro-internal" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9521621447c21497fac206ffe6e9f642f977c4f82eeba9201055f64884d9cb01" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "wasm-bindgen", ] [[package]] name = "vfio-bindings" -version = "0.4.0" -source = "git+https://github.com/rust-vmm/vfio?branch=main#b135b8305c2cc8ec333e0cf77a780445cc98dcee" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "698c66a4522a31ab407a410a59c9660da036178e4fe3f371825cd6aad7d46837" dependencies = [ "vmm-sys-util", ] [[package]] name = "vfio-ioctls" -version = "0.2.0" -source = "git+https://github.com/rust-vmm/vfio?branch=main#b135b8305c2cc8ec333e0cf77a780445cc98dcee" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7af7e8d49719333e5eb52209417f26695c9ab2b117a82596a63a44947f97c5d6" dependencies = [ "byteorder", "kvm-bindings", "kvm-ioctls", "libc", "log", - "thiserror 1.0.64", + "thiserror 2.0.12", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -1151,16 +1147,17 @@ dependencies = [ [[package]] name = "vfio_user" -version = "0.1.0" -source = "git+https://github.com/rust-vmm/vfio-user?branch=main#3febcdd3fa2531623865663ca1721e1962ed9979" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8db5bc783aad75202ad4cbcdc5e893cff1dd8fa24a1bcdb4de8998d3c4d169a" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.9.4", "libc", "log", "serde", "serde_derive", "serde_json", - "thiserror 1.0.64", + "thiserror 2.0.12", "vfio-bindings", "vm-memory", "vmm-sys-util", @@ -1168,10 +1165,11 @@ dependencies = [ [[package]] name = "vhost" -version = "0.12.1" -source = "git+https://github.com/rust-vmm/vhost?rev=d983ae0#d983ae07f78663b7d24059667376992460b571a2" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a4dcad85a129d97d5d4b2f3c47a4affdeedd76bdcd02094bcb5d9b76cac2d05" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", "libc", "uuid", "vm-memory", @@ -1180,29 +1178,26 @@ dependencies = [ [[package]] name = "virtio-bindings" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1711e61c00f8cb450bd15368152a1e37a12ef195008ddc7d0f4812f9e2b30a68" +checksum = "804f498a26d5a63be7bbb8bdcd3869c3f286c4c4a17108905276454da0caf8cb" [[package]] name = "virtio-devices" version = "0.1.0" dependencies = [ "anyhow", - 
"arc-swap", "block", "byteorder", "epoll", "event_monitor", "libc", "log", - "net_gen", "net_util", "pci", "rate_limiter", "seccompiler", "serde", - "serde_json", "serde_with", "serial_buffer", "thiserror 2.0.12", @@ -1219,9 +1214,9 @@ dependencies = [ [[package]] name = "virtio-queue" -version = "0.14.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872e2f3fbd70a7e6f01689720cce3d5c2c5efe52b484dd07b674246ada0e9a8d" +checksum = "fb0479158f863e59323771a1f684d843962f76960b86fecfec2bfa9c8f0f9180" dependencies = [ "log", "virtio-bindings", @@ -1242,7 +1237,6 @@ dependencies = [ name = "vm-device" version = "0.1.0" dependencies = [ - "anyhow", "hypervisor", "serde", "thiserror 2.0.12", @@ -1258,9 +1252,9 @@ source = "git+https://github.com/rust-vmm/vm-fdt?branch=main#ef5bd734f5f66fb0772 [[package]] name = "vm-memory" -version = "0.16.1" +version = "0.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1720e7240cdc739f935456eb77f370d7e9b2a3909204da1e2b47bef1137a013" +checksum = "1fd5e56d48353c5f54ef50bd158a0452fc82f5383da840f7b8efc31695dd3b9d" dependencies = [ "arc-swap", "libc", @@ -1283,7 +1277,6 @@ dependencies = [ name = "vm-virtio" version = "0.1.0" dependencies = [ - "log", "virtio-queue", "vm-memory", ] @@ -1294,9 +1287,8 @@ version = "0.1.0" dependencies = [ "acpi_tables", "anyhow", - "arc-swap", "arch", - "bitflags 2.9.0", + "bitflags 2.9.4", "block", "cfg-if", "clap", @@ -1328,21 +1320,20 @@ dependencies = [ "vfio_user", "virtio-bindings", "virtio-devices", - "virtio-queue", "vm-allocator", "vm-device", "vm-memory", "vm-migration", "vm-virtio", "vmm-sys-util", - "zerocopy 0.8.24", + "zerocopy 0.8.26", ] [[package]] name = "vmm-sys-util" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d1435039746e20da4f8d507a72ee1b916f7b4b05af7a91c093d2c6561934ede" +checksum = "d21f366bf22bfba3e868349978766a965cbe628c323d58e026be80b8357ab789" dependencies = [ "bitflags 1.3.2", "libc", @@ -1367,24 +1358,24 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn", @@ -1393,9 +1384,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1403,9 +1394,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = 
"8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", @@ -1416,9 +1407,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "winapi" @@ -1530,7 +1524,7 @@ version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ - "bitflags 2.9.0", + "bitflags 2.9.4", ] [[package]] @@ -1545,11 +1539,11 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.24" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" dependencies = [ - "zerocopy-derive 0.8.24", + "zerocopy-derive 0.8.26", ] [[package]] @@ -1565,9 +1559,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.8.24" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" dependencies = [ "proc-macro2", "quote", diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index c4536bcfed..ccdad23241 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -11,6 +11,7 @@ cargo-fuzz = true [features] default = ["mshv_emulator"] igvm = [] +ivshmem = [] mshv_emulator = ["hypervisor/mshv_emulator"] pvmemcontrol = [] @@ -22,19 +23,24 @@ epoll = "4.3.3" hypervisor = { path = "../hypervisor", features = ["mshv_emulator"] } libc = "0.2.155" libfuzzer-sys = "0.4.7" -linux-loader = { version = "0.13.0", features = ["bzimage", "elf", "pe"] } +# TODO: update to 0.13.1+ +linux-loader = { git = "https://github.com/rust-vmm/linux-loader", branch = "main", features = [ + "bzimage", + "elf", + "pe", +] } micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } -mshv-bindings = "0.5.0" +mshv-bindings = "0.6.0" net_util = { path = "../net_util" } seccompiler = "0.5.0" virtio-devices = { path = "../virtio-devices" } -virtio-queue = "0.14.0" +virtio-queue = "0.16.0" vm-device = { path = "../vm-device" } vm-memory = "0.16.0" vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm = { path = "../vmm", features = ["guest_debug"] } -vmm-sys-util = "0.12.1" +vmm-sys-util = "0.14.0" # Prevent this from interfering with workspaces [workspace] diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 8f41903f6f..e9965ceddf 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -186,8 +186,6 @@ impl RequestHandler for StubApiRequestHandler { #[cfg(feature = "pvmemcontrol")] pvmemcontrol: None, iommu: false, - #[cfg(target_arch = "x86_64")] - sgx_epc: None, numa: None, watchdog: false, gdb: false, @@ -197,6 +195,8 @@ impl RequestHandler for StubApiRequestHandler { preserved_fds: None, landlock_enable: false, landlock_rules: None, + #[cfg(feature = "ivshmem")] + ivshmem: None, }), state: VmState::Running, memory_actual_size: 0, @@ -221,7 +221,7 @@ impl RequestHandler for 
StubApiRequestHandler { Ok(()) } - fn vm_resize(&mut self, _: Option, _: Option, _: Option) -> Result<(), VmError> { + fn vm_resize(&mut self, _: Option, _: Option, _: Option) -> Result<(), VmError> { Ok(()) } diff --git a/fuzz/fuzz_targets/pmem.rs b/fuzz/fuzz_targets/pmem.rs index e8cb488e77..e9247fb631 100644 --- a/fuzz/fuzz_targets/pmem.rs +++ b/fuzz/fuzz_targets/pmem.rs @@ -12,8 +12,9 @@ use std::{ffi, io}; use libc::{MAP_NORESERVE, MAP_PRIVATE, PROT_READ, PROT_WRITE}; use libfuzzer_sys::{fuzz_target, Corpus}; use seccompiler::SeccompAction; -use virtio_devices::{Pmem, UserspaceMapping, VirtioDevice, VirtioInterrupt, VirtioInterruptType}; +use virtio_devices::{Pmem, VirtioDevice, VirtioInterrupt, VirtioInterruptType}; use virtio_queue::{Queue, QueueT}; +use vm_device::UserspaceMapping; use vm_memory::bitmap::AtomicBitmap; use vm_memory::guest_memory::FileOffset; use vm_memory::{Bytes, GuestAddress, GuestMemoryAtomic, MmapRegion}; diff --git a/hypervisor/Cargo.toml b/hypervisor/Cargo.toml index fed24b6862..6fb6a02044 100644 --- a/hypervisor/Cargo.toml +++ b/hypervisor/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["Microsoft Authors"] -edition = "2021" +edition.workspace = true license = "Apache-2.0 OR BSD-3-Clause" name = "hypervisor" version = "0.1.0" @@ -13,27 +13,30 @@ sev_snp = ["igvm", "igvm_defs"] tdx = [] [dependencies] -anyhow = "1.0.94" +anyhow = { workspace = true } arc-swap = "1.7.1" bitfield-struct = "0.10.1" -byteorder = "1.5.0" -cfg-if = "1.0.0" +byteorder = { workspace = true } +cfg-if = { workspace = true } concat-idents = "1.1.5" igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } -kvm-bindings = { workspace = true, optional = true, features = ["serde"] } +kvm-bindings = { workspace = true, optional = true, features = [ + "fam-wrappers", + "serde", +] } kvm-ioctls = { workspace = true, optional = true } -libc = "0.2.167" -log = "0.4.22" +libc = { workspace = true } +log = { workspace = true } mshv-bindings = { workspace = true, features = [ "fam-wrappers", "with-serde", ], optional = true } mshv-ioctls = { workspace = true, optional = true } open-enum = "0.5.2" -serde = { version = "1.0.208", features = ["derive", "rc"] } +serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } -serde_with = { version = "3.9.0", default-features = false, features = [ +serde_with = { workspace = true, default-features = false, features = [ "macros", ] } thiserror = { workspace = true } @@ -62,4 +65,7 @@ optional = true version = "1.21.0" [dev-dependencies] -env_logger = "0.11.3" +env_logger = { workspace = true } + +[lints] +workspace = true diff --git a/hypervisor/src/arch/x86/emulator/instructions/mod.rs b/hypervisor/src/arch/x86/emulator/instructions/mod.rs index c2d39aea09..945ce16bac 100644 --- a/hypervisor/src/arch/x86/emulator/instructions/mod.rs +++ b/hypervisor/src/arch/x86/emulator/instructions/mod.rs @@ -7,8 +7,8 @@ use iced_x86::*; use crate::arch::emulator::{EmulationError, PlatformEmulator, PlatformError}; -use crate::arch::x86::emulator::CpuStateManager; use crate::arch::x86::Exception; +use crate::arch::x86::emulator::CpuStateManager; pub mod cmp; pub mod mov; diff --git a/hypervisor/src/arch/x86/emulator/mod.rs b/hypervisor/src/arch/x86/emulator/mod.rs index 61bbd56fdf..778fd47f57 100644 --- a/hypervisor/src/arch/x86/emulator/mod.rs +++ b/hypervisor/src/arch/x86/emulator/mod.rs @@ -7,13 +7,13 @@ use anyhow::Context; use iced_x86::*; +use crate::StandardRegisters; use 
crate::arch::emulator::{EmulationError, EmulationResult, PlatformEmulator, PlatformError}; use crate::arch::x86::emulator::instructions::*; use crate::arch::x86::regs::{CR0_PE, EFER_LMA}; use crate::arch::x86::{ - segment_type_expand_down, segment_type_ro, Exception, SegmentRegister, SpecialRegisters, + Exception, SegmentRegister, SpecialRegisters, segment_type_expand_down, segment_type_ro, }; -use crate::StandardRegisters; #[macro_use] mod instructions; @@ -254,7 +254,7 @@ impl CpuStateManager for EmulatorCpuState { return Err(PlatformError::InvalidRegister(anyhow!( "read_reg invalid GPR {:?}", r - ))) + ))); } }; @@ -375,7 +375,7 @@ impl CpuStateManager for EmulatorCpuState { return Err(PlatformError::InvalidRegister(anyhow!( "write_reg invalid register {:?}", reg - ))) + ))); } } @@ -624,11 +624,11 @@ impl Emulator<'_, T> { last_decoded_ip = decoder.ip(); num_insn_emulated += 1; - if let Some(num_insn) = num_insn { - if num_insn_emulated >= num_insn { - // Exit the decoding loop, do not decode the next instruction. - stop_emulation = true; - } + if let Some(num_insn) = num_insn + && num_insn_emulated >= num_insn + { + // Exit the decoding loop, do not decode the next instruction. + stop_emulation = true; } } @@ -660,9 +660,9 @@ mod mock_vmm { use std::sync::{Arc, Mutex}; use super::*; + use crate::StandardRegisters; use crate::arch::x86::emulator::EmulatorCpuState as CpuState; use crate::arch::x86::gdt::{gdt_entry, segment_from_gdt}; - use crate::StandardRegisters; #[derive(Debug, Clone)] pub struct MockVmm { diff --git a/hypervisor/src/arch/x86/mod.rs b/hypervisor/src/arch/x86/mod.rs index c337624621..f81734f0a9 100644 --- a/hypervisor/src/arch/x86/mod.rs +++ b/hypervisor/src/arch/x86/mod.rs @@ -12,6 +12,12 @@ // use core::fmt; +#[cfg(feature = "kvm")] +use std::sync::OnceLock; + +use thiserror::Error; + +use crate::{CpuVendor, Hypervisor}; #[cfg(all(feature = "mshv_emulator", target_arch = "x86_64"))] pub mod emulator; @@ -306,16 +312,154 @@ pub struct MsrEntry { pub data: u64, } -#[serde_with::serde_as] -#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] -pub struct XsaveState { - #[serde_as(as = "[_; 1024usize]")] - pub region: [u32; 1024usize], +/// Error that may be returned when attempting to enable AMX state components for guests +#[derive(Debug, Error)] +pub enum AmxGuestSupportError { + /// Attempted to enable AMX on a CPU from a vendor that is not known to support AMX features. + #[error("The host CPU's vendor does not support AMX features. Only Intel provides such CPUs.")] + VendorDoesNotSupportAmx, + /// Unable to verify that the host supports AMX. + #[error("The host does not support AMX tile state components: errno={errno}")] + AmxNotSupported { errno: i64 }, + /// The syscall to check for AMX tile state support succeeded, but the returned + /// features did not match our expectations. + #[error( + "Could not verify AMX support. These are the supported features that were reported: features={features}" + )] + InvalidAmxTileFeatureCheck { features: usize }, + /// The request to enable AMX related state components for guests failed. + #[error("Failed to enable AMX tile state components for guests: errno={errno}")] + AmxGuestTileRequest { errno: i64 }, } -impl Default for XsaveState { - fn default() -> Self { - // SAFETY: this is plain old data structure - unsafe { ::std::mem::zeroed() } +/// The length of the XSAVE flexible array member (FAM). +/// This length increases when arch_prctl is utilized to dynamically add state components. 
+/// +/// IMPORTANT: This static should only be updated via methods on [`XsaveState`]. +#[cfg(feature = "kvm")] +static XSAVE_FAM_LENGTH: OnceLock = OnceLock::new(); + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct XsaveState(#[cfg(feature = "kvm")] pub(crate) kvm_bindings::Xsave); + +impl XsaveState { + const ARCH_GET_XCOMP_SUPP: usize = 0x1021; + const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; + const ARCH_XCOMP_TILECFG: usize = 17; + const ARCH_XCOMP_TILEDATA: usize = 18; + + /// Construct an instance via the given initializer. + /// + /// As long as dynamically enabled state components have only been enabled + /// through static methods on this struct it is guaranteed that the + /// initialization routine is given an Xsave struct of the expected size. + #[cfg(feature = "kvm")] + pub(crate) fn with_initializer( + mut init: F, + ) -> Result> + where + F: FnMut(&mut kvm_bindings::Xsave) -> Result<(), E>, + E: Into>, + { + let fam_length = XSAVE_FAM_LENGTH.get().unwrap_or(&0); + + let mut xsave = kvm_bindings::Xsave::new(*fam_length)?; + + init(&mut xsave).map_err(Into::into)?; + Ok(Self(xsave)) + } + + /// This function enables the AMX related TILECFG and TILEDATA state components for guests. + /// + /// # Background + /// AMX uses a concept of tiles which are small 2D blocks of data stored in registers on the CPU, + /// where the TILECFG state component defines the shape and size of each tile (rows and columns), + /// and the TILEDATA state component holds the actual elements of these tiles used by matrix operations. + pub fn enable_amx_state_components( + hypervisor: &dyn Hypervisor, + ) -> Result<(), AmxGuestSupportError> { + Self::amx_supported(hypervisor)?; + Self::request_guest_amx_support()?; + + // If we are using the KVM hypervisor we meed to query for the new xsave2 size and update + // `XSAVE_FAM_LENGTH` accordingly. + #[cfg(feature = "kvm")] + { + // Obtain the number of bytes the kvm_xsave struct requires. + // This number is documented to always be at least 4096 bytes, but + let size = hypervisor.check_extension_int(kvm_ioctls::Cap::Xsave2); + // Reality check: We should at least have this number of bytes and probably more as we have enabled + // AMX tiles. If this is not the case, it is probably best to panic. + assert!(size >= 4096); + let fam_length = { + // Computation is documented in `[kvm_bindings::kvm_xsave2::len]` + ((size as usize) - size_of::()) + .div_ceil(size_of::()) + }; + let _ = XSAVE_FAM_LENGTH.set(fam_length); + } + + Ok(()) + } + + /// Checks whether the host supports AMX. + /// + /// The `hypervisor` is used to inform us about the + /// CPU vendor (AMX is currently only available on Intel CPUs). + /// + /// Returns `Ok` if AMX is supported on the host and `Err` otherwise. + fn amx_supported(hypervisor: &dyn Hypervisor) -> Result<(), AmxGuestSupportError> { + if !matches!(hypervisor.get_cpu_vendor(), CpuVendor::Intel) { + return Err(AmxGuestSupportError::VendorDoesNotSupportAmx); + } + // We make a syscall to get information about which dynamically enabled + // XSAVE state components are supported. 
The corresponding state + // component bits will get set in `features` + let mut features: usize = 0; + // SAFETY: Syscall with valid parameters + let result = unsafe { + libc::syscall( + libc::SYS_arch_prctl, + Self::ARCH_GET_XCOMP_SUPP, + &raw mut features, + ) + }; + // Ensure that both the TILECFG and TILEDATA state components are supported + let mask = (1 << Self::ARCH_XCOMP_TILECFG) | (1 << Self::ARCH_XCOMP_TILEDATA); + if result != 0 { + return Err(AmxGuestSupportError::AmxNotSupported { errno: result }); + } + + if (features & mask) == mask { + Ok(()) + } else { + Err(AmxGuestSupportError::InvalidAmxTileFeatureCheck { features }) + } + } + + /// Asks the kernel to provide AMX support for guests. + fn request_guest_amx_support() -> Result<(), AmxGuestSupportError> { + // Make a syscall to request permission for guests to use the TILECFG + // and TILEDATA state components. Note that as per the kernel + // [documentation](https://docs.kernel.org/arch/x86/xstate.html#dynamic-features-for-virtual-machines) + // we need to pass in the number of the highest XSTATE component which is required for + // the facility to work which in this case is TILEDATA. + // + // This syscall will alter the size of `kvm_xsave` when KVM is used as the hypervisor. + // + // SAFETY: Syscall with valid parameters + let result = unsafe { + libc::syscall( + libc::SYS_arch_prctl, + Self::ARCH_REQ_XCOMP_GUEST_PERM, + Self::ARCH_XCOMP_TILEDATA, + ) + }; + if result == 0 { + Ok(()) + } else { + // Unwrap is OK because we verified that `result` is not zero + Err(AmxGuestSupportError::AmxGuestTileRequest { errno: result }) + } } } diff --git a/hypervisor/src/cpu.rs b/hypervisor/src/cpu.rs index c2eb03a267..bfd24f12e8 100644 --- a/hypervisor/src/cpu.rs +++ b/hypervisor/src/cpu.rs @@ -10,6 +10,8 @@ // // +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; #[cfg(target_arch = "aarch64")] use std::sync::Arc; @@ -17,18 +19,18 @@ use thiserror::Error; #[cfg(not(target_arch = "riscv64"))] use vm_memory::GuestAddress; -#[cfg(target_arch = "x86_64")] -use crate::arch::x86::{CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters}; -#[cfg(feature = "tdx")] -use crate::kvm::{TdxExitDetails, TdxExitStatus}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use crate::RegList; #[cfg(target_arch = "aarch64")] use crate::VcpuInit; +#[cfg(target_arch = "x86_64")] +use crate::arch::x86::{CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters}; +#[cfg(feature = "tdx")] +use crate::kvm::{TdxExitDetails, TdxExitStatus}; use crate::{CpuState, MpState, StandardRegisters}; #[cfg(target_arch = "x86_64")] -#[derive(Copy, Clone, Default)] +#[derive(Debug, Copy, Clone, Default, serde::Serialize, serde::Deserialize, Eq, PartialEq)] pub enum CpuVendor { #[default] Unknown, @@ -334,6 +336,10 @@ pub enum HypervisorCpuError { /// #[error("Failed to inject NMI")] Nmi(#[source] anyhow::Error), + #[error("Failed to get nested guest state")] + GetNestedState(#[source] anyhow::Error), + #[error("Failed to set nested guest state")] + SetNestedState(#[source] anyhow::Error), } #[derive(Debug)] @@ -471,7 +477,7 @@ pub trait Vcpu: Send + Sync { &self, vm: &Arc, kvi: &mut VcpuInit, - id: u8, + id: u32, ) -> Result<()>; /// /// Returns VcpuInit with default value set @@ -498,7 +504,7 @@ pub trait Vcpu: Send + Sync { /// Configure core registers for a given CPU. 
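// NOTE (illustrative aside, not part of the diff): the hunk above derives the
// XSAVE FAM length from the size reported for KVM_CAP_XSAVE2. A minimal sketch of
// that arithmetic, assuming a hypothetical reported size of 12288 bytes, a
// 4096-byte kvm_xsave header and 4-byte (u32) FAM elements:
fn xsave_fam_length_example() {
    let reported_size: usize = 12288; // hypothetical check_extension_int(Cap::Xsave2) result
    let header_size: usize = 4096; // assumed size_of::<kvm_bindings::kvm_xsave>()
    let elem_size: usize = 4; // assumed size of one FAM element (u32)
    let fam_length = (reported_size - header_size).div_ceil(elem_size);
    assert_eq!(fam_length, 2048); // entries needed to cover the extra (e.g. AMX) state
}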
/// #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] - fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> Result<()>; + fn setup_regs(&self, cpu_id: u32, boot_ip: u64, fdt_start: u64) -> Result<()>; /// /// Check if the CPU supports PMU /// @@ -602,4 +608,11 @@ pub trait Vcpu: Send + Sync { /// Trigger NMI interrupt /// fn nmi(&self) -> Result<()>; + /// Returns the underlying vCPU FD of KVM. + /// + /// # SAFETY + /// This is safe as we only use this to map the KVM_RUN structure for the + /// signal handler and only use it from there. + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd; } diff --git a/hypervisor/src/hypervisor.rs b/hypervisor/src/hypervisor.rs index 4fc98fb8bb..1974b02861 100644 --- a/hypervisor/src/hypervisor.rs +++ b/hypervisor/src/hypervisor.rs @@ -13,6 +13,7 @@ use std::sync::Arc; use thiserror::Error; +use crate::HypervisorType; #[cfg(target_arch = "x86_64")] use crate::arch::x86::CpuIdEntry; #[cfg(target_arch = "x86_64")] @@ -20,7 +21,6 @@ use crate::cpu::CpuVendor; #[cfg(feature = "tdx")] use crate::kvm::TdxCapabilities; use crate::vm::Vm; -use crate::HypervisorType; #[derive(Error, Debug)] pub enum HypervisorError { @@ -111,6 +111,16 @@ pub trait Hypervisor: Send + Sync { /// Return a hypervisor-agnostic Vm trait object /// fn create_vm(&self) -> Result>; + + /// Query the hypervisor for the availability of an extension. + /// + /// + /// Generally 0 means no and 1 means yes, but some extensions may report + /// additional information in the integer return value. + /// + #[cfg(feature = "kvm")] + fn check_extension_int(&self, capability: kvm_ioctls::Cap) -> i32; + /// /// Create a Vm of a specific type using the underlying hypervisor /// Return a hypervisor-agnostic Vm trait object diff --git a/hypervisor/src/kvm/aarch64/gic/dist_regs.rs b/hypervisor/src/kvm/aarch64/gic/dist_regs.rs index 9135ad0031..9a3c719e7c 100644 --- a/hypervisor/src/kvm/aarch64/gic/dist_regs.rs +++ b/hypervisor/src/kvm/aarch64/gic/dist_regs.rs @@ -6,12 +6,12 @@ use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::{Error, Result}; use crate::device::HypervisorDeviceError; use crate::kvm::kvm_bindings::{ - kvm_device_attr, KVM_DEV_ARM_VGIC_GRP_DIST_REGS, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, + KVM_DEV_ARM_VGIC_GRP_DIST_REGS, KVM_DEV_ARM_VGIC_GRP_NR_IRQS, kvm_device_attr, }; /* Distributor registers as detailed at page 456 from - https://static.docs.arm.com/ihi0069/c/IHI0069C_gic_architecture_specification.pdf. + https://developer.arm.com/documentation/ihi0069/c/?lang=en. Address offsets are relative to the Distributor base address defined by the system memory map. Unless otherwise stated in the register description, all GIC registers are 32-bits wide. @@ -156,7 +156,7 @@ fn compute_reg_len(gic: &DeviceFd, reg: &DistReg, base: u32) -> Result { // that the model has. It is also the type of register where // a register relates to multiple interrupts. 
end = base + (reg.bpi as u32 * (num_irq - LAYOUT_IRQ_BASE) / 8); - if reg.bpi as u32 * (num_irq - LAYOUT_IRQ_BASE) % 8 > 0 { + if !(reg.bpi as u32 * (num_irq - LAYOUT_IRQ_BASE)).is_multiple_of(8) { end += REG_SIZE as u32; } } diff --git a/hypervisor/src/kvm/aarch64/gic/icc_regs.rs b/hypervisor/src/kvm/aarch64/gic/icc_regs.rs index f993581840..b084c89899 100644 --- a/hypervisor/src/kvm/aarch64/gic/icc_regs.rs +++ b/hypervisor/src/kvm/aarch64/gic/icc_regs.rs @@ -7,10 +7,11 @@ use kvm_ioctls::DeviceFd; use crate::arch::aarch64::gic::{Error, Result}; use crate::device::HypervisorDeviceError; use crate::kvm::kvm_bindings::{ - kvm_device_attr, KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS, KVM_REG_ARM64_SYSREG_CRM_MASK, + KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRM_SHIFT, KVM_REG_ARM64_SYSREG_CRN_MASK, KVM_REG_ARM64_SYSREG_CRN_SHIFT, KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP0_SHIFT, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP1_SHIFT, KVM_REG_ARM64_SYSREG_OP2_MASK, KVM_REG_ARM64_SYSREG_OP2_SHIFT, + kvm_device_attr, }; const KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT: u32 = 32; diff --git a/hypervisor/src/kvm/aarch64/gic/mod.rs b/hypervisor/src/kvm/aarch64/gic/mod.rs index cf4619bd7c..8bb79be2b5 100644 --- a/hypervisor/src/kvm/aarch64/gic/mod.rs +++ b/hypervisor/src/kvm/aarch64/gic/mod.rs @@ -216,9 +216,7 @@ impl KvmGicV3Its { 0, )?; - /* Finalize the GIC. - * See https://code.woboq.org/linux/linux/virt/kvm/arm/vgic/vgic-kvm-device.c.html#211. - */ + // Finalize the GIC. Self::set_device_attribute( &self.device, kvm_bindings::KVM_DEV_ARM_VGIC_GRP_CTRL, diff --git a/hypervisor/src/kvm/aarch64/gic/redist_regs.rs b/hypervisor/src/kvm/aarch64/gic/redist_regs.rs index 7adc0efefc..c06818e046 100644 --- a/hypervisor/src/kvm/aarch64/gic/redist_regs.rs +++ b/hypervisor/src/kvm/aarch64/gic/redist_regs.rs @@ -4,15 +4,15 @@ use kvm_ioctls::DeviceFd; +use crate::CpuState; use crate::arch::aarch64::gic::{Error, Result}; use crate::device::HypervisorDeviceError; +use crate::kvm::VcpuKvmState; use crate::kvm::kvm_bindings::{ - kvm_device_attr, kvm_one_reg, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, KVM_REG_ARM64, - KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP0_SHIFT, - KVM_REG_ARM64_SYSREG_OP2_MASK, KVM_REG_ARM64_SYSREG_OP2_SHIFT, KVM_REG_SIZE_U64, + KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, + KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP0_SHIFT, KVM_REG_ARM64_SYSREG_OP2_MASK, + KVM_REG_ARM64_SYSREG_OP2_SHIFT, KVM_REG_SIZE_U64, kvm_device_attr, kvm_one_reg, }; -use crate::kvm::VcpuKvmState; -use crate::CpuState; // Relevant redistributor registers that we want to save/restore. const GICR_CTLR: u32 = 0x0000; diff --git a/hypervisor/src/kvm/aarch64/mod.rs b/hypervisor/src/kvm/aarch64/mod.rs index 0bef5e07d3..a94ed55f1c 100644 --- a/hypervisor/src/kvm/aarch64/mod.rs +++ b/hypervisor/src/kvm/aarch64/mod.rs @@ -11,39 +11,14 @@ pub mod gic; use kvm_bindings::{ - kvm_mp_state, kvm_one_reg, kvm_regs, KVM_REG_ARM_COPROC_MASK, KVM_REG_ARM_CORE, - KVM_REG_SIZE_MASK, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64, + KVM_REG_ARM_COPROC_MASK, KVM_REG_ARM_CORE, KVM_REG_SIZE_MASK, KVM_REG_SIZE_U32, + KVM_REG_SIZE_U64, kvm_mp_state, kvm_one_reg, kvm_regs, }; pub use kvm_ioctls::{Cap, Kvm}; use serde::{Deserialize, Serialize}; use crate::kvm::{KvmError, KvmResult}; -// This macro gets the offset of a structure (i.e `str`) member (i.e `field`) without having -// an instance of that structure. -#[macro_export] -macro_rules! 
offset_of { - ($str:ty, $field:ident) => {{ - let tmp: std::mem::MaybeUninit<$str> = std::mem::MaybeUninit::uninit(); - let base = tmp.as_ptr(); - - // Avoid warnings when nesting `unsafe` blocks. - #[allow(unused_unsafe)] - // SAFETY: The pointer is valid and aligned, just not initialised. Using `addr_of` ensures - // that we don't actually read from `base` (which would be UB) nor create an intermediate - // reference. - let member = unsafe { core::ptr::addr_of!((*base).$field) } as *const u8; - - // Avoid warnings when nesting `unsafe` blocks. - #[allow(unused_unsafe)] - // SAFETY: The two pointers are within the same allocated object `tmp`. All requirements - // from offset_from are upheld. - unsafe { - member.offset_from(base as *const u8) as usize - } - }}; -} - // Following are macros that help with getting the ID of a aarch64 core register. // The core register are represented by the user_pt_regs structure. Look for it in // arch/arm64/include/uapi/asm/ptrace.h. diff --git a/hypervisor/src/kvm/mod.rs b/hypervisor/src/kvm/mod.rs index 591a09586f..eea499da51 100644 --- a/hypervisor/src/kvm/mod.rs +++ b/hypervisor/src/kvm/mod.rs @@ -12,12 +12,10 @@ use std::any::Any; use std::collections::HashMap; -#[cfg(target_arch = "x86_64")] -use std::fs::File; -#[cfg(target_arch = "x86_64")] -use std::os::unix::io::AsRawFd; -#[cfg(feature = "tdx")] -use std::os::unix::io::RawFd; +#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] +use std::mem::offset_of; +#[cfg(any(feature = "tdx", feature = "kvm"))] +use std::os::unix::io::{AsRawFd, RawFd}; use std::result; #[cfg(target_arch = "x86_64")] use std::sync::atomic::{AtomicBool, Ordering}; @@ -29,46 +27,48 @@ use vmm_sys_util::eventfd::EventFd; #[cfg(target_arch = "aarch64")] use crate::aarch64::gic::KvmGicV3Its; #[cfg(target_arch = "aarch64")] -pub use crate::aarch64::{check_required_kvm_extensions, is_system_register, VcpuKvmState}; +pub use crate::aarch64::{VcpuKvmState, check_required_kvm_extensions, is_system_register}; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::{Vgic, VgicConfig}; #[cfg(target_arch = "riscv64")] use crate::arch::riscv64::aia::{Vaia, VaiaConfig}; +#[cfg(target_arch = "aarch64")] +use crate::arm64_core_reg_id; #[cfg(target_arch = "riscv64")] use crate::riscv64::aia::KvmAiaImsics; #[cfg(target_arch = "riscv64")] pub use crate::riscv64::{ - aia::AiaImsicsState as AiaState, check_required_kvm_extensions, is_non_core_register, - VcpuKvmState, + VcpuKvmState, aia::AiaImsicsState as AiaState, check_required_kvm_extensions, + is_non_core_register, }; -use crate::vm::{self, InterruptSourceConfig, VmOps}; -#[cfg(target_arch = "aarch64")] -use crate::{arm64_core_reg_id, offset_of}; -use crate::{cpu, hypervisor, vec_with_array_field, HypervisorType}; #[cfg(target_arch = "riscv64")] -use crate::{offset_of, riscv64_reg_id}; +use crate::riscv64_reg_id; +use crate::vm::{self, InterruptSourceConfig, VmOps}; +use crate::{HypervisorType, cpu, hypervisor}; // x86_64 dependencies #[cfg(target_arch = "x86_64")] pub mod x86_64; #[cfg(target_arch = "x86_64")] use kvm_bindings::{ - kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP, - KVM_GUESTDBG_USE_HW_BP, + KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP, KVM_CAP_X2APIC_API, KVM_GUESTDBG_USE_HW_BP, + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK, KVM_X2APIC_API_USE_32BIT_IDS, MsrList, kvm_enable_cap, + kvm_msr_entry, }; #[cfg(target_arch = "x86_64")] use x86_64::check_required_kvm_extensions; #[cfg(target_arch = "x86_64")] pub use x86_64::{CpuId, 
ExtendedControlRegisters, MsrEntries, VcpuKvmState}; +#[cfg(target_arch = "x86_64")] +use crate::ClockData; #[cfg(target_arch = "x86_64")] use crate::arch::x86::{ - CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, XsaveState, NUM_IOAPIC_PINS, + CpuIdEntry, FpuState, LapicState, MsrEntry, NUM_IOAPIC_PINS, SpecialRegisters, XsaveState, }; -#[cfg(target_arch = "x86_64")] -use crate::ClockData; use crate::{ - CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, UserMemoryRegion, + CpuState, IoEventAddress, IrqRoutingEntry, MpState, StandardRegisters, USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE, + UserMemoryRegion, }; // aarch64 dependencies #[cfg(target_arch = "aarch64")] @@ -84,44 +84,39 @@ use std::mem; /// #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))] pub use kvm_bindings::kvm_vcpu_events as VcpuEvents; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::nested::KvmNestedStateBuffer; pub use kvm_bindings::{ - kvm_clock_data, kvm_create_device, kvm_create_device as CreateDevice, - kvm_device_attr as DeviceAttr, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, - kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_run, kvm_userspace_memory_region, KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI, - KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, + KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID, kvm_clock_data, + kvm_create_device, kvm_create_device as CreateDevice, kvm_device_attr as DeviceAttr, + kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug, kvm_irq_routing, kvm_irq_routing_entry, + kvm_mp_state, kvm_run, kvm_userspace_memory_region, }; #[cfg(target_arch = "aarch64")] use kvm_bindings::{ - kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64, - KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK, - KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK, - KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64, + KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM_CORE, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, + KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK, KVM_REG_ARM64_SYSREG_OP0_MASK, + KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK, KVM_REG_SIZE_U32, + KVM_REG_SIZE_U64, KVM_REG_SIZE_U128, kvm_regs, user_pt_regs, }; #[cfg(target_arch = "riscv64")] -use kvm_bindings::{kvm_riscv_core, user_regs_struct, KVM_REG_RISCV_CORE}; +use kvm_bindings::{KVM_REG_RISCV_CORE, kvm_riscv_core}; #[cfg(feature = "tdx")] -use kvm_bindings::{kvm_run__bindgen_ty_1, KVMIO}; +use kvm_bindings::{KVMIO, kvm_run__bindgen_ty_1}; pub use kvm_ioctls::{Cap, Kvm, VcpuExit}; use thiserror::Error; use vfio_ioctls::VfioDeviceFd; +#[cfg(target_arch = "x86_64")] +use vmm_sys_util::ioctl_io_nr; #[cfg(feature = "tdx")] -use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr}; +use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_iowr_nr}; pub use {kvm_bindings, kvm_ioctls}; -#[cfg(target_arch = "aarch64")] -use crate::arch::aarch64::regs; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use crate::RegList; - -#[cfg(target_arch = "x86_64")] -const KVM_CAP_SGX_ATTRIBUTE: u32 = 196; - -#[cfg(target_arch = "x86_64")] -use vmm_sys_util::ioctl_io_nr; -#[cfg(all(not(feature = "tdx"), target_arch = "x86_64"))] -use vmm_sys_util::ioctl_ioc_nr; - +#[cfg(target_arch = "aarch64")] +use 
crate::arch::aarch64::regs; #[cfg(target_arch = "x86_64")] ioctl_io_nr!(KVM_NMI, kvm_bindings::KVMIO, 0x9a); @@ -492,6 +487,50 @@ impl KvmVm { pub fn check_extension(&self, c: Cap) -> bool { self.fd.check_extension(c) } + + #[cfg(target_arch = "x86_64")] + /// Translates the MSI extended destination ID bits according to the logic + /// found in the Linux kernel's KVM MSI handling in kvm_msi_to_lapic_irq()/x86_msi_msg_get_destid(): + /// https://github.com/torvalds/linux/blob/3957a5720157264dcc41415fbec7c51c4000fc2d/arch/x86/kvm/irq.c#L266 + /// https://github.com/torvalds/linux/blob/3957a5720157264dcc41415fbec7c51c4000fc2d/arch/x86/kernel/apic/apic.c#L2306 + /// + /// This function moves bits [11, 5] from `address_lo` to bits [46, 40] in the combined 64-bit + /// address, but only if the Remappable Format (RF) bit (bit 4) in `address_lo` is + /// not set and `address_hi` is zero. + /// + /// The function is roughly equivalent to `uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)` in + /// qemu/target/i386/kvm/kvm.c: + /// https://github.com/qemu/qemu/blob/88f72048d2f5835a1b9eaba690c7861393aef283/target/i386/kvm/kvm.c#L6258 + fn translate_msi_ext_dest_id(mut address_lo: u32, mut address_hi: u32) -> (u32, u32) { + // Mask for extracting the RF (Remappable Format) bit from address_lo. + // In the MSI specification, this is bit 4. See + // VT-d spec section "Interrupt Requests in Remappable Format" + const REMAPPABLE_FORMAT_BIT_MASK: u32 = 0x10; + let remappable_format_bit_is_set = (address_lo & REMAPPABLE_FORMAT_BIT_MASK) != 0; + + // Only perform the bit swizzling if the RF bit is unset and the upper + // 32 bits of the address are all zero. This identifies the legacy format. + if address_hi == 0 && !remappable_format_bit_is_set { + // "Move" the bits [11,5] to bits [46,40]. This is a shift of 35 bits, but + // since address is already split up into lo and hi, it's only a shift of + // 3 (35 - 32) within hi. + // "Move" via getting the bits via mask, zeroing out that range, and then + // ORing them back in at the correct location. The destination was already + // checked to be all zeroes. + const EXT_ID_MASK: u32 = 0xfe0; + const EXT_ID_SHIFT: u32 = 3; + let ext_id = address_lo & EXT_ID_MASK; + address_lo &= !EXT_ID_MASK; + address_hi |= ext_id << EXT_ID_SHIFT; + } + + (address_lo, address_hi) + } + + #[cfg(not(target_arch = "x86_64"))] + fn translate_msi_ext_dest_id(address_lo: u32, address_hi: u32) -> (u32, u32) { + (address_lo, address_hi) + } } /// Implementation of Vm trait for KVM @@ -559,7 +598,7 @@ impl vm::Vm for KvmVm { /// fn create_vcpu( &self, - id: u8, + id: u32, vm_ops: Option>, ) -> vm::Result> { let fd = self @@ -647,8 +686,12 @@ impl vm::Vm for KvmVm { ..Default::default() }; - kvm_route.u.msi.address_lo = cfg.low_addr; - kvm_route.u.msi.address_hi = cfg.high_addr; + let (address_lo, address_hi) = + Self::translate_msi_ext_dest_id(cfg.low_addr, cfg.high_addr); + + kvm_route.u.msi.address_lo = address_lo; + kvm_route.u.msi.address_hi = address_hi; + kvm_route.u.msi.data = cfg.data; if self.check_extension(crate::kvm::Cap::MsiDevid) { @@ -693,10 +736,6 @@ impl vm::Vm for KvmVm { /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl. 
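// NOTE (illustrative aside, not part of the diff): a worked example of the
// extended-destination-ID swizzling performed by translate_msi_ext_dest_id() above.
// The input address is hypothetical; the mask and shift mirror the constants used
// in that function (EXT_ID_MASK = 0xfe0, EXT_ID_SHIFT = 3).
fn msi_ext_dest_id_example() {
    // Legacy-format MSI address: RF bit (bit 4) clear, address_hi zero,
    // extended destination ID bits [11:5] all set.
    let address_lo: u32 = 0x0000_0fe0;
    let address_hi: u32 = 0;
    let ext_id = address_lo & 0xfe0;
    let new_lo = address_lo & !0xfe0;
    let new_hi = address_hi | (ext_id << 3);
    // Bits [11:5] of address_lo land in bits [14:8] of address_hi,
    // i.e. bits [46:40] of the combined 64-bit MSI address.
    assert_eq!(new_lo, 0x0000_0000);
    assert_eq!(new_hi, 0x0000_7f00);
}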
/// fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> { - let mut irq_routing = - vec_with_array_field::(entries.len()); - irq_routing[0].nr = entries.len() as u32; - irq_routing[0].flags = 0; let entries: Vec = entries .iter() .map(|entry| match entry { @@ -706,17 +745,11 @@ impl vm::Vm for KvmVm { }) .collect(); - // SAFETY: irq_routing initialized with entries.len() and now it is being turned into - // entries_slice with entries.len() again. It is guaranteed to be large enough to hold - // everything from entries. - unsafe { - let entries_slice: &mut [kvm_irq_routing_entry] = - irq_routing[0].entries.as_mut_slice(entries.len()); - entries_slice.copy_from_slice(&entries); - } + let irq_routing = + kvm_bindings::fam_wrappers::KvmIrqRouting::from_entries(&entries).unwrap(); self.fd - .set_gsi_routing(&irq_routing[0]) + .set_gsi_routing(&irq_routing) .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into())) } @@ -833,15 +866,24 @@ impl vm::Vm for KvmVm { } #[cfg(target_arch = "x86_64")] - fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> { + fn enable_x2apic_api(&self) -> vm::Result<()> { + // From https://docs.kernel.org/virt/kvm/api.html: + // On x86, kvm_msi::address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS feature of + // KVM_CAP_X2APIC_API capability is enabled. If it is enabled, address_hi bits 31-8 + // provide bits 31-8 of the destination id. Bits 7-0 of address_hi must be zero. + + // Thus KVM_X2APIC_API_USE_32BIT_IDS in combination with KVM_FEATURE_MSI_EXT_DEST_ID allows + // the guest to target interrupts to cpus with APIC IDs > 254. + let mut cap = kvm_enable_cap { - cap: KVM_CAP_SGX_ATTRIBUTE, + cap: KVM_CAP_X2APIC_API, ..Default::default() }; - cap.args[0] = file.as_raw_fd() as u64; + cap.args[0] = + (KVM_X2APIC_API_USE_32BIT_IDS | KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) as u64; self.fd .enable_cap(&cap) - .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?; + .map_err(|e| vm::HypervisorVmError::EnableX2ApicApi(e.into()))?; Ok(()) } @@ -1239,6 +1281,10 @@ impl hypervisor::Hypervisor for KvmHypervisor { self.create_vm_with_type(vm_type) } + fn check_extension_int(&self, capability: kvm_ioctls::Cap) -> i32 { + self.kvm.check_extension_int(capability) + } + fn check_required_extensions(&self) -> hypervisor::Result<()> { check_required_kvm_extensions(&self.kvm) .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into())) @@ -1369,7 +1415,7 @@ impl cpu::Vcpu for KvmVcpu { let mut state = kvm_regs::default(); let mut off = offset_of!(user_pt_regs, regs); // There are 31 user_pt_regs: - // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72 + // https://elixir.bootlin.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72 // These actually are the general-purpose registers of the Armv8-a // architecture (i.e x0-x30 if used as a 64bit register or w0-30 when used as a 32bit register). 
for i in 0..31 { @@ -1449,8 +1495,8 @@ impl cpu::Vcpu for KvmVcpu { } // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel: - // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53 - let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs); + // https://elixir.bootlin.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53 + let mut off = offset_of!(kvm_regs, fp_regs.vregs); for i in 0..32 { let mut bytes = [0_u8; 16]; self.fd @@ -1463,7 +1509,7 @@ impl cpu::Vcpu for KvmVcpu { } // Floating-point Status Register - let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr); + let off = offset_of!(kvm_regs, fp_regs.fpsr); let mut bytes = [0_u8; 4]; self.fd .lock() @@ -1473,7 +1519,7 @@ impl cpu::Vcpu for KvmVcpu { state.fp_regs.fpsr = u32::from_le_bytes(bytes); // Floating-point Control Register - let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr); + let off = offset_of!(kvm_regs, fp_regs.fpcr); let mut bytes = [0_u8; 4]; self.fd .lock() @@ -1507,7 +1553,7 @@ impl cpu::Vcpu for KvmVcpu { state.mode = u64::from_le_bytes(bytes); }; ($reg_name:ident) => { - let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name); + let off = offset_of!(kvm_riscv_core, regs.$reg_name); let mut bytes = [0_u8; 8]; self.fd .lock() @@ -1654,7 +1700,7 @@ impl cpu::Vcpu for KvmVcpu { off += std::mem::size_of::(); } - let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs); + let mut off = offset_of!(kvm_regs, fp_regs.vregs); for i in 0..32 { self.fd .lock() @@ -1667,7 +1713,7 @@ impl cpu::Vcpu for KvmVcpu { off += mem::size_of::(); } - let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr); + let off = offset_of!(kvm_regs, fp_regs.fpsr); self.fd .lock() .unwrap() @@ -1677,7 +1723,7 @@ impl cpu::Vcpu for KvmVcpu { ) .map_err(|e| cpu::HypervisorCpuError::SetAarchCoreRegister(e.into()))?; - let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr); + let off = offset_of!(kvm_regs, fp_regs.fpcr); self.fd .lock() .unwrap() @@ -1715,7 +1761,7 @@ impl cpu::Vcpu for KvmVcpu { .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?; }; ($reg_name:ident) => { - let off = offset_of!(kvm_riscv_core, regs, user_regs_struct, $reg_name); + let off = offset_of!(kvm_riscv_core, regs.$reg_name); self.fd .lock() .unwrap() @@ -1987,7 +2033,8 @@ impl cpu::Vcpu for KvmVcpu { /// Triggers the running of the current virtual CPU returning an exit reason. /// fn run(&self) -> std::result::Result { - match self.fd.lock().unwrap().run() { + let mut lock = self.fd.lock().unwrap(); + match lock.run() { Ok(run) => match run { #[cfg(target_arch = "x86_64")] VcpuExit::IoIn(addr, data) => { @@ -2066,7 +2113,11 @@ impl cpu::Vcpu for KvmVcpu { }, Err(ref e) => match e.errno() { - libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore), + libc::EINTR => { + lock.set_kvm_immediate_exit(0); + Ok(cpu::VmExit::Ignore) + } + libc::EAGAIN => Ok(cpu::VmExit::Ignore), _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!( "VCPU error {:?}", e @@ -2160,7 +2211,7 @@ impl cpu::Vcpu for KvmVcpu { &self, vm: &Arc, kvi: &mut crate::VcpuInit, - id: u8, + id: u32, ) -> cpu::Result<()> { use std::arch::is_aarch64_feature_detected; #[allow(clippy::nonminimal_bool)] @@ -2290,11 +2341,9 @@ impl cpu::Vcpu for KvmVcpu { /// Configure core registers for a given CPU. 
/// #[cfg(target_arch = "aarch64")] - fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { - let kreg_off = offset_of!(kvm_regs, regs); - + fn setup_regs(&self, cpu_id: u32, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { // Get the register index of the PSTATE (Processor State) register. - let pstate = offset_of!(user_pt_regs, pstate) + kreg_off; + let pstate = offset_of!(kvm_regs, regs.pstate); self.fd .lock() .unwrap() @@ -2307,7 +2356,7 @@ impl cpu::Vcpu for KvmVcpu { // Other vCPUs are powered off initially awaiting PSCI wakeup. if cpu_id == 0 { // Setting the PC (Processor Counter) to the current program address (kernel address). - let pc = offset_of!(user_pt_regs, pc) + kreg_off; + let pc = offset_of!(kvm_regs, regs.pc); self.fd .lock() .unwrap() @@ -2321,7 +2370,7 @@ impl cpu::Vcpu for KvmVcpu { // "The device tree blob (dtb) must be placed on an 8-byte boundary and must // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt. // We are choosing to place it the end of DRAM. See `get_fdt_addr`. - let regs0 = offset_of!(user_pt_regs, regs) + kreg_off; + let regs0 = offset_of!(kvm_regs, regs.regs); self.fd .lock() .unwrap() @@ -2338,9 +2387,9 @@ impl cpu::Vcpu for KvmVcpu { /// /// Configure registers for a given RISC-V CPU. /// - fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { + fn setup_regs(&self, cpu_id: u32, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { // Setting the A0 () to the hartid of this CPU. - let a0 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a0); + let a0 = offset_of!(kvm_riscv_core, regs.a0); self.fd .lock() .unwrap() @@ -2351,7 +2400,7 @@ impl cpu::Vcpu for KvmVcpu { .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?; // Setting the PC (Processor Counter) to the current program address (kernel address). - let pc = offset_of!(kvm_riscv_core, regs, user_regs_struct, pc); + let pc = offset_of!(kvm_riscv_core, regs.pc); self.fd .lock() .unwrap() @@ -2362,9 +2411,11 @@ impl cpu::Vcpu for KvmVcpu { .map_err(|e| cpu::HypervisorCpuError::SetRiscvCoreRegister(e.into()))?; // Last mandatory thing to set -> the address pointing to the FDT (also called DTB). + // + // In an earlier version of https://www.kernel.org/doc/Documentation/arch/riscv/boot.rst: // "The device tree blob (dtb) must be placed on an 8-byte boundary and must - // not exceed 64 kilobytes in size." -> https://www.kernel.org/doc/Documentation/arch/riscv/boot.txt. - let a1 = offset_of!(kvm_riscv_core, regs, user_regs_struct, a1); + // not exceed 64 kilobytes in size." + let a1 = offset_of!(kvm_riscv_core, regs.a1); self.fd .lock() .unwrap() @@ -2423,6 +2474,7 @@ impl cpu::Vcpu for KvmVcpu { let xcrs = self.get_xcrs()?; let lapic_state = self.get_lapic()?; let fpu = self.get_fpu()?; + let nested_state = self.nested_state()?; // Try to get all MSRs based on the list previously retrieved from KVM. 
// If the number of MSRs obtained from GET_MSRS is different from the @@ -2497,6 +2549,7 @@ impl cpu::Vcpu for KvmVcpu { xcrs, mp_state, tsc_khz, + nested_state, } .into()) } @@ -2663,6 +2716,9 @@ impl cpu::Vcpu for KvmVcpu { self.set_xcrs(&state.xcrs)?; self.set_lapic(&state.lapic_state)?; self.set_fpu(&state.fpu)?; + if let Some(nested_state) = state.nested_state { + self.set_nested_state(&nested_state)?; + } if let Some(freq) = state.tsc_khz { self.set_tsc_khz(freq)?; @@ -2769,6 +2825,13 @@ impl cpu::Vcpu for KvmVcpu { self.fd.lock().unwrap().set_kvm_immediate_exit(exit.into()); } + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + let kvm_vcpu = self.fd.lock().unwrap(); + let kvm_vcpu = &*kvm_vcpu; + kvm_vcpu.as_raw_fd() + } + /// /// Returns the details about TDX exit reason /// @@ -2825,7 +2888,7 @@ impl cpu::Vcpu for KvmVcpu { /// Return the list of initial MSR entries for a VCPU /// fn boot_msr_entries(&self) -> Vec { - use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB}; + use crate::arch::x86::{MTRR_ENABLE, MTRR_MEM_TYPE_WB, msr_index}; [ msr!(msr_index::MSR_IA32_SYSENTER_CS), @@ -2941,13 +3004,11 @@ impl KvmVcpu { /// X86 specific call that returns the vcpu's current "xsave struct". /// fn get_xsave(&self) -> cpu::Result { - Ok(self - .fd - .lock() - .unwrap() - .get_xsave() - .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))? - .into()) + XsaveState::with_initializer(|state| + // SAFETY: Any configured dynamically enabled state components are always enabled via + // static methods on `XsaveState` hence we know that `state` has the expected size. + unsafe { self.fd.lock().unwrap().get_xsave2(state) }) + .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(anyhow::Error::from_boxed(e))) } #[cfg(target_arch = "x86_64")] @@ -2955,11 +3016,10 @@ impl KvmVcpu { /// X86 specific call that sets the vcpu's current "xsave struct". /// fn set_xsave(&self, xsave: &XsaveState) -> cpu::Result<()> { - let xsave: kvm_bindings::kvm_xsave = (*xsave).clone().into(); - self.fd - .lock() - .unwrap() - .set_xsave(&xsave) + // SAFETY: Any configured dynamically enabled state components are always enabled via + // static methods on `XsaveState` hence we know that the wrapped instance has the + // expected size. + unsafe { self.fd.lock().unwrap().set_xsave2(&xsave.0) } .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into())) } @@ -3012,6 +3072,36 @@ impl KvmVcpu { .set_vcpu_events(events) .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into())) } + + /// Get the state of the nested guest from the current vCPU, + /// if there is any. + #[cfg(target_arch = "x86_64")] + fn nested_state(&self) -> cpu::Result> { + let mut buffer = KvmNestedStateBuffer::empty(); + + let maybe_size = self + .fd + .lock() + .unwrap() + .get_nested_state(&mut buffer) + .map_err(|e| cpu::HypervisorCpuError::GetNestedState(e.into()))?; + + if let Some(_size) = maybe_size { + Ok(Some(buffer)) + } else { + Ok(None) + } + } + + /// Sets the state of the nested guest for the current vCPU. 
+ #[cfg(target_arch = "x86_64")] + fn set_nested_state(&self, state: &KvmNestedStateBuffer) -> cpu::Result<()> { + self.fd + .lock() + .unwrap() + .set_nested_state(state) + .map_err(|e| cpu::HypervisorCpuError::GetNestedState(e.into())) + } } #[cfg(test)] @@ -3027,7 +3117,7 @@ mod tests { let vcpu0 = vm.create_vcpu(0, None).unwrap(); let core_regs = StandardRegisters::from(kvm_riscv_core { - regs: user_regs_struct { + regs: kvm_bindings::user_regs_struct { pc: 0x00, ra: 0x01, sp: 0x02, diff --git a/hypervisor/src/kvm/riscv64/aia.rs b/hypervisor/src/kvm/riscv64/aia.rs index 607c9034c5..1aebbafbe4 100644 --- a/hypervisor/src/kvm/riscv64/aia.rs +++ b/hypervisor/src/kvm/riscv64/aia.rs @@ -7,10 +7,10 @@ use std::any::Any; use kvm_ioctls::DeviceFd; use serde::{Deserialize, Serialize}; +use crate::Vm; use crate::arch::riscv64::aia::{Error, Result, Vaia, VaiaConfig}; use crate::device::HypervisorDeviceError; use crate::kvm::KvmVm; -use crate::Vm; pub struct KvmAiaImsics { /// The KVM device for the Aia diff --git a/hypervisor/src/kvm/riscv64/mod.rs b/hypervisor/src/kvm/riscv64/mod.rs index 94017d827b..07f54efb3d 100644 --- a/hypervisor/src/kvm/riscv64/mod.rs +++ b/hypervisor/src/kvm/riscv64/mod.rs @@ -5,49 +5,14 @@ pub mod aia; use kvm_bindings::{ - kvm_mp_state, kvm_one_reg, kvm_riscv_core, KVM_REG_RISCV_CORE, KVM_REG_RISCV_TYPE_MASK, - KVM_REG_SIZE_MASK, KVM_REG_SIZE_U64, + KVM_REG_RISCV_CORE, KVM_REG_RISCV_TYPE_MASK, KVM_REG_SIZE_MASK, KVM_REG_SIZE_U64, kvm_mp_state, + kvm_one_reg, kvm_riscv_core, }; pub use kvm_ioctls::{Cap, Kvm}; use serde::{Deserialize, Serialize}; use crate::kvm::{KvmError, KvmResult}; -// This macro gets the offset of a structure (i.e `str`) member (i.e `field`) without having -// an instance of that structure. -#[macro_export] -macro_rules! _offset_of { - ($str:ty, $field:ident) => {{ - let tmp: std::mem::MaybeUninit<$str> = std::mem::MaybeUninit::uninit(); - let base = tmp.as_ptr(); - - // Avoid warnings when nesting `unsafe` blocks. - #[allow(unused_unsafe)] - // SAFETY: The pointer is valid and aligned, just not initialised. Using `addr_of` ensures - // that we don't actually read from `base` (which would be UB) nor create an intermediate - // reference. - let member = unsafe { core::ptr::addr_of!((*base).$field) } as *const u8; - - // Avoid warnings when nesting `unsafe` blocks. - #[allow(unused_unsafe)] - // SAFETY: The two pointers are within the same allocated object `tmp`. All requirements - // from offset_from are upheld. - unsafe { - member.offset_from(base as *const u8) as usize - } - }}; -} - -#[macro_export] -macro_rules! offset_of { - ($reg_struct:ty, $field:ident) => { - $crate::_offset_of!($reg_struct, $field) - }; - ($outer_reg_struct:ty, $outer_field:ident, $($inner_reg_struct:ty, $inner_field:ident), +) => { - $crate::_offset_of!($outer_reg_struct, $outer_field) + offset_of!($($inner_reg_struct, $inner_field), +) - }; -} - // Following are macros that help with getting the ID of a riscv64 register, including config registers, core registers and timer registers. // The register of core registers are wrapped in the `user_regs_struct` structure. 
See: // https://elixir.bootlin.com/linux/v6.10/source/arch/riscv/include/uapi/asm/kvm.h#L62 diff --git a/hypervisor/src/kvm/x86_64/mod.rs b/hypervisor/src/kvm/x86_64/mod.rs index 4cf05a1ac3..2efd12b0e4 100644 --- a/hypervisor/src/kvm/x86_64/mod.rs +++ b/hypervisor/src/kvm/x86_64/mod.rs @@ -13,18 +13,18 @@ use serde::{Deserialize, Serialize}; /// Export generically-named wrappers of kvm-bindings for Unix-based platforms /// pub use { - kvm_bindings::kvm_cpuid_entry2, kvm_bindings::kvm_dtable, kvm_bindings::kvm_fpu, - kvm_bindings::kvm_lapic_state, kvm_bindings::kvm_mp_state as MpState, + kvm_bindings::CpuId, kvm_bindings::KVM_CPUID_FLAG_SIGNIFCANT_INDEX, kvm_bindings::MsrList, + kvm_bindings::Msrs as MsrEntries, kvm_bindings::kvm_cpuid_entry2, kvm_bindings::kvm_dtable, + kvm_bindings::kvm_fpu, kvm_bindings::kvm_lapic_state, kvm_bindings::kvm_mp_state as MpState, kvm_bindings::kvm_msr_entry, kvm_bindings::kvm_regs, kvm_bindings::kvm_segment, kvm_bindings::kvm_sregs, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_bindings::kvm_xcrs as ExtendedControlRegisters, kvm_bindings::kvm_xsave, - kvm_bindings::CpuId, kvm_bindings::MsrList, kvm_bindings::Msrs as MsrEntries, - kvm_bindings::KVM_CPUID_FLAG_SIGNIFCANT_INDEX, + kvm_bindings::nested::KvmNestedStateBuffer, }; use crate::arch::x86::{ - CpuIdEntry, DescriptorTable, FpuState, LapicState, MsrEntry, SegmentRegister, SpecialRegisters, - XsaveState, CPUID_FLAG_VALID_INDEX, + CPUID_FLAG_VALID_INDEX, CpuIdEntry, DescriptorTable, FpuState, LapicState, MsrEntry, + SegmentRegister, SpecialRegisters, XsaveState, }; use crate::kvm::{Cap, Kvm, KvmError, KvmResult}; @@ -76,6 +76,9 @@ pub struct VcpuKvmState { pub xcrs: ExtendedControlRegisters, pub mp_state: MpState, pub tsc_khz: Option, + // Option to prevent useless 8K (de)serialization when no nested + // state exists. 
+ pub nested_state: Option, } impl From for kvm_segment { @@ -288,18 +291,3 @@ impl From for kvm_msr_entry { } } } - -impl From for XsaveState { - fn from(s: kvm_xsave) -> Self { - Self { region: s.region } - } -} - -impl From for kvm_xsave { - fn from(s: XsaveState) -> Self { - Self { - region: s.region, - extra: Default::default(), - } - } -} diff --git a/hypervisor/src/lib.rs b/hypervisor/src/lib.rs index af383e3f3c..2e653708c5 100644 --- a/hypervisor/src/lib.rs +++ b/hypervisor/src/lib.rs @@ -61,7 +61,7 @@ pub use device::HypervisorDeviceError; #[cfg(all(feature = "kvm", target_arch = "aarch64"))] pub use kvm::aarch64; #[cfg(all(feature = "kvm", target_arch = "riscv64"))] -pub use kvm::{riscv64, AiaState}; +pub use kvm::{AiaState, riscv64}; pub use vm::{ DataMatch, HypervisorVmError, InterruptSourceConfig, LegacyIrqSourceConfig, MsiIrqSourceConfig, Vm, VmOps, @@ -69,7 +69,7 @@ pub use vm::{ pub use crate::hypervisor::{Hypervisor, HypervisorError}; -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Copy, Clone, serde::Serialize, serde::Deserialize, PartialEq, Eq)] pub enum HypervisorType { #[cfg(feature = "kvm")] Kvm, diff --git a/hypervisor/src/mshv/mod.rs b/hypervisor/src/mshv/mod.rs index 7783a2fc18..39a47fb733 100644 --- a/hypervisor/src/mshv/mod.rs +++ b/hypervisor/src/mshv/mod.rs @@ -14,7 +14,7 @@ use arc_swap::ArcSwap; use mshv_bindings::*; #[cfg(target_arch = "x86_64")] use mshv_ioctls::InterruptRequest; -use mshv_ioctls::{set_registers_64, Mshv, NoDatamatch, VcpuFd, VmFd, VmType}; +use mshv_ioctls::{Mshv, NoDatamatch, VcpuFd, VmFd, VmType, set_registers_64}; use vfio_ioctls::VfioDeviceFd; use vm::DataMatch; #[cfg(feature = "sev_snp")] @@ -32,7 +32,7 @@ use crate::arch::x86::emulator::Emulator; use crate::mshv::aarch64::emulator; use crate::mshv::emulator::MshvEmulatorContext; use crate::vm::{self, InterruptSourceConfig, VmOps}; -use crate::{cpu, hypervisor, vec_with_array_field, HypervisorType}; +use crate::{HypervisorType, cpu, hypervisor, vec_with_array_field}; #[cfg(feature = "sev_snp")] mod snp_constants; // x86_64 dependencies @@ -41,16 +41,16 @@ pub mod x86_64; // aarch64 dependencies #[cfg(target_arch = "aarch64")] pub mod aarch64; -#[cfg(target_arch = "x86_64")] -use std::fs::File; +#[cfg(feature = "kvm")] +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; #[cfg(target_arch = "aarch64")] use std::sync::Mutex; -#[cfg(target_arch = "aarch64")] -use aarch64::gic::{MshvGicV2M, BASE_SPI_IRQ}; #[cfg(target_arch = "aarch64")] pub use aarch64::VcpuMshvState; +#[cfg(target_arch = "aarch64")] +use aarch64::gic::{BASE_SPI_IRQ, MshvGicV2M}; #[cfg(feature = "sev_snp")] use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; #[cfg(feature = "sev_snp")] @@ -59,7 +59,7 @@ use vmm_sys_util::eventfd::EventFd; #[cfg(target_arch = "x86_64")] pub use x86_64::*; #[cfg(target_arch = "x86_64")] -pub use x86_64::{emulator, VcpuMshvState}; +pub use x86_64::{VcpuMshvState, emulator}; /// /// Export generically-named wrappers of mshv-bindings for Unix-based platforms /// @@ -68,18 +68,18 @@ pub use { mshv_bindings::mshv_device_attr as DeviceAttr, mshv_ioctls, mshv_ioctls::DeviceFd, }; +#[cfg(target_arch = "x86_64")] +use crate::ClockData; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::{Vgic, VgicConfig}; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::regs; #[cfg(target_arch = "x86_64")] use crate::arch::x86::{CpuIdEntry, FpuState, MsrEntry}; -#[cfg(target_arch = "x86_64")] -use crate::ClockData; use crate::{ - CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion, - 
USER_MEMORY_REGION_ADJUSTABLE, USER_MEMORY_REGION_EXECUTE, USER_MEMORY_REGION_READ, - USER_MEMORY_REGION_WRITE, + CpuState, IoEventAddress, IrqRoutingEntry, MpState, USER_MEMORY_REGION_ADJUSTABLE, + USER_MEMORY_REGION_EXECUTE, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE, + UserMemoryRegion, }; pub const PAGE_SHIFT: usize = 12; @@ -427,6 +427,12 @@ impl hypervisor::Hypervisor for MshvHypervisor { let vm_type = 0; self.create_vm_with_type(vm_type) } + + #[cfg(feature = "kvm")] + fn check_extension_int(&self, _capability: kvm_ioctls::Cap) -> i32 { + unimplemented!() + } + #[cfg(target_arch = "x86_64")] /// /// Get the supported CpuID @@ -1262,7 +1268,7 @@ impl cpu::Vcpu for MshvVcpu { } #[cfg(target_arch = "aarch64")] - fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { + fn setup_regs(&self, cpu_id: u32, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> { let arr_reg_name_value = [( hv_register_name_HV_ARM64_REGISTER_PSTATE, regs::PSTATE_FAULT_BITS_64, @@ -1324,7 +1330,7 @@ impl cpu::Vcpu for MshvVcpu { &self, _vm: &Arc, _kvi: &mut crate::VcpuInit, - _id: u8, + _id: u32, ) -> cpu::Result<()> { Ok(()) } @@ -1514,7 +1520,7 @@ impl cpu::Vcpu for MshvVcpu { /// Return the list of initial MSR entries for a VCPU /// fn boot_msr_entries(&self) -> Vec { - use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB}; + use crate::arch::x86::{MTRR_ENABLE, MTRR_MEM_TYPE_WB, msr_index}; [ msr!(msr_index::MSR_IA32_SYSENTER_CS), @@ -1576,6 +1582,11 @@ impl cpu::Vcpu for MshvVcpu { Ok(()) } + + #[cfg(feature = "kvm")] + unsafe fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + unimplemented!() + } } impl MshvVcpu { @@ -1834,9 +1845,10 @@ impl vm::Vm for MshvVm { /// fn create_vcpu( &self, - id: u8, + id: u32, vm_ops: Option>, ) -> vm::Result> { + let id: u8 = id.try_into().unwrap(); let vcpu_fd = self .fd .create_vcpu(id) @@ -1891,11 +1903,6 @@ impl vm::Vm for MshvVm { Ok(()) } - #[cfg(target_arch = "x86_64")] - fn enable_sgx_attribute(&self, _file: File) -> vm::Result<()> { - Ok(()) - } - fn register_ioevent( &self, fd: &EventFd, diff --git a/hypervisor/src/mshv/x86_64/emulator.rs b/hypervisor/src/mshv/x86_64/emulator.rs index a8f38ba86f..4ecdee2a42 100644 --- a/hypervisor/src/mshv/x86_64/emulator.rs +++ b/hypervisor/src/mshv/x86_64/emulator.rs @@ -44,12 +44,12 @@ impl MshvEmulatorContext<'_> { gpa ); - if let Some(vm_ops) = &self.vcpu.vm_ops { - if vm_ops.guest_mem_read(gpa, data).is_err() { - vm_ops - .mmio_read(gpa, data) - .map_err(|e| PlatformError::MemoryReadFailure(e.into()))?; - } + if let Some(vm_ops) = &self.vcpu.vm_ops + && vm_ops.guest_mem_read(gpa, data).is_err() + { + vm_ops + .mmio_read(gpa, data) + .map_err(|e| PlatformError::MemoryReadFailure(e.into()))?; } Ok(()) @@ -94,12 +94,12 @@ impl MshvEmulatorContext<'_> { gpa ); - if let Some(vm_ops) = &self.vcpu.vm_ops { - if vm_ops.guest_mem_write(gpa, data).is_err() { - vm_ops - .mmio_write(gpa, data) - .map_err(|e| PlatformError::MemoryWriteFailure(e.into()))?; - } + if let Some(vm_ops) = &self.vcpu.vm_ops + && vm_ops.guest_mem_write(gpa, data).is_err() + { + vm_ops + .mmio_write(gpa, data) + .map_err(|e| PlatformError::MemoryWriteFailure(e.into()))?; } Ok(()) diff --git a/hypervisor/src/mshv/x86_64/mod.rs b/hypervisor/src/mshv/x86_64/mod.rs index 1853d234d8..a25dcc3ca7 100644 --- a/hypervisor/src/mshv/x86_64/mod.rs +++ b/hypervisor/src/mshv/x86_64/mod.rs @@ -21,16 +21,16 @@ pub mod emulator; /// Export generically-named wrappers of mshv_bindings for Unix-based platforms /// pub use { - 
mshv_bindings::hv_cpuid_entry, mshv_bindings::mshv_user_mem_region as MemoryRegion, - mshv_bindings::msr_entry, mshv_bindings::AllVpStateComponents, mshv_bindings::CpuId, - mshv_bindings::DebugRegisters, mshv_bindings::FloatingPointUnit, - mshv_bindings::LapicState as MshvLapicState, mshv_bindings::MiscRegs as MiscRegisters, - mshv_bindings::MsrList, mshv_bindings::Msrs as MsrEntries, mshv_bindings::Msrs, + mshv_bindings::AllVpStateComponents, mshv_bindings::CpuId, mshv_bindings::DebugRegisters, + mshv_bindings::FloatingPointUnit, mshv_bindings::LapicState as MshvLapicState, + mshv_bindings::MiscRegs as MiscRegisters, mshv_bindings::MsrList, + mshv_bindings::Msrs as MsrEntries, mshv_bindings::Msrs, mshv_bindings::SegmentRegister as MshvSegmentRegister, mshv_bindings::SpecialRegisters as MshvSpecialRegisters, mshv_bindings::StandardRegisters as MshvStandardRegisters, mshv_bindings::SuspendRegisters, mshv_bindings::TableRegister, mshv_bindings::VcpuEvents, mshv_bindings::XSave as Xsave, - mshv_bindings::Xcrs as ExtendedControlRegisters, + mshv_bindings::Xcrs as ExtendedControlRegisters, mshv_bindings::hv_cpuid_entry, + mshv_bindings::mshv_user_mem_region as MemoryRegion, mshv_bindings::msr_entry, }; #[derive(Clone, Serialize, Deserialize)] @@ -60,16 +60,18 @@ impl fmt::Display for VcpuMshvState { msr_entries[i][1] = entry.data; msr_entries[i][0] = entry.index as u64; } - write!(f, "Number of MSRs: {}: MSRs: {:#010X?}, -- VCPU Events: {:?} -- Standard registers: {:?} Special Registers: {:?} ---- Floating Point Unit: {:?} --- Extended Control Register: {:?} --- DBG: {:?} --- VP States: {:?}", - msr_entries.len(), - msr_entries, - self.vcpu_events, - self.regs, - self.sregs, - self.fpu, - self.xcrs, - self.dbg, - self.vp_states, + write!( + f, + "Number of MSRs: {}: MSRs: {:#010X?}, -- VCPU Events: {:?} -- Standard registers: {:?} Special Registers: {:?} ---- Floating Point Unit: {:?} --- Extended Control Register: {:?} --- DBG: {:?} --- VP States: {:?}", + msr_entries.len(), + msr_entries, + self.vcpu_events, + self.regs, + self.sregs, + self.fpu, + self.xcrs, + self.dbg, + self.vp_states, ) } } diff --git a/hypervisor/src/vm.rs b/hypervisor/src/vm.rs index 306aed0ff8..a2f7921314 100644 --- a/hypervisor/src/vm.rs +++ b/hypervisor/src/vm.rs @@ -11,8 +11,6 @@ // use std::any::Any; -#[cfg(target_arch = "x86_64")] -use std::fs::File; use std::sync::Arc; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use std::sync::Mutex; @@ -22,6 +20,8 @@ use igvm_defs::IGVM_VHS_SNP_ID_BLOCK; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; +#[cfg(target_arch = "x86_64")] +use crate::ClockData; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::{Vgic, VgicConfig}; #[cfg(target_arch = "riscv64")] @@ -29,8 +29,6 @@ use crate::arch::riscv64::aia::{Vaia, VaiaConfig}; #[cfg(feature = "tdx")] use crate::arch::x86::CpuIdEntry; use crate::cpu::Vcpu; -#[cfg(target_arch = "x86_64")] -use crate::ClockData; use crate::{IoEventAddress, IrqRoutingEntry, UserMemoryRegion}; /// @@ -126,11 +124,10 @@ pub enum HypervisorVmError { #[error("Failed to enable split Irq")] EnableSplitIrq(#[source] anyhow::Error), /// - /// Enable SGX attribute error - /// - #[error("Failed to enable SGX attribute")] - EnableSgxAttribute(#[source] anyhow::Error), + /// Enable x2apic API error /// + #[error("Failed to enable x2apic API")] + EnableX2ApicApi(#[source] anyhow::Error), /// Get clock error /// #[error("Failed to get clock")] @@ -319,7 +316,7 @@ pub trait Vm: Send + Sync + Any { /// Unregister an event that 
will, when signaled, trigger the `gsi` IRQ. fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> Result<()>; /// Creates a new KVM vCPU file descriptor and maps the memory corresponding - fn create_vcpu(&self, id: u8, vm_ops: Option>) -> Result>; + fn create_vcpu(&self, id: u32, vm_ops: Option>) -> Result>; #[cfg(target_arch = "aarch64")] fn create_vgic(&self, config: VgicConfig) -> Result>>; #[cfg(target_arch = "riscv64")] @@ -358,8 +355,6 @@ pub trait Vm: Send + Sync + Any { /// Enable split Irq capability #[cfg(target_arch = "x86_64")] fn enable_split_irq(&self) -> Result<()>; - #[cfg(target_arch = "x86_64")] - fn enable_sgx_attribute(&self, file: File) -> Result<()>; /// Retrieve guest clock. #[cfg(target_arch = "x86_64")] fn get_clock(&self) -> Result; @@ -440,6 +435,11 @@ pub trait Vm: Send + Sync + Any { fn gain_page_access(&self, _gpa: u64, _size: u32) -> Result<()> { Ok(()) } + + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + fn enable_x2apic_api(&self) -> Result<()> { + unimplemented!("x2Apic is only supported on KVM/Linux hosts") + } } pub trait VmOps: Send + Sync { diff --git a/net_gen/Cargo.toml b/net_gen/Cargo.toml index c0edc11559..a99c7c995d 100644 --- a/net_gen/Cargo.toml +++ b/net_gen/Cargo.toml @@ -1,8 +1,12 @@ [package] authors = ["The Chromium OS Authors"] edition = "2021" +#edition.workspace = true name = "net_gen" version = "0.1.0" [dependencies] vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/net_util/Cargo.toml b/net_util/Cargo.toml index 981b5b3d9e..a55db49f8a 100644 --- a/net_util/Cargo.toml +++ b/net_util/Cargo.toml @@ -1,17 +1,17 @@ [package] authors = ["The Chromium OS Authors"] -edition = "2021" +edition.workspace = true name = "net_util" version = "0.1.0" [dependencies] -epoll = "4.3.3" +epoll = { workspace = true } getrandom = "0.3.3" -libc = "0.2.167" -log = "0.4.22" +libc = { workspace = true } +log = { workspace = true } net_gen = { path = "../net_gen" } rate_limiter = { path = "../rate_limiter" } -serde = { version = "1.0.208", features = ["derive"] } +serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } virtio-bindings = { workspace = true } virtio-queue = { workspace = true } @@ -27,3 +27,6 @@ vmm-sys-util = { workspace = true } pnet = "0.35.0" pnet_datalink = "0.35.0" serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/net_util/src/lib.rs b/net_util/src/lib.rs index a28bcc433a..4ad7a1c77e 100644 --- a/net_util/src/lib.rs +++ b/net_util/src/lib.rs @@ -23,18 +23,18 @@ use std::{io, mem, net}; use serde::{Deserialize, Serialize}; use thiserror::Error; use virtio_bindings::virtio_net::{ - virtio_net_hdr_v1, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN, - VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_TSO4, - VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_MAC, VIRTIO_NET_F_MQ, + VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX, VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN, VIRTIO_NET_F_GUEST_CSUM, + VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_MAC, VIRTIO_NET_F_MQ, virtio_net_hdr_v1, }; -use vm_memory::bitmap::AtomicBitmap; use vm_memory::ByteValued; +use vm_memory::bitmap::AtomicBitmap; type GuestMemoryMmap = vm_memory::GuestMemoryMmap; pub use ctrl_queue::{CtrlQueue, Error as CtrlQueueError}; -pub use mac::{MacAddr, MAC_ADDR_LEN}; -pub use open_tap::{open_tap, Error as OpenTapError}; +pub use mac::{MAC_ADDR_LEN, MacAddr}; +pub use open_tap::{Error as OpenTapError, 
open_tap}; pub use queue_pair::{NetCounters, NetQueuePair, NetQueuePairError, RxVirtio, TxVirtio}; pub use tap::{Error as TapError, Tap}; diff --git a/net_util/src/open_tap.rs b/net_util/src/open_tap.rs index e711529ca7..95924d6df9 100644 --- a/net_util/src/open_tap.rs +++ b/net_util/src/open_tap.rs @@ -8,7 +8,7 @@ use std::{fs, io}; use thiserror::Error; -use super::{vnet_hdr_len, MacAddr, Tap, TapError}; +use super::{MacAddr, Tap, TapError, vnet_hdr_len}; #[derive(Error, Debug)] pub enum Error { @@ -76,7 +76,14 @@ fn open_tap_rx_q_0( let tap = match if_name { Some(name) => Tap::open_named(name, num_rx_q, flags).map_err(Error::TapOpen)?, // Create a new Tap device in Linux, if none was specified. - None => Tap::new(num_rx_q).map_err(Error::TapOpen)?, + None => { + let tap = Tap::new(num_rx_q).map_err(Error::TapOpen)?; + log::info!( + "Created tap device: name={}, num_rx_q={num_rx_q}", + tap.if_name_as_str() + ); + tap + } }; // Don't overwrite ip configuration of existing interfaces: if !tap_exists { @@ -135,7 +142,7 @@ pub fn open_tap( // same device. tap = open_tap_rx_q_0(if_name, ip_addr, netmask, host_mac, mtu, num_rx_q, flags)?; // Set the name of the tap device we open in subsequent iterations. - ifname = String::from_utf8(tap.get_if_name()).unwrap(); + ifname = tap.if_name_as_str().to_string(); } else { tap = Tap::open_named(ifname.as_str(), num_rx_q, flags).map_err(Error::TapOpen)?; diff --git a/net_util/src/queue_pair.rs b/net_util/src/queue_pair.rs index f28d759fe5..63fe677509 100644 --- a/net_util/src/queue_pair.rs +++ b/net_util/src/queue_pair.rs @@ -5,8 +5,8 @@ use std::io; use std::num::Wrapping; use std::os::unix::io::{AsRawFd, RawFd}; -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; use rate_limiter::{RateLimiter, TokenType}; use thiserror::Error; @@ -15,7 +15,7 @@ use vm_memory::bitmap::Bitmap; use vm_memory::{Bytes, GuestMemory}; use vm_virtio::{AccessPlatform, Translatable}; -use super::{register_listener, unregister_listener, vnet_hdr_len, Tap}; +use super::{Tap, register_listener, unregister_listener, vnet_hdr_len}; #[derive(Clone)] pub struct TxVirtio { diff --git a/net_util/src/tap.rs b/net_util/src/tap.rs index 1dc0b7f486..222ca115d0 100644 --- a/net_util/src/tap.rs +++ b/net_util/src/tap.rs @@ -15,8 +15,8 @@ use thiserror::Error; use vmm_sys_util::ioctl::{ioctl_with_mut_ref, ioctl_with_ref, ioctl_with_val}; use super::{ - create_inet_socket, create_sockaddr, create_unix_socket, vnet_hdr_len, Error as NetUtilError, - MacAddr, + Error as NetUtilError, MacAddr, create_inet_socket, create_sockaddr, create_unix_socket, + vnet_hdr_len, }; use crate::mac::MAC_ADDR_LEN; @@ -65,6 +65,16 @@ pub struct Tap { if_name: Vec, } +impl Drop for Tap { + fn drop(&mut self) { + debug!( + "Dropping Tap: if_name={}, FD={}", + self.if_name_as_str(), + self.tap_file.as_raw_fd() + ); + } +} + impl PartialEq for Tap { fn eq(&self, other: &Tap) -> bool { self.if_name == other.if_name @@ -129,8 +139,15 @@ fn ipv6_mask_to_prefix(mask: Ipv6Addr) -> Result { } impl Tap { + /// The default naming scheme for Tap devices that are created by Cloud Hypervisor. + pub const DEFAULT_NAME_SCHEME: &'static str = "vmtap%d"; + + /// # Safety + /// The caller should ensure to pass a valid file descriptor and valid + /// arguments for the `ioctl()` syscall. 
unsafe fn ioctl_with_mut_ref(fd: &F, req: c_ulong, arg: &mut T) -> Result<()> { - let ret = ioctl_with_mut_ref(fd, req, arg); + // SAFETY: file descriptor is valid and return value is checked + let ret = unsafe { ioctl_with_mut_ref(fd, req, arg) }; if ret < 0 { return Err(Error::IoctlError(req, IoError::last_os_error())); } @@ -138,8 +155,12 @@ impl Tap { Ok(()) } + /// # Safety + /// The caller should ensure to pass a valid file descriptor and valid + /// arguments for the `ioctl()` syscall. unsafe fn ioctl_with_ref(fd: &F, req: c_ulong, arg: &T) -> Result<()> { - let ret = ioctl_with_ref(fd, req, arg); + // SAFETY: file descriptor is valid and return value is checked + let ret = unsafe { ioctl_with_ref(fd, req, arg) }; if ret < 0 { return Err(Error::IoctlError(req, IoError::last_os_error())); } @@ -147,8 +168,12 @@ impl Tap { Ok(()) } + /// # Safety + /// The caller should ensure to pass a valid file descriptor and valid + /// arguments for the `ioctl()` syscall. unsafe fn ioctl_with_val(fd: &F, req: c_ulong, arg: c_ulong) -> Result<()> { - let ret = ioctl_with_val(fd, req, arg); + // SAFETY: file descriptor is valid and return value is checked + let ret = unsafe { ioctl_with_val(fd, req, arg) }; if ret < 0 { return Err(Error::IoctlError(req, IoError::last_os_error())); } @@ -171,6 +196,7 @@ impl Tap { if fd < 0 { return Err(Error::OpenTun(IoError::last_os_error())); } + debug!("Opening Tap device with given name: ifname={if_name}, fd={fd}"); // SAFETY: We just checked that the fd is valid. let tuntap = unsafe { File::from_raw_fd(fd) }; @@ -224,7 +250,7 @@ impl Tap { /// Create a new tap interface. pub fn new(num_queue_pairs: usize) -> Result { - Self::open_named("vmtap%d", num_queue_pairs, None) + Self::open_named(Self::DEFAULT_NAME_SCHEME, num_queue_pairs, None) } pub fn from_tap_fd(fd: RawFd, num_queue_pairs: usize) -> Result { @@ -481,8 +507,31 @@ impl Tap { ifreq } - pub fn get_if_name(&self) -> Vec { - self.if_name.clone() + /// Returns the raw bytes of the interface name, which may or may not be + /// valid UTF-8. + pub fn if_name_as_bytes(&self) -> &[u8] { + &self.if_name + } + + /// Returns the interface name as a string, truncated at the first NUL byte + /// if present. + /// + /// # Panics + /// Panics if the interface name is not encoded as valid UTF-8. This can + /// only be caused by unrecoverable internal errors, as users and management + /// software are only allowed to specify interface names as Rust strings, + /// thus valid UTF-8. Also, interface names generated by CHV itself are + /// always created from Rust strings, thus valid UTF-8. + pub fn if_name_as_str(&self) -> &str { + // All bytes until first NUL. + let nul_terminated = self + .if_name_as_bytes() + .split(|&b| b == 0) + .next() + .unwrap_or(&[]); + + // Panicking here is fine, see function documentation. + std::str::from_utf8(nul_terminated).expect("Tap interface name should be valid UTF-8") } #[cfg(fuzzing)] @@ -514,9 +563,10 @@ impl AsRawFd for Tap { } #[cfg(test)] +#[cfg(devcli_testenv)] // we need special permissions in the ENV to create Tap devices mod tests { use std::net::Ipv4Addr; - use std::sync::{mpsc, LazyLock, Mutex}; + use std::sync::{LazyLock, Mutex, mpsc}; use std::time::Duration; use std::{str, thread}; @@ -825,13 +875,15 @@ mod tests { // We use a separate thread to wait for the test packet because the API exposed by pnet is // blocking. This thread will be killed when the main thread exits.
- let _handle = thread::spawn(move || loop { - let buf = rx.next().unwrap(); - let p = ParsedPkt::new(buf); - p.print(); - - if let Some(ref udp) = p.udp { - if payload == udp.payload() { + let _handle = thread::spawn(move || { + loop { + let buf = rx.next().unwrap(); + let p = ParsedPkt::new(buf); + p.print(); + + if let Some(ref udp) = p.udp + && payload == udp.payload() + { channel_tx.send(true).unwrap(); break; } diff --git a/option_parser/Cargo.toml b/option_parser/Cargo.toml index 2b6d0fe110..3d76690b41 100644 --- a/option_parser/Cargo.toml +++ b/option_parser/Cargo.toml @@ -1,8 +1,11 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "option_parser" version = "0.1.0" [dependencies] thiserror = { workspace = true } + +[lints] +workspace = true diff --git a/option_parser/src/lib.rs b/option_parser/src/lib.rs index f83524a20b..8f6760b5ab 100644 --- a/option_parser/src/lib.rs +++ b/option_parser/src/lib.rs @@ -9,6 +9,23 @@ use std::str::FromStr; use thiserror::Error; +mod private_trait { + // Voldemort trait that dispatches to `FromStr::from_str` on externally-defined types + // and to custom parsing code for types in this module. + pub trait Parseable + where + Self: Sized, + { + type Err; + // Actually does the parsing, but panics if the input doesn't have + // balanced quotes. This is fine because split_commas checks that the + // input has balanced quotes, and option names cannot contain anything + // that split_commas treats as special. + fn from_str(input: &str) -> Result::Err>; + } +} +use private_trait::Parseable; + #[derive(Default)] pub struct OptionParser { options: HashMap, @@ -29,43 +46,41 @@ pub enum OptionParserError { Conversion(String /* field */, String /* value */), #[error("invalid value: {0}")] InvalidValue(String), + #[error("failed to convert {1}")] + NumberConversion(#[source] ParseIntError, String), } type OptionParserResult = std::result::Result; fn split_commas(s: &str) -> OptionParserResult> { let mut list: Vec = Vec::new(); - let mut opened_brackets = 0; + let mut opened_brackets = 0u64; let mut in_quotes = false; let mut current = String::new(); for c in s.trim().chars() { match c { - '[' => { - opened_brackets += 1; - current.push('['); - } + // In quotes, only '"' is special + '"' => in_quotes = !in_quotes, + _ if in_quotes => {} + '[' => opened_brackets += 1, ']' => { - opened_brackets -= 1; - if opened_brackets < 0 { + if opened_brackets < 1 { return Err(OptionParserError::InvalidSyntax(s.to_owned())); } - current.push(']'); + opened_brackets -= 1; } - '"' => in_quotes = !in_quotes, - ',' => { - if opened_brackets > 0 || in_quotes { - current.push(',') - } else { - list.push(current); - current = String::new(); - } + ',' if opened_brackets == 0 => { + list.push(current); + current = String::new(); + continue; } - c => current.push(c), - } + _ => {} + }; + current.push(c); } list.push(current); - if opened_brackets != 0 || in_quotes { + if in_quotes || opened_brackets != 0 { return Err(OptionParserError::InvalidSyntax(s.to_owned())); } @@ -86,7 +101,6 @@ impl OptionParser { for option in split_commas(input)?.iter() { let parts: Vec<&str> = option.splitn(2, '=').collect(); - match self.options.get_mut(parts[0]) { None => return Err(OptionParserError::UnknownOption(parts[0].to_owned())), Some(value) => { @@ -106,6 +120,12 @@ impl OptionParser { } pub fn add(&mut self, option: &str) -> &mut Self { + // Check that option=value has balanced + // quotes and brackets iff value does. 
+ assert!( + !option.contains(['"', '[', ']', '=', ',']), + "forbidden character in option name" + ); self.options.insert( option.to_owned(), OptionParserValue { @@ -133,7 +153,13 @@ impl OptionParser { self.options .get(option) .and_then(|v| v.value.clone()) - .and_then(|s| if s.is_empty() { None } else { Some(s) }) + .and_then(|s| { + if s.is_empty() { + None + } else { + Some(dequote(&s)) + } + }) } pub fn is_set(&self, option: &str) -> bool { @@ -143,12 +169,53 @@ impl OptionParser { .is_some() } - pub fn convert(&self, option: &str) -> OptionParserResult> { - match self.get(option) { + /// Parses the `addr` option of PCI devices and returns the PCI device as well as the function ID. + /// + /// Returns a tuple consisting of the parsed IDs for device and function in this order. Returns an error if the + /// supplied `addr` values cannot be parsed to [`u8`]. The tuple is `(None, None)` if `addr` was + /// not provided. + pub fn get_pci_device_function( + &self, + ) -> OptionParserResult<(Option, Option)> { + if let Some(addr_str) = self.get("addr") { + let (device_str, function_str) = addr_str + .split_once('.') + .ok_or(OptionParserError::InvalidValue(addr_str.to_owned()))?; + + // We also accept hex numbers with a `0x` prefix, but need to strip it before conversion in case it's present. + let device_str = device_str.strip_prefix("0x").unwrap_or(device_str); + let device_id = u8::from_str_radix(device_str, 16) + .map_err(|e| OptionParserError::NumberConversion(e, addr_str.to_owned()))?; + + let function_str = function_str.strip_prefix("0x").unwrap_or(function_str); + let function_id = u8::from_str_radix(function_str, 16) + .map_err(|e| OptionParserError::NumberConversion(e, addr_str.to_owned()))?; + + // Currently CHV only supports single-function devices. Those are mapped to function ID 0 in all cases, so we + // disallow the assignment of any other function ID. + if function_id != 0 { + todo!( + "Currently no multi-function devices are supported! Please use `0` as the function ID." + ); + } + Ok((Some(device_id), Some(function_id))) + } else { + Ok((None, None)) + } + } + + pub fn convert(&self, option: &str) -> OptionParserResult> { + match self.options.get(option).and_then(|v| v.value.as_ref()) { None => Ok(None), - Some(v) => Ok(Some(v.parse().map_err(|_| { - OptionParserError::Conversion(option.to_owned(), v.to_owned()) - })?)), + Some(v) => { + Ok(if v.is_empty() { + None + } else { + Some(Parseable::from_str(v).map_err(|_| { + OptionParserError::Conversion(option.to_owned(), v.to_owned()) + })?)
+ }) + } } } } @@ -161,7 +228,7 @@ pub enum ToggleParseError { InvalidValue(String), } -impl FromStr for Toggle { +impl Parseable for Toggle { type Err = ToggleParseError; fn from_str(s: &str) -> std::result::Result { @@ -216,7 +283,7 @@ pub enum IntegerListParseError { InvalidValue(String), } -impl FromStr for IntegerList { +impl Parseable for IntegerList { type Err = IntegerListParseError; fn from_str(s: &str) -> std::result::Result { @@ -300,6 +367,7 @@ impl TupleValue for Vec { } } +#[derive(PartialEq, Eq, Debug)] pub struct Tuple(pub Vec<(S, T)>); #[derive(Error, Debug)] @@ -314,31 +382,39 @@ pub enum TupleError { InvalidInteger(#[source] ParseIntError), } -impl FromStr for Tuple { +impl Parseable for Tuple { type Err = TupleError; fn from_str(s: &str) -> std::result::Result { let mut list: Vec<(S, T)> = Vec::new(); - let body = s .trim() .strip_prefix('[') .and_then(|s| s.strip_suffix(']')) .ok_or_else(|| TupleError::InvalidValue(s.to_string()))?; - let tuples_list = split_commas(body).map_err(TupleError::SplitOutsideBrackets)?; for tuple in tuples_list.iter() { - let items: Vec<&str> = tuple.split('@').collect(); - - if items.len() != 2 { - return Err(TupleError::InvalidValue((*tuple).to_string())); + let mut in_quotes = false; + let mut last_idx = 0; + let mut first_val = None; + for (idx, c) in tuple.as_bytes().iter().enumerate() { + match c { + b'"' => in_quotes = !in_quotes, + b'@' if !in_quotes => { + if last_idx != 0 { + return Err(TupleError::InvalidValue((*tuple).to_string())); + } + first_val = Some(&tuple[last_idx..idx]); + last_idx = idx + 1; + } + _ => {} + } } - - let item1 = items[0] - .parse::() - .map_err(|_| TupleError::InvalidValue(items[0].to_owned()))?; - let item2 = TupleValue::parse_value(items[1])?; - + let item1 = ::from_str( + first_val.ok_or(TupleError::InvalidValue((*tuple).to_string()))?, + ) + .map_err(|_| TupleError::InvalidValue(first_val.unwrap().to_owned()))?; + let item2 = TupleValue::parse_value(&tuple[last_idx..])?; list.push((item1, item2)); } @@ -355,16 +431,48 @@ pub enum StringListParseError { InvalidValue(String), } -impl FromStr for StringList { +fn dequote(s: &str) -> String { + let mut prev_byte = b'\0'; + let mut in_quotes = false; + let mut out: Vec = vec![]; + for i in s.bytes() { + if i == b'"' { + if prev_byte == b'"' && !in_quotes { + out.push(b'"'); + } + in_quotes = !in_quotes; + } else { + out.push(i); + } + prev_byte = i + } + assert!(!in_quotes, "split_commas didn't reject unbalanced quotes"); + // SAFETY: the non-ASCII bytes in the output are the same + // and in the same order as those in the input, so if the + // input is valid UTF-8 the output will be as well. + unsafe { String::from_utf8_unchecked(out) } +} + +impl Parseable for T +where + T: FromStr + Sized, +{ + type Err = ::Err; + fn from_str(s: &str) -> std::result::Result { + dequote(s).parse() + } +} + +impl Parseable for StringList { type Err = StringListParseError; fn from_str(s: &str) -> std::result::Result { - let string_list: Vec = s - .trim() - .trim_matches(|c| c == '[' || c == ']') - .split(',') - .map(|e| e.to_owned()) - .collect(); + let string_list: Vec = + split_commas(s.trim().trim_matches(|c| c == '[' || c == ']')) + .map_err(|_| StringListParseError::InvalidValue(s.to_owned()))? 
+ .iter() + .map(|e| e.to_owned()) + .collect(); Ok(StringList(string_list)) } @@ -385,6 +493,7 @@ mod tests { .add("topology") .add("cmdline"); + assert_eq!(split_commas("\"\"").unwrap(), vec!["\"\""]); parser.parse("size=128M,hanging_param").unwrap_err(); parser .parse("size=128M,too_many_equals=foo=bar") @@ -395,6 +504,8 @@ mod tests { assert_eq!(parser.get("size"), Some("128M".to_owned())); assert!(!parser.is_set("mergeable")); assert!(parser.is_set("size")); + parser.parse("size=").unwrap(); + assert!(parser.get("size").is_none()); parser.parse("size=128M,mergeable=on").unwrap(); assert_eq!(parser.get("size"), Some("128M".to_owned())); @@ -416,6 +527,14 @@ mod tests { parser.parse("topology=[").unwrap_err(); parser.parse("topology=[[[]]]]").unwrap_err(); + parser.parse("topology=[\"@\"\"b\"@[1,2]]").unwrap(); + assert_eq!( + parser + .convert::>>("topology") + .unwrap() + .unwrap(), + Tuple(vec![("@\"b".to_owned(), vec![1, 2])]) + ); parser.parse("cmdline=\"console=ttyS0,9600n8\"").unwrap(); assert_eq!( @@ -425,4 +544,14 @@ mod tests { parser.parse("cmdline=\"").unwrap_err(); parser.parse("cmdline=\"\"\"").unwrap_err(); } + + #[test] + fn parse_bytes() { + assert_eq!(::from_str("a=\"b\"").unwrap(), "a=b"); + } + + #[test] + fn check_dequote() { + assert_eq!(dequote("a\u{3b2}\"a\"\"\""), "a\u{3b2}a\"") + } } diff --git a/pci/Cargo.toml b/pci/Cargo.toml index 9273340879..760baae03d 100644 --- a/pci/Cargo.toml +++ b/pci/Cargo.toml @@ -1,21 +1,21 @@ [package] authors = ["Samuel Ortiz "] -edition = "2021" +edition.workspace = true name = "pci" version = "0.1.0" [features] default = [] -kvm = ["vfio-ioctls/kvm"] -mshv = ["vfio-ioctls/mshv"] +kvm = ["hypervisor/kvm", "vfio-ioctls/kvm"] +mshv = ["hypervisor/mshv", "vfio-ioctls/mshv"] [dependencies] -anyhow = "1.0.94" -byteorder = "1.5.0" +anyhow = { workspace = true } +byteorder = { workspace = true } hypervisor = { path = "../hypervisor" } -libc = "0.2.167" -log = "0.4.22" -serde = { version = "1.0.208", features = ["derive"] } +libc = { workspace = true } +log = { workspace = true } +serde = { workspace = true, features = ["derive"] } thiserror = { workspace = true } vfio-bindings = { workspace = true, features = ["fam-wrappers"] } vfio-ioctls = { workspace = true, default-features = false } @@ -29,3 +29,6 @@ vm-memory = { workspace = true, features = [ ] } vm-migration = { path = "../vm-migration" } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/pci/src/bus.rs b/pci/src/bus.rs index f6f8ce2d01..5b10788ede 100644 --- a/pci/src/bus.rs +++ b/pci/src/bus.rs @@ -13,11 +13,11 @@ use byteorder::{ByteOrder, LittleEndian}; use thiserror::Error; use vm_device::{Bus, BusDevice, BusDeviceSync}; +use crate::PciBarConfiguration; use crate::configuration::{ PciBarRegionType, PciBridgeSubclass, PciClassCode, PciConfiguration, PciHeaderType, }; use crate::device::{BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice}; -use crate::PciBarConfiguration; const VENDOR_ID_INTEL: u16 = 0x8086; const DEVICE_ID_INTEL_VIRT_PCIE_HOST: u16 = 0x0d57; @@ -45,7 +45,7 @@ pub enum PciRootError { #[error("Invalid PCI device identifier provided")] InvalidPciDeviceSlot(usize), /// Valid PCI device identifier but already used. 
- #[error("Valid PCI device identifier but already used")] + #[error("Valid PCI device identifier but already used: {0}")] AlreadyInUsePciDeviceSlot(usize), } pub type Result = std::result::Result; @@ -166,15 +166,42 @@ impl PciBus { Ok(()) } - pub fn next_device_id(&mut self) -> Result { - for (idx, device_id) in self.device_ids.iter_mut().enumerate() { - if !(*device_id) { - *device_id = true; - return Ok(idx as u32); + /// Allocates a PCI device ID on the bus. + /// + /// - `id`: ID to allocate on the bus. If [`None`], the next free + /// device ID on the bus is allocated, else the ID given is + /// allocated + /// + /// ## Errors + /// * Returns [`PciRootError::AlreadyInUsePciDeviceSlot`] in case + /// the ID requested is already allocated. + /// * Returns [`PciRootError::InvalidPciDeviceSlot`] in case the + /// requested ID exceeds the maximum number of devices allowed per + /// bus (see [`NUM_DEVICE_IDS`]). + /// * If `id` is [`None`]: Returns + /// [`PciRootError::NoPciDeviceSlotAvailable`] if no free device + /// slot is available on the bus. + pub fn allocate_device_id(&mut self, id: Option) -> Result { + if let Some(id) = id { + if (id as usize) < NUM_DEVICE_IDS { + if !self.device_ids[id as usize] { + self.device_ids[id as usize] = true; + Ok(id as u32) + } else { + Err(PciRootError::AlreadyInUsePciDeviceSlot(id as usize)) + } + } else { + Err(PciRootError::InvalidPciDeviceSlot(id as usize)) } + } else { + for (idx, device_id) in self.device_ids.iter_mut().enumerate() { + if !(*device_id) { + *device_id = true; + return Ok(idx as u32); + } + } + Err(PciRootError::NoPciDeviceSlotAvailable) } - - Err(PciRootError::NoPciDeviceSlotAvailable) } pub fn get_device_id(&mut self, id: usize) -> Result<()> { @@ -484,3 +511,110 @@ fn parse_io_config_address(config_address: u32) -> (usize, usize, usize, usize) shift_and_mask(config_address, REGISTER_NUMBER_OFFSET, REGISTER_NUMBER_MASK), ) } + +#[cfg(test)] +mod unit_tests { + use std::error::Error; + use std::result::Result; + + use super::*; + + #[derive(Debug)] + struct MocRelocDevice; + + impl DeviceRelocation for MocRelocDevice { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn PciDevice, + _region_type: PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup_bus() -> PciBus { + let pci_root = PciRoot::new(None); + let moc_device_reloc = Arc::new(MocRelocDevice {}); + PciBus::new(pci_root, moc_device_reloc) + } + + #[test] + // Test to acquire all IDs that can be acquired + fn allocate_device_id_next_free() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id as u32, bus.allocate_device_id(None).unwrap()); + } + } + + #[test] + // Test that requesting specific ID work + fn allocate_device_id_request_id() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS - 1).try_into()?; + assert_eq!(0x01_u32, bus.allocate_device_id(Some(0x01))?); + assert_eq!(0x10_u32, bus.allocate_device_id(Some(0x10))?); + assert_eq!(max_id as u32, bus.allocate_device_id(Some(max_id))?); + Ok(()) + } + + #[test] + // Test that gaps resulting from explicit allocations are filled by implicit ones, + // beginning with the first free slot + fn allocate_device_id_fills_gaps() -> Result<(), Box> { + // The first address is occupied by the root + let mut bus = setup_bus(); + assert_eq!(0x01_u32, bus.allocate_device_id(Some(0x01))?); + 
assert_eq!(0x03_u32, bus.allocate_device_id(Some(0x03))?); + assert_eq!(0x06_u32, bus.allocate_device_id(Some(0x06))?); + assert_eq!(0x02_u32, bus.allocate_device_id(None)?); + assert_eq!(0x04_u32, bus.allocate_device_id(None)?); + assert_eq!(0x05_u32, bus.allocate_device_id(None)?); + assert_eq!(0x07_u32, bus.allocate_device_id(None)?); + Ok(()) + } + + #[test] + // Test that requesting the same ID twice fails + fn allocate_device_id_request_id_twice_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS - 1).try_into()?; + bus.allocate_device_id(Some(max_id))?; + let result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + result, + Err(PciRootError::AlreadyInUsePciDeviceSlot(id)) if id == max_id as usize + )); + Ok(()) + } + + #[test] + // Test that requesting an invalid ID fails + fn allocate_device_id_request_invalid_id_fails() -> Result<(), Box> { + let mut bus = setup_bus(); + let max_id = (NUM_DEVICE_IDS + 1).try_into()?; + let result = bus.allocate_device_id(Some(max_id)); + assert!(matches!( + result, + Err(PciRootError::InvalidPciDeviceSlot(id)) if id == max_id as usize + )); + Ok(()) + } + + #[test] + // Test to acquire an ID when all IDs were already acquired + fn allocate_device_id_none_left() { + // The first address is occupied by the root + let mut bus = setup_bus(); + for expected_id in 1..NUM_DEVICE_IDS { + assert_eq!(expected_id as u32, bus.allocate_device_id(None).unwrap()); + } + let result = bus.allocate_device_id(None); + assert!(matches!(result, Err(PciRootError::NoPciDeviceSlotAvailable))); + } +} diff --git a/pci/src/configuration.rs b/pci/src/configuration.rs index 7264f7caf1..706947050c 100644 --- a/pci/src/configuration.rs +++ b/pci/src/configuration.rs @@ -828,10 +828,10 @@ impl PciConfiguration { let mut addr = u64::from(self.bars[bar_num].addr & self.writable_bits[bar_idx]); - if let Some(bar_type) = self.bars[bar_num].r#type { - if bar_type == PciBarRegionType::Memory64BitRegion { - addr |= u64::from(self.bars[bar_num + 1].addr) << 32; - } + if let Some(bar_type) = self.bars[bar_num].r#type + && bar_type == PciBarRegionType::Memory64BitRegion + { + addr |= u64::from(self.bars[bar_num + 1].addr) << 32; } addr @@ -907,19 +907,19 @@ impl PciConfiguration { } // Handle potential write to MSI-X message control register - if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx { - if let Some(msix_config) = &self.msix_config { - if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { - msix_config - .lock() - .unwrap() - .set_msg_ctl(LittleEndian::read_u16(data)); - } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { - msix_config - .lock() - .unwrap() - .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); - } + if let Some(msix_cap_reg_idx) = self.msix_cap_reg_idx + && let Some(msix_config) = &self.msix_config + { + if msix_cap_reg_idx == reg_idx && offset == 2 && data.len() == 2 { + msix_config + .lock() + .unwrap() + .set_msg_ctl(LittleEndian::read_u16(data)); + } else if msix_cap_reg_idx == reg_idx && offset == 0 && data.len() == 4 { + msix_config + .lock() + .unwrap() + .set_msg_ctl((LittleEndian::read_u32(data) >> 16) as u16); } } diff --git a/pci/src/device.rs b/pci/src/device.rs index cddb30fce9..3c5b3315f8 100644 --- a/pci/src/device.rs +++ b/pci/src/device.rs @@ -12,8 +12,8 @@ use thiserror::Error; use vm_allocator::{AddressAllocator, SystemAllocator}; use vm_device::Resource; -use crate::configuration::{self, PciBarRegionType}; use crate::PciBarConfiguration; +use crate::configuration::{self, PciBarRegionType};
#[derive(Error, Debug)] pub enum Error { diff --git a/pci/src/lib.rs b/pci/src/lib.rs index 438a8ce94a..c95a38b339 100644 --- a/pci/src/lib.rs +++ b/pci/src/lib.rs @@ -24,16 +24,16 @@ use serde::de::Visitor; pub use self::bus::{PciBus, PciConfigIo, PciConfigMmio, PciRoot, PciRootError}; pub use self::configuration::{ - PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, PciCapabilityId, - PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, PciMassStorageSubclass, - PciNetworkControllerSubclass, PciProgrammingInterface, PciSerialBusSubClass, PciSubclass, - PCI_CONFIGURATION_ID, + PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciCapability, + PciCapabilityId, PciClassCode, PciConfiguration, PciExpressCapabilityId, PciHeaderType, + PciMassStorageSubclass, PciNetworkControllerSubclass, PciProgrammingInterface, + PciSerialBusSubClass, PciSubclass, }; pub use self::device::{ BarReprogrammingParams, DeviceRelocation, Error as PciDeviceError, PciDevice, }; -pub use self::msi::{msi_num_enabled_vectors, MsiCap, MsiConfig}; -pub use self::msix::{MsixCap, MsixConfig, MsixTableEntry, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE}; +pub use self::msi::{MsiCap, MsiConfig, msi_num_enabled_vectors}; +pub use self::msix::{MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MsixCap, MsixConfig, MsixTableEntry}; pub use self::vfio::{MmioRegion, VfioDmaMapping, VfioPciDevice, VfioPciError}; pub use self::vfio_user::{VfioUserDmaMapping, VfioUserPciDevice, VfioUserPciDeviceError}; diff --git a/pci/src/msi.rs b/pci/src/msi.rs index ebb7aa3e90..a0215dcdd3 100644 --- a/pci/src/msi.rs +++ b/pci/src/msi.rs @@ -271,15 +271,11 @@ impl MsiConfig { } } - if !old_enabled { - if let Err(e) = self.interrupt_source_group.enable() { - error!("Failed enabling irq_fd: {:?}", e); - } - } - } else if old_enabled { - if let Err(e) = self.interrupt_source_group.disable() { - error!("Failed disabling irq_fd: {:?}", e); + if !old_enabled && let Err(e) = self.interrupt_source_group.enable() { + error!("Failed enabling irq_fd: {:?}", e); } + } else if old_enabled && let Err(e) = self.interrupt_source_group.disable() { + error!("Failed disabling irq_fd: {:?}", e); } } } diff --git a/pci/src/msix.rs b/pci/src/msix.rs index f323a69179..718c4f83ee 100644 --- a/pci/src/msix.rs +++ b/pci/src/msix.rs @@ -211,7 +211,7 @@ impl MsixConfig { } pub fn read_table(&self, offset: u64, data: &mut [u8]) { - assert!((data.len() == 4 || data.len() == 8)); + assert!(data.len() == 4 || data.len() == 8); let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; @@ -264,7 +264,7 @@ impl MsixConfig { } pub fn write_table(&mut self, offset: u64, data: &[u8]) { - assert!((data.len() == 4 || data.len() == 8)); + assert!(data.len() == 4 || data.len() == 8); let index: usize = (offset / MSIX_TABLE_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_TABLE_ENTRIES_MODULO; @@ -360,7 +360,7 @@ impl MsixConfig { } pub fn read_pba(&mut self, offset: u64, data: &mut [u8]) { - assert!((data.len() == 4 || data.len() == 8)); + assert!(data.len() == 4 || data.len() == 8); let index: usize = (offset / MSIX_PBA_ENTRIES_MODULO) as usize; let modulo_offset = offset % MSIX_PBA_ENTRIES_MODULO; diff --git a/pci/src/vfio.rs b/pci/src/vfio.rs index 660f27bd4e..8372046acf 100644 --- a/pci/src/vfio.rs +++ b/pci/src/vfio.rs @@ -14,7 +14,7 @@ use std::sync::{Arc, Barrier, Mutex}; use anyhow::anyhow; use byteorder::{ByteOrder, LittleEndian}; use 
hypervisor::HypervisorVmError; -use libc::{sysconf, _SC_PAGESIZE}; +use libc::{_SC_PAGESIZE, sysconf}; use serde::{Deserialize, Serialize}; use thiserror::Error; use vfio_bindings::bindings::vfio::*; @@ -34,13 +34,13 @@ use vm_memory::{Address, GuestAddress, GuestAddressSpace, GuestMemory, GuestUsiz use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vmm_sys_util::eventfd::EventFd; -use crate::msi::{MsiConfigState, MSI_CONFIG_ID}; +use crate::msi::{MSI_CONFIG_ID, MsiConfigState}; use crate::msix::MsixConfigState; use crate::{ - msi_num_enabled_vectors, BarReprogrammingParams, MsiCap, MsiConfig, MsixCap, MsixConfig, - PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, PciBdf, PciCapabilityId, - PciClassCode, PciConfiguration, PciDevice, PciDeviceError, PciExpressCapabilityId, - PciHeaderType, PciSubclass, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, PCI_CONFIGURATION_ID, + BarReprogrammingParams, MSIX_CONFIG_ID, MSIX_TABLE_ENTRY_SIZE, MsiCap, MsiConfig, MsixCap, + MsixConfig, PCI_CONFIGURATION_ID, PciBarConfiguration, PciBarPrefetchable, PciBarRegionType, + PciBdf, PciCapabilityId, PciClassCode, PciConfiguration, PciDevice, PciDeviceError, + PciExpressCapabilityId, PciHeaderType, PciSubclass, msi_num_enabled_vectors, }; pub(crate) const VFIO_COMMON_ID: &str = "vfio_common"; @@ -190,7 +190,7 @@ pub(crate) struct Interrupt { impl Interrupt { fn update_msi(&mut self, offset: u64, data: &[u8]) -> Option { - if let Some(ref mut msi) = &mut self.msi { + if let Some(msi) = &mut self.msi { let action = msi.update(offset, data); return action; } @@ -199,7 +199,7 @@ impl Interrupt { } fn update_msix(&mut self, offset: u64, data: &[u8]) -> Option { - if let Some(ref mut msix) = &mut self.msix { + if let Some(msix) = &mut self.msix { let action = msix.update(offset, data); return action; } @@ -208,21 +208,20 @@ impl Interrupt { } fn accessed(&self, offset: u64) -> Option<(PciCapabilityId, u64)> { - if let Some(msi) = &self.msi { - if offset >= u64::from(msi.cap_offset) - && offset < u64::from(msi.cap_offset) + msi.cfg.size() - { - return Some(( - PciCapabilityId::MessageSignalledInterrupts, - u64::from(msi.cap_offset), - )); - } + if let Some(msi) = &self.msi + && offset >= u64::from(msi.cap_offset) + && offset < u64::from(msi.cap_offset) + msi.cfg.size() + { + return Some(( + PciCapabilityId::MessageSignalledInterrupts, + u64::from(msi.cap_offset), + )); } - if let Some(msix) = &self.msix { - if offset == u64::from(msix.cap_offset) { - return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset))); - } + if let Some(msix) = &self.msix + && offset == u64::from(msix.cap_offset) + { + return Some((PciCapabilityId::MsiX, u64::from(msix.cap_offset))); } None @@ -237,7 +236,7 @@ impl Interrupt { } fn msix_write_table(&mut self, offset: u64, data: &[u8]) { - if let Some(ref mut msix) = &mut self.msix { + if let Some(msix) = &mut self.msix { let offset = offset - u64::from(msix.cap.table_offset()); msix.bar.write_table(offset, data) } @@ -603,13 +602,12 @@ impl VfioCommon { type_, .. 
} = resource + && *index == bar_id as usize { - if *index == bar_id as usize { - restored_bar_addr = Some(GuestAddress(*base)); - region_size = *size; - region_type = PciBarRegionType::from(*type_); - break; - } + restored_bar_addr = Some(GuestAddress(*base)); + region_size = *size; + region_type = PciBarRegionType::from(*type_); + break; } } if restored_bar_addr.is_none() { @@ -925,24 +923,23 @@ impl VfioCommon { match PciCapabilityId::from(cap_id) { PciCapabilityId::MessageSignalledInterrupts => { - if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) { - if irq_info.count > 0 { - // Parse capability only if the VFIO device - // supports MSI. - let msg_ctl = self.parse_msi_capabilities(cap_iter); - self.initialize_msi(msg_ctl, cap_iter as u32, None); - } + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSI_IRQ_INDEX) + && irq_info.count > 0 + { + // Parse capability only if the VFIO device + // supports MSI. + let msg_ctl = self.parse_msi_capabilities(cap_iter); + self.initialize_msi(msg_ctl, cap_iter as u32, None); } } PciCapabilityId::MsiX => { if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_MSIX_IRQ_INDEX) + && irq_info.count > 0 { - if irq_info.count > 0 { - // Parse capability only if the VFIO device - // supports MSI-X. - let msix_cap = self.parse_msix_capabilities(cap_iter); - self.initialize_msix(msix_cap, cap_iter as u32, bdf, None); - } + // Parse capability only if the VFIO device + // supports MSI-X. + let msix_cap = self.parse_msix_capabilities(cap_iter); + self.initialize_msix(msix_cap, cap_iter as u32, bdf, None); } } PciCapabilityId::PciExpress => pci_express_cap_found = true, @@ -1038,17 +1035,17 @@ impl VfioCommon { } pub(crate) fn enable_intx(&mut self) -> Result<(), VfioPciError> { - if let Some(intx) = &mut self.interrupt.intx { - if !intx.enabled { - if let Some(eventfd) = intx.interrupt_source_group.notifier(0) { - self.vfio_wrapper - .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd]) - .map_err(VfioPciError::EnableIntx)?; + if let Some(intx) = &mut self.interrupt.intx + && !intx.enabled + { + if let Some(eventfd) = intx.interrupt_source_group.notifier(0) { + self.vfio_wrapper + .enable_irq(VFIO_PCI_INTX_IRQ_INDEX, vec![&eventfd]) + .map_err(VfioPciError::EnableIntx)?; - intx.enabled = true; - } else { - return Err(VfioPciError::MissingNotifier); - } + intx.enabled = true; + } else { + return Err(VfioPciError::MissingNotifier); } } @@ -1056,13 +1053,13 @@ impl VfioCommon { } pub(crate) fn disable_intx(&mut self) { - if let Some(intx) = &mut self.interrupt.intx { - if intx.enabled { - if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) { - error!("Could not disable INTx: {}", e); - } else { - intx.enabled = false; - } + if let Some(intx) = &mut self.interrupt.intx + && intx.enabled + { + if let Err(e) = self.vfio_wrapper.disable_irq(VFIO_PCI_INTX_IRQ_INDEX) { + error!("Could not disable INTx: {}", e); + } else { + intx.enabled = false; } } } @@ -1118,12 +1115,12 @@ impl VfioCommon { } fn initialize_legacy_interrupt(&mut self) -> Result<(), VfioPciError> { - if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) { - if irq_info.count == 0 { - // A count of 0 means the INTx IRQ is not supported, therefore - // it shouldn't be initialized. - return Ok(()); - } + if let Some(irq_info) = self.vfio_wrapper.get_irq_info(VFIO_PCI_INTX_IRQ_INDEX) + && irq_info.count == 0 + { + // A count of 0 means the INTx IRQ is not supported, therefore + // it shouldn't be initialized. 
+ return Ok(()); } if let Some(interrupt_source_group) = self.legacy_interrupt_group.clone() { @@ -1200,10 +1197,10 @@ impl VfioCommon { // INTx EOI // The guest reading from the BAR potentially means the interrupt has // been received and can be acknowledged. - if self.interrupt.intx_in_use() { - if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { - error!("Failed unmasking INTx IRQ: {}", e); - } + if self.interrupt.intx_in_use() + && let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) + { + error!("Failed unmasking INTx IRQ: {}", e); } } @@ -1228,10 +1225,10 @@ impl VfioCommon { // INTx EOI // The guest writing to the BAR potentially means the interrupt has // been received and can be acknowledged. - if self.interrupt.intx_in_use() { - if let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) { - error!("Failed unmasking INTx IRQ: {}", e); - } + if self.interrupt.intx_in_use() + && let Err(e) = self.vfio_wrapper.unmask_irq(VFIO_PCI_INTX_IRQ_INDEX) + { + error!("Failed unmasking INTx IRQ: {}", e); } None @@ -1619,12 +1616,11 @@ impl VfioPciDevice { // Don't try to mmap the region if it contains MSI-X table or // MSI-X PBA subregion, and if we couldn't find MSIX_MAPPABLE // in the list of supported capabilities. - if let Some(msix) = self.common.interrupt.msix.as_ref() { - if (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir()) - && !caps.contains(&VfioRegionInfoCap::MsixMappable) - { - continue; - } + if let Some(msix) = self.common.interrupt.msix.as_ref() + && (region.index == msix.cap.table_bir() || region.index == msix.cap.pba_bir()) + && !caps.contains(&VfioRegionInfoCap::MsixMappable) + { + continue; } let mmap_size = self.device.get_region_size(region.index); @@ -1664,9 +1660,8 @@ impl VfioPciDevice { if !is_page_size_aligned(area.size) || !is_page_size_aligned(area.offset) { warn!( "Could not mmap sparse area that is not page size aligned (offset = 0x{:x}, size = 0x{:x})", - area.offset, - area.size, - ); + area.offset, area.size, + ); return Ok(()); } @@ -1714,18 +1709,17 @@ impl VfioPciDevice { for region in self.common.mmio_regions.iter() { for user_memory_region in region.user_memory_regions.iter() { // Unmap from vfio container - if !self.iommu_attached { - if let Err(e) = self + if !self.iommu_attached + && let Err(e) = self .container .vfio_dma_unmap(user_memory_region.start, user_memory_region.size) .map_err(|e| VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf)) - { - error!( - "Could not unmap mmio region from vfio container: \ + { + error!( + "Could not unmap mmio region from vfio container: \ iova 0x{:x}, size 0x{:x}: {}, ", - user_memory_region.start, user_memory_region.size, e - ); - } + user_memory_region.start, user_memory_region.size, e + ); } // Remove region @@ -1792,16 +1786,16 @@ impl Drop for VfioPciDevice { fn drop(&mut self) { self.unmap_mmio_regions(); - if let Some(msix) = &self.common.interrupt.msix { - if msix.bar.enabled() { - self.common.disable_msix(); - } + if let Some(msix) = &self.common.interrupt.msix + && msix.bar.enabled() + { + self.common.disable_msix(); } - if let Some(msi) = &self.common.interrupt.msi { - if msi.cfg.enabled() { - self.common.disable_msi() - } + if let Some(msi) = &self.common.interrupt.msi + && msi.cfg.enabled() + { + self.common.disable_msi() } if self.common.interrupt.intx_in_use() { @@ -1899,20 +1893,19 @@ impl PciDevice for VfioPciDevice { for user_memory_region in region.user_memory_regions.iter_mut() { // Unmap the old MMIO region from 
vfio container - if !self.iommu_attached { - if let Err(e) = self + if !self.iommu_attached + && let Err(e) = self .container .vfio_dma_unmap(user_memory_region.start, user_memory_region.size) .map_err(|e| { VfioPciError::DmaUnmap(e, self.device_path.clone(), self.bdf) }) - { - error!( - "Could not unmap mmio region from vfio container: \ + { + error!( + "Could not unmap mmio region from vfio container: \ iova 0x{:x}, size 0x{:x}: {}, ", - user_memory_region.start, user_memory_region.size, e - ); - } + user_memory_region.start, user_memory_region.size, e + ); } // Remove old region @@ -2040,9 +2033,9 @@ impl ExternalDmaMapping for VfioDmaMapping t as u64, Err(e) => { - return Err(io::Error::other( - format!("unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}") - )); + return Err(io::Error::other(format!( + "unable to retrieve user address for gpa 0x{gpa:x} from guest memory region: {e}" + ))); } } } else if self.mmio_regions.lock().unwrap().check_range(gpa, size) { diff --git a/pci/src/vfio_user.rs b/pci/src/vfio_user.rs index f23259f48f..e92c719964 100644 --- a/pci/src/vfio_user.rs +++ b/pci/src/vfio_user.rs @@ -24,7 +24,7 @@ use vm_memory::{ use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vmm_sys_util::eventfd::EventFd; -use crate::vfio::{UserMemoryRegion, Vfio, VfioCommon, VfioError, VFIO_COMMON_ID}; +use crate::vfio::{UserMemoryRegion, VFIO_COMMON_ID, Vfio, VfioCommon, VfioError}; use crate::{ BarReprogrammingParams, PciBarConfiguration, PciBdf, PciDevice, PciDeviceError, PciSubclass, VfioPciError, @@ -505,16 +505,16 @@ impl Drop for VfioUserPciDevice { fn drop(&mut self) { self.unmap_mmio_regions(); - if let Some(msix) = &self.common.interrupt.msix { - if msix.bar.enabled() { - self.common.disable_msix(); - } + if let Some(msix) = &self.common.interrupt.msix + && msix.bar.enabled() + { + self.common.disable_msix(); } - if let Some(msi) = &self.common.interrupt.msi { - if msi.cfg.enabled() { - self.common.disable_msi() - } + if let Some(msi) = &self.common.interrupt.msi + && msi.cfg.enabled() + { + self.common.disable_msi() } if self.common.interrupt.intx_in_use() { diff --git a/performance-metrics/Cargo.toml b/performance-metrics/Cargo.toml index 87dce7862e..531c6abe99 100644 --- a/performance-metrics/Cargo.toml +++ b/performance-metrics/Cargo.toml @@ -1,15 +1,17 @@ [package] authors = ["The Cloud Hypervisor Authors"] build = "../build.rs" -edition = "2021" +edition.workspace = true name = "performance-metrics" version = "0.1.0" [dependencies] -clap = { version = "4.5.13", features = ["wrap_help"] } -dirs = "6.0.0" -serde = { version = "1.0.208", features = ["derive", "rc"] } +clap = { workspace = true, features = ["wrap_help"] } +dirs = { workspace = true } +serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } test_infra = { path = "../test_infra" } thiserror = { workspace = true } -wait-timeout = "0.2.0" + +[lints] +workspace = true diff --git a/performance-metrics/src/main.rs b/performance-metrics/src/main.rs index e029b10d8d..220532f6cb 100644 --- a/performance-metrics/src/main.rs +++ b/performance-metrics/src/main.rs @@ -9,8 +9,8 @@ extern crate test_infra; mod performance_tests; use std::process::Command; -use std::sync::mpsc::channel; use std::sync::Arc; +use std::sync::mpsc::channel; use std::time::Duration; use std::{env, fmt, thread}; @@ -106,10 +106,45 @@ impl Default for MetricsReport { } } +#[derive(Clone, Copy, Default)] +pub enum ImageFormat { + 
#[default] + Raw, + Qcow2, + Vhd, + Vhdx, +} + +impl std::str::FromStr for ImageFormat { + type Err = (); + + fn from_str(s: &str) -> Result { + match s { + "raw" => Ok(ImageFormat::Raw), + "qcow2" => Ok(ImageFormat::Qcow2), + "vhd" => Ok(ImageFormat::Vhd), + "vhdx" => Ok(ImageFormat::Vhdx), + _ => Err(()), + } + } +} + +impl fmt::Display for ImageFormat { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + ImageFormat::Raw => write!(f, "raw"), + ImageFormat::Qcow2 => write!(f, "qcow2"), + ImageFormat::Vhd => write!(f, "vhd"), + ImageFormat::Vhdx => write!(f, "vhdx"), + } + } +} + #[derive(Default)] pub struct PerformanceTestOverrides { test_iterations: Option, test_timeout: Option, + test_image_format: Option, } impl fmt::Display for PerformanceTestOverrides { @@ -121,6 +156,10 @@ impl fmt::Display for PerformanceTestOverrides { write!(f, "test_timeout = {test_timeout}")?; } + if let Some(test_image_format) = self.test_image_format { + write!(f, "test_image_format = {test_image_format}")?; + } + Ok(()) } } @@ -686,6 +725,15 @@ fn main() { .help("Override test timeout, Ex. --timeout 5") .num_args(1), ) + .arg( + Arg::new("image-format") + .long("image-format") + .help( + "Override the image format used for block tests, supported values: qcow2, raw, vhd, vhdx. \ + Default is 'raw'.", + ) + .num_args(1), + ) .get_matches(); // It seems that the tool (ethr) used for testing the virtio-net latency @@ -712,8 +760,6 @@ fn main() { // Run performance tests sequentially and report results (in both readable/json format) let mut metrics_report: MetricsReport = Default::default(); - init_tests(); - let overrides = Arc::new(PerformanceTestOverrides { test_iterations: cmd_arguments .get_one::("iterations") @@ -725,8 +771,15 @@ fn main() { .map(|s| s.parse()) .transpose() .unwrap_or_default(), + test_image_format: cmd_arguments + .get_one::("image-format") + .map(|s| s.parse()) + .transpose() + .unwrap_or_default(), }); + init_tests(&overrides); + for test in test_list.iter() { if test_filter.is_empty() || test_filter.iter().any(|&s| test.name.contains(s)) { match run_test_with_timeout(test, &overrides) { diff --git a/performance-metrics/src/performance_tests.rs b/performance-metrics/src/performance_tests.rs index 7bbecf8898..e29dca1743 100644 --- a/performance-metrics/src/performance_tests.rs +++ b/performance-metrics/src/performance_tests.rs @@ -12,7 +12,7 @@ use std::{fs, thread}; use test_infra::{Error as InfraError, *}; use thiserror::Error; -use crate::{mean, PerformanceTestControl}; +use crate::{ImageFormat, PerformanceTestControl, PerformanceTestOverrides, mean}; #[cfg(target_arch = "x86_64")] pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-amd64-custom-20210609-0.raw"; @@ -30,16 +30,30 @@ enum Error { RestoreTimeParse, } +// The test image cannot be created on tmpfs (e.g. /tmp) filesystem, +// as tmpfs does not support O_DIRECT const BLK_IO_TEST_IMG: &str = "/var/tmp/ch-blk-io-test.img"; -pub fn init_tests() { - // The test image cannot be created on tmpfs (e.g. 
/tmp) filesystem, - // as tmpfs does not support O_DIRECT - assert!(exec_host_command_output(&format!( - "dd if=/dev/zero of={BLK_IO_TEST_IMG} bs=1M count=4096" - )) - .status - .success()); +pub fn init_tests(overrides: &PerformanceTestOverrides) { + let mut cmd = format!("dd if=/dev/zero of={BLK_IO_TEST_IMG} bs=1M count=4096"); + + if let Some(o) = overrides.test_image_format { + match o { + ImageFormat::Raw => { /* Nothing to do */ } + ImageFormat::Qcow2 => { + cmd = + format!("qemu-img create -f qcow2 -o preallocation=full {BLK_IO_TEST_IMG} 4G"); + } + ImageFormat::Vhd => { + cmd = format!("qemu-img create -f vpc -o subformat=fixed {BLK_IO_TEST_IMG} 4G"); + } + ImageFormat::Vhdx => { + cmd = format!("qemu-img create -f vhdx -o subformat=fixed {BLK_IO_TEST_IMG} 4G"); + } + } + } + + assert!(exec_host_command_output(&cmd).status.success()); } pub fn cleanup_tests() { @@ -68,9 +82,9 @@ fn direct_kernel_boot_path() -> PathBuf { let mut kernel_path = workload_path; #[cfg(target_arch = "x86_64")] - kernel_path.push("vmlinux"); + kernel_path.push("vmlinux-x86_64"); #[cfg(target_arch = "aarch64")] - kernel_path.push("Image"); + kernel_path.push("Image-arm64"); kernel_path } diff --git a/rate_limiter/Cargo.toml b/rate_limiter/Cargo.toml index a286b9e20a..206ec7b7f8 100644 --- a/rate_limiter/Cargo.toml +++ b/rate_limiter/Cargo.toml @@ -1,11 +1,14 @@ [package] -edition = "2021" +edition.workspace = true name = "rate_limiter" version = "0.1.0" [dependencies] -epoll = "4.3.3" -libc = "0.2.167" -log = "0.4.22" +epoll = { workspace = true } +libc = { workspace = true } +log = { workspace = true } thiserror = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/rate_limiter/src/group.rs b/rate_limiter/src/group.rs index 51e18196a8..a986a7f581 100644 --- a/rate_limiter/src/group.rs +++ b/rate_limiter/src/group.rs @@ -288,10 +288,10 @@ impl Drop for RateLimiterGroup { fn drop(&mut self) { self.kill_evt.write(1).unwrap(); - if let Some(t) = self.epoll_thread.take() { - if let Err(e) = t.join() { - error!("Error joining thread: {:?}", e); - } + if let Some(t) = self.epoll_thread.take() + && let Err(e) = t.join() + { + error!("Error joining thread: {:?}", e); } } } @@ -306,7 +306,7 @@ pub(crate) mod tests { use super::RateLimiterGroupHandle; use crate::group::RateLimiterGroup; - use crate::{TokenBucket, TokenType, REFILL_TIMER_INTERVAL_MS}; + use crate::{REFILL_TIMER_INTERVAL_MS, TokenBucket, TokenType}; impl RateLimiterGroupHandle { fn bandwidth(&self) -> Option { diff --git a/rate_limiter/src/lib.rs b/rate_limiter/src/lib.rs index 4202bfd1a1..72221416f1 100644 --- a/rate_limiter/src/lib.rs +++ b/rate_limiter/src/lib.rs @@ -48,8 +48,8 @@ extern crate log; use std::io; use std::os::unix::io::{AsRawFd, RawFd}; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; +use std::sync::atomic::{AtomicBool, Ordering}; use std::time::{Duration, Instant}; use thiserror::Error; @@ -470,7 +470,7 @@ impl RateLimiter { std::io::ErrorKind::WouldBlock => { return Err(Error::SpuriousRateLimiterEvent( "Rate limiter event handler called without a present timer", - )) + )); } _ => return Err(Error::TimerFdWaitError(err)), } @@ -486,7 +486,7 @@ impl RateLimiter { /// Updates the parameters of the token buckets associated with this RateLimiter. // TODO: Please note that, right now, the buckets become full after being updated. 
pub fn update_buckets(&mut self, bytes: BucketUpdate, ops: BucketUpdate) { - let mut guard = self.inner.lock().unwrap(); + let guard = self.inner.get_mut().unwrap(); match bytes { BucketUpdate::Disabled => guard.bandwidth = None, BucketUpdate::Update(tb) => guard.bandwidth = Some(tb), diff --git a/release-notes.md b/release-notes.md index 559066feb4..4dc982e838 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,50 +1,69 @@ +- [v48.0](#v480) + - [Experimental `fw_cfg` Device Support](#experimental-fw_cfg-device-support) + - [Experimental `ivshmem` Device Support](#experimental-ivshmem-device-support) + - [Firmware Boot Support on `riscv64`](#firmware-boot-support-on-riscv64) + - [Increased vCPU Limit on x86_64/kvm](#increased-vcpu-limit-on-x86_64kvm) + - [Improved Block Performance with Small Block Sizes](#improved-block-performance-with-small-block-sizes) + - [Faster VM Pause Operation](#faster-vm-pause-operation) + - [Updated Documentation on Windows Guest Support](#updated-documentation-on-windows-guest-support) + - [Policy on AI Generated Code](#policy-on-ai-generated-code) + - [Removed SGX Support](#removed-sgx-support) + - [Notable Bug Fixes](#notable-bug-fixes) + - [Contributors](#contributors) +- [v47.0](#v470) + - [Block Device Error Reporting to the Guest](#block-device-error-reporting-to-the-guest) + - [Nice Error Messages on Exit](#nice-error-messages-on-exit) + - [Alphabetically Sorted CLI Options for ch-remote](#alphabetically-sorted-cli-options-for-ch-remote) + - [Notable Bug Fixes](#notable-bug-fixes-1) + - [Deprecations](#deprecations) + - [Contributors](#contributors-1) - [v46.0](#v460) - [File-level Locking Support with `--disk`](#file-level-locking-support-with---disk) - [Improved Error Reporting with VM Resizing](#improved-error-reporting-with-vm-resizing) - [IPv6 Address Support with `--net`](#ipv6-address-support-with---net) - [Experimental AArch64 Support with the MSHV Hypervisor](#experimental-aarch64-support-with-the-mshv-hypervisor) - [Deprecated SGX Support](#deprecated-sgx-support) - - [Notable Bug Fixes](#notable-bug-fixes) - - [Contributors](#contributors) + - [Notable Bug Fixes](#notable-bug-fixes-2) + - [Contributors](#contributors-2) - [v45.0](#v450) - [Experimental `riscv64` Architecture Support](#experimental-riscv64-architecture-support) - [Alphabetically Sorted CLI Options](#alphabetically-sorted-cli-options) - [Improved Downtime of VM Live Migration](#improved-downtime-of-vm-live-migration) - - [Notable Bug Fixes](#notable-bug-fixes-1) - - [Contributors](#contributors-1) + - [Notable Bug Fixes](#notable-bug-fixes-3) + - [Contributors](#contributors-3) - [v44.0](#v440) - [Configurable `virtio-iommu` Address Width](#configurable-virtio-iommu-address-width) - [Notable Performance Improvements](#notable-performance-improvements) - [New Fuzzers](#new-fuzzers) - - [Notable Bug Fixes](#notable-bug-fixes-2) - - [Contributors](#contributors-2) + - [Notable Bug Fixes](#notable-bug-fixes-4) + - [Contributors](#contributors-4) - [v43.0](#v430) - [Live Migration over TCP Connections](#live-migration-over-tcp-connections) - [Notable Performance Improvements](#notable-performance-improvements-1) - - [Notable Bug Fixes](#notable-bug-fixes-3) - - [Contributors](#contributors-3) + - [Notable Bug Fixes](#notable-bug-fixes-5) + - [Contributors](#contributors-5) - [v42.0](#v420) - [SVE/SVE2 Support on AArch64](#svesve2-support-on-aarch64) - - [Notable Bug Fixes](#notable-bug-fixes-4) + - [Notable Bug Fixes](#notable-bug-fixes-6) - [Sponsorships](#sponsorships) - - 
[Contributors](#contributors-4) + - [Contributors](#contributors-6) - [v41.0](#v410) - [Experimental "Pvmemcontrol" Support](#experimental-pvmemcontrol-support) - [Sandboxing With Landlock Support](#sandboxing-with-landlock-support) - [Notable Performance Improvements](#notable-performance-improvements-2) - - [Notable Bug Fixes](#notable-bug-fixes-5) - - [Contributors](#contributors-5) + - [Notable Bug Fixes](#notable-bug-fixes-7) + - [Contributors](#contributors-7) - [v40.0](#v400) - [Support for Restoring File Descriptor Backed Network Devices](#support-for-restoring-file-descriptor-backed-network-devices) - - [Notable Bug Fixes](#notable-bug-fixes-6) - - [Contributors](#contributors-6) + - [Notable Bug Fixes](#notable-bug-fixes-8) + - [Contributors](#contributors-8) - [v39.0](#v390) - [Variable Sizing of PCI Apertures for Segments](#variable-sizing-of-pci-apertures-for-segments) - [Direct Booting with bzImages](#direct-booting-with-bzimages) - [Support for NVIDIA GPUDirect P2P Support](#support-for-nvidia-gpudirect-p2p-support) - [Guest NMI Injection Support](#guest-nmi-injection-support) - - [Notable Bug Fixes](#notable-bug-fixes-7) - - [Contributors](#contributors-7) + - [Notable Bug Fixes](#notable-bug-fixes-9) + - [Contributors](#contributors-9) - [v38.0](#v380) - [Group Rate Limiter on Block Devices](#group-rate-limiter-on-block-devices) - [CPU Pinning Support for Block Device Worker Thread](#cpu-pinning-support-for-block-device-worker-thread) @@ -52,16 +71,16 @@ - [New 'debug-console' Device](#new-debug-console-device) - [Improved VFIO Device Support](#improved-vfio-device-support) - [Extended CPU Affinity Support](#extended-cpu-affinity-support) - - [Notable Bug Fixes](#notable-bug-fixes-8) - - [Contributors](#contributors-8) + - [Notable Bug Fixes](#notable-bug-fixes-10) + - [Contributors](#contributors-10) - [v37.0](#v370) - [Long Term Support (LTS) Release](#long-term-support-lts-release) - [Multiple PCI segments Support for 32-bit VFIO devices](#multiple-pci-segments-support-for-32-bit-vfio-devices) - [Configurable Named TAP Devices](#configurable-named-tap-devices) - [TTY Output from Both Serial Device and Virtio Console](#tty-output-from-both-serial-device-and-virtio-console) - [Faster VM Restoration from Snapshots](#faster-vm-restoration-from-snapshots) - - [Notable Bug Fixes](#notable-bug-fixes-9) - - [Contributors](#contributors-9) + - [Notable Bug Fixes](#notable-bug-fixes-11) + - [Contributors](#contributors-11) - [v36.0](#v360) - [Command Line Changes](#command-line-changes) - [Enabled Features Reported via API Endpoint and CLI](#enabled-features-reported-via-api-endpoint-and-cli) @@ -70,31 +89,31 @@ - [Unix Socket Backend for Serial Port](#unix-socket-backend-for-serial-port) - [AIO Backend for Block Devices](#aio-backend-for-block-devices) - [Documentation Improvements](#documentation-improvements) - - [Notable Bug Fixes](#notable-bug-fixes-10) - - [Contributors](#contributors-10) + - [Notable Bug Fixes](#notable-bug-fixes-12) + - [Contributors](#contributors-12) - [v35.0](#v350) - [`virtio-vsock` Support for Linux Guest Kernel v6.3+](#virtio-vsock-support-for-linux-guest-kernel-v63) - [User Specified Serial Number for `virtio-block`](#user-specified-serial-number-for-virtio-block) - [vCPU TSC Frequency Included in Migration State](#vcpu-tsc-frequency-included-in-migration-state) - - [Notable Bug Fixes](#notable-bug-fixes-11) - - [Contributors](#contributors-11) + - [Notable Bug Fixes](#notable-bug-fixes-13) + - [Contributors](#contributors-13) - [v34.0](#v340) - 
[Paravirtualised Panic Device Support](#paravirtualised-panic-device-support) - [Improvements to VM Core Dump](#improvements-to-vm-core-dump) - [QCOW2 Support for Backing Files](#qcow2-support-for-backing-files) - [Minimum Host Kernel Bump](#minimum-host-kernel-bump) - - [Notable Bug Fixes](#notable-bug-fixes-12) - - [Contributors](#contributors-12) + - [Notable Bug Fixes](#notable-bug-fixes-14) + - [Contributors](#contributors-14) - [v33.0](#v330) - [D-Bus based API](#d-bus-based-api) - [Expose Host CPU Cache Details for AArch64](#expose-host-cpu-cache-details-for-aarch64) - - [Notable Bug Fixes](#notable-bug-fixes-13) - - [Contributors](#contributors-13) + - [Notable Bug Fixes](#notable-bug-fixes-15) + - [Contributors](#contributors-15) - [v32.0](#v320) - [Increased PCI Segment Limit](#increased-pci-segment-limit) - [API Changes](#api-changes) - - [Notable Bug Fixes](#notable-bug-fixes-14) - - [Contributors](#contributors-14) + - [Notable Bug Fixes](#notable-bug-fixes-16) + - [Contributors](#contributors-16) - [v31.1](#v311) - [v31.0](#v310) - [Update to Latest `acpi_tables`](#update-to-latest-acpi_tables) @@ -102,15 +121,15 @@ - [Improvements on Console `SIGWINCH` Handler](#improvements-on-console-sigwinch-handler) - [Remove Directory Support from `MemoryZoneConfig::file`](#remove-directory-support-from-memoryzoneconfigfile) - [Documentation Improvements](#documentation-improvements-1) - - [Notable Bug Fixes](#notable-bug-fixes-15) - - [Contributors](#contributors-15) + - [Notable Bug Fixes](#notable-bug-fixes-17) + - [Contributors](#contributors-17) - [v30.0](#v300) - [Command Line Changes for Reduced Binary Size](#command-line-changes-for-reduced-binary-size) - [Basic vfio-user Server Support](#basic-vfio-user-server-support) - [Heap Profiling Support](#heap-profiling-support) - [Documentation Improvements](#documentation-improvements-2) - - [Notable Bug Fixes](#notable-bug-fixes-16) - - [Contributors](#contributors-16) + - [Notable Bug Fixes](#notable-bug-fixes-18) + - [Contributors](#contributors-18) - [v28.2](#v282) - [v29.0](#v290) - [Release Binary Supports Both MSHV and KVM](#release-binary-supports-both-mshv-and-kvm) @@ -120,10 +139,10 @@ - [`AArch64` Documentation Integration](#aarch64-documentation-integration) - [`virtio-block` Counters Enhancement](#virtio-block-counters-enhancement) - [TCP Offload Control](#tcp-offload-control) - - [Notable Bug Fixes](#notable-bug-fixes-17) + - [Notable Bug Fixes](#notable-bug-fixes-19) - [Removals](#removals) - - [Deprecations](#deprecations) - - [Contributors](#contributors-17) + - [Deprecations](#deprecations-1) + - [Contributors](#contributors-19) - [v28.1](#v281) - [v28.0](#v280) - [Community Engagement (Reminder)](#community-engagement-reminder) @@ -131,9 +150,9 @@ - [Virtualised TPM Support](#virtualised-tpm-support) - [Transparent Huge Page Support](#transparent-huge-page-support) - [README Quick Start Improved](#readme-quick-start-improved) - - [Notable Bug Fixes](#notable-bug-fixes-18) + - [Notable Bug Fixes](#notable-bug-fixes-20) - [Removals](#removals-1) - - [Contributors](#contributors-18) + - [Contributors](#contributors-20) - [v27.0](#v270) - [Community Engagement](#community-engagement) - [Prebuilt Packages](#prebuilt-packages) @@ -142,41 +161,41 @@ - [Simplified Build Feature Flags](#simplified-build-feature-flags) - [Asynchronous Kernel Loading](#asynchronous-kernel-loading) - [GDB Support for AArch64](#gdb-support-for-aarch64) - - [Notable Bug Fixes](#notable-bug-fixes-19) - - [Deprecations](#deprecations-1) - - 
[Contributors](#contributors-19) + - [Notable Bug Fixes](#notable-bug-fixes-21) + - [Deprecations](#deprecations-2) + - [Contributors](#contributors-21) - [v26.0](#v260) - [SMBIOS Improvements via `--platform`](#smbios-improvements-via---platform) - [Unified Binary MSHV and KVM Support](#unified-binary-mshv-and-kvm-support) - - [Notable Bug Fixes](#notable-bug-fixes-20) - - [Deprecations](#deprecations-2) + - [Notable Bug Fixes](#notable-bug-fixes-22) + - [Deprecations](#deprecations-3) - [Removals](#removals-2) - - [Contributors](#contributors-20) + - [Contributors](#contributors-22) - [v25.0](#v250) - [`ch-remote` Improvements](#ch-remote-improvements-1) - [VM "Coredump" Support](#vm-coredump-support) - - [Notable Bug Fixes](#notable-bug-fixes-21) + - [Notable Bug Fixes](#notable-bug-fixes-23) - [Removals](#removals-3) - - [Contributors](#contributors-21) + - [Contributors](#contributors-23) - [v24.0](#v240) - [Bypass Mode for `virtio-iommu`](#bypass-mode-for-virtio-iommu) - [Ensure Identifiers Uniqueness](#ensure-identifiers-uniqueness) - [Sparse Mmap support](#sparse-mmap-support) - [Expose Platform Serial Number](#expose-platform-serial-number) - - [Notable Bug Fixes](#notable-bug-fixes-22) + - [Notable Bug Fixes](#notable-bug-fixes-24) - [Notable Improvements](#notable-improvements) - - [Deprecations](#deprecations-3) + - [Deprecations](#deprecations-4) - [New on the Website](#new-on-the-website) - - [Contributors](#contributors-22) + - [Contributors](#contributors-24) - [v23.1](#v231) - [v23.0](#v230) - [vDPA Support](#vdpa-support) - [Updated OS Support list](#updated-os-support-list) - [`AArch64` Memory Map Improvements](#aarch64-memory-map-improvements) - [`AMX` Support](#amx-support) - - [Notable Bug Fixes](#notable-bug-fixes-23) - - [Deprecations](#deprecations-4) - - [Contributors](#contributors-23) + - [Notable Bug Fixes](#notable-bug-fixes-25) + - [Deprecations](#deprecations-5) + - [Contributors](#contributors-25) - [v22.1](#v221) - [v22.0](#v220) - [GDB Debug Stub Support](#gdb-debug-stub-support) @@ -187,13 +206,13 @@ - [PMU Support for AArch64](#pmu-support-for-aarch64) - [Documentation Under CC-BY-4.0 License](#documentation-under-cc-by-40-license) - [Deprecation of "Classic" `virtiofsd`](#deprecation-of-classic-virtiofsd) - - [Notable Bug Fixes](#notable-bug-fixes-24) - - [Contributors](#contributors-24) + - [Notable Bug Fixes](#notable-bug-fixes-26) + - [Contributors](#contributors-26) - [v21.0](#v210) - [Efficient Local Live Migration (for Live Upgrade)](#efficient-local-live-migration-for-live-upgrade) - [Recommended Kernel is Now 5.15](#recommended-kernel-is-now-515) - - [Notable Bug fixes](#notable-bug-fixes-25) - - [Contributors](#contributors-25) + - [Notable Bug fixes](#notable-bug-fixes-27) + - [Contributors](#contributors-27) - [v20.2](#v202) - [v20.1](#v201) - [v20.0](#v200) @@ -202,8 +221,8 @@ - [Improved VFIO support](#improved-vfio-support) - [Safer code](#safer-code) - [Extended documentation](#extended-documentation) - - [Notable bug fixes](#notable-bug-fixes-26) - - [Contributors](#contributors-26) + - [Notable bug fixes](#notable-bug-fixes-28) + - [Contributors](#contributors-28) - [v19.0](#v190) - [Improved PTY handling for serial and `virtio-console`](#improved-pty-handling-for-serial-and-virtio-console) - [PCI boot time optimisations](#pci-boot-time-optimisations) @@ -211,8 +230,8 @@ - [Live migration enhancements](#live-migration-enhancements) - [`virtio-mem` support with `vfio-user`](#virtio-mem-support-with-vfio-user) - [AArch64 for 
`virtio-iommu`](#aarch64-for-virtio-iommu) - - [Notable bug fixes](#notable-bug-fixes-27) - - [Contributors](#contributors-27) + - [Notable bug fixes](#notable-bug-fixes-29) + - [Contributors](#contributors-29) - [v18.0](#v180) - [Experimental User Device (`vfio-user`) support](#experimental-user-device-vfio-user-support) - [Migration support for `vhost-user` devices](#migration-support-for-vhost-user-devices) @@ -222,31 +241,31 @@ - [Live migration on MSHV hypervisor](#live-migration-on-mshv-hypervisor) - [AArch64 CPU topology support](#aarch64-cpu-topology-support) - [Power button support on AArch64](#power-button-support-on-aarch64) - - [Notable bug fixes](#notable-bug-fixes-28) - - [Contributors](#contributors-28) + - [Notable bug fixes](#notable-bug-fixes-30) + - [Contributors](#contributors-30) - [v17.0](#v170) - [ARM64 NUMA support using ACPI](#arm64-numa-support-using-acpi) - [`Seccomp` support for MSHV backend](#seccomp-support-for-mshv-backend) - [Hotplug of `macvtap` devices](#hotplug-of-macvtap-devices) - [Improved SGX support](#improved-sgx-support) - [Inflight tracking for `vhost-user` devices](#inflight-tracking-for-vhost-user-devices) - - [Notable bug fixes](#notable-bug-fixes-29) - - [Contributors](#contributors-29) + - [Notable bug fixes](#notable-bug-fixes-31) + - [Contributors](#contributors-31) - [v16.0](#v160) - [Improved live migration support](#improved-live-migration-support) - [Improved `vhost-user` support](#improved-vhost-user-support) - [ARM64 ACPI and UEFI support](#arm64-acpi-and-uefi-support) - - [Notable bug fixes](#notable-bug-fixes-30) + - [Notable bug fixes](#notable-bug-fixes-32) - [Removed functionality](#removed-functionality) - - [Contributors](#contributors-30) + - [Contributors](#contributors-32) - [v15.0](#v150) - [Version numbering and stability guarantees](#version-numbering-and-stability-guarantees) - [Network device rate limiting](#network-device-rate-limiting) - [Support for runtime control of `virtio-net` guest offload](#support-for-runtime-control-of-virtio-net-guest-offload) - [`--api-socket` supports file descriptor parameter](#--api-socket-supports-file-descriptor-parameter) - [Bug fixes](#bug-fixes) - - [Deprecations](#deprecations-5) - - [Contributors](#contributors-31) + - [Deprecations](#deprecations-6) + - [Contributors](#contributors-33) - [v0.14.1](#v0141) - [v0.14.0](#v0140) - [Structured event monitoring](#structured-event-monitoring) @@ -255,8 +274,8 @@ - [Updated hotplug documentation](#updated-hotplug-documentation) - [PTY control for serial and `virtio-console`](#pty-control-for-serial-and-virtio-console) - [Block device rate limiting](#block-device-rate-limiting) - - [Deprecations](#deprecations-6) - - [Contributors](#contributors-32) + - [Deprecations](#deprecations-7) + - [Contributors](#contributors-34) - [v0.13.0](#v0130) - [Wider VFIO device support](#wider-vfio-device-support) - [Improved huge page support](#improved-huge-page-support) @@ -264,13 +283,13 @@ - [VHD disk image support](#vhd-disk-image-support) - [Improved Virtio device threading](#improved-virtio-device-threading) - [Clean shutdown support via synthetic power button](#clean-shutdown-support-via-synthetic-power-button) - - [Contributors](#contributors-33) + - [Contributors](#contributors-35) - [v0.12.0](#v0120) - [ARM64 enhancements](#arm64-enhancements) - [Removal of `vhost-user-net` and `vhost-user-block` self spawning](#removal-of-vhost-user-net-and-vhost-user-block-self-spawning) - [Migration of `vhost-user-fs` 
backend](#migration-of-vhost-user-fs-backend) - [Enhanced "info" API](#enhanced-info-api) - - [Contributors](#contributors-34) + - [Contributors](#contributors-36) - [v0.11.0](#v0110) - [`io_uring` support by default for `virtio-block`](#io_uring-support-by-default-for-virtio-block) - [Windows Guest Support](#windows-guest-support) @@ -282,15 +301,15 @@ - [Default Log Level Changed](#default-log-level-changed) - [New `--balloon` Parameter Added](#new---balloon-parameter-added) - [Experimental `virtio-watchdog` Support](#experimental-virtio-watchdog-support) - - [Notable Bug Fixes](#notable-bug-fixes-31) - - [Contributors](#contributors-35) + - [Notable Bug Fixes](#notable-bug-fixes-33) + - [Contributors](#contributors-37) - [v0.10.0](#v0100) - [`virtio-block` Support for Multiple Descriptors](#virtio-block-support-for-multiple-descriptors) - [Memory Zones](#memory-zones) - [`Seccomp` Sandbox Improvements](#seccomp-sandbox-improvements) - [Preliminary KVM HyperV Emulation Control](#preliminary-kvm-hyperv-emulation-control) - - [Notable Bug Fixes](#notable-bug-fixes-32) - - [Contributors](#contributors-36) + - [Notable Bug Fixes](#notable-bug-fixes-34) + - [Contributors](#contributors-38) - [v0.9.0](#v090) - [`io_uring` Based Block Device Support](#io_uring-based-block-device-support) - [Block and Network Device Statistics](#block-and-network-device-statistics) @@ -303,17 +322,17 @@ - [Enhancements to ARM64 Support](#enhancements-to-arm64-support) - [Intel SGX Support](#intel-sgx-support) - [`Seccomp` Sandbox Improvements](#seccomp-sandbox-improvements-1) - - [Notable Bug Fixes](#notable-bug-fixes-33) - - [Contributors](#contributors-37) + - [Notable Bug Fixes](#notable-bug-fixes-35) + - [Contributors](#contributors-39) - [v0.8.0](#v080) - [Experimental Snapshot and Restore Support](#experimental-snapshot-and-restore-support) - [Experimental ARM64 Support](#experimental-arm64-support) - [Support for Using 5-level Paging in Guests](#support-for-using-5-level-paging-in-guests) - [Virtio Device Interrupt Suppression for Network Devices](#virtio-device-interrupt-suppression-for-network-devices) - [`vhost_user_fs` Improvements](#vhost_user_fs-improvements) - - [Notable Bug Fixes](#notable-bug-fixes-34) + - [Notable Bug Fixes](#notable-bug-fixes-36) - [Command Line and API Changes](#command-line-and-api-changes) - - [Contributors](#contributors-38) + - [Contributors](#contributors-40) - [v0.7.0](#v070) - [Block, Network, Persistent Memory (PMEM), VirtioFS and Vsock hotplug](#block-network-persistent-memory-pmem-virtiofs-and-vsock-hotplug) - [Alternative `libc` Support](#alternative-libc-support) @@ -323,14 +342,14 @@ - [`Seccomp` Sandboxing](#seccomp-sandboxing) - [Updated Distribution Support](#updated-distribution-support) - [Command Line and API Changes](#command-line-and-api-changes-1) - - [Contributors](#contributors-39) + - [Contributors](#contributors-41) - [v0.6.0](#v060) - [Directly Assigned Devices Hotplug](#directly-assigned-devices-hotplug) - [Shared Filesystem Improvements](#shared-filesystem-improvements) - [Block and Networking IO Self Offloading](#block-and-networking-io-self-offloading) - [Command Line Interface](#command-line-interface) - [PVH Boot](#pvh-boot) - - [Contributors](#contributors-40) + - [Contributors](#contributors-42) - [v0.5.1](#v051) - [v0.5.0](#v050) - [Virtual Machine Dynamic Resizing](#virtual-machine-dynamic-resizing) @@ -338,7 +357,7 @@ - [New Interrupt Management Framework](#new-interrupt-management-framework) - [Development Tools](#development-tools) - 
[Kata Containers Integration](#kata-containers-integration)
-  - [Contributors](#contributors-41)
+  - [Contributors](#contributors-43)
 - [v0.4.0](#v040)
   - [Dynamic virtual CPUs addition](#dynamic-virtual-cpus-addition)
   - [Programmatic firmware tables generation](#programmatic-firmware-tables-generation)
@@ -347,7 +366,7 @@
   - [Userspace IOAPIC by default](#userspace-ioapic-by-default)
   - [PCI BAR reprogramming](#pci-bar-reprogramming)
   - [New `cloud-hypervisor` organization](#new-cloud-hypervisor-organization)
-  - [Contributors](#contributors-42)
+  - [Contributors](#contributors-44)
 - [v0.3.0](#v030)
   - [Block device offloading](#block-device-offloading)
   - [Network device backend](#network-device-backend)
@@ -374,6 +393,143 @@
   - [Unit testing](#unit-testing)
   - [Integration tests parallelization](#integration-tests-parallelization)
+# v48.0
+
+This release has been tracked in [v48.0
+group](https://github.com/orgs/cloud-hypervisor/projects/6/views/4?filterQuery=release%3A%22Release+48%22)
+of our [roadmap project](https://github.com/orgs/cloud-hypervisor/projects/6/).
+
+### Experimental `fw_cfg` Device Support
+
+This feature enables passing configuration data and files, such as VM
+boot configurations (kernel, kernel cmdline, e820 memory map, and ACPI
+tables), from the host to the guest. (#7117)
+
+### Experimental `ivshmem` Device Support
+
+Support for inter-VM shared memory has been added. For more information,
+please refer to the [ivshmem documentation](docs/ivshmem.md). (#6703)
+
+### Firmware Boot Support on `riscv64`
+
+In addition to direct kernel boot, firmware boot support has been added
+on `riscv64` hosts. (#7249)
+
+### Increased vCPU Limit on x86_64/kvm
+
+The maximum number of supported vCPUs on x86_64 hosts using KVM has been
+raised from 254 to 8192. (#7299)
+
+### Improved Block Performance with Small Block Sizes
+
+Performance of `virtio-blk` with small block sizes (16KB and below) has
+been improved by submitting asynchronous I/O requests in batches. (#7146)
+
+### Faster VM Pause Operation
+
+The VM pause operation is now significantly faster, particularly for VMs
+with a large number of vCPUs. (#7290)
+
+### Updated Documentation on Windows Guest Support
+
+Our Windows documentation now includes instructions to run Windows 11
+guests, in addition to Windows Server guests. (#7218)
+
+### Policy on AI Generated Code
+
+We will decline any contribution known to contain content generated by or
+derived from Large Language Models (LLMs). Details can be found in our
+[contributing documentation](CONTRIBUTING.md). (#7162)
+
+### Removed SGX Support
+
+SGX support has been removed, as announced in the deprecation notice two
+release cycles ago. (#7093)
+
+### Notable Bug Fixes
+
+* Seccomp filter fixes with glibc v2.42 (#7327)
+* Various other fixes (#7331, #7334, #7335)
+
+### Contributors
+
+Many thanks to everyone who has contributed to our release:
+
+* Alex Orozco
+* Alyssa Ross
+* Anirudh Rayabharam
+* Bo Chen
+* Demi Marie Obenour
+* Lucas Grosche
+* Muminul Islam
+* Oliver Anderson
+* Peter Oskolkov
+* Philipp Schuster
+* Ruoqing He
+* Shubham Chakrawar
+* Songqian Li
+* Wei Liu
+
+# v47.0
+
+This release has been tracked in [v47.0
+group](https://github.com/orgs/cloud-hypervisor/projects/6/views/4?filterQuery=release%3A%22Release+47%22)
+of our [roadmap project](https://github.com/orgs/cloud-hypervisor/projects/6/).
+ +### Block Device Error Reporting to the Guest + +Instead of exiting on I/O errors, the `virtio-block` device now reports +errors to the guest using `VIRTIO_BLK_S_IOERR`. It improves the user +experience particularly when the guest rootfs is not backed by the +affected block device. (#7107) + +### Nice Error Messages on Exit + +We now have the chain of errors being reported and printed nicely, when +Cloud Hypervisor or ch-remote exits on errors. (#7066) + +### Alphabetically Sorted CLI Options for ch-remote + +To improve readability, ch-remote now prints help information in +alphabetical order. (#7130) + +### Notable Bug Fixes + +* Error out early when block device serial is too long (#7124) +* Fix partial commands being discarded for `virtio-vsock` (#7195) +* Disable the broken interrupt support for the `rtc_pl031` device to + prevent spurious guest interrupts (#7199) + +### Deprecations + +* A default IP (`192.168.249.1`) and mask (`255.255.255.0`) are + currently assigned to the `virtio-net` device if no value is specified + by users. Such behavior is now deprecated. Users of this behavior will + receive a warning message and should make adjustments. The behavior + will be removed in two release cycles (v49.0). + +### Contributors + +Many thanks to everyone who has contributed to our release: + +* Alyssa Ross +* Bo Chen +* Demi Marie Obenour +* Gauthier Jolly +* Hengqi Chen +* Jinank Jain +* Jinrong Liang +* Jean-Philippe Brucker +* Maximilian Güntner +* Muminul Islam +* Nuno Das Neves +* Philipp Schuster +* Ruoqing He +* Songqian Li +* Wei Liu +* Yi Wang +* ninollei + # v46.0 This release has been tracked in [v46.0 diff --git a/resources/Dockerfile b/resources/Dockerfile index ca527857f6..704e8602f3 100644 --- a/resources/Dockerfile +++ b/resources/Dockerfile @@ -8,7 +8,7 @@ FROM ubuntu:24.04 AS dev ARG TARGETARCH -ARG RUST_TOOLCHAIN="1.83.0" +ARG RUST_TOOLCHAIN="1.88.0" ARG CLH_SRC_DIR="/cloud-hypervisor" ARG CLH_BUILD_DIR="$CLH_SRC_DIR/build" ARG CARGO_REGISTRY_DIR="$CLH_BUILD_DIR/cargo_registry" @@ -123,7 +123,7 @@ RUN echo 'source $CARGO_HOME/env' >> "$HOME"/.bashrc \ && mkdir "$HOME"/.cargo \ && ln -s $CARGO_HOME/env "$HOME"/.cargo/env -# Allow pip to install packages system wide +# Allow pip to install packages system wide # hadolint ignore=DL3003,SC2046 RUN rm /usr/lib/python3.12/EXTERNALLY-MANAGED \ && git clone https://github.com/spdk/spdk \ diff --git a/scripts/dev_cli.sh b/scripts/dev_cli.sh index db560d529e..2c1eab6013 100755 --- a/scripts/dev_cli.sh +++ b/scripts/dev_cli.sh @@ -9,7 +9,7 @@ CLI_NAME="Cloud Hypervisor" CTR_IMAGE_TAG="ghcr.io/cloud-hypervisor/cloud-hypervisor" # Needs to match explicit version in docker-image.yaml workflow -CTR_IMAGE_VERSION="20250412-0" +CTR_IMAGE_VERSION="20250815-0" : "${CTR_IMAGE:=${CTR_IMAGE_TAG}:${CTR_IMAGE_VERSION}}" DOCKER_RUNTIME="docker" @@ -47,6 +47,9 @@ CARGO_GIT_REGISTRY_DIR="${CLH_BUILD_DIR}/cargo_git_registry" # Full path to the cargo target dir on the host. CARGO_TARGET_DIR="${CLH_BUILD_DIR}/cargo_target" +# Let tests know that the special environment is set up. +RUSTFLAGS="${RUSTFLAGS} --cfg devcli_testenv" + # Send a decorated message to stdout, followed by a new line # say() { @@ -193,7 +196,6 @@ cmd_help() { echo " Run the Cloud Hypervisor tests." echo " --unit Run the unit tests." echo " --integration Run the integration tests." - echo " --integration-sgx Run the SGX integration tests." echo " --integration-vfio Run the VFIO integration tests." echo " --integration-windows Run the Windows guest integration tests." 
echo " --integration-live-migration Run the live-migration integration tests." @@ -327,7 +329,6 @@ cmd_clean() { cmd_tests() { unit=false integration=false - integration_sgx=false integration_vfio=false integration_windows=false integration_live_migration=false @@ -346,7 +347,6 @@ cmd_tests() { } ;; "--unit") { unit=true; } ;; "--integration") { integration=true; } ;; - "--integration-sgx") { integration_sgx=true; } ;; "--integration-vfio") { integration_vfio=true; } ;; "--integration-windows") { integration_windows=true; } ;; "--integration-live-migration") { integration_live_migration=true; } ;; @@ -449,29 +449,6 @@ cmd_tests() { dbus-run-session ./scripts/run_integration_tests_"$(uname -m)".sh "$@" || fix_dir_perms $? || exit $? fi - if [ "$integration_sgx" = true ]; then - say "Running SGX integration tests for $target..." - $DOCKER_RUNTIME run \ - --workdir "$CTR_CLH_ROOT_DIR" \ - --rm \ - --privileged \ - --security-opt seccomp=unconfined \ - --ipc=host \ - --net="$CTR_CLH_NET" \ - --mount type=tmpfs,destination=/tmp \ - --volume /dev:/dev \ - --volume "$CLH_ROOT_DIR:$CTR_CLH_ROOT_DIR" \ - ${exported_volumes:+"$exported_volumes"} \ - --volume "$CLH_INTEGRATION_WORKLOADS:$CTR_CLH_INTEGRATION_WORKLOADS" \ - --env USER="root" \ - --env BUILD_TARGET="$target" \ - --env RUSTFLAGS="$rustflags" \ - --env TARGET_CC="$target_cc" \ - --env AUTH_DOWNLOAD_TOKEN="$AUTH_DOWNLOAD_TOKEN" \ - "$CTR_IMAGE" \ - ./scripts/run_integration_tests_sgx.sh "$@" || fix_dir_perms $? || exit $? - fi - if [ "$integration_vfio" = true ]; then say "Running VFIO integration tests for $target..." $DOCKER_RUNTIME run \ @@ -538,6 +515,7 @@ cmd_tests() { --env TARGET_CC="$target_cc" \ --env AUTH_DOWNLOAD_TOKEN="$AUTH_DOWNLOAD_TOKEN" \ --env LLVM_PROFILE_FILE="$LLVM_PROFILE_FILE" \ + --env MIGRATABLE_VERSION="$MIGRATABLE_VERSION" \ "$CTR_IMAGE" \ ./scripts/run_integration_tests_live_migration.sh "$@" || fix_dir_perms $? || exit $? fi diff --git a/scripts/gitlint/rules/on-behalf-of-marker.py b/scripts/gitlint/rules/on-behalf-of-marker.py new file mode 100644 index 0000000000..d08e334b17 --- /dev/null +++ b/scripts/gitlint/rules/on-behalf-of-marker.py @@ -0,0 +1,36 @@ +from gitlint.rules import LineRule, RuleViolation, CommitMessageTitle, CommitRule + +class BodyContainsOnBehalfOfSAPMarker(CommitRule): + """Enforce that each commit coming from an SAP contractor contains an + "On-behalf-of SAP user@sap.com" marker. + """ + + # A rule MUST have a human friendly name + name = "body-requires-on-behalf-of-sap" + + # A rule MUST have a *unique* id + # We recommend starting with UC (for User-defined Commit-rule). 
+ id = "UC-sap" + + # Lower-case list of contractors + contractors = [ + "@cyberus-technology.de" + ] + + # Marker followed by " name.surname@sap.com" + marker = "On-behalf-of: SAP" + + def validate(self, commit): + if "@sap.com" in commit.author_email.lower(): + return + + # Allow third-party open-source contributions + if not any(contractor in commit.author_email.lower() for contractor in self.contractors): + return + + for line in commit.message.body: + if line.startswith(self.marker) and "@sap.com" in line.lower(): + return + + msg = f"Body does not contain a '{self.marker} user@sap.com' line" + return [RuleViolation(self.id, msg, line_nr=1)] diff --git a/scripts/run_integration_tests_aarch64.sh b/scripts/run_integration_tests_aarch64.sh index 0daa672e84..4afa523772 100755 --- a/scripts/run_integration_tests_aarch64.sh +++ b/scripts/run_integration_tests_aarch64.sh @@ -190,7 +190,9 @@ if [ $RES -ne 0 ]; then exit 1 fi +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" cargo build --all --release --target "$BUILD_TARGET" @@ -250,4 +252,19 @@ if [ $RES -eq 0 ]; then RES=$? fi +# Run tests on fw_cfg +if [ $RES -eq 0 ]; then + cargo build --features "fw_cfg" --all --release --target "$BUILD_TARGET" + export RUST_BACKTRACE=1 + time cargo test "fw_cfg::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} + RES=$? +fi + +if [ $RES -eq 0 ]; then + cargo build --features ivshmem --all --release --target "$BUILD_TARGET" + export RUST_BACKTRACE=1 + time cargo test "ivshmem::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} + RES=$? +fi + exit $RES diff --git a/scripts/run_integration_tests_live_migration.sh b/scripts/run_integration_tests_live_migration.sh index 0df9c01c91..f191b0baa1 100755 --- a/scripts/run_integration_tests_live_migration.sh +++ b/scripts/run_integration_tests_live_migration.sh @@ -11,6 +11,7 @@ mkdir -p "$WORKLOADS_DIR" process_common_args "$@" +migratable_version=v39.0 # For now these values are default for kvm test_features="" @@ -18,6 +19,15 @@ if [ "$hypervisor" = "mshv" ]; then test_features="--features mshv" fi +# if migratable version is set to override the default +if [ -n "${MIGRATABLE_VERSION}" ]; then + # validate the version if matched with vxx.0 + if ! [[ "${MIGRATABLE_VERSION}" =~ ^v[0-9]{2,}\.[0-9]$ ]]; then + echo "MIGRATABLE_VERSION should be in format vxx.0, e.g. v47.0" + exit 1 + fi + migratable_version=${MIGRATABLE_VERSION} +fi cp scripts/sha1sums-x86_64 "$WORKLOADS_DIR" FOCAL_OS_IMAGE_NAME="focal-server-cloudimg-amd64-custom-20210609-0.qcow2" @@ -45,8 +55,7 @@ fi popd || exit # Download Cloud Hypervisor binary from its last stable release -LAST_RELEASE_VERSION="v39.0" -CH_RELEASE_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/$LAST_RELEASE_VERSION/cloud-hypervisor-static" +CH_RELEASE_URL="https://github.com/cloud-hypervisor/cloud-hypervisor/releases/download/${migratable_version}/cloud-hypervisor-static" CH_RELEASE_NAME="cloud-hypervisor-static" pushd "$WORKLOADS_DIR" || exit time wget --quiet $CH_RELEASE_URL -O "$CH_RELEASE_NAME" || exit 1 @@ -74,7 +83,10 @@ PAGE_NUM=$((12288 * 1024 / HUGEPAGESIZE)) echo "$PAGE_NUM" | sudo tee /proc/sys/vm/nr_hugepages sudo chmod a+rwX /dev/hugepages +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test $test_features "live_migration_parallel::$test_filter" -- ${test_binary_args[*]} RES=$? 
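The `MIGRATABLE_VERSION` handling above selects which published release is downloaded as the live-migration source binary (default `v39.0`), and `dev_cli.sh` now forwards the variable into the test container. A minimal usage sketch, assuming a repository checkout with `dev_cli.sh` run from its root; the overridden version below is only an example:

```bash
# Run the live-migration suite against the default source release (v39.0).
./scripts/dev_cli.sh tests --integration-live-migration

# Override the source release; the test script validates the value against
# ^v[0-9]{2,}\.[0-9]$ and exits with an error if it does not match
# (e.g. "47.0" without the leading "v" is rejected).
MIGRATABLE_VERSION=v47.0 ./scripts/dev_cli.sh tests --integration-live-migration
```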
diff --git a/scripts/run_integration_tests_rate_limiter.sh b/scripts/run_integration_tests_rate_limiter.sh index 56fb91e6e0..2e14aaffb2 100755 --- a/scripts/run_integration_tests_rate_limiter.sh +++ b/scripts/run_integration_tests_rate_limiter.sh @@ -55,7 +55,10 @@ fi cargo build --features mshv --all --release --target "$BUILD_TARGET" +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test $test_features "rate_limiter::$test_filter" -- --test-threads=1 ${test_binary_args[*]} RES=$? diff --git a/scripts/run_integration_tests_sgx.sh b/scripts/run_integration_tests_sgx.sh deleted file mode 100755 index b6549b6288..0000000000 --- a/scripts/run_integration_tests_sgx.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env bash -# shellcheck disable=SC2048,SC2086 -set -x - -# shellcheck source=/dev/null -source "$HOME"/.cargo/env -source "$(dirname "$0")"/test-util.sh - -process_common_args "$@" - -if [[ "$hypervisor" = "mshv" ]]; then - echo "Unsupported SGX test for MSHV" - exit 1 -fi - -WORKLOADS_DIR="$HOME/workloads" -mkdir -p "$WORKLOADS_DIR" - -download_hypervisor_fw - -JAMMY_OS_IMAGE_NAME="jammy-server-cloudimg-amd64-custom-20241017-0.qcow2" -JAMMY_OS_IMAGE_URL="https://ch-images.azureedge.net/$JAMMY_OS_IMAGE_NAME" -JAMMY_OS_IMAGE="$WORKLOADS_DIR/$JAMMY_OS_IMAGE_NAME" -if [ ! -f "$JAMMY_OS_IMAGE" ]; then - pushd "$WORKLOADS_DIR" || exit - time wget --quiet $JAMMY_OS_IMAGE_URL || exit 1 - popd || exit -fi - -JAMMY_OS_RAW_IMAGE_NAME="jammy-server-cloudimg-amd64-custom-20241017-0.raw" -JAMMY_OS_RAW_IMAGE="$WORKLOADS_DIR/$JAMMY_OS_RAW_IMAGE_NAME" -if [ ! -f "$JAMMY_OS_RAW_IMAGE" ]; then - pushd "$WORKLOADS_DIR" || exit - time qemu-img convert -p -f qcow2 -O raw $JAMMY_OS_IMAGE_NAME $JAMMY_OS_RAW_IMAGE_NAME || exit 1 - popd || exit -fi - -CFLAGS="" -if [[ "${BUILD_TARGET}" == "x86_64-unknown-linux-musl" ]]; then - # shellcheck disable=SC2034 - CFLAGS="-I /usr/include/x86_64-linux-musl/ -idirafter /usr/include/" -fi - -cargo build --features mshv --all --release --target "$BUILD_TARGET" - -export RUST_BACKTRACE=1 - -time cargo test "sgx::$test_filter" -- ${test_binary_args[*]} -RES=$? - -exit $RES diff --git a/scripts/run_integration_tests_vfio.sh b/scripts/run_integration_tests_vfio.sh index 4d7bac60a4..b182c6612c 100755 --- a/scripts/run_integration_tests_vfio.sh +++ b/scripts/run_integration_tests_vfio.sh @@ -26,7 +26,10 @@ fi cargo build --features mshv --all --release --target "$BUILD_TARGET" +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test "vfio::test_nvidia" -- --test-threads=1 ${test_binary_args[*]} RES=$? 
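Each integration test script now re-exports `RUSTFLAGS` under the "Common configuration for every test run" comment, so the `--cfg devcli_testenv` flag injected by `dev_cli.sh` reaches the test build; `tests/integration.rs` is gated on that cfg further down in this diff. A rough sketch of an equivalent manual run outside the container, mirroring the VFIO script above (the target triple and test name are illustrative, not a supported workflow):

```bash
# Without this flag the integration test crate compiles to an empty test set,
# because tests/integration.rs now starts with #![cfg(devcli_testenv)].
export RUSTFLAGS="--cfg devcli_testenv"
export RUST_BACKTRACE=1

cargo build --features mshv --all --release --target x86_64-unknown-linux-gnu
cargo test "vfio::test_nvidia" -- --test-threads=1
```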
diff --git a/scripts/run_integration_tests_windows_aarch64.sh b/scripts/run_integration_tests_windows_aarch64.sh index 92d66f805d..0190aa1c6c 100755 --- a/scripts/run_integration_tests_windows_aarch64.sh +++ b/scripts/run_integration_tests_windows_aarch64.sh @@ -36,7 +36,9 @@ dmsetup mknodes dmsetup create windows-snapshot-base --table "0 $img_blk_size snapshot-origin /dev/mapper/windows-base" dmsetup mknodes +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" cargo build --all --release --target "$BUILD_TARGET" diff --git a/scripts/run_integration_tests_windows_x86_64.sh b/scripts/run_integration_tests_windows_x86_64.sh index 2b11a6e687..714f6e4788 100755 --- a/scripts/run_integration_tests_windows_x86_64.sh +++ b/scripts/run_integration_tests_windows_x86_64.sh @@ -41,7 +41,9 @@ dmsetup mknodes cargo build --features mshv --all --release --target "$BUILD_TARGET" +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" # Only run with 1 thread to avoid tests interfering with one another because # Windows has a static IP configured diff --git a/scripts/run_integration_tests_x86_64.sh b/scripts/run_integration_tests_x86_64.sh index 11f5d6664a..662e274a93 100755 --- a/scripts/run_integration_tests_x86_64.sh +++ b/scripts/run_integration_tests_x86_64.sh @@ -20,9 +20,13 @@ fi cp scripts/sha1sums-x86_64 "$WORKLOADS_DIR" -download_hypervisor_fw +if [ ! -f "$WORKLOADS_DIR/hypervisor-fw" ]; then + download_hypervisor_fw +fi -download_ovmf +if [ ! -f "$WORKLOADS_DIR/CLOUDHV.fd" ]; then + download_ovmf +fi FOCAL_OS_IMAGE_NAME="focal-server-cloudimg-amd64-custom-20210609-0.qcow2" FOCAL_OS_IMAGE_URL="https://ch-images.azureedge.net/$FOCAL_OS_IMAGE_NAME" @@ -173,14 +177,16 @@ ulimit -l unlimited # Set number of open descriptors high enough for VFIO tests to run ulimit -n 4096 +# Common configuration for every test run export RUST_BACKTRACE=1 +export RUSTFLAGS="$RUSTFLAGS" + time cargo test --release --target "$BUILD_TARGET" $test_features "common_parallel::$test_filter" -- ${test_binary_args[*]} RES=$? # Run some tests in sequence since the result could be affected by other tests # running in parallel. if [ $RES -eq 0 ]; then - export RUST_BACKTRACE=1 time cargo test --release --target "$BUILD_TARGET" $test_features "common_sequential::$test_filter" -- --test-threads=1 ${test_binary_args[*]} RES=$? fi @@ -188,10 +194,22 @@ fi # Run tests on dbus_api if [ $RES -eq 0 ]; then cargo build --features "mshv,dbus_api" --all --release --target "$BUILD_TARGET" - export RUST_BACKTRACE=1 # integration tests now do not reply on build feature "dbus_api" time cargo test $test_features "dbus_api::$test_filter" -- ${test_binary_args[*]} RES=$? fi +# Run tests on fw_cfg +if [ $RES -eq 0 ]; then + cargo build --features "mshv,fw_cfg" --all --release --target "$BUILD_TARGET" + time cargo test "fw_cfg::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} + RES=$? +fi + +if [ $RES -eq 0 ]; then + cargo build --features ivshmem --all --release --target "$BUILD_TARGET" + time cargo test $test_features "ivshmem::$test_filter" --target "$BUILD_TARGET" -- ${test_binary_args[*]} + RES=$? 
+fi + exit $RES diff --git a/scripts/sha1sums-x86_64 b/scripts/sha1sums-x86_64 index 7fd5c56d87..e198816035 100644 --- a/scripts/sha1sums-x86_64 +++ b/scripts/sha1sums-x86_64 @@ -3,3 +3,5 @@ f1eccdc5e1b515dbad294426ab081b47ebfb97c0 focal-server-cloudimg-amd64-custom-2021 7f5a8358243a96adf61f5c20139b29f308f2c0e3 focal-server-cloudimg-amd64-custom-20210609-0.raw 5f10738920efb74f0bf854cadcd1b1fd544e49c8 jammy-server-cloudimg-amd64-custom-20241017-0.qcow2 c1dfbe7abde400e675844568dbe9d3914222f6de jammy-server-cloudimg-amd64-custom-20241017-0.raw +540ac358429305d7aa94e15363665d1c9d845982 hypervisor-fw +4e96fd0914a44005d40707b2b0c7e829e4086bd5 CLOUDHV.fd diff --git a/serial_buffer/Cargo.toml b/serial_buffer/Cargo.toml index 2c3993cff4..767c8a97ff 100644 --- a/serial_buffer/Cargo.toml +++ b/serial_buffer/Cargo.toml @@ -1,5 +1,8 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "serial_buffer" version = "0.1.0" + +[lints] +workspace = true diff --git a/serial_buffer/src/lib.rs b/serial_buffer/src/lib.rs index 6b9182d4c7..f914f2ef54 100644 --- a/serial_buffer/src/lib.rs +++ b/serial_buffer/src/lib.rs @@ -5,8 +5,8 @@ use std::collections::VecDeque; use std::io::Write; -use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; const MAX_BUFFER_SIZE: usize = 1 << 20; diff --git a/src/bin/ch-remote.rs b/src/bin/ch-remote.rs index 9d8c4f68b4..9e042ebe5b 100644 --- a/src/bin/ch-remote.rs +++ b/src/bin/ch-remote.rs @@ -9,14 +9,17 @@ mod test_util; use std::io::Read; use std::marker::PhantomData; +use std::num::NonZeroU32; use std::os::unix::net::UnixStream; +use std::path::PathBuf; use std::process; use api_client::{ - simple_api_command, simple_api_command_with_fds, simple_api_full_command, - Error as ApiClientError, + Error as ApiClientError, simple_api_command, simple_api_command_with_fds, + simple_api_full_command, }; use clap::{Arg, ArgAction, ArgMatches, Command}; +use log::error; use option_parser::{ByteSized, ByteSizedParseError}; use thiserror::Error; use vmm::config::RestoreConfig; @@ -319,6 +322,22 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu )?; simple_api_command(socket, "PUT", "resize", Some(&resize)).map_err(Error::HttpApiClient) } + Some("resize-disk") => { + let resize_disk = resize_disk_config( + matches + .subcommand_matches("resize-disk") + .unwrap() + .get_one::("disk") + .unwrap(), + matches + .subcommand_matches("resize-disk") + .unwrap() + .get_one::("size") + .unwrap(), + )?; + simple_api_command(socket, "PUT", "resize-disk", Some(&resize_disk)) + .map_err(Error::HttpApiClient) + } Some("resize-zone") => { let resize_zone = resize_zone_config( matches @@ -473,11 +492,34 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .subcommand_matches("send-migration") .unwrap() .get_one::("send_migration_config") - .unwrap(), + .unwrap() + .to_owned(), matches .subcommand_matches("send-migration") .unwrap() .get_flag("send_migration_local"), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("downtime-ms") + .unwrap_or(&300), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("migration-timeout-s") + .unwrap_or(&3600), + matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("connections") + .copied() + .and_then(NonZeroU32::new) + .unwrap_or(NonZeroU32::new(1).unwrap()), + matches + .subcommand_matches("send-migration") + .unwrap() + 
.get_one::("tls-dir") + .cloned(), ); simple_api_command(socket, "PUT", "send-migration", Some(&send_migration_data)) .map_err(Error::HttpApiClient) @@ -488,7 +530,13 @@ fn rest_api_do_command(matches: &ArgMatches, socket: &mut UnixStream) -> ApiResu .subcommand_matches("receive-migration") .unwrap() .get_one::("receive_migration_config") - .unwrap(), + .unwrap() + .to_owned(), + matches + .subcommand_matches("receive-migration") + .unwrap() + .get_one::("tls-dir") + .cloned(), ); simple_api_command( socket, @@ -692,6 +740,16 @@ fn dbus_api_do_command(matches: &ArgMatches, proxy: &DBusApi1ProxyBlocking<'_>) .subcommand_matches("send-migration") .unwrap() .get_flag("send_migration_local"), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("downtime-ms") + .unwrap_or(&300), + *matches + .subcommand_matches("send-migration") + .unwrap() + .get_one::("migration-timeout-s") + .unwrap_or(&3600), ); proxy.api_vm_send_migration(&send_migration_data) } @@ -724,7 +782,7 @@ fn resize_config( memory: Option<&str>, balloon: Option<&str>, ) -> Result { - let desired_vcpus: Option = if let Some(cpus) = cpus { + let desired_vcpus: Option = if let Some(cpus) = cpus { Some(cpus.parse().map_err(Error::InvalidCpuCount)?) } else { None @@ -761,6 +819,18 @@ fn resize_config( Ok(serde_json::to_string(&resize).unwrap()) } +fn resize_disk_config(id: &str, size: &str) -> Result { + let resize_zone = vmm::api::VmResizeDiskData { + id: id.to_owned(), + desired_size: size + .parse::() + .map_err(Error::InvalidMemorySize)? + .0, + }; + + Ok(serde_json::to_string(&resize_zone).unwrap()) +} + fn resize_zone_config(id: &str, size: &str) -> Result { let resize_zone = vmm::api::VmResizeZoneData { id: id.to_owned(), @@ -873,18 +943,35 @@ fn coredump_config(destination_url: &str) -> String { serde_json::to_string(&coredump_config).unwrap() } -fn receive_migration_data(url: &str) -> String { +fn receive_migration_data(url: String, tls_dir: Option) -> String { let receive_migration_data = vmm::api::VmReceiveMigrationData { - receiver_url: url.to_owned(), + receiver_url: url, + tcp_serial_url: None, + // Only FDs transmitted via an SCM_RIGHTS UNIX Domain Socket message + // are valid. Transmitting specific FD nums via the HTTP API is + // almost always invalid. + net_fds: None, + tls_dir, }; serde_json::to_string(&receive_migration_data).unwrap() } -fn send_migration_data(url: &str, local: bool) -> String { +fn send_migration_data( + url: String, + local: bool, + downtime: u64, + migration_timeout: u64, + connections: NonZeroU32, + tls_dir: Option, +) -> String { let send_migration_data = vmm::api::VmSendMigrationData { - destination_url: url.to_owned(), + destination_url: url, local, + downtime, + migration_timeout, + connections, + tls_dir, }; serde_json::to_string(&send_migration_data).unwrap() @@ -996,7 +1083,14 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .arg( Arg::new("receive_migration_config") .index(1) + // Live migration with net_fds not supported in ch-remote. 
.help(""), + ) + .arg( + Arg::new("tls-dir") + .long("tls-dir") + .help("directory with TLS certificates") + .num_args(1), ), Command::new("remove-device") .about("Remove VFIO and PCI device") @@ -1021,6 +1115,20 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .help("New memory size in bytes (supports K/M/G suffix)") .num_args(1), ), + Command::new("resize-disk") + .about("grows/shrinks an attached disk") + .arg( + Arg::new("disk") + .long("disk") + .help("disk identifier") + .num_args(1), + ) + .arg( + Arg::new("size") + .long("size") + .help("new disk size") + .num_args(1), + ), Command::new("resize-zone") .about("Resize a memory zone") .arg( @@ -1045,6 +1153,32 @@ fn get_cli_commands_sorted() -> Box<[Command]> { Command::new("resume").about("Resume the VM"), Command::new("send-migration") .about("Initiate a VM migration") + .arg( + Arg::new("connections") + .long("connections") + .help("The number of connections to use for the migration") + .num_args(1) + .value_parser(clap::value_parser!(u32)) + .default_value("1"), + ) + .arg( + Arg::new("downtime-ms") + .long("downtime-ms") + .visible_alias("downtime") + .help("Set the expected maximum downtime in milliseconds") + .num_args(1) + .value_parser(clap::value_parser!(u64)) + .default_value("300"), + ) + .arg( + Arg::new("migration-timeout-s") + .long("migration-timeout-s") + .visible_alias("migration-timeout") + .help("Set the maximum allowed migration time in seconds") + .num_args(1) + .value_parser(clap::value_parser!(u64)) + .default_value("3600"), + ) .arg( Arg::new("send_migration_config") .index(1) @@ -1055,6 +1189,12 @@ fn get_cli_commands_sorted() -> Box<[Command]> { .long("local") .num_args(0) .action(ArgAction::SetTrue), + ) + .arg( + Arg::new("tls-dir") + .long("tls-dir") + .help("directory with TLS certificates") + .num_args(1), ), Command::new("shutdown").about("Shutdown the VM"), Command::new("shutdown-vmm").about("Shutdown the VMM"), @@ -1071,6 +1211,7 @@ fn get_cli_commands_sorted() -> Box<[Command]> { } fn main() { + env_logger::init(); let app = Command::new("ch-remote") .author(env!("CARGO_PKG_AUTHORS")) .version(env!("BUILD_VERSION")) @@ -1092,7 +1233,7 @@ fn main() { #[cfg(not(feature = "dbus_api"))] (Some(api_sock),) => TargetApi::HttpApi( UnixStream::connect(api_sock).unwrap_or_else(|e| { - eprintln!("Error opening HTTP socket: {e}"); + error!("Error opening HTTP socket: {e}"); process::exit(1) }), PhantomData, @@ -1100,7 +1241,7 @@ fn main() { #[cfg(feature = "dbus_api")] (Some(api_sock), None, None) => TargetApi::HttpApi( UnixStream::connect(api_sock).unwrap_or_else(|e| { - eprintln!("Error opening HTTP socket: {e}"); + error!("Error opening HTTP socket: {e}"); process::exit(1) }), PhantomData, @@ -1114,19 +1255,21 @@ fn main() { ) .map_err(Error::DBusApiClient) .unwrap_or_else(|e| { - eprintln!("Error creating D-Bus proxy: {e}"); + error!("Error creating D-Bus proxy: {e}"); process::exit(1) }), ), #[cfg(feature = "dbus_api")] (Some(_), Some(_) | None, Some(_) | None) => { - println!( + error!( "`api-socket` and (dbus-service-name or dbus-object-path) are mutually exclusive" ); process::exit(1); } _ => { - println!("Please either provide the api-socket option or dbus-service-name and dbus-object-path options"); + error!( + "Please either provide the api-socket option or dbus-service-name and dbus-object-path options" + ); process::exit(1); } }; diff --git a/src/lib.rs b/src/lib.rs index 1596a13f47..836ed8e2ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,8 @@ use std::error::Error; +use log::error; + /// 
Prints a chain of errors to the user in a consistent manner. /// The user will see a clear chain of errors, followed by debug output /// for opening issues. @@ -17,6 +19,9 @@ pub fn cli_print_error_chain<'a>( &'a (dyn Error + 'static), ) -> Option, ) { + // Debug info. + error!("Fatal error: {top_error:?}"); + eprint!("Error: {component} exited with the following "); if top_error.source().is_none() { eprintln!("error:"); @@ -38,7 +43,4 @@ pub fn cli_print_error_chain<'a>( } }); } - - eprintln!(); - eprintln!("Debug Info: {top_error:?}"); } diff --git a/src/main.rs b/src/main.rs index 4a0fbe91f9..9436b3e160 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,27 +8,29 @@ mod test_util; use std::fs::File; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; -use std::sync::mpsc::channel; use std::sync::Mutex; +use std::sync::mpsc::channel; use std::{env, io}; use clap::{Arg, ArgAction, ArgGroup, ArgMatches, Command}; use event_monitor::event; use libc::EFD_NONBLOCK; -use log::{warn, LevelFilter}; +use log::{LevelFilter, error, info, warn}; use option_parser::OptionParser; use seccompiler::SeccompAction; use signal_hook::consts::SIGSYS; use thiserror::Error; +use vmm::api::ApiAction; #[cfg(feature = "dbus_api")] -use vmm::api::dbus::{dbus_api_graceful_shutdown, DBusApiOptions}; +use vmm::api::dbus::{DBusApiOptions, dbus_api_graceful_shutdown}; use vmm::api::http::http_api_graceful_shutdown; -use vmm::api::ApiAction; use vmm::config::{RestoreConfig, VmParams}; use vmm::landlock::{Landlock, LandlockError}; use vmm::vm_config; -#[cfg(target_arch = "x86_64")] -use vmm::vm_config::SgxEpcConfig; +#[cfg(feature = "fw_cfg")] +use vmm::vm_config::FwCfgConfig; +#[cfg(feature = "ivshmem")] +use vmm::vm_config::IvshmemConfig; use vmm::vm_config::{ BalloonConfig, DeviceConfig, DiskConfig, FsConfig, LandlockConfig, NetConfig, NumaConfig, PciSegmentConfig, PmemConfig, RateLimiterGroupConfig, TpmConfig, UserDeviceConfig, VdpaConfig, @@ -131,29 +133,25 @@ impl log::Log for Logger { let now = std::time::Instant::now(); let duration = now.duration_since(self.start); + let duration_s = duration.as_secs_f32(); - if record.file().is_some() && record.line().is_some() { - write!( - *(*(self.output.lock().unwrap())), - "cloud-hypervisor: {:.6?}: <{}> {}:{}:{} -- {}\r\n", - duration, - std::thread::current().name().unwrap_or("anonymous"), - record.level(), - record.file().unwrap(), - record.line().unwrap(), - record.args() - ) + let location = if let (Some(file), Some(line)) = (record.file(), record.line()) { + format!("{}:{}", file, line) } else { - write!( - *(*(self.output.lock().unwrap())), - "cloud-hypervisor: {:.6?}: <{}> {}:{} -- {}\r\n", - duration, - std::thread::current().name().unwrap_or("anonymous"), - record.level(), - record.target(), - record.args() - ) - } + record.target().to_string() + }; + + let mut out = self.output.lock().unwrap(); + write!( + &mut *out, + // 10: 6 decimal places + sep => 0..999s will be properly aligned + "cloud-hypervisor: {:>10.6?}s: <{}> {}:{} -- {}\r\n", + duration_s, + std::thread::current().name().unwrap_or("anonymous"), + record.level(), + location, + record.args(), + ) .ok(); } fn flush(&self) {} @@ -269,6 +267,12 @@ fn get_cli_options_sorted( .help(FsConfig::SYNTAX) .num_args(1..) 
.group("vm-config"), + #[cfg(feature = "fw_cfg")] + Arg::new("fw-cfg-config") + .long("fw-cfg-config") + .help(FwCfgConfig::SYNTAX) + .num_args(1) + .group("vm-payload"), #[cfg(feature = "guest_debug")] Arg::new("gdb") .long("gdb") @@ -292,6 +296,12 @@ fn get_cli_options_sorted( .help("Path to initramfs image") .num_args(1) .group("vm-config"), + #[cfg(feature = "ivshmem")] + Arg::new("ivshmem") + .long("ivshmem") + .help(IvshmemConfig::SYNTAX) + .num_args(1) + .group("vm-config"), Arg::new("kernel") .long("kernel") .help( @@ -410,15 +420,9 @@ fn get_cli_options_sorted( .default_value("true"), Arg::new("serial") .long("serial") - .help("Control serial port: off|null|pty|tty|file=|socket=") + .help("Control serial port: off|null|pty|tty|file=|socket=|tcp=") .default_value("null") .group("vm-config"), - #[cfg(target_arch = "x86_64")] - Arg::new("sgx-epc") - .long("sgx-epc") - .help(SgxEpcConfig::SYNTAX) - .num_args(1..) - .group("vm-config"), Arg::new("tpm") .long("tpm") .num_args(1) @@ -553,15 +557,15 @@ fn start_vmm(cmd_arguments: ArgMatches) -> Result, Error> { // handler safe functions (writing to stderr) and manipulating signals. unsafe { signal_hook::low_level::register(signal_hook::consts::SIGSYS, || { - eprint!( + eprintln!( "\n==== Possible seccomp violation ====\n\ Try running with `strace -ff` to identify the cause and open an issue: \ - https://github.com/cloud-hypervisor/cloud-hypervisor/issues/new\n" + https://github.com/cloud-hypervisor/cloud-hypervisor/issues/new" ); signal_hook::low_level::emulate_default_handler(SIGSYS).unwrap(); }) } - .map_err(|e| eprintln!("Error adding SIGSYS signal handler: {e}")) + .map_err(|e| error!("Error adding SIGSYS signal handler: {e}")) .ok(); } @@ -575,13 +579,13 @@ fn start_vmm(cmd_arguments: ArgMatches) -> Result, Error> { // dedicated signal handling thread we'll start in a bit. 
for sig in &vmm::vm::Vm::HANDLED_SIGNALS { if let Err(e) = block_signal(*sig) { - eprintln!("Error blocking signals: {e}"); + error!("Error blocking signals: {e}"); } } for sig in &vmm::Vmm::HANDLED_SIGNALS { if let Err(e) = block_signal(*sig) { - eprintln!("Error blocking signals: {e}"); + error!("Error blocking signals: {e}"); } } @@ -746,10 +750,10 @@ fn start_vmm(cmd_arguments: ArgMatches) -> Result, Error> { Ok(()) })(); - if r.is_err() { - if let Err(e) = exit_evt.write(1) { - warn!("writing to exit EventFd: {e}"); - } + if r.is_err() + && let Err(e) = exit_evt.write(1) + { + warn!("writing to exit EventFd: {e}"); } if landlock_enable { @@ -853,6 +857,8 @@ fn main() { compile_error!("Feature 'tdx' and 'sev_snp' are mutually exclusive."); #[cfg(all(feature = "sev_snp", not(target_arch = "x86_64")))] compile_error!("Feature 'sev_snp' needs target 'x86_64'"); + #[cfg(all(feature = "fw_cfg", target_arch = "riscv64"))] + compile_error!("Feature 'fw_cfg' needs targets 'x86_64' or 'aarch64'"); #[cfg(feature = "dhat-heap")] let _profiler = dhat::Profiler::new_heap(); @@ -881,6 +887,7 @@ fn main() { let exit_code = match start_vmm(cmd_arguments) { Ok(path) => { path.map(|s| std::fs::remove_file(s).ok()); + info!("Cloud Hypervisor exited successfully"); 0 } Err(top_error) => { @@ -954,6 +961,7 @@ mod unit_tests { max_phys_bits: 46, affinity: None, features: CpuFeatures::default(), + profile: Default::default(), }, memory: MemoryConfig { size: 536_870_912, @@ -977,6 +985,8 @@ mod unit_tests { igvm: None, #[cfg(feature = "sev_snp")] host_data: None, + #[cfg(feature = "fw_cfg")] + fw_cfg_config: None, }), rate_limit_groups: None, disks: None, @@ -984,6 +994,7 @@ mod unit_tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -993,12 +1004,16 @@ mod unit_tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), @@ -1010,8 +1025,6 @@ mod unit_tests { #[cfg(feature = "pvmemcontrol")] pvmemcontrol: None, iommu: false, - #[cfg(target_arch = "x86_64")] - sgx_epc: None, numa: None, watchdog: false, #[cfg(feature = "guest_debug")] @@ -1022,6 +1035,8 @@ mod unit_tests { preserved_fds: None, landlock_enable: false, landlock_rules: None, + #[cfg(feature = "ivshmem")] + ivshmem: None, }; assert_eq!(expected_vm_config, result_vm_config); @@ -1081,8 +1096,7 @@ mod unit_tests { #[test] fn test_valid_vm_config_memory() { - vec![ - ( + [( vec!["cloud-hypervisor", "--kernel", "/path/to/kernel", "--memory", "size=1073741824"], r#"{ "payload": {"kernel": "/path/to/kernel"}, @@ -1137,8 +1151,7 @@ mod unit_tests { "memory": {"size": 1073741824, "hotplug_method": "VirtioMem", "hotplug_size": 1073741824} }"#, true, - ), - ] + )] .iter() .for_each(|(cli, openapi, equal)| { compare_vm_config_cli_vs_json(cli, openapi, *equal); @@ -1202,6 +1215,24 @@ mod unit_tests { }"#, true, ), + ( + vec![ + "cloud-hypervisor", + "--kernel", + "/path/to/kernel", + "--disk", + "path=/path/to/disk/1,addr=15.0", + "path=/path/to/disk/2", + ], + r#"{ + "payload": {"kernel": "/path/to/kernel"}, + "disks": [ + {"path": "/path/to/disk/1", "bdf_device": 21}, + {"path": "/path/to/disk/2"} + ] + }"#, + true, + ), ( vec![ "cloud-hypervisor", @@ -1289,7 +1320,7 @@ mod unit_tests { #[test] fn test_valid_vm_config_net() { - vec![ + [ 
// This test is expected to fail because the default MAC address is // randomly generated. There's no way we can have twice the same // default value. @@ -1413,6 +1444,20 @@ mod unit_tests { }"#, true, ), + ( + vec![ + "cloud-hypervisor", "--kernel", "/path/to/kernel", + "--net", + "mac=12:34:56:78:90:ab,host_mac=34:56:78:90:ab:cd,tap=tap0,ip=1.2.3.4,mask=5.6.7.8,addr=08.0", + ], + r#"{ + "payload": {"kernel": "/path/to/kernel"}, + "net": [ + {"mac": "12:34:56:78:90:ab", "host_mac": "34:56:78:90:ab:cd", "tap": "tap0", "ip": "1.2.3.4", "mask": "5.6.7.8", "num_queues": 2, "queue_size": 256, "bdf_device": 8} + ] + }"#, + true, + ), #[cfg(target_arch = "x86_64")] ( vec![ @@ -1484,11 +1529,11 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--rng", - "src=/path/to/entropy/source", + "src=/path/to/entropy/source,addr=11.0", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, - "rng": {"src": "/path/to/entropy/source"} + "rng": {"src": "/path/to/entropy/source", "bdf_device": 17} }"#, true, )] @@ -1505,14 +1550,14 @@ mod unit_tests { "cloud-hypervisor", "--kernel", "/path/to/kernel", "--memory", "shared=true", "--fs", - "tag=virtiofs1,socket=/path/to/sock1", + "tag=virtiofs1,socket=/path/to/sock1,addr=10.0", "tag=virtiofs2,socket=/path/to/sock2", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "memory" : { "shared": true, "size": 536870912 }, "fs": [ - {"tag": "virtiofs1", "socket": "/path/to/sock1"}, + {"tag": "virtiofs1", "socket": "/path/to/sock1", "bdf_device": 16}, {"tag": "virtiofs2", "socket": "/path/to/sock2"} ] }"#, @@ -1584,13 +1629,13 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--pmem", - "file=/path/to/img/1,size=1G", + "file=/path/to/img/1,size=1G,addr=1F.0", "file=/path/to/img/2,size=2G", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "pmem": [ - {"file": "/path/to/img/1", "size": 1073741824}, + {"file": "/path/to/img/1", "size": 1073741824,"bdf_device": 31}, {"file": "/path/to/img/2", "size": 2147483648} ] }"#, @@ -1767,7 +1812,7 @@ mod unit_tests { #[test] #[cfg(target_arch = "x86_64")] fn test_valid_vm_config_devices() { - vec![ + [ ( vec![ "cloud-hypervisor", @@ -1868,13 +1913,13 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--vdpa", - "path=/path/to/device/1", + "path=/path/to/device/1,addr=18.0", "path=/path/to/device/2,num_queues=2", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, "vdpa": [ - {"path": "/path/to/device/1", "num_queues": 1}, + {"path": "/path/to/device/1", "num_queues": 1, "bdf_device": 24}, {"path": "/path/to/device/2", "num_queues": 2} ] }"#, @@ -1913,11 +1958,11 @@ mod unit_tests { "--kernel", "/path/to/kernel", "--vsock", - "cid=123,socket=/path/to/sock/1", + "cid=123,socket=/path/to/sock/1,addr=0F.0", ], r#"{ "payload": {"kernel": "/path/to/kernel"}, - "vsock": {"cid": 123, "socket": "/path/to/sock/1"} + "vsock": {"cid": 123, "socket": "/path/to/sock/1", "bdf_device": 15} }"#, true, ), diff --git a/test_infra/Cargo.toml b/test_infra/Cargo.toml index 0374bb0fad..e6ea592c39 100644 --- a/test_infra/Cargo.toml +++ b/test_infra/Cargo.toml @@ -1,16 +1,18 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "test_infra" version = "0.1.0" [dependencies] -dirs = "6.0.0" -epoll = "4.3.3" -libc = "0.2.167" -serde = { version = "1.0.208", features = ["derive", "rc"] } +dirs = { workspace = true } +epoll = { workspace = true } +libc = { workspace = true } serde_json = { workspace = true } -ssh2 = { version = "0.9.4", features = ["vendored-openssl"] } +ssh2 = { version = "0.9.5", features = 
["vendored-openssl"] } thiserror = { workspace = true } vmm-sys-util = { workspace = true } -wait-timeout = "0.2.0" +wait-timeout = { workspace = true } + +[lints] +workspace = true diff --git a/test_infra/src/lib.rs b/test_infra/src/lib.rs index df47de835c..8b260a0a63 100644 --- a/test_infra/src/lib.rs +++ b/test_infra/src/lib.rs @@ -481,13 +481,14 @@ pub fn rate_limited_copy, Q: AsRef>(from: P, to: Q) -> io:: match fs::copy(&from, &to) { Err(e) => { - if let Some(errno) = e.raw_os_error() { - if errno == libc::ENOSPC { - eprintln!("Copy returned ENOSPC. Attempt {i} of 10. Sleeping."); - thread::sleep(std::time::Duration::new(60, 0)); - continue; - } + if let Some(errno) = e.raw_os_error() + && errno == libc::ENOSPC + { + eprintln!("Copy returned ENOSPC. Attempt {i} of 10. Sleeping."); + thread::sleep(std::time::Duration::new(60, 0)); + continue; } + return Err(e); } Ok(i) => return Ok(i), @@ -1061,24 +1062,6 @@ impl Guest { } } - #[cfg(target_arch = "x86_64")] - pub fn check_sgx_support(&self) -> Result<(), Error> { - self.ssh_command( - "cpuid -l 0x7 -s 0 | tr -s [:space:] | grep -q 'SGX: \ - Software Guard Extensions supported = true'", - )?; - self.ssh_command( - "cpuid -l 0x7 -s 0 | tr -s [:space:] | grep -q 'SGX_LC: \ - SGX launch config supported = true'", - )?; - self.ssh_command( - "cpuid -l 0x12 -s 0 | tr -s [:space:] | grep -q 'SGX1 \ - supported = true'", - )?; - - Ok(()) - } - pub fn get_pci_bridge_class(&self) -> Result { Ok(self .ssh_command("cat /sys/bus/pci/devices/0000:00:00.0/class")? @@ -1112,12 +1095,11 @@ impl Guest { let vendors: Vec<&str> = vendors.split('\n').collect(); for (index, d_id) in devices.iter().enumerate() { - if *d_id == device_id { - if let Some(v_id) = vendors.get(index) { - if *v_id == vendor_id { - return Ok(true); - } - } + if *d_id == device_id + && let Some(v_id) = vendors.get(index) + && *v_id == vendor_id + { + return Ok(true); } } @@ -1136,10 +1118,12 @@ impl Guest { thread::sleep(std::time::Duration::new(10, 0)); // Write something to vsock from the host - assert!(exec_host_command_status(&format!( - "echo -e \"CONNECT 16\\nHelloWorld!\" | socat - UNIX-CONNECT:{socket}" - )) - .success()); + assert!( + exec_host_command_status(&format!( + "echo -e \"CONNECT 16\\nHelloWorld!\" | socat - UNIX-CONNECT:{socket}" + )) + .success() + ); // Wait for the thread to terminate. 
listen_socat.join().unwrap(); @@ -1152,10 +1136,11 @@ impl Guest { #[cfg(target_arch = "x86_64")] pub fn check_nvidia_gpu(&self) { - assert!(self - .ssh_command("nvidia-smi") - .unwrap() - .contains("NVIDIA L40S")); + assert!( + self.ssh_command("nvidia-smi") + .unwrap() + .contains("NVIDIA L40S") + ); } pub fn reboot_linux(&self, current_reboot_count: u32, custom_timeout: Option) { @@ -1244,18 +1229,14 @@ impl Guest { } } +#[derive(Default)] pub enum VerbosityLevel { + #[default] Warn, Info, Debug, } -impl Default for VerbosityLevel { - fn default() -> Self { - Self::Warn - } -} - impl Display for VerbosityLevel { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use VerbosityLevel::*; @@ -1352,11 +1333,9 @@ impl<'a> GuestCommand<'a> { if pipesize >= PIPE_SIZE && pipesize1 >= PIPE_SIZE { Ok(child) } else { - Err(std::io::Error::other( - format!( - "resizing pipe w/ 'fnctl' failed: stdout pipesize {pipesize}, stderr pipesize {pipesize1}" - ), - )) + Err(std::io::Error::other(format!( + "resizing pipe w/ 'fnctl' failed: stdout pipesize {pipesize}, stderr pipesize {pipesize1}" + ))) } } else { // The caller should call .wait() on the returned child @@ -1776,3 +1755,27 @@ pub fn measure_virtio_net_latency(guest: &Guest, test_timeout: u32) -> Result Option { + let devices: Vec<&str> = output.split("\n\n").collect(); + + for device in devices { + if device.contains(device_desc) { + for line in device.lines() { + let line = line.trim(); + let line_start_str = format!("Region {bar_index}: Memory at"); + // for example: Region 2: Memory at 200000000 (64-bit, non-prefetchable) [size=1M] + if line.starts_with(line_start_str.as_str()) { + let parts: Vec<&str> = line.split_whitespace().collect(); + if parts.len() >= 4 { + let addr_str = parts[4]; + return Some(String::from(addr_str)); + } + } + } + } + } + None +} diff --git a/tests/integration.rs b/tests/integration.rs index 0536864fd2..d3539fe877 100644 --- a/tests/integration.rs +++ b/tests/integration.rs @@ -2,6 +2,7 @@ // // SPDX-License-Identifier: Apache-2.0 // +#![cfg(devcli_testenv)] #![allow(clippy::undocumented_unsafe_blocks)] // When enabling the `mshv` feature, we skip quite some tests and // hence have known dead-code. This annotation silences dead-code @@ -11,14 +12,16 @@ extern crate test_infra; use std::collections::HashMap; -use std::io::{BufRead, Read, Seek, Write}; +use std::ffi::CStr; +use std::fs::OpenOptions; +use std::io::{BufRead, Read, Seek, SeekFrom, Write}; use std::net::TcpListener; use std::os::unix::io::AsRawFd; use std::path::PathBuf; use std::process::{Child, Command, Stdio}; use std::string::String; use std::sync::mpsc::Receiver; -use std::sync::{mpsc, Mutex}; +use std::sync::{Mutex, mpsc}; use std::time::Duration; use std::{fs, io, thread}; @@ -520,8 +523,7 @@ fn temp_snapshot_dir_path(tmp_dir: &TempDir) -> String { } fn temp_vmcore_file_path(tmp_dir: &TempDir) -> String { - let vmcore_file = String::from(tmp_dir.as_path().join("vmcore").to_str().unwrap()); - vmcore_file + String::from(tmp_dir.as_path().join("vmcore").to_str().unwrap()) } // Creates the path for direct kernel boot and return the path. 
@@ -717,10 +719,12 @@ fn setup_ovs_dpdk() { assert!(exec_host_command_status("service openvswitch-switch restart").success()); // Create OVS-DPDK bridge and ports - assert!(exec_host_command_status( - "ovs-vsctl add-br ovsbr0 -- set bridge ovsbr0 datapath_type=netdev", - ) - .success()); + assert!( + exec_host_command_status( + "ovs-vsctl add-br ovsbr0 -- set bridge ovsbr0 datapath_type=netdev", + ) + .success() + ); assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user1 -- set Interface vhost-user1 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient1").success()); assert!(exec_host_command_status("ovs-vsctl add-port ovsbr0 vhost-user2 -- set Interface vhost-user2 type=dpdkvhostuserclient options:vhost-server-path=/tmp/dpdkvhostclient2").success()); assert!(exec_host_command_status("ip link set up dev ovsbr0").success()); @@ -1656,8 +1660,10 @@ fn _test_virtio_fs( "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" ))); } else { - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") + ); } thread::sleep(std::time::Duration::new(10, 0)); @@ -1737,8 +1743,10 @@ fn _test_virtio_fs( "{{\"id\":\"myfs0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" ))); } else { - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myfs0\",\"bdf\":\"0000:00:06.0\"}") + ); } thread::sleep(std::time::Duration::new(10, 0)); @@ -1892,8 +1900,10 @@ fn _test_virtio_vsock(hotplug: bool) { Some(format!("cid=3,socket={socket},id=test0").as_str()), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); thread::sleep(std::time::Duration::new(10, 0)); // Check adding a second one fails assert!(!remote_command( @@ -2110,18 +2120,20 @@ fn get_counters(api_socket: &str) -> Counters { fn pty_read(mut pty: std::fs::File) -> Receiver { let (tx, rx) = mpsc::channel::(); - thread::spawn(move || loop { - thread::sleep(std::time::Duration::new(1, 0)); - let mut buf = [0; 512]; - match pty.read(&mut buf) { - Ok(_bytes) => { - let output = std::str::from_utf8(&buf).unwrap().to_string(); - match tx.send(output) { - Ok(_) => (), - Err(_) => break, + thread::spawn(move || { + loop { + thread::sleep(std::time::Duration::new(1, 0)); + let mut buf = [0; 512]; + match pty.read(&mut buf) { + Ok(_bytes) => { + let output = std::str::from_utf8(&buf).unwrap().to_string(); + match tx.send(output) { + Ok(_) => (), + Err(_) => break, + } } + Err(_) => break, } - Err(_) => break, } }); rx @@ -2255,9 +2267,11 @@ fn _test_virtio_iommu(acpi: bool) { guest.wait_vm_boot(None).unwrap(); // Verify the virtio-iommu device is present. - assert!(guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default()); + assert!( + guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default() + ); // On AArch64, if the guest system boots from FDT, the behavior of IOMMU is a bit // different with ACPI. 
@@ -2317,9 +2331,11 @@ fn get_reboot_count(guest: &Guest) -> u32 { fn enable_guest_watchdog(guest: &Guest, watchdog_sec: u32) { // Check for PCI device - assert!(guest - .does_device_vendor_pair_match("0x1063", "0x1af4") - .unwrap_or_default()); + assert!( + guest + .does_device_vendor_pair_match("0x1063", "0x1af4") + .unwrap_or_default() + ); // Enable systemd watchdog guest @@ -2333,14 +2349,157 @@ fn enable_guest_watchdog(guest: &Guest, watchdog_sec: u32) { fn make_guest_panic(guest: &Guest) { // Check for pvpanic device - assert!(guest - .does_device_vendor_pair_match("0x0011", "0x1b36") - .unwrap_or_default()); + assert!( + guest + .does_device_vendor_pair_match("0x0011", "0x1b36") + .unwrap_or_default() + ); // Trigger guest a panic guest.ssh_command("screen -dmS reboot sh -c \"sleep 5; echo s | tee /proc/sysrq-trigger; echo c | sudo tee /proc/sysrq-trigger\"").unwrap(); } +// ivshmem test +// This case validates reading data from the host (the host writes data to the ivshmem backend file, +// the guest reads it from the ivshmem PCI BAR2 memory) +// and writing data to the host (the guest writes data to the ivshmem PCI BAR2 memory, +// the host reads it from the ivshmem backend file). +// It also checks the size of the shared memory region. +fn _test_ivshmem(guest: &Guest, ivshmem_file_path: String, file_size: &str) { + let test_message_read = String::from("ivshmem device test data read"); + // Modify the backend file data before the test + let mut file = OpenOptions::new() + .read(true) + .write(true) + .open(ivshmem_file_path.as_str()) + .unwrap(); + file.seek(SeekFrom::Start(0)).unwrap(); + file.write_all(test_message_read.as_bytes()).unwrap(); + file.write_all(b"\0").unwrap(); + file.flush().unwrap(); + + let output = fs::read_to_string(ivshmem_file_path.as_str()).unwrap(); + let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); + let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); + let file_message = c_str.to_string_lossy().to_string(); + // Check if the backend file data is correct + assert_eq!(test_message_read, file_message); + + let device_id_line = String::from( + guest + .ssh_command("lspci -D | grep \"Inter-VM shared memory\"") + .unwrap() + .trim(), + ); + // Check if ivshmem exists + assert!(!device_id_line.is_empty()); + let device_id = device_id_line.split(" ").next().unwrap(); + // Check the shared memory size + assert_eq!( + guest + .ssh_command( + format!("lspci -vv -s {device_id} | grep -c \"Region 2.*size={file_size}\"") + .as_str(), + ) + .unwrap() + .trim() + .parse::() + .unwrap_or_default(), + 1 + ); + + // The guest image doesn't have gcc or g++, so use Python for the test :( + // This Python program tries to mmap the ivshmem PCI BAR2 memory and read the data from it.
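+ // The Python helpers below rest on the ivshmem behaviour this test exercises: the device exposes the host's backend file as PCI BAR2, so bytes written to the file on the host become visible at /sys/bus/pci/devices/<bdf>/resource2 in the guest, and guest writes to that resource land back in the backend file.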
+ let ivshmem_test_read = format!( + r#" +import os +import mmap +from ctypes import create_string_buffer, c_char, memmove + +if __name__ == "__main__": + device_path = f"/sys/bus/pci/devices/{device_id}/resource2" + fd = os.open(device_path, os.O_RDWR | os.O_SYNC) + + PAGE_SIZE = os.sysconf('SC_PAGESIZE') + + with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, + prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: + c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) + null_pos = c_buf.raw.find(b'\x00') + valid_data = c_buf.raw[:null_pos] if null_pos != -1 else c_buf.raw + print(valid_data.decode('utf-8', errors='replace'), end="") + shmem.flush() + del c_buf + + os.close(fd) + "# + ); + guest + .ssh_command( + format!( + r#"cat << EOF > test_read.py +{ivshmem_test_read} +EOF +"# + ) + .as_str(), + ) + .unwrap(); + let guest_message = guest.ssh_command("sudo python3 test_read.py").unwrap(); + + // Check the probe message in host and guest + assert_eq!(test_message_read, guest_message); + + let test_message_write = "ivshmem device test data write"; + // Then the program writes a test message to the memory and flush it. + let ivshmem_test_write = format!( + r#" +import os +import mmap +from ctypes import create_string_buffer, c_char, memmove + +if __name__ == "__main__": + device_path = f"/sys/bus/pci/devices/{device_id}/resource2" + test_message = "{test_message_write}" + fd = os.open(device_path, os.O_RDWR | os.O_SYNC) + + PAGE_SIZE = os.sysconf('SC_PAGESIZE') + + with mmap.mmap(fd, PAGE_SIZE, flags=mmap.MAP_SHARED, + prot=mmap.PROT_READ | mmap.PROT_WRITE, offset=0) as shmem: + shmem.flush() + c_buf = (c_char * PAGE_SIZE).from_buffer(shmem) + encoded_msg = test_message.encode('utf-8').ljust(1000, b'\x00') + memmove(c_buf, encoded_msg, len(encoded_msg)) + shmem.flush() + del c_buf + + os.close(fd) + "# + ); + + guest + .ssh_command( + format!( + r#"cat << EOF > test_write.py +{ivshmem_test_write} +EOF +"# + ) + .as_str(), + ) + .unwrap(); + + let _ = guest.ssh_command("sudo python3 test_write.py").unwrap(); + + let output = fs::read_to_string(ivshmem_file_path.as_str()).unwrap(); + let nul_pos = output.as_bytes().iter().position(|&b| b == 0).unwrap(); + let c_str = CStr::from_bytes_until_nul(&output.as_bytes()[..=nul_pos]).unwrap(); + let file_message = c_str.to_string_lossy().to_string(); + // Check to send data from guest to host + assert_eq!(test_message_write, file_message); +} + mod common_parallel { use std::fs::OpenOptions; use std::io::SeekFrom; @@ -2706,7 +2865,6 @@ mod common_parallel { } #[test] - #[cfg(not(feature = "mshv"))] fn test_user_defined_memory_regions() { let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(focal)); @@ -2774,7 +2932,6 @@ mod common_parallel { } #[test] - #[cfg(not(feature = "mshv"))] fn test_guest_numa_nodes() { _test_guest_numa_nodes(false); } @@ -2833,13 +2990,17 @@ mod common_parallel { ), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0001:00:01.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0001:00:01.0\"}") + ); // Check IOMMU setup - assert!(guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default()); + assert!( + guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default() + ); assert_eq!( guest .ssh_command("ls /sys/kernel/iommu_groups/0/devices") @@ -3199,11 +3360,7 @@ mod common_parallel { 
assert_eq!(guest.get_cpu_count().unwrap_or_default(), 1); assert!(guest.get_total_memory().unwrap_or_default() > 480_000); - let grep_cmd = if cfg!(target_arch = "x86_64") { - "grep -c PCI-MSI /proc/interrupts" - } else { - "grep -c ITS-PCI-MSIX /proc/interrupts" - }; + let grep_cmd = "grep -c PCI-MSI /proc/interrupts"; assert_eq!( guest .ssh_command(grep_cmd) @@ -3387,12 +3544,12 @@ mod common_parallel { const FULL_VHDX_FILE_SIZE: u64 = 112 << 20; const DYNAMIC_VHDX_NAME: &str = "dynamic.vhdx"; - let mut workload_path = dirs::home_dir().unwrap(); - workload_path.push("workloads"); + let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(focal)); + let kernel_path = direct_kernel_boot_path(); - let mut vhdx_file_path = workload_path; - vhdx_file_path.push(DYNAMIC_VHDX_NAME); - let vhdx_path = vhdx_file_path.to_str().unwrap(); + let vhdx_pathbuf = guest.tmp_dir.as_path().join(DYNAMIC_VHDX_NAME); + let vhdx_path = vhdx_pathbuf.to_str().unwrap(); // Generate a 100 MiB dynamic VHDX file std::process::Command::new("qemu-img") @@ -3401,15 +3558,11 @@ mod common_parallel { .arg(vhdx_path) .arg(VIRTUAL_DISK_SIZE.to_string()) .output() - .expect("Expect generating dynamic VHDx image from RAW image"); + .expect("Expect generating dynamic VHDX image"); // Check if the size matches with empty VHDx file size assert_eq!(vhdx_image_size(vhdx_path), EMPTY_VHDX_FILE_SIZE); - let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); - let guest = Guest::new(Box::new(focal)); - let kernel_path = direct_kernel_boot_path(); - let mut cloud_child = GuestCommand::new(&guest) .args(["--cpus", "boot=1"]) .args(["--memory", "size=512M"]) @@ -3779,13 +3932,11 @@ mod common_parallel { } #[test] - #[cfg(not(feature = "mshv"))] fn test_virtio_fs_multi_segment_hotplug() { _test_virtio_fs(&prepare_virtiofsd, true, Some(15)) } #[test] - #[cfg(not(feature = "mshv"))] fn test_virtio_fs_multi_segment() { _test_virtio_fs(&prepare_virtiofsd, false, Some(15)) } @@ -4298,9 +4449,11 @@ mod common_parallel { let r = std::panic::catch_unwind(|| { guest.wait_vm_boot(None).unwrap(); - assert!(guest - .does_device_vendor_pair_match("0x1043", "0x1af4") - .unwrap_or_default()); + assert!( + guest + .does_device_vendor_pair_match("0x1043", "0x1af4") + .unwrap_or_default() + ); guest.ssh_command(&cmd).unwrap(); }); @@ -4808,6 +4961,7 @@ mod common_parallel { let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(focal)); let api_socket = temp_api_path(&guest.tmp_dir); + let console_str = "console=ttyS0"; let kernel_path = direct_kernel_boot_path(); @@ -4815,7 +4969,14 @@ mod common_parallel { .args(["--cpus", "boot=2,max=4"]) .args(["--memory", "size=512M"]) .args(["--kernel", kernel_path.to_str().unwrap()]) - .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--cmdline", + DIRECT_KERNEL_BOOT_CMDLINE + .replace("console=hvc0 ", console_str) + .as_str(), + ]) + .args(["--serial", "tty"]) + .args(["--console", "off"]) .default_disks() .default_net() .args(["--api-socket", &api_socket]) @@ -4972,7 +5133,6 @@ mod common_parallel { } #[test] - #[cfg(not(feature = "mshv"))] fn test_virtio_mem() { let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(focal)); @@ -5046,7 +5206,6 @@ mod common_parallel { #[test] #[cfg(target_arch = "x86_64")] - #[cfg(not(feature = "mshv"))] // Test both vCPU and memory resizing together fn test_resize() { let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); 
@@ -5270,8 +5429,10 @@ mod common_parallel { ), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); thread::sleep(std::time::Duration::new(10, 0)); @@ -5317,8 +5478,10 @@ mod common_parallel { ), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); thread::sleep(std::time::Duration::new(10, 0)); @@ -5674,7 +5837,6 @@ mod common_parallel { } #[test] - #[cfg(not(feature = "mshv"))] fn test_virtio_balloon_free_page_reporting() { let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(focal)); @@ -5817,8 +5979,10 @@ mod common_parallel { "{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" ))); } else { - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:06.0\"}") + ); } // Check that /dev/pmem0 exists and the block size is 128M @@ -5948,8 +6112,10 @@ mod common_parallel { "{{\"id\":\"test0\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" ))); } else { - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:05.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test0\",\"bdf\":\"0000:00:05.0\"}") + ); } thread::sleep(std::time::Duration::new(5, 0)); @@ -5992,8 +6158,10 @@ mod common_parallel { "{{\"id\":\"test1\",\"bdf\":\"{pci_segment:04x}:00:01.0\"}}" ))); } else { - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"test1\",\"bdf\":\"0000:00:05.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"test1\",\"bdf\":\"0000:00:05.0\"}") + ); } thread::sleep(std::time::Duration::new(5, 0)); @@ -6430,15 +6598,19 @@ mod common_parallel { let phy_net = "eth0"; // Create a macvtap interface for the guest VM to use - assert!(exec_host_command_status(&format!( - "sudo ip link add link {phy_net} name {guest_macvtap_name} type macvtap mod bridge" - )) - .success()); - assert!(exec_host_command_status(&format!( - "sudo ip link set {} address {} up", - guest_macvtap_name, guest.network.guest_mac - )) - .success()); + assert!( + exec_host_command_status(&format!( + "sudo ip link add link {phy_net} name {guest_macvtap_name} type macvtap mod bridge" + )) + .success() + ); + assert!( + exec_host_command_status(&format!( + "sudo ip link set {} address {} up", + guest_macvtap_name, guest.network.guest_mac + )) + .success() + ); assert!( exec_host_command_status(&format!("sudo ip link show {guest_macvtap_name}")).success() ); @@ -6457,16 +6629,20 @@ mod common_parallel { // Create a macvtap on the same physical net interface for // the host machine to use - assert!(exec_host_command_status(&format!( - "sudo ip link add link {phy_net} name {host_macvtap_name} type macvtap mod bridge" - )) - .success()); + assert!( + exec_host_command_status(&format!( + "sudo ip link add link {phy_net} name {host_macvtap_name} type macvtap mod bridge" + )) + .success() + ); // Use default mask "255.255.255.0" - assert!(exec_host_command_status(&format!( - "sudo ip address add {}/24 dev {}", - guest.network.host_ip, host_macvtap_name - )) - 
.success()); + assert!( + exec_host_command_status(&format!( + "sudo ip address add {}/24 dev {}", + guest.network.host_ip, host_macvtap_name + )) + .success() + ); assert!( exec_host_command_status(&format!("sudo ip link set dev {host_macvtap_name} up")) .success() @@ -6502,11 +6678,15 @@ mod common_parallel { remote_command_w_output(&api_socket, "add-net", Some(&net_params)); assert!(cmd_success); #[cfg(target_arch = "x86_64")] - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"_net2\",\"bdf\":\"0000:00:05.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"_net2\",\"bdf\":\"0000:00:05.0\"}") + ); #[cfg(target_arch = "aarch64")] - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"_net0\",\"bdf\":\"0000:00:05.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"_net0\",\"bdf\":\"0000:00:05.0\"}") + ); } // The functional connectivity provided by the virtio-net device @@ -6682,21 +6862,27 @@ mod common_parallel { fn setup_spdk_nvme(nvme_dir: &std::path::Path) -> Child { cleanup_spdk_nvme(); - assert!(exec_host_command_status(&format!( - "mkdir -p {}", - nvme_dir.join("nvme-vfio-user").to_str().unwrap() - )) - .success()); - assert!(exec_host_command_status(&format!( - "truncate {} -s 128M", - nvme_dir.join("test-disk.raw").to_str().unwrap() - )) - .success()); - assert!(exec_host_command_status(&format!( - "mkfs.ext4 {}", - nvme_dir.join("test-disk.raw").to_str().unwrap() - )) - .success()); + assert!( + exec_host_command_status(&format!( + "mkdir -p {}", + nvme_dir.join("nvme-vfio-user").to_str().unwrap() + )) + .success() + ); + assert!( + exec_host_command_status(&format!( + "truncate {} -s 128M", + nvme_dir.join("test-disk.raw").to_str().unwrap() + )) + .success() + ); + assert!( + exec_host_command_status(&format!( + "mkfs.ext4 {}", + nvme_dir.join("test-disk.raw").to_str().unwrap() + )) + .success() + ); // Start the SPDK nvmf_tgt daemon to present NVMe device as a VFIO user device let child = Command::new("/usr/local/bin/spdk-nvme/nvmf_tgt") @@ -6710,11 +6896,13 @@ mod common_parallel { 3, std::time::Duration::new(5, 0), )); - assert!(exec_host_command_status(&format!( - "/usr/local/bin/spdk-nvme/rpc.py bdev_aio_create {} test 512", - nvme_dir.join("test-disk.raw").to_str().unwrap() - )) - .success()); + assert!( + exec_host_command_status(&format!( + "/usr/local/bin/spdk-nvme/rpc.py bdev_aio_create {} test 512", + nvme_dir.join("test-disk.raw").to_str().unwrap() + )) + .success() + ); assert!(exec_host_command_status( "/usr/local/bin/spdk-nvme/rpc.py nvmf_create_subsystem nqn.2019-07.io.spdk:cnode -a -s test" ) @@ -6775,8 +6963,10 @@ mod common_parallel { )), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"vfio_user0\",\"bdf\":\"0000:00:05.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"vfio_user0\",\"bdf\":\"0000:00:05.0\"}") + ); thread::sleep(std::time::Duration::new(10, 0)); @@ -6881,15 +7071,19 @@ mod common_parallel { Some("id=myvdpa0,path=/dev/vhost-vdpa-1,num_queues=1,pci_segment=1,iommu=on"), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"myvdpa0\",\"bdf\":\"0001:00:01.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"myvdpa0\",\"bdf\":\"0001:00:01.0\"}") + ); thread::sleep(std::time::Duration::new(10, 0)); // Check IOMMU setup - assert!(guest - .does_device_vendor_pair_match("0x1057", "0x1af4") - .unwrap_or_default()); 
+ assert!( + guest + .does_device_vendor_pair_match("0x1057", "0x1af4") + .unwrap_or_default() + ); assert_eq!( guest .ssh_command("ls /sys/kernel/iommu_groups/0/devices") @@ -7279,98 +7473,515 @@ mod dbus_api { } } -mod common_sequential { +mod ivshmem { use std::fs::remove_dir_all; + use std::process::Command; - use crate::*; - - #[test] - #[cfg(not(feature = "mshv"))] - fn test_memory_mergeable_on() { - test_memory_mergeable(true) - } - - fn snapshot_and_check_events(api_socket: &str, snapshot_dir: &str, event_path: &str) { - // Pause the VM - assert!(remote_command(api_socket, "pause", None)); - let latest_events: [&MetaEvent; 2] = [ - &MetaEvent { - event: "pausing".to_string(), - device_id: None, - }, - &MetaEvent { - event: "paused".to_string(), - device_id: None, - }, - ]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, event_path)); - - // Take a snapshot from the VM - assert!(remote_command( - api_socket, - "snapshot", - Some(format!("file://{snapshot_dir}").as_str()), - )); - - // Wait to make sure the snapshot is completed - thread::sleep(std::time::Duration::new(10, 0)); - - let latest_events = [ - &MetaEvent { - event: "snapshotting".to_string(), - device_id: None, - }, - &MetaEvent { - event: "snapshotted".to_string(), - device_id: None, - }, - ]; - // See: #5938 - thread::sleep(std::time::Duration::new(1, 0)); - assert!(check_latest_events_exact(&latest_events, event_path)); - } - - // One thing to note about this test. The virtio-net device is heavily used - // through each ssh command. There's no need to perform a dedicated test to - // verify the migration went well for virtio-net. - #[test] - #[cfg(not(feature = "mshv"))] - fn test_snapshot_restore_hotplug_virtiomem() { - _test_snapshot_restore(true); - } + use test_infra::{Guest, GuestCommand, UbuntuDiskConfig, handle_child_output, kill_child}; - #[test] - fn test_snapshot_restore_basic() { - _test_snapshot_restore(false); - } + use crate::*; - fn _test_snapshot_restore(use_hotplug: bool) { + fn _test_live_migration_ivshmem(local: bool) { let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); let guest = Guest::new(Box::new(focal)); let kernel_path = direct_kernel_boot_path(); - - let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); - + let console_text = String::from("On a branch floating down river a cricket, singing."); let net_id = "net123"; let net_params = format!( "id={},tap=,mac={},ip={},mask=255.255.255.0", net_id, guest.network.guest_mac, guest.network.host_ip ); - let mut mem_params = "size=2G"; - - if use_hotplug { - mem_params = "size=2G,hotplug_method=virtio-mem,hotplug_size=32G" - } - let cloudinit_params = format!( - "path={},iommu=on", - guest.disk_config.disk(DiskType::CloudInit).unwrap() - ); + let memory_param: &[&str] = if local { + &["--memory", "size=4G,shared=on"] + } else { + &["--memory", "size=4G"] + }; - let socket = temp_vsock_path(&guest.tmp_dir); - let event_path = temp_event_monitor_path(&guest.tmp_dir); + let boot_vcpus = 2; + let max_vcpus = 4; + + let pmem_temp_file = TempFile::new().unwrap(); + pmem_temp_file.as_file().set_len(128 << 20).unwrap(); + std::process::Command::new("mkfs.ext4") + .arg(pmem_temp_file.as_path()) + .output() + .expect("Expect creating disk image to succeed"); + let pmem_path = String::from("/dev/pmem0"); + + let ivshmem_file_path = String::from( + guest + .tmp_dir + .as_path() + .join("ivshmem.data") + .to_str() + .unwrap(), + ); + let file_size = "1M"; + + // 
Create a file to be used as the shared memory + Command::new("dd") + .args([ + "if=/dev/zero", + format!("of={ivshmem_file_path}").as_str(), + format!("bs={file_size}").as_str(), + "count=1", + ]) + .status() + .unwrap(); + + // Start the source VM + let src_vm_path = clh_command("cloud-hypervisor"); + let src_api_socket = temp_api_path(&guest.tmp_dir); + let mut src_vm_cmd = GuestCommand::new_with_binary_path(&guest, &src_vm_path); + src_vm_cmd + .args([ + "--cpus", + format!("boot={boot_vcpus},max={max_vcpus}").as_str(), + ]) + .args(memory_param) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .args(["--net", net_params.as_str()]) + .args(["--api-socket", &src_api_socket]) + .args([ + "--pmem", + format!("file={}", pmem_temp_file.as_path().to_str().unwrap(),).as_str(), + ]) + .args([ + "--ivshmem", + format!("path={ivshmem_file_path},size={file_size}").as_str(), + ]); + let mut src_child = src_vm_cmd.capture_output().spawn().unwrap(); + + // Start the destination VM + let mut dest_api_socket = temp_api_path(&guest.tmp_dir); + dest_api_socket.push_str(".dest"); + let mut dest_child = GuestCommand::new(&guest) + .args(["--api-socket", &dest_api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot(None).unwrap(); + + // Make sure the source VM is functional + // Check the number of vCPUs + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + // Check the guest RAM + assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + // Check the guest virtio-devices, e.g. block, rng, console, and net + guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); + // x86_64: Following what's done in the `test_snapshot_restore`, we need + // to make sure that removing and adding back the virtio-net device does + // not break the live-migration support for virtio-pci. + #[cfg(target_arch = "x86_64")] + { + assert!(remote_command( + &src_api_socket, + "remove-device", + Some(net_id), + )); + thread::sleep(Duration::new(10, 0)); + + // Plug the virtio-net device again + assert!(remote_command( + &src_api_socket, + "add-net", + Some(net_params.as_str()), + )); + thread::sleep(Duration::new(10, 0)); + } + + // Check ivshmem device in src guest. + _test_ivshmem(&guest, ivshmem_file_path.clone(), file_size); + // Allow some normal time to elapse to check we don't get spurious reboots + thread::sleep(std::time::Duration::new(40, 0)); + + // Start the live-migration + let migration_socket = String::from( + guest + .tmp_dir + .as_path() + .join("live-migration.sock") + .to_str() + .unwrap(), + ); + + assert!( + live_migration::start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + 300, + 60 + ), + "Unsuccessful command: 'send-migration' or 'receive-migration'." 
+ ); + }); + + // Check and report any errors occurred during the live-migration + if r.is_err() { + live_migration::print_and_panic( + src_child, + dest_child, + None, + "Error occurred during live-migration", + ); + } + + // Check the source vm has been terminated successful (give it '3s' to settle) + thread::sleep(std::time::Duration::new(3, 0)); + if !src_child.try_wait().unwrap().is_some_and(|s| s.success()) { + live_migration::print_and_panic( + src_child, + dest_child, + None, + "source VM was not terminated successfully.", + ); + }; + + // Post live-migration check to make sure the destination VM is functional + let r = std::panic::catch_unwind(|| { + // Perform same checks to validate VM has been properly migrated + assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); + assert!(guest.get_total_memory().unwrap_or_default() > 3_840_000); + + guest.check_devices_common(None, Some(&console_text), Some(&pmem_path)); + + // Check ivshmem device + _test_ivshmem(&guest, ivshmem_file_path, file_size); + }); + + // Clean-up the destination VM and make sure it terminated correctly + let _ = dest_child.kill(); + let dest_output = dest_child.wait_with_output().unwrap(); + handle_child_output(r, &dest_output); + + // Check the destination VM has the expected 'console_text' from its output + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&dest_output.stdout).contains(&console_text)); + }); + handle_child_output(r, &dest_output); + } + + #[test] + fn test_ivshmem() { + let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(focal)); + let api_socket = temp_api_path(&guest.tmp_dir); + + let kernel_path = direct_kernel_boot_path(); + + let ivshmem_file_path = String::from( + guest + .tmp_dir + .as_path() + .join("ivshmem.data") + .to_str() + .unwrap(), + ); + let file_size = "1M"; + + // Create a file to be used as the shared memory + Command::new("dd") + .args([ + "if=/dev/zero", + format!("of={ivshmem_file_path}").as_str(), + format!("bs={file_size}").as_str(), + "count=1", + ]) + .status() + .unwrap(); + + let mut child = GuestCommand::new(&guest) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .default_disks() + .default_net() + .args([ + "--ivshmem", + format!("path={ivshmem_file_path},size={file_size}").as_str(), + ]) + .args(["--api-socket", &api_socket]) + .capture_output() + .spawn() + .unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot(None).unwrap(); + _test_ivshmem(&guest, ivshmem_file_path, file_size); + }); + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } + + #[test] + fn test_snapshot_restore_ivshmem() { + let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(focal)); + let kernel_path = direct_kernel_boot_path(); + + let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); + + let ivshmem_file_path = String::from( + guest + .tmp_dir + .as_path() + .join("ivshmem.data") + .to_str() + .unwrap(), + ); + let file_size = "1M"; + + // Create a file to be used as the shared memory + Command::new("dd") + .args([ + "if=/dev/zero", + format!("of={ivshmem_file_path}").as_str(), + format!("bs={file_size}").as_str(), + "count=1", + ]) + .status() + .unwrap(); + + let socket = temp_vsock_path(&guest.tmp_dir); + let event_path = 
temp_event_monitor_path(&guest.tmp_dir); + + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_source]) + .args(["--event-monitor", format!("path={event_path}").as_str()]) + .args(["--cpus", "boot=2"]) + .args(["--memory", "size=1G"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .default_disks() + .default_net() + .args(["--vsock", format!("cid=3,socket={socket}").as_str()]) + .args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE]) + .args([ + "--ivshmem", + format!("path={ivshmem_file_path},size={file_size}").as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + let console_text = String::from("On a branch floating down river a cricket, singing."); + // Create the snapshot directory + let snapshot_dir = temp_snapshot_dir_path(&guest.tmp_dir); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot(None).unwrap(); + + // Check the number of vCPUs + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + + common_sequential::snapshot_and_check_events( + &api_socket_source, + &snapshot_dir, + &event_path, + ); + }); + + // Shutdown the source VM and check console output + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + // Remove the vsock socket file. + Command::new("rm") + .arg("-f") + .arg(socket.as_str()) + .output() + .unwrap(); + + let api_socket_restored = format!("{}.2", temp_api_path(&guest.tmp_dir)); + let event_path_restored = format!("{}.2", temp_event_monitor_path(&guest.tmp_dir)); + + // Restore the VM from the snapshot + let mut child = GuestCommand::new(&guest) + .args(["--api-socket", &api_socket_restored]) + .args([ + "--event-monitor", + format!("path={event_path_restored}").as_str(), + ]) + .args([ + "--restore", + format!("source_url=file://{snapshot_dir}").as_str(), + ]) + .capture_output() + .spawn() + .unwrap(); + + // Wait for the VM to be restored + thread::sleep(std::time::Duration::new(20, 0)); + + let latest_events = [&MetaEvent { + event: "restored".to_string(), + device_id: None, + }]; + assert!(check_latest_events_exact( + &latest_events, + &event_path_restored + )); + + // Remove the snapshot dir + let _ = remove_dir_all(snapshot_dir.as_str()); + + let r = std::panic::catch_unwind(|| { + // Resume the VM + assert!(remote_command(&api_socket_restored, "resume", None)); + // There is no way that we can ensure the 'write()' to the + // event file is completed when the 'resume' request is + // returned successfully, because the 'write()' was done + // asynchronously from a different thread of Cloud + // Hypervisor (e.g. the event-monitor thread). 
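+ // Hence the one-second sleep below: it is a best-effort wait that gives the event-monitor thread time to flush the 'resuming'/'resumed' events before they are compared against the event file.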
+ thread::sleep(std::time::Duration::new(1, 0)); + let latest_events = [ + &MetaEvent { + event: "resuming".to_string(), + device_id: None, + }, + &MetaEvent { + event: "resumed".to_string(), + device_id: None, + }, + ]; + assert!(check_latest_events_exact( + &latest_events, + &event_path_restored + )); + + // Check the number of vCPUs + assert_eq!(guest.get_cpu_count().unwrap_or_default(), 2); + guest.check_devices_common(Some(&socket), Some(&console_text), None); + _test_ivshmem(&guest, ivshmem_file_path, file_size); + }); + // Shutdown the target VM and check console output + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + handle_child_output(r, &output); + + let r = std::panic::catch_unwind(|| { + assert!(String::from_utf8_lossy(&output.stdout).contains(&console_text)); + }); + + handle_child_output(r, &output); + } + + #[test] + fn test_live_migration_ivshmem() { + _test_live_migration_ivshmem(false) + } + + #[test] + fn test_live_migration_ivshmem_local() { + _test_live_migration_ivshmem(true) + } +} + +mod common_sequential { + use std::fs::remove_dir_all; + + use crate::*; + + #[test] + #[cfg(not(feature = "mshv"))] + fn test_memory_mergeable_on() { + test_memory_mergeable(true) + } + + pub(crate) fn snapshot_and_check_events( + api_socket: &str, + snapshot_dir: &str, + event_path: &str, + ) { + // Pause the VM + assert!(remote_command(api_socket, "pause", None)); + let latest_events: [&MetaEvent; 2] = [ + &MetaEvent { + event: "pausing".to_string(), + device_id: None, + }, + &MetaEvent { + event: "paused".to_string(), + device_id: None, + }, + ]; + // See: #5938 + thread::sleep(std::time::Duration::new(1, 0)); + assert!(check_latest_events_exact(&latest_events, event_path)); + + // Take a snapshot from the VM + assert!(remote_command( + api_socket, + "snapshot", + Some(format!("file://{snapshot_dir}").as_str()), + )); + + // Wait to make sure the snapshot is completed + thread::sleep(std::time::Duration::new(10, 0)); + + let latest_events = [ + &MetaEvent { + event: "snapshotting".to_string(), + device_id: None, + }, + &MetaEvent { + event: "snapshotted".to_string(), + device_id: None, + }, + ]; + // See: #5938 + thread::sleep(std::time::Duration::new(1, 0)); + assert!(check_latest_events_exact(&latest_events, event_path)); + } + + // One thing to note about this test. The virtio-net device is heavily used + // through each ssh command. There's no need to perform a dedicated test to + // verify the migration went well for virtio-net. 
+ #[test] + fn test_snapshot_restore_hotplug_virtiomem() { + _test_snapshot_restore(true); + } + + #[test] + fn test_snapshot_restore_basic() { + _test_snapshot_restore(false); + } + + fn _test_snapshot_restore(use_hotplug: bool) { + let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(focal)); + let kernel_path = direct_kernel_boot_path(); + + let api_socket_source = format!("{}.1", temp_api_path(&guest.tmp_dir)); + + let net_id = "net123"; + let net_params = format!( + "id={},tap=,mac={},ip={},mask=255.255.255.0", + net_id, guest.network.guest_mac, guest.network.host_ip + ); + let mut mem_params = "size=2G"; + + if use_hotplug { + mem_params = "size=2G,hotplug_method=virtio-mem,hotplug_size=32G" + } + + let cloudinit_params = format!( + "path={},iommu=on", + guest.disk_config.disk(DiskType::CloudInit).unwrap() + ); + + let socket = temp_vsock_path(&guest.tmp_dir); + let event_path = temp_event_monitor_path(&guest.tmp_dir); let mut child = GuestCommand::new(&guest) .args(["--api-socket", &api_socket_source]) @@ -7837,7 +8448,7 @@ mod common_sequential { let device_params = { let mut data = vec![]; if pvpanic { - data.push("--pvpanic"); + data.push(String::from("--pvpanic")); } data }; @@ -8853,8 +9464,10 @@ mod windows { Some(format!("path={disk},readonly=off").as_str()), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains(format!("\"id\":\"{disk_id}\"").as_str())); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains(format!("\"id\":\"{disk_id}\"").as_str()) + ); thread::sleep(std::time::Duration::new(5, 0)); // Online disk devices windows_guest.disks_set_rw(); @@ -8979,50 +9592,6 @@ mod windows { } } -#[cfg(target_arch = "x86_64")] -mod sgx { - use crate::*; - - #[test] - fn test_sgx() { - let jammy_image = JAMMY_IMAGE_NAME.to_string(); - let jammy = UbuntuDiskConfig::new(jammy_image); - let guest = Guest::new(Box::new(jammy)); - - let mut child = GuestCommand::new(&guest) - .args(["--cpus", "boot=1"]) - .args(["--memory", "size=512M"]) - .args(["--kernel", fw_path(FwType::RustHypervisorFirmware).as_str()]) - .default_disks() - .default_net() - .args(["--sgx-epc", "id=epc0,size=64M"]) - .capture_output() - .spawn() - .unwrap(); - - let r = std::panic::catch_unwind(|| { - guest.wait_vm_boot(None).unwrap(); - - // Check if SGX is correctly detected in the guest. - guest.check_sgx_support().unwrap(); - - // Validate the SGX EPC section is 64MiB. 
- assert_eq!( - guest - .ssh_command("cpuid -l 0x12 -s 2 | grep 'section size' | cut -d '=' -f 2") - .unwrap() - .trim(), - "0x0000000004000000" - ); - }); - - let _ = child.kill(); - let output = child.wait_with_output().unwrap(); - - handle_child_output(r, &output); - } -} - #[cfg(target_arch = "x86_64")] mod vfio { use crate::*; @@ -9108,8 +9677,10 @@ mod vfio { Some(format!("id=vfio0,path={NVIDIA_VFIO_DEVICE}").as_str()), ); assert!(cmd_success); - assert!(String::from_utf8_lossy(&cmd_output) - .contains("{\"id\":\"vfio0\",\"bdf\":\"0000:00:06.0\"}")); + assert!( + String::from_utf8_lossy(&cmd_output) + .contains("{\"id\":\"vfio0\",\"bdf\":\"0000:00:06.0\"}") + ); thread::sleep(std::time::Duration::new(10, 0)); @@ -9187,10 +9758,12 @@ mod vfio { let r = std::panic::catch_unwind(|| { guest.wait_vm_boot(None).unwrap(); - assert!(guest - .ssh_command("sudo dmesg") - .unwrap() - .contains("input address: 42 bits")); + assert!( + guest + .ssh_command("sudo dmesg") + .unwrap() + .contains("input address: 42 bits") + ); }); let _ = child.kill(); @@ -9203,11 +9776,13 @@ mod vfio { mod live_migration { use crate::*; - fn start_live_migration( + pub fn start_live_migration( migration_socket: &str, src_api_socket: &str, dest_api_socket: &str, local: bool, + downtime: u64, + timeout: u64, ) -> bool { // Start to receive migration from the destination VM let mut receive_migration = Command::new(clh_command("ch-remote")) @@ -9228,6 +9803,10 @@ mod live_migration { format!("--api-socket={}", &src_api_socket), "send-migration".to_string(), format! {"unix:{migration_socket}"}, + "--downtime".to_string(), + format!("{downtime}"), + "--migration-timeout".to_string(), + format!("{timeout}"), ] .to_vec(); @@ -9289,7 +9868,12 @@ mod live_migration { send_success && receive_success } - fn print_and_panic(src_vm: Child, dest_vm: Child, ovs_vm: Option, message: &str) -> ! { + pub fn print_and_panic( + src_vm: Child, + dest_vm: Child, + ovs_vm: Option, + message: &str, + ) -> ! { let mut src_vm = src_vm; let mut dest_vm = dest_vm; @@ -9306,13 +9890,13 @@ mod live_migration { let _ = dest_vm.kill(); let dest_output = dest_vm.wait_with_output().unwrap(); eprintln!( - "\n\n==== Start 'destination_vm' stdout ====\n\n{}\n\n==== End 'destination_vm' stdout ====", - String::from_utf8_lossy(&dest_output.stdout) - ); + "\n\n==== Start 'destination_vm' stdout ====\n\n{}\n\n==== End 'destination_vm' stdout ====", + String::from_utf8_lossy(&dest_output.stdout) + ); eprintln!( - "\n\n==== Start 'destination_vm' stderr ====\n\n{}\n\n==== End 'destination_vm' stderr ====", - String::from_utf8_lossy(&dest_output.stderr) - ); + "\n\n==== Start 'destination_vm' stderr ====\n\n{}\n\n==== End 'destination_vm' stderr ====", + String::from_utf8_lossy(&dest_output.stderr) + ); if let Some(ovs_vm) = ovs_vm { let mut ovs_vm = ovs_vm; @@ -9447,8 +10031,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100_000; // 100s + let migration_timeout = 1000; // 1000s + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." 
); }); @@ -9621,8 +10215,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -9839,8 +10443,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10055,8 +10669,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10165,8 +10789,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, local), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + local, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10312,8 +10946,18 @@ mod live_migration { .unwrap(), ); + let downtime = 100000; + let migration_timeout = 1000; + assert!( - start_live_migration(&migration_socket, &src_api_socket, &dest_api_socket, true), + start_live_migration( + &migration_socket, + &src_api_socket, + &dest_api_socket, + true, + downtime, + migration_timeout + ), "Unsuccessful command: 'send-migration' or 'receive-migration'." 
); }); @@ -10339,7 +10983,7 @@ mod live_migration { ); }; - // Post live-migration check to make sure the destination VM is funcational + // Post live-migration check to make sure the destination VM is functional let r = std::panic::catch_unwind(|| { // Perform same checks to validate VM has been properly migrated assert_eq!(guest.get_cpu_count().unwrap_or_default(), boot_vcpus); @@ -10368,7 +11012,12 @@ mod live_migration { .port() } - fn start_live_migration_tcp(src_api_socket: &str, dest_api_socket: &str) -> bool { + fn start_live_migration_tcp( + src_api_socket: &str, + dest_api_socket: &str, + downtime: u64, + timeout: u64, + ) -> bool { // Get an available TCP port let migration_port = get_available_port(); let host_ip = "127.0.0.1"; @@ -10395,6 +11044,10 @@ mod live_migration { &format!("--api-socket={src_api_socket}"), "send-migration", &format!("tcp:{host_ip}:{migration_port}"), + "--downtime", + &format!("{downtime}"), + "--migration-timeout", + &format!("{timeout}"), ]) .stdin(Stdio::null()) .stderr(Stdio::piped()) @@ -10465,6 +11118,8 @@ mod live_migration { .output() .expect("Expect creating disk image to succeed"); let pmem_path = String::from("/dev/pmem0"); + let downtime = 100000; + let timeout = 1000; // Start the source VM let src_vm_path = clh_command("cloud-hypervisor"); @@ -10527,7 +11182,7 @@ mod live_migration { } // Start TCP live migration assert!( - start_live_migration_tcp(&src_api_socket, &dest_api_socket), + start_live_migration_tcp(&src_api_socket, &dest_api_socket, downtime, timeout), "Unsuccessful command: 'send-migration' or 'receive-migration'." ); }); @@ -10652,25 +11307,21 @@ mod live_migration { } #[test] - #[cfg(not(feature = "mshv"))] fn test_live_migration_numa() { _test_live_migration_numa(false, false) } #[test] - #[cfg(not(feature = "mshv"))] fn test_live_migration_numa_local() { _test_live_migration_numa(false, true) } #[test] - #[cfg(not(feature = "mshv"))] fn test_live_upgrade_numa() { _test_live_migration_numa(true, false) } #[test] - #[cfg(not(feature = "mshv"))] fn test_live_upgrade_numa_local() { _test_live_migration_numa(true, true) } @@ -10874,11 +11525,13 @@ mod rate_limiter { String::from(test_img_dir.as_path().join("blk.img").to_str().unwrap()); // Create the test block image - assert!(exec_host_command_output(&format!( - "dd if=/dev/zero of={blk_rate_limiter_test_img} bs=1M count=1024" - )) - .status - .success()); + assert!( + exec_host_command_output(&format!( + "dd if=/dev/zero of={blk_rate_limiter_test_img} bs=1M count=1024" + )) + .status + .success() + ); let test_blk_params = if bandwidth { format!( @@ -10983,11 +11636,13 @@ mod rate_limiter { .unwrap(), ); - assert!(exec_host_command_output(&format!( - "dd if=/dev/zero of={test_img_path} bs=1M count=1024" - )) - .status - .success()); + assert!( + exec_host_command_output(&format!( + "dd if=/dev/zero of={test_img_path} bs=1M count=1024" + )) + .status + .success() + ); disk_args.push(format!( "path={test_img_path},num_queues={num_queues},rate_limit_group=group0" @@ -11073,3 +11728,55 @@ mod rate_limiter { _test_rate_limiter_group_block(false, 2, 2); } } + +#[cfg(not(target_arch = "riscv64"))] +mod fw_cfg { + use crate::*; + + #[test] + fn test_fw_cfg() { + let jammy = UbuntuDiskConfig::new(JAMMY_IMAGE_NAME.to_string()); + let guest = Guest::new(Box::new(jammy)); + let mut cmd = GuestCommand::new(&guest); + + let kernel_path = direct_kernel_boot_path(); + let cmd_line = DIRECT_KERNEL_BOOT_CMDLINE; + + let test_file = guest.tmp_dir.as_path().join("test-file"); + 
std::fs::write(&test_file, "test-file-content").unwrap(); + + cmd.args(["--cpus", "boot=4"]) + .args(["--memory", "size=512M"]) + .args(["--kernel", kernel_path.to_str().unwrap()]) + .args(["--cmdline", cmd_line]) + .default_disks() + .default_net() + .args([ + "--fw-cfg-config", + &format!( + "initramfs=off,items=[name=opt/org.test/test-file,file={}]", + test_file.to_str().unwrap() + ), + ]) + .capture_output(); + + let mut child = cmd.spawn().unwrap(); + + let r = std::panic::catch_unwind(|| { + guest.wait_vm_boot(None).unwrap(); + // Wait a while for guest + thread::sleep(std::time::Duration::new(3, 0)); + let result = guest + .ssh_command( + "sudo cat /sys/firmware/qemu_fw_cfg/by_name/opt/org.test/test-file/raw", + ) + .unwrap(); + assert_eq!(result, "test-file-content"); + }); + + kill_child(&mut child); + let output = child.wait_with_output().unwrap(); + + handle_child_output(r, &output); + } +} diff --git a/tpm/Cargo.toml b/tpm/Cargo.toml index 076be121ef..82dc8f79be 100644 --- a/tpm/Cargo.toml +++ b/tpm/Cargo.toml @@ -6,10 +6,12 @@ name = "tpm" version = "0.1.0" [dependencies] -anyhow = "1.0.81" -byteorder = "1.5.0" -libc = "0.2.153" -log = "0.4.21" +anyhow = { workspace = true } +libc = { workspace = true } +log = { workspace = true } net_gen = { path = "../net_gen" } thiserror = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/tracer/Cargo.toml b/tracer/Cargo.toml index 5d4cb678c9..1ac9f4e393 100644 --- a/tracer/Cargo.toml +++ b/tracer/Cargo.toml @@ -1,14 +1,17 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "tracer" version = "0.1.0" [dependencies] -libc = "0.2.167" -log = "0.4.22" -serde = { version = "1.0.208", features = ["derive", "rc"] } +libc = { workspace = true } +log = { workspace = true } +serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } [features] tracing = [] + +[lints] +workspace = true diff --git a/vhost_user_block/Cargo.toml b/vhost_user_block/Cargo.toml index 4c63cfa4d0..e674c96eac 100644 --- a/vhost_user_block/Cargo.toml +++ b/vhost_user_block/Cargo.toml @@ -1,17 +1,16 @@ [package] authors = ["The Cloud Hypervisor Authors"] build = "../build.rs" -edition = "2021" +edition.workspace = true name = "vhost_user_block" version = "0.1.0" [dependencies] block = { path = "../block" } -clap = { version = "4.5.13", features = ["cargo", "wrap_help"] } -env_logger = "0.11.3" -epoll = "4.3.3" -libc = "0.2.167" -log = "0.4.22" +clap = { workspace = true, features = ["cargo", "wrap_help"] } +env_logger = { workspace = true } +libc = { workspace = true } +log = { workspace = true } option_parser = { path = "../option_parser" } thiserror = { workspace = true } vhost = { workspace = true, features = ["vhost-user-backend"] } @@ -20,3 +19,6 @@ virtio-bindings = { workspace = true } virtio-queue = { workspace = true } vm-memory = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/vhost_user_block/src/lib.rs b/vhost_user_block/src/lib.rs index 040eb18353..3977a25ab5 100644 --- a/vhost_user_block/src/lib.rs +++ b/vhost_user_block/src/lib.rs @@ -19,13 +19,13 @@ use std::time::Instant; use std::{convert, io, process, result}; use block::qcow::{self, ImageType, QcowFile}; -use block::{build_serial, Request, VirtioBlockConfig}; +use block::{Request, VirtioBlockConfig, build_serial}; use libc::EFD_NONBLOCK; use log::*; use option_parser::{OptionParser, OptionParserError, Toggle}; use 
thiserror::Error; -use vhost::vhost_user::message::*; use vhost::vhost_user::Listener; +use vhost::vhost_user::message::*; use vhost_user_backend::bitmap::BitmapMmapRegion; use vhost_user_backend::{VhostUserBackendMut, VhostUserDaemon, VringRwLock, VringState, VringT}; use virtio_bindings::virtio_blk::*; @@ -334,8 +334,8 @@ impl VhostUserBackendMut for VhostUserBlkBackend { } fn set_event_idx(&mut self, enabled: bool) { - for thread in self.threads.iter() { - thread.lock().unwrap().event_idx = enabled; + for thread in self.threads.iter_mut() { + thread.get_mut().unwrap().event_idx = enabled; } } @@ -352,7 +352,7 @@ impl VhostUserBackendMut for VhostUserBlkBackend { debug!("event received: {:?}", device_event); - let mut thread = self.threads[thread_id].lock().unwrap(); + let thread = self.threads[thread_id].get_mut().unwrap(); match device_event { 0 => { let mut vring = vrings[0].get_mut(); diff --git a/vhost_user_net/Cargo.toml b/vhost_user_net/Cargo.toml index d36763b790..849ad5426e 100644 --- a/vhost_user_net/Cargo.toml +++ b/vhost_user_net/Cargo.toml @@ -1,16 +1,16 @@ [package] authors = ["The Cloud Hypervisor Authors"] build = "../build.rs" -edition = "2021" +edition.workspace = true name = "vhost_user_net" version = "0.1.0" [dependencies] -clap = { version = "4.5.13", features = ["cargo", "wrap_help"] } -env_logger = "0.11.3" -epoll = "4.3.3" -libc = "0.2.167" -log = "0.4.22" +clap = { workspace = true, features = ["cargo", "wrap_help"] } +env_logger = { workspace = true } +epoll = { workspace = true } +libc = { workspace = true } +log = { workspace = true } net_util = { path = "../net_util" } option_parser = { path = "../option_parser" } thiserror = { workspace = true } @@ -19,3 +19,6 @@ vhost-user-backend = { workspace = true } virtio-bindings = { workspace = true } vm-memory = { workspace = true } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/vhost_user_net/src/lib.rs b/vhost_user_net/src/lib.rs index 433f0c7a29..b5c0aa8019 100644 --- a/vhost_user_net/src/lib.rs +++ b/vhost_user_net/src/lib.rs @@ -15,12 +15,12 @@ use std::{io, process}; use libc::EFD_NONBLOCK; use log::*; use net_util::{ - open_tap, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, TxVirtio, + MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, TxVirtio, open_tap, }; use option_parser::{OptionParser, OptionParserError, Toggle}; use thiserror::Error; -use vhost::vhost_user::message::*; use vhost::vhost_user::Listener; +use vhost::vhost_user::message::*; use vhost_user_backend::bitmap::BitmapMmapRegion; use vhost_user_backend::{VhostUserBackendMut, VhostUserDaemon, VringRwLock, VringT}; use virtio_bindings::virtio_config::{VIRTIO_F_NOTIFY_ON_EMPTY, VIRTIO_F_VERSION_1}; @@ -205,7 +205,7 @@ impl VhostUserBackendMut for VhostUserNetBackend { vrings: &[VringRwLock>], thread_id: usize, ) -> VhostUserBackendResult<()> { - let mut thread = self.threads[thread_id].lock().unwrap(); + let thread = self.threads[thread_id].get_mut().unwrap(); match device_event { 0 => { if !thread.net.rx_tap_listening { @@ -348,7 +348,7 @@ pub fn start_net_backend(backend_command: &str) { let backend_config = match VhostUserNetBackendConfig::parse(backend_command) { Ok(config) => config, Err(e) => { - eprintln!("Failed parsing parameters {e:?}"); + error!("Failed parsing parameters {e:?}"); process::exit(1); } }; diff --git a/virtio-devices/Cargo.toml b/virtio-devices/Cargo.toml index fd86b5e3cf..5cbfe145f4 100644 --- a/virtio-devices/Cargo.toml +++ b/virtio-devices/Cargo.toml @@ -1,31 
+1,30 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "virtio-devices" version = "0.1.0" [features] default = [] +kvm = ["pci/kvm"] +mshv = ["pci/mshv"] sev_snp = ["mshv-ioctls"] [dependencies] -anyhow = "1.0.94" -arc-swap = "1.7.1" +anyhow = { workspace = true } block = { path = "../block" } -byteorder = "1.5.0" -epoll = "4.3.3" +byteorder = { workspace = true } +epoll = { workspace = true } event_monitor = { path = "../event_monitor" } -libc = "0.2.167" -log = "0.4.22" +libc = { workspace = true } +log = { workspace = true } mshv-ioctls = { workspace = true, optional = true } -net_gen = { path = "../net_gen" } net_util = { path = "../net_util" } pci = { path = "../pci" } rate_limiter = { path = "../rate_limiter" } seccompiler = { workspace = true } -serde = { version = "1.0.208", features = ["derive"] } -serde_json = { workspace = true } -serde_with = { version = "3.9.0", default-features = false, features = [ +serde = { workspace = true, features = ["derive"] } +serde_with = { workspace = true, default-features = false, features = [ "macros", ] } serial_buffer = { path = "../serial_buffer" } @@ -36,7 +35,7 @@ vhost = { workspace = true, features = [ "vhost-user-frontend", "vhost-vdpa", ] } -virtio-bindings = { workspace = true, features = ["virtio-v5_0_0"] } +virtio-bindings = { workspace = true } virtio-queue = { workspace = true } vm-allocator = { path = "../vm-allocator" } vm-device = { path = "../vm-device" } @@ -48,3 +47,6 @@ vm-memory = { workspace = true, features = [ vm-migration = { path = "../vm-migration" } vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/virtio-devices/src/balloon.rs b/virtio-devices/src/balloon.rs index 890dfbd9fc..1a5e202fdf 100644 --- a/virtio-devices/src/balloon.rs +++ b/virtio-devices/src/balloon.rs @@ -37,9 +37,9 @@ use vmm_sys_util::eventfd::EventFd; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{ - ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, GuestMemoryMmap, - VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, - EPOLL_HELPER_EVENT_LAST, VIRTIO_F_VERSION_1, + ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, + GuestMemoryMmap, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, + VirtioInterrupt, VirtioInterruptType, }; const QUEUE_SIZE: u16 = 128; @@ -265,7 +265,7 @@ impl BalloonEpollHandler { error!("The head contains the request type is not right"); return Err(Error::UnexpectedWriteOnlyDescriptor); } - if desc.len() as usize % data_chunk_size != 0 { + if !(desc.len() as usize).is_multiple_of(data_chunk_size) { error!("the request size {} is not right", desc.len()); return Err(Error::InvalidRequest); } @@ -575,12 +575,12 @@ impl VirtioDevice for Balloon { let data_len = data.len() as u64; if offset + data_len > config_len { error!( - "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}", - config_len, - offset, - data_len, - self.device_type() - ); + "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}", + config_len, + offset, + data_len, + self.device_type() + ); return; } diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index ff28f3ba76..4b3ff81775 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -18,11 +18,13 @@ use std::sync::{Arc, Barrier}; use 
std::{io, result}; use anyhow::anyhow; -use block::async_io::{AsyncIo, AsyncIoError, DiskFile}; -use block::fcntl::{get_lock_state, LockError, LockType}; -use block::{build_serial, fcntl, Request, RequestType, VirtioBlockConfig}; -use rate_limiter::group::{RateLimiterGroup, RateLimiterGroupHandle}; +use block::async_io::{AsyncIo, AsyncIoError, DiskFile, DiskFileError}; +use block::fcntl::{LockError, LockGranularity, LockType, get_lock_state}; +use block::{ + ExecuteAsync, ExecuteError, Request, RequestType, VirtioBlockConfig, build_serial, fcntl, +}; use rate_limiter::TokenType; +use rate_limiter::group::{RateLimiterGroup, RateLimiterGroupHandle}; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -36,9 +38,9 @@ use vm_virtio::AccessPlatform; use vmm_sys_util::eventfd::EventFd; use super::{ - ActivateError, ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterruptType, - EPOLL_HELPER_EVENT_LAST, + ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, + EpollHelperHandler, Error as DeviceError, VirtioCommon, VirtioDevice, VirtioDeviceType, + VirtioInterruptType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; @@ -91,6 +93,14 @@ pub enum Error { /// The path of the disk image. path: PathBuf, }, + #[error("disk image size is not a multiple of {}", SECTOR_SIZE)] + InvalidSize, + #[error("Failed to pause/resume vcpus")] + FailedPauseResume(#[source] MigratableError), + #[error("Failed signal config interrupt")] + FailedSignalingConfigChange(#[source] io::Error), + #[error("Disk resize failed")] + FailedDiskResize(#[source] DiskFileError), } pub type Result = result::Result; @@ -133,7 +143,7 @@ struct BlockEpollHandler { queue: Queue, mem: GuestMemoryAtomic, disk_image: Box, - disk_nsectors: u64, + disk_nsectors: Arc, interrupt_cb: Arc, serial: Vec, kill_evt: EventFd, @@ -144,13 +154,29 @@ struct BlockEpollHandler { inflight_requests: VecDeque<(u16, Request)>, rate_limiter: Option, access_platform: Option>, - read_only: bool, host_cpus: Option>, + acked_features: u64, +} + +fn has_feature(features: u64, feature_flag: u64) -> bool { + (features & (1u64 << feature_flag)) != 0 } impl BlockEpollHandler { + fn check_request(features: u64, request_type: RequestType) -> result::Result<(), ExecuteError> { + if has_feature(features, VIRTIO_BLK_F_RO.into()) && request_type != RequestType::In { + // For virtio spec compliance + // "A device MUST set the status byte to VIRTIO_BLK_S_IOERR for a write request + // if the VIRTIO_BLK_F_RO feature if offered, and MUST NOT write any data." + return Err(ExecuteError::ReadOnly); + } + Ok(()) + } + fn process_queue_submit(&mut self) -> Result<()> { let queue = &mut self.queue; + let mut batch_requests = Vec::new(); + let mut batch_inflight_requests = Vec::new(); while let Some(mut desc_chain) = queue.pop_descriptor_chain(self.mem.memory()) { let mut request = Request::parse(&mut desc_chain, self.access_platform.as_ref()) @@ -159,10 +185,8 @@ impl BlockEpollHandler { // For virtio spec compliance // "A device MUST set the status byte to VIRTIO_BLK_S_IOERR for a write request // if the VIRTIO_BLK_F_RO feature if offered, and MUST NOT write any data." 
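A minimal sketch (not part of the patch) of the feature-bit test that the new check_request/has_feature pair above relies on; VIRTIO_BLK_F_RO is written out here as its spec-defined bit position, and the helper names are local to the example:

// Hedged illustration only: a device is read-only when the guest negotiated
// VIRTIO_BLK_F_RO, and the spec then requires VIRTIO_BLK_S_IOERR for writes.
const VIRTIO_BLK_F_RO: u64 = 5; // bit position from the virtio specification

fn is_read_only(acked_features: u64) -> bool {
    acked_features & (1u64 << VIRTIO_BLK_F_RO) != 0
}

fn write_must_fail(acked_features: u64, is_write_request: bool) -> bool {
    // Any request that would mutate the disk must complete with
    // VIRTIO_BLK_S_IOERR when the device was offered as read-only.
    is_read_only(acked_features) && is_write_request
}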
- if self.read_only - && (request.request_type == RequestType::Out - || request.request_type == RequestType::Flush) - { + if let Err(e) = Self::check_request(self.acked_features, request.request_type) { + warn!("Request check failed: {:x?} {:?}", request, e); desc_chain .memory() .write_obj(VIRTIO_BLK_S_IOERR, request.status_addr) @@ -214,15 +238,29 @@ impl BlockEpollHandler { let result = request.execute_async( desc_chain.memory(), - self.disk_nsectors, + self.disk_nsectors.load(Ordering::SeqCst), self.disk_image.as_mut(), &self.serial, desc_chain.head_index() as u64, ); - if let Ok(true) = result { - self.inflight_requests - .push_back((desc_chain.head_index(), request)); + if let Ok(ExecuteAsync { + async_complete: true, + batch_request, + }) = result + { + if let Some(batch_request) = batch_request { + match batch_request.request_type { + RequestType::In | RequestType::Out => batch_requests.push(batch_request), + _ => { + unreachable!( + "Unexpected batch request type: {:?}", + request.request_type + ) + } + } + } + batch_inflight_requests.push((desc_chain.head_index(), request)); } else { let status = match result { Ok(_) => VIRTIO_BLK_S_OK, @@ -248,6 +286,31 @@ impl BlockEpollHandler { } } + match self.disk_image.submit_batch_requests(&batch_requests) { + Ok(()) => { + self.inflight_requests.extend(batch_inflight_requests); + } + Err(e) => { + // If batch submission fails, report VIRTIO_BLK_S_IOERR for all requests. + for (user_data, request) in batch_inflight_requests { + warn!( + "Request failed with batch submission: {:x?} {:?}", + request, e + ); + let desc_index = user_data; + let mem = self.mem.memory(); + mem.write_obj(VIRTIO_BLK_S_IOERR as u8, request.status_addr) + .map_err(Error::RequestStatus)?; + queue + .add_used(mem.deref(), desc_index, 0) + .map_err(Error::QueueAddUsed)?; + queue + .enable_notification(mem.deref()) + .map_err(Error::QueueEnableNotification)?; + } + } + } + Ok(()) } @@ -576,14 +639,13 @@ pub struct Block { id: String, disk_image: Box, disk_path: PathBuf, - disk_nsectors: u64, + disk_nsectors: Arc, config: VirtioBlockConfig, writeback: Arc, counters: BlockCounters, seccomp_action: SeccompAction, rate_limiter: Option>, exit_evt: EventFd, - read_only: bool, serial: Vec, queue_affinity: BTreeMap>, } @@ -708,33 +770,58 @@ impl Block { id, disk_image, disk_path, - disk_nsectors, + disk_nsectors: Arc::new(AtomicU64::new(disk_nsectors)), config, writeback: Arc::new(AtomicBool::new(true)), counters: BlockCounters::default(), seccomp_action, rate_limiter, exit_evt, - read_only, serial, queue_affinity, }) } + fn read_only(&self) -> bool { + has_feature(self.features(), VIRTIO_BLK_F_RO.into()) + } + + /// Returns the granularity for the advisory lock for this disk. + // TODO In future, we could add a `lock_granularity=` configuration to the CLI. + // For now, we stick to QEMU behavior. + fn lock_granularity(&mut self) -> LockGranularity { + let fallback = LockGranularity::WholeFile; + + self.disk_image + .size() + .map(|size| LockGranularity::ByteRange(0, size)) + // use a safe fallback + .unwrap_or_else(|e| { + log::warn!( + "Can't get disk size for id={},path={}, falling back to {:?}: error: {e}", + self.id, + self.disk_path.display(), + fallback + ); + fallback + }) + } + /// Tries to set an advisory lock for the corresponding disk image. 
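A rough sketch (not part of the patch), assuming Linux OFD locks underneath: LockGranularity::ByteRange(0, size) roughly corresponds to locking the first size bytes of the image, while the WholeFile fallback corresponds to l_len == 0 (lock until end of file). The helper name below is hypothetical:

use std::os::fd::RawFd;

fn try_lock_bytes(fd: RawFd, write: bool, start: i64, len: i64) -> std::io::Result<()> {
    let l_type = if write { libc::F_WRLCK } else { libc::F_RDLCK };
    let lock = libc::flock {
        l_type: l_type as i16,
        l_whence: libc::SEEK_SET as i16,
        l_start: start,
        l_len: len, // 0 means "until EOF", i.e. a whole-file lock
        l_pid: 0,
    };
    // SAFETY: `fd` is an open file descriptor and `lock` is fully initialized.
    if unsafe { libc::fcntl(fd, libc::F_OFD_SETLK, &lock) } == -1 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}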
pub fn try_lock_image(&mut self) -> Result<()> { - let lock_type = match self.read_only { + let lock_type = match self.read_only() { true => LockType::Read, false => LockType::Write, }; + let granularity = self.lock_granularity(); log::debug!( - "Attempting to acquire {lock_type:?} lock for disk image id={},path={}", + "Attempting to acquire {lock_type:?} lock for disk image: id={},path={},granularity={granularity:?}", self.id, self.disk_path.display() ); let fd = self.disk_image.fd(); - fcntl::try_acquire_lock(fd, lock_type).map_err(|error| { - let current_lock = get_lock_state(fd); + fcntl::try_acquire_lock(fd, lock_type, granularity).map_err(|error| { + let current_lock = get_lock_state(fd, granularity); // Don't propagate the error to the outside, as it is not useful at all. Instead, // we try to log additional help to the user. if let Ok(current_lock) = current_lock { @@ -758,10 +845,12 @@ impl Block { /// Releases the advisory lock held for the corresponding disk image. pub fn unlock_image(&mut self) -> Result<()> { + let granularity = self.lock_granularity(); + // It is very unlikely that this fails; // Should we remove the Result to simplify the error propagation on // higher levels? - fcntl::clear_lock(self.disk_image.fd()).map_err(|error| Error::LockDiskImage { + fcntl::clear_lock(self.disk_image.fd(), granularity).map_err(|error| Error::LockDiskImage { path: self.disk_path.clone(), error, lock_type: LockType::Unlock, @@ -771,7 +860,7 @@ impl Block { fn state(&self) -> BlockState { BlockState { disk_path: self.disk_path.to_str().unwrap().to_owned(), - disk_nsectors: self.disk_nsectors, + disk_nsectors: self.disk_nsectors.load(Ordering::SeqCst), avail_features: self.common.avail_features, acked_features: self.common.acked_features, config: self.config, @@ -798,6 +887,34 @@ impl Block { self.writeback.store(writeback, Ordering::Release); } + pub fn resize(&mut self, new_size: u64) -> Result<()> { + if !new_size.is_multiple_of(SECTOR_SIZE) { + return Err(Error::InvalidSize); + } + + self.disk_image + .resize(new_size) + .map_err(Error::FailedDiskResize)?; + + let nsectors = new_size / SECTOR_SIZE; + + self.common.pause().map_err(Error::FailedPauseResume)?; + + self.disk_nsectors.store(nsectors, Ordering::SeqCst); + self.config.capacity = nsectors; + self.state().disk_nsectors = nsectors; + + self.common.resume().map_err(Error::FailedPauseResume)?; + + if let Some(interrupt_cb) = self.common.interrupt_cb.as_ref() { + interrupt_cb + .trigger(VirtioInterruptType::Config) + .map_err(Error::FailedSignalingConfigChange) + } else { + Ok(()) + } + } + #[cfg(fuzzing)] pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); @@ -885,7 +1002,7 @@ impl VirtioDevice for Block { error!("failed to create new AsyncIo: {}", e); ActivateError::BadActivate })?, - disk_nsectors: self.disk_nsectors, + disk_nsectors: self.disk_nsectors.clone(), interrupt_cb: interrupt_cb.clone(), serial: self.serial.clone(), kill_evt, @@ -904,8 +1021,8 @@ impl VirtioDevice for Block { .transpose() .unwrap(), access_platform: self.common.access_platform.clone(), - read_only: self.read_only, host_cpus: self.queue_affinity.get(&queue_idx).cloned(), + acked_features: self.common.acked_features, }; let paused = self.common.paused.clone(); diff --git a/virtio-devices/src/console.rs b/virtio-devices/src/console.rs index f05c0c4f4d..6f237b5eed 100644 --- a/virtio-devices/src/console.rs +++ b/virtio-devices/src/console.rs @@ -22,9 +22,9 @@ use vm_virtio::{AccessPlatform, Translatable}; use 
vmm_sys_util::eventfd::EventFd; use super::{ - ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, Error as DeviceError, - VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST, - VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, + Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, + VirtioDeviceType, VirtioInterruptType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; @@ -454,12 +454,11 @@ impl EpollHelperHandler for ConsoleEpollHandler { } if self.endpoint.is_pty() { self.file_event_registered = false; - if event.events & libc::EPOLLHUP as u32 != 0 { - if let Some(pty_write_out) = &self.write_out { - if pty_write_out.load(Ordering::Acquire) { - pty_write_out.store(false, Ordering::Release); - } - } + if event.events & libc::EPOLLHUP as u32 != 0 + && let Some(pty_write_out) = &self.write_out + && pty_write_out.load(Ordering::Acquire) + { + pty_write_out.store(false, Ordering::Release); } else { // If the EPOLLHUP flag is not up on the associated event, we // can assume the other end of the PTY is connected and therefore @@ -731,10 +730,10 @@ impl VirtioDevice for Console { .acked_features .store(self.common.acked_features, Ordering::Relaxed); - if self.common.feature_acked(VIRTIO_CONSOLE_F_SIZE) { - if let Err(e) = interrupt_cb.trigger(VirtioInterruptType::Config) { - error!("Failed to signal console driver: {:?}", e); - } + if self.common.feature_acked(VIRTIO_CONSOLE_F_SIZE) + && let Err(e) = interrupt_cb.trigger(VirtioInterruptType::Config) + { + error!("Failed to signal console driver: {:?}", e); } let (kill_evt, pause_evt) = self.common.dup_eventfds(); diff --git a/virtio-devices/src/device.rs b/virtio-devices/src/device.rs index 06b245e271..c0d24902a1 100644 --- a/virtio-devices/src/device.rs +++ b/virtio-devices/src/device.rs @@ -15,6 +15,7 @@ use std::thread; use libc::EFD_NONBLOCK; use virtio_queue::Queue; +use vm_device::UserspaceMapping; use vm_memory::{GuestAddress, GuestMemoryAtomic, GuestUsize}; use vm_migration::{MigratableError, Pausable}; use vm_virtio::{AccessPlatform, VirtioDeviceType}; @@ -37,15 +38,6 @@ pub trait VirtioInterrupt: Send + Sync { } } -#[derive(Clone)] -pub struct UserspaceMapping { - pub host_addr: u64, - pub mem_slot: u32, - pub addr: GuestAddress, - pub len: GuestUsize, - pub mergeable: bool, -} - #[derive(Clone)] pub struct VirtioSharedMemory { pub offset: u64, diff --git a/virtio-devices/src/iommu.rs b/virtio-devices/src/iommu.rs index 6a0ccd6be3..f2795a8cff 100644 --- a/virtio-devices/src/iommu.rs +++ b/virtio-devices/src/iommu.rs @@ -24,8 +24,8 @@ use vm_virtio::AccessPlatform; use vmm_sys_util::eventfd::EventFd; use super::{ - ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, Error as DeviceError, - VirtioCommon, VirtioDevice, VirtioDeviceType, EPOLL_HELPER_EVENT_LAST, VIRTIO_F_VERSION_1, + ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, + Error as DeviceError, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; @@ -421,13 +421,12 @@ impl Request { // If any other mappings exist in the domain for other containers, // make sure to issue these mappings for the new endpoint/container if let Some(domain_mappings) = &mapping.domains.read().unwrap().get(&domain_id) + && let 
Some(ext_map) = ext_mapping.get(&endpoint) { - if let Some(ext_map) = ext_mapping.get(&endpoint) { - for (virt_start, addr_map) in &domain_mappings.mappings { - ext_map - .map(*virt_start, addr_map.gpa, addr_map.size) - .map_err(Error::ExternalUnmapping)?; - } + for (virt_start, addr_map) in &domain_mappings.mappings { + ext_map + .map(*virt_start, addr_map.gpa, addr_map.size) + .map_err(Error::ExternalUnmapping)?; } } @@ -489,7 +488,7 @@ impl Request { .write() .unwrap() .iter() - .filter(|(_, &d)| d == domain_id) + .filter(|&(_, &d)| d == domain_id) .map(|(&e, _)| e) .collect(); @@ -553,7 +552,7 @@ impl Request { .write() .unwrap() .iter() - .filter(|(_, &d)| d == domain_id) + .filter(|&(_, &d)| d == domain_id) .map(|(&e, _)| e) .collect(); @@ -575,7 +574,7 @@ impl Request { .get_mut(&domain_id) .unwrap() .mappings - .retain(|&x, _| (x < req.virt_start || x > req.virt_end)); + .retain(|&x, _| x < req.virt_start || x > req.virt_end); } VIRTIO_IOMMU_T_PROBE => { if desc_size_left != size_of::() { @@ -654,13 +653,13 @@ fn detach_endpoint_from_domain( mapping.endpoints.write().unwrap().remove(&endpoint); // Trigger external unmapping for the endpoint if necessary. - if let Some(domain_mappings) = &mapping.domains.read().unwrap().get(&domain_id) { - if let Some(ext_map) = ext_mapping.get(&endpoint) { - for (virt_start, addr_map) in &domain_mappings.mappings { - ext_map - .unmap(*virt_start, addr_map.size) - .map_err(Error::ExternalUnmapping)?; - } + if let Some(domain_mappings) = &mapping.domains.read().unwrap().get(&domain_id) + && let Some(ext_map) = ext_mapping.get(&endpoint) + { + for (virt_start, addr_map) in &domain_mappings.mappings { + ext_map + .unmap(*virt_start, addr_map.size) + .map_err(Error::ExternalUnmapping)?; } } @@ -669,7 +668,7 @@ fn detach_endpoint_from_domain( .write() .unwrap() .iter() - .filter(|(_, &d)| d == domain_id) + .filter(|&(_, &d)| d == domain_id) .count() == 0 { diff --git a/virtio-devices/src/lib.rs b/virtio-devices/src/lib.rs index 6a21eb2734..86359da657 100644 --- a/virtio-devices/src/lib.rs +++ b/virtio-devices/src/lib.rs @@ -47,14 +47,14 @@ pub use self::balloon::Balloon; pub use self::block::{Block, BlockState}; pub use self::console::{Console, ConsoleResizer, Endpoint}; pub use self::device::{ - DmaRemapping, UserspaceMapping, VirtioCommon, VirtioDevice, VirtioInterrupt, - VirtioInterruptType, VirtioSharedMemoryList, + DmaRemapping, VirtioCommon, VirtioDevice, VirtioInterrupt, VirtioInterruptType, + VirtioSharedMemoryList, }; pub use self::epoll_helper::{ - EpollHelper, EpollHelperError, EpollHelperHandler, EPOLL_HELPER_EVENT_LAST, + EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, }; pub use self::iommu::{AccessPlatformMapping, Iommu, IommuMapping}; -pub use self::mem::{BlocksState, Mem, VirtioMemMappingSource, VIRTIO_MEM_ALIGN_SIZE}; +pub use self::mem::{BlocksState, Mem, VIRTIO_MEM_ALIGN_SIZE, VirtioMemMappingSource}; pub use self::net::{Net, NetCtrlEpollHandler}; pub use self::pmem::Pmem; pub use self::rng::Rng; diff --git a/virtio-devices/src/mem.rs b/virtio-devices/src/mem.rs index 5739a12fae..0be5b24f62 100644 --- a/virtio-devices/src/mem.rs +++ b/virtio-devices/src/mem.rs @@ -18,7 +18,7 @@ use std::collections::BTreeMap; use std::mem::size_of; use std::os::unix::io::{AsRawFd, RawFd}; use std::sync::atomic::AtomicBool; -use std::sync::{mpsc, Arc, Barrier, Mutex}; +use std::sync::{Arc, Barrier, Mutex, mpsc}; use std::{io, result}; use anyhow::anyhow; @@ -36,9 +36,9 @@ use vm_migration::{Migratable, MigratableError, 
Pausable, Snapshot, Snapshottabl use vmm_sys_util::eventfd::EventFd; use super::{ - ActivateError, ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, VirtioCommon, VirtioDevice, VirtioDeviceType, EPOLL_HELPER_EVENT_LAST, - VIRTIO_F_VERSION_1, + ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, + VirtioDeviceType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; @@ -193,35 +193,35 @@ unsafe impl ByteValued for VirtioMemConfig {} impl VirtioMemConfig { fn validate(&self) -> result::Result<(), Error> { - if self.addr % self.block_size != 0 { + if !self.addr.is_multiple_of(self.block_size) { return Err(Error::ValidateError(anyhow!( "addr 0x{:x} is not aligned on block_size 0x{:x}", self.addr, self.block_size ))); } - if self.region_size % self.block_size != 0 { + if !self.region_size.is_multiple_of(self.block_size) { return Err(Error::ValidateError(anyhow!( "region_size 0x{:x} is not aligned on block_size 0x{:x}", self.region_size, self.block_size ))); } - if self.usable_region_size % self.block_size != 0 { + if !self.usable_region_size.is_multiple_of(self.block_size) { return Err(Error::ValidateError(anyhow!( "usable_region_size 0x{:x} is not aligned on block_size 0x{:x}", self.usable_region_size, self.block_size ))); } - if self.plugged_size % self.block_size != 0 { + if !self.plugged_size.is_multiple_of(self.block_size) { return Err(Error::ValidateError(anyhow!( "plugged_size 0x{:x} is not aligned on block_size 0x{:x}", self.plugged_size, self.block_size ))); } - if self.requested_size % self.block_size != 0 { + if !self.requested_size.is_multiple_of(self.block_size) { return Err(Error::ValidateError(anyhow!( "requested_size 0x{:x} is not aligned on block_size 0x{:x}", self.requested_size, @@ -244,7 +244,7 @@ impl VirtioMemConfig { size, self.region_size ))); - } else if size % self.block_size != 0 { + } else if !size.is_multiple_of(self.block_size) { return Err(Error::ResizeError(anyhow!( "new size 0x{:x} is not aligned on block_size 0x{:x}", size, @@ -267,7 +267,7 @@ impl VirtioMemConfig { // Start address must be aligned on block_size, the size must be // greater than 0, and all blocks covered by the request must be // in the usable region. - if addr % self.block_size != 0 + if !addr.is_multiple_of(self.block_size) || size == 0 || (addr < self.addr || addr + size > self.addr + self.usable_region_size) { @@ -392,6 +392,8 @@ impl BlocksState { } } + // TODO We can avoid creating a new bitmap here, if we switch the code + // to use Vec to keep dirty bits and just pass it as is. 
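A small sketch (not part of the patch) of the is_multiple_of rewrite used throughout these validation checks; it is stable for unsigned integers on recent Rust and behaves like x % n == 0 except that a zero divisor no longer panics:

fn is_aligned(value: u64, block_size: u64) -> bool {
    value.is_multiple_of(block_size)
}

fn main() {
    assert!(is_aligned(0x20_0000, 0x20_0000));
    assert!(!is_aligned(0x20_0400, 0x20_0000));
    // The only divergence from `%`: a zero divisor returns false for non-zero
    // values (and true for 0) instead of panicking.
    assert!(!is_aligned(4096, 0));
    assert!(is_aligned(0, 0));
}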
MemoryRangeTable::from_bitmap(bitmap, start_addr, VIRTIO_MEM_DEFAULT_BLOCK_SIZE) } } @@ -475,11 +477,9 @@ impl MemEpollHandler { return VIRTIO_MEM_RESP_ERROR; } - if !plug { - if let Err(e) = self.discard_memory_range(offset, size) { - error!("failed discarding memory range: {:?}", e); - return VIRTIO_MEM_RESP_ERROR; - } + if !plug && let Err(e) = self.discard_memory_range(offset, size) { + error!("failed discarding memory range: {:?}", e); + return VIRTIO_MEM_RESP_ERROR; } self.blocks_state diff --git a/virtio-devices/src/net.rs b/virtio-devices/src/net.rs index 7d2a4a4597..2c4e085ca2 100644 --- a/virtio-devices/src/net.rs +++ b/virtio-devices/src/net.rs @@ -18,8 +18,8 @@ use anyhow::anyhow; #[cfg(not(fuzzing))] use net_util::virtio_features_to_tap_offload; use net_util::{ - build_net_config_space, build_net_config_space_with_mq, open_tap, CtrlQueue, MacAddr, - NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, TapError, TxVirtio, VirtioNetConfig, + CtrlQueue, MacAddr, NetCounters, NetQueuePair, OpenTapError, RxVirtio, Tap, TapError, TxVirtio, + VirtioNetConfig, build_net_config_space, build_net_config_space_with_mq, open_tap, }; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; @@ -34,9 +34,9 @@ use vm_virtio::AccessPlatform; use vmm_sys_util::eventfd::EventFd; use super::{ - ActivateError, ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, RateLimiterConfig, VirtioCommon, VirtioDevice, VirtioDeviceType, - VirtioInterruptType, EPOLL_HELPER_EVENT_LAST, + ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, + EpollHelperHandler, Error as DeviceError, RateLimiterConfig, VirtioCommon, VirtioDevice, + VirtioDeviceType, VirtioInterruptType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; @@ -255,9 +255,9 @@ impl NetEpollHandler { || !self.driver_awake { self.signal_used_queue(self.queue_index_base)?; - debug!("Signalling RX queue"); + trace!("Signalling RX queue"); } else { - debug!("Not signalling RX queue"); + trace!("Not signalling RX queue"); } Ok(()) } @@ -613,11 +613,12 @@ impl Net { for fd in fds.iter() { // Duplicate so that it can survive reboots // SAFETY: FFI call to dup. Trivially safe. - let fd = unsafe { libc::dup(*fd) }; - if fd < 0 { + let fd_duped = unsafe { libc::dup(*fd) }; + if fd_duped < 0 { return Err(Error::DuplicateTapFd(std::io::Error::last_os_error())); } - let tap = Tap::from_tap_fd(fd, num_queue_pairs).map_err(Error::TapError)?; + debug!("dup'ed fd {fd} => {fd_duped} for virtio-net device {id}"); + let tap = Tap::from_tap_fd(fd_duped, num_queue_pairs).map_err(Error::TapError)?; taps.push(tap); } @@ -661,16 +662,29 @@ impl Net { impl Drop for Net { fn drop(&mut self) { + // Get a comma-separated list of the interface names of the tap devices + // associated with this network device. + let ifnames_str = self + .taps + .iter() + .map(|tap| tap.if_name_as_str()) + .collect::>(); + let ifnames_str = ifnames_str.join(","); + debug!( + "virtio-net device closed: id={}, ifnames=[{ifnames_str}]", + self.id + ); + if let Some(kill_evt) = self.common.kill_evt.take() { // Ignore the result because there is nothing we can do about it. 
let _ = kill_evt.write(1); } // Needed to ensure all references to tap FDs are dropped (#4868) self.common.wait_for_epoll_threads(); - if let Some(thread) = self.ctrl_queue_epoll_thread.take() { - if let Err(e) = thread.join() { - error!("Error joining thread: {:?}", e); - } + if let Some(thread) = self.ctrl_queue_epoll_thread.take() + && let Err(e) = thread.join() + { + error!("Error joining thread: {:?}", e); } } } @@ -706,7 +720,7 @@ impl VirtioDevice for Net { let num_queues = queues.len(); let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); - if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && num_queues % 2 != 0 { + if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2) { let ctrl_queue_index = num_queues - 1; let (_, mut ctrl_queue, ctrl_queue_evt) = queues.remove(ctrl_queue_index); diff --git a/virtio-devices/src/pmem.rs b/virtio-devices/src/pmem.rs index 5f7ee9457b..8e41f0ec05 100644 --- a/virtio-devices/src/pmem.rs +++ b/virtio-devices/src/pmem.rs @@ -18,6 +18,7 @@ use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; use virtio_queue::{DescriptorChain, Queue, QueueT}; +use vm_device::UserspaceMapping; use vm_memory::{ Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic, GuestMemoryError, GuestMemoryLoadGuard, @@ -27,9 +28,9 @@ use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; use super::{ - ActivateError, ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, UserspaceMapping, VirtioCommon, VirtioDevice, VirtioDeviceType, - EPOLL_HELPER_EVENT_LAST, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + VirtioCommon, VirtioDevice, VirtioDeviceType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; diff --git a/virtio-devices/src/rng.rs b/virtio-devices/src/rng.rs index 8429e3b31e..6dccb2de19 100644 --- a/virtio-devices/src/rng.rs +++ b/virtio-devices/src/rng.rs @@ -21,9 +21,9 @@ use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; use super::{ - ActivateError, ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, VirtioCommon, VirtioDevice, VirtioDeviceType, EPOLL_HELPER_EVENT_LAST, - VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_IOMMU_PLATFORM, VIRTIO_F_VERSION_1, + VirtioCommon, VirtioDevice, VirtioDeviceType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; diff --git a/virtio-devices/src/seccomp_filters.rs b/virtio-devices/src/seccomp_filters.rs index 2579e4f142..5986e72ea3 100644 --- a/virtio-devices/src/seccomp_filters.rs +++ b/virtio-devices/src/seccomp_filters.rs @@ -4,6 +4,7 @@ // // SPDX-License-Identifier: Apache-2.0 +use libc::{FIONBIO, TIOCGWINSZ, TUNSETOFFLOAD}; use seccompiler::SeccompCmpOp::Eq; use seccompiler::{ BpfProgram, Error, SeccompAction, SeccompCmpArgLen as ArgLen, SeccompCondition as Cond, @@ -46,26 +47,21 @@ macro_rules! or { ($($x:expr),*) => (vec![$($x),*]) } -// See include/uapi/asm-generic/ioctls.h in the kernel code. 
-const TIOCGWINSZ: u64 = 0x5413; -const FIONBIO: u64 = 0x5421; - // See include/uapi/linux/vfio.h in the kernel code. const VFIO_IOMMU_MAP_DMA: u64 = 0x3b71; const VFIO_IOMMU_UNMAP_DMA: u64 = 0x3b72; -// See include/uapi/linux/if_tun.h in the kernel code. -const TUNSETOFFLOAD: u64 = 0x4004_54d0; - #[cfg(feature = "sev_snp")] fn mshv_sev_snp_ioctl_seccomp_rule() -> SeccompRule { - and![Cond::new( - 1, - ArgLen::Dword, - Eq, - mshv_ioctls::MSHV_MODIFY_GPA_HOST_ACCESS() - ) - .unwrap()] + and![ + Cond::new( + 1, + ArgLen::Dword, + Eq, + mshv_ioctls::MSHV_MODIFY_GPA_HOST_ACCESS() + ) + .unwrap() + ] } #[cfg(feature = "sev_snp")] @@ -75,7 +71,7 @@ fn create_mshv_sev_snp_ioctl_seccomp_rule() -> Vec { fn create_virtio_console_ioctl_seccomp_rule() -> Vec { or![ - and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ as _).unwrap()], #[cfg(feature = "sev_snp")] mshv_sev_snp_ioctl_seccomp_rule(), ] @@ -157,7 +153,7 @@ fn virtio_net_thread_rules() -> Vec<(i64, Vec)> { fn create_virtio_net_ctl_ioctl_seccomp_rule() -> Vec { or![ - and![Cond::new(1, ArgLen::Dword, Eq, TUNSETOFFLOAD).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, TUNSETOFFLOAD as _).unwrap()], #[cfg(feature = "sev_snp")] mshv_sev_snp_ioctl_seccomp_rule(), ] @@ -231,7 +227,7 @@ fn virtio_vhost_block_thread_rules() -> Vec<(i64, Vec)> { fn create_vsock_ioctl_seccomp_rule() -> Vec { or![ - and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO,).unwrap()], + and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO as _).unwrap()], #[cfg(feature = "sev_snp")] mshv_sev_snp_ioctl_seccomp_rule(), ] diff --git a/virtio-devices/src/thread_helper.rs b/virtio-devices/src/thread_helper.rs index 41eb99b7c7..74aaddf8d4 100644 --- a/virtio-devices/src/thread_helper.rs +++ b/virtio-devices/src/thread_helper.rs @@ -6,12 +6,12 @@ use std::panic::AssertUnwindSafe; use std::thread::{self, JoinHandle}; -use seccompiler::{apply_filter, SeccompAction}; +use seccompiler::{SeccompAction, apply_filter}; use vmm_sys_util::eventfd::EventFd; -use crate::epoll_helper::EpollHelperError; -use crate::seccomp_filters::{get_seccomp_filter, Thread}; use crate::ActivateError; +use crate::epoll_helper::EpollHelperError; +use crate::seccomp_filters::{Thread, get_seccomp_filter}; pub(crate) fn spawn_virtio_thread( name: &str, @@ -36,12 +36,12 @@ where thread::Builder::new() .name(name.to_string()) .spawn(move || { - if !seccomp_filter.is_empty() { - if let Err(e) = apply_filter(&seccomp_filter) { - error!("Error applying seccomp filter: {:?}", e); - thread_exit_evt.write(1).ok(); - return; - } + if !seccomp_filter.is_empty() + && let Err(e) = apply_filter(&seccomp_filter) + { + error!("Error applying seccomp filter: {:?}", e); + thread_exit_evt.write(1).ok(); + return; } match std::panic::catch_unwind(AssertUnwindSafe(f)) { Err(_) => { diff --git a/virtio-devices/src/transport/mod.rs b/virtio-devices/src/transport/mod.rs index fae6f166b8..9214de5dbe 100644 --- a/virtio-devices/src/transport/mod.rs +++ b/virtio-devices/src/transport/mod.rs @@ -5,7 +5,7 @@ use vmm_sys_util::eventfd::EventFd; mod pci_common_config; mod pci_device; -pub use pci_common_config::{VirtioPciCommonConfig, VIRTIO_PCI_COMMON_CONFIG_ID}; +pub use pci_common_config::{VIRTIO_PCI_COMMON_CONFIG_ID, VirtioPciCommonConfig}; pub use pci_device::{VirtioPciDevice, VirtioPciDeviceActivator, VirtioPciDeviceError}; pub trait VirtioTransport { diff --git a/virtio-devices/src/transport/pci_common_config.rs b/virtio-devices/src/transport/pci_common_config.rs index 
c87e36e92b..37790133a6 100644 --- a/virtio-devices/src/transport/pci_common_config.rs +++ b/virtio-devices/src/transport/pci_common_config.rs @@ -234,7 +234,7 @@ impl VirtioPciCommonConfig { } fn read_common_config_word(&self, offset: u64, queues: &[Queue]) -> u16 { - debug!("read_common_config_word: offset 0x{:x}", offset); + trace!("read_common_config_word: offset 0x{:x}", offset); match offset { 0x10 => self.msix_config.load(Ordering::Acquire), 0x12 => queues.len() as u16, // num_queues @@ -261,36 +261,28 @@ impl VirtioPciCommonConfig { let ready = value == 1; q.set_ready(ready); // Translate address of descriptor table and vrings. - if let Some(access_platform) = &self.access_platform { - if ready { - let desc_table = access_platform - .translate_gva( - q.desc_table(), - get_vring_size(VringType::Desc, q.size()), - ) - .unwrap(); - let avail_ring = access_platform - .translate_gva( - q.avail_ring(), - get_vring_size(VringType::Avail, q.size()), - ) - .unwrap(); - let used_ring = access_platform - .translate_gva(q.used_ring(), get_vring_size(VringType::Used, q.size())) - .unwrap(); - q.set_desc_table_address( - Some((desc_table & 0xffff_ffff) as u32), - Some((desc_table >> 32) as u32), - ); - q.set_avail_ring_address( - Some((avail_ring & 0xffff_ffff) as u32), - Some((avail_ring >> 32) as u32), - ); - q.set_used_ring_address( - Some((used_ring & 0xffff_ffff) as u32), - Some((used_ring >> 32) as u32), - ); - } + if ready && let Some(access_platform) = &self.access_platform { + let desc_table = access_platform + .translate_gva(q.desc_table(), get_vring_size(VringType::Desc, q.size())) + .unwrap(); + let avail_ring = access_platform + .translate_gva(q.avail_ring(), get_vring_size(VringType::Avail, q.size())) + .unwrap(); + let used_ring = access_platform + .translate_gva(q.used_ring(), get_vring_size(VringType::Used, q.size())) + .unwrap(); + q.set_desc_table_address( + Some((desc_table & 0xffff_ffff) as u32), + Some((desc_table >> 32) as u32), + ); + q.set_avail_ring_address( + Some((avail_ring & 0xffff_ffff) as u32), + Some((avail_ring >> 32) as u32), + ); + q.set_used_ring_address( + Some((used_ring & 0xffff_ffff) as u32), + Some((used_ring >> 32) as u32), + ); } }), _ => { diff --git a/virtio-devices/src/transport/pci_device.rs b/virtio-devices/src/transport/pci_device.rs index f493e32ab0..b05020bb47 100644 --- a/virtio-devices/src/transport/pci_device.rs +++ b/virtio-devices/src/transport/pci_device.rs @@ -35,11 +35,11 @@ use vm_virtio::AccessPlatform; use vmm_sys_util::eventfd::EventFd; use super::pci_common_config::VirtioPciCommonConfigState; -use crate::transport::{VirtioPciCommonConfig, VirtioTransport, VIRTIO_PCI_COMMON_CONFIG_ID}; +use crate::transport::{VIRTIO_PCI_COMMON_CONFIG_ID, VirtioPciCommonConfig, VirtioTransport}; use crate::{ - ActivateResult, GuestMemoryMmap, VirtioDevice, VirtioDeviceType, VirtioInterrupt, - VirtioInterruptType, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, DEVICE_FAILED, - DEVICE_FEATURES_OK, DEVICE_INIT, + ActivateResult, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, DEVICE_FAILED, + DEVICE_FEATURES_OK, DEVICE_INIT, GuestMemoryMmap, VirtioDevice, VirtioDeviceType, + VirtioInterrupt, VirtioInterruptType, }; /// Vector value used to disable MSI for a queue. @@ -968,18 +968,17 @@ impl PciDevice for VirtioPciDevice { if let Resource::PciBar { index, base, type_, .. 
} = resource + && index == VIRTIO_COMMON_BAR_INDEX { - if index == VIRTIO_COMMON_BAR_INDEX { - settings_bar_addr = Some(GuestAddress(base)); - use_64bit_bar = match type_ { - PciBarType::Io => { - return Err(PciDeviceError::InvalidResource(resource)) - } - PciBarType::Mmio32 => false, - PciBarType::Mmio64 => true, - }; - break; - } + settings_bar_addr = Some(GuestAddress(base)); + use_64bit_bar = match type_ { + PciBarType::Io => { + return Err(PciDeviceError::InvalidResource(resource)); + } + PciBarType::Mmio32 => false, + PciBarType::Mmio64 => true, + }; + break; } } // Error out if no resource was matching the BAR id. diff --git a/virtio-devices/src/vdpa.rs b/virtio-devices/src/vdpa.rs index 7a8952f211..6ca4f7ddaa 100644 --- a/virtio-devices/src/vdpa.rs +++ b/virtio-devices/src/vdpa.rs @@ -12,11 +12,12 @@ use anyhow::anyhow; use serde::{Deserialize, Serialize}; use thiserror::Error; use vhost::vdpa::{VhostVdpa, VhostVdpaIovaRange}; +use vhost::vhost_kern::VhostKernFeatures; use vhost::vhost_kern::vdpa::VhostKernVdpa; use vhost::vhost_kern::vhost_binding::VHOST_BACKEND_F_SUSPEND; -use vhost::vhost_kern::VhostKernFeatures; use vhost::{VhostBackend, VringConfigData}; -use virtio_queue::{Descriptor, Queue, QueueT}; +use virtio_queue::desc::RawDescriptor; +use virtio_queue::{Queue, QueueT}; use vm_device::dma_mapping::ExternalDmaMapping; use vm_memory::{GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; @@ -24,9 +25,9 @@ use vm_virtio::{AccessPlatform, Translatable}; use vmm_sys_util::eventfd::EventFd; use crate::{ - ActivateError, ActivateResult, GuestMemoryMmap, VirtioCommon, VirtioDevice, VirtioInterrupt, - VirtioInterruptType, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, DEVICE_FEATURES_OK, - VIRTIO_F_IOMMU_PLATFORM, + ActivateError, ActivateResult, DEVICE_ACKNOWLEDGE, DEVICE_DRIVER, DEVICE_DRIVER_OK, + DEVICE_FEATURES_OK, GuestMemoryMmap, VIRTIO_F_IOMMU_PLATFORM, VirtioCommon, VirtioDevice, + VirtioInterrupt, VirtioInterruptType, }; #[derive(Error, Debug)] @@ -245,7 +246,7 @@ impl Vdpa { flags: 0u32, desc_table_addr: queue.desc_table().translate_gpa( self.common.access_platform.as_ref(), - queue_size as usize * std::mem::size_of::(), + queue_size as usize * std::mem::size_of::(), ), used_ring_addr: queue.used_ring().translate_gpa( self.common.access_platform.as_ref(), diff --git a/virtio-devices/src/vhost_user/blk.rs b/virtio-devices/src/vhost_user/blk.rs index 87ca4130a8..7c0e20c1ac 100644 --- a/virtio-devices/src/vhost_user/blk.rs +++ b/virtio-devices/src/vhost_user/blk.rs @@ -9,8 +9,8 @@ use block::VirtioBlockConfig; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use vhost::vhost_user::message::{ - VhostUserConfigFlags, VhostUserProtocolFeatures, VhostUserVirtioFeatures, - VHOST_USER_CONFIG_OFFSET, + VHOST_USER_CONFIG_OFFSET, VhostUserConfigFlags, VhostUserProtocolFeatures, + VhostUserVirtioFeatures, }; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; use virtio_bindings::virtio_blk::{ @@ -26,11 +26,11 @@ use vmm_sys_util::eventfd::EventFd; use super::super::{ActivateResult, VirtioCommon, VirtioDevice, VirtioDeviceType}; use super::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; -use super::{Error, Result, DEFAULT_VIRTIO_FEATURES}; +use super::{DEFAULT_VIRTIO_FEATURES, Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use 
crate::vhost_user::VhostUserCommon; -use crate::{GuestMemoryMmap, GuestRegionMmap, VirtioInterrupt, VIRTIO_F_IOMMU_PLATFORM}; +use crate::{GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_IOMMU_PLATFORM, VirtioInterrupt}; const DEFAULT_QUEUE_NUMBER: usize = 1; @@ -134,8 +134,10 @@ impl Blk { }; if num_queues > backend_num_queues { - error!("vhost-user-blk requested too many queues ({}) since the backend only supports {}\n", - num_queues, backend_num_queues); + error!( + "vhost-user-blk requested too many queues ({}) since the backend only supports {}\n", + num_queues, backend_num_queues + ); return Err(Error::BadQueueNum); } @@ -211,16 +213,16 @@ impl Blk { impl Drop for Blk { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() { - if let Err(e) = kill_evt.write(1) { - error!("failed to kill vhost-user-blk: {:?}", e); - } + if let Some(kill_evt) = self.common.kill_evt.take() + && let Err(e) = kill_evt.write(1) + { + error!("failed to kill vhost-user-blk: {:?}", e); } self.common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() { - if let Err(e) = thread.join() { - error!("Error joining thread: {:?}", e); - } + if let Some(thread) = self.epoll_thread.take() + && let Err(e) = thread.join() + { + error!("Error joining thread: {:?}", e); } } } @@ -265,16 +267,15 @@ impl VirtioDevice for Blk { } self.config.writeback = data[0]; - if let Some(vu) = &self.vu_common.vu { - if let Err(e) = vu + if let Some(vu) = &self.vu_common.vu + && let Err(e) = vu .lock() .unwrap() .socket_handle() .set_config(offset as u32, VhostUserConfigFlags::WRITABLE, data) .map_err(Error::VhostUserSetConfig) - { - error!("Failed setting vhost-user-blk configuration: {:?}", e); - } + { + error!("Failed setting vhost-user-blk configuration: {:?}", e); } } @@ -327,11 +328,11 @@ impl VirtioDevice for Blk { self.common.resume().ok()?; } - if let Some(vu) = &self.vu_common.vu { - if let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {:?}", e); - return None; - } + if let Some(vu) = &self.vu_common.vu + && let Err(e) = vu.lock().unwrap().reset_vhost_user() + { + error!("Failed to reset vhost-user daemon: {:?}", e); + return None; } if let Some(kill_evt) = self.common.kill_evt.take() { diff --git a/virtio-devices/src/vhost_user/fs.rs b/virtio-devices/src/vhost_user/fs.rs index 1a24f1c2dd..c420bb7a33 100644 --- a/virtio-devices/src/vhost_user/fs.rs +++ b/virtio-devices/src/vhost_user/fs.rs @@ -7,24 +7,24 @@ use std::{result, thread}; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; -use serde_with::{serde_as, Bytes}; +use serde_with::{Bytes, serde_as}; use vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler}; use virtio_queue::Queue; +use vm_device::UserspaceMapping; use vm_memory::{ByteValued, GuestMemoryAtomic}; use vm_migration::protocol::MemoryRangeTable; use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; use vmm_sys_util::eventfd::EventFd; use super::vu_common_ctrl::VhostUserHandle; -use super::{Error, Result, DEFAULT_VIRTIO_FEATURES}; +use super::{DEFAULT_VIRTIO_FEATURES, Error, Result}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::VhostUserCommon; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, UserspaceMapping, VirtioCommon, - VirtioDevice, VirtioDeviceType, 
VirtioInterrupt, VirtioSharedMemoryList, - VIRTIO_F_IOMMU_PLATFORM, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VIRTIO_F_IOMMU_PLATFORM, + VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioSharedMemoryList, }; const NUM_QUEUE_OFFSET: usize = 1; @@ -148,9 +148,9 @@ impl Fs { if num_queues > backend_num_queues { error!( - "vhost-user-fs requested too many queues ({}) since the backend only supports {}\n", - num_queues, backend_num_queues - ); + "vhost-user-fs requested too many queues ({}) since the backend only supports {}\n", + num_queues, backend_num_queues + ); return Err(Error::BadQueueNum); } @@ -227,10 +227,10 @@ impl Drop for Fs { let _ = kill_evt.write(1); } self.common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() { - if let Err(e) = thread.join() { - error!("Error joining thread: {:?}", e); - } + if let Some(thread) = self.epoll_thread.take() + && let Err(e) = thread.join() + { + error!("Error joining thread: {:?}", e); } } } @@ -308,11 +308,11 @@ impl VirtioDevice for Fs { self.common.resume().ok()?; } - if let Some(vu) = &self.vu_common.vu { - if let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {:?}", e); - return None; - } + if let Some(vu) = &self.vu_common.vu + && let Err(e) = vu.lock().unwrap().reset_vhost_user() + { + error!("Failed to reset vhost-user daemon: {:?}", e); + return None; } if let Some(kill_evt) = self.common.kill_evt.take() { diff --git a/virtio-devices/src/vhost_user/mod.rs b/virtio-devices/src/vhost_user/mod.rs index 188942721f..14aa173b78 100644 --- a/virtio-devices/src/vhost_user/mod.rs +++ b/virtio-devices/src/vhost_user/mod.rs @@ -10,11 +10,11 @@ use std::sync::{Arc, Barrier, Mutex}; use anyhow::anyhow; use serde::{Deserialize, Serialize}; use thiserror::Error; +use vhost::Error as VhostError; use vhost::vhost_user::message::{ VhostUserInflight, VhostUserProtocolFeatures, VhostUserVirtioFeatures, }; use vhost::vhost_user::{FrontendReqHandler, VhostUserFrontendReqHandler}; -use vhost::Error as VhostError; use virtio_queue::{Error as QueueError, Queue}; use vm_memory::mmap::MmapRegionError; use vm_memory::{Address, Error as MmapError, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; @@ -24,10 +24,10 @@ use vmm_sys_util::eventfd::EventFd; use vu_common_ctrl::VhostUserHandle; use crate::{ - ActivateError, EpollHelper, EpollHelperError, EpollHelperHandler, GuestMemoryMmap, - GuestRegionMmap, VirtioInterrupt, EPOLL_HELPER_EVENT_LAST, VIRTIO_F_IN_ORDER, - VIRTIO_F_NOTIFICATION_DATA, VIRTIO_F_ORDER_PLATFORM, VIRTIO_F_RING_EVENT_IDX, - VIRTIO_F_RING_INDIRECT_DESC, VIRTIO_F_VERSION_1, + ActivateError, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, + GuestMemoryMmap, GuestRegionMmap, VIRTIO_F_IN_ORDER, VIRTIO_F_NOTIFICATION_DATA, + VIRTIO_F_ORDER_PLATFORM, VIRTIO_F_RING_EVENT_IDX, VIRTIO_F_RING_INDIRECT_DESC, + VIRTIO_F_VERSION_1, VirtioInterrupt, }; pub mod blk; diff --git a/virtio-devices/src/vhost_user/net.rs b/virtio-devices/src/vhost_user/net.rs index 930f557419..0f4561bca5 100644 --- a/virtio-devices/src/vhost_user/net.rs +++ b/virtio-devices/src/vhost_user/net.rs @@ -5,7 +5,7 @@ use std::sync::atomic::AtomicBool; use std::sync::{Arc, Barrier, Mutex}; use std::{result, thread}; -use net_util::{build_net_config_space, CtrlQueue, MacAddr, VirtioNetConfig}; +use net_util::{CtrlQueue, MacAddr, VirtioNetConfig, build_net_config_space}; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use 
vhost::vhost_user::message::{VhostUserProtocolFeatures, VhostUserVirtioFeatures}; @@ -28,9 +28,9 @@ use crate::thread_helper::spawn_virtio_thread; use crate::vhost_user::vu_common_ctrl::{VhostUserConfig, VhostUserHandle}; use crate::vhost_user::{Error, Result, VhostUserCommon}; use crate::{ - ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, VirtioCommon, - VirtioDevice, VirtioDeviceType, VirtioInterrupt, VIRTIO_F_IOMMU_PLATFORM, - VIRTIO_F_RING_EVENT_IDX, VIRTIO_F_VERSION_1, + ActivateResult, GuestMemoryMmap, GuestRegionMmap, NetCtrlEpollHandler, VIRTIO_F_IOMMU_PLATFORM, + VIRTIO_F_RING_EVENT_IDX, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, + VirtioInterrupt, }; const DEFAULT_QUEUE_NUMBER: usize = 2; @@ -168,8 +168,10 @@ impl Net { }; if num_queues > backend_num_queues { - error!("vhost-user-net requested too many queues ({}) since the backend only supports {}\n", - num_queues, backend_num_queues); + error!( + "vhost-user-net requested too many queues ({}) since the backend only supports {}\n", + num_queues, backend_num_queues + ); return Err(Error::BadQueueNum); } @@ -241,23 +243,24 @@ impl Net { impl Drop for Net { fn drop(&mut self) { - if let Some(kill_evt) = self.common.kill_evt.take() { - if let Err(e) = kill_evt.write(1) { - error!("failed to kill vhost-user-net: {:?}", e); - } + if let Some(kill_evt) = self.common.kill_evt.take() + && let Err(e) = kill_evt.write(1) + { + error!("failed to kill vhost-user-net: {:?}", e); } self.common.wait_for_epoll_threads(); - if let Some(thread) = self.epoll_thread.take() { - if let Err(e) = thread.join() { - error!("Error joining thread: {:?}", e); - } + if let Some(thread) = self.epoll_thread.take() + && let Err(e) = thread.join() + { + error!("Error joining thread: {:?}", e); } - if let Some(thread) = self.ctrl_queue_epoll_thread.take() { - if let Err(e) = thread.join() { - error!("Error joining thread: {:?}", e); - } + + if let Some(thread) = self.ctrl_queue_epoll_thread.take() + && let Err(e) = thread.join() + { + error!("Error joining thread: {:?}", e); } } } @@ -298,7 +301,7 @@ impl VirtioDevice for Net { let num_queues = queues.len(); let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); - if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && num_queues % 2 != 0 { + if self.common.feature_acked(VIRTIO_NET_F_CTRL_VQ.into()) && !num_queues.is_multiple_of(2) { let ctrl_queue_index = num_queues - 1; let (_, mut ctrl_queue, ctrl_queue_evt) = queues.remove(ctrl_queue_index); @@ -380,11 +383,11 @@ impl VirtioDevice for Net { self.common.resume().ok()?; } - if let Some(vu) = &self.vu_common.vu { - if let Err(e) = vu.lock().unwrap().reset_vhost_user() { - error!("Failed to reset vhost-user daemon: {:?}", e); - return None; - } + if let Some(vu) = &self.vu_common.vu + && let Err(e) = vu.lock().unwrap().reset_vhost_user() + { + error!("Failed to reset vhost-user daemon: {:?}", e); + return None; } if let Some(kill_evt) = self.common.kill_evt.take() { diff --git a/virtio-devices/src/vhost_user/vu_common_ctrl.rs b/virtio-devices/src/vhost_user/vu_common_ctrl.rs index 60ee6b32dc..2e55782934 100644 --- a/virtio-devices/src/vhost_user/vu_common_ctrl.rs +++ b/virtio-devices/src/vhost_user/vu_common_ctrl.rs @@ -5,8 +5,8 @@ use std::ffi; use std::fs::File; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::net::UnixListener; -use std::sync::atomic::Ordering; use std::sync::Arc; +use std::sync::atomic::Ordering; use std::thread::sleep; use std::time::{Duration, 
Instant}; @@ -18,7 +18,8 @@ use vhost::vhost_user::{ Frontend, FrontendReqHandler, VhostUserFrontend, VhostUserFrontendReqHandler, }; use vhost::{VhostBackend, VhostUserDirtyLogRegion, VhostUserMemoryRegionInfo, VringConfigData}; -use virtio_queue::{Descriptor, Queue, QueueT}; +use virtio_queue::desc::RawDescriptor; +use virtio_queue::{Queue, QueueT}; use vm_memory::{ Address, Error as MmapError, FileOffset, GuestAddress, GuestMemory, GuestMemoryRegion, }; @@ -28,8 +29,8 @@ use vmm_sys_util::eventfd::EventFd; use super::{Error, Result}; use crate::vhost_user::Inflight; use crate::{ - get_host_address_range, GuestMemoryMmap, GuestRegionMmap, MmapRegion, VirtioInterrupt, - VirtioInterruptType, + GuestMemoryMmap, GuestRegionMmap, MmapRegion, VirtioInterrupt, VirtioInterruptType, + get_host_address_range, }; // Size of a dirty page for vhost-user. @@ -212,7 +213,7 @@ impl VhostUserHandle { desc_table_addr: get_host_address_range( mem, GuestAddress(queue.desc_table()), - actual_size * std::mem::size_of::(), + actual_size * std::mem::size_of::(), ) .ok_or(Error::DescriptorTableAddress)? as u64, // The used ring is {flags: u16; idx: u16; virtq_used_elem [{id: u16, len: u16}; actual_size]}, @@ -316,17 +317,16 @@ impl VhostUserHandle { .get_features() .map_err(Error::VhostUserGetFeatures)?; - if acked_features & VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() != 0 { - if let Some(acked_protocol_features) = + if acked_features & VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() != 0 + && let Some(acked_protocol_features) = VhostUserProtocolFeatures::from_bits(acked_protocol_features) - { - self.vu - .set_protocol_features(acked_protocol_features) - .map_err(Error::VhostUserSetProtocolFeatures)?; + { + self.vu + .set_protocol_features(acked_protocol_features) + .map_err(Error::VhostUserSetProtocolFeatures)?; - if acked_protocol_features.contains(VhostUserProtocolFeatures::REPLY_ACK) { - self.vu.set_hdr_flags(VhostUserHeaderFlag::NEED_REPLY); - } + if acked_protocol_features.contains(VhostUserProtocolFeatures::REPLY_ACK) { + self.vu.set_hdr_flags(VhostUserHeaderFlag::NEED_REPLY); } } @@ -398,7 +398,7 @@ impl VhostUserHandle { acked_features: 0, vrings_info: None, queue_indexes: Vec::new(), - }) + }); } Err(e) => e, }; @@ -573,12 +573,16 @@ impl VhostUserHandle { // divide it by 8. let len = region.size() / 8; // SAFETY: region is of size len - let bitmap = unsafe { + let bitmap: &[u64] = unsafe { // Cast the pointer to u64 let ptr = region.as_ptr() as *const u64; - std::slice::from_raw_parts(ptr, len).to_vec() + std::slice::from_raw_parts(ptr, len) }; - Ok(MemoryRangeTable::from_bitmap(bitmap, 0, 4096)) + Ok(MemoryRangeTable::from_bitmap( + bitmap.iter().copied(), + 0, + 4096, + )) } else { Err(Error::MissingShmLogRegion) } diff --git a/virtio-devices/src/vsock/csm/connection.rs b/virtio-devices/src/vsock/csm/connection.rs index 50f3c30341..e36f58ccb7 100644 --- a/virtio-devices/src/vsock/csm/connection.rs +++ b/virtio-devices/src/vsock/csm/connection.rs @@ -89,7 +89,7 @@ use super::super::defs::uapi; use super::super::packet::VsockPacket; use super::super::{Result as VsockResult, VsockChannel, VsockEpollListener, VsockError}; use super::txbuf::TxBuf; -use super::{defs, ConnState, Error, PendingRx, PendingRxSet, Result}; +use super::{ConnState, Error, PendingRx, PendingRxSet, Result, defs}; /// A self-managing connection object, that handles communication between a guest-side AF_VSOCK /// socket and a host-side `Read + Write + AsRawFd` stream. 
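A minimal sketch (not part of the patch) of the let-chain rewrite applied across these Drop and reset paths; it assumes the workspace edition resolves to 2024, which is what makes `if cond && let Some(x) = ...` legal. The channel sender stands in for the real EventFd:

fn nested_style(kill_evt: Option<std::sync::mpsc::Sender<u32>>) {
    // Shape before the change: two nested `if let` levels.
    if let Some(evt) = kill_evt {
        if let Err(e) = evt.send(1) {
            eprintln!("failed to signal shutdown: {e:?}");
        }
    }
}

fn let_chain_style(kill_evt: Option<std::sync::mpsc::Sender<u32>>) {
    // Shape after the change: one flat condition, same semantics.
    if let Some(evt) = kill_evt
        && let Err(e) = evt.send(1)
    {
        eprintln!("failed to signal shutdown: {e:?}");
    }
}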
@@ -1158,10 +1158,11 @@ mod tests { // When there's data in the TX buffer, the connection should ask to be notified when it // can write to its backing stream. - assert!(ctx - .conn - .get_polled_evset() - .contains(epoll::Events::EPOLLOUT)); + assert!( + ctx.conn + .get_polled_evset() + .contains(epoll::Events::EPOLLOUT) + ); assert_eq!(ctx.conn.tx_buf.len(), data.len()); // Unlock the write stream and notify the connection it can now write its buffered @@ -1212,10 +1213,11 @@ mod tests { stream.write_state = StreamState::Closed; ctx.set_stream(stream); - assert!(ctx - .conn - .get_polled_evset() - .contains(epoll::Events::EPOLLOUT)); + assert!( + ctx.conn + .get_polled_evset() + .contains(epoll::Events::EPOLLOUT) + ); ctx.notify_epollout(); assert_eq!(ctx.conn.state, ConnState::Killed); } diff --git a/virtio-devices/src/vsock/csm/txbuf.rs b/virtio-devices/src/vsock/csm/txbuf.rs index 4c16913f34..1a8c5dd151 100644 --- a/virtio-devices/src/vsock/csm/txbuf.rs +++ b/virtio-devices/src/vsock/csm/txbuf.rs @@ -5,7 +5,7 @@ use std::io::Write; use std::num::Wrapping; -use super::{defs, Error, Result}; +use super::{Error, Result, defs}; /// A simple ring-buffer implementation, used by vsock connections to buffer TX (guest -> host) /// data. Memory for this buffer is allocated lazily, since buffering will only be needed when diff --git a/virtio-devices/src/vsock/device.rs b/virtio-devices/src/vsock/device.rs index f8c024833b..4d073c4923 100644 --- a/virtio-devices/src/vsock/device.rs +++ b/virtio-devices/src/vsock/device.rs @@ -47,10 +47,10 @@ use super::{VsockBackend, VsockPacket}; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; use crate::{ - ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, Error as DeviceError, - GuestMemoryMmap, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, - VirtioInterruptType, EPOLL_HELPER_EVENT_LAST, VIRTIO_F_IN_ORDER, VIRTIO_F_IOMMU_PLATFORM, - VIRTIO_F_VERSION_1, + ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, EpollHelperHandler, + Error as DeviceError, GuestMemoryMmap, VIRTIO_F_IN_ORDER, VIRTIO_F_IOMMU_PLATFORM, + VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, + VirtioInterruptType, }; const QUEUE_SIZE: u16 = 256; diff --git a/virtio-devices/src/vsock/mod.rs b/virtio-devices/src/vsock/mod.rs index ecd523d1c0..3cd9592a64 100644 --- a/virtio-devices/src/vsock/mod.rs +++ b/virtio-devices/src/vsock/mod.rs @@ -172,7 +172,7 @@ pub mod tests { use vm_virtio::queue::testing::VirtQueue as GuestQ; use vmm_sys_util::eventfd::EventFd; - use super::device::{VsockEpollHandler, RX_QUEUE_EVENT, TX_QUEUE_EVENT}; + use super::device::{RX_QUEUE_EVENT, TX_QUEUE_EVENT, VsockEpollHandler}; use super::packet::VSOCK_PKT_HDR_SIZE; use super::*; use crate::device::{VirtioInterrupt, VirtioInterruptType}; diff --git a/virtio-devices/src/vsock/packet.rs b/virtio-devices/src/vsock/packet.rs index 01ee18fbb1..a6749381df 100644 --- a/virtio-devices/src/vsock/packet.rs +++ b/virtio-devices/src/vsock/packet.rs @@ -24,7 +24,7 @@ use virtio_queue::DescriptorChain; use vm_memory::{Address, GuestMemory}; use vm_virtio::{AccessPlatform, Translatable}; -use super::{defs, Result, VsockError}; +use super::{Result, VsockError, defs}; use crate::get_host_address_range; // The vsock packet header is defined by the C struct: @@ -427,8 +427,8 @@ mod tests { use super::super::tests::TestContext; use super::*; - use crate::vsock::defs::MAX_PKT_BUF_SIZE; use crate::GuestMemoryMmap; + use 
crate::vsock::defs::MAX_PKT_BUF_SIZE; macro_rules! create_context { ($test_ctx:ident, $handler_ctx:ident) => { diff --git a/virtio-devices/src/vsock/unix/mod.rs b/virtio-devices/src/vsock/unix/mod.rs index 13c9883194..bb44698121 100644 --- a/virtio-devices/src/vsock/unix/mod.rs +++ b/virtio-devices/src/vsock/unix/mod.rs @@ -13,9 +13,9 @@ mod muxer; mod muxer_killq; mod muxer_rxq; +pub use Error as VsockUnixError; pub use muxer::VsockMuxer as VsockUnixBackend; use thiserror::Error; -pub use Error as VsockUnixError; mod defs { /// Maximum number of established connections that we can handle. diff --git a/virtio-devices/src/vsock/unix/muxer.rs b/virtio-devices/src/vsock/unix/muxer.rs index 55e819d4b7..842e02677d 100644 --- a/virtio-devices/src/vsock/unix/muxer.rs +++ b/virtio-devices/src/vsock/unix/muxer.rs @@ -52,7 +52,7 @@ use super::super::{ }; use super::muxer_killq::MuxerKillQ; use super::muxer_rxq::MuxerRxQ; -use super::{defs, Error, MuxerConnection, Result}; +use super::{Error, MuxerConnection, Result, defs}; /// A unique identifier of a `MuxerConnection` object. Connections are stored in a hash map, /// keyed by a `ConnMapKey` object. @@ -437,10 +437,10 @@ impl VsockMuxer { if let Some(EpollListener::LocalStream(stream)) = self.listener_map.get_mut(&fd) { let port = Self::read_local_stream_port(&mut self.partial_command_map, stream); - if let Err(Error::UnixRead(ref e)) = port { - if e.kind() == ErrorKind::WouldBlock { - return; - } + if let Err(Error::UnixRead(ref e)) = port + && e.kind() == ErrorKind::WouldBlock + { + return; } let stream = match self.remove_listener(fd) { @@ -493,15 +493,18 @@ impl VsockMuxer { const MIN_COMMAND_LEN: usize = 10; // Bring in the minimum number of bytes that we should be able to read. - stream - .read_exact(&mut command.buf[command.len..MIN_COMMAND_LEN]) - .map_err(Error::UnixRead)?; - command.len = MIN_COMMAND_LEN; + if command.len < MIN_COMMAND_LEN { + command.len += stream + .read(&mut command.buf[command.len..MIN_COMMAND_LEN]) + .map_err(Error::UnixRead)?; + } // Now, finish reading the destination port number, by bringing in one byte at a time, // until we reach an EOL terminator (or our buffer space runs out). Yeah, not // particularly proud of this approach, but it will have to do for now. - while command.buf[command.len - 1] != b'\n' && command.len < command.buf.len() { + while command.len.checked_sub(1).map(|n| command.buf[n]) != Some(b'\n') + && command.len < command.buf.len() + { command.len += stream .read(&mut command.buf[command.len..=command.len]) .map_err(Error::UnixRead)?; diff --git a/virtio-devices/src/vsock/unix/muxer_killq.rs b/virtio-devices/src/vsock/unix/muxer_killq.rs index 925f4d9383..b9cf47f4d1 100644 --- a/virtio-devices/src/vsock/unix/muxer_killq.rs +++ b/virtio-devices/src/vsock/unix/muxer_killq.rs @@ -29,7 +29,7 @@ use std::collections::{HashMap, VecDeque}; use std::time::Instant; use super::muxer::ConnMapKey; -use super::{defs, MuxerConnection}; +use super::{MuxerConnection, defs}; /// A kill queue item, holding the connection key and the scheduled time for termination. /// @@ -111,11 +111,12 @@ impl MuxerKillQ { /// the queue has expired. Otherwise, `None` is returned. 
/// pub fn pop(&mut self) -> Option { - if let Some(item) = self.q.front() { - if Instant::now() > item.kill_time { - return Some(self.q.pop_front().unwrap().key); - } + if let Some(item) = self.q.front() + && Instant::now() > item.kill_time + { + return Some(self.q.pop_front().unwrap().key); } + None } diff --git a/virtio-devices/src/vsock/unix/muxer_rxq.rs b/virtio-devices/src/vsock/unix/muxer_rxq.rs index 701db3459f..077cbc8899 100644 --- a/virtio-devices/src/vsock/unix/muxer_rxq.rs +++ b/virtio-devices/src/vsock/unix/muxer_rxq.rs @@ -20,7 +20,7 @@ use std::collections::{HashMap, VecDeque}; use super::super::VsockChannel; use super::muxer::{ConnMapKey, MuxerRx}; -use super::{defs, MuxerConnection}; +use super::{MuxerConnection, defs}; /// The muxer RX queue. /// diff --git a/virtio-devices/src/watchdog.rs b/virtio-devices/src/watchdog.rs index fdfc977d96..23a33a3e57 100644 --- a/virtio-devices/src/watchdog.rs +++ b/virtio-devices/src/watchdog.rs @@ -23,9 +23,9 @@ use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottabl use vmm_sys_util::eventfd::EventFd; use super::{ - ActivateError, ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, - Error as DeviceError, VirtioCommon, VirtioDevice, VirtioDeviceType, EPOLL_HELPER_EVENT_LAST, - VIRTIO_F_VERSION_1, + ActivateError, ActivateResult, EPOLL_HELPER_EVENT_LAST, EpollHelper, EpollHelperError, + EpollHelperHandler, Error as DeviceError, VIRTIO_F_VERSION_1, VirtioCommon, VirtioDevice, + VirtioDeviceType, }; use crate::seccomp_filters::Thread; use crate::thread_helper::spawn_virtio_thread; diff --git a/vm-allocator/Cargo.toml b/vm-allocator/Cargo.toml index 1cc0ae9e1f..a4996d6dc3 100644 --- a/vm-allocator/Cargo.toml +++ b/vm-allocator/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["The Chromium OS Authors"] -edition = "2021" +edition.workspace = true name = "vm-allocator" version = "0.1.0" @@ -9,8 +9,11 @@ default = [] kvm = ["arch/kvm"] [dependencies] -libc = "0.2.167" +libc = { workspace = true } vm-memory = { workspace = true } [target.'cfg(any(target_arch = "aarch64", target_arch = "riscv64"))'.dependencies] arch = { path = "../arch" } + +[lints] +workspace = true diff --git a/vm-allocator/src/address.rs b/vm-allocator/src/address.rs index a6e11dc63a..c847dd473a 100644 --- a/vm-allocator/src/address.rs +++ b/vm-allocator/src/address.rs @@ -68,7 +68,7 @@ impl AddressAllocator { } fn align_address(&self, address: GuestAddress, alignment: GuestUsize) -> GuestAddress { - let align_adjust = if address.raw_value() % alignment != 0 { + let align_adjust = if !address.raw_value().is_multiple_of(alignment) { alignment - (address.raw_value() % alignment) } else { 0 @@ -196,10 +196,10 @@ impl AddressAllocator { /// Free an already allocated address range. /// We can only free a range if it matches exactly an already allocated range. pub fn free(&mut self, address: GuestAddress, size: GuestUsize) { - if let Some(&range_size) = self.ranges.get(&address) { - if size == range_size { - self.ranges.remove(&address); - } + if let Some(&range_size) = self.ranges.get(&address) + && size == range_size + { + self.ranges.remove(&address); } } diff --git a/vm-allocator/src/page_size.rs b/vm-allocator/src/page_size.rs index 96ae01edf7..6dedb6847a 100644 --- a/vm-allocator/src/page_size.rs +++ b/vm-allocator/src/page_size.rs @@ -1,7 +1,7 @@ // Copyright 2023 Arm Limited (or its affiliates). All rights reserved. 
// SPDX-License-Identifier: Apache-2.0 -use libc::{sysconf, _SC_PAGESIZE}; +use libc::{_SC_PAGESIZE, sysconf}; /// get host page size pub fn get_page_size() -> u64 { diff --git a/vm-device/Cargo.toml b/vm-device/Cargo.toml index cc24dc4764..a57ea57f5b 100644 --- a/vm-device/Cargo.toml +++ b/vm-device/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "vm-device" version = "0.1.0" @@ -10,10 +10,12 @@ kvm = ["vfio-ioctls/kvm"] mshv = ["vfio-ioctls/mshv"] [dependencies] -anyhow = "1.0.94" hypervisor = { path = "../hypervisor" } -serde = { version = "1.0.208", features = ["derive", "rc"] } +serde = { workspace = true, features = ["derive", "rc"] } thiserror = { workspace = true } vfio-ioctls = { workspace = true, default-features = false } vm-memory = { workspace = true, features = ["backend-mmap"] } vmm-sys-util = { workspace = true } + +[lints] +workspace = true diff --git a/vm-device/src/lib.rs b/vm-device/src/lib.rs index c10731ea95..f484e9f14e 100644 --- a/vm-device/src/lib.rs +++ b/vm-device/src/lib.rs @@ -4,6 +4,7 @@ // use serde::{Deserialize, Serialize}; +use vm_memory::{GuestAddress, GuestUsize}; mod bus; pub mod dma_mapping; @@ -58,3 +59,12 @@ pub enum Resource { /// KVM memslot index. KvmMemSlot(u32), } + +#[derive(Clone)] +pub struct UserspaceMapping { + pub host_addr: u64, + pub mem_slot: u32, + pub addr: GuestAddress, + pub len: GuestUsize, + pub mergeable: bool, +} diff --git a/vm-migration/Cargo.toml b/vm-migration/Cargo.toml index 5a992c6070..2053afc472 100644 --- a/vm-migration/Cargo.toml +++ b/vm-migration/Cargo.toml @@ -1,12 +1,17 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "vm-migration" version = "0.1.0" [dependencies] -anyhow = "1.0.94" -serde = { version = "1.0.208", features = ["derive", "rc"] } +anyhow = { workspace = true } +itertools = { workspace = true } +rustls = { workspace = true } +serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } thiserror = { workspace = true } vm-memory = { workspace = true, features = ["backend-atomic", "backend-mmap"] } + +[lints] +workspace = true diff --git a/vm-migration/src/bitpos_iterator.rs b/vm-migration/src/bitpos_iterator.rs new file mode 100644 index 0000000000..8d70c7ff6b --- /dev/null +++ b/vm-migration/src/bitpos_iterator.rs @@ -0,0 +1,88 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +/// An iterator that turns a sequence of u64s into a sequence of bit positions +/// that are set. +/// +/// This is useful to iterate over dirty memory bitmaps. +struct BitposIterator { + underlying_it: I, + + /// How many u64's we've already consumed. + word_pos: usize, + + /// If we already started working on a u64, it's here. Together with the bit + /// position where we have to continue. + current_word: Option<(u64, u32)>, +} + +impl Iterator for BitposIterator +where + I: Iterator, +{ + type Item = u64; + + fn next(&mut self) -> Option { + loop { + if self.current_word.is_none() { + self.current_word = self.underlying_it.next().map(|w| (w, 0)); + } + + let (word, word_bit) = self.current_word?; + + // Continue early if there is no chance to find something. 
+ if word != 0 && word_bit < 64 { + let shifted_word = word >> word_bit; + if shifted_word != 0 { + let zeroes = shifted_word.trailing_zeros(); + + self.current_word = Some((word, zeroes + word_bit + 1)); + let next_bitpos = + u64::try_from(self.word_pos).unwrap() * 64 + u64::from(word_bit + zeroes); + + return Some(next_bitpos); + } + } + + self.current_word = None; + self.word_pos += 1; + } + } +} + +pub trait BitposIteratorExt: Iterator + Sized { + /// Turn an iterator over `u64` into an iterator over the bit positions of + /// all 1s. We basically treat the incoming `u64` as one gigantic integer + /// and just spit out which bits are set. + fn bit_positions(self) -> impl Iterator { + BitposIterator { + underlying_it: self, + word_pos: 0, + current_word: None, + } + } +} + +impl + Sized> BitposIteratorExt for I {} + +#[cfg(test)] +mod tests { + use super::*; + + fn bitpos_check(inp: &[u64], out: &[u64]) { + assert_eq!(inp.iter().copied().bit_positions().collect::>(), out); + } + + #[test] + fn bitpos_iterator_works() { + bitpos_check(&[], &[]); + bitpos_check(&[0], &[]); + bitpos_check(&[1], &[0]); + bitpos_check(&[5], &[0, 2]); + bitpos_check(&[3 + 32], &[0, 1, 5]); + bitpos_check(&[1 << 63], &[63]); + + bitpos_check(&[1, 1 + 32], &[0, 64, 69]); + } +} diff --git a/vm-migration/src/lib.rs b/vm-migration/src/lib.rs index 00f322636a..daaa5d0d53 100644 --- a/vm-migration/src/lib.rs +++ b/vm-migration/src/lib.rs @@ -9,7 +9,9 @@ use thiserror::Error; use crate::protocol::MemoryRangeTable; +mod bitpos_iterator; pub mod protocol; +pub mod tls; #[derive(Error, Debug)] pub enum MigratableError { @@ -49,8 +51,11 @@ pub enum MigratableError { #[error("Failed to complete migration for migratable component")] CompleteMigration(#[source] anyhow::Error), - #[error("Failed to release a disk lock before the migration")] + #[error("Failed to release a disk lock")] UnlockError(#[source] anyhow::Error), + + #[error("TLS error")] + Tls(#[from] tls::TlsError), } /// A Pausable component can be paused and resumed. diff --git a/vm-migration/src/protocol.rs b/vm-migration/src/protocol.rs index 274baf0397..f9cb3e4188 100644 --- a/vm-migration/src/protocol.rs +++ b/vm-migration/src/protocol.rs @@ -5,10 +5,12 @@ use std::io::{Read, Write}; +use itertools::Itertools; use serde::{Deserialize, Serialize}; use vm_memory::ByteValued; use crate::MigratableError; +use crate::bitpos_iterator::BitposIteratorExt; // Migration protocol // 1: Source establishes communication with destination (file socket or TCP connection.) 
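Reviewer note (not part of the patch): the new `BitposIterator` above and `MemoryRange::dirty_ranges` in the following hunks together turn a dirty bitmap into coalesced guest-physical ranges. A compact, eager sketch of those two steps, using illustrative names and a 4 KiB page size; the real code does the same thing lazily with iterator adapters (`bit_positions()` plus `coalesce`):

```rust
/// Step 1: positions of all set bits, treating the slice as one big integer.
fn bit_positions(words: &[u64]) -> Vec<u64> {
    let mut out = Vec::new();
    for (i, &word) in words.iter().enumerate() {
        let mut w = word;
        while w != 0 {
            let bit = w.trailing_zeros() as u64;
            out.push(i as u64 * 64 + bit);
            w &= w - 1; // clear the lowest set bit
        }
    }
    out
}

/// Step 2: merge runs of adjacent dirty pages into (gpa, length) ranges.
fn dirty_ranges(words: &[u64], start_addr: u64, page_size: u64) -> Vec<(u64, u64)> {
    let mut ranges: Vec<(u64, u64)> = Vec::new();
    for pos in bit_positions(words) {
        let gpa = start_addr + pos * page_size;
        match ranges.last_mut() {
            // Extend the previous range if this page is adjacent to it.
            Some((last_gpa, last_len)) if *last_gpa + *last_len == gpa => *last_len += page_size,
            _ => ranges.push((gpa, page_size)),
        }
    }
    ranges
}

fn main() {
    // Pages 0, 1 and 5 dirty (0b100011), plus page 64 in the next word.
    let words = [0b10_0011u64, 1];
    assert_eq!(bit_positions(&words), vec![0, 1, 5, 64]);
    assert_eq!(
        dirty_ranges(&words, 0, 4096),
        vec![(0, 8192), (5 * 4096, 4096), (64 * 4096, 4096)]
    );
}
```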
@@ -50,8 +52,9 @@ use crate::MigratableError; // The source can at any time send an "abandon request" to cancel #[repr(u16)] -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Default, PartialEq, Eq)] pub enum Command { + #[default] Invalid, Start, Config, @@ -62,12 +65,6 @@ pub enum Command { MemoryFd, } -impl Default for Command { - fn default() -> Self { - Self::Invalid - } -} - #[repr(C)] #[derive(Default, Copy, Clone)] pub struct Request { @@ -139,19 +136,14 @@ impl Request { } #[repr(u16)] -#[derive(Copy, Clone, PartialEq, Eq)] +#[derive(Copy, Clone, PartialEq, Eq, Default)] pub enum Status { + #[default] Invalid, Ok, Error, } -impl Default for Status { - fn default() -> Self { - Self::Invalid - } -} - #[repr(C)] #[derive(Default, Copy, Clone)] pub struct Response { @@ -184,6 +176,10 @@ impl Response { self.status } + pub fn length(&self) -> u64 { + self.length + } + pub fn read_from(fd: &mut dyn Read) -> Result { let mut response = Response::default(); fd.read_exact(Self::as_mut_slice(&mut response)) @@ -215,44 +211,130 @@ impl Response { } #[repr(C)] -#[derive(Clone, Default, Serialize, Deserialize)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] pub struct MemoryRange { pub gpa: u64, pub length: u64, } -#[derive(Clone, Default, Serialize, Deserialize)] +impl MemoryRange { + /// Turn an iterator over the dirty bitmap into an iterator of dirty ranges. + pub fn dirty_ranges( + bitmap: impl IntoIterator, + start_addr: u64, + page_size: u64, + ) -> impl Iterator { + bitmap + .into_iter() + .bit_positions() + // Turn them into single-element ranges for coalesce. + .map(|b| b..(b + 1)) + // Merge adjacent ranges. + .coalesce(|prev, curr| { + if prev.end == curr.start { + Ok(prev.start..curr.end) + } else { + Err((prev, curr)) + } + }) + .map(move |r| Self { + gpa: start_addr + r.start * page_size, + length: (r.end - r.start) * page_size, + }) + } +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct MemoryRangeTable { data: Vec, } -impl MemoryRangeTable { - pub fn from_bitmap(bitmap: Vec, start_addr: u64, page_size: u64) -> Self { - let mut table = MemoryRangeTable::default(); - let mut entry: Option = None; - for (i, block) in bitmap.iter().enumerate() { - for j in 0..64 { - let is_page_dirty = ((block >> j) & 1u64) != 0u64; - let page_offset = ((i * 64) + j) as u64 * page_size; - if is_page_dirty { - if let Some(entry) = &mut entry { - entry.length += page_size; - } else { - entry = Some(MemoryRange { - gpa: start_addr + page_offset, - length: page_size, - }); +#[derive(Debug, Clone, Default)] +struct MemoryRangeTableIterator { + chunk_size: u64, + data: Vec, +} + +impl MemoryRangeTableIterator { + pub fn new(table: &MemoryRangeTable, chunk_size: u64) -> Self { + MemoryRangeTableIterator { + chunk_size, + data: table.data.clone(), + } + } +} + +impl Iterator for MemoryRangeTableIterator { + type Item = MemoryRangeTable; + + /// Return the next memory range in the table, making sure that + /// the returned range is not larger than `chunk_size`. + /// + /// **Note**: Do not rely on the order of the ranges returned by this + /// iterator. This allows for a more efficient implementation. 
+ fn next(&mut self) -> Option { + let mut ranges: Vec = vec![]; + let mut ranges_size: u64 = 0; + + loop { + assert!(ranges_size <= self.chunk_size); + + if ranges_size == self.chunk_size || self.data.is_empty() { + break; + } + + if let Some(range) = self.data.pop() { + let next_range: MemoryRange = if ranges_size + range.length > self.chunk_size { + // How many bytes we need to put back into the table. + let leftover_bytes = ranges_size + range.length - self.chunk_size; + assert!(leftover_bytes <= range.length); + let returned_bytes = range.length - leftover_bytes; + assert!(returned_bytes <= range.length); + assert!(leftover_bytes + returned_bytes == range.length); + + self.data.push(MemoryRange { + gpa: range.gpa + returned_bytes, + length: leftover_bytes, + }); + MemoryRange { + gpa: range.gpa, + length: returned_bytes, } - } else if let Some(entry) = entry.take() { - table.push(entry); - } + } else { + range + }; + + ranges_size += next_range.length; + ranges.push(next_range); } } - if let Some(entry) = entry.take() { - table.push(entry); + + if ranges.is_empty() { + None + } else { + Some(MemoryRangeTable { data: ranges }) } + } +} + +impl MemoryRangeTable { + pub fn ranges(&self) -> &[MemoryRange] { + &self.data + } + + /// Partitions the table into chunks of at most `chunk_size` bytes. + pub fn partition(&self, chunk_size: u64) -> impl Iterator { + MemoryRangeTableIterator::new(self, chunk_size) + } - table + pub fn from_bitmap( + bitmap: impl IntoIterator, + start_addr: u64, + page_size: u64, + ) -> Self { + Self { + data: MemoryRange::dirty_ranges(bitmap, start_addr, page_size).collect(), + } } pub fn regions(&self) -> &[MemoryRange] { @@ -264,7 +346,7 @@ impl MemoryRangeTable { } pub fn read_from(fd: &mut dyn Read, length: u64) -> Result { - assert!(length as usize % std::mem::size_of::() == 0); + assert!((length as usize).is_multiple_of(size_of::())); let mut data: Vec = Vec::new(); data.resize_with( @@ -311,3 +393,62 @@ impl MemoryRangeTable { Self { data } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_range_table() { + let mut table = MemoryRangeTable::default(); + // Test blocks that are shorter than the chunk size. + table.push(MemoryRange { + gpa: 0, + length: 1 << 10, + }); + // Test blocks that are longer than the chunk size. + table.push(MemoryRange { + gpa: 0x1000, + length: 3 << 20, + }); + // And add another blocks, so we get a chunk that spans two memory + // ranges. + table.push(MemoryRange { + gpa: 4 << 20, + length: 1 << 20, + }); + + let table = table; // drop mut + + let chunks = table + .partition(2 << 20) + .map(|table| table.data) + .collect::>(); + + // The implementation currently returns the ranges in reverse order. If + // this tests becomes more complex, we can compare everything as sets. 
+ assert_eq!( + chunks, + vec![ + vec![ + MemoryRange { + gpa: 4 << 20, + length: 1 << 20 + }, + MemoryRange { + gpa: 0x1000, + length: 1 << 20 + } + ], + vec![MemoryRange { + gpa: 0x1000 + (1 << 20), + length: 2 << 20 + },], + vec![MemoryRange { + gpa: 0, + length: 1 << 10 + }] + ] + ); + } +} diff --git a/vm-migration/src/tls.rs b/vm-migration/src/tls.rs new file mode 100644 index 0000000000..a44a76ebc8 --- /dev/null +++ b/vm-migration/src/tls.rs @@ -0,0 +1,261 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 +// +use std::io::{self, Read, Write}; +use std::net::TcpStream; +use std::os::fd::{AsFd, BorrowedFd}; +use std::path::Path; +use std::sync::Arc; + +use rustls::pki_types::pem::PemObject; +use rustls::pki_types::{CertificateDer, InvalidDnsNameError, PrivateKeyDer, ServerName}; +use rustls::{ + ClientConfig, ClientConnection, RootCertStore, ServerConfig, ServerConnection, StreamOwned, +}; +use thiserror::Error; +use vm_memory::bitmap::BitmapSlice; +use vm_memory::io::{ReadVolatile, WriteVolatile}; +use vm_memory::{VolatileMemoryError, VolatileSlice}; + +use crate::MigratableError; + +#[derive(Error, Debug)] +pub enum TlsError { + #[error( + "The provided input could not be parsed because it is not a syntactically-valid DNS Name." + )] + InvalidDnsName(#[source] InvalidDnsNameError), + + #[error("Rustls protocol error")] + RustlsError(#[from] rustls::Error), + + #[error("Rustls protocol IO error")] + RustlsIoError(#[from] std::io::Error), + + #[error("Error during TLS handshake: {0}")] + HandshakeError(String), +} + +// This TlsStream will be later encapsulated in a SocketStream. Thus it has to +// implement the same traits. It is important that we never directly read from +// or write to the TcpStream encapsulated in StreamOwned. +#[derive(Debug)] +pub enum TlsStream { + Client(StreamOwned), + Server(StreamOwned), +} + +// The TLS-Stream objects cannot read or write volatile, thus we need a buffer +// between the VolatileSlice and the TLS stream (see ReadVolatile and +// WriteVolatile implementations below). Allocating this buffer in these +// function calls would make it very slow, thus we tie the buffer to the stream +// with this wrapper. +pub struct TlsStreamWrapper { + stream: TlsStream, + // Used only in ReadVolatile and WriteVolatile + buf: Vec, +} + +static MAX_CHUNK: usize = 1024 * 64; + +impl TlsStreamWrapper { + pub fn new(stream: TlsStream) -> Self { + Self { + stream, + buf: Vec::new(), + } + } +} + +impl Read for TlsStream { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + match self { + TlsStream::Client(s) => s.read(buf), + TlsStream::Server(s) => s.read(buf), + } + } +} + +impl Read for TlsStreamWrapper { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + Read::read(&mut self.stream, buf) + } +} + +impl Write for TlsStream { + fn write(&mut self, buf: &[u8]) -> io::Result { + match self { + TlsStream::Client(s) => s.write(buf), + TlsStream::Server(s) => s.write(buf), + } + } + fn flush(&mut self) -> io::Result<()> { + match self { + TlsStream::Client(s) => s.flush(), + TlsStream::Server(s) => s.flush(), + } + } +} + +impl Write for TlsStreamWrapper { + fn write(&mut self, buf: &[u8]) -> io::Result { + Write::write(&mut self.stream, buf) + } + fn flush(&mut self) -> io::Result<()> { + Write::flush(&mut self.stream) + } +} + +// Reading from or writing to these FDs would break the connection, because +// those reads or writes wouldn't go through rustls. But the FD is used to wait +// until it becomes readable. 
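Reviewer note (not part of the patch): as the comments in `tls.rs` above explain, the rustls stream types cannot read into or write from a `VolatileSlice` directly, so `TlsStreamWrapper` keeps a reusable bounce buffer and moves data in bounded chunks. A small, std-only sketch of that pattern, assuming a plain byte slice as the destination; the function and buffer names are illustrative, not the crate's API:

```rust
use std::io::{self, Read};

/// Copy up to `dst.len()` bytes from `src` into `dst` through a reusable
/// bounce buffer, capped at `max_chunk` bytes per call.
fn read_via_bounce_buffer(
    src: &mut impl Read,
    bounce: &mut Vec<u8>,
    dst: &mut [u8],
    max_chunk: usize,
) -> io::Result<usize> {
    let len = dst.len().min(max_chunk);
    if len == 0 {
        return Ok(0);
    }
    if bounce.len() < len {
        bounce.resize(len, 0);
    }
    let n = src.read(&mut bounce[..len])?;
    dst[..n].copy_from_slice(&bounce[..n]);
    Ok(n)
}

fn main() -> io::Result<()> {
    let mut src = &b"hello world"[..];
    let mut bounce = Vec::new();
    let mut dst = [0u8; 8];
    let n = read_via_bounce_buffer(&mut src, &mut bounce, &mut dst, 64 * 1024)?;
    assert_eq!(&dst[..n], b"hello wo");
    Ok(())
}
```

Tying the buffer to the wrapper (rather than allocating per call) is what keeps the per-chunk copies cheap during migration transfers.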
+impl AsFd for TlsStreamWrapper { + fn as_fd(&self) -> BorrowedFd<'_> { + match &self.stream { + TlsStream::Client(s) => s.get_ref().as_fd(), + TlsStream::Server(s) => s.get_ref().as_fd(), + } + } +} + +impl ReadVolatile for TlsStreamWrapper { + fn read_volatile( + &mut self, + vs: &mut VolatileSlice, + ) -> std::result::Result { + let len = vs.len().min(MAX_CHUNK); + + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = + Read::read(&mut self.stream, &mut buf[..len]).map_err(VolatileMemoryError::IOError)?; + + if n == 0 { + return Ok(0); + } + + vs.copy_from(&buf[..n]); + self.buf.clear(); + + Ok(n) + } +} + +impl WriteVolatile for TlsStreamWrapper { + fn write_volatile( + &mut self, + vs: &VolatileSlice, + ) -> std::result::Result { + let len = vs.len().min(MAX_CHUNK); + if len == 0 { + return Ok(0); + } + + if self.buf.len() < len { + self.buf.resize(len, 0); + } + + let buf = &mut self.buf[..len]; + let n = vs.copy_to(&mut buf[..len]); + + if n == 0 { + return Ok(0); + } + + let n = Write::write(&mut self.stream, &buf[..n]).map_err(VolatileMemoryError::IOError)?; + self.buf.clear(); + + Ok(n) + } +} + +// A small wrapper to be put into ReceiveListener::Tls. It carries the +// TLS-Config and creates a TlsStream after the TcpConnection accepted a +// connection. +#[derive(Debug, Clone)] +pub struct TlsConnectionWrapper { + config: Arc, +} + +impl TlsConnectionWrapper { + pub fn new(cert_dir: &Path) -> Self { + let certs = CertificateDer::pem_file_iter(cert_dir.join("server-cert.pem")) + .unwrap() + .map(|cert| cert.unwrap()) + .collect(); + let key = PrivateKeyDer::from_pem_file(cert_dir.join("server-key.pem")).unwrap(); + let config = ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key) + .map_err(TlsError::RustlsError) + .unwrap(); + let config = Arc::new(config); + Self { config } + } + + pub fn wrap( + &self, + socket: TcpStream, + ) -> std::result::Result { + let conn = ServerConnection::new(self.config.clone()).map_err(TlsError::RustlsError)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError)?; + if rd == 0 && wr == 0 { + Err(TlsError::HandshakeError( + "EOF during TLS handshake".to_string(), + ))?; + } + } + + Ok(TlsStreamWrapper::new(TlsStream::Server(tls))) + } +} + +pub fn client_stream( + socket: TcpStream, + cert_dir: &Path, + hostname: &str, +) -> std::result::Result, MigratableError> { + let mut root_store = RootCertStore::empty(); + root_store.add_parsable_certificates( + CertificateDer::pem_file_iter(cert_dir.join("ca-cert.pem")) + .expect("Cannot open CA file") + .map(|result| result.unwrap()), + ); + let config = ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let config = Arc::new(config); + let server_name = + ServerName::try_from(hostname.to_string()).map_err(TlsError::InvalidDnsName)?; + let conn = ClientConnection::new(config.clone(), server_name.clone()) + .map_err(TlsError::RustlsError)?; + + let mut tls = StreamOwned::new(conn, socket); + while tls.conn.is_handshaking() { + let (rd, wr) = tls + .conn + .complete_io(&mut tls.sock) + .map_err(TlsError::RustlsIoError)?; + if rd == 0 && wr == 0 { + Err(TlsError::HandshakeError( + "EOF during TLS handshake".to_string(), + ))?; + } + } + + Ok(tls) +} diff --git a/vm-virtio/Cargo.toml b/vm-virtio/Cargo.toml index 
7a5492430e..228f552416 100644 --- a/vm-virtio/Cargo.toml +++ b/vm-virtio/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "vm-virtio" version = "0.1.0" @@ -8,10 +8,12 @@ version = "0.1.0" default = [] [dependencies] -log = "0.4.22" virtio-queue = { workspace = true } vm-memory = { workspace = true, features = [ "backend-atomic", "backend-bitmap", "backend-mmap", ] } + +[lints] +workspace = true diff --git a/vm-virtio/src/queue.rs b/vm-virtio/src/queue.rs index 4e55cc4b5d..c33f6e5996 100644 --- a/vm-virtio/src/queue.rs +++ b/vm-virtio/src/queue.rs @@ -12,7 +12,8 @@ pub mod testing { use std::marker::PhantomData; use std::mem; - use virtio_queue::{Queue, QueueT, VirtqUsedElem}; + use virtio_queue::desc::split::VirtqUsedElem; + use virtio_queue::{Queue, QueueT}; use vm_memory::bitmap::AtomicBitmap; use vm_memory::{Address, Bytes, GuestAddress, GuestUsize}; diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml index 342260c0da..0608aef7fb 100644 --- a/vmm/Cargo.toml +++ b/vmm/Cargo.toml @@ -1,6 +1,6 @@ [package] authors = ["The Cloud Hypervisor Authors"] -edition = "2021" +edition.workspace = true name = "vmm" version = "0.1.0" @@ -8,17 +8,26 @@ version = "0.1.0" dbus_api = ["blocking", "futures", "zbus"] default = [] dhat-heap = ["dhat"] # For heap profiling +fw_cfg = ["devices/fw_cfg"] guest_debug = ["gdbstub", "gdbstub_arch", "kvm"] igvm = ["dep:igvm", "hex", "igvm_defs", "mshv-bindings", "range_map_vec"] io_uring = ["block/io_uring"] +ivshmem = ["devices/ivshmem"] kvm = [ "arch/kvm", "hypervisor/kvm", "pci/kvm", "vfio-ioctls/kvm", + "virtio-devices/kvm", "vm-device/kvm", ] -mshv = ["hypervisor/mshv", "pci/mshv", "vfio-ioctls/mshv", "vm-device/mshv"] +mshv = [ + "hypervisor/mshv", + "pci/mshv", + "vfio-ioctls/mshv", + "virtio-devices/mshv", + "vm-device/mshv", +] pvmemcontrol = ["devices/pvmemcontrol"] sev_snp = ["arch/sev_snp", "hypervisor/sev_snp", "virtio-devices/sev_snp"] tdx = ["arch/tdx", "hypervisor/tdx"] @@ -26,30 +35,32 @@ tracing = ["tracer/tracing"] [dependencies] acpi_tables = { workspace = true } -anyhow = "1.0.94" -arc-swap = "1.7.1" +anyhow = { workspace = true } arch = { path = "../arch" } -bitflags = "2.9.0" +bitflags = { workspace = true } block = { path = "../block" } blocking = { version = "1.6.1", optional = true } -cfg-if = "1.0.0" -clap = "4.5.13" +cfg-if = { workspace = true } +clap = { workspace = true } devices = { path = "../devices" } -dhat = { version = "0.3.3", optional = true } -epoll = "4.3.3" +dhat = { workspace = true, optional = true } +epoll = { workspace = true } event_monitor = { path = "../event_monitor" } -flume = "0.11.1" +flume = { workspace = true } futures = { version = "0.3.31", optional = true } -gdbstub = { version = "0.7.1", optional = true } -gdbstub_arch = { version = "0.3.0", optional = true } +gdbstub = { version = "0.7.6", optional = true } +gdbstub_arch = { version = "0.3.2", optional = true } hex = { version = "0.4.3", optional = true } hypervisor = { path = "../hypervisor" } igvm = { workspace = true, optional = true } igvm_defs = { workspace = true, optional = true } -landlock = "0.4.0" -libc = "0.2.167" +kvm-bindings = { workspace = true } +landlock = "0.4.2" +libc = { workspace = true } linux-loader = { workspace = true, features = ["bzimage", "elf", "pe"] } -log = "0.4.22" +log = { workspace = true } +# Special fork of micro_http that combines HTTP traffic over a UNIX domain +# socket with UNIX' SCM_RIGHTS mechanism for transferring file descriptors. 
micro_http = { git = "https://github.com/firecracker-microvm/micro-http", branch = "main" } mshv-bindings = { workspace = true, features = [ "fam-wrappers", @@ -61,18 +72,18 @@ pci = { path = "../pci" } range_map_vec = { version = "0.2.0", optional = true } rate_limiter = { path = "../rate_limiter" } seccompiler = { workspace = true } -serde = { version = "1.0.208", features = ["derive", "rc"] } +serde = { workspace = true, features = ["derive", "rc"] } serde_json = { workspace = true } serial_buffer = { path = "../serial_buffer" } -signal-hook = "0.3.18" +signal-hook = { workspace = true } thiserror = { workspace = true } tracer = { path = "../tracer" } uuid = { workspace = true } vfio-ioctls = { workspace = true, default-features = false } vfio_user = { workspace = true } +vhost = { workspace = true } virtio-bindings = { workspace = true } virtio-devices = { path = "../virtio-devices" } -virtio-queue = { workspace = true } vm-allocator = { path = "../vm-allocator" } vm-device = { path = "../vm-device" } vm-memory = { workspace = true, features = [ @@ -85,3 +96,6 @@ vm-virtio = { path = "../vm-virtio" } vmm-sys-util = { workspace = true, features = ["with-serde"] } zbus = { version = "5.7.1", optional = true } zerocopy = { workspace = true, features = ["alloc", "derive"] } + +[lints] +workspace = true diff --git a/vmm/src/acpi.rs b/vmm/src/acpi.rs index d45f6a7196..de2f581294 100644 --- a/vmm/src/acpi.rs +++ b/vmm/src/acpi.rs @@ -5,16 +5,16 @@ use std::sync::{Arc, Mutex}; use std::time::Instant; +use acpi_tables::Aml; use acpi_tables::rsdp::Rsdp; #[cfg(target_arch = "aarch64")] use acpi_tables::sdt::GenericAddress; use acpi_tables::sdt::Sdt; -use acpi_tables::Aml; -#[cfg(target_arch = "aarch64")] -use arch::aarch64::DeviceInfoForFdt; #[cfg(target_arch = "aarch64")] use arch::DeviceType; use arch::NumaNodes; +#[cfg(target_arch = "aarch64")] +use arch::aarch64::DeviceInfoForFdt; use bitflags::bitflags; use pci::PciBdf; use tracer::trace_scoped; @@ -192,6 +192,8 @@ pub fn create_dsdt_table( dsdt } +const FACP_DSDT_OFFSET: usize = 140; + fn create_facp_table(dsdt_offset: GuestAddress, device_manager: &Arc>) -> Sdt { trace_scoped!("create_facp_table"); @@ -241,7 +243,7 @@ fn create_facp_table(dsdt_offset: GuestAddress, device_manager: &Arc Sdt { fn create_srat_table( numa_nodes: &NumaNodes, - #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, + #[cfg(target_arch = "x86_64")] topology: Option<(u16, u16, u16, u16)>, ) -> Sdt { let mut srat = Sdt::new(*b"SRAT", 36, 3, *b"CLOUDH", *b"CHSRAT ", 1); // SRAT reserved 12 bytes @@ -312,21 +314,11 @@ fn create_srat_table( )) } - #[cfg(target_arch = "x86_64")] - for section in &node.sgx_epc_sections { - srat.append(MemoryAffinity::from_range( - section.start().raw_value(), - section.size(), - proximity_domain, - MemAffinityFlags::ENABLE, - )) - } - for cpu in &node.cpus { #[cfg(target_arch = "x86_64")] - let x2apic_id = arch::x86_64::get_x2apic_id(*cpu as u32, topology); + let x2apic_id = arch::x86_64::get_x2apic_id(*cpu, topology); #[cfg(target_arch = "aarch64")] - let x2apic_id = *cpu as u32; + let x2apic_id = *cpu; // Flags // - Enabled = 1 (bit 0) @@ -626,80 +618,73 @@ fn create_viot_table(iommu_bdf: &PciBdf, devices_bdf: &[PciBdf]) -> Sdt { viot } -pub fn create_acpi_tables( - guest_mem: &GuestMemoryMmap, +// Generate ACPI tables based on the given DSDT address +// +// # Returns +// +// * `Rsdp` is the generated RSDP. +// * `Vec` contains the generated bytes for ACPI tables. +// * `Vec` contains a list of table pointers stored in XSDT. 
+fn create_acpi_tables_internal( + dsdt_addr: GuestAddress, device_manager: &Arc>, cpu_manager: &Arc>, memory_manager: &Arc>, numa_nodes: &NumaNodes, tpm_enabled: bool, -) -> GuestAddress { - trace_scoped!("create_acpi_tables"); - - let start_time = Instant::now(); - let rsdp_offset = arch::layout::RSDP_POINTER; - let mut tables: Vec = Vec::new(); +) -> (Rsdp, Vec, Vec) { + // Generated bytes for ACPI tables + let mut tables_bytes: Vec = Vec::new(); + // List of table pointers stored in XSDT + let mut xsdt_table_pointers: Vec = Vec::new(); // DSDT let dsdt = create_dsdt_table(device_manager, cpu_manager, memory_manager); - let dsdt_offset = rsdp_offset.checked_add(Rsdp::len() as u64).unwrap(); - guest_mem - .write_slice(dsdt.as_slice(), dsdt_offset) - .expect("Error writing DSDT table"); + tables_bytes.extend_from_slice(dsdt.as_slice()); // FACP aka FADT - let facp = create_facp_table(dsdt_offset, device_manager); - let facp_offset = dsdt_offset.checked_add(dsdt.len() as u64).unwrap(); - guest_mem - .write_slice(facp.as_slice(), facp_offset) - .expect("Error writing FACP table"); - tables.push(facp_offset.0); + let facp = create_facp_table(dsdt_addr, device_manager); + let facp_addr = dsdt_addr.checked_add(dsdt.len() as u64).unwrap(); + tables_bytes.extend_from_slice(facp.as_slice()); + xsdt_table_pointers.push(facp_addr.0); // MADT let madt = cpu_manager.lock().unwrap().create_madt(); - let madt_offset = facp_offset.checked_add(facp.len() as u64).unwrap(); - guest_mem - .write_slice(madt.as_slice(), madt_offset) - .expect("Error writing MADT table"); - tables.push(madt_offset.0); + let madt_addr = facp_addr.checked_add(facp.len() as u64).unwrap(); + tables_bytes.extend_from_slice(madt.as_slice()); + xsdt_table_pointers.push(madt_addr.0); let mut prev_tbl_len = madt.len() as u64; - let mut prev_tbl_off = madt_offset; + let mut prev_tbl_addr = madt_addr; // PPTT #[cfg(target_arch = "aarch64")] { let pptt = cpu_manager.lock().unwrap().create_pptt(); - let pptt_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(pptt.as_slice(), pptt_offset) - .expect("Error writing PPTT table"); - tables.push(pptt_offset.0); + let pptt_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(pptt.as_slice()); + xsdt_table_pointers.push(pptt_addr.0); prev_tbl_len = pptt.len() as u64; - prev_tbl_off = pptt_offset; + prev_tbl_addr = pptt_addr; } // GTDT #[cfg(target_arch = "aarch64")] { let gtdt = create_gtdt_table(); - let gtdt_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(gtdt.as_slice(), gtdt_offset) - .expect("Error writing GTDT table"); - tables.push(gtdt_offset.0); + let gtdt_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(gtdt.as_slice()); + xsdt_table_pointers.push(gtdt_addr.0); prev_tbl_len = gtdt.len() as u64; - prev_tbl_off = gtdt_offset; + prev_tbl_addr = gtdt_addr; } // MCFG let mcfg = create_mcfg_table(device_manager.lock().unwrap().pci_segments()); - let mcfg_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(mcfg.as_slice(), mcfg_offset) - .expect("Error writing MCFG table"); - tables.push(mcfg_offset.0); + let mcfg_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(mcfg.as_slice()); + xsdt_table_pointers.push(mcfg_addr.0); prev_tbl_len = mcfg.len() as u64; - prev_tbl_off = mcfg_offset; + prev_tbl_addr = mcfg_addr; // SPCR and DBG2 #[cfg(target_arch = "aarch64")] @@ -727,36 
+712,30 @@ pub fn create_acpi_tables( // SPCR let spcr = create_spcr_table(serial_device_addr, serial_device_irq); - let spcr_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(spcr.as_slice(), spcr_offset) - .expect("Error writing SPCR table"); - tables.push(spcr_offset.0); + let spcr_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(spcr.as_slice()); + xsdt_table_pointers.push(spcr_addr.0); prev_tbl_len = spcr.len() as u64; - prev_tbl_off = spcr_offset; + prev_tbl_addr = spcr_addr; // DBG2 let dbg2 = create_dbg2_table(serial_device_addr); - let dbg2_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(dbg2.as_slice(), dbg2_offset) - .expect("Error writing DBG2 table"); - tables.push(dbg2_offset.0); + let dbg2_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(dbg2.as_slice()); + xsdt_table_pointers.push(dbg2_addr.0); prev_tbl_len = dbg2.len() as u64; - prev_tbl_off = dbg2_offset; + prev_tbl_addr = dbg2_addr; } if tpm_enabled { // TPM2 Table let tpm2 = create_tpm2_table(); - let tpm2_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(tpm2.as_slice(), tpm2_offset) - .expect("Error writing TPM2 table"); - tables.push(tpm2_offset.0); + let tpm2_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(tpm2.as_slice()); + xsdt_table_pointers.push(tpm2_addr.0); prev_tbl_len = tpm2.len() as u64; - prev_tbl_off = tpm2_offset; + prev_tbl_addr = tpm2_addr; } // SRAT and SLIT // Only created if the NUMA nodes list is not empty. @@ -769,34 +748,28 @@ pub fn create_acpi_tables( #[cfg(target_arch = "x86_64")] topology, ); - let srat_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(srat.as_slice(), srat_offset) - .expect("Error writing SRAT table"); - tables.push(srat_offset.0); + let srat_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(srat.as_slice()); + xsdt_table_pointers.push(srat_addr.0); // SLIT let slit = create_slit_table(numa_nodes); - let slit_offset = srat_offset.checked_add(srat.len() as u64).unwrap(); - guest_mem - .write_slice(slit.as_slice(), slit_offset) - .expect("Error writing SRAT table"); - tables.push(slit_offset.0); + let slit_addr = srat_addr.checked_add(srat.len() as u64).unwrap(); + tables_bytes.extend_from_slice(slit.as_slice()); + xsdt_table_pointers.push(slit_addr.0); prev_tbl_len = slit.len() as u64; - prev_tbl_off = slit_offset; + prev_tbl_addr = slit_addr; }; #[cfg(target_arch = "aarch64")] { let iort = create_iort_table(device_manager.lock().unwrap().pci_segments()); - let iort_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(iort.as_slice(), iort_offset) - .expect("Error writing IORT table"); - tables.push(iort_offset.0); + let iort_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(iort.as_slice()); + xsdt_table_pointers.push(iort_addr.0); prev_tbl_len = iort.len() as u64; - prev_tbl_off = iort_offset; + prev_tbl_addr = iort_addr; } // VIOT @@ -804,38 +777,125 @@ pub fn create_acpi_tables( { let viot = create_viot_table(iommu_bdf, devices_bdf); - let viot_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(viot.as_slice(), viot_offset) - .expect("Error writing VIOT table"); - tables.push(viot_offset.0); + let viot_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + 
tables_bytes.extend_from_slice(viot.as_slice()); + xsdt_table_pointers.push(viot_addr.0); prev_tbl_len = viot.len() as u64; - prev_tbl_off = viot_offset; + prev_tbl_addr = viot_addr; } // XSDT let mut xsdt = Sdt::new(*b"XSDT", 36, 1, *b"CLOUDH", *b"CHXSDT ", 1); - for table in tables { - xsdt.append(table); + for table_pointer in &xsdt_table_pointers { + xsdt.append(*table_pointer); } xsdt.update_checksum(); - let xsdt_offset = prev_tbl_off.checked_add(prev_tbl_len).unwrap(); - guest_mem - .write_slice(xsdt.as_slice(), xsdt_offset) - .expect("Error writing XSDT table"); + let xsdt_addr = prev_tbl_addr.checked_add(prev_tbl_len).unwrap(); + tables_bytes.extend_from_slice(xsdt.as_slice()); // RSDP - let rsdp = Rsdp::new(*b"CLOUDH", xsdt_offset.0); + let rsdp = Rsdp::new(*b"CLOUDH", xsdt_addr.0); + + (rsdp, tables_bytes, xsdt_table_pointers) +} + +#[cfg(feature = "fw_cfg")] +pub fn create_acpi_tables_for_fw_cfg( + device_manager: &Arc>, + cpu_manager: &Arc>, + memory_manager: &Arc>, + numa_nodes: &NumaNodes, + tpm_enabled: bool, +) -> Result<(), crate::vm::Error> { + let dsdt_offset = GuestAddress(0); + let (rsdp, table_bytes, xsdt_table_pointers) = create_acpi_tables_internal( + dsdt_offset, + device_manager, + cpu_manager, + memory_manager, + numa_nodes, + tpm_enabled, + ); + let mut pointer_offsets: Vec = vec![]; + let mut checksums: Vec<(usize, usize)> = vec![]; + + let xsdt_addr = rsdp.xsdt_addr.get() as usize; + let xsdt_checksum = (xsdt_addr, table_bytes.len() - xsdt_addr); + + // create pointer offsets (use location of pointers in XSDT table) + // XSDT doesn't have a pointer to DSDT so we use FACP's pointer to DSDT + let facp_offset = xsdt_table_pointers[0] as usize; + pointer_offsets.push(facp_offset + FACP_DSDT_OFFSET); + let mut current_offset = xsdt_addr + 36; + for _ in 0..xsdt_table_pointers.len() { + pointer_offsets.push(current_offset); + current_offset += 8; + } + + // create (offset, len) pairs for firmware to calculate + // table checksums and verify ACPI tables + let mut i = 0; + while i < xsdt_table_pointers.len() - 1 { + let current_table_offset = xsdt_table_pointers[i]; + let current_table_length = xsdt_table_pointers[i + 1] - current_table_offset; + checksums.push((current_table_offset as usize, current_table_length as usize)); + i += 1; + } + checksums.push(( + xsdt_table_pointers[xsdt_table_pointers.len() - 1] as usize, + 0, + )); + checksums.push(xsdt_checksum); + + device_manager + .lock() + .unwrap() + .fw_cfg() + .expect("fw_cfg must be present") + .lock() + .unwrap() + .add_acpi(rsdp, table_bytes, checksums, pointer_offsets) + .map_err(crate::vm::Error::CreatingAcpiTables) +} + +pub fn create_acpi_tables( + guest_mem: &GuestMemoryMmap, + device_manager: &Arc>, + cpu_manager: &Arc>, + memory_manager: &Arc>, + numa_nodes: &NumaNodes, + tpm_enabled: bool, +) -> GuestAddress { + trace_scoped!("create_acpi_tables"); + + let start_time = Instant::now(); + let rsdp_addr = arch::layout::RSDP_POINTER; + let dsdt_addr = rsdp_addr.checked_add(Rsdp::len() as u64).unwrap(); + + let (rsdp, tables_bytes, _xsdt_table_pointers) = create_acpi_tables_internal( + dsdt_addr, + device_manager, + cpu_manager, + memory_manager, + numa_nodes, + tpm_enabled, + ); + guest_mem - .write_slice(rsdp.as_bytes(), rsdp_offset) + .write_slice(rsdp.as_bytes(), rsdp_addr) .expect("Error writing RSDP"); + guest_mem + .write_slice(tables_bytes.as_slice(), dsdt_addr) + .expect("Error writing ACPI tables"); + info!( "Generated ACPI tables: took {}µs size = {}", 
Instant::now().duration_since(start_time).as_micros(), - xsdt_offset.0 + xsdt.len() as u64 - rsdp_offset.0 + Rsdp::len() + tables_bytes.len(), ); - rsdp_offset + + rsdp_addr } #[cfg(feature = "tdx")] diff --git a/vmm/src/api/dbus/mod.rs b/vmm/src/api/dbus/mod.rs index 85bd9d54fa..f329b86bcd 100644 --- a/vmm/src/api/dbus/mod.rs +++ b/vmm/src/api/dbus/mod.rs @@ -3,14 +3,14 @@ // SPDX-License-Identifier: Apache-2.0 // use std::panic::AssertUnwindSafe; -use std::sync::mpsc::Sender; use std::sync::Arc; +use std::sync::mpsc::Sender; use std::thread; use futures::channel::oneshot; -use futures::{executor, FutureExt}; +use futures::{FutureExt, executor}; use hypervisor::HypervisorType; -use seccompiler::{apply_filter, SeccompAction}; +use seccompiler::{SeccompAction, apply_filter}; use vmm_sys_util::eventfd::EventFd; use zbus::connection::Builder; use zbus::fdo::{self, Result}; @@ -26,7 +26,7 @@ use crate::api::{ VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, VmmPing, VmmShutdown, }; -use crate::seccomp_filters::{get_seccomp_filter, Thread}; +use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::{Error as VmmError, NetConfig, Result as VmmResult, VmConfig}; pub type DBusApiShutdownChannels = (oneshot::Sender<()>, oneshot::Receiver<()>); diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index ff1de56296..35ef0ba946 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -4,23 +4,60 @@ // SPDX-License-Identifier: Apache-2.0 // +//! # HTTP Endpoints of the Cloud Hypervisor API +//! +//! ## Special Handling for virtio-net Devices Backed by Network File Descriptors (FDs) +//! +//! Some of the HTTP handlers here implement special logic for virtio-net +//! devices **backed by network FDs** to enable live-migration, state save/ +//! resume (restore), and similar VM lifecycle events. +//! +//! The utilized mechanism requires that the control software (e.g., libvirt) +//! connects to Cloud Hypervisor by using a UNIX domain socket and that it +//! passes file descriptors (FDs) via _ancillary_ messages - specifically using +//! the `SCM_RIGHTS` mechanism described in [`cmsg(3)`]. These ancillary +//! messages must accompany the primary payload (HTTP JSON REST API in this +//! case). The Linux kernel handles these messages by `dup()`ing the referenced +//! FDs from the sender process into the receiving process, thereby ensuring +//! they are valid and usable in the target context. +//! +//! Once these valid file descriptors are received here, we integrate the actual +//! FDs into the VM's configuration, allowing the virtio-net device to +//! function correctly with its backing network resources. +//! +//! We can receive these FDs as we use a **special** HTTP library that is aware +//! of the just described mechanism. +//! +//! [`cmsg(3)`]: https://man7.org/linux/man-pages/man3/cmsg.3.html + use std::fs::File; use std::os::unix::io::IntoRawFd; -use std::sync::mpsc::Sender; +use std::sync::mpsc::{Receiver, Sender, SyncSender}; +use std::sync::{LazyLock, Mutex}; use micro_http::{Body, Method, Request, Response, StatusCode, Version}; use vmm_sys_util::eventfd::EventFd; -use crate::api::http::{error_response, EndpointHandler, HttpError}; +/// Helper to make the VmSendMigration call blocking as long as a migration is ongoing. 
+#[allow(clippy::type_complexity)] +pub static ONGOING_LIVEMIGRATION: LazyLock<( + SyncSender>, + Mutex>>, +)> = LazyLock::new(|| { + let (sender, receiver) = std::sync::mpsc::sync_channel(0); + (sender, Mutex::new(receiver)) +}); + #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::api::VmCoredump; +use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmConfig, VmCounters, VmDelete, VmNmi, VmPause, - VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, VmRestore, - VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, + VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; -use crate::config::RestoreConfig; +use crate::config::{RestoreConfig, RestoredNetConfig}; use crate::cpu::Error as CpuError; use crate::vm::Error as VmError; @@ -47,11 +84,27 @@ impl EndpointHandler for VmCreate { }; if let Some(ref mut nets) = vm_config.net { - if nets.iter().any(|net| net.fds.is_some()) { - warn!("Ignoring FDs sent via the HTTP request body"); - } - for net in nets { - net.fds = None; + let mut cfgs = nets.iter_mut().collect::>(); + let cfgs = cfgs.as_mut_slice(); + + // For the VmCreate call, we do not accept FDs from the socket currently. + // This call sets all FDs to null while doing the same logging as + // similar code paths. + let res = apply_new_fds_to_cfg::( + vec![], + cfgs, + &|cfg| cfg.id.as_deref(), + &|_| 0, + &|cfg| cfg.fds.as_deref(), + &|cfg, value| { + assert!(value.is_none()); + cfg.fds = None + }, + ) + .map_err(|e| error_response(e, StatusCode::InternalServerError)); + + if let Err(e) = res { + return e; } } @@ -185,32 +238,112 @@ vm_action_put_handler_body!(VmAddVdpa); vm_action_put_handler_body!(VmAddVsock); vm_action_put_handler_body!(VmAddUserDevice); vm_action_put_handler_body!(VmRemoveDevice); +vm_action_put_handler_body!(VmResizeDisk); vm_action_put_handler_body!(VmResizeZone); vm_action_put_handler_body!(VmSnapshot); -vm_action_put_handler_body!(VmReceiveMigration); -vm_action_put_handler_body!(VmSendMigration); #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] vm_action_put_handler_body!(VmCoredump); +/// Applies FDs to the network config of a given device, as part of the special +/// handling for virtio-net devices backed by network FDs. +/// +/// See [module description] for more info. +/// +/// [module description]: self +fn apply_new_fds_to_cfg( + // List of new files (well, actually FDs) that back up a virtio-net device. + files: Vec, + // List of network configurations where each network can have `n` FDs. + network_cfgs: &mut [&mut T], + // Callback to return the ID. + network_cfg_extract_id: &impl Fn(&T) -> Option<&str>, + // Callback to extract the amount of expected FDs. + network_cfg_extract_num_fds_fn: &impl Fn(&T) -> usize, + // Callback to extract the FDs that are part of the type (transmitted via + // the HTTP body) + network_cfg_extract_fds_fn: &impl Fn(&T) -> Option<&[i32]>, + // Callback to set any FDs in the type to the new value. The new value + // is either `Some` with a non-empty Vector or `None`. 
+ network_cfg_replace_fds: &impl Fn(&mut T, Option>), +) -> Result<(), HttpError> { + let expected_fds: usize = network_cfgs + .iter() + .map(|cfg| network_cfg_extract_num_fds_fn(cfg)) + .sum(); + + let mut fds = files + .into_iter() + .map(|f| f.into_raw_fd()) + .collect::>(); + + if fds.len() != expected_fds { + error!( + "Number of FDs expected: {}, but received: {}", + expected_fds, + fds.len() + ); + return Err(HttpError::BadRequest); + } + + for network_cfg in network_cfgs { + let has_fds_from_http_body = network_cfg_extract_fds_fn(network_cfg).is_some(); + if has_fds_from_http_body { + // Only FDs transmitted via an SCM_RIGHTS UNIX Domain Socket message + // are valid. Any provided over the HTTP API are set to `-1` in our + // specialized serializer callbacks. + warn!( + "FD numbers were present in HTTP request body for virtio-net device {:?} but will be ignored", + network_cfg_extract_id(network_cfg) + ); + + // Reset old value in any case; if there are FDs, they are invalid. + network_cfg_replace_fds(*network_cfg, None); + } + + let n = network_cfg_extract_num_fds_fn(network_cfg); + if n > 0 { + let new_fds = fds.drain(..n).collect::>(); + log::debug!( + "Applying network FDs received via UNIX domain socket to virtio-net device: id={:?}, fds={new_fds:?}", + network_cfg_extract_id(network_cfg) + ); + network_cfg_replace_fds(*network_cfg, Some(new_fds)); + } + } + + // We checked that `fds.len() != expected_fds`; so if we panic here, we have a hard + // programming bug + assert!(fds.is_empty()); + + Ok(()) +} + impl PutHandler for VmAddNet { fn handle_request( &'static self, api_notifier: EventFd, api_sender: Sender, body: &Option, - mut files: Vec, + files: Vec, ) -> std::result::Result, HttpError> { if let Some(body) = body { let mut net_cfg: NetConfig = serde_json::from_slice(body.raw())?; - if net_cfg.fds.is_some() { - warn!("Ignoring FDs sent via the HTTP request body"); - net_cfg.fds = None; - } - if !files.is_empty() { - let fds = files.drain(..).map(|f| f.into_raw_fd()).collect(); - net_cfg.fds = Some(fds); - } + + let mut net_cfgs = [&mut net_cfg]; + let num_fds = files.len(); + apply_new_fds_to_cfg::( + files, + &mut net_cfgs, + &|cfg| cfg.id.as_deref(), + // We only have one single network here, so it wants all available FDs. + &|_| num_fds, + &|cfg| cfg.fds.as_deref(), + &|cfg, value| { + cfg.fds = value; + }, + )?; + self.send(api_notifier, api_sender, net_cfg) .map_err(HttpError::ApiError) } else { @@ -221,6 +354,84 @@ impl PutHandler for VmAddNet { impl GetHandler for VmAddNet {} +// Special Handling for virtio-net Devices Backed by Network File Descriptors +// +// See above. +impl PutHandler for VmReceiveMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let mut net_cfg: VmReceiveMigrationData = serde_json::from_slice(body.raw())?; + if let Some(cfgs) = &mut net_cfg.net_fds { + let mut cfgs = cfgs.iter_mut().collect::>(); + let cfgs = cfgs.as_mut_slice(); + apply_new_fds_to_cfg::( + files, + cfgs, + &|cfg| Some(&cfg.id), + &|cfg| cfg.num_fds, + &|cfg| cfg.fds.as_deref(), + &|cfg, value| { + cfg.fds = value; + }, + )?; + } + + self.send(api_notifier, api_sender, net_cfg) + .map_err(HttpError::ApiError) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmReceiveMigration {} + +// Special Handling for virtio-net Devices Backed by Network File Descriptors +// +// See above. 
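Reviewer note (not part of the patch): `apply_new_fds_to_cfg` above centralizes how FDs received via `SCM_RIGHTS` are matched against the per-device FD counts and how any FD numbers from the JSON body are discarded. A hypothetical, heavily simplified model of that distribution rule; `NetFds` and `distribute_fds` are stand-ins, not real Cloud Hypervisor types:

```rust
/// Simplified stand-in for NetConfig/RestoredNetConfig: only the fields that
/// matter for FD distribution.
#[derive(Debug)]
struct NetFds {
    id: String,
    num_fds: usize,
    fds: Option<Vec<i32>>,
}

/// Hand out the FDs received over the UNIX domain socket, in order, each
/// device taking exactly the number it declared. FD values named in the HTTP
/// body are meaningless in this process and are dropped first.
fn distribute_fds(mut fds: Vec<i32>, cfgs: &mut [NetFds]) -> Result<(), String> {
    let expected: usize = cfgs.iter().map(|c| c.num_fds).sum();
    if fds.len() != expected {
        return Err(format!("expected {expected} FDs, received {}", fds.len()));
    }
    for cfg in cfgs.iter_mut() {
        cfg.fds = None; // any body-provided FD numbers are invalid here
        if cfg.num_fds > 0 {
            cfg.fds = Some(fds.drain(..cfg.num_fds).collect());
        }
    }
    debug_assert!(fds.is_empty());
    Ok(())
}

fn main() {
    let mut cfgs = vec![
        NetFds { id: "net0".into(), num_fds: 2, fds: Some(vec![-1, -1]) },
        NetFds { id: "net1".into(), num_fds: 1, fds: None },
    ];
    distribute_fds(vec![10, 11, 12], &mut cfgs).unwrap();
    for cfg in &cfgs {
        println!("{} -> {:?}", cfg.id, cfg.fds);
    }
    assert_eq!(cfgs[0].fds, Some(vec![10, 11]));
    assert_eq!(cfgs[1].fds, Some(vec![12]));
}
```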
+impl PutHandler for VmSendMigration { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> std::result::Result, HttpError> { + if let Some(body) = body { + let res = self + .send( + api_notifier, + api_sender, + serde_json::from_slice(body.raw())?, + ) + .map_err(HttpError::ApiError)?; + + info!("live migration started"); + + let (_, receiver) = &*ONGOING_LIVEMIGRATION; + + info!("waiting for live migration result"); + let mig_res = receiver.lock().unwrap().recv().unwrap(); + info!("received live migration result"); + + // We forward the migration error here to the guest + mig_res + .map(|_| res) + .map_err(|e| HttpError::ApiError(ApiError::VmSendMigration(e))) + } else { + Err(HttpError::BadRequest) + } + } +} + +impl GetHandler for VmSendMigration {} + impl PutHandler for VmResize { fn handle_request( &'static self, @@ -249,41 +460,32 @@ impl PutHandler for VmResize { impl GetHandler for VmResize {} +// Special handling for virtio-net devices backed by network FDs. +// See module description for more info. impl PutHandler for VmRestore { fn handle_request( &'static self, api_notifier: EventFd, api_sender: Sender, body: &Option, - mut files: Vec, + files: Vec, ) -> std::result::Result, HttpError> { if let Some(body) = body { let mut restore_cfg: RestoreConfig = serde_json::from_slice(body.raw())?; - let mut fds = Vec::new(); - if !files.is_empty() { - fds = files.drain(..).map(|f| f.into_raw_fd()).collect(); - } - let expected_fds = match restore_cfg.net_fds { - Some(ref net_fds) => net_fds.iter().map(|net| net.num_fds).sum(), - None => 0, - }; - if fds.len() != expected_fds { - error!( - "Number of FDs expected: {}, but received: {}", - expected_fds, - fds.len() - ); - return Err(HttpError::BadRequest); - } - if let Some(ref mut nets) = restore_cfg.net_fds { - warn!("Ignoring FDs sent via the HTTP request body"); - let mut start_idx = 0; - for restored_net in nets.iter_mut() { - let end_idx = start_idx + restored_net.num_fds; - restored_net.fds = Some(fds[start_idx..end_idx].to_vec()); - start_idx = end_idx; - } + if let Some(cfgs) = restore_cfg.net_fds.as_mut() { + let mut cfgs = cfgs.iter_mut().collect::>(); + let cfgs = cfgs.as_mut_slice(); + apply_new_fds_to_cfg::( + files, + cfgs, + &|cfg| Some(&cfg.id), + &|cfg| cfg.num_fds, + &|cfg| cfg.fds.as_deref(), + &|cfg, value| { + cfg.fds = value; + }, + )?; } self.send(api_notifier, api_sender, restore_cfg) diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 418a4d4961..4dfbf7b9b0 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -10,15 +10,15 @@ use std::os::unix::io::{IntoRawFd, RawFd}; use std::os::unix::net::UnixListener; use std::panic::AssertUnwindSafe; use std::path::PathBuf; -use std::sync::mpsc::Sender; use std::sync::LazyLock; +use std::sync::mpsc::Sender; use std::thread; use hypervisor::HypervisorType; use micro_http::{ Body, HttpServer, MediaType, Method, Request, Response, ServerError, StatusCode, Version, }; -use seccompiler::{apply_filter, SeccompAction}; +use seccompiler::{SeccompAction, apply_filter}; use serde_json::Error as SerdeError; use thiserror::Error; use vmm_sys_util::eventfd::EventFd; @@ -29,11 +29,11 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCounters, VmDelete, VmNmi, VmPause, VmPowerButton, VmReboot, - VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeZone, VmRestore, 
VmResume, + VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; -use crate::seccomp_filters::{get_seccomp_filter, Thread}; +use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::{Error as VmmError, Result}; pub mod http_endpoint; @@ -249,6 +249,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.resize"), Box::new(VmActionHandler::new(&VmResize)), ); + r.routes.insert( + endpoint!("/vm.resize-disk"), + Box::new(VmActionHandler::new(&VmResizeDisk)), + ); r.routes.insert( endpoint!("/vm.resize-zone"), Box::new(VmActionHandler::new(&VmResizeZone)), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 95c4019b48..73e352a11d 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -34,7 +34,9 @@ pub mod dbus; pub mod http; use std::io; -use std::sync::mpsc::{channel, RecvError, SendError, Sender}; +use std::num::NonZeroU32; +use std::path::PathBuf; +use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use micro_http::Body; use serde::{Deserialize, Serialize}; @@ -45,14 +47,14 @@ use vmm_sys_util::eventfd::EventFd; #[cfg(feature = "dbus_api")] pub use self::dbus::start_dbus_thread; pub use self::http::{start_http_fd_thread, start_http_path_thread}; -use crate::config::RestoreConfig; +use crate::Error as VmmError; +use crate::config::{RestoreConfig, RestoredNetConfig}; use crate::device_tree::DeviceTree; use crate::vm::{Error as VmError, VmState}; use crate::vm_config::{ DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; -use crate::Error as VmmError; /// API errors are sent back from the VMM API server through the ApiResponse. #[derive(Error, Debug)] @@ -117,8 +119,8 @@ pub enum ApiError { #[error("The VM could not be snapshotted")] VmSnapshot(#[source] VmError), - /// The VM could not restored. - #[error("The VM could not restored")] + /// The VM could not be restored. + #[error("The VM could not be restored")] VmRestore(#[source] VmError), /// The VM could not be coredumped. @@ -133,6 +135,10 @@ pub enum ApiError { #[error("The VM could not be resized")] VmResize(#[source] VmError), + /// The disk could not be resized. + #[error("The disk could not be resized")] + VmResizeDisk(#[source] VmError), + /// The memory zone could not be resized. #[error("The memory zone could not be resized")] VmResizeZone(#[source] VmError), @@ -217,11 +223,17 @@ pub struct VmmPingResponse { #[derive(Clone, Deserialize, Serialize, Default, Debug)] pub struct VmResizeData { - pub desired_vcpus: Option, + pub desired_vcpus: Option, pub desired_ram: Option, pub desired_balloon: Option, } +#[derive(Clone, Deserialize, Serialize, Default, Debug)] +pub struct VmResizeDiskData { + pub id: String, + pub desired_size: u64, +} + #[derive(Clone, Deserialize, Serialize, Default, Debug)] pub struct VmResizeZoneData { pub id: String, @@ -245,19 +257,53 @@ pub struct VmCoredumpData { pub destination_url: String, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Clone, Deserialize, Serialize, Debug)] pub struct VmReceiveMigrationData { /// URL for the reception of migration state pub receiver_url: String, + /// Optional URL if the TCP serial configuration must be changed during + /// migration. Example: "192.168.1.1:2222". + pub tcp_serial_url: Option, + /// Map with new network FDs on the new host. 
+ pub net_fds: Option>, + /// Directory containing the TLS server certificate (server-cert.pem) and TLS server key (server-key.pem). + #[serde(default)] + pub tls_dir: Option, } -#[derive(Clone, Deserialize, Serialize, Default, Debug)] +#[derive(Clone, Deserialize, Serialize, Debug)] pub struct VmSendMigrationData { - /// URL to migrate the VM to + /// URL to migrate the VM to. + /// + /// This is not actually a URL, but we are stuck with the name, because it's + /// part of the HTTP API. The destination is a string, such as + /// tcp:: or unix:/path/to/socket. pub destination_url: String, /// Send memory across socket without copying #[serde(default)] pub local: bool, + /// Microsecond level downtime + #[serde(default = "default_downtime")] + pub downtime: u64, + /// Second level migration timeout + #[serde(default)] + pub migration_timeout: u64, + /// The number of parallel connections for migration + #[serde(default = "default_connections")] + pub connections: NonZeroU32, + /// Directory containing the TLS root CA certificate (ca-cert.pem) + #[serde(default)] + pub tls_dir: Option, +} + +// Default value for downtime the same as qemu. +fn default_downtime() -> u64 { + 300 +} + +// We use a single connection for backward compatibility as default. +fn default_connections() -> NonZeroU32 { + NonZeroU32::new(1).unwrap() } pub enum ApiResponsePayload { @@ -307,13 +353,15 @@ pub trait RequestHandler { fn vm_resize( &mut self, - desired_vcpus: Option, + desired_vcpus: Option, desired_ram: Option, desired_balloon: Option, ) -> Result<(), VmError>; fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> Result<(), VmError>; + fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> Result<(), VmError>; + fn vm_add_device(&mut self, device_cfg: DeviceConfig) -> Result>, VmError>; fn vm_add_user_device( @@ -1135,6 +1183,44 @@ impl ApiAction for VmResize { } } +pub struct VmResizeDisk; + +impl ApiAction for VmResizeDisk { + type RequestBody = VmResizeDiskData; + type ResponseBody = Option; + + fn request( + &self, + resize_disk_data: Self::RequestBody, + response_sender: Sender, + ) -> ApiRequest { + Box::new(move |vmm| { + info!("API request event: VmResizeDisk {:?}", resize_disk_data); + println!("xxxxxx"); + + let response = vmm + .vm_resize_disk(resize_disk_data.id, resize_disk_data.desired_size) + .map_err(ApiError::VmResizeDisk) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmResizeZone; impl ApiAction for VmResizeZone { diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 80a4fa2572..80d5812bd6 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -607,10 +607,6 @@ components: $ref: "#/components/schemas/VdpaConfig" vsock: $ref: "#/components/schemas/VsockConfig" - sgx_epc: - type: array - items: - $ref: "#/components/schemas/SgxEpcConfig" numa: type: array items: @@ -1143,21 +1139,6 @@ components: id: type: string - SgxEpcConfig: - required: - - id - - size - type: object - properties: - id: - type: string - size: - type: integer - format: int64 - prefault: - type: boolean - default: false - NumaDistance: required: - destination @@ -1192,10 +1173,6 @@ components: type: array items: type: string - 
sgx_epc_sections: - type: array - items: - type: string pci_segments: type: array items: @@ -1268,10 +1245,24 @@ components: - destination_url type: object properties: + connections: + type: integer + format: int64 + default: 1 destination_url: type: string local: type: boolean + downtime: + type: integer + format: int64 + description: Maximum downtime in milliseconds during migration + default: 500 + migration_timeout: + type: integer + format: int64 + description: Total timeout for migration in milliseconds (0 = no limit) + default: 0 VmAddUserDevice: required: diff --git a/vmm/src/clone3.rs b/vmm/src/clone3.rs index f08e5ad31c..eca0d90632 100644 --- a/vmm/src/clone3.rs +++ b/vmm/src/clone3.rs @@ -1,7 +1,7 @@ // Copyright 2021 Alyssa Ross // SPDX-License-Identifier: Apache-2.0 -use libc::{c_long, size_t, syscall, SYS_clone3}; +use libc::{SYS_clone3, c_long, size_t, syscall}; pub const CLONE_CLEAR_SIGHAND: u64 = 0x100000000; @@ -22,6 +22,17 @@ pub struct clone_args { pub cgroup: u64, } +/// # Safety +/// `size` must have the proper size to match `args`. +/// Further, the caller needs to check the return value. +/// +/// # Return +/// - On success: +/// - Parent: child PID (`c_long`) +/// - Child: `0` +/// - On error: `-1` and `errno` is set +#[must_use] pub unsafe fn clone3(args: &mut clone_args, size: size_t) -> c_long { - syscall(SYS_clone3, args, size) + // SAFETY: parameters are assumed to be valid + unsafe { syscall(SYS_clone3, args, size) } } diff --git a/vmm/src/config.rs b/vmm/src/config.rs index b2d940a66b..815ad7a7ea 100644 --- a/vmm/src/config.rs +++ b/vmm/src/config.rs @@ -4,10 +4,14 @@ // use std::collections::{BTreeSet, HashMap}; +#[cfg(feature = "ivshmem")] +use std::fs; +use std::os::fd::RawFd; use std::path::PathBuf; use std::result; use std::str::FromStr; +use arch::CpuProfile; use clap::ArgMatches; use option_parser::{ ByteSized, IntegerList, OptionParser, OptionParserError, StringList, Toggle, Tuple, @@ -25,6 +29,11 @@ use crate::vm_config::*; const MAX_NUM_PCI_SEGMENTS: u16 = 96; const MAX_IOMMU_ADDRESS_WIDTH_BITS: u8 = 64; +#[cfg(all(feature = "kvm", target_arch = "x86_64"))] +const MAX_SUPPORTED_CPUS: u32 = 8192; +#[cfg(not(all(feature = "kvm", target_arch = "x86_64")))] +const MAX_SUPPORTED_CPUS: u32 = 255; + /// Errors associated with VM configuration parameters. 
#[derive(Debug, Error)] pub enum Error { @@ -107,14 +116,6 @@ pub enum Error { /// Failed parsing restore parameters #[error("Error parsing --restore")] ParseRestore(#[source] OptionParserError), - /// Failed parsing SGX EPC parameters - #[cfg(target_arch = "x86_64")] - #[error("Error parsing --sgx-epc")] - ParseSgxEpc(#[source] OptionParserError), - /// Missing 'id' from SGX EPC section - #[cfg(target_arch = "x86_64")] - #[error("Error parsing --sgx-epc: id missing")] - ParseSgxEpcIdMissing, /// Failed parsing NUMA parameters #[error("Error parsing --numa")] ParseNuma(#[source] OptionParserError), @@ -154,31 +155,49 @@ pub enum Error { /// Failed parsing TPM device #[error("Error parsing --tpm")] ParseTpm(#[source] OptionParserError), + #[cfg(feature = "ivshmem")] + /// Failed parsing ivsmem device + #[error("Error parsing --ivshmem")] + ParseIvshmem(#[source] OptionParserError), /// Missing path for TPM device #[error("Error parsing --tpm: path missing")] ParseTpmPathMissing, + #[cfg(feature = "ivshmem")] + /// Missing path for ivsmem device + #[error("Error parsing --ivshmem: path missing")] + ParseIvshmemPathMissing, /// Error parsing Landlock rules #[error("Error parsing --landlock-rules")] ParseLandlockRules(#[source] OptionParserError), /// Missing fields in Landlock rules #[error("Error parsing --landlock-rules: path/access field missing")] ParseLandlockMissingFields, + #[cfg(feature = "fw_cfg")] + /// Failed Parsing FwCfgItem config + #[error("Error parsing --fw-cfg-config items")] + ParseFwCfgItem(#[source] OptionParserError), + /// Failed parsing addr option + #[error("Error parsing --addr")] + ParsePciAddr(#[source] OptionParserError), } #[derive(Debug, PartialEq, Eq, Error)] pub enum ValidationError { - /// No kernel specified - #[error("No kernel specified")] - KernelMissing, /// Missing file value for console #[error("Path missing when using file console mode")] ConsoleFileMissing, + /// Missing TCP address for console + #[error("Address missing when using TCP console mode")] + ConsoleTcpAddressMissing, /// Missing socket path for console #[error("Path missing when using socket console mode")] ConsoleSocketPathMissing, /// Max is less than boot #[error("Max CPUs lower than boot CPUs")] CpusMaxLowerThanBoot, + /// Too many CPUs. + #[error("Too many CPUs: specified {0} but {MAX_SUPPORTED_CPUS} is the limit")] + TooManyCpus(u32 /* specified CPUs */), /// Missing file value for debug-console #[cfg(target_arch = "x86_64")] #[error("Path missing when using file mode for debug console")] @@ -215,8 +234,8 @@ pub enum ValidationError { #[error("Number of queues to virtio_net does not match the number of input FDs")] VnetQueueFdMismatch, /// Using reserved fd - #[error("Reserved fd number (<= 2)")] - VnetReservedFd, + #[error("Reserved fd number (fd={0} <= 2)")] + VnetReservedFd(RawFd), /// Hardware checksum offload is disabled. 
#[error("\"offload_tso\" and \"offload_ufo\" depend on \"offload_csum\"")] NoHardwareChecksumOffload, @@ -259,7 +278,9 @@ pub enum ValidationError { #[error("Invalid PCI segment aperture weight: {0}")] InvalidPciSegmentApertureWeight(u32), /// Invalid IOMMU address width in bits - #[error("IOMMU address width in bits ({0}) should be less than or equal to {MAX_IOMMU_ADDRESS_WIDTH_BITS}")] + #[error( + "IOMMU address width in bits ({0}) should be less than or equal to {MAX_IOMMU_ADDRESS_WIDTH_BITS}" + )] InvalidIommuAddressWidthBits(u8), /// Balloon too big #[error("Ballon size ({0}) greater than RAM ({1})")] @@ -318,6 +339,32 @@ pub enum ValidationError { /// Invalid block device serial length #[error("Block device serial length ({0}) exceeds maximum allowed length ({1})")] InvalidSerialLength(usize, usize), + #[cfg(feature = "fw_cfg")] + /// FwCfg missing kernel + #[error("Error --fw-cfg-config: missing --kernel")] + FwCfgMissingKernel, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing cmdline + #[error("Error --fw-cfg-config: missing --cmdline")] + FwCfgMissingCmdline, + #[cfg(feature = "fw_cfg")] + /// FwCfg missing initramfs + #[error("Error --fw-cfg-config: missing --initramfs")] + FwCfgMissingInitramfs, + #[cfg(feature = "ivshmem")] + /// Invalid Ivshmem input size + #[error("Invalid ivshmem input size")] + InvalidIvshmemInputSize(u64), + #[cfg(feature = "ivshmem")] + /// Invalid Ivshmem backend file size + #[error("Invalid ivshmem backend file size")] + InvalidIvshmemSize(u64), + #[cfg(feature = "ivshmem")] + /// Invalid Ivshmem backend file path + #[error("Invalid ivshmem backend file path")] + InvalidIvshmemPath, + #[error("Payload configuration is not bootable")] + PayloadError(#[from] PayloadConfigError), } type ValidationResult = std::result::Result; @@ -358,8 +405,6 @@ pub struct VmParams<'a> { #[cfg(feature = "pvmemcontrol")] pub pvmemcontrol: bool, pub pvpanic: bool, - #[cfg(target_arch = "x86_64")] - pub sgx_epc: Option>, pub numa: Option>, pub watchdog: bool, #[cfg(feature = "guest_debug")] @@ -373,6 +418,10 @@ pub struct VmParams<'a> { pub host_data: Option<&'a str>, pub landlock_enable: bool, pub landlock_rules: Option>, + #[cfg(feature = "fw_cfg")] + pub fw_cfg_config: Option<&'a str>, + #[cfg(feature = "ivshmem")] + pub ivshmem: Option<&'a str>, } impl<'a> VmParams<'a> { @@ -421,10 +470,6 @@ impl<'a> VmParams<'a> { #[cfg(feature = "pvmemcontrol")] let pvmemcontrol = args.get_flag("pvmemcontrol"); let pvpanic = args.get_flag("pvpanic"); - #[cfg(target_arch = "x86_64")] - let sgx_epc: Option> = args - .get_many::("sgx-epc") - .map(|x| x.map(|y| y as &str).collect()); let numa: Option> = args .get_many::("numa") .map(|x| x.map(|y| y as &str).collect()); @@ -444,7 +489,11 @@ impl<'a> VmParams<'a> { let landlock_rules: Option> = args .get_many::("landlock-rules") .map(|x| x.map(|y| y as &str).collect()); - + #[cfg(feature = "fw_cfg")] + let fw_cfg_config: Option<&str> = + args.get_one::("fw-cfg-config").map(|x| x as &str); + #[cfg(feature = "ivshmem")] + let ivshmem: Option<&str> = args.get_one::("ivshmem").map(|x| x as &str); VmParams { cpus, memory, @@ -471,8 +520,6 @@ impl<'a> VmParams<'a> { #[cfg(feature = "pvmemcontrol")] pvmemcontrol, pvpanic, - #[cfg(target_arch = "x86_64")] - sgx_epc, numa, watchdog, #[cfg(feature = "guest_debug")] @@ -486,6 +533,10 @@ impl<'a> VmParams<'a> { host_data, landlock_enable, landlock_rules, + #[cfg(feature = "fw_cfg")] + fw_cfg_config, + #[cfg(feature = "ivshmem")] + ivshmem, } } } @@ -550,14 +601,15 @@ impl CpusConfig { .add("kvm_hyperv") 
.add("max_phys_bits") .add("affinity") + .add("profile") .add("features"); parser.parse(cpus).map_err(Error::ParseCpus)?; - let boot_vcpus: u8 = parser + let boot_vcpus: u32 = parser .convert("boot") .map_err(Error::ParseCpus)? .unwrap_or(DEFAULT_VCPUS); - let max_vcpus: u8 = parser + let max_vcpus: u32 = parser .convert("max") .map_err(Error::ParseCpus)? .unwrap_or(boot_vcpus); @@ -572,7 +624,7 @@ impl CpusConfig { .map_err(Error::ParseCpus)? .unwrap_or(DEFAULT_MAX_PHYS_BITS); let affinity = parser - .convert::>>("affinity") + .convert::>>("affinity") .map_err(Error::ParseCpus)? .map(|v| { v.0.iter() @@ -582,6 +634,12 @@ impl CpusConfig { }) .collect() }); + + let profile = parser + .convert::("profile") + .map_err(Error::ParseCpus)? + .unwrap_or_default(); + let features_list = parser .convert::("features") .map_err(Error::ParseCpus)? @@ -593,6 +651,7 @@ impl CpusConfig { // list as it will always be checked for. #[allow(unused_mut)] let mut features = CpuFeatures::default(); + #[allow(clippy::never_loop)] for s in features_list.0 { match >::as_ref(&s) { #[cfg(target_arch = "x86_64")] @@ -612,6 +671,7 @@ impl CpusConfig { max_phys_bits, affinity, features, + profile, }) } } @@ -1029,7 +1089,7 @@ impl DiskConfig { ops_size=,ops_one_time_burst=,ops_refill_time=,\ id=,pci_segment=,rate_limit_group=,\ queue_affinity=,\ - serial="; + serial=,addr="; pub fn parse(disk: &str) -> Result { let mut parser = OptionParser::new(); @@ -1054,7 +1114,8 @@ impl DiskConfig { .add("pci_segment") .add("serial") .add("rate_limit_group") - .add("queue_affinity"); + .add("queue_affinity") + .add("addr"); parser.parse(disk).map_err(Error::ParseDisk)?; let path = parser.get("path").map(PathBuf::from); @@ -1166,6 +1227,10 @@ impl DiskConfig { None }; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(DiskConfig { path, readonly, @@ -1183,6 +1248,7 @@ impl DiskConfig { pci_segment, serial, queue_affinity, + bdf_device, }) } @@ -1204,10 +1270,11 @@ impl DiskConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) && !self.iommu { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); } } @@ -1216,13 +1283,13 @@ impl DiskConfig { } // Check Block device serial length - if let Some(ref serial) = self.serial { - if serial.len() > VIRTIO_BLK_ID_BYTES as usize { - return Err(ValidationError::InvalidSerialLength( - serial.len(), - VIRTIO_BLK_ID_BYTES as usize, - )); - } + if let Some(ref serial) = self.serial + && serial.len() > VIRTIO_BLK_ID_BYTES as usize + { + return Err(ValidationError::InvalidSerialLength( + serial.len(), + VIRTIO_BLK_ID_BYTES as usize, + )); } Ok(()) @@ -1253,7 +1320,7 @@ impl NetConfig { vhost_user=,socket=,vhost_mode=client|server,\ bw_size=,bw_one_time_burst=,bw_refill_time=,\ ops_size=,ops_one_time_burst=,ops_refill_time=,pci_segment=\ - offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off\""; + offload_tso=on|off,offload_ufo=on|off,offload_csum=on|off,addr=DD.F\""; pub fn parse(net: &str) -> Result { let mut parser = OptionParser::new(); @@ -1282,7 +1349,8 @@ impl NetConfig { .add("ops_size") .add("ops_one_time_burst") .add("ops_refill_time") - .add("pci_segment"); + 
.add("pci_segment") + .add("addr"); parser.parse(net).map_err(Error::ParseNetwork)?; let tap = parser.get("tap"); @@ -1398,6 +1466,10 @@ impl NetConfig { None }; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + let config = NetConfig { tap, ip, @@ -1418,6 +1490,7 @@ impl NetConfig { offload_tso, offload_ufo, offload_csum, + bdf_device, }; Ok(config) } @@ -1434,7 +1507,12 @@ impl NetConfig { if let Some(fds) = self.fds.as_ref() { for fd in fds { if *fd <= 2 { - return Err(ValidationError::VnetReservedFd); + // If we see this, most likely our live migration path for network FDs failed. + log::debug!( + "virtio-net devices {:?} unexpectedly reports invalid FD", + self.id + ); + return Err(ValidationError::VnetReservedFd(*fd)); } } } @@ -1452,17 +1530,18 @@ impl NetConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) && !self.iommu { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); } } - if let Some(mtu) = self.mtu { - if mtu < virtio_devices::net::MIN_MTU { - return Err(ValidationError::InvalidMtu(mtu)); - } + if let Some(mtu) = self.mtu + && mtu < virtio_devices::net::MIN_MTU + { + return Err(ValidationError::InvalidMtu(mtu)); } if !self.offload_csum && (self.offload_tso || self.offload_ufo) { @@ -1476,7 +1555,7 @@ impl NetConfig { impl RngConfig { pub fn parse(rng: &str) -> Result { let mut parser = OptionParser::new(); - parser.add("src").add("iommu"); + parser.add("src").add("iommu").add("addr"); parser.parse(rng).map_err(Error::ParseRng)?; let src = PathBuf::from( @@ -1490,20 +1569,27 @@ impl RngConfig { .unwrap_or(Toggle(false)) .0; - Ok(RngConfig { src, iommu }) + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + + Ok(RngConfig { + src, + iommu, + bdf_device, + }) } } impl BalloonConfig { - pub const SYNTAX: &'static str = - "Balloon parameters \"size=,deflate_on_oom=on|off,\ - free_page_reporting=on|off\""; + pub const SYNTAX: &'static str = "Balloon parameters \"size=,deflate_on_oom=on|off,\ + free_page_reporting=on|off,addr=\""; pub fn parse(balloon: &str) -> Result { let mut parser = OptionParser::new(); parser.add("size"); parser.add("deflate_on_oom"); - parser.add("free_page_reporting"); + parser.add("free_page_reporting").add("addr"); parser.parse(balloon).map_err(Error::ParseBalloon)?; let size = parser @@ -1524,10 +1610,15 @@ impl BalloonConfig { .unwrap_or(Toggle(false)) .0; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(BalloonConfig { size, deflate_on_oom, free_page_reporting, + bdf_device, }) } } @@ -1535,7 +1626,8 @@ impl BalloonConfig { impl FsConfig { pub const SYNTAX: &'static str = "virtio-fs parameters \ \"tag=,socket=,num_queues=,\ - queue_size=,id=,pci_segment=\""; + queue_size=,id=,pci_segment=,\ + addr=\""; pub fn parse(fs: &str) -> Result { let mut parser = OptionParser::new(); @@ -1545,7 +1637,8 @@ impl FsConfig { .add("num_queues") .add("socket") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(fs).map_err(Error::ParseFileSystem)?; let tag = 
parser.get("tag").ok_or(Error::ParseFsTagMissing)?; @@ -1570,6 +1663,10 @@ impl FsConfig { .map_err(Error::ParseFileSystem)? .unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(FsConfig { tag, socket, @@ -1577,6 +1674,7 @@ impl FsConfig { queue_size, id, pci_segment, + bdf_device, }) } @@ -1590,12 +1688,12 @@ impl FsConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) { - return Err(ValidationError::IommuNotSupportedOnSegment( - self.pci_segment, - )); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + { + return Err(ValidationError::IommuNotSupportedOnSegment( + self.pci_segment, + )); } } @@ -1603,10 +1701,106 @@ impl FsConfig { } } +#[cfg(feature = "fw_cfg")] +impl FwCfgConfig { + pub const SYNTAX: &'static str = "Boot params to pass to FW CFG device \ + \"e820=on|off,kernel=on|off,cmdline=on|off,initramfs=on|off,acpi_table=on|off, \ + items=[name0=,file0=:name1=,file1=]\""; + pub fn parse(fw_cfg_config: &str) -> Result { + let mut parser = OptionParser::new(); + parser + .add("e820") + .add("kernel") + .add("cmdline") + .add("initramfs") + .add("acpi_table") + .add("items"); + parser.parse(fw_cfg_config).map_err(Error::ParseFwCfgItem)?; + let e820 = parser + .convert::("e820") + .map_err(Error::ParseFwCfgItem)? + .unwrap_or(Toggle(true)) + .0; + let kernel = parser + .convert::("kernel") + .map_err(Error::ParseFwCfgItem)? + .unwrap_or(Toggle(true)) + .0; + let cmdline = parser + .convert::("cmdline") + .map_err(Error::ParseFwCfgItem)? + .unwrap_or(Toggle(true)) + .0; + let initramfs = parser + .convert::("initramfs") + .map_err(Error::ParseFwCfgItem)? + .unwrap_or(Toggle(true)) + .0; + let acpi_tables = parser + .convert::("acpi_table") + .map_err(Error::ParseFwCfgItem)? + .unwrap_or(Toggle(true)) + .0; + let items = if parser.is_set("items") { + Some( + parser + .convert::("items") + .map_err(Error::ParseFwCfgItem)? 
+ .unwrap(), + ) + } else { + None + }; + + Ok(FwCfgConfig { + e820, + kernel, + cmdline, + initramfs, + acpi_tables, + items, + }) + } + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + let payload = vm_config.payload.as_ref().unwrap(); + if self.kernel && payload.kernel.is_none() { + return Err(ValidationError::FwCfgMissingKernel); + } else if self.cmdline && payload.cmdline.is_none() { + return Err(ValidationError::FwCfgMissingCmdline); + } else if self.initramfs && payload.initramfs.is_none() { + return Err(ValidationError::FwCfgMissingInitramfs); + } + Ok(()) + } +} + +#[cfg(feature = "fw_cfg")] +impl FwCfgItem { + pub fn parse(fw_cfg: &str) -> Result { + let mut parser = OptionParser::new(); + parser.add("name").add("file"); + parser.parse(fw_cfg).map_err(Error::ParseFwCfgItem)?; + + let name = + parser + .get("name") + .ok_or(Error::ParseFwCfgItem(OptionParserError::InvalidValue( + "missing FwCfgItem name".to_string(), + )))?; + let file = parser + .get("file") + .map(PathBuf::from) + .ok_or(Error::ParseFwCfgItem(OptionParserError::InvalidValue( + "missing FwCfgItem file path".to_string(), + )))?; + Ok(FwCfgItem { name, file }) + } +} + impl PmemConfig { pub const SYNTAX: &'static str = "Persistent memory parameters \ \"file=,size=,iommu=on|off,\ - discard_writes=on|off,id=,pci_segment=\""; + discard_writes=on|off,id=,pci_segment=,addr=\""; pub fn parse(pmem: &str) -> Result { let mut parser = OptionParser::new(); @@ -1616,7 +1810,8 @@ impl PmemConfig { .add("iommu") .add("discard_writes") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(pmem).map_err(Error::ParsePersistentMemory)?; let file = PathBuf::from(parser.get("file").ok_or(Error::ParsePmemFileMissing)?); @@ -1640,6 +1835,10 @@ impl PmemConfig { .map_err(Error::ParsePersistentMemory)? 
.unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(PmemConfig { file, size, @@ -1647,6 +1846,7 @@ impl PmemConfig { discard_writes, id, pci_segment, + bdf_device, }) } @@ -1656,10 +1856,11 @@ impl PmemConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) && !self.iommu { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); } } @@ -1677,11 +1878,14 @@ impl ConsoleConfig { .add_valueless("null") .add("file") .add("iommu") - .add("socket"); + .add("tcp") + .add("socket") + .add("addr"); parser.parse(console).map_err(Error::ParseConsole)?; let mut file: Option = default_consoleconfig_file(); let mut socket: Option = None; + let mut url: Option = None; let mut mode: ConsoleOutputMode = ConsoleOutputMode::Off; if parser.is_set("off") { @@ -1691,6 +1895,19 @@ impl ConsoleConfig { mode = ConsoleOutputMode::Tty } else if parser.is_set("null") { mode = ConsoleOutputMode::Null + } else if parser.is_set("tcp") { + mode = ConsoleOutputMode::Tcp; + url = Some( + parser + .get("tcp") + .ok_or(Error::Validation(ValidationError::ConsoleTcpAddressMissing))?, + ); + if parser.is_set("file") { + file = + Some(PathBuf::from(parser.get("file").ok_or( + Error::Validation(ValidationError::ConsoleFileMissing), + )?)); + } } else if parser.is_set("file") { mode = ConsoleOutputMode::File; file = @@ -1711,11 +1928,17 @@ impl ConsoleConfig { .unwrap_or(Toggle(false)) .0; + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(Self { file, mode, iommu, socket, + url, + bdf_device, }) } } @@ -1756,17 +1979,18 @@ impl DebugConsoleConfig { return Err(Error::ParseConsoleInvalidModeGiven); } - if parser.is_set("iobase") { - if let Some(iobase_opt) = parser.get("iobase") { - if !iobase_opt.starts_with("0x") { - return Err(Error::Validation(ValidationError::InvalidIoPortHex( - iobase_opt, - ))); - } - iobase = Some(u16::from_str_radix(&iobase_opt[2..], 16).map_err(|_| { + if parser.is_set("iobase") + && let Some(iobase_opt) = parser.get("iobase") + { + if !iobase_opt.starts_with("0x") { + return Err(Error::Validation(ValidationError::InvalidIoPortHex( + iobase_opt, + ))); + } + iobase = + Some(u16::from_str_radix(&iobase_opt[2..], 16).map_err(|_| { Error::Validation(ValidationError::InvalidIoPortHex(iobase_opt)) })?); - } } Ok(Self { file, mode, iobase }) @@ -1774,8 +1998,8 @@ impl DebugConsoleConfig { } impl DeviceConfig { - pub const SYNTAX: &'static str = - "Direct device assignment parameters \"path=,iommu=on|off,id=,pci_segment=\""; + pub const SYNTAX: &'static str = "Direct device assignment parameters \"\ + path=,iommu=on|off,id=,pci_segment=\""; pub fn parse(device: &str) -> Result { let mut parser = OptionParser::new(); @@ -1819,10 +2043,11 @@ impl DeviceConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) && !self.iommu { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && 
iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); } } @@ -1862,12 +2087,12 @@ impl UserDeviceConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) { - return Err(ValidationError::IommuNotSupportedOnSegment( - self.pci_segment, - )); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + { + return Err(ValidationError::IommuNotSupportedOnSegment( + self.pci_segment, + )); } } @@ -1878,7 +2103,7 @@ impl UserDeviceConfig { impl VdpaConfig { pub const SYNTAX: &'static str = "vDPA device \ \"path=,num_queues=,iommu=on|off,\ - id=,pci_segment=\""; + id=,pci_segment=,addr=\""; pub fn parse(vdpa: &str) -> Result { let mut parser = OptionParser::new(); @@ -1887,7 +2112,8 @@ impl VdpaConfig { .add("num_queues") .add("iommu") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(vdpa).map_err(Error::ParseVdpa)?; let path = parser @@ -1909,12 +2135,17 @@ impl VdpaConfig { .map_err(Error::ParseVdpa)? .unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(VdpaConfig { path, num_queues, iommu, id, pci_segment, + bdf_device, }) } @@ -1924,10 +2155,11 @@ impl VdpaConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) && !self.iommu { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); } } @@ -1937,7 +2169,8 @@ impl VdpaConfig { impl VsockConfig { pub const SYNTAX: &'static str = "Virtio VSOCK parameters \ - \"cid=,socket=,iommu=on|off,id=,pci_segment=\""; + \"cid=,socket=,iommu=on|off,id=,\ + pci_segment=,addr=\""; pub fn parse(vsock: &str) -> Result { let mut parser = OptionParser::new(); @@ -1946,7 +2179,8 @@ impl VsockConfig { .add("cid") .add("iommu") .add("id") - .add("pci_segment"); + .add("pci_segment") + .add("addr"); parser.parse(vsock).map_err(Error::ParseVsock)?; let socket = parser @@ -1968,12 +2202,17 @@ impl VsockConfig { .map_err(Error::ParseVsock)? 
.unwrap_or_default(); + let (bdf_device, _bdf_function) = parser + .get_pci_device_function() + .map_err(Error::ParsePciAddr)?; + Ok(VsockConfig { cid, socket, iommu, id, pci_segment, + bdf_device, }) } @@ -1983,10 +2222,11 @@ impl VsockConfig { return Err(ValidationError::InvalidPciSegment(self.pci_segment)); } - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - if iommu_segments.contains(&self.pci_segment) && !self.iommu { - return Err(ValidationError::OnIommuSegment(self.pci_segment)); - } + if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + && iommu_segments.contains(&self.pci_segment) + && !self.iommu + { + return Err(ValidationError::OnIommuSegment(self.pci_segment)); } } @@ -1994,36 +2234,10 @@ impl VsockConfig { } } -#[cfg(target_arch = "x86_64")] -impl SgxEpcConfig { - pub const SYNTAX: &'static str = "SGX EPC parameters \ - \"id=,size=,prefault=on|off\""; - - pub fn parse(sgx_epc: &str) -> Result { - let mut parser = OptionParser::new(); - parser.add("id").add("size").add("prefault"); - parser.parse(sgx_epc).map_err(Error::ParseSgxEpc)?; - - let id = parser.get("id").ok_or(Error::ParseSgxEpcIdMissing)?; - let size = parser - .convert::("size") - .map_err(Error::ParseSgxEpc)? - .unwrap_or(ByteSized(0)) - .0; - let prefault = parser - .convert::("prefault") - .map_err(Error::ParseSgxEpc)? - .unwrap_or(Toggle(false)) - .0; - - Ok(SgxEpcConfig { id, size, prefault }) - } -} - impl NumaConfig { pub const SYNTAX: &'static str = "Settings related to a given NUMA node \ \"guest_numa_id=,cpus=,distances=,\ - memory_zones=,sgx_epc_sections=,\ + memory_zones=,\ pci_segments=\""; pub fn parse(numa: &str) -> Result { @@ -2033,7 +2247,6 @@ impl NumaConfig { .add("cpus") .add("distances") .add("memory_zones") - .add("sgx_epc_sections") .add("pci_segments"); parser.parse(numa).map_err(Error::ParseNuma)?; @@ -2045,7 +2258,7 @@ impl NumaConfig { let cpus = parser .convert::("cpus") .map_err(Error::ParseNuma)? - .map(|v| v.0.iter().map(|e| *e as u8).collect()); + .map(|v| v.0.iter().map(|e| *e as u32).collect()); let distances = parser .convert::>("distances") .map_err(Error::ParseNuma)? @@ -2061,11 +2274,6 @@ impl NumaConfig { .convert::("memory_zones") .map_err(Error::ParseNuma)? .map(|v| v.0); - #[cfg(target_arch = "x86_64")] - let sgx_epc_sections = parser - .convert::("sgx_epc_sections") - .map_err(Error::ParseNuma)? - .map(|v| v.0); let pci_segments = parser .convert::("pci_segments") .map_err(Error::ParseNuma)? @@ -2075,8 +2283,6 @@ impl NumaConfig { cpus, distances, memory_zones, - #[cfg(target_arch = "x86_64")] - sgx_epc_sections, pci_segments, }) } @@ -2087,27 +2293,35 @@ pub struct RestoredNetConfig { pub id: String, #[serde(default)] pub num_fds: usize, - #[serde( - default, - serialize_with = "serialize_restorednetconfig_fds", - deserialize_with = "deserialize_restorednetconfig_fds" - )] + // Special deserialize handling: + // A serialize-deserialize cycle typically happens across processes. + // The old FD is almost certainly invalid in the new process. + // One way to get actual FDs here in a new process is the `receive-migration` + // path via a UNIX Domain socket: An SCM_RIGHTS UNIX Domain Socket message + // passes new FDs to the Cloud Hypervisor process, but these FDs are handled + // in the HTTP API handler. 
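    // Illustration (hedged, not part of this patch): given the deserializer
    // below, any fd numbers that do appear in a JSON body are decoded as
    // invalid (-1) placeholders of the same length; the real FDs must arrive
    // via SCM_RIGHTS. For example, with a hypothetical id "net0":
    //
    //     let cfg: RestoredNetConfig =
    //         serde_json::from_str(r#"{"id":"net0","num_fds":2,"fds":[33,34]}"#).unwrap();
    //     assert_eq!(cfg.num_fds, 2);
    //     assert_eq!(cfg.fds, Some(vec![-1, -1]));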
+ #[serde(default, deserialize_with = "deserialize_restorednetconfig_fds")] pub fds: Option>, } -fn serialize_restorednetconfig_fds( - x: &Option>, - s: S, -) -> std::result::Result -where - S: serde::Serializer, -{ - if let Some(x) = x { - warn!("'RestoredNetConfig' contains FDs that can't be serialized correctly. Serializing them as invalid FDs."); - let invalid_fds = vec![-1; x.len()]; - s.serialize_some(&invalid_fds) - } else { - s.serialize_none() +impl RestoredNetConfig { + // Ensure all net devices from 'VmConfig' backed by FDs have a + // corresponding 'RestoreNetConfig' with a matched 'id' and expected + // number of FDs. + pub fn validate(&self, vm_config: &VmConfig) -> ValidationResult<()> { + let found = vm_config + .net + .iter() + .flatten() + .any(|net| net.id.as_ref() == Some(&self.id)); + + if !found { + Err(ValidationError::RestoreMissingRequiredNetId( + self.id.clone(), + )) + } else { + Ok(()) + } } } @@ -2119,7 +2333,12 @@ where { let invalid_fds: Option> = Option::deserialize(d)?; if let Some(invalid_fds) = invalid_fds { - warn!("'RestoredNetConfig' contains FDs that can't be deserialized correctly. Deserializing them as invalid FDs."); + // If the live-migration path is used properly, new FDs are passed as + // SCM_RIGHTS message. So, we don't get them from the serialized JSON + // anyway. + debug!( + "FDs in 'RestoredNetConfig' won't be deserialized as they are most likely invalid now. Deserializing them as -1." + ); Ok(Some(vec![-1; invalid_fds.len()])) } else { Ok(None) @@ -2279,6 +2498,47 @@ impl LandlockConfig { } } +#[cfg(feature = "ivshmem")] +impl IvshmemConfig { + pub const SYNTAX: &'static str = "Ivshmem device. Specify the backend file path and size \ + for the shared memory: \"path=, size=\" \ + \nThe must be a power of 2 (e.g., 2M, 4M, etc.), as it represents the size \ + of the memory region mapped to the guest. Default size is 128M."; + pub fn parse(ivshmem: &str) -> Result { + let mut parser = OptionParser::new(); + parser.add("path").add("size"); + parser.parse(ivshmem).map_err(Error::ParseIvshmem)?; + let path = parser + .get("path") + .map(PathBuf::from) + .ok_or(Error::ParseIvshmemPathMissing)?; + let size = parser + .convert::("size") + .map_err(Error::ParseIvshmem)? + .unwrap_or(ByteSized((DEFAULT_IVSHMEM_SIZE << 20) as u64)) + .0; + Ok(IvshmemConfig { + path, + size: size as usize, + }) + } + + pub fn validate(&self) -> ValidationResult<()> { + let size = self.size as u64; + let path = &self.path; + // size must = 2^n + if !size.is_power_of_two() { + return Err(ValidationError::InvalidIvshmemInputSize(size)); + } + let metadata = fs::metadata(path.to_str().unwrap()) + .map_err(|_| ValidationError::InvalidIvshmemPath)?; + if metadata.len() < size { + return Err(ValidationError::InvalidIvshmemSize(metadata.len())); + } + Ok(()) + } +} + impl VmConfig { fn validate_identifier( id_list: &mut BTreeSet, @@ -2320,9 +2580,13 @@ impl VmConfig { pub fn validate(&mut self) -> ValidationResult> { let mut id_list = BTreeSet::new(); + // Is the payload configuration bootable? self.payload - .as_ref() - .ok_or(ValidationError::KernelMissing)?; + .as_mut() + .ok_or(ValidationError::PayloadError( + PayloadConfigError::MissingBootitem, + ))? 
+ .validate()?; #[cfg(feature = "tdx")] { @@ -2340,10 +2604,10 @@ impl VmConfig { { let host_data_opt = &self.payload.as_ref().unwrap().host_data; - if let Some(host_data) = host_data_opt { - if host_data.len() != 64 { - return Err(ValidationError::InvalidHostData); - } + if let Some(host_data) = host_data_opt + && host_data.len() != 64 + { + return Err(ValidationError::InvalidHostData); } } // The 'conflict' check is introduced in commit 24438e0390d3 @@ -2383,6 +2647,15 @@ impl VmConfig { return Err(ValidationError::CpusMaxLowerThanBoot); } + if self.cpus.max_vcpus > MAX_SUPPORTED_CPUS { + // Note: historically, Cloud Hypervisor did not support more than 255(254 on x64) + // vCPUs: self.cpus.max_vcpus was of type u8, so 255 was the maximum; + // on x86_64, the legacy mptable/apic was limited to 254 CPUs. + // + // Now the limit is lifted on x86_64 targets. Other targests/archs: TBD. + return Err(ValidationError::TooManyCpus(self.cpus.max_vcpus)); + } + if let Some(rate_limit_groups) = &self.rate_limit_groups { for rate_limit_group in rate_limit_groups { rate_limit_group.validate(self)?; @@ -2474,7 +2747,10 @@ impl VmConfig { return Err(ValidationError::CpuTopologyDiesPerPackage); } - let total = t.threads_per_core * t.cores_per_die * t.dies_per_package * t.packages; + let total: u32 = (t.threads_per_core as u32) + * (t.cores_per_die as u32) + * (t.dies_per_package as u32) + * (t.packages as u32); if total != self.cpus.max_vcpus { return Err(ValidationError::CpuTopologyCount); } @@ -2510,10 +2786,10 @@ impl VmConfig { } } - if let Some(vsock) = &self.vsock { - if [!0, 0, 1, 2].contains(&vsock.cid) { - return Err(ValidationError::VsockSpecialCid(vsock.cid)); - } + if let Some(vsock) = &self.vsock + && [!0, 0, 1, 2].contains(&vsock.cid) + { + return Err(ValidationError::VsockSpecialCid(vsock.cid)); } if let Some(balloon) = &self.balloon { @@ -2610,14 +2886,6 @@ impl VmConfig { } } - #[cfg(target_arch = "x86_64")] - if let Some(sgx_epcs) = &self.sgx_epc { - for sgx_epc in sgx_epcs.iter() { - let id = sgx_epc.id.clone(); - Self::validate_identifier(&mut id_list, &Some(id))?; - } - } - if let Some(pci_segments) = &self.pci_segments { for pci_segment in pci_segments { pci_segment.validate(self)?; @@ -2636,6 +2904,10 @@ impl VmConfig { landlock_rule.validate()?; } } + #[cfg(feature = "ivshmem")] + if let Some(ivshmem_config) = &self.ivshmem { + ivshmem_config.validate()?; + } Ok(id_list) } @@ -2661,6 +2933,14 @@ impl VmConfig { disks = Some(disk_config_list); } + #[cfg(feature = "fw_cfg")] + let fw_cfg_config = if let Some(fw_cfg_config_str) = vm_params.fw_cfg_config { + let fw_cfg_config = FwCfgConfig::parse(fw_cfg_config_str)?; + Some(fw_cfg_config) + } else { + None + }; + let mut net: Option> = None; if let Some(net_list) = &vm_params.net { let mut net_config_list = Vec::new(); @@ -2755,21 +3035,6 @@ impl VmConfig { let platform = vm_params.platform.map(PlatformConfig::parse).transpose()?; - #[cfg(target_arch = "x86_64")] - let mut sgx_epc: Option> = None; - #[cfg(target_arch = "x86_64")] - { - if let Some(sgx_epc_list) = &vm_params.sgx_epc { - warn!("SGX support is deprecated and will be removed in a future release."); - let mut sgx_epc_config_list = Vec::new(); - for item in sgx_epc_list.iter() { - let sgx_epc_config = SgxEpcConfig::parse(item)?; - sgx_epc_config_list.push(sgx_epc_config); - } - sgx_epc = Some(sgx_epc_config_list); - } - } - let mut numa: Option> = None; if let Some(numa_list) = &vm_params.numa { let mut numa_config_list = Vec::new(); @@ -2797,6 +3062,8 @@ impl VmConfig { 
igvm: vm_params.igvm.map(PathBuf::from), #[cfg(feature = "sev_snp")] host_data: vm_params.host_data.map(|s| s.to_string()), + #[cfg(feature = "fw_cfg")] + fw_cfg_config, }) } else { None @@ -2823,6 +3090,14 @@ impl VmConfig { ); } + #[cfg(feature = "ivshmem")] + let mut ivshmem: Option = None; + #[cfg(feature = "ivshmem")] + if let Some(iv) = vm_params.ivshmem { + let ivshmem_conf = IvshmemConfig::parse(iv)?; + ivshmem = Some(ivshmem_conf); + } + let mut config = VmConfig { cpus: CpusConfig::parse(vm_params.cpus)?, memory: MemoryConfig::parse(vm_params.memory, vm_params.memory_zones)?, @@ -2846,8 +3121,6 @@ impl VmConfig { pvmemcontrol, pvpanic: vm_params.pvpanic, iommu: false, // updated in VmConfig::validate() - #[cfg(target_arch = "x86_64")] - sgx_epc, numa, watchdog: vm_params.watchdog, #[cfg(feature = "guest_debug")] @@ -2858,6 +3131,8 @@ impl VmConfig { preserved_fds: None, landlock_enable: vm_params.landlock_enable, landlock_rules, + #[cfg(feature = "ivshmem")] + ivshmem, }; config.validate().map_err(Error::Validation)?; Ok(config) @@ -2916,11 +3191,11 @@ impl VmConfig { } // Remove if vsock device - if let Some(vsock) = self.vsock.as_ref() { - if vsock.id.as_ref().map(|id| id.as_ref()) == Some(id) { - self.vsock = None; - removed = true; - } + if let Some(vsock) = self.vsock.as_ref() + && vsock.id.as_ref().map(|id| id.as_ref()) == Some(id) + { + self.vsock = None; + removed = true; } removed @@ -2930,6 +3205,8 @@ impl VmConfig { /// To use this safely, the caller must guarantee that the input /// fds are all valid. pub unsafe fn add_preserved_fds(&mut self, mut fds: Vec) { + debug!("adding preserved FDs to VM list: {fds:?}"); + if fds.is_empty() { return; } @@ -2975,8 +3252,6 @@ impl Clone for VmConfig { user_devices: self.user_devices.clone(), vdpa: self.vdpa.clone(), vsock: self.vsock.clone(), - #[cfg(target_arch = "x86_64")] - sgx_epc: self.sgx_epc.clone(), numa: self.numa.clone(), pci_segments: self.pci_segments.clone(), platform: self.platform.clone(), @@ -2985,8 +3260,19 @@ impl Clone for VmConfig { .preserved_fds .as_ref() // SAFETY: FFI call with valid FDs - .map(|fds| fds.iter().map(|fd| unsafe { libc::dup(*fd) }).collect()), + .map(|fds| { + fds.iter() + .map(|fd| { + // SAFETY: Trivially safe. + let fd_duped = unsafe { libc::dup(*fd) }; + warn!("Cloning VM config: duping preserved FD {fd} => {fd_duped}"); + fd_duped + }) + .collect() + }), landlock_rules: self.landlock_rules.clone(), + #[cfg(feature = "ivshmem")] + ivshmem: self.ivshmem.clone(), ..*self } } @@ -2995,6 +3281,7 @@ impl Clone for VmConfig { impl Drop for VmConfig { fn drop(&mut self) { if let Some(mut fds) = self.preserved_fds.take() { + debug!("Closing preserved FDs from VM: fds={fds:?}"); for fd in fds.drain(..) 
{ // SAFETY: FFI call with valid FDs unsafe { libc::close(fd) }; @@ -3236,6 +3523,7 @@ mod tests { pci_segment: 0, serial: None, queue_affinity: None, + bdf_device: None, } } @@ -3330,6 +3618,13 @@ mod tests { ..disk_fixture() } ); + assert_eq!( + DiskConfig::parse("path=/path/to_file,addr=15.0")?, + DiskConfig { + bdf_device: Some(21), + ..disk_fixture() + } + ); Ok(()) } @@ -3354,6 +3649,7 @@ mod tests { offload_tso: true, offload_ufo: true, offload_csum: true, + bdf_device: None, } } @@ -3397,7 +3693,9 @@ mod tests { ); assert_eq!( - NetConfig::parse("mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,num_queues=4,queue_size=1024,iommu=on")?, + NetConfig::parse( + "mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,num_queues=4,queue_size=1024,iommu=on" + )?, NetConfig { num_queues: 4, queue_size: 1024, @@ -3416,6 +3714,14 @@ mod tests { } ); + assert_eq!( + NetConfig::parse("mac=de:ad:be:ef:12:34,host_mac=12:34:de:ad:be:ef,addr=08.0")?, + NetConfig { + bdf_device: Some(8), + ..net_fixture() + } + ); + Ok(()) } @@ -3434,6 +3740,7 @@ mod tests { RngConfig { src: PathBuf::from("/dev/random"), iommu: true, + bdf_device: None, } ); assert_eq!( @@ -3443,6 +3750,13 @@ mod tests { ..Default::default() } ); + assert_eq!( + RngConfig::parse("addr=10.0")?, + RngConfig { + bdf_device: Some(16), + ..Default::default() + } + ); Ok(()) } @@ -3454,6 +3768,7 @@ mod tests { queue_size: 1024, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3473,6 +3788,14 @@ mod tests { } ); + assert_eq!( + FsConfig::parse("tag=mytag,socket=/tmp/sock,addr=0F.0")?, + FsConfig { + bdf_device: Some(15), + ..fs_fixture() + } + ); + Ok(()) } @@ -3484,6 +3807,7 @@ mod tests { discard_writes: false, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3511,6 +3835,13 @@ mod tests { ..pmem_fixture() } ); + assert_eq!( + PmemConfig::parse("file=/tmp/pmem,size=128M,addr=1F.0")?, + PmemConfig { + bdf_device: Some(31), + ..pmem_fixture() + } + ); Ok(()) } @@ -3526,6 +3857,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3535,6 +3868,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3544,6 +3879,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3553,6 +3890,8 @@ mod tests { iommu: false, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3562,6 +3901,8 @@ mod tests { iommu: false, file: Some(PathBuf::from("/tmp/console")), socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3571,6 +3912,8 @@ mod tests { iommu: true, file: None, socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3580,6 +3923,8 @@ mod tests { iommu: true, file: Some(PathBuf::from("/tmp/console")), socket: None, + url: None, + bdf_device: None, } ); assert_eq!( @@ -3589,6 +3934,8 @@ mod tests { iommu: true, file: None, socket: Some(PathBuf::from("/tmp/serial.sock")), + url: None, + bdf_device: None, } ); Ok(()) @@ -3640,6 +3987,7 @@ mod tests { iommu: false, id: None, pci_segment: 0, + bdf_device: None, } } @@ -3656,6 +4004,13 @@ mod tests { ..vdpa_fixture() } ); + assert_eq!( + VdpaConfig::parse("path=/dev/vhost-vdpa,addr=0A.0")?, + VdpaConfig { + bdf_device: Some(10), + ..vdpa_fixture() + } + ); Ok(()) } @@ -3684,6 +4039,7 @@ mod tests { iommu: false, id: None, pci_segment: 0, + bdf_device: None, } ); assert_eq!( @@ -3694,6 +4050,19 @@ mod tests { iommu: true, id: None, pci_segment: 0, + bdf_device: None, + } + ); + + 
assert_eq!( + VsockConfig::parse("socket=/tmp/sock,cid=3,iommu=on,addr=08.0")?, + VsockConfig { + cid: 3, + socket: PathBuf::from("/tmp/sock"), + iommu: true, + id: None, + pci_segment: 0, + bdf_device: Some(8), } ); Ok(()) @@ -3760,8 +4129,6 @@ mod tests { pvmemcontrol: None, pvpanic: false, iommu: false, - #[cfg(target_arch = "x86_64")] - sgx_epc: None, numa: None, watchdog: false, #[cfg(feature = "guest_debug")] @@ -3775,6 +4142,7 @@ mod tests { id: Some("net0".to_owned()), num_queues: 2, fds: Some(vec![-1, -1, -1, -1]), + bdf_device: Some(15), ..net_fixture() }, NetConfig { @@ -3791,6 +4159,8 @@ mod tests { ]), landlock_enable: false, landlock_rules: None, + #[cfg(feature = "ivshmem")] + ivshmem: None, }; let valid_config = RestoreConfig { @@ -3901,8 +4271,6 @@ mod tests { cpus: None, distances: None, memory_zones: None, - #[cfg(target_arch = "x86_64")] - sgx_epc_sections: None, pci_segments: None, } } @@ -3939,6 +4307,8 @@ mod tests { host_data: Some( "243eb7dc1a21129caa91dcbb794922b933baecb5823a377eb431188673288c07".to_string(), ), + #[cfg(feature = "fw_cfg")] + fw_cfg_config: None, }), rate_limit_groups: None, disks: None, @@ -3946,6 +4316,7 @@ mod tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -3955,12 +4326,16 @@ mod tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), @@ -3972,8 +4347,6 @@ mod tests { pvmemcontrol: None, pvpanic: false, iommu: false, - #[cfg(target_arch = "x86_64")] - sgx_epc: None, numa: None, watchdog: false, #[cfg(feature = "guest_debug")] @@ -3984,6 +4357,8 @@ mod tests { preserved_fds: None, landlock_enable: false, landlock_rules: None, + #[cfg(feature = "ivshmem")] + ivshmem: None, }; valid_config.validate().unwrap(); @@ -3997,7 +4372,9 @@ mod tests { invalid_config.payload = None; assert_eq!( invalid_config.validate(), - Err(ValidationError::KernelMissing) + Err(ValidationError::PayloadError( + PayloadConfigError::MissingBootitem + )) ); let mut invalid_config = valid_config.clone(); @@ -4101,7 +4478,7 @@ mod tests { }]); assert_eq!( invalid_config.validate(), - Err(ValidationError::VnetReservedFd) + Err(ValidationError::VnetReservedFd(0)) ); let mut invalid_config = valid_config.clone(); @@ -4254,6 +4631,7 @@ mod tests { id: None, iommu: true, pci_segment: 1, + bdf_device: None, }); still_valid_config.validate().unwrap(); @@ -4330,6 +4708,7 @@ mod tests { id: None, iommu: false, pci_segment: 1, + bdf_device: None, }); assert_eq!( invalid_config.validate(), @@ -4556,6 +4935,8 @@ mod tests { igvm: None, #[cfg(feature = "sev_snp")] host_data: Some("".to_string()), + #[cfg(feature = "fw_cfg")] + fw_cfg_config: None, }); config_with_no_host_data.validate().unwrap_err(); @@ -4570,6 +4951,8 @@ mod tests { igvm: None, #[cfg(feature = "sev_snp")] host_data: None, + #[cfg(feature = "fw_cfg")] + fw_cfg_config: None, }); valid_config_with_no_host_data.validate().unwrap(); @@ -4586,6 +4969,8 @@ mod tests { host_data: Some( "243eb7dc1a21129caa91dcbb794922b933baecb5823a377eb43118867328".to_string(), ), + #[cfg(feature = "fw_cfg")] + fw_cfg_config: None, }); config_with_invalid_host_data.validate().unwrap_err(); } @@ -4617,4 +5002,50 @@ mod tests { ); Ok(()) } + #[test] + #[cfg(feature = "fw_cfg")] + fn 
test_fw_cfg_config_item_list_parsing() -> Result<()> { + // Empty list + FwCfgConfig::parse("items=[]").unwrap_err(); + // Missing closing bracket + FwCfgConfig::parse("items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item") + .unwrap_err(); + // Single Item + assert_eq!( + FwCfgConfig::parse( + "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item]" + )?, + FwCfgConfig { + items: Some(FwCfgItemList { + item_list: vec![FwCfgItem { + name: "opt/org.test/fw_cfg_test_item".to_string(), + file: PathBuf::from("/tmp/fw_cfg_test_item"), + }] + }), + ..Default::default() + }, + ); + // Multiple Items + assert_eq!( + FwCfgConfig::parse( + "items=[name=opt/org.test/fw_cfg_test_item,file=/tmp/fw_cfg_test_item:name=opt/org.test/fw_cfg_test_item2,file=/tmp/fw_cfg_test_item2]" + )?, + FwCfgConfig { + items: Some(FwCfgItemList { + item_list: vec![ + FwCfgItem { + name: "opt/org.test/fw_cfg_test_item".to_string(), + file: PathBuf::from("/tmp/fw_cfg_test_item"), + }, + FwCfgItem { + name: "opt/org.test/fw_cfg_test_item2".to_string(), + file: PathBuf::from("/tmp/fw_cfg_test_item2"), + } + ] + }), + ..Default::default() + }, + ); + Ok(()) + } } diff --git a/vmm/src/console_devices.rs b/vmm/src/console_devices.rs index c4137733bc..19ac18e3ef 100644 --- a/vmm/src/console_devices.rs +++ b/vmm/src/console_devices.rs @@ -10,8 +10,9 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause // -use std::fs::{read_link, File, OpenOptions}; +use std::fs::{File, OpenOptions, read_link}; use std::mem::zeroed; +use std::net::TcpListener; use std::os::fd::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::net::UnixListener; @@ -19,12 +20,12 @@ use std::path::PathBuf; use std::sync::{Arc, Mutex}; use std::{io, result}; -use libc::{cfmakeraw, isatty, tcgetattr, tcsetattr, termios, TCSANOW}; +use libc::{TCSANOW, cfmakeraw, isatty, tcgetattr, tcsetattr, termios}; use thiserror::Error; +use crate::Vmm; use crate::sigwinch_listener::listen_for_sigwinch_on_tty; use crate::vm_config::ConsoleOutputMode; -use crate::Vmm; const TIOCSPTLCK: libc::c_int = 0x4004_5431; const TIOCGPTPEER: libc::c_int = 0x5441; @@ -40,6 +41,10 @@ pub enum ConsoleDeviceError { #[error("No socket option support for console device")] NoSocketOptionSupportForConsoleDevice, + /// Error parsing the TCP address + #[error("Wrong TCP address format: {0}")] + WrongTcpAddressFormat(std::string::String), + /// Error setting pty raw mode #[error("Error setting pty raw mode")] SetPtyRaw(#[source] vmm_sys_util::errno::Error), @@ -62,6 +67,7 @@ pub enum ConsoleOutput { Tty(Arc), Null, Socket(Arc), + Tcp(Arc, Option>), Off, } @@ -225,8 +231,9 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { - return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice) + return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleOutput::Null, ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, @@ -264,6 +271,21 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { + let url = vmconfig.serial.url.as_ref().unwrap(); + let socket_addr: std::net::SocketAddr = url + .parse() + .map_err(|_| ConsoleDeviceError::WrongTcpAddressFormat(url.to_string()))?; + let listener = TcpListener::bind(socket_addr) + .map_err(ConsoleDeviceError::CreateConsoleDevice)?; + + let mut f = None; + if let Some(p) = &vmconfig.serial.file { + let file = 
File::create(p).map_err(ConsoleDeviceError::CreateConsoleDevice)?; + f = Some(Arc::new(file)); + } + ConsoleOutput::Tcp(Arc::new(listener), f) + } ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, @@ -288,8 +310,9 @@ pub(crate) fn pre_create_console_devices(vmm: &mut Vmm) -> ConsoleDeviceResult { - return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice) + return Err(ConsoleDeviceError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutputMode::Tcp => ConsoleOutput::Null, ConsoleOutputMode::Null => ConsoleOutput::Null, ConsoleOutputMode::Off => ConsoleOutput::Off, }, diff --git a/vmm/src/cpu.rs b/vmm/src/cpu.rs index e26946538e..416bce5d45 100644 --- a/vmm/src/cpu.rs +++ b/vmm/src/cpu.rs @@ -23,7 +23,7 @@ use std::{cmp, io, result, thread}; #[cfg(not(target_arch = "riscv64"))] use acpi_tables::sdt::Sdt; -use acpi_tables::{aml, Aml}; +use acpi_tables::{Aml, aml}; use anyhow::anyhow; #[cfg(target_arch = "x86_64")] use arch::x86_64::get_x2apic_id; @@ -34,30 +34,30 @@ use devices::interrupt_controller::InterruptController; #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] -use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs}; +use gdbstub_arch::x86::reg::{X86_64CoreRegs as CoreRegs, X86SegmentRegs}; +#[cfg(target_arch = "x86_64")] +use hypervisor::CpuVendor; +#[cfg(feature = "kvm")] +use hypervisor::HypervisorType; +#[cfg(feature = "guest_debug")] +use hypervisor::StandardRegisters; #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] use hypervisor::arch::aarch64::regs::{ID_AA64MMFR0_EL1, TCR_EL1, TTBR1_EL1}; -#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] -use hypervisor::arch::x86::msr_index; #[cfg(target_arch = "x86_64")] use hypervisor::arch::x86::CpuIdEntry; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use hypervisor::arch::x86::MsrEntry; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use hypervisor::arch::x86::SpecialRegisters; +#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] +use hypervisor::arch::x86::msr_index; #[cfg(feature = "tdx")] use hypervisor::kvm::{TdxExitDetails, TdxExitStatus}; -#[cfg(target_arch = "x86_64")] -use hypervisor::CpuVendor; -#[cfg(feature = "kvm")] -use hypervisor::HypervisorType; -#[cfg(feature = "guest_debug")] -use hypervisor::StandardRegisters; use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps}; use libc::{c_void, siginfo_t}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use linux_loader::elf::Elf64_Nhdr; -use seccompiler::{apply_filter, SeccompAction}; +use seccompiler::{SeccompAction, apply_filter}; use thiserror::Error; use tracer::trace_scoped; use vm_device::BusDevice; @@ -67,28 +67,38 @@ use vm_memory::ByteValued; use vm_memory::{Bytes, GuestAddressSpace}; use vm_memory::{GuestAddress, GuestMemoryAtomic}; use vm_migration::{ - snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, - Transportable, + Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable, + snapshot_from_id, }; use vmm_sys_util::eventfd::EventFd; -use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN}; +use vmm_sys_util::signal::{SIGRTMIN, register_signal_handler}; use zerocopy::{FromBytes, Immutable, IntoBytes}; +#[cfg(feature = "kvm")] +use {kvm_bindings::kvm_run, std::cell::Cell, 
std::os::fd::RawFd, std::sync::RwLock}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ - CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable, - GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE, - NT_PRSTATUS, + COREDUMP_NAME_SIZE, CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, + Elf64Writable, GuestDebuggableError, NT_PRSTATUS, NoteDescType, X86_64ElfPrStatus, + X86_64UserRegs, }; #[cfg(feature = "guest_debug")] -use crate::gdb::{get_raw_tid, Debuggable, DebuggableError}; -#[cfg(target_arch = "x86_64")] -use crate::memory_manager::MemoryManager; -use crate::seccomp_filters::{get_seccomp_filter, Thread}; +use crate::gdb::{Debuggable, DebuggableError, get_raw_tid}; +use crate::seccomp_filters::{Thread, get_seccomp_filter}; #[cfg(target_arch = "x86_64")] use crate::vm::physical_bits; use crate::vm_config::CpusConfig; -use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID}; +use crate::{CPU_MANAGER_SNAPSHOT_ID, GuestMemoryMmap}; + +#[cfg(feature = "kvm")] +thread_local! { + static KVM_RUN: Cell<*mut kvm_run> = const {Cell::new(core::ptr::null_mut())}; +} +#[cfg(feature = "kvm")] +/// Tell signal handler to not access certain stuff anymore during shutdown. +/// Otherwise => panics. +/// Better alternative would be to prevent signals there at all. +pub static IS_IN_SHUTDOWN: RwLock = RwLock::new(false); #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] /// Extract the specified bits of a 64-bit integer. @@ -198,8 +208,8 @@ pub enum Error { #[error("Error setting up AMX")] AmxEnable(#[source] anyhow::Error), - #[error("Maximum number of vCPUs exceeds host limit")] - MaximumVcpusExceeded, + #[error("Maximum number of vCPUs {0} exceeds host limit {1}")] + MaximumVcpusExceeded(u32, u32), #[cfg(feature = "sev_snp")] #[error("Failed to set sev control register")] @@ -338,7 +348,7 @@ macro_rules! round_up { pub struct Vcpu { // The hypervisor abstracted CPU. vcpu: Arc, - id: u8, + id: u32, #[cfg(target_arch = "aarch64")] mpidr: u64, saved_state: Option, @@ -356,8 +366,8 @@ impl Vcpu { /// * `vm_ops` - Optional object for exit handling. /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0) pub fn new( - id: u8, - apic_id: u8, + id: u32, + apic_id: u32, vm: &Arc, vm_ops: Option>, #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor, @@ -390,7 +400,7 @@ impl Vcpu { boot_setup: Option<(EntryPoint, &GuestMemoryAtomic)>, #[cfg(target_arch = "x86_64")] cpuid: Vec, #[cfg(target_arch = "x86_64")] kvm_hyperv: bool, - #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>, + #[cfg(target_arch = "x86_64")] topology: (u16, u16, u16, u16), ) -> Result<()> { #[cfg(target_arch = "aarch64")] { @@ -487,6 +497,13 @@ impl Vcpu { .map_err(Error::VcpuSetGicrBaseAddr)?; Ok(()) } + + #[cfg(feature = "kvm")] + pub fn get_kvm_vcpu_raw_fd(&self) -> RawFd { + // SAFETY: We happen to know that all current uses respect the safety contract. + // TODO find a better way to keep this safe and/or express its fragile state. 
+ unsafe { self.vcpu.get_kvm_vcpu_raw_fd() } + } } impl Pausable for Vcpu {} @@ -526,14 +543,14 @@ pub struct CpuManager { #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, vcpu_states: Vec, - selected_cpu: u8, + selected_cpu: u32, vcpus: Vec>>, seccomp_action: SeccompAction, vm_ops: Arc, #[cfg_attr(target_arch = "aarch64", allow(dead_code))] acpi_address: Option, - proximity_domain_per_cpu: BTreeMap, - affinity: BTreeMap>, + proximity_domain_per_cpu: BTreeMap, + affinity: BTreeMap>, dynamic: bool, hypervisor: Arc, #[cfg(feature = "sev_snp")] @@ -555,11 +572,13 @@ impl BusDevice for CpuManager { match offset { CPU_SELECTION_OFFSET => { - data[0] = self.selected_cpu; + assert!(data.len() >= core::mem::size_of::()); + data[0..core::mem::size_of::()] + .copy_from_slice(&self.selected_cpu.to_le_bytes()); } CPU_STATUS_OFFSET => { if self.selected_cpu < self.max_vcpus() { - let state = &self.vcpu_states[usize::from(self.selected_cpu)]; + let state = &self.vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; if state.active() { data[0] |= 1 << CPU_ENABLE_FLAG; } @@ -585,11 +604,13 @@ impl BusDevice for CpuManager { fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option> { match offset { CPU_SELECTION_OFFSET => { - self.selected_cpu = data[0]; + assert!(data.len() >= core::mem::size_of::()); + self.selected_cpu = + u32::from_le_bytes(data[0..core::mem::size_of::()].try_into().unwrap()); } CPU_STATUS_OFFSET => { if self.selected_cpu < self.max_vcpus() { - let state = &mut self.vcpu_states[usize::from(self.selected_cpu)]; + let state = &mut self.vcpu_states[usize::try_from(self.selected_cpu).unwrap()]; // The ACPI code writes back a 1 to acknowledge the insertion if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG) && state.inserting @@ -603,10 +624,10 @@ impl BusDevice for CpuManager { state.removing = false; } // Trigger removal of vCPU - if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG { - if let Err(e) = self.remove_vcpu(self.selected_cpu) { - error!("Error removing vCPU: {:?}", e); - } + if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG + && let Err(e) = self.remove_vcpu(self.selected_cpu) + { + error!("Error removing vCPU: {:?}", e); } } else { warn!("Out of range vCPU id: {}", self.selected_cpu); @@ -631,6 +652,7 @@ struct VcpuState { handle: Option>, kill: Arc, vcpu_run_interrupted: Arc, + /// Used to ACK state changes from the run vCPU loop to the CPU Manager. paused: Arc, } @@ -639,13 +661,25 @@ impl VcpuState { self.handle.is_some() } + /// Sends a signal to the underlying thread. + /// + /// Please call [`Self::wait_until_signal_acknowledged`] afterward to block + /// until the vCPU thread has acknowledged the signal. fn signal_thread(&self) { if let Some(handle) = self.handle.as_ref() { + // SAFETY: FFI call with correct arguments + unsafe { + libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); + } + } + } + + /// Blocks until the vCPU thread has acknowledged the signal. + /// + /// This is the counterpart of [`Self::signal_thread`]. 
+ fn wait_until_signal_acknowledged(&self) { + if let Some(_handle) = self.handle.as_ref() { loop { - // SAFETY: FFI call with correct arguments - unsafe { - libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN()); - } if self.vcpu_run_interrupted.load(Ordering::SeqCst) { break; } else { @@ -688,49 +722,27 @@ impl CpuManager { numa_nodes: &NumaNodes, #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, ) -> Result>> { - if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() { - return Err(Error::MaximumVcpusExceeded); + if config.max_vcpus > hypervisor.get_max_vcpus() { + return Err(Error::MaximumVcpusExceeded( + config.max_vcpus, + hypervisor.get_max_vcpus(), + )); } - let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus)); - vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default); + let max_vcpus = usize::try_from(config.max_vcpus).unwrap(); + let mut vcpu_states = Vec::with_capacity(max_vcpus); + vcpu_states.resize_with(max_vcpus, VcpuState::default); let hypervisor_type = hypervisor.hypervisor_type(); #[cfg(target_arch = "x86_64")] let cpu_vendor = hypervisor.get_cpu_vendor(); #[cfg(target_arch = "x86_64")] if config.features.amx { - const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024; - const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025; - const XFEATURE_XTILEDATA: usize = 18; - const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA; - - // SAFETY: the syscall is only modifying kernel internal - // data structures that the kernel is itself expected to safeguard. - let amx_tile = unsafe { - libc::syscall( - libc::SYS_arch_prctl, - ARCH_REQ_XCOMP_GUEST_PERM, - XFEATURE_XTILEDATA, - ) - }; - - if amx_tile != 0 { - return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); - } else { - let mask: usize = 0; - // SAFETY: the mask being modified (not marked mutable as it is - // modified in unsafe only which is permitted) isn't in use elsewhere. - let result = unsafe { - libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask) - }; - if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK { - return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported"))); - } - } + hypervisor::arch::x86::XsaveState::enable_amx_state_components(hypervisor.as_ref()) + .map_err(|e| crate::cpu::Error::AmxEnable(e.into()))?; } - let proximity_domain_per_cpu: BTreeMap = { + let proximity_domain_per_cpu: BTreeMap = { let mut cpu_list = Vec::new(); for (proximity_domain, numa_node) in numa_nodes.iter() { for cpu in numa_node.cpus.iter() { @@ -771,7 +783,7 @@ impl CpuManager { #[cfg(feature = "guest_debug")] vm_debug_evt, selected_cpu: 0, - vcpus: Vec::with_capacity(usize::from(config.max_vcpus)), + vcpus: Vec::with_capacity(max_vcpus), seccomp_action, vm_ops, acpi_address: None, @@ -787,28 +799,20 @@ impl CpuManager { #[cfg(target_arch = "x86_64")] pub fn populate_cpuid( &mut self, - memory_manager: &Arc>, hypervisor: &Arc, #[cfg(feature = "tdx")] tdx: bool, ) -> Result<()> { - let sgx_epc_sections = memory_manager - .lock() - .unwrap() - .sgx_epc_region() - .as_ref() - .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect()); - self.cpuid = { let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits); arch::generate_common_cpuid( hypervisor, &arch::CpuidConfig { - sgx_epc_sections, phys_bits, kvm_hyperv: self.config.kvm_hyperv, #[cfg(feature = "tdx")] tdx, amx: self.config.features.amx, + profile: self.config.profile, }, ) .map_err(Error::CommonCpuId)? 
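// Illustrative aside (not part of the patch): a minimal, self-contained sketch of
// the signal-then-acknowledge handshake that the new signal_thread() /
// wait_until_signal_acknowledged() split models. The controller first nudges every
// worker thread, then waits for each one to publish an ACK, instead of nudging and
// waiting one thread at a time (the same two-phase idea behind signal_vcpus()
// further down in this patch). All names below are invented for the example.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use std::time::Duration;

struct WorkerState {
    request: AtomicBool, // set by the controller ("the signal")
    ack: AtomicBool,     // set by the worker once the request was observed
}

fn main() {
    let states: Vec<Arc<WorkerState>> = (0..4)
        .map(|_| {
            Arc::new(WorkerState {
                request: AtomicBool::new(false),
                ack: AtomicBool::new(false),
            })
        })
        .collect();

    let workers: Vec<_> = states
        .iter()
        .map(|state| {
            let state = state.clone();
            thread::spawn(move || {
                // Stand-in for a vCPU run loop waiting to be interrupted.
                while !state.request.load(Ordering::SeqCst) {
                    thread::sleep(Duration::from_millis(1));
                }
                state.ack.store(true, Ordering::SeqCst);
            })
        })
        .collect();

    // Phase 1: signal every worker up front, without waiting in between.
    for state in &states {
        state.request.store(true, Ordering::SeqCst);
    }
    // Phase 2: wait for all acknowledgements. Interleaving the two phases would
    // serialize the wake-ups and scale poorly with many vCPUs.
    for state in &states {
        while !state.ack.load(Ordering::SeqCst) {
            thread::sleep(Duration::from_millis(1));
        }
    }
    for worker in workers {
        worker.join().unwrap();
    }
}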
@@ -817,19 +821,19 @@ impl CpuManager { Ok(()) } - fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option) -> Result>> { + fn create_vcpu(&mut self, cpu_id: u32, snapshot: Option) -> Result>> { info!("Creating vCPU: cpu_id = {}", cpu_id); #[cfg(target_arch = "x86_64")] let topology = self.get_vcpu_topology(); #[cfg(target_arch = "x86_64")] - let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology); + let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id, topology); #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] - let x2apic_id = cpu_id as u32; + let x2apic_id = cpu_id; let mut vcpu = Vcpu::new( cpu_id, - x2apic_id as u8, + x2apic_id, &self.vm, Some(self.vm_ops.clone()), #[cfg(target_arch = "x86_64")] @@ -884,8 +888,22 @@ impl CpuManager { #[cfg(target_arch = "x86_64")] let topology = self.config.topology.clone().map_or_else( - || Some((1, self.boot_vcpus(), 1)), - |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)), + || { + ( + 1_u16, + u16::try_from(self.boot_vcpus()).unwrap(), + 1_u16, + 1_u16, + ) + }, + |t| { + ( + t.threads_per_core, + t.cores_per_die, + t.dies_per_package, + t.packages, + ) + }, ); #[cfg(target_arch = "x86_64")] vcpu.configure( @@ -907,7 +925,7 @@ impl CpuManager { /// Only create new vCPUs if there aren't any inactive ones to reuse fn create_vcpus( &mut self, - desired_vcpus: u8, + desired_vcpus: u32, snapshot: Option, ) -> Result>>> { let mut vcpus: Vec>> = vec![]; @@ -924,7 +942,7 @@ impl CpuManager { } // Only create vCPUs in excess of all the allocated vCPUs. - for cpu_id in self.vcpus.len() as u8..desired_vcpus { + for cpu_id in self.vcpus.len() as u32..desired_vcpus { vcpus.push(self.create_vcpu( cpu_id, // TODO: The special format of the CPU id can be removed once @@ -962,7 +980,7 @@ impl CpuManager { fn start_vcpu( &mut self, vcpu: Arc>, - vcpu_id: u8, + vcpu_id: u32, vcpu_thread_barrier: Arc, inserting: bool, ) -> Result<()> { @@ -973,16 +991,20 @@ impl CpuManager { #[cfg(feature = "guest_debug")] let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap(); let panic_exit_evt = self.exit_evt.try_clone().unwrap(); - let vcpu_kill_signalled = self.vcpus_kill_signalled.clone(); - let vcpu_pause_signalled = self.vcpus_pause_signalled.clone(); - let vcpu_kick_signalled = self.vcpus_kick_signalled.clone(); + let vcpus_kill_signalled_clone = self.vcpus_kill_signalled.clone(); + let vcpus_pause_signalled_clone = self.vcpus_pause_signalled.clone(); + let vcpus_kick_signalled_clone = self.vcpus_kick_signalled.clone(); - let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone(); - let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)] + let vcpu_kill = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] + .kill + .clone(); + let vcpu_run_interrupted = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] .vcpu_run_interrupted .clone(); let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone(); - let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone(); + let vcpu_paused = self.vcpu_states[usize::try_from(vcpu_id).unwrap()] + .paused + .clone(); // Prepare the CPU set the current vCPU is expected to run onto. let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| { @@ -1014,6 +1036,28 @@ impl CpuManager { thread::Builder::new() .name(format!("vcpu{vcpu_id}")) .spawn(move || { + // init thread-local kvm_run structure + #[cfg(feature = "kvm")] + { + let raw_kvm_fd = vcpu.lock().unwrap().get_kvm_vcpu_raw_fd(); + + // SAFETY: We know the FD is valid and have the proper args. 
+ let buffer = unsafe { + libc::mmap( + core::ptr::null_mut(), + 4096, + libc::PROT_WRITE | libc::PROT_READ, + libc::MAP_SHARED, + raw_kvm_fd, + 0, + ) + }; + assert!(!buffer.is_null()); + assert_ne!(buffer, libc::MAP_FAILED); + let kvm_run = buffer.cast::(); + KVM_RUN.set(kvm_run); + } + // Schedule the thread to run on the expected CPU set if let Some(cpuset) = cpuset.as_ref() { // SAFETY: FFI call with correct arguments @@ -1036,15 +1080,42 @@ impl CpuManager { } // Apply seccomp filter for vcpu thread. - if !vcpu_seccomp_filter.is_empty() { - if let Err(e) = + if !vcpu_seccomp_filter.is_empty() && let Err(e) = apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter) { error!("Error applying seccomp filter: {:?}", e); return; } - } + + #[cfg(not(feature = "kvm"))] extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {} + #[cfg(feature = "kvm")] + extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) { + // We do not need a self-pipe for safe UNIX signal handling here as in this + // signal handler, we only expect the same signal over and over again. While + // different signals can interrupt a signal being handled, the same signal + // again can't by default. Therefore, this is safe. + + // This lock prevents accessing thread locals when a signal is received + // in the teardown phase of the Rust standard library. Otherwise, we would + // panic. + // + // Masking signals would be a nicer approach but this is the pragmatic + // solution. + // + // We don't have lock contention in normal operation. When the writer + // sets the bool to true, the lock is only held for a couple of µs. + let lock = IS_IN_SHUTDOWN.read().unwrap(); + if *lock { + return; + } + + let kvm_run = KVM_RUN.get(); + // SAFETY: the mapping is valid + let kvm_run = unsafe { + kvm_run.as_mut().expect("kvm_run should have been mapped as part of vCPU setup") }; + kvm_run.immediate_exit = 1; + } // This uses an async signal safe handler to kill the vcpu handles. register_signal_handler(SIGRTMIN(), handle_signal) .expect("Failed to register vcpu signal handler"); @@ -1065,7 +1136,7 @@ impl CpuManager { // loads and stores to different atomics and we need // to see them in a consistent order in all threads - if vcpu_pause_signalled.load(Ordering::SeqCst) { + if vcpus_pause_signalled_clone.load(Ordering::SeqCst) { // As a pause can be caused by PIO & MMIO exits then we need to ensure they are // completed by returning to KVM_RUN. 
From the kernel docs: // @@ -1083,24 +1154,27 @@ impl CpuManager { #[cfg(feature = "kvm")] if matches!(hypervisor_type, HypervisorType::Kvm) { - vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true); - if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) { + let lock = vcpu.lock(); + let lock = lock.as_ref().unwrap(); + lock.vcpu.set_immediate_exit(true); + if !matches!(lock.run(), Ok(VmExit::Ignore)) { error!("Unexpected VM exit on \"immediate_exit\" run"); break; } - vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false); + lock.vcpu.set_immediate_exit(false); } vcpu_run_interrupted.store(true, Ordering::SeqCst); vcpu_paused.store(true, Ordering::SeqCst); - while vcpu_pause_signalled.load(Ordering::SeqCst) { + while vcpus_pause_signalled_clone.load(Ordering::SeqCst) { thread::park(); } + vcpu_paused.store(false, Ordering::SeqCst); vcpu_run_interrupted.store(false, Ordering::SeqCst); } - if vcpu_kick_signalled.load(Ordering::SeqCst) { + if vcpus_kick_signalled_clone.load(Ordering::SeqCst) { vcpu_run_interrupted.store(true, Ordering::SeqCst); #[cfg(target_arch = "x86_64")] match vcpu.lock().as_ref().unwrap().vcpu.nmi() { @@ -1113,7 +1187,7 @@ impl CpuManager { } // We've been told to terminate - if vcpu_kill_signalled.load(Ordering::SeqCst) + if vcpus_kill_signalled_clone.load(Ordering::SeqCst) || vcpu_kill.load(Ordering::SeqCst) { vcpu_run_interrupted.store(true, Ordering::SeqCst); @@ -1132,7 +1206,7 @@ impl CpuManager { info!("VmExit::Debug"); #[cfg(feature = "guest_debug")] { - vcpu_pause_signalled.store(true, Ordering::SeqCst); + vcpus_pause_signalled_clone.store(true, Ordering::SeqCst); let raw_tid = get_raw_tid(vcpu_id as usize); vm_debug_evt.write(raw_tid as u64).unwrap(); } @@ -1193,7 +1267,7 @@ impl CpuManager { } // We've been told to terminate - if vcpu_kill_signalled.load(Ordering::SeqCst) + if vcpus_kill_signalled_clone.load(Ordering::SeqCst) || vcpu_kill.load(Ordering::SeqCst) { vcpu_run_interrupted.store(true, Ordering::SeqCst); @@ -1213,8 +1287,8 @@ impl CpuManager { // On hot plug calls into this function entry_point is None. It is for // those hotplug CPU additions that we need to set the inserting flag. 
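// Illustrative aside (not part of the patch): the shape of the per-thread KVM_RUN
// pointer plus IS_IN_SHUTDOWN guard used by the new vCPU signal handler above,
// reduced to plain std types so it runs standalone. In the real code the pointer
// comes from mmap()-ing the vCPU fd and the function below is registered for
// SIGRTMIN; here it is simply called directly. All names are stand-ins.
use std::cell::Cell;
use std::sync::RwLock;

// Stand-in for the mmap()-ed kvm_run structure.
#[derive(Default)]
struct RunPage {
    immediate_exit: u8,
}

thread_local! {
    static RUN_PAGE: Cell<*mut RunPage> = const { Cell::new(std::ptr::null_mut()) };
}

static IS_IN_SHUTDOWN: RwLock<bool> = RwLock::new(false);

// What the handler does: bail out during shutdown (thread-locals may already be
// torn down), otherwise ask the run loop to exit immediately.
fn handle_signal() {
    if *IS_IN_SHUTDOWN.read().unwrap() {
        return;
    }
    let page = RUN_PAGE.get();
    // SAFETY: in this sketch the pointer is either null or points at the
    // per-thread allocation created in main() and still alive.
    if let Some(page) = unsafe { page.as_mut() } {
        page.immediate_exit = 1;
    }
}

fn main() {
    // Per-thread setup, analogous to mmap()-ing the vCPU fd once per thread.
    let page = Box::into_raw(Box::new(RunPage::default()));
    RUN_PAGE.set(page);

    handle_signal();
    // SAFETY: page was produced by Box::into_raw above and not freed yet.
    assert_eq!(unsafe { (*page).immediate_exit }, 1);

    *IS_IN_SHUTDOWN.write().unwrap() = true;
    handle_signal(); // now a no-op

    // SAFETY: reclaim the allocation exactly once.
    drop(unsafe { Box::from_raw(page) });
}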
- self.vcpu_states[usize::from(vcpu_id)].handle = handle; - self.vcpu_states[usize::from(vcpu_id)].inserting = inserting; + self.vcpu_states[usize::try_from(vcpu_id).unwrap()].handle = handle; + self.vcpu_states[usize::try_from(vcpu_id).unwrap()].inserting = inserting; Ok(()) } @@ -1222,7 +1296,7 @@ impl CpuManager { /// Start up as many vCPUs threads as needed to reach `desired_vcpus` fn activate_vcpus( &mut self, - desired_vcpus: u8, + desired_vcpus: u32, inserting: bool, paused: Option, ) -> Result<()> { @@ -1257,11 +1331,11 @@ impl CpuManager { Ok(()) } - fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) { + fn mark_vcpus_for_removal(&mut self, desired_vcpus: u32) { // Mark vCPUs for removal, actual removal happens on ejection for cpu_id in desired_vcpus..self.present_vcpus() { - self.vcpu_states[usize::from(cpu_id)].removing = true; - self.vcpu_states[usize::from(cpu_id)] + self.vcpu_states[usize::try_from(cpu_id).unwrap()].removing = true; + self.vcpu_states[usize::try_from(cpu_id).unwrap()] .pending_removal .store(true, Ordering::SeqCst); } @@ -1276,11 +1350,12 @@ impl CpuManager { false } - fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> { + fn remove_vcpu(&mut self, cpu_id: u32) -> Result<()> { info!("Removing vCPU: cpu_id = {}", cpu_id); - let state = &mut self.vcpu_states[usize::from(cpu_id)]; + let state = &mut self.vcpu_states[usize::try_from(cpu_id).unwrap()]; state.kill.store(true, Ordering::SeqCst); state.signal_thread(); + state.wait_until_signal_acknowledged(); state.join_thread()?; state.handle = None; @@ -1306,7 +1381,7 @@ impl CpuManager { } pub fn start_restored_vcpus(&mut self) -> Result<()> { - self.activate_vcpus(self.vcpus.len() as u8, false, Some(true)) + self.activate_vcpus(self.vcpus.len() as u32, false, Some(true)) .map_err(|e| { Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e)) })?; @@ -1314,7 +1389,7 @@ impl CpuManager { Ok(()) } - pub fn resize(&mut self, desired_vcpus: u8) -> Result { + pub fn resize(&mut self, desired_vcpus: u32) -> Result { if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal { return Ok(false); } @@ -1348,6 +1423,21 @@ impl CpuManager { } } + /// Signal to the spawned threads (vCPUs and console signal handler). + /// + /// For the vCPU threads this will interrupt the KVM_RUN ioctl() allowing + /// the loop to check the shared state booleans. + fn signal_vcpus(&self) { + // Splitting this into two loops reduced the time to pause many vCPUs + // massively. Example: 254 vCPUs. >254ms -> ~4ms. + for state in self.vcpu_states.iter() { + state.signal_thread(); + } + for state in self.vcpu_states.iter() { + state.wait_until_signal_acknowledged(); + } + } + pub fn shutdown(&mut self) -> Result<()> { // Tell the vCPUs to stop themselves next time they go through the loop self.vcpus_kill_signalled.store(true, Ordering::SeqCst); @@ -1360,12 +1450,7 @@ impl CpuManager { state.unpark_thread(); } - // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads - // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set - // above. - for state in self.vcpu_states.iter() { - state.signal_thread(); - } + self.signal_vcpus(); // Wait for all the threads to finish. This removes the state from the vector. for mut state in self.vcpu_states.drain(..) 
{ @@ -1387,11 +1472,11 @@ impl CpuManager { Ok(()) } - pub fn boot_vcpus(&self) -> u8 { + pub fn boot_vcpus(&self) -> u32 { self.config.boot_vcpus } - pub fn max_vcpus(&self) -> u8 { + pub fn max_vcpus(&self) -> u32 { self.config.max_vcpus } @@ -1401,10 +1486,10 @@ impl CpuManager { self.cpuid.clone() } - fn present_vcpus(&self) -> u8 { + fn present_vcpus(&self) -> u32 { self.vcpu_states .iter() - .fold(0, |acc, state| acc + state.active() as u8) + .fold(0, |acc, state| acc + state.active() as u32) } #[cfg(target_arch = "aarch64")] @@ -1423,11 +1508,15 @@ impl CpuManager { .collect() } - pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> { - self.config - .topology - .clone() - .map(|t| (t.threads_per_core, t.cores_per_die, t.packages)) + pub fn get_vcpu_topology(&self) -> Option<(u16, u16, u16, u16)> { + self.config.topology.clone().map(|t| { + ( + t.threads_per_core, + t.cores_per_die, + t.dies_per_package, + t.packages, + ) + }) } #[cfg(not(target_arch = "riscv64"))] @@ -1442,12 +1531,12 @@ impl CpuManager { madt.write(36, arch::layout::APIC_START.0); for cpu in 0..self.config.max_vcpus { - let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology()); + let x2apic_id = get_x2apic_id(cpu, self.get_vcpu_topology()); let lapic = LocalX2Apic { r#type: acpi::ACPI_X2APIC_PROCESSOR, length: 16, - processor_id: cpu.into(), + processor_id: cpu, apic_id: x2apic_id, flags: if cpu < self.config.boot_vcpus { 1 << MADT_CPU_ENABLE_FLAG @@ -1501,8 +1590,8 @@ impl CpuManager { r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE, length: 80, reserved0: 0, - cpu_interface_number: cpu as u32, - uid: cpu as u32, + cpu_interface_number: cpu, + uid: cpu, flags: 1, parking_version: 0, performance_interrupt: 0, @@ -1570,8 +1659,10 @@ impl CpuManager { // If topology is not specified, the default setting is: // 1 package, multiple cores, 1 thread per core // This is also the behavior when PPTT is missing. 
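// Illustrative aside (not part of the patch): how the new 4-tuple topology
// (threads_per_core, cores_per_die, dies_per_package, packages) folds back into
// the per-package core count computed by the PPTT code in the hunk right below.
// The concrete numbers are made up for the example.
fn main() {
    let (threads_per_core, cores_per_die, dies_per_package, packages) = (2u16, 4u16, 2u16, 1u16);
    let cores_per_package = cores_per_die * dies_per_package; // 4 * 2 = 8
    let total_vcpus =
        u32::from(threads_per_core) * u32::from(cores_per_package) * u32::from(packages); // 16
    assert_eq!(cores_per_package, 8);
    assert_eq!(total_vcpus, 16);
}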
- let (threads_per_core, cores_per_package, packages) = - self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1)); + let (threads_per_core, cores_per_die, dies_per_package, packages) = self + .get_vcpu_topology() + .unwrap_or((1, u16::try_from(self.max_vcpus()).unwrap(), 1, 1)); + let cores_per_package = cores_per_die * dies_per_package; let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT ", 1); @@ -1795,7 +1886,7 @@ impl CpuManager { _ => { return Err(Error::TranslateVirtualAddress(anyhow!(format!( "PA range not supported {pa_range}" - )))) + )))); } }; @@ -1910,11 +2001,7 @@ impl CpuManager { pub(crate) fn nmi(&self) -> Result<()> { self.vcpus_kick_signalled.store(true, Ordering::SeqCst); - - for state in self.vcpu_states.iter() { - state.signal_thread(); - } - + self.signal_vcpus(); self.vcpus_kick_signalled.store(false, Ordering::SeqCst); Ok(()) @@ -1922,11 +2009,11 @@ impl CpuManager { } struct Cpu { - cpu_id: u8, + cpu_id: u32, proximity_domain: u32, dynamic: bool, #[cfg(target_arch = "x86_64")] - topology: Option<(u8, u8, u8)>, + topology: Option<(u16, u16, u16, u16)>, } #[cfg(target_arch = "x86_64")] @@ -1938,12 +2025,12 @@ const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1; impl Cpu { #[cfg(target_arch = "x86_64")] fn generate_mat(&self) -> Vec { - let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology); + let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id, self.topology); let lapic = LocalX2Apic { r#type: crate::acpi::ACPI_X2APIC_PROCESSOR, length: 16, - processor_id: self.cpu_id.into(), + processor_id: self.cpu_id, apic_id: x2apic_id, flags: 1 << MADT_CPU_ENABLE_FLAG, _reserved: 0, @@ -2045,7 +2132,7 @@ impl Aml for Cpu { } struct CpuNotify { - cpu_id: u8, + cpu_id: u32, } impl Aml for CpuNotify { @@ -2060,7 +2147,7 @@ impl Aml for CpuNotify { } struct CpuMethods { - max_vcpus: u8, + max_vcpus: u32, dynamic: bool, } @@ -2098,7 +2185,7 @@ impl Aml for CpuMethods { let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new(); for cpu_id in 0..self.max_vcpus { - cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]); + cpu_notifies_refs.push(&cpu_notifies[usize::try_from(cpu_id).unwrap()]); } aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink); @@ -2276,17 +2363,11 @@ impl Pausable for CpuManager { // Tell the vCPUs to pause themselves next time they exit self.vcpus_pause_signalled.store(true, Ordering::SeqCst); - // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads - // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set - // above. - for state in self.vcpu_states.iter() { - state.signal_thread(); - } + self.signal_vcpus(); + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] for vcpu in self.vcpus.iter() { - let mut vcpu = vcpu.lock().unwrap(); - vcpu.pause()?; - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + let vcpu = vcpu.lock().unwrap(); if !self.config.kvm_hyperv { vcpu.vcpu.notify_guest_clock_paused().map_err(|e| { MigratableError::Pause(anyhow!( @@ -2301,6 +2382,7 @@ impl Pausable for CpuManager { // activated vCPU change their state to ensure they have parked. 
for state in self.vcpu_states.iter() { if state.active() { + // wait for vCPU to update state while !state.paused.load(Ordering::SeqCst) { // To avoid a priority inversion with the vCPU thread thread::sleep(std::time::Duration::from_millis(1)); @@ -2312,20 +2394,26 @@ impl Pausable for CpuManager { } fn resume(&mut self) -> std::result::Result<(), MigratableError> { - for vcpu in self.vcpus.iter() { - vcpu.lock().unwrap().resume()?; - } - - // Toggle the vCPUs pause boolean + // Ensure that vCPUs keep running after being unpark() in + // their run vCPU loop. self.vcpus_pause_signalled.store(false, Ordering::SeqCst); - // Unpark all the VCPU threads. - // Once unparked, the next thing they will do is checking for the pause - // boolean. Since it'll be set to false, they will exit their pause loop - // and go back to vmx root. - for state in self.vcpu_states.iter() { - state.paused.store(false, Ordering::SeqCst); - state.unpark_thread(); + // Unpark all the vCPU threads. + // Step 1/2: signal each thread + { + for state in self.vcpu_states.iter() { + state.unpark_thread(); + } + } + // Step 2/2: wait for state ACK + { + for state in self.vcpu_states.iter() { + // wait for vCPU to update state + while state.paused.load(Ordering::SeqCst) { + // To avoid a priority inversion with the vCPU thread + thread::sleep(std::time::Duration::from_millis(1)); + } + } } Ok(()) } @@ -2624,7 +2712,7 @@ impl CpuElf64Writable for CpuManager { pos += descsz - size_of::() - size_of::(); let orig_rax: u64 = 0; - let gregs = self.vcpus[usize::from(vcpu_id)] + let gregs = self.vcpus[usize::try_from(vcpu_id).unwrap()] .lock() .unwrap() .vcpu @@ -2652,7 +2740,7 @@ impl CpuElf64Writable for CpuManager { orig_rax, ]; - let sregs = self.vcpus[usize::from(vcpu_id)] + let sregs = self.vcpus[usize::try_from(vcpu_id).unwrap()] .lock() .unwrap() .vcpu @@ -2726,7 +2814,7 @@ impl CpuElf64Writable for CpuManager { pos += round_up!(COREDUMP_NAME_SIZE as usize, 4); - let gregs = self.vcpus[usize::from(vcpu_id)] + let gregs = self.vcpus[usize::try_from(vcpu_id).unwrap()] .lock() .unwrap() .vcpu @@ -2755,7 +2843,7 @@ impl CpuElf64Writable for CpuManager { gregs.get_r15(), ]; - let sregs = self.vcpus[usize::from(vcpu_id)] + let sregs = self.vcpus[usize::try_from(vcpu_id).unwrap()] .lock() .unwrap() .vcpu @@ -2827,8 +2915,8 @@ mod tests { use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START}; use arch::x86_64::interrupts::*; use arch::x86_64::regs::*; - use hypervisor::arch::x86::{FpuState, LapicState}; use hypervisor::StandardRegisters; + use hypervisor::arch::x86::{FpuState, LapicState}; use linux_loader::loader::bootparam::setup_header; #[test] @@ -2881,7 +2969,7 @@ mod tests { #[test] fn test_setup_msrs() { - use hypervisor::arch::x86::{msr_index, MsrEntry}; + use hypervisor::arch::x86::{MsrEntry, msr_index}; let hv = hypervisor::new().unwrap(); let vm = hv.create_vm().expect("new VM fd creation failed"); @@ -2963,19 +3051,19 @@ mod tests { #[cfg(test)] mod tests { #[cfg(feature = "kvm")] - use std::mem; + use std::{mem, mem::offset_of}; use arch::layout; + use hypervisor::HypervisorCpuError; use hypervisor::arch::aarch64::regs::MPIDR_EL1; #[cfg(feature = "kvm")] + use hypervisor::arm64_core_reg_id; + #[cfg(feature = "kvm")] use hypervisor::kvm::aarch64::is_system_register; #[cfg(feature = "kvm")] use hypervisor::kvm::kvm_bindings::{ - user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64, + KVM_REG_ARM_CORE, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_SIZE_U64, user_pt_regs, }; - use 
hypervisor::HypervisorCpuError; - #[cfg(feature = "kvm")] - use hypervisor::{arm64_core_reg_id, offset_of}; #[test] fn test_setup_regs() { diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index fccdd75bfa..4206aa155d 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -9,9 +9,9 @@ // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause // -use std::collections::{BTreeMap, BTreeSet, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; use std::fs::{File, OpenOptions}; -use std::io::{self, stdout, IsTerminal, Seek, SeekFrom}; +use std::io::{self, IsTerminal, Seek, SeekFrom, stdout}; use std::num::Wrapping; use std::os::unix::fs::OpenOptionsExt; use std::os::unix::io::{AsRawFd, FromRawFd}; @@ -23,14 +23,14 @@ use std::time::Instant; use acpi_tables::sdt::GenericAddress; #[cfg(not(target_arch = "riscv64"))] -use acpi_tables::{aml, Aml}; +use acpi_tables::{Aml, aml}; #[cfg(not(target_arch = "riscv64"))] use anyhow::anyhow; #[cfg(target_arch = "x86_64")] use arch::layout::{APIC_START, IOAPIC_SIZE, IOAPIC_START}; -use arch::{layout, NumaNodes}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use arch::{DeviceType, MmioDeviceInfo}; +use arch::{NumaNodes, layout}; use block::async_io::DiskFile; use block::fixed_vhd_sync::FixedVhdDiskSync; use block::qcow_sync::QcowDiskSync; @@ -38,7 +38,7 @@ use block::raw_async_aio::RawFileDiskAio; use block::raw_sync::RawFileDiskSync; use block::vhdx_sync::VhdxDiskSync; use block::{ - block_aio_is_supported, block_io_uring_is_supported, detect_image_type, qcow, vhdx, ImageType, + ImageType, block_aio_is_supported, block_io_uring_is_supported, detect_image_type, qcow, vhdx, }; #[cfg(feature = "io_uring")] use block::{fixed_vhd_async::FixedVhdDiskAsync, raw_async::RawFileDisk}; @@ -53,19 +53,28 @@ use devices::gic; use devices::interrupt_controller::InterruptController; #[cfg(target_arch = "x86_64")] use devices::ioapic; +#[cfg(feature = "ivshmem")] +use devices::ivshmem::{IvshmemError, IvshmemOps}; #[cfg(target_arch = "aarch64")] use devices::legacy::Pl011; #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] use devices::legacy::Serial; +#[cfg(all(feature = "fw_cfg", target_arch = "x86_64"))] +use devices::legacy::fw_cfg::FW_CFG_ACPI_ID; +#[cfg(feature = "fw_cfg")] +use devices::legacy::{ + FwCfg, + fw_cfg::{PORT_FW_CFG_BASE, PORT_FW_CFG_WIDTH}, +}; #[cfg(feature = "pvmemcontrol")] use devices::pvmemcontrol::{PvmemcontrolBusDevice, PvmemcontrolPciDevice}; -use devices::{interrupt_controller, AcpiNotificationFlags}; +use devices::{AcpiNotificationFlags, interrupt_controller}; +use hypervisor::IoEventAddress; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; -use hypervisor::IoEventAddress; use libc::{ - tcsetattr, termios, MAP_NORESERVE, MAP_PRIVATE, MAP_SHARED, O_TMPFILE, PROT_READ, PROT_WRITE, - TCSANOW, + MAP_NORESERVE, MAP_PRIVATE, MAP_SHARED, O_TMPFILE, PROT_READ, PROT_WRITE, TCSANOW, tcsetattr, + termios, }; use pci::{ DeviceRelocation, MmioRegion, PciBarRegionType, PciBdf, PciDevice, VfioDmaMapping, @@ -88,32 +97,34 @@ use vm_device::dma_mapping::ExternalDmaMapping; use vm_device::interrupt::{ InterruptIndex, InterruptManager, LegacyIrqGroupConfig, MsiIrqGroupConfig, }; -use vm_device::{Bus, BusDevice, BusDeviceSync, Resource}; +use vm_device::{Bus, BusDevice, BusDeviceSync, Resource, UserspaceMapping}; use vm_memory::guest_memory::FileOffset; use vm_memory::{Address, GuestAddress, GuestMemoryRegion, GuestUsize, MmapRegion}; 
#[cfg(target_arch = "x86_64")] use vm_memory::{GuestAddressSpace, GuestMemory}; use vm_migration::protocol::MemoryRangeTable; use vm_migration::{ - snapshot_from_id, state_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, - Snapshottable, Transportable, + Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable, + snapshot_from_id, state_from_id, }; use vm_virtio::{AccessPlatform, VirtioDeviceType}; use vmm_sys_util::eventfd::EventFd; use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleOutput}; -use crate::cpu::{CpuManager, CPU_MANAGER_ACPI_SIZE}; +use crate::cpu::{CPU_MANAGER_ACPI_SIZE, CpuManager}; use crate::device_tree::{DeviceNode, DeviceTree}; use crate::interrupt::{LegacyUserspaceInterruptManager, MsiInterruptManager}; -use crate::memory_manager::{Error as MemoryManagerError, MemoryManager, MEMORY_MANAGER_ACPI_SIZE}; +use crate::memory_manager::{Error as MemoryManagerError, MEMORY_MANAGER_ACPI_SIZE, MemoryManager}; use crate::pci_segment::PciSegment; use crate::serial_manager::{Error as SerialManagerError, SerialManager}; +#[cfg(feature = "ivshmem")] +use crate::vm_config::IvshmemConfig; use crate::vm_config::{ - ConsoleOutputMode, DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, - VdpaConfig, VhostMode, VmConfig, VsockConfig, DEFAULT_IOMMU_ADDRESS_WIDTH_BITS, - DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT, + ConsoleOutputMode, DEFAULT_IOMMU_ADDRESS_WIDTH_BITS, DEFAULT_PCI_SEGMENT_APERTURE_WEIGHT, + DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, + VhostMode, VmConfig, VsockConfig, }; -use crate::{device_node, GuestRegionMmap, PciDeviceInfo, DEVICE_MANAGER_SNAPSHOT_ID}; +use crate::{DEVICE_MANAGER_SNAPSHOT_ID, GuestRegionMmap, PciDeviceInfo, device_node}; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] const MMIO_LEN: u64 = 0x1000; @@ -133,6 +144,8 @@ const PVMEMCONTROL_DEVICE_NAME: &str = "__pvmemcontrol"; const BALLOON_DEVICE_NAME: &str = "__balloon"; const CONSOLE_DEVICE_NAME: &str = "__console"; const PVPANIC_DEVICE_NAME: &str = "__pvpanic"; +#[cfg(feature = "ivshmem")] +const IVSHMEM_DEVICE_NAME: &str = "__ivshmem"; // Devices that the user may name and for which we generate // identifiers if the user doesn't give one @@ -454,7 +467,7 @@ pub enum DeviceManagerError { /// Failed to find an available PCI device ID. #[error("Failed to find an available PCI device ID")] - NextPciDeviceId(#[source] pci::PciRootError), + AllocatePciDeviceId(#[source] pci::PciRootError), /// Could not reserve the PCI device ID. #[error("Could not reserve the PCI device ID")] @@ -625,6 +638,11 @@ pub enum DeviceManagerError { #[error("Cannot create a PvPanic device")] PvPanicCreate(#[source] devices::pvpanic::PvPanicError), + #[cfg(feature = "ivshmem")] + /// Cannot create a ivshmem device + #[error("Cannot create a ivshmem device: {0}")] + IvshmemCreate(IvshmemError), + /// Cannot create a RateLimiterGroup #[error("Cannot create a RateLimiterGroup")] RateLimiterGroupCreate(#[source] rate_limiter::group::Error), @@ -644,6 +662,15 @@ pub enum DeviceManagerError { /// Cannot lock images of all block devices. #[error("Cannot lock images of all block devices")] DiskLockError(#[source] virtio_devices::block::Error), + + #[cfg(feature = "fw_cfg")] + /// Error adding fw_cfg to bus. + #[error("Error adding fw_cfg to bus")] + ErrorAddingFwCfgToBus(#[source] vm_device::BusError), + + /// Disk resizing failed. 
+ #[error("Disk resize error")] + DiskResizeError(#[source] virtio_devices::block::Error), } pub type DeviceManagerResult = result::Result; @@ -749,12 +776,13 @@ impl DeviceRelocation for AddressManager { if let Some(node) = self.device_tree.lock().unwrap().get_mut(&id) { let mut resource_updated = false; for resource in node.resources.iter_mut() { - if let Resource::PciBar { base, type_, .. } = resource { - if PciBarRegionType::from(*type_) == region_type && *base == old_base { - *base = new_base; - resource_updated = true; - break; - } + if let Resource::PciBar { base, type_, .. } = resource + && PciBarRegionType::from(*type_) == region_type + && *base == old_base + { + *base = new_base; + resource_updated = true; + break; } } @@ -791,43 +819,41 @@ impl DeviceRelocation for AddressManager { } else { let virtio_dev = virtio_pci_dev.virtio_device(); let mut virtio_dev = virtio_dev.lock().unwrap(); - if let Some(mut shm_regions) = virtio_dev.get_shm_regions() { - if shm_regions.addr.raw_value() == old_base { - let mem_region = self.vm.make_user_memory_region( - shm_regions.mem_slot, - old_base, - shm_regions.len, - shm_regions.host_addr, - false, - false, - ); - - self.vm.remove_user_memory_region(mem_region).map_err(|e| { - io::Error::other(format!("failed to remove user memory region: {e:?}")) - })?; + if let Some(mut shm_regions) = virtio_dev.get_shm_regions() + && shm_regions.addr.raw_value() == old_base + { + let mem_region = self.vm.make_user_memory_region( + shm_regions.mem_slot, + old_base, + shm_regions.len, + shm_regions.host_addr, + false, + false, + ); - // Create new mapping by inserting new region to KVM. - let mem_region = self.vm.make_user_memory_region( - shm_regions.mem_slot, - new_base, - shm_regions.len, - shm_regions.host_addr, - false, - false, - ); - - self.vm.create_user_memory_region(mem_region).map_err(|e| { - io::Error::other(format!("failed to create user memory regions: {e:?}")) - })?; + self.vm.remove_user_memory_region(mem_region).map_err(|e| { + io::Error::other(format!("failed to remove user memory region: {e:?}")) + })?; - // Update shared memory regions to reflect the new mapping. - shm_regions.addr = GuestAddress(new_base); - virtio_dev.set_shm_regions(shm_regions).map_err(|e| { - io::Error::other(format!( - "failed to update shared memory regions: {e:?}" - )) - })?; - } + // Create new mapping by inserting new region to KVM. + let mem_region = self.vm.make_user_memory_region( + shm_regions.mem_slot, + new_base, + shm_regions.len, + shm_regions.host_addr, + false, + false, + ); + + self.vm.create_user_memory_region(mem_region).map_err(|e| { + io::Error::other(format!("failed to create user memory regions: {e:?}")) + })?; + + // Update shared memory regions to reflect the new mapping. + shm_regions.addr = GuestAddress(new_base); + virtio_dev.set_shm_regions(shm_regions).map_err(|e| { + io::Error::other(format!("failed to update shared memory regions: {e:?}")) + })?; } } } @@ -870,6 +896,7 @@ struct MetaVirtioDevice { iommu: bool, id: String, pci_segment: u16, + bdf_device: Option, dma_handler: Option>, } @@ -955,7 +982,7 @@ pub struct DeviceManager { cpu_manager: Arc>, // The virtio devices on the system - virtio_devices: Vec, + virtio_devices: VecDeque, /// All disks. Needed for locking and unlocking the images. 
block_devices: Vec>>, @@ -1070,6 +1097,13 @@ pub struct DeviceManager { rate_limit_groups: HashMap>, mmio_regions: Arc>>, + + #[cfg(feature = "fw_cfg")] + fw_cfg: Option>>, + + #[cfg(feature = "ivshmem")] + // ivshmem device + ivshmem_device: Option>>, } fn create_mmio_allocators( @@ -1287,7 +1321,7 @@ impl DeviceManager { config, memory_manager, cpu_manager, - virtio_devices: Vec::new(), + virtio_devices: VecDeque::new(), block_devices: vec![], bus_devices: Vec::new(), device_id_cnt, @@ -1334,6 +1368,10 @@ impl DeviceManager { snapshot, rate_limit_groups, mmio_regions: Arc::new(Mutex::new(Vec::new())), + #[cfg(feature = "fw_cfg")] + fw_cfg: None, + #[cfg(feature = "ivshmem")] + ivshmem_device: None, }; let device_manager = Arc::new(Mutex::new(device_manager)); @@ -1369,8 +1407,6 @@ impl DeviceManager { ) -> DeviceManagerResult<()> { trace_scoped!("create_devices"); - let mut virtio_devices: Vec = Vec::new(); - self.cpu_manager .lock() .unwrap() @@ -1421,12 +1457,8 @@ impl DeviceManager { self.original_termios_opt = original_termios_opt; - self.console = self.add_console_devices( - &legacy_interrupt_manager, - &mut virtio_devices, - console_info, - console_resize_pipe, - )?; + self.console = + self.add_console_devices(&legacy_interrupt_manager, console_info, console_resize_pipe)?; #[cfg(not(target_arch = "riscv64"))] if let Some(tpm) = self.config.clone().lock().unwrap().tpm.as_ref() { @@ -1436,11 +1468,8 @@ impl DeviceManager { } self.legacy_interrupt_manager = Some(legacy_interrupt_manager); - virtio_devices.append(&mut self.make_virtio_devices()?); - - self.add_pci_devices(virtio_devices.clone())?; - - self.virtio_devices = virtio_devices; + self.make_virtio_devices()?; + self.add_pci_devices()?; // Add pvmemcontrol if required #[cfg(feature = "pvmemcontrol")] @@ -1457,6 +1486,57 @@ impl DeviceManager { self.pvpanic_device = self.add_pvpanic_device()?; } + #[cfg(feature = "ivshmem")] + if let Some(ivshmem) = self.config.clone().lock().unwrap().ivshmem.as_ref() { + self.ivshmem_device = self.add_ivshmem_device(ivshmem)?; + } + + Ok(()) + } + + #[cfg(feature = "fw_cfg")] + pub fn create_fw_cfg_device(&mut self) -> Result<(), DeviceManagerError> { + let fw_cfg = Arc::new(Mutex::new(devices::legacy::FwCfg::new( + self.memory_manager.lock().as_ref().unwrap().guest_memory(), + ))); + + self.fw_cfg = Some(fw_cfg.clone()); + + self.bus_devices + .push(Arc::clone(&fw_cfg) as Arc); + + #[cfg(target_arch = "x86_64")] + self.address_manager + .io_bus + .insert(fw_cfg, PORT_FW_CFG_BASE, PORT_FW_CFG_WIDTH) + .map_err(DeviceManagerError::ErrorAddingFwCfgToBus)?; + + // default address for fw_cfg on arm via mmio + // https://github.com/torvalds/linux/blob/master/drivers/firmware/qemu_fw_cfg.c#L27 + #[cfg(target_arch = "aarch64")] + { + self.address_manager + .mmio_bus + .insert(fw_cfg.clone(), PORT_FW_CFG_BASE, PORT_FW_CFG_WIDTH) + .map_err(DeviceManagerError::ErrorAddingFwCfgToBus)?; + + let fw_cfg_irq = self + .address_manager + .allocator + .lock() + .unwrap() + .allocate_irq() + .unwrap(); + + self.id_to_dev_info.insert( + (DeviceType::FwCfg, "fw-cfg".to_string()), + MmioDeviceInfo { + addr: PORT_FW_CFG_BASE, + len: PORT_FW_CFG_WIDTH, + irq: fw_cfg_irq, + }, + ); + } Ok(()) } @@ -1498,10 +1578,7 @@ impl DeviceManager { } #[allow(unused_variables)] - fn add_pci_devices( - &mut self, - virtio_devices: Vec, - ) -> DeviceManagerResult<()> { + fn add_pci_devices(&mut self) -> DeviceManagerResult<()> { let iommu_id = String::from(IOMMU_DEVICE_NAME); let iommu_address_width_bits = @@ -1543,7 +1620,7 @@ 
impl DeviceManager { let mut iommu_attached_devices = Vec::new(); { - for handle in virtio_devices { + for handle in self.virtio_devices.clone() { let mapping: Option> = if handle.iommu { self.iommu_mapping.clone() } else { @@ -1556,6 +1633,7 @@ impl DeviceManager { handle.id, handle.pci_segment, handle.dma_handler, + handle.bdf_device, )?; if handle.iommu { @@ -1570,21 +1648,22 @@ impl DeviceManager { iommu_attached_devices.append(&mut vfio_user_iommu_device_ids); // Add all devices from forced iommu segments - if let Some(platform_config) = self.config.lock().unwrap().platform.as_ref() { - if let Some(iommu_segments) = platform_config.iommu_segments.as_ref() { - for segment in iommu_segments { - for device in 0..32 { - let bdf = PciBdf::new(*segment, 0, device, 0); - if !iommu_attached_devices.contains(&bdf) { - iommu_attached_devices.push(bdf); - } + if let Some(platform_config) = self.config.lock().unwrap().platform.as_ref() + && let Some(iommu_segments) = platform_config.iommu_segments.as_ref() + { + for segment in iommu_segments { + for device in 0..32 { + let bdf = PciBdf::new(*segment, 0, device, 0); + if !iommu_attached_devices.contains(&bdf) { + iommu_attached_devices.push(bdf); } } } } if let Some(iommu_device) = iommu_device { - let dev_id = self.add_virtio_pci_device(iommu_device, &None, iommu_id, 0, None)?; + let dev_id = + self.add_virtio_pci_device(iommu_device, &None, iommu_id, 0, None, None)?; self.iommu_attached_devices = Some((dev_id, iommu_attached_devices)); } } @@ -1662,7 +1741,7 @@ impl DeviceManager { ) -> DeviceManagerResult>> { let interrupt_controller: Arc> = Arc::new(Mutex::new( aia::Aia::new( - self.config.lock().unwrap().cpus.boot_vcpus, + self.config.lock().unwrap().cpus.boot_vcpus as u32, Arc::clone(&self.msi_interrupt_manager), self.address_manager.vm.clone(), ) @@ -1931,13 +2010,7 @@ impl DeviceManager { .allocate_irq() .unwrap(); - let interrupt_group = interrupt_manager - .create_group(LegacyIrqGroupConfig { - irq: rtc_irq as InterruptIndex, - }) - .map_err(DeviceManagerError::CreateInterruptGroup)?; - - let rtc_device = Arc::new(Mutex::new(devices::legacy::Rtc::new(interrupt_group))); + let rtc_device = Arc::new(Mutex::new(devices::legacy::Rtc::new())); self.bus_devices .push(Arc::clone(&rtc_device) as Arc); @@ -2237,7 +2310,6 @@ impl DeviceManager { fn add_virtio_console_device( &mut self, - virtio_devices: &mut Vec, console_fd: ConsoleOutput, resize_pipe: Option>, ) -> DeviceManagerResult>> { @@ -2272,6 +2344,9 @@ impl DeviceManager { ConsoleOutput::Socket(_) => { return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); } + ConsoleOutput::Tcp(_, _) => { + return Err(DeviceManagerError::NoSocketOptionSupportForConsoleDevice); + } ConsoleOutput::Null => Endpoint::Null, ConsoleOutput::Off => return Ok(None), }; @@ -2293,14 +2368,21 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioConsole)?; let virtio_console_device = Arc::new(Mutex::new(virtio_console_device)); - virtio_devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_console_device) as Arc>, iommu: console_config.iommu, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: console_config.bdf_device, + }; + + if console_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } // Fill the device tree with a new node. 
In case of restore, we // know there is nothing to do, so we can simply override the @@ -2326,7 +2408,6 @@ impl DeviceManager { fn add_console_devices( &mut self, interrupt_manager: &Arc>, - virtio_devices: &mut Vec, console_info: Option, console_resize_pipe: Option>, ) -> DeviceManagerResult> { @@ -2346,12 +2427,16 @@ impl DeviceManager { | ConsoleOutput::Null | ConsoleOutput::Pty(_) | ConsoleOutput::Socket(_) => None, + ConsoleOutput::Tcp(_, _) => None, }; if !matches!(console_info.serial_main_fd, ConsoleOutput::Off) { let serial = self.add_serial_device(interrupt_manager, serial_writer)?; self.serial_manager = match console_info.serial_main_fd { - ConsoleOutput::Pty(_) | ConsoleOutput::Tty(_) | ConsoleOutput::Socket(_) => { + ConsoleOutput::Pty(_) + | ConsoleOutput::Tty(_) + | ConsoleOutput::Socket(_) + | ConsoleOutput::Tcp(_, _) => { let serial_manager = SerialManager::new( serial, console_info.serial_main_fd, @@ -2384,17 +2469,15 @@ impl DeviceManager { | ConsoleOutput::Null | ConsoleOutput::Pty(_) | ConsoleOutput::Socket(_) => None, + ConsoleOutput::Tcp(_, _) => None, }; if let Some(writer) = debug_console_writer { let _ = self.add_debug_console_device(writer)?; } } - let console_resizer = self.add_virtio_console_device( - virtio_devices, - console_info.console_main_fd, - console_resize_pipe, - )?; + let console_resizer = + self.add_virtio_console_device(console_info.console_main_fd, console_resize_pipe)?; Ok(Arc::new(Console { console_resizer })) } @@ -2452,35 +2535,33 @@ impl DeviceManager { Ok(()) } - fn make_virtio_devices(&mut self) -> DeviceManagerResult> { - let mut devices: Vec = Vec::new(); - + fn make_virtio_devices(&mut self) -> DeviceManagerResult<()> { // Create "standard" virtio devices (net/block/rng) - devices.append(&mut self.make_virtio_block_devices()?); - devices.append(&mut self.make_virtio_net_devices()?); - devices.append(&mut self.make_virtio_rng_devices()?); + self.make_virtio_block_devices()?; + self.make_virtio_net_devices()?; + self.make_virtio_rng_devices()?; // Add virtio-fs if required - devices.append(&mut self.make_virtio_fs_devices()?); + self.make_virtio_fs_devices()?; // Add virtio-pmem if required - devices.append(&mut self.make_virtio_pmem_devices()?); + self.make_virtio_pmem_devices()?; // Add virtio-vsock if required - devices.append(&mut self.make_virtio_vsock_devices()?); + self.make_virtio_vsock_devices()?; - devices.append(&mut self.make_virtio_mem_devices()?); + self.make_virtio_mem_devices()?; // Add virtio-balloon if required - devices.append(&mut self.make_virtio_balloon_devices()?); + self.make_virtio_balloon_devices()?; // Add virtio-watchdog device - devices.append(&mut self.make_virtio_watchdog_devices()?); + self.make_virtio_watchdog_devices()?; // Add vDPA devices if required - devices.append(&mut self.make_vdpa_devices()?); + self.make_vdpa_devices()?; - Ok(devices) + Ok(()) } // Cache whether aio is supported to avoid checking for very block device @@ -2748,21 +2829,25 @@ impl DeviceManager { id, pci_segment: disk_cfg.pci_segment, dma_handler: None, + bdf_device: disk_cfg.bdf_device, }) } - fn make_virtio_block_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_block_devices(&mut self) -> DeviceManagerResult<()> { let mut block_devices = self.config.lock().unwrap().disks.clone(); if let Some(disk_list_cfg) = &mut block_devices { for disk_cfg in disk_list_cfg.iter_mut() { - devices.push(self.make_virtio_block_device(disk_cfg, false)?); + let device = 
self.make_virtio_block_device(disk_cfg, false)?; + if disk_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().disks = block_devices; - Ok(devices) + Ok(()) } fn make_virtio_net_device( @@ -2780,6 +2865,7 @@ impl DeviceManager { let (virtio_device, migratable_device) = if net_cfg.vhost_user { let socket = net_cfg.vhost_socket.as_ref().unwrap().clone(); + debug!("Creating virtio-net device with vhost-user backend: {socket}"); let vu_cfg = VhostUserConfig { socket, num_queues: net_cfg.num_queues, @@ -2822,6 +2908,7 @@ impl DeviceManager { let state = state_from_id(self.snapshot.as_ref(), id.as_str()) .map_err(DeviceManagerError::RestoreGetState)?; let virtio_net = if let Some(ref tap_if_name) = net_cfg.tap { + debug!("Creating virtio-net device from Tap device: {tap_if_name}"); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2847,6 +2934,7 @@ impl DeviceManager { .map_err(DeviceManagerError::CreateVirtioNet)?, )) } else if let Some(fds) = &net_cfg.fds { + debug!("Creating virtio-net device from network FDs: {fds:?}"); let net = virtio_devices::Net::from_tap_fds( id.clone(), fds, @@ -2873,6 +2961,9 @@ impl DeviceManager { Arc::new(Mutex::new(net)) } else { + debug!( + "Creating virtio-net device: no ifname or FDs given, creating new Tap device" + ); Arc::new(Mutex::new( virtio_devices::Net::new( id.clone(), @@ -2919,26 +3010,29 @@ impl DeviceManager { id, pci_segment: net_cfg.pci_segment, dma_handler: None, + bdf_device: net_cfg.bdf_device, }) } /// Add virto-net and vhost-user-net devices - fn make_virtio_net_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); + fn make_virtio_net_devices(&mut self) -> DeviceManagerResult<()> { let mut net_devices = self.config.lock().unwrap().net.clone(); if let Some(net_list_cfg) = &mut net_devices { for net_cfg in net_list_cfg.iter_mut() { - devices.push(self.make_virtio_net_device(net_cfg)?); + let device = self.make_virtio_net_device(net_cfg)?; + if net_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().net = net_devices; - Ok(devices) + Ok(()) } - fn make_virtio_rng_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_rng_devices(&mut self) -> DeviceManagerResult<()> { // Add virtio-rng if required let rng_config = self.config.lock().unwrap().rng.clone(); if let Some(rng_path) = rng_config.src.to_str() { @@ -2959,14 +3053,20 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioRng)?, )); - devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_rng_device) as Arc>, iommu: rng_config.iommu, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: rng_config.bdf_device, + }; + if rng_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } // Fill the device tree with a new node. 
In case of restore, we // know there is nothing to do, so we can simply override the @@ -2977,7 +3077,7 @@ impl DeviceManager { .insert(id.clone(), device_node!(id, virtio_rng_device)); } - Ok(devices) + Ok(()) } fn make_virtio_fs_device( @@ -3027,24 +3127,28 @@ impl DeviceManager { id, pci_segment: fs_cfg.pci_segment, dma_handler: None, + bdf_device: fs_cfg.bdf_device, }) } else { Err(DeviceManagerError::NoVirtioFsSock) } } - fn make_virtio_fs_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_fs_devices(&mut self) -> DeviceManagerResult<()> { let mut fs_devices = self.config.lock().unwrap().fs.clone(); if let Some(fs_list_cfg) = &mut fs_devices { for fs_cfg in fs_list_cfg.iter_mut() { - devices.push(self.make_virtio_fs_device(fs_cfg)?); + let device = self.make_virtio_fs_device(fs_cfg)?; + if fs_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().fs = fs_devices; - Ok(devices) + Ok(()) } fn make_virtio_pmem_device( @@ -3174,7 +3278,7 @@ impl DeviceManager { .create_userspace_mapping(region_base, region_size, host_addr, false, false, false) .map_err(DeviceManagerError::MemoryManager)?; - let mapping = virtio_devices::UserspaceMapping { + let mapping = UserspaceMapping { host_addr, mem_slot, addr: GuestAddress(region_base), @@ -3216,21 +3320,26 @@ impl DeviceManager { id, pci_segment: pmem_cfg.pci_segment, dma_handler: None, + bdf_device: pmem_cfg.bdf_device, }) } - fn make_virtio_pmem_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); + fn make_virtio_pmem_devices(&mut self) -> DeviceManagerResult<()> { // Add virtio-pmem if required let mut pmem_devices = self.config.lock().unwrap().pmem.clone(); if let Some(pmem_list_cfg) = &mut pmem_devices { for pmem_cfg in pmem_list_cfg.iter_mut() { - devices.push(self.make_virtio_pmem_device(pmem_cfg)?); + let device = self.make_virtio_pmem_device(pmem_cfg)?; + if pmem_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().pmem = pmem_devices; - Ok(devices) + Ok(()) } fn make_virtio_vsock_device( @@ -3287,24 +3396,26 @@ impl DeviceManager { id, pci_segment: vsock_cfg.pci_segment, dma_handler: None, + bdf_device: vsock_cfg.bdf_device, }) } - fn make_virtio_vsock_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_vsock_devices(&mut self) -> DeviceManagerResult<()> { let mut vsock = self.config.lock().unwrap().vsock.clone(); - if let Some(ref mut vsock_cfg) = &mut vsock { - devices.push(self.make_virtio_vsock_device(vsock_cfg)?); + if let Some(vsock_cfg) = &mut vsock { + let device = self.make_virtio_vsock_device(vsock_cfg)?; + if vsock_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } self.config.lock().unwrap().vsock = vsock; - Ok(devices) + Ok(()) } - fn make_virtio_mem_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_mem_devices(&mut self) -> DeviceManagerResult<()> { let mm = self.memory_manager.clone(); let mut mm = mm.lock().unwrap(); for (memory_zone_id, memory_zone) in mm.memory_zones_mut().iter_mut() { @@ -3339,13 +3450,14 @@ impl DeviceManager { self.virtio_mem_devices.push(Arc::clone(&virtio_mem_device)); - devices.push(MetaVirtioDevice { + self.virtio_devices.push_back(MetaVirtioDevice { 
virtio_device: Arc::clone(&virtio_mem_device) as Arc>, iommu: false, id: memory_zone_id.clone(), pci_segment: 0, dma_handler: None, + bdf_device: None, }); // Fill the device tree with a new node. In case of restore, we @@ -3358,7 +3470,7 @@ impl DeviceManager { } } - Ok(devices) + Ok(()) } #[cfg(feature = "pvmemcontrol")] @@ -3372,7 +3484,7 @@ impl DeviceManager { let pci_segment_id = 0x0_u16; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; info!("Creating pvmemcontrol device: id = {}", id); let (pvmemcontrol_pci_device, pvmemcontrol_bus_device) = @@ -3403,9 +3515,7 @@ impl DeviceManager { Ok((pvmemcontrol_bus_device, pvmemcontrol_pci_device)) } - fn make_virtio_balloon_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_balloon_devices(&mut self) -> DeviceManagerResult<()> { if let Some(balloon_config) = &self.config.lock().unwrap().balloon { let id = String::from(BALLOON_DEVICE_NAME); info!("Creating virtio-balloon device: id = {}", id); @@ -3428,14 +3538,21 @@ impl DeviceManager { self.balloon = Some(virtio_balloon_device.clone()); - devices.push(MetaVirtioDevice { + let device = MetaVirtioDevice { virtio_device: Arc::clone(&virtio_balloon_device) as Arc>, iommu: false, id: id.clone(), pci_segment: 0, dma_handler: None, - }); + bdf_device: balloon_config.bdf_device, + }; + + if balloon_config.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } self.device_tree .lock() @@ -3443,14 +3560,12 @@ impl DeviceManager { .insert(id.clone(), device_node!(id, virtio_balloon_device)); } - Ok(devices) + Ok(()) } - fn make_virtio_watchdog_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); - + fn make_virtio_watchdog_devices(&mut self) -> DeviceManagerResult<()> { if !self.config.lock().unwrap().watchdog { - return Ok(devices); + return Ok(()); } let id = String::from(WATCHDOG_DEVICE_NAME); @@ -3469,13 +3584,14 @@ impl DeviceManager { ) .map_err(DeviceManagerError::CreateVirtioWatchdog)?, )); - devices.push(MetaVirtioDevice { + self.virtio_devices.push_back(MetaVirtioDevice { virtio_device: Arc::clone(&virtio_watchdog_device) as Arc>, iommu: false, id: id.clone(), pci_segment: 0, dma_handler: None, + bdf_device: None, }); self.device_tree @@ -3483,7 +3599,7 @@ impl DeviceManager { .unwrap() .insert(id.clone(), device_node!(id, virtio_watchdog_device)); - Ok(devices) + Ok(()) } fn make_vdpa_device( @@ -3534,21 +3650,26 @@ impl DeviceManager { id, pci_segment: vdpa_cfg.pci_segment, dma_handler: Some(vdpa_mapping), + bdf_device: vdpa_cfg.bdf_device, }) } - fn make_vdpa_devices(&mut self) -> DeviceManagerResult> { - let mut devices = Vec::new(); + fn make_vdpa_devices(&mut self) -> DeviceManagerResult<()> { // Add vdpa if required let mut vdpa_devices = self.config.lock().unwrap().vdpa.clone(); if let Some(vdpa_list_cfg) = &mut vdpa_devices { for vdpa_cfg in vdpa_list_cfg.iter_mut() { - devices.push(self.make_vdpa_device(vdpa_cfg)?); + let device = self.make_vdpa_device(vdpa_cfg)?; + if vdpa_cfg.bdf_device.is_some() { + self.virtio_devices.push_front(device); + } else { + self.virtio_devices.push_back(device); + } } } self.config.lock().unwrap().vdpa = vdpa_devices; - Ok(devices) + Ok(()) } fn next_device_name(&mut self, prefix: &str) -> DeviceManagerResult { @@ -3620,7 +3741,7 @@ impl DeviceManager { }; let (pci_segment_id, pci_device_bdf, resources) = - 
self.pci_resources(&vfio_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_name, device_cfg.pci_segment, None)?; let mut needs_dma_mapping = false; @@ -3857,7 +3978,7 @@ impl DeviceManager { }; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&vfio_user_name, device_cfg.pci_segment)?; + self.pci_resources(&vfio_user_name, device_cfg.pci_segment, None)?; let legacy_interrupt_group = if let Some(legacy_interrupt_manager) = &self.legacy_interrupt_manager { @@ -3969,6 +4090,7 @@ impl DeviceManager { virtio_device_id: String, pci_segment_id: u16, dma_handler: Option>, + bdf_device: Option, ) -> DeviceManagerResult { let id = format!("{VIRTIO_PCI_DEVICE_NAME_PREFIX}-{virtio_device_id}"); @@ -3977,7 +4099,7 @@ impl DeviceManager { node.children = vec![virtio_device_id.clone()]; let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, bdf_device)?; // Update the existing virtio node by setting the parent. if let Some(node) = self.device_tree.lock().unwrap().get_mut(&virtio_device_id) { @@ -4114,7 +4236,7 @@ impl DeviceManager { info!("Creating pvpanic device {}", id); let (pci_segment_id, pci_device_bdf, resources) = - self.pci_resources(&id, pci_segment_id)?; + self.pci_resources(&id, pci_segment_id, None)?; let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); @@ -4142,10 +4264,62 @@ impl DeviceManager { Ok(Some(pvpanic_device)) } + #[cfg(feature = "ivshmem")] + fn add_ivshmem_device( + &mut self, + ivshmem_cfg: &IvshmemConfig, + ) -> DeviceManagerResult>>> { + let id = String::from(IVSHMEM_DEVICE_NAME); + let pci_segment_id = 0x0_u16; + info!("Creating ivshmem device {}", id); + + let (pci_segment_id, pci_device_bdf, resources) = + self.pci_resources(&id, pci_segment_id, None)?; + let snapshot = snapshot_from_id(self.snapshot.as_ref(), id.as_str()); + + let ivshmem_ops = Arc::new(Mutex::new(IvshmemHandler { + memory_manager: self.memory_manager.clone(), + })); + let ivshmem_device = Arc::new(Mutex::new( + devices::IvshmemDevice::new( + id.clone(), + ivshmem_cfg.size as u64, + Some(ivshmem_cfg.path.clone()), + ivshmem_ops.clone(), + snapshot, + ) + .map_err(DeviceManagerError::IvshmemCreate)?, + )); + let new_resources = self.add_pci_device( + ivshmem_device.clone(), + ivshmem_device.clone(), + pci_segment_id, + pci_device_bdf, + resources, + )?; + + let start_addr = ivshmem_device.lock().unwrap().data_bar_addr(); + let (region, mapping) = ivshmem_ops + .lock() + .unwrap() + .map_ram_region(start_addr, ivshmem_cfg.size, Some(ivshmem_cfg.path.clone())) + .map_err(DeviceManagerError::IvshmemCreate)?; + ivshmem_device.lock().unwrap().set_region(region, mapping); + + let mut node = device_node!(id, ivshmem_device); + node.resources = new_resources; + node.pci_bdf = Some(pci_device_bdf); + node.pci_device_handle = None; + self.device_tree.lock().unwrap().insert(id, node); + + Ok(Some(ivshmem_device)) + } + fn pci_resources( &self, id: &str, pci_segment_id: u16, + pci_device_id: Option, ) -> DeviceManagerResult<(u16, PciBdf, Option>)> { // Look for the id in the device tree. If it can be found, that means // the device is being restored, otherwise it's created from scratch. 
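Note on the queueing pattern repeated above: devices that carry a bdf_device are pushed to the front of self.virtio_devices, everything else to the back, presumably so that user-pinned PCI slots are claimed before any automatic BDF allocation happens. The following is a minimal, self-contained sketch of that ordering policy only, using hypothetical simplified types rather than the real MetaVirtioDevice or PciBdf:

use std::collections::VecDeque;

// Hypothetical stand-in for MetaVirtioDevice: only the fields needed here.
#[derive(Debug)]
struct QueuedDevice {
    id: String,
    bdf_device: Option<u8>, // user-requested PCI device/slot number, if any
}

// Devices with an explicit BDF go to the front of the queue so that their
// requested slots are still free when the queue is later drained onto the
// PCI bus; everything else is appended and gets the next free slot.
fn enqueue(queue: &mut VecDeque<QueuedDevice>, dev: QueuedDevice) {
    if dev.bdf_device.is_some() {
        queue.push_front(dev);
    } else {
        queue.push_back(dev);
    }
}

fn main() {
    let mut queue = VecDeque::new();
    enqueue(&mut queue, QueuedDevice { id: "net0".into(), bdf_device: None });
    enqueue(&mut queue, QueuedDevice { id: "disk0".into(), bdf_device: Some(3) });
    enqueue(&mut queue, QueuedDevice { id: "rng0".into(), bdf_device: None });

    // The pinned device is handled first; the auto-placed ones keep their
    // insertion order behind it.
    let order: Vec<_> = queue.iter().map(|d| d.id.as_str()).collect();
    assert_eq!(order, ["disk0", "net0", "rng0"]);
}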
@@ -4172,7 +4346,8 @@ impl DeviceManager { (pci_segment_id, pci_device_bdf, resources) } else { - let pci_device_bdf = self.pci_segments[pci_segment_id as usize].next_device_bdf()?; + let pci_device_bdf = + self.pci_segments[pci_segment_id as usize].allocate_device_bdf(pci_device_id)?; (pci_segment_id, pci_device_bdf, None) }) @@ -4187,6 +4362,11 @@ impl DeviceManager { &self.address_manager.mmio_bus } + #[cfg(feature = "fw_cfg")] + pub fn fw_cfg(&self) -> Option<&Arc>> { + self.fw_cfg.as_ref() + } + pub fn allocator(&self) -> &Arc> { &self.address_manager.allocator } @@ -4215,14 +4395,14 @@ impl DeviceManager { .add_memory_region(new_region) .map_err(DeviceManagerError::UpdateMemoryForVirtioDevice)?; - if let Some(dma_handler) = &handle.dma_handler { - if !handle.iommu { - let gpa = new_region.start_addr().0; - let size = new_region.len(); - dma_handler - .map(gpa, gpa, size) - .map_err(DeviceManagerError::VirtioDmaMap)?; - } + if let Some(dma_handler) = &handle.dma_handler + && !handle.iommu + { + let gpa = new_region.start_addr().0; + let size = new_region.len(); + dma_handler + .map(gpa, gpa, size) + .map_err(DeviceManagerError::VirtioDmaMap)?; } } @@ -4267,6 +4447,10 @@ impl DeviceManager { Ok(()) } + /// Notifies the VM for a hotplug. + /// + /// This call doesn't wait for the vCPU receiving the + /// interrupt to acknowledge. pub fn notify_hotplug( &self, _notification_type: AcpiNotificationFlags, @@ -4379,8 +4563,25 @@ impl DeviceManager { .device_type(), ); match device_type { - VirtioDeviceType::Net - | VirtioDeviceType::Block + VirtioDeviceType::Net => { + let mut config = self.config.lock().unwrap(); + let nets = config.net.as_deref_mut().unwrap(); + let net_dev_cfg = nets + .iter_mut() + .find(|net| net.id.as_ref() == Some(&id)) + .unwrap(); + let fds = net_dev_cfg.fds.take().unwrap_or(Vec::new()); + + debug!("Closing preserved FDs from virtio-net device: id={id}, fds={fds:?}"); + for fd in fds { + config.preserved_fds.as_mut().unwrap().retain(|x| *x != fd); + // SAFETY: Trivially safe. We know the FD is not referenced any longer. 
+ unsafe { + libc::close(fd); + } + } + } + VirtioDeviceType::Block | VirtioDeviceType::Pmem | VirtioDeviceType::Fs | VirtioDeviceType::Vsock => {} @@ -4441,10 +4642,10 @@ impl DeviceManager { }; let mut iommu_attached = false; - if let Some((_, iommu_attached_devices)) = &self.iommu_attached_devices { - if iommu_attached_devices.contains(&pci_device_bdf) { - iommu_attached = true; - } + if let Some((_, iommu_attached_devices)) = &self.iommu_attached_devices + && iommu_attached_devices.contains(&pci_device_bdf) + { + iommu_attached = true; } let (pci_device, bus_device, virtio_device, remove_dma_handler) = match pci_device_handle { @@ -4475,16 +4676,16 @@ impl DeviceManager { .map_err(|e| DeviceManagerError::UnRegisterIoevent(e.into()))?; } - if let Some(dma_handler) = dev.dma_handler() { - if !iommu_attached { - for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() { - for region in zone.regions() { - let iova = region.start_addr().0; - let size = region.len(); - dma_handler - .unmap(iova, size) - .map_err(DeviceManagerError::VirtioDmaUnmap)?; - } + if let Some(dma_handler) = dev.dma_handler() + && !iommu_attached + { + for (_, zone) in self.memory_manager.lock().unwrap().memory_zones().iter() { + for region in zone.regions() { + let iova = region.start_addr().0; + let size = region.len(); + dma_handler + .unmap(iova, size) + .map_err(DeviceManagerError::VirtioDmaUnmap)?; } } } @@ -4612,7 +4813,7 @@ impl DeviceManager { // Add the virtio device to the device manager list. This is important // as the list is used to notify virtio devices about memory updates // for instance. - self.virtio_devices.push(handle.clone()); + self.virtio_devices.push_back(handle.clone()); let mapping: Option> = if handle.iommu { self.iommu_mapping.clone() @@ -4626,6 +4827,7 @@ impl DeviceManager { handle.id.clone(), handle.pci_segment, handle.dma_handler, + handle.bdf_device, )?; // Update the PCIU bitmap @@ -4746,6 +4948,18 @@ impl DeviceManager { 0 } + pub fn resize_disk(&mut self, device_id: &str, new_size: u64) -> DeviceManagerResult<()> { + for dev in &self.block_devices { + let mut disk = dev.lock().unwrap(); + if disk.id() == device_id { + return disk + .resize(new_size) + .map_err(DeviceManagerError::DiskResizeError); + } + } + Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) + } + pub fn device_tree(&self) -> Arc> { self.device_tree.clone() } @@ -4810,6 +5024,74 @@ impl DeviceManager { } } +#[cfg(feature = "ivshmem")] +struct IvshmemHandler { + memory_manager: Arc>, +} + +#[cfg(feature = "ivshmem")] +impl IvshmemOps for IvshmemHandler { + fn map_ram_region( + &mut self, + start_addr: u64, + size: usize, + backing_file: Option, + ) -> Result<(Arc, UserspaceMapping), IvshmemError> { + info!("Creating ivshmem mem region at 0x{:x}", start_addr); + + let region = MemoryManager::create_ram_region( + &backing_file, + 0, + GuestAddress(start_addr), + size, + false, + true, + false, + None, + None, + None, + false, + ) + .map_err(|_| IvshmemError::CreateUserMemoryRegion)?; + let mem_slot = self + .memory_manager + .lock() + .unwrap() + .create_userspace_mapping( + region.start_addr().0, + region.len(), + region.as_ptr() as u64, + false, + false, + false, + ) + .map_err(|_| IvshmemError::CreateUserspaceMapping)?; + let mapping = UserspaceMapping { + host_addr: region.as_ptr() as u64, + mem_slot, + addr: GuestAddress(region.start_addr().0), + len: region.len(), + mergeable: false, + }; + Ok((region, mapping)) + } + + fn unmap_ram_region(&mut self, mapping: UserspaceMapping) -> 
Result<(), IvshmemError> { + self.memory_manager + .lock() + .unwrap() + .remove_userspace_mapping( + mapping.addr.raw_value(), + mapping.len, + mapping.host_addr, + mapping.mergeable, + mapping.mem_slot, + ) + .map_err(|_| IvshmemError::RemoveUserspaceMapping)?; + Ok(()) + } +} + fn numa_node_id_from_memory_zone_id(numa_nodes: &NumaNodes, memory_zone_id: &str) -> Option { for (numa_node_id, numa_node) in numa_nodes.iter() { if numa_node.memory_zones.contains(&memory_zone_id.to_owned()) { @@ -4961,6 +5243,27 @@ impl Aml for DeviceManager { ) .to_aml_bytes(sink); + #[cfg(all(feature = "fw_cfg", target_arch = "x86_64"))] + if self.fw_cfg.is_some() { + aml::Device::new( + "_SB_.FWCF".into(), + vec![ + &aml::Name::new("_HID".into(), &FW_CFG_ACPI_ID.to_string()), + &aml::Name::new("_STA".into(), &0xB_usize), + &aml::Name::new( + "_CRS".into(), + &aml::ResourceTemplate::new(vec![&aml::IO::new( + PORT_FW_CFG_BASE as u16, + PORT_FW_CFG_BASE as u16, + 0x01, + PORT_FW_CFG_WIDTH as u8, + )]), + ), + ], + ) + .to_aml_bytes(sink); + } + // Serial device #[cfg(target_arch = "x86_64")] let serial_irq = 4; diff --git a/vmm/src/gdb.rs b/vmm/src/gdb.rs index 16c9f64d98..ef4f4de8fd 100644 --- a/vmm/src/gdb.rs +++ b/vmm/src/gdb.rs @@ -11,24 +11,24 @@ use std::sync::mpsc; use gdbstub::arch::Arch; use gdbstub::common::{Signal, Tid}; use gdbstub::conn::{Connection, ConnectionExt}; -use gdbstub::stub::{run_blocking, DisconnectReason, MultiThreadStopReason}; +use gdbstub::stub::{DisconnectReason, MultiThreadStopReason, run_blocking}; +use gdbstub::target::ext::base::BaseOps; use gdbstub::target::ext::base::multithread::{ MultiThreadBase, MultiThreadResume, MultiThreadResumeOps, MultiThreadSingleStep, MultiThreadSingleStepOps, }; -use gdbstub::target::ext::base::BaseOps; use gdbstub::target::ext::breakpoints::{ Breakpoints, BreakpointsOps, HwBreakpoint, HwBreakpointOps, }; use gdbstub::target::{Target, TargetError, TargetResult}; #[cfg(target_arch = "aarch64")] -use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; -#[cfg(target_arch = "aarch64")] use gdbstub_arch::aarch64::AArch64 as GdbArch; -#[cfg(target_arch = "x86_64")] -use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; +#[cfg(target_arch = "aarch64")] +use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; #[cfg(target_arch = "x86_64")] use gdbstub_arch::x86::X86_64_SSE as GdbArch; +#[cfg(target_arch = "x86_64")] +use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; use thiserror::Error; use vm_memory::{GuestAddress, GuestMemoryAtomic, GuestMemoryError}; diff --git a/vmm/src/igvm/igvm_loader.rs b/vmm/src/igvm/igvm_loader.rs index 75eeb33582..03dd4d2472 100644 --- a/vmm/src/igvm/igvm_loader.rs +++ b/vmm/src/igvm/igvm_loader.rs @@ -10,21 +10,21 @@ use std::sync::{Arc, Mutex}; use igvm::snp_defs::SevVmsa; use igvm::{IgvmDirectiveHeader, IgvmFile, IgvmPlatformHeader, IsolationType}; +#[cfg(feature = "sev_snp")] +use igvm_defs::{IGVM_VHS_MEMORY_MAP_ENTRY, MemoryMapEntryType}; use igvm_defs::{ - IgvmPageDataType, IgvmPlatformType, IGVM_VHS_PARAMETER, IGVM_VHS_PARAMETER_INSERT, + IGVM_VHS_PARAMETER, IGVM_VHS_PARAMETER_INSERT, IgvmPageDataType, IgvmPlatformType, }; -#[cfg(feature = "sev_snp")] -use igvm_defs::{MemoryMapEntryType, IGVM_VHS_MEMORY_MAP_ENTRY}; use mshv_bindings::*; use thiserror::Error; use zerocopy::IntoBytes; +#[cfg(feature = "sev_snp")] +use crate::GuestMemoryMmap; use crate::cpu::CpuManager; use crate::igvm::loader::Loader; -use crate::igvm::{BootPageAcceptance, IgvmLoadedInfo, StartupMemoryType, HV_PAGE_SIZE}; +use 
crate::igvm::{BootPageAcceptance, HV_PAGE_SIZE, IgvmLoadedInfo, StartupMemoryType}; use crate::memory_manager::MemoryManager; -#[cfg(feature = "sev_snp")] -use crate::GuestMemoryMmap; #[derive(Debug, Error)] pub enum Error { @@ -66,8 +66,8 @@ enum ParameterAreaState { #[cfg(feature = "sev_snp")] fn igvm_memmap_from_ram_range(ram_range: (u64, u64)) -> IGVM_VHS_MEMORY_MAP_ENTRY { - assert!(ram_range.0 % HV_PAGE_SIZE == 0); - assert!((ram_range.1 - ram_range.0) % HV_PAGE_SIZE == 0); + assert!(ram_range.0.is_multiple_of(HV_PAGE_SIZE)); + assert!((ram_range.1 - ram_range.0).is_multiple_of(HV_PAGE_SIZE)); IGVM_VHS_MEMORY_MAP_ENTRY { starting_gpa_page_number: ram_range.0 / HV_PAGE_SIZE, @@ -179,7 +179,7 @@ pub fn load_igvm( data_type, data, } => { - debug_assert!(data.len() as u64 % HV_PAGE_SIZE == 0); + debug_assert!((data.len() as u64).is_multiple_of(HV_PAGE_SIZE)); // TODO: only 4k or empty page data supported right now assert!(data.len() as u64 == HV_PAGE_SIZE || data.is_empty()); @@ -428,11 +428,11 @@ pub fn load_igvm( let gpas_grouped = gpas .iter() .fold(Vec::>::new(), |mut acc, gpa| { - if let Some(last_vec) = acc.last_mut() { - if last_vec[0].page_type == gpa.page_type { - last_vec.push(*gpa); - return acc; - } + if let Some(last_vec) = acc.last_mut() + && last_vec[0].page_type == gpa.page_type + { + last_vec.push(*gpa); + return acc; } acc.push(vec![*gpa]); acc diff --git a/vmm/src/igvm/loader.rs b/vmm/src/igvm/loader.rs index 215c84c50e..316cadb1ac 100644 --- a/vmm/src/igvm/loader.rs +++ b/vmm/src/igvm/loader.rs @@ -10,7 +10,7 @@ use vm_memory::{ GuestMemoryRegion, }; -use crate::igvm::{BootPageAcceptance, StartupMemoryType, HV_PAGE_SIZE}; +use crate::igvm::{BootPageAcceptance, HV_PAGE_SIZE, StartupMemoryType}; /// Structure to hold the guest memory info/layout to check /// the if the memory is accepted within the layout. diff --git a/vmm/src/landlock.rs b/vmm/src/landlock.rs index 3defeaefbd..e7efd9cbcc 100644 --- a/vmm/src/landlock.rs +++ b/vmm/src/landlock.rs @@ -4,13 +4,13 @@ use std::convert::TryFrom; use std::io::Error as IoError; -use std::path::PathBuf; +use std::path::Path; #[cfg(test)] use landlock::make_bitflags; use landlock::{ - path_beneath_rules, Access, AccessFs, BitFlags, Ruleset, RulesetAttr, RulesetCreated, - RulesetCreatedAttr, RulesetError, ABI, + ABI, Access, AccessFs, BitFlags, Compatible, Ruleset, RulesetAttr, RulesetCreated, + RulesetCreatedAttr, RulesetError, path_beneath_rules, }; use thiserror::Error; @@ -59,7 +59,7 @@ impl TryFrom<&str> for LandlockAccess { _ => { return Err(LandlockError::InvalidLandlockAccess( format!("Invalid access: {c}").to_string(), - )) + )); } }; } @@ -75,8 +75,10 @@ impl Landlock { let file_access = AccessFs::from_all(ABI); let def_ruleset = Ruleset::default() + .set_compatibility(landlock::CompatLevel::HardRequirement) .handle_access(file_access) - .map_err(LandlockError::ManageRuleset)?; + .map_err(LandlockError::ManageRuleset)? + .set_compatibility(landlock::CompatLevel::HardRequirement); // By default, rulesets are created in `BestEffort` mode. This lets Landlock // to enable all the supported rules and silently ignore the unsupported ones. @@ -87,13 +89,13 @@ impl Landlock { pub(crate) fn add_rule( &mut self, - path: PathBuf, + path: &Path, access: BitFlags, ) -> Result<(), LandlockError> { // path_beneath_rules in landlock crate handles file and directory access rules. // Incoming path/s are passed to path_beneath_rules, so that we don't // have to worry about the type of the path. 
- let paths = vec![path.clone()]; + let paths = vec![&path]; let path_beneath_rules = path_beneath_rules(paths, access); self.ruleset .as_mut() @@ -104,7 +106,7 @@ impl Landlock { pub(crate) fn add_rule_with_access( &mut self, - path: PathBuf, + path: &Path, access: &str, ) -> Result<(), LandlockError> { self.add_rule(path, LandlockAccess::try_from(access)?.access)?; diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 2cf1cb6e95..3d92c74a8c 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -8,57 +8,81 @@ extern crate event_monitor; #[macro_use] extern crate log; +/// Amount of iterations before auto-converging starts. +const AUTO_CONVERGE_ITERATION_DELAY: u64 = 2; +/// Step size in percent to increase the vCPU throttling. +const AUTO_CONVERGE_STEP_SIZE: u8 = 10; +/// Amount of iterations after that we increase vCPU throttling. +const AUTO_CONVERGE_ITERATION_INCREASE: u64 = 2; +/// Maximum vCPU throttling value. +const AUTO_CONVERGE_MAX: u8 = 99; + use std::collections::HashMap; use std::fs::File; -use std::io::{stdout, Read, Write}; +use std::io::{ErrorKind, Read, Write, stdout}; use std::net::{TcpListener, TcpStream}; +use std::os::fd::{AsFd, BorrowedFd}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::os::unix::net::{UnixListener, UnixStream}; use std::panic::AssertUnwindSafe; use std::path::PathBuf; use std::rc::Rc; -use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; -use std::sync::{Arc, Mutex}; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::mpsc::{Receiver, RecvError, SendError, Sender, TrySendError}; +use std::sync::{Arc, Barrier, Mutex}; +use std::thread::JoinHandle; #[cfg(not(target_arch = "riscv64"))] -use std::time::Instant; -use std::{io, result, thread}; +use std::time::{Duration, Instant}; +use std::{io, mem, result, thread}; -use anyhow::anyhow; +use anyhow::{Context, anyhow}; #[cfg(feature = "dbus_api")] use api::dbus::{DBusApiOptions, DBusApiShutdownChannels}; use api::http::HttpApiHandle; -use console_devices::{pre_create_console_devices, ConsoleInfo}; +use arch::PAGE_SIZE; +#[cfg(all(feature = "kvm", target_arch = "x86_64"))] +use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY; +use console_devices::{ConsoleInfo, pre_create_console_devices}; use landlock::LandlockError; -use libc::{tcsetattr, termios, EFD_NONBLOCK, SIGINT, SIGTERM, TCSANOW}; +use libc::{EFD_NONBLOCK, SIGINT, SIGTERM, TCSANOW, tcsetattr, termios}; use memory_manager::MemoryManagerSnapshotData; use pci::PciBdf; -use seccompiler::{apply_filter, SeccompAction}; +use seccompiler::{SeccompAction, apply_filter}; use serde::ser::{SerializeStruct, Serializer}; use serde::{Deserialize, Serialize}; use signal_hook::iterator::{Handle, Signals}; use thiserror::Error; use tracer::trace_scoped; use vm_memory::bitmap::{AtomicBitmap, BitmapSlice}; -use vm_memory::{ReadVolatile, VolatileMemoryError, VolatileSlice, WriteVolatile}; +use vm_memory::{ + GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, ReadVolatile, + VolatileMemoryError, VolatileSlice, WriteVolatile, +}; use vm_migration::protocol::*; -use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; +use vm_migration::tls::{TlsConnectionWrapper, TlsStream, TlsStreamWrapper}; +use vm_migration::{ + Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, tls, +}; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; +use crate::api::http::http_endpoint::ONGOING_LIVEMIGRATION; use 
crate::api::{ ApiRequest, ApiResponse, RequestHandler, VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, }; -use crate::config::{add_to_config, RestoreConfig}; +use crate::config::{RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::GuestDebuggable; +#[cfg(feature = "kvm")] +use crate::cpu::IS_IN_SHUTDOWN; use crate::landlock::Landlock; use crate::memory_manager::MemoryManager; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] use crate::migration::get_vm_snapshot; use crate::migration::{recv_vm_config, recv_vm_state}; -use crate::seccomp_filters::{get_seccomp_filter, Thread}; +use crate::seccomp_filters::{Thread, get_seccomp_filter}; use crate::vm::{Error as VmError, Vm, VmState}; use crate::vm_config::{ DeviceConfig, DiskConfig, FsConfig, NetConfig, PmemConfig, UserDeviceConfig, VdpaConfig, @@ -88,6 +112,7 @@ mod pci_segment; pub mod seccomp_filters; mod serial_manager; mod sigwinch_listener; +mod vcpu_throttling; pub mod vm; pub mod vm_config; @@ -223,6 +248,7 @@ pub enum EpollDispatch { Api = 2, ActivateVirtioDevices = 3, Debug = 4, + CheckMigration = 5, Unknown, } @@ -235,6 +261,7 @@ impl From for EpollDispatch { 2 => Api, 3 => ActivateVirtioDevices, 4 => Debug, + 5 => CheckMigration, _ => Unknown, } } @@ -243,6 +270,7 @@ impl From for EpollDispatch { enum SocketStream { Unix(UnixStream), Tcp(TcpStream), + Tls(Box), } impl Read for SocketStream { @@ -250,6 +278,7 @@ impl Read for SocketStream { match self { SocketStream::Unix(stream) => stream.read(buf), SocketStream::Tcp(stream) => stream.read(buf), + SocketStream::Tls(stream) => stream.read(buf), } } } @@ -259,6 +288,7 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.write(buf), SocketStream::Tcp(stream) => stream.write(buf), + SocketStream::Tls(stream) => stream.write(buf), } } @@ -266,15 +296,17 @@ impl Write for SocketStream { match self { SocketStream::Unix(stream) => stream.flush(), SocketStream::Tcp(stream) => stream.flush(), + SocketStream::Tls(stream) => stream.flush(), } } } -impl AsRawFd for SocketStream { - fn as_raw_fd(&self) -> RawFd { +impl AsFd for SocketStream { + fn as_fd(&self) -> BorrowedFd<'_> { match self { - SocketStream::Unix(s) => s.as_raw_fd(), - SocketStream::Tcp(s) => s.as_raw_fd(), + SocketStream::Unix(s) => s.as_fd(), + SocketStream::Tcp(s) => s.as_fd(), + SocketStream::Tls(s) => s.as_fd(), } } } @@ -287,6 +319,7 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_volatile(buf), SocketStream::Tcp(s) => s.read_volatile(buf), + SocketStream::Tls(s) => s.read_volatile(buf), } } @@ -297,6 +330,7 @@ impl ReadVolatile for SocketStream { match self { SocketStream::Unix(s) => s.read_exact_volatile(buf), SocketStream::Tcp(s) => s.read_exact_volatile(buf), + SocketStream::Tls(s) => s.read_exact_volatile(buf), } } } @@ -309,6 +343,7 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_volatile(buf), SocketStream::Tcp(s) => s.write_volatile(buf), + SocketStream::Tls(s) => s.write_volatile(buf), } } @@ -319,6 +354,7 @@ impl WriteVolatile for SocketStream { match self { SocketStream::Unix(s) => s.write_all_volatile(buf), SocketStream::Tcp(s) => s.write_all_volatile(buf), + SocketStream::Tls(s) => s.write_all_volatile(buf), } } } @@ -405,6 +441,8 @@ pub fn feature_list() -> Vec { "dbus_api".to_string(), #[cfg(feature = "dhat-heap")] "dhat-heap".to_string(), + #[cfg(feature = "fw_cfg")] + "fw_cfg".to_string(), #[cfg(feature = 
"guest_debug")] "guest_debug".to_string(), #[cfg(feature = "igvm")] @@ -421,6 +459,8 @@ pub fn feature_list() -> Vec { "tdx".to_string(), #[cfg(feature = "tracing")] "tracing".to_string(), + #[cfg(feature = "ivshmem")] + "ivshmem".to_string(), ] } @@ -640,6 +680,101 @@ impl VmmVersionInfo { } } +#[derive(Debug, Clone)] +struct MigrationState { + current_dirty_pages: u64, + downtime: Duration, + downtime_start: Instant, + iteration: u64, + iteration_cost_time: Duration, + iteration_start_time: Instant, + mb_per_sec: f64, + pages_per_second: u64, + pending_size: u64, + start_time: Instant, + threshold_size: u64, + total_time: Duration, + total_transferred_bytes: u64, + total_transferred_dirty_pages: u64, +} + +impl MigrationState { + pub fn new() -> Self { + Self { + current_dirty_pages: 0, + downtime: Duration::default(), + downtime_start: Instant::now(), + iteration: 0, + iteration_cost_time: Duration::default(), + iteration_start_time: Instant::now(), + mb_per_sec: 0.0, + pages_per_second: 0, + pending_size: 0, + start_time: Instant::now(), + threshold_size: 0, + total_time: Duration::default(), + total_transferred_bytes: 0, + total_transferred_dirty_pages: 0, + } + } +} + +/// Abstraction for the thread controlling and performing the live migration. +/// +/// The migration thread also takes ownership of the [`Vm`] from the [`Vmm`]. +struct MigrationWorker { + vm: Vm, + check_migration_evt: EventFd, + config: VmSendMigrationData, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: Arc, +} + +impl MigrationWorker { + /// Performs any final cleanup after failed live migrations. + /// + /// Helper for [`Self::migrate`]. + fn migrate_error_cleanup(&mut self) -> result::Result<(), MigratableError> { + // Stop logging dirty pages only for non-local migrations + if !self.config.local { + self.vm.stop_dirty_log()?; + } + + Ok(()) + } + + /// Migrate and cleanup. + fn migrate(&mut self) -> result::Result<(), MigratableError> { + debug!("start sending migration"); + Vmm::send_migration( + &mut self.vm, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + self.hypervisor.clone(), + self.config.clone(), + ).inspect_err(|_| { + let e = self.migrate_error_cleanup(); + if let Err(e) = e { + error!("Failed to clean up after a failed live migration. VM might keep running but in an odd or possibly slowed-down state: {e}"); + } + })?; + + Ok(()) + } + + /// Perform the migration and communicate with the [`Vmm`] thread. + fn run(mut self) -> (Vm, result::Result<(), MigratableError>) { + debug!("migration thread is starting"); + + let res = self.migrate().inspect_err(|e| error!("migrate error: {e}")); + + // Notify VMM thread to get migration result by joining this thread. + self.check_migration_evt.write(1).unwrap(); + + debug!("migration thread is finished"); + (self.vm, res) + } +} + pub struct VmmThreadHandle { pub thread_handle: thread::JoinHandle>, #[cfg(feature = "dbus_api")] @@ -647,6 +782,41 @@ pub struct VmmThreadHandle { pub http_api_handle: Option, } +/// Describes the current ownership of a running VM. +#[allow(clippy::large_enum_variant)] +pub enum MaybeVmOwnership { + /// The VMM holds the ownership of the VM. + Vmm(Vm), + /// The VM is temporarily blocked by the current ongoing migration. + Migration, + /// No VM is running. + None, +} + +impl MaybeVmOwnership { + /// Takes the VM and replaces it with [`Self::Migration`]. + /// + /// # Panics + /// This method panics if `self` is not [`Self::Vmm`]. 
+ fn take_vm_for_migration(&mut self) -> Vm { + if !matches!(self, Self::Vmm(_)) { + panic!("should only be called when a migration can start"); + } + + match mem::replace(self, Self::Migration) { + MaybeVmOwnership::Vmm(vm) => vm, + _ => unreachable!(), + } + } + + fn vm_mut(&mut self) -> Option<&mut Vm> { + match self { + MaybeVmOwnership::Vmm(vm) => Some(vm), + _ => None, + } + } +} + pub struct Vmm { epoll: EpollContext, exit_evt: EventFd, @@ -657,7 +827,7 @@ pub struct Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, version: VmmVersionInfo, - vm: Option, + vm: MaybeVmOwnership, vm_config: Option>>, seccomp_action: SeccompAction, hypervisor: Arc, @@ -667,6 +837,706 @@ pub struct Vmm { original_termios_opt: Arc>>, console_resize_pipe: Option>, console_info: Option, + check_migration_evt: EventFd, + /// Handle to the [`MigrationWorker`] thread. + /// + /// The handle will return the [`Vm`] back in any case. Further, the underlying error (if any) is returned. + migration_thread_handle: Option)>>, +} + +/// Wait for a file descriptor to become readable. In this case, we return +/// true. In case, the eventfd was signaled, return false. +fn wait_for_readable( + fd: &impl AsFd, + eventfd: &EventFd, +) -> std::result::Result { + let fd_event = eventfd.as_raw_fd().as_raw_fd(); + let fd_io = fd.as_fd().as_raw_fd(); + let mut poll_fds = [ + libc::pollfd { + fd: fd_event, + events: libc::POLLIN, + revents: 0, + }, + libc::pollfd { + fd: fd_io, + events: libc::POLLIN, + revents: 0, + }, + ]; + + // SAFETY: This is safe, because the file descriptors are valid and the + // poll_fds array is properly initialized. + let ret = unsafe { libc::poll(poll_fds.as_mut_ptr(), poll_fds.len() as libc::nfds_t, -1) }; + + if ret < 0 { + return Err(std::io::Error::last_os_error()); + } + + if poll_fds[0].revents & libc::POLLIN != 0 { + return Ok(false); + } + if poll_fds[1].revents & libc::POLLIN != 0 { + return Ok(true); + } + + panic!("Poll returned, but neither file descriptor is readable?"); +} + +/// Abstract over the different types of listeners that can be used to receive connections. +#[derive(Debug)] +enum ReceiveListener { + Tcp(TcpListener), + Unix(UnixListener, Option), + Tls(TcpListener, TlsConnectionWrapper), +} + +impl AsFd for ReceiveListener { + fn as_fd(&self) -> BorrowedFd<'_> { + match self { + ReceiveListener::Tcp(listener) => listener.as_fd(), + ReceiveListener::Unix(listener, _) => listener.as_fd(), + ReceiveListener::Tls(listener, _) => listener.as_fd(), + } + } +} + +impl ReceiveListener { + /// Block until a connection is accepted. + fn accept(&mut self) -> std::result::Result { + match self { + ReceiveListener::Tcp(listener) => listener + .accept() + .map(|(socket, _)| SocketStream::Tcp(socket)), + ReceiveListener::Unix(listener, opt_path) => { + let socket = listener + .accept() + .map(|(socket, _)| SocketStream::Unix(socket))?; + + // Remove the UNIX socket file after accepting the connection. Is this actually safe? If a user + // moves the file and creates a new one with the same name, we will delete the wrong file. + // Sounds like a confused deputy to me. + // + // TODO Don't do this? + if let Some(path) = opt_path.take() { + std::fs::remove_file(&path)?; + } + + Ok(socket) + } + ReceiveListener::Tls(listener, conn) => listener.accept().map(|(socket, _)| { + conn.wrap(socket) + .map(|s| SocketStream::Tls(Box::new(s))) + .map_err(std::io::Error::other) + })?, + } + } + + /// Same as accept(), but returns None if the eventfd is signaled. 
+ fn abortable_accept( + &mut self, + eventfd: &EventFd, + ) -> std::result::Result, std::io::Error> { + wait_for_readable(&self, eventfd)? + .then(|| self.accept()) + .transpose() + } + + fn try_clone(&self) -> std::result::Result { + match self { + ReceiveListener::Tcp(listener) => listener.try_clone().map(ReceiveListener::Tcp), + ReceiveListener::Unix(listener, opt_path) => listener + .try_clone() + .map(|listener| ReceiveListener::Unix(listener, opt_path.clone())), + ReceiveListener::Tls(listener, conn) => listener + .try_clone() + .map(|listener| ReceiveListener::Tls(listener, conn.clone())), + } + } +} + +/// Handles a `Memory` request by writing its payload to the VM memory. +fn vm_receive_memory( + req: &Request, + socket: &mut T, + guest_mem: &GuestMemoryAtomic, +) -> std::result::Result<(), MigratableError> +where + T: Read + ReadVolatile, +{ + assert_eq!(req.command(), Command::Memory); + + // Read table + let ranges = MemoryRangeTable::read_from(socket, req.length())?; + let mem = guest_mem.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of read_exact_from() as it is not + // following the correct behavior. For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_read = mem + .read_volatile_from( + GuestAddress(range.gpa + offset), + socket, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Error receiving memory from socket: {}", + e + )) + })?; + offset += bytes_read as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) +} + +/// We keep track of additional connections for receiving VM migration data +/// here. +struct ReceiveAdditionalConnections { + terminate_fd: EventFd, + + // This is only an option to be able to join it in the destructor. + accept_thread: Option>, +} + +impl ReceiveAdditionalConnections { + /// Create a pair of file descriptors that map to the same underlying event_fd. + fn event_fd_pair() -> std::result::Result<(EventFd, EventFd), std::io::Error> { + let event_fd = EventFd::new(0)?; + Ok((event_fd.try_clone()?, event_fd)) + } + + /// Handle incoming requests. + /// + /// For now we only handle `Command::Memory` requests here. Everything else + /// needs to come via the main connection. This function returns when the + /// abort_event_fd is triggered or the connection is closed or encountered + /// an error. + fn handle_requests( + socket: &mut SocketStream, + abort_event_fd: &EventFd, + guest_memory: &GuestMemoryAtomic, + ) -> std::result::Result<(), MigratableError> { + loop { + if !wait_for_readable(socket, abort_event_fd).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Failed to poll descriptors: {e}")) + })? { + info!("Got signal to tear down connection."); + return Ok(()); + } + + // TODO We only check whether we should abort when waiting for a new + // request. If the sender just stops sending data mid-request, we + // should still be abortable, but we are not... In this case, we + // will hang forever. But given that the sender is also in charge of + // driving the migration to completion, this is not a major concern. + // In the long run, it would be preferable to move I/O to + // asynchronous tasks to be able to handle aborts more gracefully. 
+ + let req = match Request::read_from(socket) { + Ok(req) => req, + Err(MigratableError::MigrateSocket(io_error)) + if io_error.kind() == ErrorKind::UnexpectedEof => + { + debug!("Connection closed by peer"); + return Ok(()); + } + Err(e) => return Err(e), + }; + + if req.command() != Command::Memory { + return Err(MigratableError::MigrateReceive(anyhow!( + "Dropping connection. Only Memory commands are allowed on additional connections, but got {:?}", + req.command() + ))); + } + + vm_receive_memory(&req, socket, guest_memory)?; + Response::ok().write_to(socket)?; + } + } + + /// Starts a thread to accept incoming connections and handle them. These + /// additional connections are used to receive additional memory regions + /// during VM migration. + fn new( + listener: ReceiveListener, + guest_memory: GuestMemoryAtomic, + ) -> std::result::Result { + let (terminate_fd1, terminate_fd2) = Self::event_fd_pair()?; + + let accept_thread = std::thread::spawn(move || { + let terminate_fd = terminate_fd2; + let mut listener = listener; + let mut threads: Vec> = Vec::new(); + while let Ok(Some(mut socket)) = listener.abortable_accept(&terminate_fd) { + let guest_memory = guest_memory.clone(); + let terminate_fd = terminate_fd.try_clone().unwrap(); + + // We handle errors locally and log them. Passing them along is + // painful with little value. + threads.push(std::thread::spawn(move || { + if let Err(e) = Self::handle_requests(&mut socket, &terminate_fd, &guest_memory) + { + error!( + "Failed to read more requests on additional receive connection: {}", + e + ); + } + })); + } + + info!("Stopped accepting additional connections. Cleaning up threads."); + threads.into_iter().for_each(|thread| { + thread.join().unwrap(); + }); + }); + + Ok(Self { + accept_thread: Some(accept_thread), + terminate_fd: terminate_fd1, + }) + } + + /// Stop accepting additional connections and tear down all connections. + /// + /// This function does not wait for the operation to complete. + fn signal_termination(&self) { + // It's not really worth propagating this error, because it only happens if + // something hit the fan and we can't really do anything about it. + if let Err(e) = self.terminate_fd.write(1) { + error!("Failed to wake up other threads: {}", e); + } + } +} + +impl Drop for ReceiveAdditionalConnections { + fn drop(&mut self) { + self.signal_termination(); + // This unwrap is safe, because we never write a None into + // self.accept_thread in other places. + let _accept_thread = self.accept_thread.take().unwrap(); + + // TODO The accept thread tries to join all threads it started, but we + // haven't implemented tearing them down yet. + // accept_thread.join().unwrap(); + } +} + +/// The receiver's state machine behind the migration protocol. +enum ReceiveMigrationState { + /// The connection is established and we haven't received any commands yet. + Established, + + /// We received the start command. + Started, + + /// We received file descriptors for memory. This can only happen on UNIX domain sockets. + MemoryFdsReceived(Vec<(u32, File)>), + + /// We received the VM configuration. We keep the memory configuration around to populate guest memory. + /// From this point on, the sender can start sending memory updates. + /// + /// While the memory manager can also be used to populate guest memory, we keep a direct reference to + /// the memory around to populate guest memory without having to acquire a lock. 
+ Configured( + Arc>, + GuestMemoryAtomic, + ReceiveAdditionalConnections, + ), + + /// Memory is populated and we received the state. The VM is ready to go. + StateReceived, + + /// The migration is successful. + Completed, + + /// The migration couldn't complete, either due to an error or because the sender abandoned the migration. + Aborted, +} + +impl ReceiveMigrationState { + fn finished(&self) -> bool { + matches!( + self, + ReceiveMigrationState::Completed | ReceiveMigrationState::Aborted + ) + } +} + +/// The different kinds of messages we can send to memory sending threads. +#[derive(Debug)] +enum SendMemoryThreadMessage { + Memory(Arc), + Barrier(Arc), + Disconnect, +} + +/// This struct keeps track of additional threads we use to send VM memory. +struct SendAdditionalConnections { + guest_memory: GuestMemoryAtomic, + threads: Vec>, + channels: Vec>, + // If an error occurs in one of the worker threads, the worker signals this + // using this flag. Only the main thread checks this variable, the other + // workers will be stopped in the destructor. + cancel: Arc, + // The first worker encountering an error will transmit the error using + // this channel. + error_rx: std::sync::mpsc::Receiver, +} + +/// Send memory from the given table. +fn vm_send_memory( + guest_memory: &GuestMemoryAtomic, + socket: &mut SocketStream, + table: &MemoryRangeTable, +) -> result::Result<(), MigratableError> { + if table.regions().is_empty() { + return Ok(()); + } + + Request::memory(table.length()).write_to(socket)?; + table.write_to(socket)?; + // And then the memory itself + send_memory_regions(guest_memory, table, socket)?; + Response::read_from(socket)?.ok_or_abandon( + socket, + MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + )?; + + Ok(()) +} + +impl SendAdditionalConnections { + /// How many requests can be waiting to be sent for each connection. This + /// can be set to zero to disable buffering. Whether we need to buffer + /// requests is currently unclear. If this is set too high, some connections + /// might go unused, because work pools up on some connections. + const BUFFERED_REQUESTS_PER_THREAD: usize = 1; + + /// The size of each chunk of memory to send. + /// + /// We want to make this large, because each chunk is acknowledged and we + /// wait for the ack before sending the next chunk. The challenge is that if + /// it is _too_ large, we become more sensitive to network issues, like + /// packet drops in individual connections, because large amounts of data + /// can pool when throughput on one connection is temporarily reduced. + /// + /// We can consider making this configurable, but a better network protocol + /// that doesn't require ACKs would be more efficient. 
+ /// + /// The best-case throughput per connection can be estimated via: + /// effective_throughput = chunk_size / (chunk_size / throughput_per_connection + round_trip_time) + const CHUNK_SIZE: u64 = 64 /* MiB */ << 20; + + fn new( + send_data_migration: &VmSendMigrationData, + guest_mem: &GuestMemoryAtomic, + ) -> std::result::Result { + let mut threads = Vec::new(); + let mut channels = Vec::new(); + let cancel = Arc::new(AtomicBool::new(false)); + let (error_tx, error_rx) = std::sync::mpsc::channel::(); + + let additional_connections = send_data_migration.connections.get() - 1; + for n in 0..(additional_connections) { + let socket = (match send_migration_socket(send_data_migration) { + Err(e) if n == 0 => { + // If we encounter a problem on the first additional + // connection, we just assume the other side doesn't support + // multiple connections and carry on. + info!( + "Couldn't establish additional connections for sending VM memory: {e}, ignoring!" + ); + break; + } + otherwise => otherwise, + })?; + let guest_mem = guest_mem.clone(); + let (send, recv) = std::sync::mpsc::sync_channel::( + Self::BUFFERED_REQUESTS_PER_THREAD, + ); + let cancel = cancel.clone(); + let err_tx = error_tx.clone(); + + let thread = thread::spawn(move || { + info!("Spawned thread to send VM memory."); + + let mut total_sent = 0; + let mut socket = socket; + + for msg in recv { + match msg { + SendMemoryThreadMessage::Memory(table) => { + match vm_send_memory(&guest_mem, &mut socket, &table) { + Ok(()) => { + total_sent += table + .ranges() + .iter() + .map(|range| range.length) + .sum::(); + } + Err(e) => { + // Only the first thread that encounters an + // error sends it to the main thread. + if cancel.swap(true, Ordering::AcqRel) + && let Err(e) = err_tx.send(e) + { + error!("Could not send error to main thread: {e}"); + } + // After that we exit gracefully. Note that + // this also closes our mpsc channel. + break; + } + }; + } + SendMemoryThreadMessage::Barrier(barrier) => { + barrier.wait(); + } + SendMemoryThreadMessage::Disconnect => { + break; + } + } + } + info!("Sent {} MiB via additional connection.", total_sent >> 20); + }); + + threads.push(thread); + channels.push(send); + } + + Ok(Self { + guest_memory: guest_mem.clone(), + threads, + channels, + cancel, + error_rx, + }) + } + + /// Wait until all data that is in-flight has actually been sent and acknowledged. + fn wait_for_pending_data(&self) { + assert_eq!(self.channels.len(), self.threads.len()); + + // TODO We don't actually need the threads to block at the barrier. We + // can probably find a better implementation that involves less + // synchronization. + + let barrier = Arc::new(Barrier::new(self.channels.len() + 1)); + + for channel in &self.channels { + channel + .send(SendMemoryThreadMessage::Barrier(barrier.clone())) + // The unwrap only fails fi + .unwrap(); + } + + barrier.wait(); + } + + /// Send memory via all connections that we have. This may be just one. + /// `socket` is the original socket that was used to connect to the + /// destination. + /// + /// When this function returns, all memory has been sent and acknowledged. + fn send_memory( + &self, + table: &MemoryRangeTable, + socket: &mut SocketStream, + ) -> std::result::Result<(), MigratableError> { + let thread_len = self.threads.len(); + assert_eq!(thread_len, self.channels.len()); + + // In case, we didn't manage to establish additional connections, don't + // bother sending memory in chunks. 
This would just lower throughput, + // because we wait for a response after each chunk instead of sending + // everything in one go. + if thread_len == 0 { + vm_send_memory(&self.guest_memory, socket, table)?; + return Ok(()); + } + + // The chunk size is chosen to be big enough so that even very fast + // links need some milliseconds to send it. + 'next_partition: for chunk in table.partition(Self::CHUNK_SIZE) { + // If one of the workers encountered an error, we return it. + if self.cancel.load(Ordering::Acquire) { + return Err(self.error_rx.recv().unwrap()); + } + + let chunk = Arc::new(chunk); + + // Find the first free channel and send the chunk via it. + // + // TODO A better implementation wouldn't always start at the + // first thread, but go round-robin. + for channel in &self.channels { + match channel.try_send(SendMemoryThreadMessage::Memory(chunk.clone())) { + Ok(()) => continue 'next_partition, + Err(TrySendError::Full(_)) => { + // Try next channel. + } + Err(TrySendError::Disconnected(_)) => { + return Err(MigratableError::MigrateSend(anyhow!( + "Sending thread died?" + ))); + } + } + } + + // Fallback to sending the chunk via the control connection. + vm_send_memory(&self.guest_memory, socket, &chunk)?; + } + + self.wait_for_pending_data(); + + Ok(()) + } +} + +impl Drop for SendAdditionalConnections { + fn drop(&mut self) { + info!("Sending disconnect message to channels"); + self.channels.drain(..).for_each(|channel| { + // One of the workers may have died and thus closed the channel. + // Thus we cannot simply do send().unwrap(). + let e = channel.send(SendMemoryThreadMessage::Disconnect); + if let Err(e) = e { + error!("Could not send disconnect message to worker thread: {e}"); + } + }); + + info!("Waiting for threads to finish"); + self.threads + .drain(..) + .for_each(|thread| thread.join().unwrap()); + info!("Threads finished"); + } +} + +/// Establishes a connection to a migration destination socket (TCP or UNIX). +fn send_migration_socket( + send_data_migration: &VmSendMigrationData, +) -> std::result::Result { + if let Some(address) = send_data_migration.destination_url.strip_prefix("tcp:") { + info!("Connecting to TCP socket at {}", address); + + let socket = TcpStream::connect(address).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {}", e)) + })?; + + if send_data_migration.tls_dir.is_none() { + Ok(SocketStream::Tcp(socket)) + } else { + info!("Live Migration will be encrypted using TLS."); + // The address may still contain a port. I think we should build something more robust to also handle IPv6. + let tls_stream = tls::client_stream( + socket, + send_data_migration.tls_dir.as_ref().unwrap(), + address + .split_once(':') + .map(|(host, _)| host) + .unwrap_or(address), + )?; + Ok(SocketStream::Tls(Box::new(TlsStreamWrapper::new( + TlsStream::Client(tls_stream), + )))) + } + } else if let Some(path) = &send_data_migration.destination_url.strip_prefix("unix:") { + info!("Connecting to UNIX socket at {:?}", path); + + let socket = UnixStream::connect(path).map_err(|e| { + MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {}", e)) + })?; + + Ok(SocketStream::Unix(socket)) + } else { + Err(MigratableError::MigrateSend(anyhow!( + "Invalid destination: {}", + send_data_migration.destination_url + ))) + } +} + +/// Creates a listener socket for receiving incoming migration connections (TCP or UNIX). 
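As a quick sanity check of the throughput estimate given above for CHUNK_SIZE (effective_throughput = chunk_size / (chunk_size / throughput_per_connection + round_trip_time)), here is a small, self-contained calculation. The 10 Gbit/s link and 1 ms round-trip time are made-up example figures, not measured values:

// Effective per-connection throughput when every chunk must be acknowledged
// before the next one is sent (one RTT of dead time per chunk).
fn effective_throughput(chunk_bytes: f64, link_bytes_per_sec: f64, rtt_sec: f64) -> f64 {
    chunk_bytes / (chunk_bytes / link_bytes_per_sec + rtt_sec)
}

fn main() {
    let link = 10e9 / 8.0; // 10 Gbit/s expressed in bytes per second (example figure)
    let rtt = 0.001; // 1 ms round-trip time (example figure)

    let large = 64.0 * (1 << 20) as f64; // 64 MiB, the chunk size used above
    let small = 1.0 * (1 << 20) as f64; // 1 MiB, for comparison

    // With 64 MiB chunks the ACK round trip is almost free (~98% of line rate);
    // with 1 MiB chunks it costs roughly half the bandwidth (~46%).
    println!(
        "64 MiB chunks: {:.0}% of line rate",
        100.0 * effective_throughput(large, link, rtt) / link
    );
    println!(
        " 1 MiB chunks: {:.0}% of line rate",
        100.0 * effective_throughput(small, link, rtt) / link
    );
}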
+fn receive_migration_listener( + receiver_data_migration: &VmReceiveMigrationData, +) -> std::result::Result { + if let Some(address) = receiver_data_migration.receiver_url.strip_prefix("tcp:") { + let listener = TcpListener::bind(address).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {}", e)) + })?; + + if receiver_data_migration.tls_dir.is_none() { + Ok(ReceiveListener::Tcp(listener)) + } else { + Ok(ReceiveListener::Tls( + listener, + TlsConnectionWrapper::new(receiver_data_migration.tls_dir.as_ref().unwrap()), + )) + } + } else if let Some(path) = receiver_data_migration.receiver_url.strip_prefix("unix:") { + UnixListener::bind(path) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {}", e)) + }) + .map(|listener| ReceiveListener::Unix(listener, Some(path.into()))) + } else { + Err(MigratableError::MigrateSend(anyhow!( + "Invalid source: {}", + receiver_data_migration.receiver_url + ))) + } +} + +fn send_memory_regions( + guest_memory: &GuestMemoryAtomic, + ranges: &MemoryRangeTable, + fd: &mut SocketStream, +) -> std::result::Result<(), MigratableError> { + let mem = guest_memory.memory(); + + for range in ranges.regions() { + let mut offset: u64 = 0; + // Here we are manually handling the retry in case we can't the + // whole region at once because we can't use the implementation + // from vm-memory::GuestMemory of write_all_to() as it is not + // following the correct behavior. For more info about this issue + // see: https://github.com/rust-vmm/vm-memory/issues/174 + loop { + let bytes_written = mem + .write_volatile_to( + GuestAddress(range.gpa + offset), + fd, + (range.length - offset) as usize, + ) + .map_err(|e| { + MigratableError::MigrateSend(anyhow!( + "Error transferring memory to socket: {}", + e + )) + })?; + offset += bytes_written as u64; + + if offset == range.length { + break; + } + } + } + + Ok(()) } impl Vmm { @@ -723,16 +1593,15 @@ impl Vmm { thread::Builder::new() .name("vmm_signal_handler".to_string()) .spawn(move || { - if !signal_handler_seccomp_filter.is_empty() { - if let Err(e) = apply_filter(&signal_handler_seccomp_filter) - .map_err(Error::ApplySeccompFilter) - { - error!("Error applying seccomp filter: {:?}", e); - exit_evt.write(1).ok(); - return; - } + if !signal_handler_seccomp_filter.is_empty() && let Err(e) = apply_filter(&signal_handler_seccomp_filter) + .map_err(Error::ApplySeccompFilter) + { + error!("Error applying seccomp filter: {:?}", e); + exit_evt.write(1).ok(); + return; } - if landlock_enable{ + + if landlock_enable { match Landlock::new() { Ok(landlock) => { let _ = landlock.restrict_self().map_err(Error::ApplyLandlock).map_err(|e| { @@ -750,11 +1619,11 @@ impl Vmm { std::panic::catch_unwind(AssertUnwindSafe(|| { Vmm::signal_handler(signals, original_termios_opt, &exit_evt); })) - .map_err(|_| { - error!("vmm signal_handler thread panicked"); - exit_evt.write(1).ok() - }) - .ok(); + .map_err(|_| { + error!("vmm signal_handler thread panicked"); + exit_evt.write(1).ok() + }) + .ok(); }) .map_err(Error::SignalHandlerSpawn)?, ); @@ -777,6 +1646,7 @@ impl Vmm { let mut epoll = EpollContext::new().map_err(Error::Epoll)?; let reset_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; let activate_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; + let check_migration_evt = EventFd::new(EFD_NONBLOCK).map_err(Error::EventFdCreate)?; epoll .add_event(&exit_evt, EpollDispatch::Exit) @@ -799,6 +1669,10 @@ impl Vmm { .add_event(&debug_evt, 
EpollDispatch::Debug) .map_err(Error::Epoll)?; + epoll + .add_event(&check_migration_evt, EpollDispatch::CheckMigration) + .map_err(Error::Epoll)?; + Ok(Vmm { epoll, exit_evt, @@ -809,7 +1683,7 @@ impl Vmm { #[cfg(feature = "guest_debug")] vm_debug_evt, version: vmm_version, - vm: None, + vm: MaybeVmOwnership::None, vm_config: None, seccomp_action, hypervisor, @@ -819,29 +1693,183 @@ impl Vmm { original_termios_opt: Arc::new(Mutex::new(None)), console_resize_pipe: None, console_info: None, + check_migration_evt, + migration_thread_handle: None, }) } - fn vm_receive_config( + /// Try to receive a file descriptor from a socket. Returns the slot number and the file descriptor. + fn vm_receive_memory_fd( + socket: &mut SocketStream, + ) -> std::result::Result<(u32, File), MigratableError> { + if let SocketStream::Unix(unix_socket) = socket { + let mut buf = [0u8; 4]; + let (_, file) = unix_socket.recv_with_fd(&mut buf).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error receiving slot from socket: {}", e)) + })?; + + file.ok_or_else(|| MigratableError::MigrateReceive(anyhow!("Failed to receive socket"))) + .map(|file| (u32::from_le_bytes(buf), file)) + } else { + Err(MigratableError::MigrateReceive(anyhow!( + "Unsupported socket type" + ))) + } + } + + /// Handle a migration command and advance the protocol state machine. + /// + /// **Note**: This function is responsible for consuming any payloads! It also must + /// _not_ write any response to the socket. + fn vm_receive_migration_step( &mut self, + listener: &ReceiveListener, + socket: &mut SocketStream, + state: ReceiveMigrationState, req: &Request, - socket: &mut T, - existing_memory_files: Option>, - ) -> std::result::Result>, MigratableError> - where - T: Read + Write, - { - // Read in config data along with memory manager data - let mut data: Vec = Vec::new(); - data.resize_with(req.length() as usize, Default::default); - socket - .read_exact(&mut data) - .map_err(MigratableError::MigrateSocket)?; + receive_data_migration: &VmReceiveMigrationData, + ) -> std::result::Result { + use ReceiveMigrationState::*; + + let invalid_command = || { + Err(MigratableError::MigrateReceive(anyhow!( + "Can't handle command in current state" + ))) + }; - let vm_migration_config: VmMigrationConfig = - serde_json::from_slice(&data).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error deserialising config: {}", e)) - })?; + let mut configure_vm = + |socket: &mut SocketStream, + memory_files: HashMap| + -> std::result::Result { + let memory_manager = self.vm_receive_config( + req, + socket, + memory_files, + receive_data_migration.tcp_serial_url.clone(), + )?; + + if let Some(ref restored_net_configs) = receive_data_migration.net_fds { + // TODO do some validation + //restored_net_config.validate(); + // Update VM's net configurations with new fds received for restore operation + + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + + for net in restored_net_configs { + for net_config in vm_config.net.iter_mut().flatten() { + // update only if the net dev is backed by FDs + if net_config.id.as_ref() == Some(&net.id) && net_config.fds.is_some() { + log::debug!( + "overwriting net fds: id={}, old={:?}, new={:?}", + net.id, + &net_config.fds, + &net.fds + ); + net_config.fds.clone_from(&net.fds); + } + } + } + } + + let guest_memory = memory_manager.lock().unwrap().guest_memory(); + Ok(Configured( + memory_manager, + guest_memory.clone(), + listener + .try_clone() + .and_then(|l| ReceiveAdditionalConnections::new(l, 
guest_memory)) + .map_err(|e| { + MigratableError::MigrateReceive(anyhow!( + "Failed to create receive additional connections: {}", + e + )) + })?, + )) + }; + + let recv_memory_fd = + |socket: &mut SocketStream, + mut memory_files: Vec<(u32, File)>| + -> std::result::Result { + let (slot, file) = Self::vm_receive_memory_fd(socket)?; + + memory_files.push((slot, file)); + Ok(MemoryFdsReceived(memory_files)) + }; + + if req.command() == Command::Abandon { + return Ok(Aborted); + } + + match state { + Established => match req.command() { + Command::Start => Ok(Started), + _ => invalid_command(), + }, + Started => match req.command() { + Command::MemoryFd => recv_memory_fd(socket, Vec::new()), + Command::Config => configure_vm(socket, Default::default()), + _ => invalid_command(), + }, + MemoryFdsReceived(memory_files) => match req.command() { + Command::MemoryFd => recv_memory_fd(socket, memory_files), + Command::Config => configure_vm(socket, HashMap::from_iter(memory_files)), + _ => invalid_command(), + }, + Configured(memory_manager, guest_memory, receive_additional_connections) => { + match req.command() { + Command::Memory => { + vm_receive_memory(req, socket, &guest_memory)?; + Ok(Configured( + memory_manager, + guest_memory, + receive_additional_connections, + )) + } + Command::State => { + self.vm_receive_state(req, socket, memory_manager)?; + Ok(StateReceived) + } + _ => invalid_command(), + } + } + StateReceived => match req.command() { + Command::Complete => { + // The unwrap is safe, because the state machine makes sure we called + // vm_receive_state before, which creates the VM. + let vm = self.vm.vm_mut().unwrap(); + vm.resume()?; + Ok(Completed) + } + _ => invalid_command(), + }, + Completed | Aborted => { + unreachable!("Performed a step on the finished state machine") + } + } + } + + fn vm_receive_config( + &mut self, + req: &Request, + socket: &mut T, + existing_memory_files: HashMap, + tcp_serial_url: Option, + ) -> std::result::Result>, MigratableError> + where + T: Read, + { + // Read in config data along with memory manager data + let mut data: Vec = Vec::new(); + data.resize_with(req.length() as usize, Default::default); + socket + .read_exact(&mut data) + .map_err(MigratableError::MigrateSocket)?; + + let vm_migration_config: VmMigrationConfig = + serde_json::from_slice(&data).map_err(|e| { + MigratableError::MigrateReceive(anyhow!("Error deserialising config: {}", e)) + })?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] self.vm_check_cpuid_compatibility( @@ -851,6 +1879,12 @@ impl Vmm { let config = vm_migration_config.vm_config.clone(); self.vm_config = Some(vm_migration_config.vm_config); + + if let Some(tcp_serial_url) = tcp_serial_url { + let mut vm_config = self.vm_config.as_mut().unwrap().lock().unwrap(); + vm_config.serial.url = Some(tcp_serial_url); + } + self.console_info = Some(pre_create_console_devices(self).map_err(|e| { MigratableError::MigrateReceive(anyhow!("Error creating console devices: {:?}", e)) })?); @@ -884,6 +1918,11 @@ impl Vmm { )) })?; + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + if config.lock().unwrap().max_apic_id() > MAX_SUPPORTED_CPUS_LEGACY { + vm.enable_x2apic_api().unwrap(); + } + let phys_bits = vm::physical_bits(&self.hypervisor, config.lock().unwrap().cpus.max_phys_bits); @@ -896,8 +1935,6 @@ impl Vmm { false, Some(&vm_migration_config.memory_manager_data), existing_memory_files, - #[cfg(target_arch = "x86_64")] - None, ) .map_err(|e| { MigratableError::MigrateReceive(anyhow!( @@ -906,8 +1943,6 @@ impl Vmm { 
)) })?; - Response::ok().write_to(socket)?; - Ok(memory_manager) } @@ -918,7 +1953,7 @@ impl Vmm { mm: Arc>, ) -> std::result::Result<(), MigratableError> where - T: Read + Write, + T: Read, { // Read in state data let mut data: Vec = Vec::new(); @@ -971,132 +2006,194 @@ impl Vmm { // Create VM vm.restore().map_err(|e| { - Response::error().write_to(socket).ok(); MigratableError::MigrateReceive(anyhow!("Failed restoring the Vm: {}", e)) })?; - self.vm = Some(vm); - - Response::ok().write_to(socket)?; + self.vm = MaybeVmOwnership::Vmm(vm); Ok(()) } - fn vm_receive_memory( - &mut self, - req: &Request, - socket: &mut T, - memory_manager: &mut MemoryManager, - ) -> std::result::Result<(), MigratableError> - where - T: Read + ReadVolatile + Write, - { - // Read table - let table = MemoryRangeTable::read_from(socket, req.length())?; - - // And then read the memory itself - memory_manager - .receive_memory_regions(&table, socket) - .inspect_err(|_| { - Response::error().write_to(socket).ok(); - })?; - Response::ok().write_to(socket)?; - Ok(()) - } - - fn socket_url_to_path(url: &str) -> result::Result { - url.strip_prefix("unix:") - .ok_or_else(|| { - MigratableError::MigrateSend(anyhow!("Could not extract path from URL: {}", url)) - }) - .map(|s| s.into()) + fn can_increase_autoconverge_step(s: &MigrationState) -> bool { + if s.iteration < AUTO_CONVERGE_ITERATION_DELAY { + false + } else { + let iteration = s.iteration - AUTO_CONVERGE_ITERATION_DELAY; + iteration.is_multiple_of(AUTO_CONVERGE_ITERATION_INCREASE) + } } - fn send_migration_socket( - destination_url: &str, - ) -> std::result::Result { - if let Some(address) = destination_url.strip_prefix("tcp:") { - info!("Connecting to TCP socket at {}", address); + fn memory_copy_iterations( + vm: &mut Vm, + mem_send: &SendAdditionalConnections, + socket: &mut SocketStream, + s: &mut MigrationState, + migration_timeout: Duration, + migrate_downtime_limit: Duration, + ) -> result::Result { + let mut bandwidth = 0.0; + let mut iteration_table; - let socket = TcpStream::connect(address).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to TCP socket: {}", e)) - })?; + loop { + // todo: check if auto-converge is enabled at all? 
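            // Editor's note (illustrative annotation, not part of the original patch):
            // with `can_increase_autoconverge_step()` as defined above, the throttle can
            // first be raised once `s.iteration` reaches AUTO_CONVERGE_ITERATION_DELAY
            // and then again every AUTO_CONVERGE_ITERATION_INCREASE iterations, each
            // time by AUTO_CONVERGE_STEP_SIZE percent, capped at AUTO_CONVERGE_MAX.
            // As a worked example with hypothetical values DELAY = 5, INCREASE = 2,
            // STEP = 10 and MAX = 99, the throttle would grow by 10% at iterations
            // 5, 7, 9, ... until it saturates at 99%.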
+ if Self::can_increase_autoconverge_step(s) && vm.throttle_percent() < AUTO_CONVERGE_MAX + { + let current_throttle = vm.throttle_percent(); + let new_throttle = current_throttle + AUTO_CONVERGE_STEP_SIZE; + let new_throttle = std::cmp::min(new_throttle, AUTO_CONVERGE_MAX); + log::info!("Increasing auto-converge: {new_throttle}%"); + if new_throttle != current_throttle { + vm.set_throttle_percent(new_throttle); + } + } - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(destination_url)?; - info!("Connecting to UNIX socket at {:?}", path); + // Update the start time of the iteration + s.iteration_start_time = Instant::now(); - let socket = UnixStream::connect(&path).map_err(|e| { - MigratableError::MigrateSend(anyhow!("Error connecting to UNIX socket: {}", e)) - })?; + // Increment iteration counter + s.iteration += 1; - Ok(SocketStream::Unix(socket)) - } - } + // Check if migration has timed out + // migration_timeout > 0 means enabling the timeout check, 0 means disabling the timeout check + if !migration_timeout.is_zero() && s.start_time.elapsed() > migration_timeout { + warn!("Migration timed out after {:?}", migration_timeout); + Request::abandon().write_to(socket)?; + Response::read_from(socket)?.ok_or_abandon( + socket, + MigratableError::MigrateSend(anyhow!("Migration timed out")), + )?; + } - fn receive_migration_socket( - receiver_url: &str, - ) -> std::result::Result { - if let Some(address) = receiver_url.strip_prefix("tcp:") { - let listener = TcpListener::bind(address).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to TCP socket: {}", e)) - })?; + // Get the dirty page table + iteration_table = vm.dirty_log()?; - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on TCP socket: {}", - e - )) - })?; + // Update the pending size (amount of data to transfer) + s.pending_size = iteration_table + .regions() + .iter() + .map(|range| range.length) + .sum(); - Ok(SocketStream::Tcp(socket)) - } else { - let path = Vmm::socket_url_to_path(receiver_url)?; - let listener = UnixListener::bind(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error binding to UNIX socket: {}", e)) - })?; + // Update thresholds + if bandwidth > 0.0 { + s.threshold_size = bandwidth as u64 * migrate_downtime_limit.as_millis() as u64; + } - let (socket, _addr) = listener.accept().map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error accepting connection on UNIX socket: {}", - e - )) - })?; + // Enter the final stage of migration when the suspension conditions are met + if s.iteration > 1 && s.pending_size <= s.threshold_size { + break; + } - // Remove the UNIX socket file after accepting the connection - std::fs::remove_file(&path).map_err(|e| { - MigratableError::MigrateReceive(anyhow!("Error removing UNIX socket file: {}", e)) - })?; + // Update the number of dirty pages + s.total_transferred_bytes += s.pending_size; + s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); + s.total_transferred_dirty_pages += s.current_dirty_pages; + + // Send the current dirty pages + let transfer_start = Instant::now(); + mem_send.send_memory(&iteration_table, socket)?; + let transfer_time = transfer_start.elapsed().as_millis() as f64; + + // Update bandwidth + if transfer_time > 0.0 && s.pending_size > 0 { + bandwidth = s.pending_size as f64 / transfer_time; + // Convert bandwidth to MB/s + s.mb_per_sec = (bandwidth * 1000.0) / (1024.0 * 1024.0); + } - 
Ok(SocketStream::Unix(socket)) + // Update iteration cost time + s.iteration_cost_time = s.iteration_start_time.elapsed(); + if s.iteration_cost_time.as_millis() > 0 { + s.pages_per_second = + s.current_dirty_pages * 1000 / s.iteration_cost_time.as_millis() as u64; + } + debug!( + "iteration {}: cost={}ms, throttle={}%", + s.iteration, + s.iteration_cost_time.as_millis(), + vm.throttle_percent() + ); } + + Ok(iteration_table) } - // Returns true if there were dirty pages to send - fn vm_maybe_send_dirty_pages( + fn do_memory_migration( vm: &mut Vm, socket: &mut SocketStream, - ) -> result::Result { - // Send (dirty) memory table - let table = vm.dirty_log()?; + s: &mut MigrationState, + send_data_migration: &VmSendMigrationData, + ) -> result::Result<(), MigratableError> { + let mem_send = SendAdditionalConnections::new(send_data_migration, &vm.guest_memory())?; + + // Start logging dirty pages + vm.start_dirty_log()?; + + mem_send.send_memory(&vm.memory_range_table()?, socket)?; + + // Define the maximum allowed downtime 2000 seconds(2000000 milliseconds) + const MAX_MIGRATE_DOWNTIME: u64 = 2000000; + + // Verify that downtime must be between 1 and MAX_MIGRATE_DOWNTIME + if send_data_migration.downtime == 0 || send_data_migration.downtime > MAX_MIGRATE_DOWNTIME + { + return Err(MigratableError::MigrateSend(anyhow!( + "downtime_limit must be an integer in the range of 1 to {} ms", + MAX_MIGRATE_DOWNTIME + ))); + } - // But if there are no regions go straight to pause - if table.regions().is_empty() { - return Ok(false); + let migration_timeout = Duration::from_secs(send_data_migration.migration_timeout); + let migrate_downtime_limit = Duration::from_millis(send_data_migration.downtime); + + // Verify that downtime must be less than the migration timeout + if !migration_timeout.is_zero() && migrate_downtime_limit >= migration_timeout { + return Err(MigratableError::MigrateSend(anyhow!( + "downtime_limit {}ms must be less than migration_timeout {}ms", + send_data_migration.downtime, + send_data_migration.migration_timeout * 1000 + ))); } - Request::memory(table.length()).write_to(socket).unwrap(); - table.write_to(socket)?; - // And then the memory itself - vm.send_memory_regions(&table, socket)?; - Response::read_from(socket)?.ok_or_abandon( + let iteration_table = Self::memory_copy_iterations( + vm, + &mem_send, socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), + s, + migration_timeout, + migrate_downtime_limit, )?; - Ok(true) + info!("Entering downtime phase"); + s.downtime_start = Instant::now(); + // End throttle thread + info!("stopping vcpu thread"); + vm.stop_vcpu_throttling(); + info!("stopped vcpu thread"); + info!("pausing VM"); + vm.pause()?; + info!("paused VM"); + + // Send last batch of dirty pages + let mut final_table = vm.dirty_log()?; + final_table.extend(iteration_table.clone()); + mem_send.send_memory(&final_table, socket)?; + // Update statistics + s.pending_size = final_table.regions().iter().map(|range| range.length).sum(); + s.total_transferred_bytes += s.pending_size; + s.current_dirty_pages = s.pending_size.div_ceil(PAGE_SIZE as u64); + s.total_transferred_dirty_pages += s.current_dirty_pages; + + // Stop logging dirty pages + vm.stop_dirty_log()?; + + Ok(()) } + /// Performs a live-migration. + /// + /// This function performs necessary after-migration cleanup only in the + /// good case. Callers are responsible for properly handling failed + /// migrations. 
fn send_migration( vm: &mut Vm, #[cfg(all(feature = "kvm", target_arch = "x86_64"))] hypervisor: Arc< @@ -1104,8 +2201,10 @@ impl Vmm { >, send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { + let mut s = MigrationState::new(); + // Set up the socket connection - let mut socket = Self::send_migration_socket(&send_data_migration.destination_url)?; + let mut socket = send_migration_socket(&send_data_migration)?; // Start the migration Request::start().write_to(&mut socket)?; @@ -1125,18 +2224,26 @@ impl Vmm { ))); }; - let amx = vm_config.lock().unwrap().cpus.features.amx; - let phys_bits = - vm::physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); + let (amx, phys_bits, profile, kvm_hyperv) = { + let guard = vm_config.lock().unwrap(); + let amx = guard.cpus.features.amx; + let max_phys_bits = guard.cpus.max_phys_bits; + let profile = guard.cpus.profile; + let kvm_hyperv = guard.cpus.kvm_hyperv; + // Drop lock before function call + core::mem::drop(guard); + let phys_bits = vm::physical_bits(&hypervisor, max_phys_bits); + (amx, phys_bits, profile, kvm_hyperv) + }; arch::generate_common_cpuid( &hypervisor, &arch::CpuidConfig { - sgx_epc_sections: None, phys_bits, - kvm_hyperv: vm_config.lock().unwrap().cpus.kvm_hyperv, + kvm_hyperv, #[cfg(feature = "tdx")] tdx: false, amx, + profile, }, ) .map_err(|e| { @@ -1155,6 +2262,11 @@ impl Vmm { "--local option is not supported with TCP sockets", ))); } + SocketStream::Tls(_tls_socket) => { + return Err(MigratableError::MigrateSend(anyhow!( + "--local option is not supported with TCP sockets", + ))); + } } } @@ -1181,36 +2293,7 @@ impl Vmm { // Now pause VM vm.pause()?; } else { - // Start logging dirty pages - vm.start_dirty_log()?; - - // Send memory table - let table = vm.memory_range_table()?; - Request::memory(table.length()) - .write_to(&mut socket) - .unwrap(); - table.write_to(&mut socket)?; - // And then the memory itself - vm.send_memory_regions(&table, &mut socket)?; - Response::read_from(&mut socket)?.ok_or_abandon( - &mut socket, - MigratableError::MigrateSend(anyhow!("Error during dirty memory migration")), - )?; - - // Try at most 5 passes of dirty memory sending - const MAX_DIRTY_MIGRATIONS: usize = 5; - for i in 0..MAX_DIRTY_MIGRATIONS { - info!("Dirty memory migration {} of {}", i, MAX_DIRTY_MIGRATIONS); - if !Self::vm_maybe_send_dirty_pages(vm, &mut socket)? { - break; - } - } - - // Now pause VM - vm.pause()?; - - // Send last batch of dirty pages - Self::vm_maybe_send_dirty_pages(vm, &mut socket)?; + Self::do_memory_migration(vm, &mut socket, &mut s, &send_data_migration)?; } // We release the locks early to enable locking them on the destination host. @@ -1218,6 +2301,14 @@ impl Vmm { vm.release_disk_locks() .map_err(|e| MigratableError::UnlockError(anyhow!("{e}")))?; + #[cfg(feature = "kvm")] + // Prevent signal handler to access thread local storage when signals are received + // close to the end when thread-local storage is already destroyed. 
+ { + let mut lock = IS_IN_SHUTDOWN.write().unwrap(); + *lock = true; + } + // Capture snapshot and send it let vm_snapshot = vm.snapshot()?; let snapshot_data = serde_json::to_vec(&vm_snapshot).unwrap(); @@ -1237,11 +2328,17 @@ impl Vmm { MigratableError::MigrateSend(anyhow!("Error completing migration")), )?; + // Record downtime + s.downtime = s.downtime_start.elapsed(); + // Stop logging dirty pages if !send_data_migration.local { vm.stop_dirty_log()?; } + // Record total migration time + s.total_time = s.start_time.elapsed(); + info!("Migration complete"); // Let every Migratable object know about the migration being complete @@ -1262,20 +2359,30 @@ impl Vmm { }; // We check the `CPUID` compatibility of between the source vm and destination, which is - // mostly about feature compatibility and "topology/sgx" leaves are not relevant. + // mostly about feature compatibility. let dest_cpuid = &{ let vm_config = &src_vm_config.lock().unwrap(); + if vm_config.cpus.features.amx { + // Need to enable AMX tile state components before generating common cpuid + // as this affects what Hypervisor::get_supported_cpuid returns. + hypervisor::arch::x86::XsaveState::enable_amx_state_components( + self.hypervisor.as_ref(), + ) + .context("Unable to enable AMX before generating common CPUID") + .map_err(MigratableError::MigrateReceive)?; + } + let phys_bits = vm::physical_bits(&self.hypervisor, vm_config.cpus.max_phys_bits); arch::generate_common_cpuid( &self.hypervisor.clone(), &arch::CpuidConfig { - sgx_epc_sections: None, phys_bits, kvm_hyperv: vm_config.cpus.kvm_hyperv, #[cfg(feature = "tdx")] tdx: false, amx: vm_config.cpus.features.amx, + profile: vm_config.cpus.profile, }, ) .map_err(|e| { @@ -1296,6 +2403,10 @@ impl Vmm { vm_config: Arc>, prefault: bool, ) -> std::result::Result<(), VmError> { + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } + let snapshot = recv_vm_state(source_url).map_err(VmError::Restore)?; #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let vm_snapshot = get_vm_snapshot(&snapshot).map_err(VmError::Restore)?; @@ -1338,7 +2449,7 @@ impl Vmm { Some(source_url), Some(prefault), )?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); if self .vm_config @@ -1353,10 +2464,53 @@ impl Vmm { } // Now we can restore the rest of the VM. - if let Some(ref mut vm) = self.vm { - vm.restore() - } else { - Err(VmError::VmNotCreated) + // PANIC: won't panic, we just checked that the VM is there. + self.vm.vm_mut().unwrap().restore() + } + + /// Checks the migration result. + /// + /// This should be called when the migration thread indicated a state + /// change (and therefore, its termination). The function checks the result + /// of that thread and either shuts down the VMM on success or keeps the VM + /// and the VMM running on migration failure. + fn check_migration_result(&mut self) { + // At this point, the thread must be finished. + // If we fail here, we have lost anyway. Just panic. + let (vm, migration_res) = self + .migration_thread_handle + .take() + .expect("should have thread") + .join() + .expect("should have joined"); + + // Give VMM back control. 
+ self.vm = MaybeVmOwnership::Vmm(vm); + + match migration_res { + Ok(()) => { + { + info!("Sending Receiver in HTTP thread that migration succeeded"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Ok(())).unwrap(); + } + + // Shutdown the VM after the migration succeeded + if let Err(e) = self.exit_evt.write(1) { + error!("Failed shutting down the VM after migration: {}", e); + } + } + Err(e) => { + error!("Migration failed: {}", e); + { + info!("Sending Receiver in HTTP thread that migration failed"); + let (sender, _) = &*ONGOING_LIVEMIGRATION; + // unblock API call; propagate migration result + sender.send(Err(e)).unwrap(); + } + // we don't fail the VMM here, it just continues running its VM + } } } @@ -1410,7 +2564,7 @@ impl Vmm { self.vm_reboot().map_err(Error::VmReboot)?; } EpollDispatch::ActivateVirtioDevices => { - if let Some(ref vm) = self.vm { + if let MaybeVmOwnership::Vmm(ref vm) = self.vm { let count = self.activate_evt.read().map_err(Error::EventFdRead)?; info!( "Trying to activate pending virtio devices: count = {}", @@ -1438,7 +2592,7 @@ impl Vmm { // Read from the API receiver channel let gdb_request = gdb_receiver.recv().map_err(Error::GdbRequestRecv)?; - let response = if let Some(ref mut vm) = self.vm { + let response = if let MaybeVmOwnership::Vmm(ref mut vm) = self.vm { vm.debug_request(&gdb_request.payload, gdb_request.cpu_id) } else { Err(VmError::VmNotRunning) @@ -1453,6 +2607,14 @@ impl Vmm { } #[cfg(not(feature = "guest_debug"))] EpollDispatch::Debug => {} + EpollDispatch::CheckMigration => { + info!("VM migration check event"); + // Consume the event. + self.check_migration_evt + .read() + .map_err(Error::EventFdRead)?; + self.check_migration_result(); + } } } } @@ -1506,102 +2668,116 @@ impl RequestHandler for Vmm { tracer::start(); info!("Booting VM"); event!("vm", "booting"); - let r = { - trace_scoped!("vm_boot"); - // If we don't have a config, we cannot boot a VM. - if self.vm_config.is_none() { - return Err(VmError::VmMissingConfig); - }; - // console_info is set to None in vm_shutdown. re-populate here if empty - if self.console_info.is_none() { - self.console_info = - Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); - } - - // Create a new VM if we don't have one yet. - if self.vm.is_none() { - let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; - let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; - #[cfg(feature = "guest_debug")] - let vm_debug_evt = self - .vm_debug_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - let activate_evt = self - .activate_evt - .try_clone() - .map_err(VmError::EventFdClone)?; - - if let Some(ref vm_config) = self.vm_config { - let vm = Vm::new( - Arc::clone(vm_config), - exit_evt, - reset_evt, - #[cfg(feature = "guest_debug")] - vm_debug_evt, - &self.seccomp_action, - self.hypervisor.clone(), - activate_evt, - self.console_info.clone(), - self.console_resize_pipe.clone(), - Arc::clone(&self.original_termios_opt), - None, - None, - None, - )?; - - self.vm = Some(vm); - } + if matches!(self.vm, MaybeVmOwnership::Migration) { + return Err(VmError::VmMigrating); + } + + trace_scoped!("vm_boot"); + // If we don't have a config, we cannot boot a VM. + if self.vm_config.is_none() { + return Err(VmError::VmMissingConfig); + }; + + // console_info is set to None in vm_shutdown. 
re-populate here if empty + if self.console_info.is_none() { + self.console_info = + Some(pre_create_console_devices(self).map_err(VmError::CreateConsoleDevices)?); + } + + // Create a new VM if we don't have one yet. + if matches!(self.vm, MaybeVmOwnership::None) { + let exit_evt = self.exit_evt.try_clone().map_err(VmError::EventFdClone)?; + let reset_evt = self.reset_evt.try_clone().map_err(VmError::EventFdClone)?; + #[cfg(feature = "guest_debug")] + let vm_debug_evt = self + .vm_debug_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + let activate_evt = self + .activate_evt + .try_clone() + .map_err(VmError::EventFdClone)?; + + if let Some(ref vm_config) = self.vm_config { + let vm = Vm::new( + Arc::clone(vm_config), + exit_evt, + reset_evt, + #[cfg(feature = "guest_debug")] + vm_debug_evt, + &self.seccomp_action, + self.hypervisor.clone(), + activate_evt, + self.console_info.clone(), + self.console_resize_pipe.clone(), + Arc::clone(&self.original_termios_opt), + None, + None, + None, + )?; + + self.vm = MaybeVmOwnership::Vmm(vm); } + } - // Now we can boot the VM. - if let Some(ref mut vm) = self.vm { - vm.boot() - } else { - Err(VmError::VmNotCreated) + // Now we can boot the VM. + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.boot()?; + event!("vm", "booted"); } - }; - tracer::end(); - if r.is_ok() { - event!("vm", "booted"); + MaybeVmOwnership::None => { + return Err(VmError::VmNotCreated); + } + _ => unreachable!(), } - r + + tracer::end(); + Ok(()) } fn vm_pause(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.pause().map_err(VmError::Pause) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.pause().map_err(VmError::Pause), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_resume(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.resume().map_err(VmError::Resume) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.resume().map_err(VmError::Resume), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_snapshot(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - // Drain console_info so that FDs are not reused - let _ = self.console_info.take(); - vm.snapshot() - .map_err(VmError::Snapshot) - .and_then(|snapshot| { - vm.send(&snapshot, destination_url) - .map_err(VmError::SnapshotSend) - }) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + // Drain console_info so that FDs are not reused + let _ = self.console_info.take(); + vm.snapshot() + .map_err(VmError::Snapshot) + .and_then(|snapshot| { + vm.send(&snapshot, destination_url) + .map_err(VmError::SnapshotSend) + }) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + MaybeVmOwnership::None => Err(VmError::VmNotRunning)?, } } fn vm_restore(&mut self, restore_cfg: RestoreConfig) -> result::Result<(), VmError> { - if self.vm.is_some() || self.vm_config.is_some() { + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => return Err(VmError::VmAlreadyCreated), + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => (), + }; + + if self.vm_config.is_some() { return Err(VmError::VmAlreadyCreated); } @@ -1626,7 +2802,7 @@ impl RequestHandler for 
Vmm { for net in restored_nets.iter() { for net_config in vm_net_configs.iter_mut() { // update only if the net dev is backed by FDs - if net_config.id == Some(net.id.clone()) && net_config.fds.is_some() { + if net_config.id.as_ref() == Some(&net.id) && net_config.fds.is_some() { net_config.fds.clone_from(&net.fds); } } @@ -1648,21 +2824,25 @@ impl RequestHandler for Vmm { #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] fn vm_coredump(&mut self, destination_url: &str) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.coredump(destination_url).map_err(VmError::Coredump) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.coredump(destination_url).map_err(VmError::Coredump) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_shutdown(&mut self) -> result::Result<(), VmError> { - let r = if let Some(ref mut vm) = self.vm.take() { - // Drain console_info so that the FDs are not reused - let _ = self.console_info.take(); - vm.shutdown() - } else { - Err(VmError::VmNotRunning) + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + // Drain console_info so that the FDs are not reused + let _ = self.console_info.take(); + let r = vm.shutdown(); + self.vm = MaybeVmOwnership::None; if r.is_ok() { event!("vm", "shutdown"); @@ -1675,13 +2855,14 @@ impl RequestHandler for Vmm { event!("vm", "rebooting"); // First we stop the current VM - let config = if let Some(mut vm) = self.vm.take() { - let config = vm.get_config(); - vm.shutdown()?; - config - } else { - return Err(VmError::VmNotCreated); + let vm = match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm, + MaybeVmOwnership::Migration => return Err(VmError::VmMigrating), + MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + let config = vm.get_config(); + vm.shutdown()?; + self.vm = MaybeVmOwnership::None; // vm.shutdown() closes all the console devices, so set console_info to None // so that the closed FD #s are not reused. @@ -1730,7 +2911,7 @@ impl RequestHandler for Vmm { // And we boot it vm.boot()?; - self.vm = Some(vm); + self.vm = MaybeVmOwnership::Vmm(vm); event!("vm", "rebooted"); @@ -1738,33 +2919,38 @@ impl RequestHandler for Vmm { } fn vm_info(&self) -> result::Result { - match &self.vm_config { - Some(vm_config) => { - let state = match &self.vm { - Some(vm) => vm.get_state()?, - None => VmState::Created, - }; - let config = vm_config.lock().unwrap().clone(); - - let mut memory_actual_size = config.memory.total_size(); - if let Some(vm) = &self.vm { - memory_actual_size -= vm.balloon_size(); - } - - let device_tree = self - .vm - .as_ref() - .map(|vm| vm.device_tree().lock().unwrap().clone()); + let vm_config = self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + let vm_config = vm_config.lock().unwrap().clone(); + + let state = match &self.vm { + MaybeVmOwnership::Vmm(vm) => vm.get_state()?, + // TODO in theory one could live-migrate a non-running VM .. 
+ MaybeVmOwnership::Migration => VmState::Running, + MaybeVmOwnership::None => VmState::Created, + }; - Ok(VmInfoResponse { - config: Box::new(config), - state, - memory_actual_size, - device_tree, - }) + let mut memory_actual_size = vm_config.memory.total_size(); + match &self.vm { + MaybeVmOwnership::Vmm(vm) => { + memory_actual_size -= vm.balloon_size(); } - None => Err(VmError::VmNotCreated), + MaybeVmOwnership::Migration => {} + MaybeVmOwnership::None => {} } + + let device_tree = match &self.vm { + MaybeVmOwnership::Vmm(vm) => Some(vm.device_tree().lock().unwrap().clone()), + // TODO we need to fix this + MaybeVmOwnership::Migration => None, + MaybeVmOwnership::None => None, + }; + + Ok(VmInfoResponse { + config: Box::new(vm_config), + state, + memory_actual_size, + device_tree, + }) } fn vmm_ping(&self) -> VmmPingResponse { @@ -1786,14 +2972,19 @@ impl RequestHandler for Vmm { return Ok(()); } - // If a VM is booted, we first try to shut it down. - if self.vm.is_some() { - self.vm_shutdown()?; - } + match &self.vm { + MaybeVmOwnership::Vmm(_vm) => { + event!("vm", "deleted"); - self.vm_config = None; - - event!("vm", "deleted"); + // If a VM is booted, we first try to shut it down. + self.vm_shutdown()?; + self.vm_config = None; + } + MaybeVmOwnership::None => { + self.vm_config = None; + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating)?, + } Ok(()) } @@ -1806,61 +2997,91 @@ impl RequestHandler for Vmm { fn vm_resize( &mut self, - desired_vcpus: Option, + desired_vcpus: Option, desired_ram: Option, desired_balloon: Option, ) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - if let Err(e) = vm.resize(desired_vcpus, desired_ram, desired_balloon) { - error!("Error when resizing VM: {:?}", e); - Err(e) - } else { - Ok(()) - } - } else { - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - if let Some(desired_vcpus) = desired_vcpus { - config.cpus.boot_vcpus = desired_vcpus; - } - if let Some(desired_ram) = desired_ram { - config.memory.size = desired_ram; + if desired_vcpus.is_some() { + todo!("doesn't work currently with our thread-local KVM_RUN approach"); + } + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize(desired_vcpus, desired_ram, desired_balloon) { + error!("Error when resizing VM: {:?}", e); + Err(e) + } else { + Ok(()) + } } - if let Some(desired_balloon) = desired_balloon { - if let Some(balloon_config) = &mut config.balloon { + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + if let Some(desired_vcpus) = desired_vcpus { + config.cpus.boot_vcpus = desired_vcpus; + } + if let Some(desired_ram) = desired_ram { + config.memory.size = desired_ram; + } + if let Some(desired_balloon) = desired_balloon + && let Some(balloon_config) = &mut config.balloon + { balloon_config.size = desired_balloon; } + + Ok(()) } - Ok(()) } } + fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> result::Result<(), VmError> { + info!("request to resize disk: id={id}"); + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize_disk(id, desired_size) { + error!("Error when resizing disk: {:?}", e); + Err(e) + } else { + Ok(()) + } + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::ResizeDisk), + } + } 
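    // Editor's note: the `MaybeVmOwnership` wrapper used throughout this patch is not
    // defined in this excerpt. The sketch below is inferred from its call sites
    // (`vm_mut()`, `take_vm_for_migration()` and the three match arms) and is only an
    // assumption about its shape, not the actual implementation from the patch.

    enum MaybeVmOwnership {
        /// No VM has been created yet.
        None,
        /// The VMM owns the VM and can service API requests against it.
        Vmm(Vm),
        /// The VM was handed to the migration thread; requests that would touch
        /// the VM fail with `VmError::VmMigrating` until ownership returns.
        Migration,
    }

    impl MaybeVmOwnership {
        /// Mutable access to the VM while the VMM owns it.
        fn vm_mut(&mut self) -> Option<&mut Vm> {
            match self {
                MaybeVmOwnership::Vmm(vm) => Some(vm),
                _ => None,
            }
        }

        /// Hand the VM over to the migration thread, leaving `Migration` behind
        /// so that concurrent API calls observe the ongoing migration.
        fn take_vm_for_migration(&mut self) -> Vm {
            match std::mem::replace(self, MaybeVmOwnership::Migration) {
                MaybeVmOwnership::Vmm(vm) => vm,
                _ => panic!("no VM available for migration"),
            }
        }
    }

    // Likewise, the migration thread body (`MigrationWorker::run()`) is assumed to
    // return the VM together with the migration result, i.e.
    // `(Vm, Result<(), MigratableError>)`, and to notify `check_migration_evt` so that
    // `check_migration_result()` can join the thread and take ownership back.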
fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; - if let Some(ref mut vm) = self.vm { - if let Err(e) = vm.resize_zone(id, desired_ram) { - error!("Error when resizing VM: {:?}", e); - Err(e) - } else { - Ok(()) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.resize_zone(id, desired_ram) { + error!("Error when resizing VM: {:?}", e); + Err(e) + } else { + Ok(()) + } } - } else { - // Update VmConfig by setting the new desired ram. - let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; - - if let Some(zones) = &mut memory_config.zones { - for zone in zones.iter_mut() { - if zone.id == id { - zone.size = desired_ram; - return Ok(()); + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by setting the new desired ram. + let memory_config = &mut self.vm_config.as_ref().unwrap().lock().unwrap().memory; + + if let Some(zones) = &mut memory_config.zones { + for zone in zones.iter_mut() { + if zone.id == id { + zone.size = desired_ram; + return Ok(()); + } } } - } - error!("Could not find the memory zone {} for the resize", id); - Err(VmError::ResizeZone) + error!("Could not find the memory zone {} for the resize", id); + Err(VmError::ResizeZone) + } } } @@ -1877,19 +3098,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_device(device_cfg).map_err(|e| { - error!("Error when adding new device to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_device(device_cfg).map_err(|e| { + error!("Error when adding new device to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.devices, device_cfg); + Ok(None) + } } } @@ -1906,39 +3131,49 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_user_device(device_cfg).map_err(|e| { - error!("Error when adding new user device to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.user_devices, device_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_user_device(device_cfg).map_err(|e| { + error!("Error when adding new user device to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.user_devices, device_cfg); + Ok(None) + } } } fn vm_remove_device(&mut self, id: String) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - if let Err(e) = vm.remove_device(id) { - error!("Error when removing device from the VM: {:?}", e); - Err(e) - } else { - Ok(()) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + if let Err(e) = vm.remove_device(id) { + error!("Error when removing device from the VM: {:?}", e); + Err(e) + } else { + Ok(()) + } } - } else if let Some(ref config) = self.vm_config { - let mut config = config.lock().unwrap(); - if config.remove_device(&id) { - Ok(()) - } else { - Err(VmError::NoDeviceToRemove(id)) + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + if let Some(ref config) = self.vm_config { + let mut config = config.lock().unwrap(); + if config.remove_device(&id) { + Ok(()) + } else { + Err(VmError::NoDeviceToRemove(id)) + } + } else { + Err(VmError::VmNotCreated) + } } - } else { - Err(VmError::VmNotCreated) } } @@ -1952,19 +3187,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_disk(disk_cfg).map_err(|e| { - error!("Error when adding new disk to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.disks, disk_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_disk(disk_cfg).map_err(|e| { + error!("Error when adding new disk to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.disks, disk_cfg); + Ok(None) + } } } @@ -1978,19 +3217,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_fs(fs_cfg).map_err(|e| { - error!("Error when adding new fs to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.fs, fs_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_fs(fs_cfg).map_err(|e| { + error!("Error when adding new fs to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.fs, fs_cfg); + Ok(None) + } } } @@ -2004,19 +3247,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_pmem(pmem_cfg).map_err(|e| { - error!("Error when adding new pmem device to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.pmem, pmem_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_pmem(pmem_cfg).map_err(|e| { + error!("Error when adding new pmem device to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.pmem, pmem_cfg); + Ok(None) + } } } @@ -2030,19 +3277,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_net(net_cfg).map_err(|e| { - error!("Error when adding new network device to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.net, net_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_net(net_cfg).map_err(|e| { + error!("Error when adding new network device to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.net, net_cfg); + Ok(None) + } } } @@ -2056,19 +3307,23 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vdpa(vdpa_cfg).map_err(|e| { - error!("Error when adding new vDPA device to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - add_to_config(&mut config.vdpa, vdpa_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vdpa(vdpa_cfg).map_err(|e| { + error!("Error when adding new vDPA device to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. 
+ let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + add_to_config(&mut config.vdpa, vdpa_cfg); + Ok(None) + } } } @@ -2087,49 +3342,55 @@ impl RequestHandler for Vmm { config.validate().map_err(VmError::ConfigValidation)?; } - if let Some(ref mut vm) = self.vm { - let info = vm.add_vsock(vsock_cfg).map_err(|e| { - error!("Error when adding new vsock device to the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - // Update VmConfig by adding the new device. - let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); - config.vsock = Some(vsock_cfg); - Ok(None) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.add_vsock(vsock_cfg).map_err(|e| { + error!("Error when adding new vsock device to the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => { + // Update VmConfig by adding the new device. + let mut config = self.vm_config.as_ref().unwrap().lock().unwrap(); + config.vsock = Some(vsock_cfg); + Ok(None) + } } } fn vm_counters(&mut self) -> result::Result>, VmError> { - if let Some(ref mut vm) = self.vm { - let info = vm.counters().map_err(|e| { - error!("Error when getting counters from the VM: {:?}", e); - e - })?; - serde_json::to_vec(&info) - .map(Some) - .map_err(VmError::SerializeJson) - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => { + let info = vm.counters().map_err(|e| { + error!("Error when getting counters from the VM: {:?}", e); + e + })?; + serde_json::to_vec(&info) + .map(Some) + .map_err(VmError::SerializeJson) + } + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_power_button(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.power_button() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.power_button(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } fn vm_nmi(&mut self) -> result::Result<(), VmError> { - if let Some(ref mut vm) = self.vm { - vm.nmi() - } else { - Err(VmError::VmNotRunning) + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.nmi(), + MaybeVmOwnership::Migration => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::VmNotRunning), } } @@ -2138,128 +3399,45 @@ impl RequestHandler for Vmm { receive_data_migration: VmReceiveMigrationData, ) -> result::Result<(), MigratableError> { info!( - "Receiving migration: receiver_url = {}", - receive_data_migration.receiver_url + "Receiving migration: receiver_url = {}, net_fds={:?}", + receive_data_migration.receiver_url, &receive_data_migration.net_fds ); + let mut listener = receive_migration_listener(&receive_data_migration)?; // Accept the connection and get the socket - let mut socket = Vmm::receive_migration_socket(&receive_data_migration.receiver_url)?; - - let mut started = false; - let mut memory_manager: Option>> = None; - let mut existing_memory_files = None; - loop { - let req = Request::read_from(&mut socket)?; - match req.command() { - Command::Invalid => info!("Invalid Command Received"), - Command::Start => { - info!("Start Command Received"); - started = true; - - Response::ok().write_to(&mut socket)?; - } - Command::Config => { - info!("Config 
Command Received"); + let mut socket = listener.accept().map_err(|e| { + warn!("Failed to accept migration connection: {}", e); + MigratableError::MigrateReceive(anyhow!("Failed to accept migration connection: {}", e)) + })?; - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - memory_manager = Some(self.vm_receive_config( - &req, - &mut socket, - existing_memory_files.take(), - )?); - } - Command::State => { - info!("State Command Received"); + let mut state = ReceiveMigrationState::Established; - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - if let Some(mm) = memory_manager.take() { - self.vm_receive_state(&req, &mut socket, mm)?; - } else { - warn!("Configuration not sent yet"); - Response::error().write_to(&mut socket)?; - } - } - Command::Memory => { - info!("Memory Command Received"); + while !state.finished() { + let req = Request::read_from(&mut socket)?; + trace!("Command {:?} received", req.command()); - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - if let Some(mm) = memory_manager.as_ref() { - self.vm_receive_memory(&req, &mut socket, &mut mm.lock().unwrap())?; - } else { - warn!("Configuration not sent yet"); - Response::error().write_to(&mut socket)?; - } + let (response, new_state) = match self.vm_receive_migration_step( + &listener, + &mut socket, + state, + &req, + &receive_data_migration, + ) { + Ok(next_state) => (Response::ok(), next_state), + Err(err) => { + warn!("Migration command {:?} failed: {}", req.command(), err); + (Response::error(), ReceiveMigrationState::Aborted) } - Command::MemoryFd => { - info!("MemoryFd Command Received"); - - if !started { - warn!("Migration not started yet"); - Response::error().write_to(&mut socket)?; - continue; - } - - match &mut socket { - SocketStream::Unix(unix_socket) => { - let mut buf = [0u8; 4]; - let (_, file) = unix_socket.recv_with_fd(&mut buf).map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error receiving slot from socket: {}", - e - )) - })?; - - if existing_memory_files.is_none() { - existing_memory_files = Some(HashMap::default()) - } + }; - if let Some(ref mut existing_memory_files) = existing_memory_files { - let slot = u32::from_le_bytes(buf); - existing_memory_files.insert(slot, file.unwrap()); - } + state = new_state; + assert_eq!(response.length(), 0); + response.write_to(&mut socket)?; + } - Response::ok().write_to(&mut socket)?; - } - SocketStream::Tcp(_tcp_socket) => { - // For TCP sockets, we cannot transfer file descriptors - warn!( - "MemoryFd command received over TCP socket, which is not supported" - ); - Response::error().write_to(&mut socket)?; - } - } - } - Command::Complete => { - info!("Complete Command Received"); - if let Some(ref mut vm) = self.vm.as_mut() { - vm.resume()?; - Response::ok().write_to(&mut socket)?; - } else { - warn!("VM not created yet"); - Response::error().write_to(&mut socket)?; - } - break; - } - Command::Abandon => { - info!("Abandon Command Received"); - self.vm = None; - self.vm_config = None; - Response::ok().write_to(&mut socket).ok(); - break; - } - } + if let ReceiveMigrationState::Aborted = state { + self.vm = MaybeVmOwnership::None; + self.vm_config = None; } Ok(()) @@ -2269,6 +3447,18 @@ impl RequestHandler for Vmm { &mut self, send_data_migration: VmSendMigrationData, ) -> result::Result<(), MigratableError> { + match self.vm { + MaybeVmOwnership::Vmm(_) => 
(), + MaybeVmOwnership::Migration => { + return Err(MigratableError::MigrateSend(anyhow!( + "There is already an ongoing migration" + ))); + } + MaybeVmOwnership::None => { + return Err(MigratableError::MigrateSend(anyhow!("VM is not running"))); + } + }; + info!( "Sending migration: destination_url = {}, local = {}", send_data_migration.destination_url, send_data_migration.local @@ -2288,42 +3478,29 @@ impl RequestHandler for Vmm { ))); } - if let Some(vm) = self.vm.as_mut() { - Self::send_migration( - vm, - #[cfg(all(feature = "kvm", target_arch = "x86_64"))] - self.hypervisor.clone(), - send_data_migration.clone(), - ) - .map_err(|migration_err| { - error!("Migration failed: {:?}", migration_err); - - // Stop logging dirty pages only for non-local migrations - if !send_data_migration.local { - if let Err(e) = vm.stop_dirty_log() { - return e; - } - } + // Take VM ownership. This also means that API events can no longer + // change the VM (e.g. net device hotplug). + let vm = self.vm.take_vm_for_migration(); - if vm.get_state().unwrap() == VmState::Paused { - if let Err(e) = vm.resume() { - return e; - } - } + // Start migration thread + let worker = MigrationWorker { + vm, + check_migration_evt: self.check_migration_evt.try_clone().unwrap(), + config: send_data_migration, + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + hypervisor: self.hypervisor.clone(), + }; - migration_err - })?; + self.migration_thread_handle = Some( + thread::Builder::new() + .name("migration".into()) + .spawn(move || worker.run()) + // For upstreaming, we should simply continue and return an + // error when this fails. For our PoC, this is fine. + .unwrap(), + ); - // Shutdown the VM after the migration succeeded - self.exit_evt.write(1).map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Failed shutting down the VM after migration: {:?}", - e - )) - }) - } else { - Err(MigratableError::MigrateSend(anyhow!("VM is not running"))) - } + Ok(()) } } @@ -2333,6 +3510,8 @@ const DEVICE_MANAGER_SNAPSHOT_ID: &str = "device-manager"; #[cfg(test)] mod unit_tests { + use arch::CpuProfile; + use super::*; #[cfg(target_arch = "x86_64")] use crate::vm_config::DebugConsoleConfig; @@ -2366,6 +3545,7 @@ mod unit_tests { max_phys_bits: 46, affinity: None, features: CpuFeatures::default(), + profile: CpuProfile::default(), }, memory: MemoryConfig { size: 536_870_912, @@ -2389,6 +3569,8 @@ mod unit_tests { igvm: None, #[cfg(feature = "sev_snp")] host_data: None, + #[cfg(feature = "fw_cfg")] + fw_cfg_config: None, }), rate_limit_groups: None, disks: None, @@ -2396,6 +3578,7 @@ mod unit_tests { rng: RngConfig { src: PathBuf::from("/dev/urandom"), iommu: false, + bdf_device: None, }, balloon: None, fs: None, @@ -2405,12 +3588,17 @@ mod unit_tests { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, }, console: ConsoleConfig { file: None, - mode: ConsoleOutputMode::Tty, + // Caution: Don't use `Tty` to not mess with users terminal + mode: ConsoleOutputMode::Off, iommu: false, socket: None, + url: None, + bdf_device: None, }, #[cfg(target_arch = "x86_64")] debug_console: DebugConsoleConfig::default(), @@ -2422,8 +3610,6 @@ mod unit_tests { pvmemcontrol: None, pvpanic: false, iommu: false, - #[cfg(target_arch = "x86_64")] - sgx_epc: None, numa: None, watchdog: false, #[cfg(feature = "guest_debug")] @@ -2434,6 +3620,8 @@ mod unit_tests { preserved_fds: None, landlock_enable: false, landlock_rules: None, + #[cfg(feature = "ivshmem")] + ivshmem: None, }) } @@ -2460,14 +3648,15 @@ mod 
unit_tests { )); let _ = vmm.vm_create(create_dummy_vm_config()); - assert!(vmm - .vm_config - .as_ref() - .unwrap() - .lock() - .unwrap() - .devices - .is_none()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .devices + .is_none() + ); assert!(vmm.vm_add_device(device_config.clone()).unwrap().is_none()); assert_eq!( @@ -2507,19 +3696,21 @@ mod unit_tests { )); let _ = vmm.vm_create(create_dummy_vm_config()); - assert!(vmm - .vm_config - .as_ref() - .unwrap() - .lock() - .unwrap() - .user_devices - .is_none()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .user_devices + .is_none() + ); - assert!(vmm - .vm_add_user_device(user_device_config.clone()) - .unwrap() - .is_none()); + assert!( + vmm.vm_add_user_device(user_device_config.clone()) + .unwrap() + .is_none() + ); assert_eq!( vmm.vm_config .as_ref() @@ -2556,14 +3747,15 @@ mod unit_tests { )); let _ = vmm.vm_create(create_dummy_vm_config()); - assert!(vmm - .vm_config - .as_ref() - .unwrap() - .lock() - .unwrap() - .disks - .is_none()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .disks + .is_none() + ); assert!(vmm.vm_add_disk(disk_config.clone()).unwrap().is_none()); assert_eq!( @@ -2641,14 +3833,15 @@ mod unit_tests { )); let _ = vmm.vm_create(create_dummy_vm_config()); - assert!(vmm - .vm_config - .as_ref() - .unwrap() - .lock() - .unwrap() - .pmem - .is_none()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .pmem + .is_none() + ); assert!(vmm.vm_add_pmem(pmem_config.clone()).unwrap().is_none()); assert_eq!( @@ -2690,14 +3883,15 @@ mod unit_tests { )); let _ = vmm.vm_create(create_dummy_vm_config()); - assert!(vmm - .vm_config - .as_ref() - .unwrap() - .lock() - .unwrap() - .net - .is_none()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .net + .is_none() + ); assert!(vmm.vm_add_net(net_config.clone()).unwrap().is_none()); assert_eq!( @@ -2736,14 +3930,15 @@ mod unit_tests { )); let _ = vmm.vm_create(create_dummy_vm_config()); - assert!(vmm - .vm_config - .as_ref() - .unwrap() - .lock() - .unwrap() - .vdpa - .is_none()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .vdpa + .is_none() + ); assert!(vmm.vm_add_vdpa(vdpa_config.clone()).unwrap().is_none()); assert_eq!( @@ -2782,14 +3977,15 @@ mod unit_tests { )); let _ = vmm.vm_create(create_dummy_vm_config()); - assert!(vmm - .vm_config - .as_ref() - .unwrap() - .lock() - .unwrap() - .vsock - .is_none()); + assert!( + vmm.vm_config + .as_ref() + .unwrap() + .lock() + .unwrap() + .vsock + .is_none() + ); assert!(vmm.vm_add_vsock(vsock_config.clone()).unwrap().is_none()); assert_eq!( diff --git a/vmm/src/memory_manager.rs b/vmm/src/memory_manager.rs index a5ab297182..fbf3e2f5f9 100644 --- a/vmm/src/memory_manager.rs +++ b/vmm/src/memory_manager.rs @@ -17,18 +17,14 @@ use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Barrier, Mutex}; use std::{ffi, result, thread}; -use acpi_tables::{aml, Aml}; +use acpi_tables::{Aml, aml}; use anyhow::anyhow; -#[cfg(target_arch = "x86_64")] -use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; use arch::RegionType; #[cfg(target_arch = "x86_64")] use devices::ioapic; #[cfg(target_arch = "aarch64")] use hypervisor::HypervisorVmError; use libc::_SC_NPROCESSORS_ONLN; -#[cfg(target_arch = "x86_64")] -use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tracer::trace_scoped; @@ -42,7 
+38,7 @@ use vm_memory::guest_memory::FileOffset; use vm_memory::mmap::MmapRegionError; use vm_memory::{ Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, - GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile, + GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, }; use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; use vm_migration::{ @@ -54,8 +50,6 @@ use crate::coredump::{ CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, }; use crate::migration::url_to_path; -#[cfg(target_arch = "x86_64")] -use crate::vm_config::SgxEpcConfig; use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID}; @@ -68,9 +62,6 @@ const SNAPSHOT_FILENAME: &str = "memory-ranges"; #[cfg(target_arch = "x86_64")] const X86_64_IRQ_BASE: u32 = 5; -#[cfg(target_arch = "x86_64")] -const SGX_PAGE_SIZE: u64 = 1 << 12; - const HOTPLUG_COUNT: usize = 8; // Memory policy constants @@ -183,8 +174,6 @@ pub struct MemoryManager { hugepage_size: Option, prefault: bool, thp: bool, - #[cfg(target_arch = "x86_64")] - sgx_epc_region: Option, user_provided_zones: bool, snapshot_memory_ranges: MemoryRangeTable, memory_zones: MemoryZones, @@ -199,7 +188,7 @@ pub struct MemoryManager { guest_ram_mappings: Vec, pub acpi_address: Option, - #[cfg(target_arch = "aarch64")] + #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] uefi_flash: Option>, } @@ -269,36 +258,6 @@ pub enum Error { #[error("Cannot create the system allocator")] CreateSystemAllocator, - /// Invalid SGX EPC section size - #[cfg(target_arch = "x86_64")] - #[error("Invalid SGX EPC section size")] - EpcSectionSizeInvalid, - - /// Failed allocating SGX EPC region - #[cfg(target_arch = "x86_64")] - #[error("Failed allocating SGX EPC region")] - SgxEpcRangeAllocation, - - /// Failed opening SGX virtual EPC device - #[cfg(target_arch = "x86_64")] - #[error("Failed opening SGX virtual EPC device")] - SgxVirtEpcOpen(#[source] io::Error), - - /// Failed setting the SGX virtual EPC section size - #[cfg(target_arch = "x86_64")] - #[error("Failed setting the SGX virtual EPC section size")] - SgxVirtEpcFileSetLen(#[source] io::Error), - - /// Failed opening SGX provisioning device - #[cfg(target_arch = "x86_64")] - #[error("Failed opening SGX provisioning device")] - SgxProvisionOpen(#[source] io::Error), - - /// Failed enabling SGX provisioning - #[cfg(target_arch = "x86_64")] - #[error("Failed enabling SGX provisioning")] - SgxEnableProvisioning(#[source] hypervisor::HypervisorVmError), - /// Failed creating a new MmapRegion instance. #[cfg(target_arch = "x86_64")] #[error("Failed creating a new MmapRegion instance")] @@ -319,7 +278,9 @@ pub enum Error { /// It's invalid to try applying a NUMA policy to a memory zone that is /// memory mapped with MAP_SHARED. - #[error("Invalid to try applying a NUMA policy to a memory zone that is memory mapped with MAP_SHARED")] + #[error( + "Invalid to try applying a NUMA policy to a memory zone that is memory mapped with MAP_SHARED" + )] InvalidSharedMemoryZoneWithHostNuma, /// Failed applying NUMA memory policy. 
@@ -1033,8 +994,7 @@ impl MemoryManager { phys_bits: u8, #[cfg(feature = "tdx")] tdx_enabled: bool, restore_data: Option<&MemoryManagerSnapshotData>, - existing_memory_files: Option>, - #[cfg(target_arch = "x86_64")] sgx_epc_config: Option>, + existing_memory_files: HashMap, ) -> Result>, Error> { trace_scoped!("MemoryManager::new"); @@ -1069,7 +1029,7 @@ impl MemoryManager { &data.guest_ram_mappings, &zones, prefault, - existing_memory_files.unwrap_or_default(), + existing_memory_files, config.thp, )?; let guest_memory = @@ -1236,8 +1196,7 @@ impl MemoryManager { None }; - // If running on SGX the start of device area and RAM area may diverge but - // at this point they are next to each other. + // The start of device area and RAM area are placed next to each other. let end_of_ram_area = start_of_device_area.unchecked_sub(1); let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); @@ -1263,8 +1222,6 @@ impl MemoryManager { hugepages: config.hugepages, hugepage_size: config.hugepage_size, prefault: config.prefault, - #[cfg(target_arch = "x86_64")] - sgx_epc_region: None, user_provided_zones, snapshot_memory_ranges: MemoryRangeTable::default(), memory_zones, @@ -1274,16 +1231,11 @@ impl MemoryManager { arch_mem_regions, ram_allocator, dynamic, - #[cfg(target_arch = "aarch64")] + #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] uefi_flash: None, thp: config.thp, }; - #[cfg(target_arch = "x86_64")] - if let Some(sgx_epc_config) = sgx_epc_config { - memory_manager.setup_sgx(sgx_epc_config)?; - } - Ok(Arc::new(Mutex::new(memory_manager))) } @@ -1310,9 +1262,7 @@ impl MemoryManager { #[cfg(feature = "tdx")] false, Some(&mem_snapshot), - None, - #[cfg(target_arch = "x86_64")] - None, + Default::default(), )?; mm.lock() @@ -1700,7 +1650,7 @@ impl MemoryManager { } // "Inserted" DIMM must have a size that is a multiple of 128MiB - if size % (128 << 20) != 0 { + if !size.is_multiple_of(128 << 20) { return Err(Error::InvalidSize); } @@ -1976,121 +1926,6 @@ impl MemoryManager { self.virtio_mem_resize(id, virtio_mem_size) } - #[cfg(target_arch = "x86_64")] - pub fn setup_sgx(&mut self, sgx_epc_config: Vec) -> Result<(), Error> { - let file = OpenOptions::new() - .read(true) - .open("/dev/sgx_provision") - .map_err(Error::SgxProvisionOpen)?; - self.vm - .enable_sgx_attribute(file) - .map_err(Error::SgxEnableProvisioning)?; - - // Go over each EPC section and verify its size is a 4k multiple. At - // the same time, calculate the total size needed for the contiguous - // EPC region. - let mut epc_region_size = 0; - for epc_section in sgx_epc_config.iter() { - if epc_section.size == 0 { - return Err(Error::EpcSectionSizeInvalid); - } - if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { - return Err(Error::EpcSectionSizeInvalid); - } - - epc_region_size += epc_section.size; - } - - // Place the SGX EPC region on a 4k boundary between the RAM and the device area - let epc_region_start = - GuestAddress(self.start_of_device_area.0.div_ceil(SGX_PAGE_SIZE) * SGX_PAGE_SIZE); - - self.start_of_device_area = epc_region_start - .checked_add(epc_region_size) - .ok_or(Error::GuestAddressOverFlow)?; - - let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); - info!( - "SGX EPC region: 0x{:x} (0x{:x})", - epc_region_start.0, epc_region_size - ); - - // Each section can be memory mapped into the allocated region. 
- let mut epc_section_start = epc_region_start.raw_value(); - for epc_section in sgx_epc_config.iter() { - let file = OpenOptions::new() - .read(true) - .write(true) - .open("/dev/sgx_vepc") - .map_err(Error::SgxVirtEpcOpen)?; - - let prot = PROT_READ | PROT_WRITE; - let mut flags = MAP_NORESERVE | MAP_SHARED; - if epc_section.prefault { - flags |= MAP_POPULATE; - } - - // We can't use the vm-memory crate to perform the memory mapping - // here as it would try to ensure the size of the backing file is - // matching the size of the expected mapping. The /dev/sgx_vepc - // device does not work that way, it provides a file descriptor - // which is not matching the mapping size, as it's a just a way to - // let KVM know that an EPC section is being created for the guest. - // SAFETY: FFI call with correct arguments - let host_addr = unsafe { - libc::mmap( - std::ptr::null_mut(), - epc_section.size as usize, - prot, - flags, - file.as_raw_fd(), - 0, - ) - }; - - if host_addr == libc::MAP_FAILED { - error!( - "Could not add SGX EPC section (size 0x{:x})", - epc_section.size - ); - return Err(Error::SgxEpcRangeAllocation); - } - - info!( - "Adding SGX EPC section: 0x{:x} (0x{:x})", - epc_section_start, epc_section.size - ); - - let _mem_slot = self.create_userspace_mapping( - epc_section_start, - epc_section.size, - host_addr as u64, - false, - false, - false, - )?; - - sgx_epc_region.insert( - epc_section.id.clone(), - SgxEpcSection::new( - GuestAddress(epc_section_start), - epc_section.size as GuestUsize, - ), - ); - - epc_section_start += epc_section.size; - } - - self.sgx_epc_region = Some(sgx_epc_region); - - Ok(()) - } - - #[cfg(target_arch = "x86_64")] - pub fn sgx_epc_region(&self) -> &Option { - &self.sgx_epc_region - } - pub fn is_hardlink(f: &File) -> bool { let mut stat = std::mem::MaybeUninit::::uninit(); // SAFETY: FFI call with correct arguments @@ -2124,23 +1959,21 @@ impl MemoryManager { } for region in memory_zone.regions() { - if snapshot { - if let Some(file_offset) = region.file_offset() { - if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) - && Self::is_hardlink(file_offset.file()) - { - // In this very specific case, we know the memory - // region is backed by a file on the host filesystem - // that can be accessed by the user, and additionally - // the mapping is shared, which means that modifications - // to the content are written to the actual file. - // When meeting these conditions, we can skip the - // copy of the memory content for this specific region, - // as we can assume the user will have it saved through - // the backing file already. - continue; - } - } + if snapshot + && let Some(file_offset) = region.file_offset() + && (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) + && Self::is_hardlink(file_offset.file()) + { + // In this very specific case, we know the memory + // region is backed by a file on the host filesystem + // that can be accessed by the user, and additionally + // the mapping is shared, which means that modifications + // to the content are written to the actual file. + // When meeting these conditions, we can skip the + // copy of the memory content for this specific region, + // as we can assume the user will have it saved through + // the backing file already. 
+ continue; } table.push(MemoryRange { @@ -2192,7 +2025,7 @@ impl MemoryManager { self.guest_ram_mappings.len() as u32 } - #[cfg(target_arch = "aarch64")] + #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] pub fn uefi_flash(&self) -> GuestMemoryAtomic { self.uefi_flash.as_ref().unwrap().clone() } @@ -2258,48 +2091,6 @@ impl MemoryManager { debug!("coredump total bytes {}", total_bytes); Ok(()) } - - pub fn receive_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: ReadVolatile, - { - let guest_memory = self.guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of read_exact_from() as it is not - // following the correct behavior. For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_read = mem - .read_volatile_from( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateReceive(anyhow!( - "Error receiving memory from socket: {}", - e - )) - })?; - offset += bytes_read as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) - } } struct MemoryNotify { @@ -2642,34 +2433,6 @@ impl Aml for MemoryManager { ) .to_aml_bytes(sink); } - - #[cfg(target_arch = "x86_64")] - { - if let Some(sgx_epc_region) = &self.sgx_epc_region { - let min = sgx_epc_region.start().raw_value(); - let max = min + sgx_epc_region.size() - 1; - // SGX EPC region - aml::Device::new( - "_SB_.EPC_".into(), - vec![ - &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")), - // QWORD describing the EPC region start and size - &aml::Name::new( - "_CRS".into(), - &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( - aml::AddressSpaceCacheable::NotCacheable, - true, - min, - max, - None, - )]), - ), - &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]), - ], - ) - .to_aml_bytes(sink); - } - } } } @@ -2806,24 +2569,23 @@ impl Migratable for MemoryManager { return Err(MigratableError::MigrateSend(anyhow!( "Error finding 'guest memory region' with address {:x}", r.gpa - ))) + ))); } }; - let dirty_bitmap: Vec = vm_dirty_bitmap + let dirty_bitmap = vm_dirty_bitmap .iter() .zip(vmm_dirty_bitmap.iter()) - .map(|(x, y)| x | y) - .collect(); + .map(|(x, y)| x | y); let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096); if sub_table.regions().is_empty() { - info!("Dirty Memory Range Table is empty"); + debug!("Dirty Memory Range Table is empty"); } else { - info!("Dirty Memory Range Table:"); + debug!("Dirty Memory Range Table:"); for range in sub_table.regions() { - info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); + trace!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); } } diff --git a/vmm/src/migration.rs b/vmm/src/migration.rs index 2752e82e5b..d93b028055 100644 --- a/vmm/src/migration.rs +++ b/vmm/src/migration.rs @@ -27,7 +27,7 @@ pub fn url_to_path(url: &str) -> std::result::Result { if !path.is_dir() { return Err(MigratableError::MigrateSend(anyhow!( - "Destination is not a directory" + "Destination is not a directory: {path:?}" ))); } diff --git a/vmm/src/pci_segment.rs b/vmm/src/pci_segment.rs index 010859e05f..289ae853c7 100644 --- a/vmm/src/pci_segment.rs +++ 
b/vmm/src/pci_segment.rs @@ -11,11 +11,11 @@ use std::sync::{Arc, Mutex}; -use acpi_tables::{aml, Aml}; +use acpi_tables::{Aml, aml}; use arch::layout; use pci::{DeviceRelocation, PciBdf, PciBus, PciConfigMmio, PciRoot}; #[cfg(target_arch = "x86_64")] -use pci::{PciConfigIo, PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE}; +use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE, PciConfigIo}; use uuid::Uuid; use vm_allocator::AddressAllocator; use vm_device::BusDeviceSync; @@ -105,7 +105,12 @@ impl PciSegment { info!( "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}, mem64 area [0x{:x}-0x{:x}", - segment.id, segment.mmio_config_address, segment.start_of_mem32_area, segment.end_of_mem32_area, segment.start_of_mem64_area, segment.end_of_mem64_area + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area ); Ok(segment) } @@ -158,15 +163,22 @@ impl PciSegment { ) } - pub(crate) fn next_device_bdf(&self) -> DeviceManagerResult { + /// Allocates a device BDF on this PCI segment. + /// + /// - `device_id`: Device ID to request for BDF allocation + /// + /// ## Errors + /// * [`DeviceManagerError::AllocatePciDeviceId`] if device ID + /// allocation on the bus fails. + pub(crate) fn allocate_device_bdf(&self, device_id: Option) -> DeviceManagerResult { Ok(PciBdf::new( self.id, 0, self.pci_bus .lock() .unwrap() - .next_device_id() - .map_err(DeviceManagerError::NextPciDeviceId)? as u8, + .allocate_device_id(device_id) + .map_err(DeviceManagerError::AllocatePciDeviceId)? as u8, 0, )) } @@ -196,6 +208,65 @@ impl PciSegment { Ok(()) } + + #[cfg(test)] + /// Creates a PciSegment without the need for an [`AddressManager`] + /// for testing purpose. + /// + /// An [`AddressManager`] would otherwise be required to create + /// [`PciBus`] instances. Instead, we use any struct that implements + /// [`DeviceRelocation`] to instantiate a [`PciBus`]. 
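+    ///
+    /// A minimal usage sketch (illustrative only, mirroring the unit tests
+    /// below; the two allocators and the `DeviceRelocation` stub are assumed
+    /// to be constructed by the caller):
+    ///
+    /// ```ignore
+    /// let segment = PciSegment::new_without_address_manager(
+    ///     0,          // segment id
+    ///     0,          // NUMA node
+    ///     mem32_allocator,
+    ///     mem64_allocator,
+    ///     &[0u8; 32], // PCI IRQ slots
+    ///     device_reloc,
+    /// )
+    /// .unwrap();
+    ///
+    /// // Device 0 is taken by the root, so the first free BDF is 00:01.0.
+    /// let bdf = segment.allocate_device_bdf(None).unwrap();
+    /// assert_eq!(bdf.device(), 1);
+    /// ```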
+ pub(crate) fn new_without_address_manager( + id: u16, + numa_node: u32, + mem32_allocator: Arc>, + mem64_allocator: Arc>, + pci_irq_slots: &[u8; 32], + device_reloc: Arc, + ) -> DeviceManagerResult { + let pci_root = PciRoot::new(None); + let pci_bus = Arc::new(Mutex::new(PciBus::new(pci_root, device_reloc.clone()))); + + let pci_config_mmio = Arc::new(Mutex::new(PciConfigMmio::new(Arc::clone(&pci_bus)))); + let mmio_config_address = + layout::PCI_MMCONFIG_START.0 + layout::PCI_MMIO_CONFIG_SIZE_PER_SEGMENT * id as u64; + + let start_of_mem32_area = mem32_allocator.lock().unwrap().base().0; + let end_of_mem32_area = mem32_allocator.lock().unwrap().end().0; + + let start_of_mem64_area = mem64_allocator.lock().unwrap().base().0; + let end_of_mem64_area = mem64_allocator.lock().unwrap().end().0; + + let segment = PciSegment { + id, + pci_bus, + pci_config_mmio, + mmio_config_address, + proximity_domain: numa_node, + pci_devices_up: 0, + pci_devices_down: 0, + #[cfg(target_arch = "x86_64")] + pci_config_io: None, + mem32_allocator, + mem64_allocator, + start_of_mem32_area, + end_of_mem32_area, + start_of_mem64_area, + end_of_mem64_area, + pci_irq_slots: *pci_irq_slots, + }; + + info!( + "Adding PCI segment: id={}, PCI MMIO config address: 0x{:x}, mem32 area [0x{:x}-0x{:x}, mem64 area [0x{:x}-0x{:x}", + segment.id, + segment.mmio_config_address, + segment.start_of_mem32_area, + segment.end_of_mem32_area, + segment.start_of_mem64_area, + segment.end_of_mem64_area + ); + Ok(segment) + } } struct PciDevSlot { @@ -468,3 +539,96 @@ impl Aml for PciSegment { .to_aml_bytes(sink) } } + +#[cfg(test)] +mod unit_tests { + use std::result::Result; + + use vm_memory::GuestAddress; + + use super::*; + + #[derive(Debug)] + struct MocRelocDevice; + impl DeviceRelocation for MocRelocDevice { + fn move_bar( + &self, + _old_base: u64, + _new_base: u64, + _len: u64, + _pci_dev: &mut dyn pci::PciDevice, + _region_type: pci::PciBarRegionType, + ) -> Result<(), std::io::Error> { + Ok(()) + } + } + + fn setup() -> PciSegment { + let guest_addr = 0_u64; + let guest_size = 0x1000_usize; + let allocator_1 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let allocator_2 = Arc::new(Mutex::new( + AddressAllocator::new(GuestAddress(guest_addr), guest_size as u64).unwrap(), + )); + let moc_device_reloc = Arc::new(MocRelocDevice {}); + let arr = [0_u8; 32]; + + PciSegment::new_without_address_manager( + 0, + 0, + allocator_1, + allocator_2, + &arr, + moc_device_reloc, + ) + .unwrap() + } + + #[test] + // Test the default bdf for a segment with an empty bus (except for the root device) + fn allocate_device_bdf_default() { + // The first address is occupied by the root + let segment = setup(); + let bdf = segment.allocate_device_bdf(None).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), 1); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a bdf with s specific device ID + fn allocate_device_bdf_fixed_device_id() { + // The first address is occupied by the root + let expect_device_id = 0x10_u8; + let segment = setup(); + let bdf = segment.allocate_device_bdf(Some(expect_device_id)).unwrap(); + assert_eq!(bdf.segment(), segment.id); + assert_eq!(bdf.bus(), 0); + assert_eq!(bdf.device(), expect_device_id); + assert_eq!(bdf.function(), 0); + } + + #[test] + // Test to acquire a bdf with invalid device id, one already + // taken and the other being greater then the number of allowed + // devices 
per bus. + fn allocate_device_bdf_invalid_device_id() { + // The first address is occupied by the root + let already_taken_device_id = 0x0_u8; + let overflow_device_id = 0xff_u8; + let segment = setup(); + let bdf_res = segment.allocate_device_bdf(Some(already_taken_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(_)) + )); + let bdf_res = segment.allocate_device_bdf(Some(overflow_device_id)); + assert!(matches!( + bdf_res, + Err(DeviceManagerError::AllocatePciDeviceId(_)) + )); + } +} diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 40748f0d0b..44ba48d293 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -5,11 +5,25 @@ // SPDX-License-Identifier: Apache-2.0 use hypervisor::HypervisorType; +use libc::{ + BLKIOMIN, BLKIOOPT, BLKPBSZGET, BLKSSZGET, FIOCLEX, FIONBIO, SIOCGIFFLAGS, SIOCGIFHWADDR, + SIOCGIFINDEX, SIOCGIFMTU, SIOCSIFADDR, SIOCSIFFLAGS, SIOCSIFHWADDR, SIOCSIFMTU, SIOCSIFNETMASK, + TCGETS, TCGETS2, TCSETS, TCSETS2, TIOCGPGRP, TIOCGPTPEER, TIOCGWINSZ, TIOCSCTTY, TIOCSPGRP, + TIOCSPTLCK, TUNGETFEATURES, TUNGETIFF, TUNSETIFF, TUNSETOFFLOAD, TUNSETVNETHDRSZ, +}; use seccompiler::SeccompCmpOp::Eq; use seccompiler::{ BackendError, BpfProgram, Error, SeccompAction, SeccompCmpArgLen as ArgLen, SeccompCondition as Cond, SeccompFilter, SeccompRule, }; +use vhost::vhost_kern::vhost_binding::{ + VHOST_GET_BACKEND_FEATURES, VHOST_GET_FEATURES, VHOST_SET_BACKEND_FEATURES, VHOST_SET_FEATURES, + VHOST_SET_OWNER, VHOST_SET_VRING_ADDR, VHOST_SET_VRING_BASE, VHOST_SET_VRING_CALL, + VHOST_SET_VRING_KICK, VHOST_SET_VRING_NUM, VHOST_VDPA_GET_CONFIG, VHOST_VDPA_GET_CONFIG_SIZE, + VHOST_VDPA_GET_DEVICE_ID, VHOST_VDPA_GET_IOVA_RANGE, VHOST_VDPA_GET_STATUS, + VHOST_VDPA_GET_VRING_NUM, VHOST_VDPA_SET_CONFIG, VHOST_VDPA_SET_CONFIG_CALL, + VHOST_VDPA_SET_STATUS, VHOST_VDPA_SET_VRING_ENABLE, VHOST_VDPA_SUSPEND, +}; pub enum Thread { HttpApi, @@ -40,42 +54,6 @@ macro_rules! or { ($($x:expr),*) => (vec![$($x),*]) } -// See include/uapi/asm-generic/ioctls.h in the kernel code. -const TCGETS: u64 = 0x5401; -const TCSETS: u64 = 0x5402; -const TIOCSCTTY: u64 = 0x540E; -const TIOCGPGRP: u64 = 0x540F; -const TIOCSPGRP: u64 = 0x5410; -const TIOCGWINSZ: u64 = 0x5413; -const TIOCSPTLCK: u64 = 0x4004_5431; -const TIOCGPTPEER: u64 = 0x5441; -const FIOCLEX: u64 = 0x5451; -const FIONBIO: u64 = 0x5421; - -// See include/uapi/linux/fs.h in the kernel code. -const BLKSSZGET: u64 = 0x1268; -const BLKPBSZGET: u64 = 0x127b; -const BLKIOMIN: u64 = 0x1278; -const BLKIOOPT: u64 = 0x1279; - -// See include/uapi/linux/if_tun.h in the kernel code. -const TUNGETIFF: u64 = 0x8004_54d2; -const TUNSETIFF: u64 = 0x4004_54ca; -const TUNSETOFFLOAD: u64 = 0x4004_54d0; -const TUNSETVNETHDRSZ: u64 = 0x4004_54d8; -const TUNGETFEATURES: u64 = 0x8004_54cf; - -// See include/uapi/linux/sockios.h in the kernel code. -const SIOCGIFFLAGS: u64 = 0x8913; -const SIOCSIFFLAGS: u64 = 0x8914; -const SIOCSIFADDR: u64 = 0x8916; -const SIOCSIFNETMASK: u64 = 0x891c; -const SIOCGIFMTU: u64 = 0x8921; -const SIOCSIFMTU: u64 = 0x8922; -const SIOCSIFHWADDR: u64 = 0x8924; -const SIOCGIFHWADDR: u64 = 0x8927; -const SIOCGIFINDEX: u64 = 0x8933; - // See include/uapi/linux/vfio.h in the kernel code. 
const VFIO_GET_API_VERSION: u64 = 0x3b64; const VFIO_CHECK_EXTENSION: u64 = 0x3b65; @@ -93,29 +71,6 @@ const VFIO_IOMMU_MAP_DMA: u64 = 0x3b71; const VFIO_IOMMU_UNMAP_DMA: u64 = 0x3b72; const VFIO_DEVICE_IOEVENTFD: u64 = 0x3b74; -// See include/uapi/linux/vhost.h in the kernel code -const VHOST_GET_FEATURES: u64 = 0x8008af00; -const VHOST_SET_FEATURES: u64 = 0x4008af00; -const VHOST_SET_OWNER: u64 = 0xaf01; -const VHOST_SET_VRING_NUM: u64 = 0x4008af10; -const VHOST_SET_VRING_ADDR: u64 = 0x4028af11; -const VHOST_SET_VRING_BASE: u64 = 0x4008af12; -const VHOST_SET_VRING_KICK: u64 = 0x4008af20; -const VHOST_SET_VRING_CALL: u64 = 0x4008af21; -const VHOST_SET_BACKEND_FEATURES: u64 = 0x4008af25; -const VHOST_GET_BACKEND_FEATURES: u64 = 0x8008af26; -const VHOST_VDPA_GET_DEVICE_ID: u64 = 0x8004af70; -const VHOST_VDPA_GET_STATUS: u64 = 0x8001af71; -const VHOST_VDPA_SET_STATUS: u64 = 0x4001af72; -const VHOST_VDPA_GET_CONFIG: u64 = 0x8008af73; -const VHOST_VDPA_SET_CONFIG: u64 = 0x4008af74; -const VHOST_VDPA_SET_VRING_ENABLE: u64 = 0x4008af75; -const VHOST_VDPA_GET_VRING_NUM: u64 = 0x8002af76; -const VHOST_VDPA_SET_CONFIG_CALL: u64 = 0x4004af77; -const VHOST_VDPA_GET_IOVA_RANGE: u64 = 0x8010af78; -const VHOST_VDPA_GET_CONFIG_SIZE: u64 = 0x8004af79; -const VHOST_VDPA_SUSPEND: u64 = 0xaf7d; - // See include/uapi/linux/kvm.h in the kernel code. #[cfg(feature = "kvm")] mod kvm { @@ -148,6 +103,8 @@ mod kvm { pub const KVM_GET_REG_LIST: u64 = 0xc008_aeb0; pub const KVM_MEMORY_ENCRYPT_OP: u64 = 0xc008_aeba; pub const KVM_NMI: u64 = 0xae9a; + pub const KVM_GET_NESTED_STATE: u64 = 3229658814; + pub const KVM_SET_NESTED_STATE: u64 = 1082175167; } // MSHV IOCTL code. This is unstable until the kernel code has been declared stable. @@ -277,6 +234,8 @@ fn create_vmm_ioctl_seccomp_rule_common_kvm() -> Result, Backen and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_VCPU_EVENTS,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], ]) } @@ -295,12 +254,12 @@ fn create_vmm_ioctl_seccomp_rule_common( hypervisor_type: HypervisorType, ) -> Result, BackendError> { let mut common_rules = or![ - and![Cond::new(1, ArgLen::Dword, Eq, BLKSSZGET)?], - and![Cond::new(1, ArgLen::Dword, Eq, BLKPBSZGET)?], - and![Cond::new(1, ArgLen::Dword, Eq, BLKIOMIN)?], - and![Cond::new(1, ArgLen::Dword, Eq, BLKIOOPT)?], - and![Cond::new(1, ArgLen::Dword, Eq, FIOCLEX)?], - and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO)?], + and![Cond::new(1, ArgLen::Dword, Eq, BLKSSZGET as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, BLKPBSZGET as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, BLKIOMIN as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, BLKIOOPT as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, FIOCLEX as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO as _)?], and![Cond::new(1, ArgLen::Dword, Eq, SIOCGIFFLAGS)?], and![Cond::new(1, ArgLen::Dword, Eq, SIOCGIFHWADDR)?], and![Cond::new(1, ArgLen::Dword, Eq, SIOCGIFMTU)?], @@ -310,19 +269,21 @@ fn create_vmm_ioctl_seccomp_rule_common( and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFHWADDR)?], and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFMTU)?], and![Cond::new(1, ArgLen::Dword, Eq, SIOCSIFNETMASK)?], - and![Cond::new(1, ArgLen::Dword, Eq, TCSETS)?], - and![Cond::new(1, ArgLen::Dword, Eq, TCGETS)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCGPGRP)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCGPTPEER)?], - 
and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPTLCK)?], - and![Cond::new(1, ArgLen::Dword, Eq, TUNGETFEATURES)?], - and![Cond::new(1, ArgLen::Dword, Eq, TUNGETIFF)?], - and![Cond::new(1, ArgLen::Dword, Eq, TUNSETIFF)?], - and![Cond::new(1, ArgLen::Dword, Eq, TUNSETOFFLOAD)?], - and![Cond::new(1, ArgLen::Dword, Eq, TUNSETVNETHDRSZ)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCSETS as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCSETS2 as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCGETS as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCGETS2 as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCGPGRP as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCGPTPEER as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPTLCK as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TUNGETFEATURES as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TUNGETIFF as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TUNSETIFF as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TUNSETOFFLOAD as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TUNSETVNETHDRSZ as _)?], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GET_API_VERSION)?], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_CHECK_EXTENSION)?], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_SET_IOMMU)?], @@ -343,32 +304,57 @@ fn create_vmm_ioctl_seccomp_rule_common( and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_MAP_DMA)?], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA)?], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_DEVICE_IOEVENTFD)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_GET_FEATURES)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_FEATURES)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_OWNER)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_NUM)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_ADDR)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_BASE)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_KICK)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_CALL)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_BACKEND_FEATURES)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_GET_BACKEND_FEATURES)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_DEVICE_ID)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_STATUS)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_STATUS)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG)?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_GET_FEATURES())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_FEATURES())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_OWNER())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_NUM())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_ADDR())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_BASE())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_KICK())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_SET_VRING_CALL())?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VHOST_SET_BACKEND_FEATURES() + )?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VHOST_GET_BACKEND_FEATURES() + )?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_DEVICE_ID())?], + and![Cond::new(1, 
ArgLen::Dword, Eq, VHOST_VDPA_GET_STATUS())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_STATUS())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG())?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VHOST_VDPA_SET_VRING_ENABLE(), + )?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_VRING_NUM())?], and![Cond::new( 1, ArgLen::Dword, Eq, - VHOST_VDPA_SET_VRING_ENABLE + VHOST_VDPA_SET_CONFIG_CALL() )?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_VRING_NUM)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG_CALL)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_IOVA_RANGE)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG_SIZE)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SUSPEND)?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VHOST_VDPA_GET_IOVA_RANGE() + )?], + and![Cond::new( + 1, + ArgLen::Dword, + Eq, + VHOST_VDPA_GET_CONFIG_SIZE() + )?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SUSPEND())?], ]; let hypervisor_rules = create_vmm_ioctl_seccomp_rule_hypervisor(hypervisor_type)?; @@ -390,7 +376,7 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> const KVM_GET_SREGS: u64 = 0x8138_ae83; const KVM_GET_TSC_KHZ: u64 = 0xaea3; const KVM_GET_XCRS: u64 = 0x8188_aea6; - const KVM_GET_XSAVE: u64 = 0x9000_aea4; + const KVM_GET_XSAVE2: u64 = 0x9000_aecf; const KVM_KVMCLOCK_CTRL: u64 = 0xaead; const KVM_SET_CLOCK: u64 = 0x4030_ae7b; const KVM_SET_CPUID2: u64 = 0x4008_ae90; @@ -418,7 +404,7 @@ fn create_vmm_ioctl_seccomp_rule_kvm() -> Result, BackendError> and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_SREGS)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_TSC_KHZ)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XCRS,)?], - and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XSAVE,)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_XSAVE2,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_KVMCLOCK_CTRL)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_CLOCK)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_CPUID2)?], @@ -481,14 +467,16 @@ fn create_vmm_ioctl_seccomp_rule( } fn create_api_ioctl_seccomp_rule() -> Result, BackendError> { - Ok(or![and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO)?]]) + Ok(or![and![Cond::new(1, ArgLen::Dword, Eq, FIONBIO as _)?]]) } fn create_signal_handler_ioctl_seccomp_rule() -> Result, BackendError> { Ok(or![ - and![Cond::new(1, ArgLen::Dword, Eq, TCGETS)?], - and![Cond::new(1, ArgLen::Dword, Eq, TCSETS)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCGETS as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCGETS2 as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCSETS as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TCSETS2 as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCGWINSZ as _)?], ]) } @@ -520,9 +508,9 @@ fn signal_handler_thread_rules() -> Result)>, Backend fn create_pty_foreground_ioctl_seccomp_rule() -> Result, BackendError> { Ok(or![ - and![Cond::new(1, ArgLen::Dword, Eq, TIOCGPGRP)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY)?], - and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCGPGRP as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCSCTTY as _)?], + and![Cond::new(1, ArgLen::Dword, Eq, TIOCSPGRP as _)?], ]) } @@ -562,6 +550,8 @@ fn vmm_thread_rules( (libc::SYS_accept4, vec![]), #[cfg(target_arch = "x86_64")] (libc::SYS_access, vec![]), + #[cfg(target_arch = "x86_64")] + (libc::SYS_arch_prctl, 
vec![]), (libc::SYS_bind, vec![]), (libc::SYS_brk, vec![]), (libc::SYS_clock_gettime, vec![]), @@ -713,6 +703,8 @@ fn create_vcpu_ioctl_seccomp_rule_kvm() -> Result, BackendError and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_USER_MEMORY_REGION,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_RUN,)?], and![Cond::new(1, ArgLen::Dword, Eq, KVM_NMI)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_GET_NESTED_STATE)?], + and![Cond::new(1, ArgLen::Dword, Eq, KVM_SET_NESTED_STATE)?], ]) } @@ -767,14 +759,14 @@ fn create_vcpu_ioctl_seccomp_rule( and![Cond::new(1, ArgLen::Dword, Eq, VFIO_GROUP_UNSET_CONTAINER)?], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_MAP_DMA)?], and![Cond::new(1, ArgLen::Dword, Eq, VFIO_IOMMU_UNMAP_DMA)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_STATUS)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG)?], - and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG)?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_STATUS())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_GET_CONFIG())?], + and![Cond::new(1, ArgLen::Dword, Eq, VHOST_VDPA_SET_CONFIG())?], and![Cond::new( 1, ArgLen::Dword, Eq, - VHOST_VDPA_SET_VRING_ENABLE + VHOST_VDPA_SET_VRING_ENABLE(), )?], ]; @@ -825,6 +817,7 @@ fn vcpu_thread_rules( (libc::SYS_rt_sigreturn, vec![]), (libc::SYS_sched_yield, vec![]), (libc::SYS_sendmsg, vec![]), + (libc::SYS_sendto, vec![]), (libc::SYS_shutdown, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_tgkill, vec![]), @@ -835,7 +828,6 @@ fn vcpu_thread_rules( (libc::SYS_unlinkat, vec![]), (libc::SYS_write, vec![]), (libc::SYS_writev, vec![]), - #[cfg(debug_assertions)] (libc::SYS_fcntl, vec![]), (libc::SYS_getcwd, vec![]), ]) @@ -870,10 +862,12 @@ fn http_api_thread_rules() -> Result)>, BackendError> (libc::SYS_recvfrom, vec![]), (libc::SYS_recvmsg, vec![]), (libc::SYS_sched_yield, vec![]), + (libc::SYS_sendto, vec![]), (libc::SYS_sigaltstack, vec![]), (libc::SYS_write, vec![]), (libc::SYS_rt_sigprocmask, vec![]), (libc::SYS_getcwd, vec![]), + (libc::SYS_clock_nanosleep, vec![]), ]) } diff --git a/vmm/src/serial_manager.rs b/vmm/src/serial_manager.rs index f05100b464..fa37ab665a 100644 --- a/vmm/src/serial_manager.rs +++ b/vmm/src/serial_manager.rs @@ -3,9 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 // +use std::any::TypeId; +use std::collections::HashMap; use std::fs::File; -use std::io::Read; -use std::net::Shutdown; +use std::io::{Read, Write}; +use std::net::{Shutdown, TcpStream}; use std::os::unix::io::{AsRawFd, FromRawFd, IntoRawFd}; use std::os::unix::net::UnixStream; use std::panic::AssertUnwindSafe; @@ -67,9 +69,9 @@ pub enum Error { #[error("Error accepting connection")] AcceptConnection(#[source] io::Error), - /// Cannot clone the UnixStream - #[error("Error cloning UnixStream")] - CloneUnixStream(#[source] io::Error), + /// Cannot clone the Stream + #[error("Error cloning Stream")] + CloneStream(#[source] io::Error), /// Cannot shutdown the connection #[error("Error shutting down a connection")] @@ -91,9 +93,10 @@ pub enum EpollDispatch { File = 0, Kill = 1, Socket = 2, + Tcp = 3, Unknown, } -const EPOLL_EVENTS_LEN: usize = 4; +const EPOLL_EVENTS_LEN: usize = 5; impl From for EpollDispatch { fn from(v: u64) -> Self { @@ -102,11 +105,64 @@ impl From for EpollDispatch { 0 => File, 1 => Kill, 2 => Socket, + 3 => Tcp, _ => Unknown, } } } +/// A thread-safe writer that fans out to multiple keyed writers. Allows for +/// bundling different kinds of writers for the serial device, e.g. writing to +/// a TCP socket and a file. 
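+///
+/// A minimal usage sketch (illustrative only; the writers shown stand in for
+/// the real TCP stream and file writers used by the serial manager):
+///
+/// ```ignore
+/// use std::io::Write;
+///
+/// let fanout = FanoutWriter::new();
+/// // Writers are keyed by their TypeId, so there is one writer per concrete type.
+/// fanout.add_writer(std::io::sink());
+/// fanout.add_writer(std::io::stderr());
+///
+/// // Bytes written to the fanout are forwarded to every registered writer.
+/// let mut out = fanout.clone();
+/// out.write_all(b"guest serial output").unwrap();
+/// ```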
+#[derive(Clone)] +pub struct FanoutWriter { + writers: Arc>>>, +} + +impl FanoutWriter { + pub fn new() -> Self { + FanoutWriter { + writers: Arc::new(Mutex::new(HashMap::new())), + } + } + + pub fn add_writer(&self, writer: W) { + let mut writers = self.writers.lock().unwrap(); + writers.insert(TypeId::of::(), Box::new(writer)); + } + + pub fn remove_writer(&self, id: TypeId) -> Option> { + let mut writers = self.writers.lock().unwrap(); + writers.remove(&id) + } +} + +impl Write for FanoutWriter { + fn write(&mut self, buf: &[u8]) -> io::Result { + let mut writers = self.writers.lock().unwrap(); + let mut result: io::Result = Ok(buf.len()); + + for (i, w) in writers.values_mut().enumerate() { + let r = w.write(buf); + if i == 0 { + result = r; + } else if let Err(e) = r { + return Err(e); + } + } + + result + } + + fn flush(&mut self) -> io::Result<()> { + let mut writers = self.writers.lock().unwrap(); + for w in writers.values_mut() { + w.flush()?; + } + Ok(()) + } +} + pub struct SerialManager { #[cfg(any(target_arch = "x86_64", target_arch = "riscv64"))] serial: Arc>, @@ -165,6 +221,7 @@ impl SerialManager { } fd.as_raw_fd() } + ConsoleOutput::Tcp(ref fd, _) => fd.as_raw_fd(), _ => return Ok(None), }; @@ -179,10 +236,14 @@ impl SerialManager { ) .map_err(Error::Epoll)?; - let epoll_fd_data = if let ConsoleOutput::Socket(_) = output { - EpollDispatch::Socket - } else { - EpollDispatch::File + let epoll_fd_data = match output { + ConsoleOutput::File(_) => EpollDispatch::File, + ConsoleOutput::Pty(_) => EpollDispatch::File, + ConsoleOutput::Tty(_) => EpollDispatch::File, + ConsoleOutput::Null => EpollDispatch::File, + ConsoleOutput::Off => EpollDispatch::File, + ConsoleOutput::Socket(_) => EpollDispatch::Socket, + ConsoleOutput::Tcp(_, _) => EpollDispatch::Tcp, }; epoll::ctl( @@ -259,6 +320,7 @@ impl SerialManager { let serial = self.serial.clone(); let pty_write_out = self.pty_write_out.clone(); let mut reader: Option = None; + let mut reader_tcp: Option = None; // In case of PTY, we want to be able to detect a connection on the // other end of the PTY. This is done by detecting there's no event @@ -272,6 +334,17 @@ impl SerialManager { .name("serial-manager".to_string()) .spawn(move || { std::panic::catch_unwind(AssertUnwindSafe(move || { + let write_distributor = FanoutWriter::new(); + + if let ConsoleOutput::Tcp(_, Some(f)) = &in_file { + write_distributor.add_writer(f.clone()); + serial + .as_ref() + .lock() + .unwrap() + .set_out(Some(Box::new(write_distributor.clone()))); + } + let mut events = [epoll::Event::new(epoll::Events::empty(), 0); EPOLL_EVENTS_LEN]; @@ -328,10 +401,9 @@ impl SerialManager { let (unix_stream, _) = listener.accept().map_err(Error::AcceptConnection)?; let writer = - unix_stream.try_clone().map_err(Error::CloneUnixStream)?; - reader = Some( - unix_stream.try_clone().map_err(Error::CloneUnixStream)?, - ); + unix_stream.try_clone().map_err(Error::CloneStream)?; + reader = + Some(unix_stream.try_clone().map_err(Error::CloneStream)?); epoll::ctl( epoll_fd, @@ -345,6 +417,41 @@ impl SerialManager { .map_err(Error::Epoll)?; serial.lock().unwrap().set_out(Some(Box::new(writer))); } + EpollDispatch::Tcp => { + // New connection request arrived. 
+ // Shutdown the previous connection, if any + if let Some(ref previous_reader) = reader_tcp { + previous_reader + .shutdown(Shutdown::Both) + .map_err(Error::AcceptConnection)?; + write_distributor.remove_writer(TypeId::of::()); + } + + let ConsoleOutput::Tcp(ref listener, _) = in_file else { + unreachable!(); + }; + + // Events on the listening socket will be connection requests. + // Accept them, create a reader and a writer. + let (tcp_stream, _) = + listener.accept().map_err(Error::AcceptConnection)?; + let writer = + tcp_stream.try_clone().map_err(Error::CloneStream)?; + reader_tcp = + Some(tcp_stream.try_clone().map_err(Error::CloneStream)?); + + epoll::ctl( + epoll_fd, + epoll::ControlOptions::EPOLL_CTL_ADD, + tcp_stream.into_raw_fd(), + epoll::Event::new( + epoll::Events::EPOLLIN, + EpollDispatch::File as u64, + ), + ) + .map_err(Error::Epoll)?; + write_distributor.add_writer(writer); + } EpollDispatch::File => { if event.events & libc::EPOLLIN as u32 != 0 { let mut input = [0u8; 64]; @@ -371,6 +478,27 @@ impl SerialManager { 0 } } + ConsoleOutput::Tcp(_, _) => { + if let Some(mut serial_reader) = reader_tcp.as_ref() + { + let count = serial_reader + .read(&mut input) + .map_err(Error::ReadInput)?; + if count == 0 { + info!("Remote end closed serial socket"); + serial_reader + .shutdown(Shutdown::Both) + .map_err(Error::ShutdownConnection)?; + reader_tcp = None; + write_distributor.remove_writer( + TypeId::of::(), + ); + } + count + } else { + 0 + } + } ConsoleOutput::Pty(file) | ConsoleOutput::Tty(file) => { (&**file) .read(&mut input) @@ -432,12 +560,12 @@ impl Drop for SerialManager { if let Some(handle) = self.handle.take() { handle.join().ok(); } - if let ConsoleOutput::Socket(_) = self.in_file { - if let Some(socket_path) = self.socket_path.as_ref() { - std::fs::remove_file(socket_path.as_os_str()) - .map_err(Error::RemoveUnixSocket) - .ok(); - } + if let ConsoleOutput::Socket(_) = self.in_file + && let Some(socket_path) = self.socket_path.as_ref() + { + std::fs::remove_file(socket_path.as_os_str()) + .map_err(Error::RemoveUnixSocket) + .ok(); } } } diff --git a/vmm/src/sigwinch_listener.rs b/vmm/src/sigwinch_listener.rs index 4d7aebdca3..b50e93c986 100644 --- a/vmm/src/sigwinch_listener.rs +++ b/vmm/src/sigwinch_listener.rs @@ -3,10 +3,10 @@ use std::cell::RefCell; use std::collections::BTreeSet; -use std::fs::{read_dir, File}; +use std::fs::{File, read_dir}; use std::io::{self, ErrorKind, Read, Write}; use std::iter::once; -use std::mem::{size_of, MaybeUninit}; +use std::mem::{MaybeUninit, size_of}; use std::os::unix::prelude::*; use std::process::exit; use std::ptr::null_mut; @@ -14,15 +14,16 @@ use std::ptr::null_mut; use arch::_NSIG; use hypervisor::HypervisorType; use libc::{ - c_int, c_void, close, fork, getpgrp, ioctl, pipe2, poll, pollfd, setsid, sigemptyset, - siginfo_t, signal, sigprocmask, syscall, tcgetpgrp, tcsetpgrp, SYS_close_range, EINVAL, ENOSYS, - ENOTTY, O_CLOEXEC, POLLERR, SIGCHLD, SIGWINCH, SIG_DFL, SIG_SETMASK, STDERR_FILENO, TIOCSCTTY, + EINVAL, ENOSYS, ENOTTY, O_CLOEXEC, POLLERR, SIG_DFL, SIG_SETMASK, SIGCHLD, SIGWINCH, + STDERR_FILENO, SYS_close_range, TIOCSCTTY, c_int, c_void, close, fork, getpgrp, ioctl, pipe2, + poll, pollfd, setsid, sigemptyset, siginfo_t, signal, sigprocmask, syscall, tcgetpgrp, + tcsetpgrp, }; -use seccompiler::{apply_filter, BpfProgram, SeccompAction}; +use seccompiler::{BpfProgram, SeccompAction, apply_filter}; use vmm_sys_util::signal::register_signal_handler; -use crate::clone3::{clone3, clone_args, CLONE_CLEAR_SIGHAND}; 
-use crate::seccomp_filters::{get_seccomp_filter, Thread}; +use crate::clone3::{CLONE_CLEAR_SIGHAND, clone_args, clone3}; +use crate::seccomp_filters::{Thread, get_seccomp_filter}; thread_local! { // The tty file descriptor is stored in a global variable so it @@ -83,7 +84,8 @@ unsafe fn close_fds_fallback(keep_fds: &BTreeSet) { .collect(); for fd in open_fds.difference(keep_fds) { - close(*fd); + // SAFETY: The FD is valid + unsafe { close(*fd) }; } } @@ -108,12 +110,14 @@ unsafe fn close_unused_fds(keep_fds: &mut [RawFd]) { continue; } - if syscall(SYS_close_range, first, last, 0) == -1 { + // SAFETY: FDs are valid + if unsafe { syscall(SYS_close_range, first, last, 0) } == -1 { // The kernel might be too old to have close_range, in // which case we need to fall back to an uglier method. let e = io::Error::last_os_error(); if e.raw_os_error() == Some(ENOSYS) { - return close_fds_fallback(&keep_fds.iter().copied().collect()); + // SAFETY: FDs are valid + return unsafe { close_fds_fallback(&keep_fds.iter().copied().collect()) }; } panic!("close_range: {e}"); @@ -212,7 +216,8 @@ unsafe fn clone_clear_sighand() -> io::Result { ..Default::default() }; args.flags |= CLONE_CLEAR_SIGHAND; - let r = clone3(&mut args, size_of::()); + // SAFETY: parameters are assumed to be valid + let r = unsafe { clone3(&mut args, size_of::()) }; if r != -1 { return Ok(r.try_into().unwrap()); } @@ -223,13 +228,15 @@ unsafe fn clone_clear_sighand() -> io::Result { // If CLONE_CLEAR_SIGHAND isn't available, fall back to resetting // all the signal handlers one by one. - let r = fork(); + // SAFETY: trivially safe, and we check the return value. + let r = unsafe { fork() }; if r == -1 { return Err(io::Error::last_os_error()); } if r == 0 { for signum in 1.._NSIG { - let _ = signal(signum, SIG_DFL); + // SAFETY: trivially safe, we unset the user-space signal handler + let _ = unsafe { signal(signum, SIG_DFL) }; } } Ok(r.try_into().unwrap()) diff --git a/vmm/src/vcpu_throttling.rs b/vmm/src/vcpu_throttling.rs new file mode 100644 index 0000000000..7e74b702d5 --- /dev/null +++ b/vmm/src/vcpu_throttling.rs @@ -0,0 +1,604 @@ +// Copyright © 2025 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! # vCPU throttling for Auto Converging +//! +//! vCPU throttling is crucial to reach a reasonable downtime when using a +//! precopy strategy for live-migration of VMs with memory-intensive workloads. +//! Auto converge means an increasing vCPU throttling over time until the memory +//! delta is small enough for the migration thread(s) to perform the switch-over +//! to the new host. +//! +//! Therefore, the migration thread(s) use this thread to help them reach their +//! goal. Next to typical lifecycle management, this thread must fulfill various +//! requirements to ensure a minimal downtime. +//! +//! ## Thread Requirements +//! - Needs to be able to gracefully wait for work. +//! - Must be able to exit gracefully. +//! - Must be able to cancel any work and return to its init state to support +//! live-migration cancellation and restart of live-migrations. +//! - Must not block the migration thread(s) whenever possible, to facilitate +//! fast live-migrations with short downtimes. +//! - Must be interruptible during a sleep phase to not block the migration +//! thread(s). +//! - Must not confuse or hinder the migration thread(s) regarding +//! pause()/resume() operations. Context: migration thread shuts down the +//! vCPUs for the handover. The throttle thread must not restart the vCPUs +//! again. 
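+//!
+//! ## Throttling arithmetic (illustrative example)
+//!
+//! The throttle percentage is the share of each timeslice during which the
+//! vCPUs are kept paused. With the initial 100 ms timeslice and a throttle of
+//! 30 %, one cycle is roughly: pause all vCPUs, sleep ~30 ms, resume all
+//! vCPUs, sleep ~70 ms, where the measured pause()/resume() latencies of the
+//! previous cycle are subtracted from the respective sleeps (see
+//! [`TimesliceContext::calc_sleep_durations`]).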
+
+use std::cell::Cell;
+use std::cmp::min;
+use std::sync::mpsc::RecvTimeoutError;
+use std::sync::{Arc, Mutex, mpsc};
+use std::thread;
+use std::thread::JoinHandle;
+use std::time::{Duration, Instant};
+
+use vm_migration::Pausable;
+
+use crate::cpu::CpuManager;
+
+/// The possible command of the thread, i.e., the current state.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum ThrottleCommand {
+    /// Waiting for the next event.
+    Waiting,
+    /// Ongoing vCPU throttling.
+    ///
+    /// The inner value is the current throttling percentage in range `1..=99`.
+    Throttling(u8 /* `1..=99` */),
+    /// Thread is shutting down gracefully.
+    Exiting,
+}
+
+/// Helper to adapt the throttling timeslice as we go, depending on the time it
+/// takes to pause() and resume() all vCPUs.
+#[derive(Debug)]
+struct TimesliceContext {
+    current_timeslice: Duration,
+    /// Duration it took to pause() all vCPUs on the previous iteration.
+    previous_pause_duration: Duration,
+    /// Duration it took to resume() all vCPUs on the previous iteration.
+    previous_resume_duration: Duration,
+}
+
+impl TimesliceContext {
+    /// The initial timeslice for a throttling cycle (vCPU pause & resume).
+    const INITIAL_TIMESLICE: Duration = Duration::from_millis(100);
+
+    /// The minimal duration accounted for the pause()/resume() operations.
+    ///
+    /// Any value smaller than this is upgraded to this to prevent math
+    /// exceptions during timing calculations.
+    const MIN_DURATION: Duration = Duration::from_millis(1);
+
+    /// Maximum timeslice. This should not be too big.
+    ///
+    /// Otherwise, for example: assuming we have 10% throttling and a
+    /// 2000ms timeslice, the VM will be unresponsive for
+    /// 200ms every 1800ms. This is not convenient.
+    const MAX_TIMESLICE: Duration = Duration::from_millis(800);
+
+    /// Creates a new instance with [`Self::INITIAL_TIMESLICE`].
+    fn new() -> Self {
+        Self {
+            current_timeslice: Self::INITIAL_TIMESLICE,
+            previous_pause_duration: Self::MIN_DURATION,
+            previous_resume_duration: Self::MIN_DURATION,
+        }
+    }
+
+    /// Updates the timeslice.
+    fn update_timeslice(&mut self) {
+        // CpuManager::pause() plus CpuManager::resume() without additional delay is the shortest
+        // we can get.
+        let one_percent = self.previous_pause_duration + self.previous_resume_duration;
+        self.current_timeslice = one_percent * 100;
+        self.current_timeslice = min(self.current_timeslice, Self::MAX_TIMESLICE);
+    }
+
+    /// Calculates the sleep durations to apply after the `pause()` and `resume()` operations
+    /// with the current `timeslice`.
+    ///
+    /// It uses the `timeslice` that was calculated on the previous
+    /// invocation of [`Self::update_timeslice`].
+    fn calc_sleep_durations(
+        &mut self,
+        percentage: u64,
+    ) -> (
+        Duration, /* after pause */
+        Duration, /* after resume */
+    ) {
+        assert!(percentage <= 100);
+        assert!(percentage > 0);
+
+        let timeslice_ms = self.current_timeslice.as_millis() as u64;
+        let wait_ms_after_pause_ms = timeslice_ms * percentage / 100;
+        let wait_ms_after_resume_ms = timeslice_ms - wait_ms_after_pause_ms;
+
+        let wait_ms_after_pause_ms =
+            wait_ms_after_pause_ms.saturating_sub(self.previous_pause_duration.as_millis() as u64);
+        let wait_ms_after_resume_ms = wait_ms_after_resume_ms
+            .saturating_sub(self.previous_resume_duration.as_millis() as u64);
+
+        (
+            Duration::from_millis(wait_ms_after_pause_ms),
+            Duration::from_millis(wait_ms_after_resume_ms),
+        )
+    }
+
+    /// Set the previous pause duration.
+    ///
+    /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`].
+ pub fn set_previous_pause_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION + } + + self.previous_pause_duration = duration; + } + + /// Set the duration it took to `resume()` all vCPUs on the previous iteration. + /// + /// In case this is below [`Self::MIN_DURATION`], we upgrade it to [`Self::MIN_DURATION`]. + pub fn set_previous_resume_duration(&mut self, mut duration: Duration) { + if duration < Self::MIN_DURATION { + duration = Self::MIN_DURATION + } + self.previous_resume_duration = duration; + } +} + +/// Context of the vCPU throttle thread. +// The main justification for this dedicated type is to split the thread +// functions from the higher-level control API. +// TODO seccomp is missing +pub struct ThrottleWorker { + handle: Option>, +} + +impl ThrottleWorker { + /// This should not be named "vcpu*" as libvirt fails when + /// iterating the vCPU threads then. Fix this first in libvirt! + const THREAD_NAME: &'static str = "throttle-vcpu"; + + /// Executes the provided callback and goes to sleep until the specified + /// `sleep_duration` passed. + /// + /// The time to execute the callback itself is not taken into account + /// when sleeping for `sleep_duration`. Therefore, the callback is + /// supposed to be quick (a couple of milliseconds). + /// + /// The thread is interruptible during the sleep phase when the `receiver` + /// receives a new [`ThrottleCommand`]. + /// + /// # Arguments + /// - `callback`: Function to run + /// - `set_callback_duration`: Set the duration to execute the callback. + /// - `sleep_duration`: Duration this function takes at most, including + /// running the `callback`. + /// - `receiver`: Receiving end of the channel to the migration managing + /// thread. + fn execute_and_wait_interruptible( + callback: &impl Fn(), + mut set_callback_duration: impl FnMut(Duration), + sleep_duration: Duration, + receiver: &mpsc::Receiver, + ) -> Option { + let begin = Instant::now(); + callback(); + let cb_duration = begin.elapsed(); + // Help to adjust the timeslice in the next cycle. + set_callback_duration(cb_duration); + + // It might happen that sometimes we get interrupted during a sleep phase + // with a new higher throttle percentage but this is negligible. For an + // auto-converge cycle, there are typically only ~10 steps involved over + // a time frame from a couple of seconds up to a couple of minutes. + match receiver.recv_timeout(sleep_duration) { + Ok(next_task) => Some(next_task), + Err(RecvTimeoutError::Timeout) => None, + Err(RecvTimeoutError::Disconnected) => { + panic!("thread and channel should exit gracefully") + } + } + } + + /// Executes one throttling step: either pause or resume of vCPUs. + /// + /// Runs the given callback, then waits for the specified duration, unless + /// interrupted by a new [`ThrottleCommand`]. + /// + /// # Behavior + /// - Runs the provided `callback` immediately. + /// - Waits up to `duration` for new commands on the `receiver`. + /// - If no command arrives before the timeout, this step completes + /// normally and returns `None`. + /// - If a [`ThrottleCommand::Throttling`] arrives, updates the current + /// throttle percentage in `current_throttle` and continues with the + /// loop. Returns `None`. + /// - If a [`ThrottleCommand::Waiting`] or [`ThrottleCommand::Exiting`] + /// arrives, this command is forwarded to the caller. + /// + /// # Arguments + /// - `callback`: Function to run (e.g., pause or resume vCPUs). 
+ /// - `set_callback_duration`: Set the duration to execute the callback. + /// - `receiver`: Channel for receiving new [`ThrottleCommand`]s. + /// - `current_throttle`: Mutable reference to the current throttle + /// percentage (updated on [`ThrottleCommand::Throttling`]). + /// + /// # Returns + /// - `None` if the throttling cycle should continue. + /// - `Some(ThrottleCommand::Waiting | ThrottleCommand::Exiting)` if + /// throttling should stop. + fn throttle_step( + callback: &F, + set_callback_duration: impl FnMut(Duration), + duration: Duration, + receiver: &mpsc::Receiver, + current_throttle: &mut u64, + ) -> Option + where + F: Fn(), + { + let maybe_task = Self::execute_and_wait_interruptible( + callback, + set_callback_duration, + duration, + receiver, + ); + match maybe_task { + None => None, + Some(ThrottleCommand::Throttling(next)) => { + // A new throttle value is only applied at the end of a full + // throttling cycle. This is fine and negligible in a series of + // (tens of) thousands of cycles. + *current_throttle = next as u64; + None + } + Some(cmd @ (ThrottleCommand::Exiting | ThrottleCommand::Waiting)) => Some(cmd), + } + } + + /// Helper for [`Self::control_loop`] that runs the actual throttling loop. + /// + /// This function returns the next [`ThrottleCommand`] **only** if the thread + /// stopped the vCPU throttling. + fn throttle_loop( + receiver: &mpsc::Receiver, + initial_throttle: u8, + callback_pause_vcpus: &impl Fn(), + callback_resume_vcpus: &impl Fn(), + ) -> ThrottleCommand { + // The current throttle value, as long as the thread is throttling. + let mut current_throttle = initial_throttle as u64; + let mut timeslice_ctx = TimesliceContext::new(); + + loop { + // Catch logic bug: We should have exited in this case already. + assert_ne!(current_throttle, 0); + assert!(current_throttle < 100); + + let (wait_ms_after_pause, wait_ms_after_resume) = + timeslice_ctx.calc_sleep_durations(current_throttle); + + // pause vCPUs + if let Some(cmd) = Self::throttle_step( + callback_pause_vcpus, + |new_duration| timeslice_ctx.set_previous_pause_duration(new_duration), + wait_ms_after_pause, + receiver, + &mut current_throttle, + ) { + // TODO: future optimization + // Prevent unnecessary resume() here when the migration thread + // performs .pause() right after anyway. We could make .pause() and + // .resume() idempotent. + callback_resume_vcpus(); + // We only exit here in case if ThrottleCommand::Waiting or ::Exiting + return cmd; + } + + // resume vCPUs + if let Some(cmd) = Self::throttle_step( + callback_resume_vcpus, + |new_duration| timeslice_ctx.set_previous_resume_duration(new_duration), + wait_ms_after_resume, + receiver, + &mut current_throttle, + ) { + // We only exit here in case if ThrottleCommand::Waiting or ::Exiting + return cmd; + } + + // Update timeslice for next cycle. This way, we can closely match the expected + // percentage for pause() and resume(). + timeslice_ctx.update_timeslice(); + } + } + + /// Implements the control loop of the thread. + /// + /// It wraps the actual throttling with the necessary thread lifecycle + /// management. + fn control_loop( + receiver: mpsc::Receiver, + callback_pause_vcpus: impl Fn() + Send + 'static, + callback_resume_vcpus: impl Fn() + Send + 'static, + ) -> impl Fn() { + move || { + // In the outer loop, we gracefully wait for commands. 
+ 'control: loop { + let thread_task = receiver.recv().expect("channel should not be closed"); + match thread_task { + ThrottleCommand::Exiting => { + break 'control; + } + ThrottleCommand::Waiting => { + continue 'control; + } + ThrottleCommand::Throttling(initial_throttle) => { + let next_task = Self::throttle_loop( + &receiver, + initial_throttle, + &callback_pause_vcpus, + &callback_resume_vcpus, + ); + if next_task == ThrottleCommand::Exiting { + break 'control; + } + // else: thread is in Waiting state + } + } + } + debug!("thread exited gracefully"); + } + } + + /// Spawns a new thread. + fn spawn( + receiver: mpsc::Receiver, + callback_pause_vcpus: impl Fn() + Send + 'static, + callback_resume_vcpus: impl Fn() + Send + 'static, + ) -> Self { + let handle = { + let thread_fn = + Self::control_loop(receiver, callback_pause_vcpus, callback_resume_vcpus); + thread::Builder::new() + .name(String::from(Self::THREAD_NAME)) + .spawn(thread_fn) + .expect("should spawn thread") + }; + + Self { + handle: Some(handle), + } + } +} + +impl Drop for ThrottleWorker { + fn drop(&mut self) { + // Note: The thread handle must send the shutdown command first. + if let Some(handle) = self.handle.take() { + handle.join().expect("thread should have succeeded"); + } + } +} + +/// Handler for controlling the vCPU throttle thread. +/// +/// vCPU throttling is needed for live-migration of memory-intensive workloads. +/// The current design assumes that all vCPUs are throttled equally. +/// +/// # Transitions +/// - `Waiting` -> `Throttling(x %)`, `Exit` +/// - `Throttling(x %)` -> `Exit`, `Waiting`, `Throttling(y %)` +/// - `Exiting` +pub struct ThrottleThreadHandle { + /// Thread state wrapped by synchronization primitives. + state_sender: mpsc::Sender, + /// Current throttle value. + /// + /// This is the last throttle value that was sent to the + /// thread. + current_throttle: Cell, + /// The underlying thread handle. Option to have more control over when it is dropped. + throttle_thread: Option, +} + +impl ThrottleThreadHandle { + /// Spawns a new thread and returning a handle to it. + /// + /// # Parameters + /// - `cpu_manager`: CPU manager to pause and resume vCPUs + pub fn new_from_cpu_manager(cpu_manager: &Arc>) -> Self { + let callback_pause_vcpus = { + let cpu_manager = cpu_manager.clone(); + Box::new(move || cpu_manager.lock().unwrap().pause().unwrap()) + }; + + let callback_resume_vcpus = { + let cpu_manager = cpu_manager.clone(); + Box::new(move || cpu_manager.lock().unwrap().resume().unwrap()) + }; + + Self::new(callback_pause_vcpus, callback_resume_vcpus) + } + + /// Spawns a new thread and returning a handle to it. + /// + /// This function returns when the thread gracefully arrived in + /// [`ThrottleCommand::Waiting`]. + /// + /// # Parameters + /// - `callback_pause_vcpus`: Function putting all vCPUs into pause state. The + /// function must not perform any artificial delay itself. + /// - `callback_resume_vcpus`: Function putting all vCPUs back into running + /// state. The function must not perform any artificial delay itself. + fn new( + callback_pause_vcpus: Box, + callback_resume_vcpus: Box, + ) -> Self { + // Channel used for synchronization. 
+ let (sender, receiver) = mpsc::channel::<ThrottleCommand>();
+
+ let thread = ThrottleWorker::spawn(receiver, callback_pause_vcpus, callback_resume_vcpus);
+
+ Self {
+ state_sender: sender,
+ current_throttle: Cell::new(0),
+ throttle_thread: Some(thread),
+ }
+ }
+
+ /// Sets the throttle percentage to a value in range `0..=99` and updates
+ /// the thread's state.
+ ///
+ /// Setting the value back to `0` puts the thread back into
+ /// [`ThrottleCommand::Waiting`].
+ ///
+ /// In case of an ongoing throttling cycle (vCPU pause & resume), any new
+ /// throttling percentage will be applied no later than when the current cycle
+ /// ends.
+ ///
+ /// # Panic
+ /// Panics if `percent_new` is not in range `0..=99`.
+ pub fn set_throttle_percent(&self, percent_new: u8) {
+ assert!(
+ percent_new < 100,
+ "setting a percentage of 100 or above is not allowed: {percent_new}%"
+ );
+
+ // We have no problematic race condition here as in normal operation
+ // there is exactly one thread calling these functions.
+ let percent_old = self.throttle_percent();
+
+ // Return early, no action needed.
+ if percent_old == percent_new {
+ return;
+ }
+
+ if percent_new == 0 {
+ self.state_sender
+ .send(ThrottleCommand::Waiting)
+ .expect("channel should not be closed");
+ } else {
+ self.state_sender
+ .send(ThrottleCommand::Throttling(percent_new))
+ .expect("channel should not be closed");
+ };
+
+ self.current_throttle.set(percent_new);
+ }
+
+ /// Get the current throttle percentage in range `0..=99`.
+ ///
+ /// Please note that the value is not synchronized.
+ pub fn throttle_percent(&self) -> u8 {
+ self.current_throttle.get()
+ }
+
+ /// Stops and terminates the thread gracefully.
+ ///
+ /// Waits for the thread to finish. This function **must** be called before
+ /// the migration thread(s) do anything with the CPU manager to prevent
+ /// odd states.
+ pub fn shutdown(&mut self) {
+ let begin = Instant::now();
+
+ {
+ // drop thread; ensure that the channel is still alive when it is dropped
+ if let Some(worker) = self.throttle_thread.take() {
+ self.state_sender
+ .send(ThrottleCommand::Exiting)
+ .expect("channel should not be closed");
+
+ // Ensure the sender is still alive when the worker is dropped.
+ drop(worker);
+ }
+ }
+
+ let elapsed = begin.elapsed();
+ if elapsed > Duration::from_millis(20) {
+ warn!(
+ "shutting down the thread took too long ({} ms): this increases the downtime!",
+ elapsed.as_millis()
+ );
+ }
+ }
+}
+
+impl Drop for ThrottleThreadHandle {
+ fn drop(&mut self) {
+ self.shutdown();
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::sync::atomic::{AtomicBool, Ordering};
+ use std::thread::sleep;
+
+ use super::*;
+
+ // The test is successful if it does not get stuck. Then, the thread exits
+ // gracefully.
+ #[test]
+ fn test_vcpu_throttling_thread_lifecycle() {
+ for _ in 0..5 {
+ // State transitions: Waiting -> Exit
+ {
+ let mut handler = ThrottleThreadHandle::new(Box::new(|| {}), Box::new(|| {}));
+
+ // The test is successful if it does not get stuck.
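+ // `shutdown()` sends `Exiting` and then joins the worker, so a lost
+ // command would show up as a hang right here.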
+ handler.shutdown(); + } + + // Dummy CpuManager + let cpus_throttled = Arc::new(AtomicBool::new(false)); + let callback_pause_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(true, Ordering::SeqCst); + assert!(!old); + }) + }; + let callback_resume_vcpus = { + let cpus_running = cpus_throttled.clone(); + Box::new(move || { + let old = cpus_running.swap(false, Ordering::SeqCst); + assert!(old); + }) + }; + + // State transitions: Waiting -> Throttle -> Waiting -> Throttle -> Exit + { + let mut handler = + ThrottleThreadHandle::new(callback_pause_vcpus, callback_resume_vcpus); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // Assume we aborted vCPU throttling (or the live-migration at all). + handler.set_throttle_percent(0 /* reset to waiting */); + handler.set_throttle_percent(5); + sleep(TimesliceContext::INITIAL_TIMESLICE); + handler.set_throttle_percent(10); + sleep(TimesliceContext::INITIAL_TIMESLICE); + + // The test is successful if we don't have a panic here due to a + // closed channel. + for _ in 0..10 { + handler.shutdown(); + sleep(Duration::from_millis(1)); + } + + // The test is successful if it does not get stuck. + drop(handler); + } + } + } +} diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index d7bba25cc0..aa981e226b 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -25,16 +25,20 @@ use std::time::Instant; use std::{cmp, result, str, thread}; use anyhow::anyhow; +#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] +use arch::PciSpaceInfo; #[cfg(target_arch = "x86_64")] use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; +#[cfg(all(feature = "kvm", target_arch = "x86_64"))] +use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY; #[cfg(feature = "tdx")] use arch::x86_64::tdx::TdvfSection; -#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] -use arch::PciSpaceInfo; -use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes}; +use arch::{EntryPoint, NumaNode, NumaNodes, get_host_cpu_phys_bits}; +use devices::AcpiNotificationFlags; #[cfg(target_arch = "aarch64")] use devices::interrupt_controller; -use devices::AcpiNotificationFlags; +#[cfg(feature = "fw_cfg")] +use devices::legacy::fw_cfg::FwCfgItem; #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -42,17 +46,17 @@ use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; #[cfg(target_arch = "aarch64")] use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ; use hypervisor::{HypervisorVmError, VmOps}; -use libc::{termios, SIGWINCH}; +use libc::{SIGWINCH, termios}; use linux_loader::cmdline::Cmdline; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use linux_loader::elf; +use linux_loader::loader::KernelLoader; #[cfg(target_arch = "x86_64")] use linux_loader::loader::bzimage::BzImage; #[cfg(target_arch = "x86_64")] use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] use linux_loader::loader::pe::Error::InvalidImageMagicNumber; -use linux_loader::loader::KernelLoader; use seccompiler::SeccompAction; use serde::{Deserialize, Serialize}; use thiserror::Error; @@ -60,17 +64,15 @@ use tracer::trace_scoped; use vm_device::Bus; #[cfg(feature = "tdx")] use vm_memory::{Address, ByteValued, GuestMemoryRegion, 
ReadVolatile}; -use vm_memory::{ - Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, -}; +use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic}; use vm_migration::protocol::{MemoryRangeTable, Request, Response}; use vm_migration::{ - snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, + Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, snapshot_from_id, }; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::sock_ctrl_msg::ScmSocket; -use crate::config::{add_to_config, ValidationError}; +use crate::config::{ValidationError, add_to_config}; use crate::console_devices::{ConsoleDeviceError, ConsoleInfo}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::coredump::{ @@ -90,14 +92,17 @@ use crate::memory_manager::{ use crate::migration::get_vm_snapshot; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] use crate::migration::url_to_file; -use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; +use crate::migration::{SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE, url_to_path}; +use crate::vcpu_throttling::ThrottleThreadHandle; +#[cfg(feature = "fw_cfg")] +use crate::vm_config::FwCfgConfig; use crate::vm_config::{ DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig, PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, }; use crate::{ - cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, - MEMORY_MANAGER_SNAPSHOT_ID, + CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, GuestMemoryMmap, + MEMORY_MANAGER_SNAPSHOT_ID, PciDeviceInfo, cpu, }; /// Errors associated with VM management @@ -116,6 +121,10 @@ pub enum Error { #[error("Cannot load the UEFI binary in memory")] UefiLoad(#[source] arch::aarch64::uefi::Error), + #[cfg(target_arch = "riscv64")] + #[error("Cannot load the UEFI binary in memory")] + UefiLoad(#[source] arch::riscv64::uefi::Error), + #[error("Cannot load the initramfs into memory")] InitramfsLoad, @@ -168,6 +177,9 @@ pub enum Error { #[error("VM is not running")] VmNotRunning, + #[error("VM is currently migrating and can't be modified")] + VmMigrating, + #[error("Cannot clone EventFd")] EventFdClone(#[source] io::Error), @@ -234,6 +246,9 @@ pub enum Error { #[error("Failed resizing a memory zone")] ResizeZone, + #[error("Failed resizing a disk image")] + ResizeDisk, + #[error("Cannot activate virtio devices")] ActivateVirtioDevices(#[source] DeviceManagerError), @@ -309,9 +324,6 @@ pub enum Error { #[error("Error joining kernel loading thread")] KernelLoadThreadJoin(std::boxed::Box), - #[error("Payload configuration is not bootable")] - InvalidPayload, - #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] #[error("Error coredumping VM")] Coredump(#[source] GuestDebuggableError), @@ -335,6 +347,38 @@ pub enum Error { #[error("Error locking disk images: Another instance likely holds a lock")] LockingError(#[source] DeviceManagerError), + + #[cfg(feature = "fw_cfg")] + #[error("Fw Cfg missing kernel")] + MissingFwCfgKernelFile(#[source] io::Error), + + #[cfg(feature = "fw_cfg")] + #[error("Fw Cfg missing initramfs")] + MissingFwCfgInitramfs(#[source] io::Error), + + #[cfg(feature = "fw_cfg")] + #[error("Fw Cfg missing kernel cmdline")] + MissingFwCfgCmdline, + + #[cfg(feature = "fw_cfg")] + #[error("Error creating e820 map")] + CreatingE820Map(#[source] io::Error), + + #[cfg(feature = "fw_cfg")] + 
#[error("Error creating acpi tables")] + CreatingAcpiTables(#[source] io::Error), + + #[cfg(feature = "fw_cfg")] + #[error("Error adding fw_cfg item")] + AddingFwCfgItem(#[source] io::Error), + + #[cfg(feature = "fw_cfg")] + #[error("Error populating fw_cfg")] + ErrorPopulatingFwCfg(#[source] io::Error), + + #[cfg(feature = "fw_cfg")] + #[error("Error using fw_cfg while disabled")] + FwCfgDisabled, } pub type Result = result::Result; @@ -482,6 +526,7 @@ pub struct Vm { hypervisor: Arc, stop_on_boot: bool, load_payload_handle: Option>>, + vcpu_throttler: ThrottleThreadHandle, } impl Vm { @@ -569,7 +614,6 @@ impl Vm { .lock() .unwrap() .populate_cpuid( - &memory_manager, &hypervisor, #[cfg(feature = "tdx")] tdx_enabled, @@ -581,7 +625,7 @@ impl Vm { #[cfg(feature = "tdx")] if tdx_enabled { let cpuid = cpu_manager.lock().unwrap().common_cpuid(); - let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; + let max_vcpus = cpu_manager.lock().unwrap().max_vcpus(); vm.tdx_init(&cpuid, max_vcpus) .map_err(Error::InitializeTdxVm)?; } @@ -716,6 +760,24 @@ impl Vm { vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; } + #[cfg(feature = "fw_cfg")] + { + let fw_cfg_config = config + .lock() + .unwrap() + .payload + .as_ref() + .map(|p| p.fw_cfg_config.is_some()) + .unwrap_or(false); + if fw_cfg_config { + device_manager + .lock() + .unwrap() + .create_fw_cfg_device() + .map_err(Error::DeviceManager)?; + } + } + #[cfg(feature = "tdx")] let kernel = config .lock() @@ -751,6 +813,10 @@ impl Vm { VmState::Created }; + // TODO we could also spawn the thread when a migration with auto-converge starts. + // Probably this is the better design. + let vcpu_throttler = ThrottleThreadHandle::new_from_cpu_manager(&cpu_manager); + Ok(Vm { #[cfg(feature = "tdx")] kernel, @@ -770,9 +836,94 @@ impl Vm { hypervisor, stop_on_boot, load_payload_handle, + vcpu_throttler, }) } + #[cfg(feature = "fw_cfg")] + fn populate_fw_cfg( + fw_cfg_config: &FwCfgConfig, + device_manager: &Arc>, + config: &Arc>, + ) -> Result<()> { + let mut e820_option: Option = None; + if fw_cfg_config.e820 { + e820_option = Some(config.lock().unwrap().memory.size as usize); + } + let mut kernel_option: Option = None; + if fw_cfg_config.kernel { + let kernel = config + .lock() + .unwrap() + .payload + .as_ref() + .map(|p| p.kernel.as_ref().map(File::open)) + .unwrap_or_default() + .transpose() + .map_err(Error::MissingFwCfgKernelFile)?; + kernel_option = kernel; + } + let mut cmdline_option: Option = None; + if fw_cfg_config.cmdline { + let cmdline = Vm::generate_cmdline( + config.lock().unwrap().payload.as_ref().unwrap(), + #[cfg(target_arch = "aarch64")] + device_manager, + ) + .map_err(|_| Error::MissingFwCfgCmdline)? 
+ .as_cstring() + .map_err(|_| Error::MissingFwCfgCmdline)?; + cmdline_option = Some(cmdline); + } + let mut initramfs_option: Option = None; + if fw_cfg_config.initramfs { + let initramfs = config + .lock() + .unwrap() + .payload + .as_ref() + .map(|p| p.initramfs.as_ref().map(File::open)) + .unwrap_or_default() + .transpose() + .map_err(Error::MissingFwCfgInitramfs)?; + // We measure the initramfs when running Oak Containers in SNP mode (initramfs = Stage1) + // o/w use Stage0 to launch cloud disk images + initramfs_option = initramfs; + } + let mut fw_cfg_item_list_option: Option> = None; + if let Some(fw_cfg_files) = &fw_cfg_config.items { + let mut fw_cfg_item_list = vec![]; + for fw_cfg_file in fw_cfg_files.item_list.clone() { + fw_cfg_item_list.push(FwCfgItem { + name: fw_cfg_file.name, + content: devices::legacy::fw_cfg::FwCfgContent::File( + 0, + File::open(fw_cfg_file.file).map_err(Error::AddingFwCfgItem)?, + ), + }); + } + fw_cfg_item_list_option = Some(fw_cfg_item_list); + } + + let device_manager_binding = device_manager.lock().unwrap(); + let Some(fw_cfg) = device_manager_binding.fw_cfg() else { + return Err(Error::FwCfgDisabled); + }; + + fw_cfg + .lock() + .unwrap() + .populate_fw_cfg( + e820_option, + kernel_option, + initramfs_option, + cmdline_option, + fw_cfg_item_list_option, + ) + .map_err(Error::ErrorPopulatingFwCfg)?; + Ok(()) + } + fn create_numa_nodes( configs: Option>, memory_manager: &Arc>, @@ -832,24 +983,6 @@ impl Vm { } } - #[cfg(target_arch = "x86_64")] - if let Some(sgx_epc_sections) = &config.sgx_epc_sections { - if let Some(sgx_epc_region) = mm.sgx_epc_region() { - let mm_sections = sgx_epc_region.epc_sections(); - for sgx_epc_section in sgx_epc_sections.iter() { - if let Some(mm_section) = mm_sections.get(sgx_epc_section) { - node.sgx_epc_sections.push(mm_section.clone()); - } else { - error!("Unknown SGX EPC section '{}'", sgx_epc_section); - return Err(Error::InvalidNumaConfig); - } - } - } else { - error!("Missing SGX EPC region"); - return Err(Error::InvalidNumaConfig); - } - } - numa_nodes.insert(config.guest_numa_id, node); } } @@ -857,6 +990,31 @@ impl Vm { Ok(numa_nodes) } + /// Set's the throttle percentage to a value in range `0..=99`. + /// + /// Setting the value back to `0` brings the thread back into a waiting + /// state. + /// + /// # Panic + /// Panics, if `percent_new` is not in range `0..=99`. + pub fn set_throttle_percent(&self, percent: u8 /* 1..=99 */) { + self.vcpu_throttler.set_throttle_percent(percent); + } + + /// Get the current throttle percentage in range `0..=99`. + /// + /// Please note that the value is not synchronized. + pub fn throttle_percent(&self) -> u8 { + self.vcpu_throttler.throttle_percent() + } + + /// Stops and terminates the thread gracefully. + /// + /// Waits for the thread to finish. + pub fn stop_vcpu_throttling(&mut self) { + self.vcpu_throttler.shutdown(); + } + #[allow(clippy::too_many_arguments)] pub fn new( vm_config: Arc>, @@ -902,6 +1060,11 @@ impl Vm { vm_config.lock().unwrap().memory.total_size(), )?; + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + if vm_config.lock().unwrap().max_apic_id() > MAX_SUPPORTED_CPUS_LEGACY { + vm.enable_x2apic_api().unwrap(); + } + let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); let memory_manager = if let Some(snapshot) = @@ -917,9 +1080,6 @@ impl Vm { ) .map_err(Error::MemoryManager)? 
} else { - #[cfg(target_arch = "x86_64")] - let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); - MemoryManager::new( vm.clone(), &vm_config.lock().unwrap().memory.clone(), @@ -928,9 +1088,7 @@ impl Vm { #[cfg(feature = "tdx")] tdx_enabled, None, - None, - #[cfg(target_arch = "x86_64")] - sgx_epc_config, + Default::default(), ) .map_err(Error::MemoryManager)? }; @@ -1033,90 +1191,49 @@ impl Vm { Ok(cmdline) } - #[cfg(target_arch = "aarch64")] - fn load_firmware(mut firmware: &File, memory_manager: Arc>) -> Result<()> { + #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] + fn load_firmware( + mut firmware: &File, + memory_manager: Arc>, + ) -> Result { let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); let mem = uefi_flash.memory(); - arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) + arch::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) .map_err(Error::UefiLoad)?; - Ok(()) - } - - #[cfg(target_arch = "aarch64")] - fn load_kernel( - firmware: Option, - kernel: Option, - memory_manager: Arc>, - ) -> Result { - let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); - let mem = guest_memory.memory(); - let entry_addr = match (firmware, kernel) { - (None, Some(mut kernel)) => { - match linux_loader::loader::pe::PE::load( - mem.deref(), - Some(arch::layout::KERNEL_START), - &mut kernel, - None, - ) { - Ok(entry_addr) => entry_addr.kernel_load, - // Try to load the binary as kernel PE file at first. - // If failed, retry to load it as UEFI binary. - // As the UEFI binary is formatless, it must be the last option to try. - Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { - Self::load_firmware(&kernel, memory_manager)?; - arch::layout::UEFI_START - } - Err(e) => { - return Err(Error::KernelLoad(e)); - } - } - } - (Some(firmware), None) => { - Self::load_firmware(&firmware, memory_manager)?; - arch::layout::UEFI_START - } - _ => return Err(Error::InvalidPayload), - }; - - Ok(EntryPoint { entry_addr }) + Ok(EntryPoint { + entry_addr: arch::layout::UEFI_START, + }) } - #[cfg(target_arch = "riscv64")] + #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] fn load_kernel( - firmware: Option, - kernel: Option, + mut kernel: File, memory_manager: Arc>, ) -> Result { let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); let mem = guest_memory.memory(); let alignment = 0x20_0000; - let aligned_kernel_addr = arch::layout::KERNEL_START.0 + (alignment - 1) & !(alignment - 1); - let entry_addr = match (firmware, kernel) { - (None, Some(mut kernel)) => { - match linux_loader::loader::pe::PE::load( - mem.deref(), - Some(GuestAddress(aligned_kernel_addr)), - &mut kernel, - None, - ) { - Ok(entry_addr) => entry_addr.kernel_load, - // Try to load the binary as kernel PE file at first. - // If failed, retry to load it as UEFI binary. - // As the UEFI binary is formatless, it must be the last option to try. - Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { - // TODO: UEFI for riscv64 is scheduled to next stage. 
- unimplemented!() - } - Err(e) => { - return Err(Error::KernelLoad(e)); - } + // round up + let aligned_kernel_addr = arch::layout::KERNEL_START.0.div_ceil(alignment) * alignment; + let entry_addr = { + match linux_loader::loader::pe::PE::load( + mem.deref(), + Some(GuestAddress(aligned_kernel_addr)), + &mut kernel, + None, + ) { + Ok(entry_addr) => entry_addr.kernel_load, + // Try to load the binary as kernel PE file at first. + // If failed, retry to load it as UEFI binary. + // As the UEFI binary is formatless, it must be the last option to try. + Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { + Self::load_firmware(&kernel, memory_manager)?; + arch::layout::UEFI_START + } + Err(e) => { + return Err(Error::KernelLoad(e)); } } - (Some(_firmware), None) => { - // TODO: UEFI for riscv64 is scheduled to next stage. - unimplemented!() - } - _ => return Err(Error::InvalidPayload), }; Ok(EntryPoint { entry_addr }) @@ -1153,6 +1270,9 @@ impl Vm { Ok(entry_point) } + /// Loads the kernel or a firmware file. + /// + /// For x86_64, the boot path is the same. #[cfg(target_arch = "x86_64")] fn load_kernel( mut kernel: File, @@ -1231,22 +1351,19 @@ impl Vm { return Self::load_igvm(igvm, memory_manager, cpu_manager); } } - match ( - &payload.firmware, - &payload.kernel, - &payload.initramfs, - &payload.cmdline, - ) { - (Some(firmware), None, None, None) => { + match (&payload.firmware, &payload.kernel) { + (Some(firmware), None) => { let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; Self::load_kernel(firmware, None, memory_manager) } - (None, Some(kernel), _, _) => { + (None, Some(kernel)) => { let kernel = File::open(kernel).map_err(Error::KernelFile)?; let cmdline = Self::generate_cmdline(payload)?; Self::load_kernel(kernel, Some(cmdline), memory_manager) } - _ => Err(Error::InvalidPayload), + _ => unreachable!( + "Unsupported boot configuration: programming error from 'PayloadConfigError::validate()'" + ), } } @@ -1258,13 +1375,15 @@ impl Vm { match (&payload.firmware, &payload.kernel) { (Some(firmware), None) => { let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; - Self::load_kernel(Some(firmware), None, memory_manager) + Self::load_firmware(&firmware, memory_manager) } (None, Some(kernel)) => { let kernel = File::open(kernel).map_err(Error::KernelFile)?; - Self::load_kernel(None, Some(kernel), memory_manager) + Self::load_kernel(kernel, memory_manager) } - _ => Err(Error::InvalidPayload), + _ => unreachable!( + "Unsupported boot configuration: programming error from 'PayloadConfigError::validate()'" + ), } } @@ -1321,13 +1440,6 @@ impl Vm { let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); let rsdp_addr = Some(rsdp_addr); - let sgx_epc_region = self - .memory_manager - .lock() - .unwrap() - .sgx_epc_region() - .as_ref() - .cloned(); let serial_number = self .config @@ -1367,7 +1479,6 @@ impl Vm { boot_vcpus, entry_addr.setup_header, rsdp_addr, - sgx_epc_region, serial_number.as_deref(), uuid.as_deref(), oem_strings.as_deref(), @@ -1568,7 +1679,7 @@ impl Vm { pub fn resize( &mut self, - desired_vcpus: Option, + desired_vcpus: Option, desired_memory: Option, desired_balloon: Option, ) -> Result<()> { @@ -1654,6 +1765,16 @@ impl Vm { Ok(()) } + pub fn resize_disk(&mut self, id: String, desired_size: u64) -> Result<()> { + self.device_manager + .lock() + .unwrap() + .resize_disk(&id, desired_size) + .map_err(Error::DeviceManager)?; + + Ok(()) + } + pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { let 
memory_config = &mut self.config.lock().unwrap().memory; @@ -2250,8 +2371,44 @@ impl Vm { } else { VmState::Running }; + current_state.valid_transition(new_state)?; + #[cfg(feature = "fw_cfg")] + { + let fw_cfg_enabled = self + .config + .lock() + .unwrap() + .payload + .as_ref() + .map(|p| p.fw_cfg_config.is_some()) + .unwrap_or(false); + if fw_cfg_enabled { + let fw_cfg_config = self + .config + .lock() + .unwrap() + .payload + .as_ref() + .map(|p| p.fw_cfg_config.clone()) + .unwrap_or_default() + .ok_or(Error::VmMissingConfig)?; + Self::populate_fw_cfg(&fw_cfg_config, &self.device_manager, &self.config)?; + + if fw_cfg_config.acpi_tables { + let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); + crate::acpi::create_acpi_tables_for_fw_cfg( + &self.device_manager, + &self.cpu_manager, + &self.memory_manager, + &self.numa_nodes, + tpm_enabled, + )? + } + } + } + // Do earlier to parallelise with loading kernel #[cfg(target_arch = "x86_64")] cfg_if::cfg_if! { @@ -2332,15 +2489,16 @@ impl Vm { let rsdp_addr = self.create_acpi_tables(); #[cfg(not(target_arch = "riscv64"))] - // Configure shared state based on loaded kernel - entry_point - .map(|entry_point| { - // Safe to unwrap rsdp_addr as we know it can't be None when - // the entry_point is Some. - self.configure_system(rsdp_addr.unwrap(), entry_point) - }) - .transpose()?; - + { + #[cfg(not(feature = "sev_snp"))] + assert!(rsdp_addr.is_some()); + // Configure shared state based on loaded kernel + if let Some(rsdp_adr) = rsdp_addr { + entry_point + .map(|entry_point| self.configure_system(rsdp_adr, entry_point)) + .transpose()?; + } + } #[cfg(target_arch = "riscv64")] self.configure_system().unwrap(); @@ -2447,46 +2605,8 @@ impl Vm { Ok(()) } - pub fn send_memory_regions( - &mut self, - ranges: &MemoryRangeTable, - fd: &mut F, - ) -> std::result::Result<(), MigratableError> - where - F: WriteVolatile, - { - let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); - let mem = guest_memory.memory(); - - for range in ranges.regions() { - let mut offset: u64 = 0; - // Here we are manually handling the retry in case we can't the - // whole region at once because we can't use the implementation - // from vm-memory::GuestMemory of write_all_to() as it is not - // following the correct behavior. 
For more info about this issue - // see: https://github.com/rust-vmm/vm-memory/issues/174 - loop { - let bytes_written = mem - .write_volatile_to( - GuestAddress(range.gpa + offset), - fd, - (range.length - offset) as usize, - ) - .map_err(|e| { - MigratableError::MigrateSend(anyhow!( - "Error transferring memory to socket: {}", - e - )) - })?; - offset += bytes_written as u64; - - if offset == range.length { - break; - } - } - } - - Ok(()) + pub fn guest_memory(&self) -> GuestMemoryAtomic { + self.memory_manager.lock().unwrap().guest_memory() } pub fn memory_range_table(&self) -> std::result::Result { @@ -2605,7 +2725,7 @@ impl Vm { &mut self, destination_url: &str, ) -> std::result::Result { - let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; + let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus; let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; let mut elf_phdr_num = 1; let elf_sh_info = 0; @@ -2773,20 +2893,22 @@ impl Snapshottable for Vm { #[cfg(all(feature = "kvm", target_arch = "x86_64"))] let common_cpuid = { - let amx = self.config.lock().unwrap().cpus.features.amx; - let phys_bits = physical_bits( - &self.hypervisor, - self.config.lock().unwrap().cpus.max_phys_bits, - ); + let guard = self.config.lock().unwrap(); + let amx = guard.cpus.features.amx; + let phys_bits = physical_bits(&self.hypervisor, guard.cpus.max_phys_bits); + let kvm_hyperv = guard.cpus.kvm_hyperv; + let profile = guard.cpus.profile; + // Drop the guard before function call + core::mem::drop(guard); arch::generate_common_cpuid( &self.hypervisor, &arch::CpuidConfig { - sgx_epc_sections: None, phys_bits, - kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, + kvm_hyperv, #[cfg(feature = "tdx")] tdx: false, amx, + profile, }, ) .map_err(|e| { @@ -3012,12 +3134,12 @@ impl GuestDebuggable for Vm { #[cfg(feature = "tdx")] { - if let Some(ref platform) = self.config.lock().unwrap().platform { - if platform.tdx { - return Err(GuestDebuggableError::Coredump(anyhow!( - "Coredump not possible with TDX VM" - ))); - } + if let Some(ref platform) = self.config.lock().unwrap().platform + && platform.tdx + { + return Err(GuestDebuggableError::Coredump(anyhow!( + "Coredump not possible with TDX VM" + ))); } } @@ -3394,7 +3516,7 @@ mod tests { &mem, "console=tty0", vec![0], - Some((0, 0, 0)), + Some((0, 0, 0, 0)), &dev_info, &gic, &None, diff --git a/vmm/src/vm_config.rs b/vmm/src/vm_config.rs index a2c5b996b4..7238a7ca23 100644 --- a/vmm/src/vm_config.rs +++ b/vmm/src/vm_config.rs @@ -3,15 +3,19 @@ // SPDX-License-Identifier: Apache-2.0 // use std::net::{IpAddr, Ipv4Addr}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; +#[cfg(feature = "fw_cfg")] +use std::str::FromStr; use std::{fs, result}; +use arch::CpuProfile; use net_util::MacAddr; use serde::{Deserialize, Serialize}; +use thiserror::Error; use virtio_devices::RateLimiterConfig; -use crate::landlock::LandlockError; use crate::Landlock; +use crate::landlock::LandlockError; pub type LandlockResult = result::Result; @@ -23,7 +27,7 @@ pub(crate) trait ApplyLandlock { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct CpuAffinity { - pub vcpu: u8, + pub vcpu: u32, pub host_cpus: Vec, } @@ -36,10 +40,10 @@ pub struct CpuFeatures { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct CpuTopology { - pub threads_per_core: u8, - pub cores_per_die: u8, - pub dies_per_package: u8, - pub packages: u8, + pub threads_per_core: u16, + pub cores_per_die: u16, + pub 
dies_per_package: u16, + pub packages: u16, } // When booting with PVH boot the maximum physical addressable size @@ -53,8 +57,8 @@ pub fn default_cpuconfig_max_phys_bits() -> u8 { #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] pub struct CpusConfig { - pub boot_vcpus: u8, - pub max_vcpus: u8, + pub boot_vcpus: u32, + pub max_vcpus: u32, #[serde(default)] pub topology: Option, #[serde(default)] @@ -65,9 +69,11 @@ pub struct CpusConfig { pub affinity: Option>, #[serde(default)] pub features: CpuFeatures, + #[serde(default)] + pub profile: CpuProfile, } -pub const DEFAULT_VCPUS: u8 = 1; +pub const DEFAULT_VCPUS: u32 = 1; impl Default for CpusConfig { fn default() -> Self { @@ -79,6 +85,7 @@ impl Default for CpusConfig { max_phys_bits: DEFAULT_MAX_PHYS_BITS, affinity: None, features: CpuFeatures::default(), + profile: CpuProfile::default(), } } } @@ -156,7 +163,7 @@ pub struct MemoryZoneConfig { impl ApplyLandlock for MemoryZoneConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { if let Some(file) = &self.file { - landlock.add_rule_with_access(file.to_path_buf(), "rw")?; + landlock.add_rule_with_access(file, "rw")?; } Ok(()) } @@ -273,12 +280,14 @@ pub struct DiskConfig { pub serial: Option, #[serde(default)] pub queue_affinity: Option>, + #[serde(default)] + pub bdf_device: Option, } impl ApplyLandlock for DiskConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { if let Some(path) = &self.path { - landlock.add_rule_with_access(path.to_path_buf(), "rw")?; + landlock.add_rule_with_access(path, "rw")?; } Ok(()) } @@ -323,11 +332,10 @@ pub struct NetConfig { pub vhost_mode: VhostMode, #[serde(default)] pub id: Option, - #[serde( - default, - serialize_with = "serialize_netconfig_fds", - deserialize_with = "deserialize_netconfig_fds" - )] + // Special deserialize handling: + // A serialize-deserialize cycle typically happens across processes. + // The old FD is almost certainly invalid in the new process. + #[serde(default, deserialize_with = "deserialize_netconfig_fds")] pub fds: Option>, #[serde(default)] pub rate_limiter_config: Option, @@ -339,6 +347,8 @@ pub struct NetConfig { pub offload_ufo: bool, #[serde(default = "default_netconfig_true")] pub offload_csum: bool, + #[serde(default)] + pub bdf_device: Option, } pub fn default_netconfig_true() -> bool { @@ -350,10 +360,16 @@ pub fn default_netconfig_tap() -> Option { } pub fn default_netconfig_ip() -> IpAddr { + warn!( + "Deprecation warning: No IP address provided. A default IP address is assigned. This behavior will be deprecated soon." + ); IpAddr::V4(Ipv4Addr::new(192, 168, 249, 1)) } pub fn default_netconfig_mask() -> IpAddr { + warn!( + "Deprecation warning: No network mask provided. A default network mask is assigned. This behavior will be deprecated soon." + ); IpAddr::V4(Ipv4Addr::new(255, 255, 255, 0)) } @@ -373,26 +389,15 @@ pub fn default_netconfig_queue_size() -> u16 { DEFAULT_NET_QUEUE_SIZE } -fn serialize_netconfig_fds(x: &Option>, s: S) -> Result -where - S: serde::Serializer, -{ - if let Some(x) = x { - warn!("'NetConfig' contains FDs that can't be serialized correctly. 
Serializing them as invalid FDs."); - let invalid_fds = vec![-1; x.len()]; - s.serialize_some(&invalid_fds) - } else { - s.serialize_none() - } -} - fn deserialize_netconfig_fds<'de, D>(d: D) -> Result>, D::Error> where D: serde::Deserializer<'de>, { let invalid_fds: Option> = Option::deserialize(d)?; if let Some(invalid_fds) = invalid_fds { - warn!("'NetConfig' contains FDs that can't be deserialized correctly. Deserializing them as invalid FDs."); + debug!( + "FDs in 'NetConfig' won't be deserialized as they are most likely invalid now. Deserializing them as -1." + ); Ok(Some(vec![-1; invalid_fds.len()])) } else { Ok(None) @@ -404,6 +409,8 @@ pub struct RngConfig { pub src: PathBuf, #[serde(default)] pub iommu: bool, + #[serde(default)] + pub bdf_device: Option, } pub const DEFAULT_RNG_SOURCE: &str = "/dev/urandom"; @@ -413,6 +420,7 @@ impl Default for RngConfig { RngConfig { src: PathBuf::from(DEFAULT_RNG_SOURCE), iommu: false, + bdf_device: None, } } } @@ -420,7 +428,7 @@ impl Default for RngConfig { impl ApplyLandlock for RngConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { // Rng Path only need read access - landlock.add_rule_with_access(self.src.to_path_buf(), "r")?; + landlock.add_rule_with_access(&self.src, "r")?; Ok(()) } } @@ -434,6 +442,8 @@ pub struct BalloonConfig { /// Option to enable free page reporting from the guest. #[serde(default)] pub free_page_reporting: bool, + #[serde(default)] + pub bdf_device: Option, } #[cfg(feature = "pvmemcontrol")] @@ -452,6 +462,8 @@ pub struct FsConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } pub fn default_fsconfig_num_queues() -> usize { @@ -464,7 +476,7 @@ pub fn default_fsconfig_queue_size() -> u16 { impl ApplyLandlock for FsConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { - landlock.add_rule_with_access(self.socket.to_path_buf(), "rw")?; + landlock.add_rule_with_access(&self.socket, "rw")?; Ok(()) } } @@ -482,12 +494,14 @@ pub struct PmemConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } impl ApplyLandlock for PmemConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { let access = if self.discard_writes { "r" } else { "rw" }; - landlock.add_rule_with_access(self.file.to_path_buf(), access)?; + landlock.add_rule_with_access(&self.file, access)?; Ok(()) } } @@ -499,6 +513,7 @@ pub enum ConsoleOutputMode { Tty, File, Socket, + Tcp, Null, } @@ -510,6 +525,10 @@ pub struct ConsoleConfig { #[serde(default)] pub iommu: bool, pub socket: Option, + pub url: Option, + /// PCI BDF to attach the console in the guest to + #[serde(default)] + pub bdf_device: Option, } pub fn default_consoleconfig_file() -> Option { @@ -519,10 +538,10 @@ pub fn default_consoleconfig_file() -> Option { impl ApplyLandlock for ConsoleConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { if let Some(file) = &self.file { - landlock.add_rule_with_access(file.to_path_buf(), "rw")?; + landlock.add_rule_with_access(file, "rw")?; } if let Some(socket) = &self.socket { - landlock.add_rule_with_access(socket.to_path_buf(), "rw")?; + landlock.add_rule_with_access(socket, "rw")?; } Ok(()) } @@ -552,7 +571,7 @@ impl Default for DebugConsoleConfig { impl ApplyLandlock for DebugConsoleConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { if let Some(file) = &self.file { - 
landlock.add_rule_with_access(file.to_path_buf(), "rw")?; + landlock.add_rule_with_access(file, "rw")?; } Ok(()) } @@ -580,8 +599,9 @@ impl ApplyLandlock for DeviceConfig { .to_str() .ok_or(LandlockError::InvalidPath)?; - let vfio_group_path = "/dev/vfio/".to_owned() + iommu_group_str; - landlock.add_rule_with_access(vfio_group_path.into(), "rw")?; + let mut vfio_group_path = PathBuf::from("/dev/vfio"); + vfio_group_path.push(iommu_group_str); + landlock.add_rule_with_access(&vfio_group_path, "rw")?; Ok(()) } @@ -598,7 +618,7 @@ pub struct UserDeviceConfig { impl ApplyLandlock for UserDeviceConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { - landlock.add_rule_with_access(self.socket.to_path_buf(), "rw")?; + landlock.add_rule_with_access(&self.socket, "rw")?; Ok(()) } } @@ -614,6 +634,8 @@ pub struct VdpaConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } pub fn default_vdpaconfig_num_queues() -> usize { @@ -622,7 +644,7 @@ pub fn default_vdpaconfig_num_queues() -> usize { impl ApplyLandlock for VdpaConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { - landlock.add_rule_with_access(self.path.to_path_buf(), "rw")?; + landlock.add_rule_with_access(&self.path, "rw")?; Ok(()) } } @@ -637,23 +659,40 @@ pub struct VsockConfig { pub id: Option, #[serde(default)] pub pci_segment: u16, + #[serde(default)] + pub bdf_device: Option, } impl ApplyLandlock for VsockConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { - landlock.add_rule_with_access(self.socket.to_path_buf(), "rw")?; + if let Some(parent) = self.socket.parent() { + landlock.add_rule_with_access(parent, "w")?; + } + + landlock.add_rule_with_access(&self.socket, "rw")?; + Ok(()) } } -#[cfg(target_arch = "x86_64")] +#[cfg(feature = "ivshmem")] +pub const DEFAULT_IVSHMEM_SIZE: usize = 128; + +#[cfg(feature = "ivshmem")] #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] -pub struct SgxEpcConfig { - pub id: String, - #[serde(default)] - pub size: u64, - #[serde(default)] - pub prefault: bool, +pub struct IvshmemConfig { + pub path: PathBuf, + pub size: usize, +} + +#[cfg(feature = "ivshmem")] +impl Default for IvshmemConfig { + fn default() -> Self { + Self { + path: PathBuf::new(), + size: DEFAULT_IVSHMEM_SIZE << 20, + } + } } #[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)] @@ -669,18 +708,30 @@ pub struct NumaConfig { #[serde(default)] pub guest_numa_id: u32, #[serde(default)] - pub cpus: Option>, + pub cpus: Option>, #[serde(default)] pub distances: Option>, #[serde(default)] pub memory_zones: Option>, - #[cfg(target_arch = "x86_64")] - #[serde(default)] - pub sgx_epc_sections: Option>, #[serde(default)] pub pci_segments: Option>, } +/// Errors describing a misconfigured payload, i.e., a configuration that +/// can't be booted by Cloud Hypervisor. +/// +/// This typically is the case for invalid combinations of cmdline, kernel, +/// firmware, and initrd. +#[derive(Debug, Error, PartialEq, Eq)] +pub enum PayloadConfigError { + /// Specifying a kernel is not supported when a firmware is provided. + #[error("Specifying a kernel is not supported when a firmware is provided")] + FirmwarePlusOtherPayloads, + /// No bootitem provided: neither firmware nor kernel. 
+ #[error("No bootitem provided: neither firmware nor kernel")] + MissingBootitem, +} + #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct PayloadConfig { #[serde(default)] @@ -697,26 +748,128 @@ pub struct PayloadConfig { #[cfg(feature = "sev_snp")] #[serde(default)] pub host_data: Option, + #[cfg(feature = "fw_cfg")] + pub fw_cfg_config: Option, +} + +#[cfg(feature = "fw_cfg")] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct FwCfgConfig { + pub e820: bool, + pub kernel: bool, + pub cmdline: bool, + pub initramfs: bool, + pub acpi_tables: bool, + pub items: Option, +} + +#[cfg(feature = "fw_cfg")] +impl Default for FwCfgConfig { + fn default() -> Self { + FwCfgConfig { + e820: true, + kernel: true, + cmdline: true, + initramfs: true, + acpi_tables: true, + items: None, + } + } +} + +#[cfg(feature = "fw_cfg")] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct FwCfgItemList { + #[serde(default)] + pub item_list: Vec, +} + +#[cfg(feature = "fw_cfg")] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct FwCfgItem { + #[serde(default)] + pub name: String, + #[serde(default)] + pub file: PathBuf, +} + +#[cfg(feature = "fw_cfg")] +pub enum FwCfgItemError { + InvalidValue(String), +} + +#[cfg(feature = "fw_cfg")] +impl FromStr for FwCfgItemList { + type Err = FwCfgItemError; + + fn from_str(s: &str) -> Result { + let body = s + .trim() + .strip_prefix('[') + .and_then(|s| s.strip_suffix(']')) + .ok_or_else(|| FwCfgItemError::InvalidValue(s.to_string()))?; + + let mut fw_cfg_items: Vec = vec![]; + let items: Vec<&str> = body.split(':').collect(); + for item in items { + fw_cfg_items.push( + FwCfgItem::parse(item) + .map_err(|_| FwCfgItemError::InvalidValue(item.to_string()))?, + ); + } + Ok(FwCfgItemList { + item_list: fw_cfg_items, + }) + } +} + +impl PayloadConfig { + /// Validates the payload config. + /// + /// Succeeds if Cloud Hypervisor will be able to boot the configuration. + /// Further, warns for some odd configurations. 
+ pub fn validate(&mut self) -> Result<(), PayloadConfigError> { + match (&self.firmware, &self.kernel) { + (Some(_firmware), Some(_kernel)) => Err(PayloadConfigError::FirmwarePlusOtherPayloads), + (Some(_firmware), None) => { + if self.cmdline.is_some() { + log::warn!("Ignoring cmdline parameter as firmware is provided as the payload"); + self.cmdline = None; + } + if self.initramfs.is_some() { + log::warn!( + "Ignoring initramfs parameter as firmware is provided as the payload" + ); + self.initramfs = None; + } + Ok(()) + } + (None, Some(_kernel)) => Ok(()), + (None, None) => Err(PayloadConfigError::MissingBootitem), + }?; + + Ok(()) + } } impl ApplyLandlock for PayloadConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { // Payload only needs read access if let Some(firmware) = &self.firmware { - landlock.add_rule_with_access(firmware.to_path_buf(), "r")?; + landlock.add_rule_with_access(firmware, "r")?; } if let Some(kernel) = &self.kernel { - landlock.add_rule_with_access(kernel.to_path_buf(), "r")?; + landlock.add_rule_with_access(kernel, "r")?; } if let Some(initramfs) = &self.initramfs { - landlock.add_rule_with_access(initramfs.to_path_buf(), "r")?; + landlock.add_rule_with_access(initramfs, "r")?; } #[cfg(feature = "igvm")] if let Some(igvm) = &self.igvm { - landlock.add_rule_with_access(igvm.to_path_buf(), "r")?; + landlock.add_rule_with_access(igvm, "r")?; } Ok(()) @@ -729,6 +882,8 @@ pub fn default_serial() -> ConsoleConfig { mode: ConsoleOutputMode::Null, iommu: false, socket: None, + url: None, + bdf_device: None, } } @@ -738,6 +893,8 @@ pub fn default_console() -> ConsoleConfig { mode: ConsoleOutputMode::Tty, iommu: false, socket: None, + url: None, + bdf_device: None, } } @@ -748,7 +905,7 @@ pub struct TpmConfig { impl ApplyLandlock for TpmConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { - landlock.add_rule_with_access(self.socket.to_path_buf(), "rw")?; + landlock.add_rule_with_access(&self.socket, "rw")?; Ok(()) } } @@ -761,7 +918,7 @@ pub struct LandlockConfig { impl ApplyLandlock for LandlockConfig { fn apply_landlock(&self, landlock: &mut Landlock) -> LandlockResult<()> { - landlock.add_rule_with_access(self.path.to_path_buf(), self.access.clone().as_str())?; + landlock.add_rule_with_access(&self.path, self.access.clone().as_str())?; Ok(()) } } @@ -799,8 +956,6 @@ pub struct VmConfig { pub pvpanic: bool, #[serde(default)] pub iommu: bool, - #[cfg(target_arch = "x86_64")] - pub sgx_epc: Option>, pub numa: Option>, #[serde(default)] pub watchdog: bool, @@ -819,12 +974,19 @@ pub struct VmConfig { #[serde(default)] pub landlock_enable: bool, pub landlock_rules: Option>, + #[cfg(feature = "ivshmem")] + pub ivshmem: Option, } impl VmConfig { pub(crate) fn apply_landlock(&self) -> LandlockResult<()> { let mut landlock = Landlock::new()?; + #[cfg(target_arch = "aarch64")] + { + landlock.add_rule_with_access(Path::new("/sys/devices/system/cpu/cpu0/cache"), "r")?; + } + if let Some(mem_zones) = &self.memory.zones { for zone in mem_zones.iter() { zone.apply_landlock(&mut landlock)?; @@ -861,7 +1023,7 @@ impl VmConfig { } if let Some(devices) = &self.devices { - landlock.add_rule_with_access("/dev/vfio/vfio".into(), "rw")?; + landlock.add_rule_with_access(Path::new("/dev/vfio/vfio"), "rw")?; for device in devices.iter() { device.apply_landlock(&mut landlock)?; @@ -893,7 +1055,7 @@ impl VmConfig { } if self.net.is_some() { - landlock.add_rule_with_access("/dev/net/tun".into(), "rw")?; + 
landlock.add_rule_with_access(Path::new("/dev/net/tun"), "rw")?; } if let Some(landlock_rules) = &self.landlock_rules { @@ -906,4 +1068,18 @@ impl VmConfig { Ok(()) } + + #[cfg(all(feature = "kvm", target_arch = "x86_64"))] + pub(crate) fn max_apic_id(&self) -> u32 { + if let Some(topology) = &self.cpus.topology { + arch::x86_64::get_max_x2apic_id(( + topology.threads_per_core, + topology.cores_per_die, + topology.dies_per_package, + topology.packages, + )) + } else { + self.cpus.max_vcpus + } + } }
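+
+// Note on `max_apic_id()`: without an explicit `CpuTopology` it falls back to
+// `max_vcpus`, so `Vm::new()` enables the KVM x2APIC API whenever the
+// configured maximum vCPU count already exceeds `MAX_SUPPORTED_CPUS_LEGACY`.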