From 6eb6fb63054adf9497b27895909307d6711e43be Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Dec 2025 11:47:58 +0000
Subject: [PATCH 1/3] Remove isEmulatingExtFrameBoundary

---
 layer_gpu_timeline/source/device.hpp          |  8 -----
 .../source/layer_device_functions_queue.cpp   | 18 ++++------
 .../source/layer_instance_functions.cpp       | 33 -------------------
 .../source/layer_instance_functions.hpp       |  8 -----
 4 files changed, 6 insertions(+), 61 deletions(-)

diff --git a/layer_gpu_timeline/source/device.hpp b/layer_gpu_timeline/source/device.hpp
index e3bb6b8..937b007 100644
--- a/layer_gpu_timeline/source/device.hpp
+++ b/layer_gpu_timeline/source/device.hpp
@@ -185,14 +185,6 @@ class Device
      */
     static const std::vector<DeviceCreatePatchPtr> createInfoPatches;
 
-    /**
-     * @brief Is this layer emulating VK_EXT_frame_boundary?
-     *
-     * Set to @c true if layer is emulating on top of a driver that doesn't
-     * support it, @c false if layer knows driver supports it.
-     */
-    bool isEmulatingExtFrameBoundary { false };
-
 private:
     /**
      * @brief State tracker for this device.
diff --git a/layer_gpu_timeline/source/layer_device_functions_queue.cpp b/layer_gpu_timeline/source/layer_device_functions_queue.cpp
index a4ba1a0..4159ae8 100644
--- a/layer_gpu_timeline/source/layer_device_functions_queue.cpp
+++ b/layer_gpu_timeline/source/layer_device_functions_queue.cpp
@@ -107,7 +107,7 @@ static void emitCommandBufferMetadata(Device& layer,
  * @param workloadVisitor   Visitor for the protobuf encoder.
  */
 static void checkManualFrameBoundary(
-    Device* layer,
+    Device& layer,
     VkQueue queue,
     const void* pNext,
     bool isLastSubmit,
@@ -118,10 +118,10 @@ static void checkManualFrameBoundary(
     if (ext && (ext->flags & VK_FRAME_BOUNDARY_FRAME_END_BIT_EXT))
     {
         // Emulate a queue present to indicate end of frame
-        auto& tracker = layer->getStateTracker();
+        auto& tracker = layer.getStateTracker();
         tracker.queuePresent();
 
-        TimelineProtobufEncoder::emitFrame(*layer, tracker.totalStats.getFrameCount(), getClockMonotonicRaw());
+        TimelineProtobufEncoder::emitFrame(layer, tracker.totalStats.getFrameCount(), getClockMonotonicRaw());
 
         // Emulate a new queue submit if work remains to submit
         if (!isLastSubmit)
@@ -148,12 +148,6 @@ VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueuePresentKHR<user_tag>(VkQueue queue,
     vku::safe_VkPresentInfoKHR safePresentInfo(pPresentInfo);
     auto* newPresentInfo = reinterpret_cast<VkPresentInfoKHR*>(&safePresentInfo);
 
-    // Remove emulated frame boundaries
-    if (layer->isEmulatingExtFrameBoundary)
-    {
-        vku::RemoveFromPnext(safePresentInfo, VK_STRUCTURE_TYPE_FRAME_BOUNDARY_EXT);
-    }
-
     // Note that we assume QueuePresent is _always_ the end of a frame.
     // This is run with the lock held to ensure that all queue submit messages
     // are sent sequentially to the host tool
@@ -194,7 +188,7 @@ VKAPI_ATTR VkResult VKAPI_CALL
 
         // Check for end of frame boundary
         bool isLast = i == submitCount - 1;
-        checkManualFrameBoundary(layer, queue, submit.pNext, isLast, workloadVisitor);
+        checkManualFrameBoundary(*layer, queue, submit.pNext, isLast, workloadVisitor);
     }
 
     // Release the lock to call into the driver
@@ -232,7 +226,7 @@ VKAPI_ATTR VkResult VKAPI_CALL
 
         // Check for end of frame boundary
         bool isLast = i == submitCount - 1;
-        checkManualFrameBoundary(layer, queue, submit.pNext, isLast, workloadVisitor);
+        checkManualFrameBoundary(*layer, queue, submit.pNext, isLast, workloadVisitor);
     }
 
     // Release the lock to call into the driver
@@ -270,7 +264,7 @@ VKAPI_ATTR VkResult VKAPI_CALL
 
         // Check for end of frame boundary
         bool isLast = i == submitCount - 1;
-        checkManualFrameBoundary(layer, queue, submit.pNext, isLast, workloadVisitor);
+        checkManualFrameBoundary(*layer, queue, submit.pNext, isLast, workloadVisitor);
     }
 
     // Release the lock to call into the driver
diff --git a/layer_gpu_timeline/source/layer_instance_functions.cpp b/layer_gpu_timeline/source/layer_instance_functions.cpp
index 0d36dfd..f152da2 100644
--- a/layer_gpu_timeline/source/layer_instance_functions.cpp
+++ b/layer_gpu_timeline/source/layer_instance_functions.cpp
@@ -31,39 +31,6 @@
 
 extern std::mutex g_vulkanLock;
 
-/* See header for documentation. */
-template <>
-VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateDevice<user_tag>(
-    VkPhysicalDevice physicalDevice,
-    const VkDeviceCreateInfo* pCreateInfo,
-    const VkAllocationCallbacks* pAllocator,
-    VkDevice* pDevice
-) {
-    LAYER_TRACE(__func__);
-
-    // Use the default function for the heavy-lifting
-    auto res = layer_vkCreateDevice<default_tag>(physicalDevice, pCreateInfo, pAllocator, pDevice);
-    if (res != VK_SUCCESS)
-    {
-        return res;
-    }
-
-    // Cache flags indicating extension emulation
-    std::unique_lock<std::mutex> lock {g_vulkanLock};
-    auto* layer = Device::retrieve(*pDevice);
-
-    static const std::string target { VK_EXT_FRAME_BOUNDARY_EXTENSION_NAME };
-    for (auto& ext : layer->instance->injectedDeviceExtensions)
-    {
-        if (ext.first == target)
-        {
-            layer->isEmulatingExtFrameBoundary = true;
-        }
-    }
-
-    return res;
-}
-
 /* See Vulkan API for documentation. */
 template <>
 VKAPI_ATTR void VKAPI_CALL layer_vkGetPhysicalDeviceFeatures2<user_tag>(
diff --git a/layer_gpu_timeline/source/layer_instance_functions.hpp b/layer_gpu_timeline/source/layer_instance_functions.hpp
index c2e6f1d..3bab9ec 100644
--- a/layer_gpu_timeline/source/layer_instance_functions.hpp
+++ b/layer_gpu_timeline/source/layer_instance_functions.hpp
@@ -29,14 +29,6 @@
 
 // Functions for devices
 
-/* See Vulkan API for documentation. */
-template <>
-VKAPI_ATTR VkResult VKAPI_CALL layer_vkCreateDevice<user_tag>(
-    VkPhysicalDevice physicalDevice,
-    const VkDeviceCreateInfo* pCreateInfo,
-    const VkAllocationCallbacks* pAllocator,
-    VkDevice* pDevice);
-
 /* See Vulkan API for documentation. */
 template <>
 VKAPI_ATTR void VKAPI_CALL layer_vkGetPhysicalDeviceFeatures2<user_tag>(

From f12377969b3adc2970ceb0fdeeb9e0a6b9534e00 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Dec 2025 11:49:15 +0000
Subject: [PATCH 2/3] Profile layer: Add per frame support

---
 layer_gpu_profile/README_LAYER.md             |  33 +-
 layer_gpu_profile/android_build.sh            |   2 +-
 layer_gpu_profile/layer_config.json           |   3 +-
 layer_gpu_profile/source/CMakeLists.txt       |   1 +
 layer_gpu_profile/source/device_utils.hpp     |   3 +-
 layer_gpu_profile/source/instance.cpp         |   4 +-
 layer_gpu_profile/source/layer_config.cpp     |  77 +++--
 layer_gpu_profile/source/layer_config.hpp     |  53 +++-
 .../source/layer_device_functions.hpp         |   7 +
 .../source/layer_device_functions_queue.cpp   | 285 +++++++++++++++---
 .../source/layer_instance_functions.cpp       |  80 +++++
 .../source/layer_instance_functions.hpp       |  42 +++
 lglpy/comms/service_gpu_profile.py            | 122 ++++++--
 13 files changed, 616 insertions(+), 96 deletions(-)
 create mode 100644 layer_gpu_profile/source/layer_instance_functions.cpp
 create mode 100644 layer_gpu_profile/source/layer_instance_functions.hpp

diff --git a/layer_gpu_profile/README_LAYER.md b/layer_gpu_profile/README_LAYER.md
index 760fe3b..91f1499 100644
--- a/layer_gpu_profile/README_LAYER.md
+++ b/layer_gpu_profile/README_LAYER.md
@@ -113,19 +113,32 @@ application under test and the capture process. For full instructions see the
 
 ## Layer configuration
 
-The current layer supports two `sampling_mode` values:
+### Setting frame selection mode
 
-* `periodic_frame`: Sample every N frames.
-* `frame_list`: Sample specific frames.
+The current layer supports the following ways to select frames to profile using
+the `frame_mode` config option:
 
-When `mode` is `periodic_frame` the integer value of the `periodic_frame` key
-defines the frame sampling period. The integer value of the
-`periodic_min_frame` key defines the first possible frame that could be
-profiled, allowing profiles to skip over any loading frames. By default frame 0
-is ignored.
+* `disabled`: Frame selection is disabled, so no frames are profiled.
+* `periodic`: Sample every N frames.
+* `list`: Sample specific frames.
 
-When `mode` is `frame_list` the value of the `frame_list` key defines a list
-of integers giving the specific frames to capture.
+When frame selection mode is `periodic` the integer value of the
+`periodic_frame` key defines the frame sampling period. The integer value of
+the `periodic_min_frame` key defines the first possible frame that could be
+profiled, allowing profiles to skip over any loading frames. By default frame
+0 is ignored.
+
+When frame selection mode is `list` the value of the `frame_list` key defines
+a list of integers giving the specific frames to capture.
+
+### Setting counter sampling mode
+
+The current layer supports the following ways to sample counters within each
+frame of interest, selected using the `sample_mode` config option:
+
+* `disabled`: Sampling is disabled.
+* `workload`: Sample every workload in each frame of interest.
+* `frame`: Sample at the end of each frame of interest.
 
 ## Layer counters
 
diff --git a/layer_gpu_profile/android_build.sh b/layer_gpu_profile/android_build.sh
index 51489da..64e47ac 100755
--- a/layer_gpu_profile/android_build.sh
+++ b/layer_gpu_profile/android_build.sh
@@ -68,7 +68,7 @@ cmake \
     -DCMAKE_WARN_DEPRECATED=OFF \
     ..
 
-cmake --build . -j4
+cmake --build .
 
 popd
 
diff --git a/layer_gpu_profile/layer_config.json b/layer_gpu_profile/layer_config.json
index c24a31a..6fa5d47 100644
--- a/layer_gpu_profile/layer_config.json
+++ b/layer_gpu_profile/layer_config.json
@@ -1,6 +1,7 @@
 {
     "layer": "VK_LAYER_LGL_gpu_profile",
-    "sample_mode": "periodic_frame",
+    "frame_mode": "periodic",
+    "sample_mode": "frame",
     "periodic_min_frame": 1,
     "periodic_frame": 600,
     "frame_list": []
diff --git a/layer_gpu_profile/source/CMakeLists.txt b/layer_gpu_profile/source/CMakeLists.txt
index 0b1bea8..51810ed 100644
--- a/layer_gpu_profile/source/CMakeLists.txt
+++ b/layer_gpu_profile/source/CMakeLists.txt
@@ -55,6 +55,7 @@ add_library(
         layer_device_functions_render_pass.cpp
         layer_device_functions_trace_rays.cpp
         layer_device_functions_transfer.cpp
+        layer_instance_functions.cpp
         submit_visitor.cpp)
 
 target_include_directories(
diff --git a/layer_gpu_profile/source/device_utils.hpp b/layer_gpu_profile/source/device_utils.hpp
index f0b3415..530cf50 100644
--- a/layer_gpu_profile/source/device_utils.hpp
+++ b/layer_gpu_profile/source/device_utils.hpp
@@ -58,7 +58,8 @@
     VkCommandBuffer commandBuffer
 ) {
     // Don't instrument outside of active frame of interest
-    if(!layer.isFrameOfInterest)
+    bool isEnabled = layer.instance->config.isSamplingWorkloads();
+    if(!layer.isFrameOfInterest || !isEnabled)
     {
         return;
     }
diff --git a/layer_gpu_profile/source/instance.cpp b/layer_gpu_profile/source/instance.cpp
index df4f55a..0aaf9b1 100644
--- a/layer_gpu_profile/source/instance.cpp
+++ b/layer_gpu_profile/source/instance.cpp
@@ -46,7 +46,9 @@ const std::vector<std::string> Instance::requiredDriverExtensions {
 const std::vector<std::pair<std::string, uint32_t>> Instance::injectedInstanceExtensions {};
 
 /* See header for documentation. */
-std::vector<std::pair<std::string, uint32_t>> Instance::injectedDeviceExtensions {};
+std::vector<std::pair<std::string, uint32_t>> Instance::injectedDeviceExtensions {
+    {VK_EXT_FRAME_BOUNDARY_EXTENSION_NAME, VK_EXT_FRAME_BOUNDARY_SPEC_VERSION}
+};
 
 /* See header for documentation. */
 void Instance::store(VkInstance handle, std::unique_ptr<Instance>& instance)
diff --git a/layer_gpu_profile/source/layer_config.cpp b/layer_gpu_profile/source/layer_config.cpp
index 1154401..ee4b365 100644
--- a/layer_gpu_profile/source/layer_config.cpp
+++ b/layer_gpu_profile/source/layer_config.cpp
@@ -43,45 +43,70 @@
 /* See header for documentation. */
 void LayerConfig::parseSamplingOptions(const json& config)
 {
-    // Decode top level options
-    std::string rawMode = config.at("sample_mode");
+    // Decode frame selection mode
+    std::string rawFrameMode = config.at("frame_mode");
 
-    if (rawMode == "disabled")
+    if (rawFrameMode == "disabled")
     {
-        mode = MODE_DISABLED;
+        frameMode = FRAME_SELECTION_DISABLED;
     }
-    else if (rawMode == "periodic_frame")
+    else if (rawFrameMode == "periodic")
     {
-        mode = MODE_PERIODIC_FRAME;
+        frameMode = FRAME_SELECTION_PERIODIC;
         periodicFrame = config.at("periodic_frame");
         periodicMinFrame = config.at("periodic_min_frame");
     }
-    else if (rawMode == "frame_list")
+    else if (rawFrameMode == "list")
     {
-        mode = MODE_FRAME_LIST;
+        frameMode = FRAME_SELECTION_LIST;
         specificFrames = config.at("frame_list").get<std::vector<uint64_t>>();
     }
     else
     {
-        LAYER_ERR("Unknown counter sample_mode: %s", rawMode.c_str());
-        rawMode = "disabled";
+        LAYER_ERR("Unknown frame_mode: %s", rawFrameMode.c_str());
+        frameMode = FRAME_SELECTION_DISABLED;
+        rawFrameMode = "disabled";
+    }
+
+    // Decode counter sampling mode
+    std::string rawSampleMode = config.at("sample_mode");
+
+    if (rawSampleMode == "disabled")
+    {
+        samplingMode = COUNTER_SAMPLING_DISABLED;
+    }
+    else if (rawSampleMode == "frame")
+    {
+        samplingMode = COUNTER_SAMPLING_FRAMES;
+    }
+    else if (rawSampleMode == "workload")
+    {
+        samplingMode = COUNTER_SAMPLING_WORKLOADS;
+    }
+    else
+    {
+        LAYER_ERR("Unknown sample_mode: %s", rawSampleMode.c_str());
+        samplingMode = COUNTER_SAMPLING_DISABLED;
+        rawSampleMode = "disabled";
     }
 
     LAYER_LOG("Layer sampling configuration");
     LAYER_LOG("============================");
-    LAYER_LOG(" - Sample mode: %s", rawMode.c_str());
+    LAYER_LOG(" - Frame selection mode: %s", rawFrameMode.c_str());
 
-    if (mode == MODE_PERIODIC_FRAME)
+    if (frameMode == FRAME_SELECTION_PERIODIC)
     {
         LAYER_LOG(" - Frame period: %" PRIu64, periodicFrame);
         LAYER_LOG(" - Minimum frame: %" PRIu64, periodicMinFrame);
     }
-    else if (mode == MODE_FRAME_LIST)
+    else if (frameMode == FRAME_SELECTION_LIST)
     {
         std::stringstream result;
         std::copy(specificFrames.begin(), specificFrames.end(), std::ostream_iterator<uint64_t>(result, " "));
         LAYER_LOG(" - Frames: %s", result.str().c_str());
     }
+
+    LAYER_LOG(" - Counter sampling mode: %s", rawSampleMode.c_str());
 }
 
 /* See header for documentation. */
@@ -131,14 +156,14 @@ LayerConfig::LayerConfig()
 bool LayerConfig::isFrameOfInterest(
     uint64_t frameID
 ) const {
-    switch(mode)
+    switch(frameMode)
     {
-    case MODE_DISABLED:
+    case FRAME_SELECTION_DISABLED:
         return false;
-    case MODE_PERIODIC_FRAME:
+    case FRAME_SELECTION_PERIODIC:
         return (frameID >= periodicMinFrame) &&
                ((frameID % periodicFrame) == 0);
-    case MODE_FRAME_LIST:
+    case FRAME_SELECTION_LIST:
         return isIn(frameID, specificFrames);
     }
 
@@ -146,3 +171,21 @@ bool LayerConfig::isFrameOfInterest(
     return false;
 }
 
+/* See header for documentation. */
+bool LayerConfig::isSamplingWorkloads() const {
+    return frameMode != FRAME_SELECTION_DISABLED &&
+           samplingMode == COUNTER_SAMPLING_WORKLOADS;
+}
+
+/* See header for documentation. */
+bool LayerConfig::isSamplingFrames() const {
+    return frameMode != FRAME_SELECTION_DISABLED &&
+           samplingMode == COUNTER_SAMPLING_FRAMES;
+}
+
+/* See header for documentation. */
+bool LayerConfig::isSamplingAny() const {
+    return frameMode != FRAME_SELECTION_DISABLED &&
+           samplingMode != COUNTER_SAMPLING_DISABLED;
+}
+
diff --git a/layer_gpu_profile/source/layer_config.hpp b/layer_gpu_profile/source/layer_config.hpp
index 2c54cb3..17d2492 100644
--- a/layer_gpu_profile/source/layer_config.hpp
+++ b/layer_gpu_profile/source/layer_config.hpp
@@ -54,19 +54,50 @@ class LayerConfig
      *
      * @param frameID   The index of the next frame.
      *
-     * @return True if profiling should be enabled, False otherwise.
+     * @return @c true if profiling should be enabled, @c false otherwise.
      */
     bool isFrameOfInterest(uint64_t frameID) const;
 
+    /**
+     * @brief Test if we are sampling workloads.
+     *
+     * @return @c true if profiling workloads, @c false otherwise.
+     */
+    bool isSamplingWorkloads() const;
+
+    /**
+     * @brief Test if we are sampling frames.
+     *
+     * @return @c true if profiling frames, @c false otherwise.
+     */
+    bool isSamplingFrames() const;
+
+    /**
+     * @brief Test if any kind of sampling is active.
+     *
+     * @return @c true if profiling, @c false otherwise.
+     */
+    bool isSamplingAny() const;
+
 private:
     /**
-     * @brief Supported sampling modes.
+     * @brief Supported frame selection modes.
+     */
+    enum FrameSelectionMode
+    {
+        FRAME_SELECTION_DISABLED,
+        FRAME_SELECTION_LIST,
+        FRAME_SELECTION_PERIODIC
+    };
+
+    /**
+     * @brief Supported counter sampling modes.
      */
-    enum SamplingMode
+    enum CounterSamplingMode
     {
-        MODE_DISABLED,
-        MODE_FRAME_LIST,
-        MODE_PERIODIC_FRAME
+        COUNTER_SAMPLING_DISABLED,
+        COUNTER_SAMPLING_WORKLOADS,
+        COUNTER_SAMPLING_FRAMES
     };
 
     /**
@@ -79,9 +110,15 @@ class LayerConfig
     void parseSamplingOptions(const json& config);
 
     /**
-     * @brief The sampling mode.
+     * @brief The frame selection mode.
      */
-    SamplingMode mode {MODE_DISABLED};
+    FrameSelectionMode frameMode {FRAME_SELECTION_DISABLED};
+
+    /**
+     * @brief The counter sampling mode.
+     */
+    CounterSamplingMode samplingMode {COUNTER_SAMPLING_DISABLED};
+
 
     /**
      * @brief The sampling period in frames, or 0 if disabled.
diff --git a/layer_gpu_profile/source/layer_device_functions.hpp b/layer_gpu_profile/source/layer_device_functions.hpp
index e0e6b10..d0eb6a3 100644
--- a/layer_gpu_profile/source/layer_device_functions.hpp
+++ b/layer_gpu_profile/source/layer_device_functions.hpp
@@ -419,3 +419,10 @@ VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueueSubmit2KHR<user_tag>(VkQueue queue,
                                                                  uint32_t submitCount,
                                                                  const VkSubmitInfo2* pSubmits,
                                                                  VkFence fence);
+
+/* See Vulkan API for documentation. */
+template <>
+VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueueBindSparse<user_tag>(VkQueue queue,
+                                                                 uint32_t bindInfoCount,
+                                                                 const VkBindSparseInfo* pBindInfo,
+                                                                 VkFence fence);
diff --git a/layer_gpu_profile/source/layer_device_functions_queue.cpp b/layer_gpu_profile/source/layer_device_functions_queue.cpp
index 57c722d..fd7ba69 100644
--- a/layer_gpu_profile/source/layer_device_functions_queue.cpp
+++ b/layer_gpu_profile/source/layer_device_functions_queue.cpp
@@ -31,9 +31,9 @@
 #include "trackers/queue.hpp"
 
 #include <nlohmann/json.hpp>
-
 #include <mutex>
 #include <time.h>
+#include <vulkan/utility/vk_struct_helper.hpp>
 
 using json = nlohmann::json;
 
@@ -50,6 +50,12 @@ static void processLayerCommandStream(Device& layer,
                                       VkQueue queue,
                                       VkCommandBuffer commandBuffer)
 {
+    // Skip doing this if we are not doing per-workload profiling
+    if (!layer.instance->config.isSamplingWorkloads())
+    {
+        return;
+    }
+
     // Fetch layer proxies for this workload
     auto& tracker = layer.getStateTracker();
     auto& trackQueue = tracker.getQueue(queue);
@@ -62,66 +68,174 @@ static void processLayerCommandStream(Device& layer,
     trackQueue.runSubmitCommandStream(cbLCS, workloadVisitor);
 }
 
-/* See Vulkan API for documentation. */
-template<>
-VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueuePresentKHR<user_tag>(VkQueue queue, const VkPresentInfoKHR* pPresentInfo)
-{
-    LAYER_TRACE(__func__);
-
-    // Hold the lock to access layer-wide global store
-    std::unique_lock<std::mutex> lock {g_vulkanLock};
-    auto* layer = Device::retrieve(queue);
-
-    auto& tracker = layer->getStateTracker();
+/**
+ * @brief Advance frame tracking and emit frame markers ahead of the driver call.
+ *
+ * @param layer             The layer context.
+ */
+static void processFrameBoundaryPreSubmit(
+    Device& layer
+) {
+    const auto& config = layer.instance->config;
+    auto& tracker = layer.getStateTracker();
     tracker.queuePresent();
 
     // End the previous frame if it was "of interest"
-    if (layer->isFrameOfInterest)
+    if (layer.isFrameOfInterest && config.isSamplingWorkloads())
     {
         json endFrameMessage {
             { "type", "end_frame" }
         };
 
-        layer->txMessage(endFrameMessage.dump());
+        layer.txMessage(endFrameMessage.dump());
     }
 
     uint64_t frameID = tracker.totalStats.getFrameCount();
-    layer->isFrameOfInterest = layer->instance->config.isFrameOfInterest(frameID);
+    layer.isFrameOfInterest = layer.instance->config.isFrameOfInterest(frameID);
 
     // Start the next frame if it is "of interest"
-    if (layer->isFrameOfInterest)
+    if (layer.isFrameOfInterest && config.isSamplingWorkloads())
     {
         json startFrameMessage {
             { "type", "start_frame" },
             { "frame", frameID },
         };
 
-        layer->txMessage(startFrameMessage.dump());
+        layer.txMessage(startFrameMessage.dump());
     }
+}
 
-    // If a "normal" frame then release the lock before calling in to the
-    // driver, otherwise keep the lock to stop other threads using Vulkan
-    // while we sync and reset the counter stream
-    if (!layer->isFrameOfInterest)
+/**
+ * @brief Check a pNext chain for a manual frame boundary marker.
+ *
+ * Emits the necessary metadata to emulate a vkQueuePresent.
+ *
+ * @param layer   The layer context.
+ * @param pNext   The submit pNext pointer.
+ *
+ * @returns @c true if end of frame detected, @c false otherwise.
+ */
+static bool processManualFrameBoundaryPreSubmit(
+    Device& layer,
+    const void* pNext
+) {
+    // Check for end of frame boundary
+    auto* ext = vku::FindStructInPNextChain<VkFrameBoundaryEXT>(pNext);
+    if (ext && (ext->flags & VK_FRAME_BOUNDARY_FRAME_END_BIT_EXT))
     {
-        lock.unlock();
+        processFrameBoundaryPreSubmit(layer);
+        return true;
     }
 
-    auto ret = layer->driver.vkQueuePresentKHR(queue, pPresentInfo);
+    return false;
+}
 
+/**
+ * @brief Wait for device idle and sample counters after a frame boundary submit.
+ *
+ * @param layer         The layer context.
+ * @param frameSample   @c true to emit frame sample, @c false just to reset.
+ */
+static void processFrameBoundaryPostSubmit(
+    Device& layer,
+    bool frameSample
+) {
     // If we are measuring performance ensure the previous frame has finished
     // and then take an initial sample to reset the counters
-    if (layer->isFrameOfInterest)
+    layer.driver.vkDeviceWaitIdle(layer.device);
+    workaroundDelay();
+    auto ec = layer.lgcSampler->sample_now();
+    if (ec)
     {
-        layer->driver.vkDeviceWaitIdle(layer->device);
-        workaroundDelay();
-        auto ec = layer->lgcSampler->sample_now();
+        LAYER_ERR("Failed to make libGPUCounters GPU counter sample");
+    }
+
+    // No sample data needed - just use it to reset the counters
+    if (!frameSample)
+    {
+        return;
+    }
+
+    // Otherwise emit a frame sample data packet
+    auto& tracker = layer.getStateTracker();
+
+    // Frame count has already been incremented for next frame so decrement
+    // for reporting purposes
+    uint64_t frameID = tracker.totalStats.getFrameCount() - 1;
+
+    json message {
+        { "type", "frame" },
+        { "frame", frameID },
+        { "counters", json::array() }
+    };
+
+    for (const auto& pair : layer.lgcActiveCounters)
+    {
+        hwcpipe::counter_sample sample;
+        ec = layer.lgcSampler->get_counter_value(pair.first, sample);
         if (ec)
         {
-            LAYER_ERR("Failed to make libGPUCounters GPU counter sample");
+            LAYER_ERR("Failed to get libGPUCounters GPU counter value");
+            continue;
+        }
+
+        if (sample.type == hwcpipe::counter_sample::type::uint64)
+        {
+            json counter {
+                { pair.second, sample.value.uint64 },
+            };
+
+            message["counters"].push_back(counter);
+
+        }
+        else
+        {
+            json counter {
+                { pair.second, sample.value.float64 },
+            };
+
+            message["counters"].push_back(counter);
+
         }
     }
 
+    layer.txMessage(message.dump());
+}
+
+/* See Vulkan API for documentation. */
+template<>
+VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueuePresentKHR<user_tag>(VkQueue queue, const VkPresentInfoKHR* pPresentInfo)
+{
+    LAYER_TRACE(__func__);
+
+    // Hold the lock to access layer-wide global store
+    std::unique_lock<std::mutex> lock {g_vulkanLock};
+    auto* layer = Device::retrieve(queue);
+    const auto& config = layer->instance->config;
+
+    bool sampleLastFrame = layer->isFrameOfInterest && config.isSamplingFrames();
+
+    // Process frame boundary pre-flight handling for metadata
+    processFrameBoundaryPreSubmit(*layer);
+
+    bool resetThisFrame = layer->isFrameOfInterest && config.isSamplingAny();
+
+    // If a "normal" frame then release the lock before calling in to the
+    // driver, otherwise keep the lock to stop other threads using Vulkan
+    // while we sync and reset the counter stream
+    if (!sampleLastFrame && !resetThisFrame)
+    {
+        lock.unlock();
+    }
+
+    auto ret = layer->driver.vkQueuePresentKHR(queue, pPresentInfo);
+
+    // Process frame boundary post-flight isolation of frames
+    if (sampleLastFrame || resetThisFrame)
+    {
+        processFrameBoundaryPostSubmit(*layer, sampleLastFrame);
+    }
+
     return ret;
 }
 
@@ -135,11 +249,21 @@ VKAPI_ATTR VkResult VKAPI_CALL
     // Hold the lock to access layer-wide global store
     std::unique_lock<std::mutex> lock {g_vulkanLock};
     auto* layer = Device::retrieve(queue);
+    const auto& config = layer->instance->config;
+
+    bool sampleLastFrame = layer->isFrameOfInterest && config.isSamplingFrames();
+
+    // TODO: Only the first submit is checked for a frame boundary and submits
+    // are never split; submitCount may legally be zero so guard the access
+    bool isFrameEnd = (submitCount > 0) && processManualFrameBoundaryPreSubmit(*layer, pSubmits->pNext);
+
+    sampleLastFrame = isFrameEnd && sampleLastFrame;
+    bool resetThisFrame = isFrameEnd && layer->isFrameOfInterest && config.isSamplingAny();
 
     // If a "normal" frame then release the lock before calling in to the
     // driver, otherwise keep the lock to stop other threads using Vulkan
     // while we sync and reset the counter stream
-    if (!layer->isFrameOfInterest)
+    if (!sampleLastFrame && !resetThisFrame && !(layer->isFrameOfInterest && config.isSamplingWorkloads()))
     {
         lock.unlock();
     }
@@ -152,7 +276,7 @@ VKAPI_ATTR VkResult VKAPI_CALL
 
     // If we are measuring performance then run the layer command stream with
     // the lock held to stop other submits perturbing the counter data
-    if (layer->isFrameOfInterest)
+    if (layer->isFrameOfInterest && config.isSamplingWorkloads())
     {
         for (uint32_t i = 0; i < submitCount; i++)
         {
@@ -165,6 +289,11 @@ VKAPI_ATTR VkResult VKAPI_CALL
         }
     }
 
+    if (sampleLastFrame || resetThisFrame)
+    {
+        processFrameBoundaryPostSubmit(*layer, sampleLastFrame);
+    }
+
     return res;
 }
 
@@ -178,11 +307,21 @@ VKAPI_ATTR VkResult VKAPI_CALL
     // Hold the lock to access layer-wide global store
     std::unique_lock<std::mutex> lock {g_vulkanLock};
     auto* layer = Device::retrieve(queue);
+    const auto& config = layer->instance->config;
+
+    bool sampleLastFrame = layer->isFrameOfInterest && config.isSamplingFrames();
+
+    // TODO: Only the first submit is checked for a frame boundary and submits
+    // are never split; submitCount may legally be zero so guard the access
+    bool isFrameEnd = (submitCount > 0) && processManualFrameBoundaryPreSubmit(*layer, pSubmits->pNext);
+
+    sampleLastFrame = isFrameEnd && sampleLastFrame;
+    bool resetThisFrame = isFrameEnd && layer->isFrameOfInterest && config.isSamplingAny();
 
     // If a "normal" frame then release the lock before calling in to the
     // driver, otherwise keep the lock to stop other threads using Vulkan
     // while we sync and reset the counter stream
-    if (!layer->isFrameOfInterest)
+    if (!sampleLastFrame && !resetThisFrame && !(layer->isFrameOfInterest && config.isSamplingWorkloads()))
     {
         lock.unlock();
     }
@@ -195,7 +334,7 @@ VKAPI_ATTR VkResult VKAPI_CALL
 
     // If we are measuring performance then run the layer command stream with
     // the lock held to stop other submits perturbing the counter data
-    if (layer->isFrameOfInterest)
+    if (layer->isFrameOfInterest && config.isSamplingWorkloads())
     {
         for (uint32_t i = 0; i < submitCount; i++)
         {
@@ -208,6 +347,11 @@ VKAPI_ATTR VkResult VKAPI_CALL
         }
     }
 
+    if (sampleLastFrame || resetThisFrame)
+    {
+        processFrameBoundaryPostSubmit(*layer, sampleLastFrame);
+    }
+
     return res;
 }
 
@@ -221,24 +365,34 @@ VKAPI_ATTR VkResult VKAPI_CALL
     // Hold the lock to access layer-wide global store
     std::unique_lock<std::mutex> lock {g_vulkanLock};
     auto* layer = Device::retrieve(queue);
+    const auto& config = layer->instance->config;
+
+    bool sampleLastFrame = layer->isFrameOfInterest && config.isSamplingFrames();
+
+    // TODO: Only the first submit is checked for a frame boundary and submits
+    // are never split; submitCount may legally be zero so guard the access
+    bool isFrameEnd = (submitCount > 0) && processManualFrameBoundaryPreSubmit(*layer, pSubmits->pNext);
+
+    sampleLastFrame = isFrameEnd && sampleLastFrame;
+    bool resetThisFrame = isFrameEnd && layer->isFrameOfInterest && config.isSamplingAny();
 
     // If a "normal" frame then release the lock before calling in to the
     // driver, otherwise keep the lock to stop other threads using Vulkan
     // while we sync and reset the counter stream
-    if (!layer->isFrameOfInterest)
+    if (!sampleLastFrame && !resetThisFrame && !(layer->isFrameOfInterest && config.isSamplingWorkloads()))
     {
         lock.unlock();
     }
 
     auto res = layer->driver.vkQueueSubmit2KHR(queue, submitCount, pSubmits, fence);
-    if (res != VK_SUCCESS || !layer->isFrameOfInterest)
+    if (res != VK_SUCCESS)
     {
         return res;
     }
 
     // If we are measuring performance then run the layer command stream with
     // the lock held to stop other submits perturbing the counter data
-    if (layer->isFrameOfInterest)
+    if (layer->isFrameOfInterest && config.isSamplingWorkloads())
     {
         for (uint32_t i = 0; i < submitCount; i++)
         {
@@ -251,5 +405,66 @@ VKAPI_ATTR VkResult VKAPI_CALL
         }
     }
 
+    if (sampleLastFrame || resetThisFrame)
+    {
+        processFrameBoundaryPostSubmit(*layer, sampleLastFrame);
+    }
+
+    return res;
+}
+
+/**
+ * See Vulkan API for documentation.
+ *
+ * Note: Modelling of this function is only implemented to support manual frame
+ * boundaries, which are checked on every bind info. There is no reporting of
+ * workload for bind sparse submissions in the Mali timeline driver data model.
+ */
+template <>
+VKAPI_ATTR VkResult VKAPI_CALL layer_vkQueueBindSparse<user_tag>(
+    VkQueue queue,
+    uint32_t bindInfoCount,
+    const VkBindSparseInfo* pBindInfo,
+    VkFence fence
+) {
+    LAYER_TRACE(__func__);
+
+    // Hold the lock to access layer-wide global store
+    std::unique_lock<std::mutex> lock {g_vulkanLock};
+    auto* layer = Device::retrieve(queue);
+    const auto& config = layer->instance->config;
+
+    bool sampleLastFrame = layer->isFrameOfInterest && config.isSamplingFrames();
+
+    // Scan the pNext chain of every bind info for a manual frame boundary
+    bool isFrameEnd { false };
+    for (uint32_t i = 0; i < bindInfoCount; i++)
+    {
+        const auto& info = pBindInfo[i];
+        isFrameEnd |= processManualFrameBoundaryPreSubmit(*layer, info.pNext);
+    }
+
+    sampleLastFrame = isFrameEnd && sampleLastFrame;
+    bool resetThisFrame = isFrameEnd && layer->isFrameOfInterest && config.isSamplingAny();
+
+    // If this submit does not need a counter sample or reset then release the
+    // lock before calling in to the driver, otherwise keep the lock to stop
+    // other threads using Vulkan while we sync and sample/reset the counters
+    if (!sampleLastFrame && !resetThisFrame)
+    {
+        lock.unlock();
+    }
+
+    auto res = layer->driver.vkQueueBindSparse(queue, bindInfoCount, pBindInfo, fence);
+    if (res != VK_SUCCESS)
+    {
+        return res;
+    }
+
+    if (sampleLastFrame || resetThisFrame)
+    {
+        processFrameBoundaryPostSubmit(*layer, sampleLastFrame);
+    }
+
     return res;
 }
diff --git a/layer_gpu_profile/source/layer_instance_functions.cpp b/layer_gpu_profile/source/layer_instance_functions.cpp
new file mode 100644
index 0000000..f152da2
--- /dev/null
+++ b/layer_gpu_profile/source/layer_instance_functions.cpp
@@ -0,0 +1,80 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * ----------------------------------------------------------------------------
+ * Copyright (c) 2025 Arm Limited
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ * ----------------------------------------------------------------------------
+ */
+
+#include "instance.hpp"
+#include "device.hpp"
+
+#include <mutex>
+#include <vulkan/utility/vk_struct_helper.hpp>
+
+extern std::mutex g_vulkanLock;
+
+/* See Vulkan API for documentation. */
+template <>
+VKAPI_ATTR void VKAPI_CALL layer_vkGetPhysicalDeviceFeatures2<user_tag>(
+    VkPhysicalDevice physicalDevice,
+    VkPhysicalDeviceFeatures2* pFeatures
+) {
+    LAYER_TRACE(__func__);
+
+    // Hold the lock only while looking up the layer in the global store
+    std::unique_lock<std::mutex> lock { g_vulkanLock };
+    auto* layer = Instance::retrieve(physicalDevice);
+
+    // Release the lock to call into the driver
+    lock.unlock();
+    layer->driver.vkGetPhysicalDeviceFeatures2(physicalDevice, pFeatures);
+
+    // If the caller chained the frame boundary query, always report support
+    auto* ext = vku::FindStructInPNextChain<VkPhysicalDeviceFrameBoundaryFeaturesEXT>(pFeatures->pNext);
+    if (ext)
+    {
+        ext->frameBoundary = VK_TRUE;
+    }
+}
+
+/* See Vulkan API for documentation. */
+template <>
+VKAPI_ATTR void VKAPI_CALL layer_vkGetPhysicalDeviceFeatures2KHR<user_tag>(
+    VkPhysicalDevice physicalDevice,
+    VkPhysicalDeviceFeatures2* pFeatures
+) {
+    LAYER_TRACE(__func__);
+
+    // Hold the lock only while looking up the layer in the global store
+    std::unique_lock<std::mutex> lock { g_vulkanLock };
+    auto* layer = Instance::retrieve(physicalDevice);
+
+    // Release the lock to call into the driver
+    lock.unlock();
+    layer->driver.vkGetPhysicalDeviceFeatures2KHR(physicalDevice, pFeatures);
+
+    // If the caller chained the frame boundary query, always report support
+    auto* ext = vku::FindStructInPNextChain<VkPhysicalDeviceFrameBoundaryFeaturesEXT>(pFeatures->pNext);
+    if (ext)
+    {
+        ext->frameBoundary = VK_TRUE;
+    }
+}
diff --git a/layer_gpu_profile/source/layer_instance_functions.hpp b/layer_gpu_profile/source/layer_instance_functions.hpp
new file mode 100644
index 0000000..3bab9ec
--- /dev/null
+++ b/layer_gpu_profile/source/layer_instance_functions.hpp
@@ -0,0 +1,42 @@
+/*
+ * SPDX-License-Identifier: MIT
+ * ----------------------------------------------------------------------------
+ * Copyright (c) 2025 Arm Limited
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ * ----------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <vulkan/vulkan.h>
+
+// Functions for devices
+
+/* See Vulkan API for documentation. */
+template <>
+VKAPI_ATTR void VKAPI_CALL layer_vkGetPhysicalDeviceFeatures2<user_tag>(
+    VkPhysicalDevice physicalDevice,
+    VkPhysicalDeviceFeatures2* pFeatures);
+
+/* See Vulkan API for documentation. */
+template <>
+VKAPI_ATTR void VKAPI_CALL layer_vkGetPhysicalDeviceFeatures2KHR<user_tag>(
+    VkPhysicalDevice physicalDevice,
+    VkPhysicalDeviceFeatures2* pFeatures);
diff --git a/lglpy/comms/service_gpu_profile.py b/lglpy/comms/service_gpu_profile.py
index 7c30e5e..adea4c3 100644
--- a/lglpy/comms/service_gpu_profile.py
+++ b/lglpy/comms/service_gpu_profile.py
@@ -27,6 +27,7 @@
 '''
 
 import csv
+import enum
 import json
 import os
 from typing import Any, Optional, TypedDict, Union
@@ -51,13 +52,30 @@ class EndFrameMessage(TypedDict):
 
 class WorkloadMessage(TypedDict):
     '''
-    Type information for any workload JSON message.
+    Type information for any per-workload sample JSON message.
     '''
     type: str
     counters: list[dict[str, Union[int, float]]]
     labels: list[str]
 
 
+class FrameMessage(TypedDict):
+    '''
+    Type information for a per-frame sample JSON message.
+    '''
+    type: str  # 'frame' for this message type
+    counters: list[dict[str, Union[int, float]]]
+    frame: int  # Frame number, reported as the CSV 'Frame ID' column
+
+
+class SampleMode(enum.Enum):
+    '''
+    Type of sampling detected.
+    '''
+    PER_WORKLOAD = 0  # One sample per workload (WorkloadMessage)
+    PER_FRAME = 1  # One sample per frame (FrameMessage)
+
+
 class GPUProfileService:
     '''
     A service for handling network comms from the layer_gpu_profile layer.
@@ -73,9 +91,13 @@ def __init__(self, dir_path: str, verbose: bool = False):
         '''
         self.base_dir = dir_path
 
+        # Sample mode is detected on the fly when we get our first data
+        self.sample_mode = SampleMode.PER_WORKLOAD
+
         self.frame_id: Optional[int] = None
-        self.frame_header: Optional[list[str]] = None
-        self.frame_data: Optional[list[list[str]]] = None
+
+        self.table_header: Optional[list[str]] = None
+        self.table_data: list[list[str]] = []
 
         os.makedirs(dir_path, exist_ok=True)
 
@@ -96,8 +118,8 @@ def handle_start_frame(self, message: StartFrameMessage):
             message: The decoded JSON.
         '''
         self.frame_id = message["frame"]
-        self.frame_header = None
-        self.frame_data = []
+        self.table_header = None
+        self.table_data.clear()
 
     def handle_end_frame(self, message: EndFrameMessage):
         '''
@@ -110,25 +132,24 @@ def handle_end_frame(self, message: EndFrameMessage):
         del message
 
         assert self.frame_id is not None
-        assert self.frame_header is not None
-        assert self.frame_data is not None
+        assert self.table_header is not None
 
         # Emit the CSV file
         print(f'Generating CSV for frame {self.frame_id}')
         path = os.path.join(self.base_dir, f'frame_{self.frame_id:05d}.csv')
         with open(path, 'w', newline='') as handle:
             writer = csv.writer(handle)
-            writer.writerow(self.frame_header)
-            writer.writerows(self.frame_data)
+            writer.writerow(self.table_header)
+            writer.writerows(self.table_data)
 
         # Reset the state
         self.frame_id = None
-        self.frame_header = None
-        self.frame_data = None
+        self.table_header = None
+        self.table_data.clear()
 
     def create_workload_header(self, message: WorkloadMessage):
         '''
-        Create a table header row from a workload.
+        Create a table header row from a workload sample.
 
         Args:
             message: The decoded JSON.
@@ -142,22 +163,21 @@ def create_workload_header(self, message: WorkloadMessage):
             columns.append(key)
         columns.append('Label')
 
-        self.frame_header = columns
+        self.table_header = columns
 
     def create_workload_data(self, message: WorkloadMessage):
         '''
-        Create a table data row from a workload.
+        Create a table data row from a workload sample.
 
         Args:
             message: The decoded JSON.
         '''
         assert self.frame_id is not None
-        assert self.frame_header is not None
-        assert self.frame_data is not None
+        assert self.table_header is not None
 
         columns: list[str] = []
 
-        columns.append(str(len(self.frame_data)))
+        columns.append(str(len(self.table_data)))
         columns.append(message['type'])
 
         for counter in message['counters']:
@@ -165,20 +185,76 @@ def create_workload_data(self, message: WorkloadMessage):
             columns.append(f'{value:0.2f}')
         columns.append('|'.join(message['labels']))
 
-        self.frame_data.append(columns)
+        self.table_data.append(columns)
 
-    def handle_workload(self, message: WorkloadMessage):
+    def handle_workload_sample(self, message: WorkloadMessage):
         '''
-        Handle a workload message.
+        Handle a workload sample message.
 
         Args:
             message: The decoded JSON.
         '''
-        if not self.frame_header:
+        if not self.table_header:
             self.create_workload_header(message)
 
         self.create_workload_data(message)
 
+    def create_frame_header(self, message: FrameMessage):
+        '''
+        Create a table header row from a frame sample.
+
+        Args:
+            message: The decoded JSON.
+        '''
+        columns = []
+        columns.append('Frame ID')
+        for counter in message['counters']:
+            key = list(counter.keys())[0]  # First key names the column
+            columns.append(key)
+
+        self.table_header = columns
+
+    def create_frame_data(self, message: FrameMessage):
+        '''
+        Create a table data row from a frame sample.
+
+        The row is the frame ID then each counter value to 2 decimal places.
+
+        Args:
+            message: The decoded JSON.
+        '''
+        assert self.table_header is not None
+
+        columns: list[str] = []
+        columns.append(f'{self.frame_id}')
+        for counter in message['counters']:
+            value = list(counter.values())[0]
+            columns.append(f'{value:0.2f}')
+
+        self.table_data.append(columns)
+
+    def handle_frame_sample(self, message: FrameMessage):
+        '''
+        Handle a frame message.
+
+        Args:
+            message: The decoded JSON.
+        '''
+        self.frame_id = message['frame']
+
+        if not self.table_header:
+            self.create_frame_header(message)
+
+        assert self.table_header is not None
+        self.create_frame_data(message)
+
+        print(f'Updating CSV for frame {self.frame_id}')
+        path = os.path.join(self.base_dir, f'capture.csv')
+        with open(path, 'w', newline='') as handle:
+            writer = csv.writer(handle)
+            writer.writerow(self.table_header)
+            writer.writerows(self.table_data)
+
     def handle_message(self, message: Message) -> None:
         '''
         Handle a service request from a layer.
@@ -193,5 +269,7 @@ def handle_message(self, message: Message) -> None:
             self.handle_start_frame(payload)
         elif payload['type'] == 'end_frame':
             self.handle_end_frame(payload)
+        elif payload['type'] == 'frame':
+            self.handle_frame_sample(payload)
         else:
-            self.handle_workload(payload)
+            self.handle_workload_sample(payload)

From 3f2e89bfc850af02d218f8b8348f9bd96b360b53 Mon Sep 17 00:00:00 2001
From: Peter Harris <peter.harris@arm.com>
Date: Fri, 19 Dec 2025 16:20:29 +0000
Subject: [PATCH 3/3] Profile layer: Add frame serialization config option

---
 .github/workflows/native_test.yaml            |  2 ++
 .github/workflows/python_test.yaml            |  2 ++
 layer_gpu_profile/README_LAYER.md             |  8 +++++++
 layer_gpu_profile/layer_config.json           |  3 ++-
 layer_gpu_profile/source/layer_config.cpp     | 23 ++++++++++++++++---
 layer_gpu_profile/source/layer_config.hpp     | 11 +++++++++
 .../source/layer_device_functions_queue.cpp   | 10 ++++++--
 7 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/native_test.yaml b/.github/workflows/native_test.yaml
index 1e4e3f2..8ed1c3e 100644
--- a/.github/workflows/native_test.yaml
+++ b/.github/workflows/native_test.yaml
@@ -11,12 +11,14 @@ on:
     paths-ignore:
       - 'lglpy/**'
       - '**/*.md'
+      - '**/*.json'
   pull_request:
     branches:
       - main
     paths-ignore:
       - 'lglpy/**'
       - '**/*.md'
+      - '**/*.json'
 
 env:
   CMAKE_BUILD_PARALLEL_LEVEL: '8'
diff --git a/.github/workflows/python_test.yaml b/.github/workflows/python_test.yaml
index d700acc..67b3a8f 100644
--- a/.github/workflows/python_test.yaml
+++ b/.github/workflows/python_test.yaml
@@ -10,11 +10,13 @@ on:
       - '*'
     paths-ignore:
       - '**/*.md'
+      - '**/*.json'
   pull_request:
     branches:
       - main
     paths-ignore:
       - '**/*.md'
+      - '**/*.json'
 
 jobs:
   python-test:
diff --git a/layer_gpu_profile/README_LAYER.md b/layer_gpu_profile/README_LAYER.md
index 91f1499..de4d3bc 100644
--- a/layer_gpu_profile/README_LAYER.md
+++ b/layer_gpu_profile/README_LAYER.md
@@ -140,6 +140,14 @@ to profile using the `sample_mode` config option:
 * `workload`: Sample every workload in each frame of interest.
 * `frame`: Sample at the end of each frame of interest.
 
+By default, per-frame samples are isolated from other frames by inserting a
+`vkDeviceWaitIdle()` before and after the frame to ensure that workloads
+in the sampled region do not overlap neighboring frames. Setting the
+`frame_serialization` config option to `false` allows frames to overlap
+without serialization, but can add noise to the returned counter values. This
+option has no effect on per-workload sampling, which must always use
+serialization.
+
 ## Layer counters
 
 The current layer uses a hard-coded set of performance counters defined in the
diff --git a/layer_gpu_profile/layer_config.json b/layer_gpu_profile/layer_config.json
index 6fa5d47..cd114c5 100644
--- a/layer_gpu_profile/layer_config.json
+++ b/layer_gpu_profile/layer_config.json
@@ -4,5 +4,6 @@
     "sample_mode": "frame",
     "periodic_min_frame": 1,
     "periodic_frame": 600,
-    "frame_list": []
+    "frame_list": [],
+    "frame_serialization": true
 }
diff --git a/layer_gpu_profile/source/layer_config.cpp b/layer_gpu_profile/source/layer_config.cpp
index ee4b365..ab90cb8 100644
--- a/layer_gpu_profile/source/layer_config.cpp
+++ b/layer_gpu_profile/source/layer_config.cpp
@@ -90,6 +90,9 @@ void LayerConfig::parseSamplingOptions(const json& config)
         rawSampleMode = "disabled";
     }
 
+    // Decode frame serialization mode
+    frameSerialization = config.at("frame_serialization");
+
     LAYER_LOG("Layer sampling configuration");
     LAYER_LOG("============================");
     LAYER_LOG(" - Frame selection mode: %s", rawFrameMode.c_str());
@@ -107,6 +110,11 @@ void LayerConfig::parseSamplingOptions(const json& config)
     }
 
     LAYER_LOG(" - Counter sampling mode: %s", rawSampleMode.c_str());
+
+    if (samplingMode == COUNTER_SAMPLING_FRAMES)
+    {
+        LAYER_LOG(" - Frame serialization: %u", frameSerialization);
+    }
 }
 
 /* See header for documentation. */
@@ -172,20 +180,29 @@ bool LayerConfig::isFrameOfInterest(
 }
 
 /* See header for documentation. */
-bool LayerConfig::isSamplingWorkloads() const {
+bool LayerConfig::isSamplingWorkloads() const
+{
     return frameMode != FRAME_SELECTION_DISABLED &&
            samplingMode == COUNTER_SAMPLING_WORKLOADS;
 }
 
 /* See header for documentation. */
-bool LayerConfig::isSamplingFrames() const {
+bool LayerConfig::isSamplingFrames() const
+{
     return frameMode != FRAME_SELECTION_DISABLED &&
            samplingMode == COUNTER_SAMPLING_FRAMES;
 }
 
 /* See header for documentation. */
-bool LayerConfig::isSamplingAny() const {
+bool LayerConfig::isSamplingAny() const
+{
     return frameMode != FRAME_SELECTION_DISABLED &&
            samplingMode != COUNTER_SAMPLING_DISABLED;
 }
 
+/* See header for documentation. */
+bool LayerConfig::isSerializingFrames() const
+{
+    return isSamplingWorkloads() ||  // Per-workload sampling always serializes
+           (isSamplingFrames() && frameSerialization);  // Per-frame: opt-out
+}
diff --git a/layer_gpu_profile/source/layer_config.hpp b/layer_gpu_profile/source/layer_config.hpp
index 17d2492..9c88d4b 100644
--- a/layer_gpu_profile/source/layer_config.hpp
+++ b/layer_gpu_profile/source/layer_config.hpp
@@ -79,6 +79,13 @@ class LayerConfig
      */
     bool isSamplingAny() const;
 
+    /**
+     * @brief Test if we are serializing frames.
+     *
+     * @return @c true if serializing, @c false otherwise.
+     */
+    bool isSerializingFrames() const;
+
 private:
     /**
      * @brief Supported frame selection modes.
@@ -119,6 +126,10 @@ class LayerConfig
      */
     CounterSamplingMode samplingMode {COUNTER_SAMPLING_DISABLED};
 
+    /**
+     * @brief The frame sample serialization mode.
+     */
+    bool frameSerialization {true};
 
     /**
      * @brief The sampling period in frames, or 0 if disabled.
diff --git a/layer_gpu_profile/source/layer_device_functions_queue.cpp b/layer_gpu_profile/source/layer_device_functions_queue.cpp
index fd7ba69..6a8b104 100644
--- a/layer_gpu_profile/source/layer_device_functions_queue.cpp
+++ b/layer_gpu_profile/source/layer_device_functions_queue.cpp
@@ -140,10 +140,16 @@ static void processFrameBoundaryPostSubmit(
     Device& layer,
     bool frameSample
 ) {
+    const auto& config = layer.instance->config;
+
     // If we are measuring performance ensure the previous frame has finished
     // and then take an initial sample to reset the counters
-    layer.driver.vkDeviceWaitIdle(layer.device);
-    workaroundDelay();
+    if (config.isSerializingFrames())
+    {
+        layer.driver.vkDeviceWaitIdle(layer.device);
+        workaroundDelay();
+    }
+
     auto ec = layer.lgcSampler->sample_now();
     if (ec)
     {