
Commit 5e6f54b

drm/i915: OA RC6 WA BB
1 parent 42050b2 commit 5e6f54b

4 files changed (+210, -9 lines)

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 4 additions & 0 deletions
@@ -2085,6 +2085,9 @@ struct drm_i915_private {
                         int n_builtin_sets;
 
                         bool enable_rc6;
+                        struct drm_i915_gem_object *ctx_wa_obj_buf[2];
+                        int ctx_wa_idx;
+                        bool dirty_wa_obj;
                 } oa;
         } perf;
 
@@ -3330,6 +3333,7 @@ void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
                                 struct intel_context *context);
 void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req);
 void i915_oa_update_reg_state(struct intel_engine_cs *ring, uint32_t *reg_state);
+struct drm_i915_gem_object *i915_oa_ctx_wa_obj(struct drm_i915_private *dev_priv);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev,

drivers/gpu/drm/i915/i915_perf.c

Lines changed: 149 additions & 4 deletions
@@ -416,8 +416,10 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 
         free_oa_buffer(dev_priv);
 
-        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-        intel_runtime_pm_put(dev_priv);
+        if (!dev_priv->perf.oa.enable_rc6) {
+                intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+                intel_runtime_pm_put(dev_priv);
+        }
 
         dev_priv->perf.oa.exclusive_stream = NULL;
 }
@@ -834,6 +836,119 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream)
         hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
 }
 
+struct drm_i915_gem_object *
+i915_oa_ctx_wa_obj(struct drm_i915_private *dev_priv)
+{
+        struct drm_i915_gem_object *ctx_wa_obj =
+                dev_priv->perf.oa.ctx_wa_obj_buf[dev_priv->perf.oa.ctx_wa_idx];
+
+        if (dev_priv->perf.oa.dirty_wa_obj) {
+                dev_priv->perf.oa.dirty_wa_obj = false;
+                dev_priv->perf.oa.ctx_wa_idx = !dev_priv->perf.oa.ctx_wa_idx;
+        }
+
+        return ctx_wa_obj;
+}
+
+static int init_ctx_wa_obj_buf(struct drm_i915_private *dev_priv)
+{
+        struct intel_engine_cs *ring = &dev_priv->ring[RCS];
+        struct page *page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
+        uint32_t *data = kmap_atomic(page);
+        int ret;
+
+        dev_priv->perf.oa.ctx_wa_obj_buf[0] =
+                i915_gem_object_create_from_data(dev_priv->dev, data,
+                                                 PAGE_SIZE);
+        kunmap_atomic(data);
+
+        if (!dev_priv->perf.oa.ctx_wa_obj_buf[0]) {
+                DRM_DEBUG_DRIVER("failed to allocate rc6 wa bb\n");
+                return -ENOMEM;
+        }
+
+        ret = i915_gem_obj_ggtt_pin(dev_priv->perf.oa.ctx_wa_obj_buf[0],
+                                    PAGE_SIZE, 0);
+        if (ret) {
+                DRM_DEBUG_DRIVER("failed to pin rc6 wa bb\n");
+
+                mutex_lock(&dev_priv->dev->struct_mutex);
+                drm_gem_object_unreference(&dev_priv->perf.oa.ctx_wa_obj_buf[0]->base);
+                mutex_unlock(&dev_priv->dev->struct_mutex);
+
+                dev_priv->perf.oa.ctx_wa_obj_buf[0] = NULL;
+
+                return ret;
+        }
+
+        dev_priv->perf.oa.ctx_wa_obj_buf[1] = ring->wa_ctx.obj;
+
+        return 0;
+}
+
+static int init_rc6_wa_bb(struct drm_i915_private *dev_priv)
+{
+        struct page *page;
+        uint32_t *batch;
+        int ret, index, i, num_regs;
+        struct intel_engine_cs *ring = &dev_priv->ring[RCS];
+        struct drm_i915_gem_object *ctx_wa_obj;
+
+        if (!dev_priv->perf.oa.ctx_wa_obj_buf[0]) {
+                ret = init_ctx_wa_obj_buf(dev_priv);
+                if (ret)
+                        return ret;
+        }
+
+        dev_priv->perf.oa.dirty_wa_obj = true;
+
+        ctx_wa_obj = dev_priv->perf.oa.ctx_wa_obj_buf[dev_priv->perf.oa.ctx_wa_idx];
+
+        page = i915_gem_object_get_page(ctx_wa_obj, 0);
+        batch = kmap_atomic(page);
+
+        index = ring->wa_ctx.per_ctx_rc6.offset;
+
+        batch[index++] = MI_NOOP;
+        batch[index++] = MI_LOAD_REGISTER_IMM(1);
+        batch[index++] = GDT_CHICKEN_BITS;
+        batch[index++] = 0xA0;
+
+        for (i = 0; i < dev_priv->perf.oa.mux_regs_len; i++) {
+                /* x <= 16 must hold with MI_LOAD_REGISTER_IMM(x) */
+                if (i % 16 == 0) {
+                        num_regs = min(16, dev_priv->perf.oa.mux_regs_len - i);
+                        batch[index++] = MI_NOOP;
+                        batch[index++] = MI_LOAD_REGISTER_IMM(num_regs);
+                }
+
+                batch[index++] = dev_priv->perf.oa.mux_regs[i].addr;
+                batch[index++] = dev_priv->perf.oa.mux_regs[i].value;
+        }
+
+        batch[index++] = MI_NOOP;
+        batch[index++] = MI_LOAD_REGISTER_IMM(1);
+        batch[index++] = GDT_CHICKEN_BITS;
+        batch[index++] = 0x80;
+
+        for (i = 0; i < dev_priv->perf.oa.b_counter_regs_len; i++) {
+                if (i % 16 == 0) {
+                        num_regs = min(16, dev_priv->perf.oa.b_counter_regs_len - i);
+                        batch[index++] = MI_NOOP;
+                        batch[index++] = MI_LOAD_REGISTER_IMM(num_regs);
+                }
+
+                batch[index++] = dev_priv->perf.oa.b_counter_regs[i].addr;
+                batch[index++] = dev_priv->perf.oa.b_counter_regs[i].value;
+        }
+
+        batch[index++] = MI_BATCH_BUFFER_END;
+
+        kunmap_atomic(batch);
+
+        return 0;
+}
+
 static int i915_oa_stream_init(struct i915_perf_stream *stream,
                                struct drm_i915_perf_open_param *param,
                                struct perf_open_properties *props)
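
The register loops in init_rc6_wa_bb() above cap each MI_LOAD_REGISTER_IMM packet at 16 register/value pairs, so a long MUX or boolean-counter list is emitted in chunks, each chunk opened by an MI_NOOP plus a fresh LRI header. A minimal standalone sketch of that chunking follows; the emit_lri_chunked() helper and its reg_val struct are illustrative only and not part of the patch, and the opcode encoding is the one i915 normally uses for MI_LOAD_REGISTER_IMM:

    /* Illustrative sketch only: emit a register/value list as a series of
     * MI_LOAD_REGISTER_IMM packets of at most 16 pairs each, mirroring the
     * chunking done by init_rc6_wa_bb().
     */
    #include <stdint.h>

    #define MI_NOOP                  0x00000000u
    #define MI_LOAD_REGISTER_IMM(n)  ((0x22u << 23) | (2u * (n) - 1))

    struct reg_val {
            uint32_t addr;
            uint32_t value;
    };

    static int emit_lri_chunked(uint32_t *batch, int index,
                                const struct reg_val *regs, int count)
    {
            int i, num_regs;

            for (i = 0; i < count; i++) {
                    if (i % 16 == 0) {
                            num_regs = count - i < 16 ? count - i : 16;
                            batch[index++] = MI_NOOP;
                            batch[index++] = MI_LOAD_REGISTER_IMM(num_regs);
                    }
                    batch[index++] = regs[i].addr;   /* register offset */
                    batch[index++] = regs[i].value;  /* value to write  */
            }

            return index; /* next free DWORD in the batch */
    }

For N registers this costs 2*N payload DWORDs plus 2 header DWORDs per group of 16, which is the arithmetic the placeholder batch in intel_lrc.c below ultimately has to leave room for.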
@@ -912,12 +1027,38 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
         *
         * In our case we are expected that taking pm + FORCEWAKE
         * references will effectively disable RC6.
+        *
+        * For BDW+, RC6 + OA is not plagued by this issue, so we instead
+        * try to leave RC6 enabled. One caveat though is that we now need
+        * to restore the NOA MUX configuration upon exiting RC6.
         */
-        intel_runtime_pm_get(dev_priv);
-        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+        /* We must disable RC6 until we are able to correctly set up the RC6 WA
+         * BB, if requested, otherwise we could potentially lose some OA state
+         * which is not automatically restored as part of the OA power context */
+        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+        intel_runtime_pm_put(dev_priv);
 
         dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
 
+        if (props->enable_rc6) {
+                if (IS_BROADWELL(dev_priv->dev)) {
+                        ret = init_rc6_wa_bb(dev_priv);
+                        if (ret)
+                                DRM_ERROR("Failed to enable RC6 with OA\n");
+                } else {
+                        DRM_ERROR("OA with RC6 enabled is not supported on this"
+                                  " platform\n");
+                        ret = -EINVAL;
+                }
+
+                intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+                intel_runtime_pm_put(dev_priv);
+
+                if (ret)
+                        return ret;
+        }
+
         stream->destroy = i915_oa_stream_destroy;
         stream->enable = i915_oa_stream_enable;
         stream->disable = i915_oa_stream_disable;
@@ -1665,6 +1806,10 @@ void i915_perf_init(struct drm_device *dev)
 
         dev_priv->perf.initialized = true;
 
+        dev_priv->perf.oa.ctx_wa_obj_buf[0] = NULL;
+        dev_priv->perf.oa.ctx_wa_idx = 0;
+        dev_priv->perf.oa.dirty_wa_obj = false;
+
         return;
 
 sysfs_error:
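
Taken together, the ctx_wa_obj_buf[2], ctx_wa_idx and dirty_wa_obj fields initialised here form a small double buffer: init_rc6_wa_bb() rewrites the object currently selected by ctx_wa_idx and marks it dirty, while i915_oa_ctx_wa_obj() hands that object out to context population in intel_lrc.c and, if it was just rewritten, flips the index so the next reconfiguration targets the other slot rather than a batch that may still be referenced by an in-flight context restore. A self-contained model of that handshake, with simplified stand-in types rather than driver code:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for the two GEM objects and their bookkeeping. */
    struct wa_state {
            const char *buf[2];
            int idx;
            bool dirty;
    };

    /* Mirrors init_rc6_wa_bb(): rewrite the currently selected slot. */
    static void writer_update(struct wa_state *s, const char *new_config)
    {
            s->buf[s->idx] = new_config;
            s->dirty = true;
    }

    /* Mirrors i915_oa_ctx_wa_obj(): return the current slot and, if it was
     * just rewritten, flip the index so the next rewrite targets the other
     * slot, leaving this one intact while a restore may still be using it.
     */
    static const char *reader_get(struct wa_state *s)
    {
            const char *cur = s->buf[s->idx];

            if (s->dirty) {
                    s->dirty = false;
                    s->idx = !s->idx;
            }
            return cur;
    }

    int main(void)
    {
            struct wa_state s = { { "default bb", "default bb" }, 0, false };

            writer_update(&s, "mux config A");
            printf("%s\n", reader_get(&s));  /* mux config A, from slot 0 */
            writer_update(&s, "mux config B");
            printf("%s\n", reader_get(&s));  /* mux config B, from slot 1 */
            return 0;
    }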

drivers/gpu/drm/i915/intel_lrc.c

Lines changed: 55 additions & 4 deletions
@@ -1286,8 +1286,6 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
  *
  * The number of DWORDS written are returned using this field.
  *
- * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
- * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
  */
 static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
                                struct i915_wa_ctx_bb *wa_ctx,
@@ -1299,7 +1297,9 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
         /* WaDisableCtxRestoreArbitration:bdw,chv */
         wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
 
-        wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
+        /* Pad to end of cacheline */
+        while (index % CACHELINE_DWORDS)
+                wa_ctx_emit(batch, index, MI_NOOP);
 
         return wa_ctx_end(wa_ctx, *offset = index, 1);
 }
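
Ending per_ctx on a cacheline boundary, instead of on MI_BATCH_BUFFER_END as before, is what lets the per_ctx_rc6 batch introduced in the next hunk start aligned with no extra bookkeeping: wa_ctx_start() only has to round the running offset up to CACHELINE_DWORDS. A rough sketch of the padding arithmetic, assuming CACHELINE_DWORDS is 16 (64-byte cachelines of 4-byte DWORDs) as in the driver:

    #include <assert.h>
    #include <stdint.h>

    #define CACHELINE_DWORDS 16          /* 64-byte cacheline / 4-byte DWORDs */
    #define MI_NOOP          0x00000000u

    /* Pad a batch with MI_NOOPs until the write index reaches a cacheline
     * boundary, as the gen8/gen9 *_init_perctx_bb() now do before returning.
     */
    static uint32_t pad_to_cacheline(uint32_t *batch, uint32_t index)
    {
            while (index % CACHELINE_DWORDS)
                    batch[index++] = MI_NOOP;
            return index;
    }

    int main(void)
    {
            uint32_t batch[64] = { 0 };
            /* e.g. per_ctx wrote 13 DWORDs: three NOOPs round it up to 16 */
            uint32_t end = pad_to_cacheline(batch, 13);

            assert(end == 16);  /* the following batch starts cacheline-aligned */
            return 0;
    }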
@@ -1354,6 +1354,33 @@ static int gen9_init_perctx_bb(struct intel_engine_cs *ring,
             (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0)))
                 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
 
+        /* Pad to end of cacheline */
+        while (index % CACHELINE_DWORDS)
+                wa_ctx_emit(batch, index, MI_NOOP);
+
+        return wa_ctx_end(wa_ctx, *offset = index, 1);
+}
+
+
+/*
+ * This batch is started immediately after the per_ctx batch. Since we ensure
+ * that per_ctx ends on a cacheline, this batch is aligned automatically.
+ *
+ * This batch is a placeholder which we can potentially later use to restore
+ * the NOA MUX configuration, the size and contents of which we do not know
+ * until we are able to configure the OA unit.
+ *
+ * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add
+ * padding to align it with a cacheline, as padding after MI_BATCH_BUFFER_END
+ * is redundant.
+ */
+static int init_oa_rc6_wa_bb(struct intel_engine_cs *ring,
+                             struct i915_wa_ctx_bb *wa_ctx,
+                             uint32_t *const batch,
+                             uint32_t *offset)
+{
+        uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
+
         wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
 
         return wa_ctx_end(wa_ctx, *offset = index, 1);
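
The placeholder reserved here is only a single cacheline ending in MI_BATCH_BUFFER_END; the real contents are written at stream-open time by init_rc6_wa_bb() in i915_perf.c and must fit between per_ctx_rc6.offset and the end of the page-sized copy made by init_ctx_wa_obj_buf(). A back-of-the-envelope DWORD budget for that batch, purely illustrative and following the layout init_rc6_wa_bb() emits (GDT_CHICKEN_BITS write, MUX list, GDT_CHICKEN_BITS write, boolean-counter list, terminator):

    #include <stdio.h>

    /* Rough DWORD budget for the RC6 WA BB built by init_rc6_wa_bb(): each
     * group of up to 16 registers costs MI_NOOP + MI_LOAD_REGISTER_IMM
     * (2 DWORDs) plus 2 DWORDs per register; the two GDT_CHICKEN_BITS
     * writes cost 4 DWORDs each; one MI_BATCH_BUFFER_END terminates it.
     * Illustrative arithmetic only, not code from the patch.
     */
    static int rc6_wa_bb_dwords(int n_mux, int n_bool)
    {
            int mux_hdrs  = (n_mux  + 15) / 16;
            int bool_hdrs = (n_bool + 15) / 16;

            return 4 +                              /* GDT_CHICKEN_BITS, 0xA0 */
                   2 * mux_hdrs  + 2 * n_mux +      /* NOA MUX programming    */
                   4 +                              /* GDT_CHICKEN_BITS, 0x80 */
                   2 * bool_hdrs + 2 * n_bool +     /* boolean counter regs   */
                   1;                               /* MI_BATCH_BUFFER_END    */
    }

    int main(void)
    {
            /* e.g. roughly 100 MUX registers and 20 boolean-counter registers */
            printf("%d DWORDs\n", rc6_wa_bb_dwords(100, 20));
            return 0;
    }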
@@ -1436,6 +1463,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *ring)
                                             &offset);
                 if (ret)
                         goto out;
+
         } else if (INTEL_INFO(ring->dev)->gen == 9) {
                 ret = gen9_init_indirectctx_bb(ring,
                                                &wa_ctx->indirect_ctx,
@@ -1452,6 +1480,11 @@ static int intel_init_workaround_bb(struct intel_engine_cs *ring)
                         goto out;
         }
 
+        ret = init_oa_rc6_wa_bb(ring,
+                                &wa_ctx->per_ctx_rc6,
+                                batch,
+                                &offset);
+
 out:
         kunmap_atomic(batch);
         if (ret)
@@ -2305,7 +2338,25 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
         reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
         if (ring->wa_ctx.obj) {
                 struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
-                uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);
+                struct drm_i915_gem_object *wa_ctx_obj = wa_ctx->obj;
+                uint32_t ggtt_offset;
+
+                /* To support RC6 while capturing metrics with the OA
+                 * unit, we need to restore the NOA Mux configuration
+                 * upon exiting RC6. While using CTX_WA_PTR would be a
+                 * better solution here, there is reportedly a HW bug on
+                 * SKL+ where that batch is not always executed, so using
+                 * the per-ctx batch seems to be the next best solution.
+                 *
+                 * Given that we need to be able to update the per-ctx
+                 * batch on the fly with the NOA Mux config, we
+                 * double-buffer the batch, as the batch could
+                 * potentially still be in use by a restore in progress.
+                 */
+                if (dev_priv->perf.oa.enable_rc6)
+                        wa_ctx_obj = i915_oa_ctx_wa_obj(dev_priv);
+
+                ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx_obj);
 
                 reg_state[CTX_RCS_INDIRECT_CTX+1] =
                         (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |

drivers/gpu/drm/i915/intel_ringbuffer.h

Lines changed: 2 additions & 1 deletion
@@ -139,7 +139,8 @@ struct i915_ctx_workarounds {
         struct i915_wa_ctx_bb {
                 u32 offset;
                 u32 size;
-        } indirect_ctx, per_ctx;
+        } indirect_ctx, per_ctx, per_ctx_rc6;
+
         struct drm_i915_gem_object *obj;
 };
