
Commit 5e6f54b

drm/i915: OA RC6 WA BB
1 parent 42050b2 commit 5e6f54b

4 files changed (+210, -9 lines)

drivers/gpu/drm/i915/i915_drv.h

Lines changed: 4 additions & 0 deletions
@@ -2085,6 +2085,9 @@ struct drm_i915_private {
                         int n_builtin_sets;
 
                         bool enable_rc6;
+                        struct drm_i915_gem_object *ctx_wa_obj_buf[2];
+                        int ctx_wa_idx;
+                        bool dirty_wa_obj;
                 } oa;
         } perf;
 
@@ -3330,6 +3333,7 @@ void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv,
                                 struct intel_context *context);
 void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req);
 void i915_oa_update_reg_state(struct intel_engine_cs *ring, uint32_t *reg_state);
+struct drm_i915_gem_object *i915_oa_ctx_wa_obj(struct drm_i915_private *dev_priv);
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct drm_device *dev,

drivers/gpu/drm/i915/i915_perf.c

Lines changed: 149 additions & 4 deletions
@@ -416,8 +416,10 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
 
         free_oa_buffer(dev_priv);
 
-        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-        intel_runtime_pm_put(dev_priv);
+        if (!dev_priv->perf.oa.enable_rc6) {
+                intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+                intel_runtime_pm_put(dev_priv);
+        }
 
         dev_priv->perf.oa.exclusive_stream = NULL;
 }
@@ -834,6 +836,119 @@ static void i915_oa_stream_disable(struct i915_perf_stream *stream)
         hrtimer_cancel(&dev_priv->perf.oa.poll_check_timer);
 }
 
+struct drm_i915_gem_object *
+i915_oa_ctx_wa_obj(struct drm_i915_private *dev_priv)
+{
+        struct drm_i915_gem_object *ctx_wa_obj =
+                dev_priv->perf.oa.ctx_wa_obj_buf[dev_priv->perf.oa.ctx_wa_idx];
+
+        if (dev_priv->perf.oa.dirty_wa_obj) {
+                dev_priv->perf.oa.dirty_wa_obj = false;
+                dev_priv->perf.oa.ctx_wa_idx = !dev_priv->perf.oa.ctx_wa_idx;
+        }
+
+        return ctx_wa_obj;
+}
+
+static int init_ctx_wa_obj_buf(struct drm_i915_private *dev_priv)
+{
+        struct intel_engine_cs *ring = &dev_priv->ring[RCS];
+        struct page *page = i915_gem_object_get_page(ring->wa_ctx.obj, 0);
+        uint32_t *data = kmap_atomic(page);
+        int ret;
+
+        dev_priv->perf.oa.ctx_wa_obj_buf[0] =
+                i915_gem_object_create_from_data(dev_priv->dev, data,
+                                                 PAGE_SIZE);
+        kunmap_atomic(data);
+
+        if (!dev_priv->perf.oa.ctx_wa_obj_buf[0]) {
+                DRM_DEBUG_DRIVER("failed to allocate rc6 wa bb\n");
+                return -ENOMEM;
+        }
+
+        ret = i915_gem_obj_ggtt_pin(dev_priv->perf.oa.ctx_wa_obj_buf[0],
+                                    PAGE_SIZE, 0);
+        if (ret) {
+                DRM_DEBUG_DRIVER("failed to pin rc6 wa bb\n");
+
+                mutex_lock(&dev_priv->dev->struct_mutex);
+                drm_gem_object_unreference(&dev_priv->perf.oa.ctx_wa_obj_buf[0]->base);
+                mutex_unlock(&dev_priv->dev->struct_mutex);
+
+                dev_priv->perf.oa.ctx_wa_obj_buf[0] = NULL;
+
+                return ret;
+        }
+
+        dev_priv->perf.oa.ctx_wa_obj_buf[1] = ring->wa_ctx.obj;
+
+        return 0;
+}
+
+static int init_rc6_wa_bb(struct drm_i915_private *dev_priv)
+{
+        struct page *page;
+        uint32_t *batch;
+        int ret, index, i, num_regs;
+        struct intel_engine_cs *ring = &dev_priv->ring[RCS];
+        struct drm_i915_gem_object *ctx_wa_obj;
+
+        if (!dev_priv->perf.oa.ctx_wa_obj_buf[0]) {
+                ret = init_ctx_wa_obj_buf(dev_priv);
+                if (ret)
+                        return ret;
+        }
+
+        dev_priv->perf.oa.dirty_wa_obj = true;
+
+        ctx_wa_obj = dev_priv->perf.oa.ctx_wa_obj_buf[dev_priv->perf.oa.ctx_wa_idx];
+
+        page = i915_gem_object_get_page(ctx_wa_obj, 0);
+        batch = kmap_atomic(page);
+
+        index = ring->wa_ctx.per_ctx_rc6.offset;
+
+        batch[index++] = MI_NOOP;
+        batch[index++] = MI_LOAD_REGISTER_IMM(1);
+        batch[index++] = GDT_CHICKEN_BITS;
+        batch[index++] = 0xA0;
+
+        for (i = 0; i < dev_priv->perf.oa.mux_regs_len; i++) {
+                /* x <= 16 must hold with MI_LOAD_REGISTER_IMM(x) */
+                if (i % 16 == 0) {
+                        num_regs = min(16, dev_priv->perf.oa.mux_regs_len - i);
+                        batch[index++] = MI_NOOP;
+                        batch[index++] = MI_LOAD_REGISTER_IMM(num_regs);
+                }
+
+                batch[index++] = dev_priv->perf.oa.mux_regs[i].addr;
+                batch[index++] = dev_priv->perf.oa.mux_regs[i].value;
+        }
+
+        batch[index++] = MI_NOOP;
+        batch[index++] = MI_LOAD_REGISTER_IMM(1);
+        batch[index++] = GDT_CHICKEN_BITS;
+        batch[index++] = 0x80;
+
+        for (i = 0; i < dev_priv->perf.oa.b_counter_regs_len; i++) {
+                if (i % 16 == 0) {
+                        num_regs = min(16, dev_priv->perf.oa.b_counter_regs_len - i);
+                        batch[index++] = MI_NOOP;
+                        batch[index++] = MI_LOAD_REGISTER_IMM(num_regs);
+                }
+
+                batch[index++] = dev_priv->perf.oa.b_counter_regs[i].addr;
+                batch[index++] = dev_priv->perf.oa.b_counter_regs[i].value;
+        }
+
+        batch[index++] = MI_BATCH_BUFFER_END;
+
+        kunmap_atomic(batch);
+
+        return 0;
+}
+
 static int i915_oa_stream_init(struct i915_perf_stream *stream,
                                struct drm_i915_perf_open_param *param,
                                struct perf_open_properties *props)
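
The register loops in init_rc6_wa_bb() above cap each MI_LOAD_REGISTER_IMM packet at 16 register/value pairs, so a long MUX or boolean-counter list is emitted in chunks, each chunk opened by an MI_NOOP plus a fresh LRI header. A minimal standalone sketch of that chunking follows; the emit_lri_chunked() helper and its reg_val struct are illustrative only and not part of the patch, and the opcode encoding is the one i915 normally uses for MI_LOAD_REGISTER_IMM:

    /* Illustrative sketch only: emit a register/value list as a series of
     * MI_LOAD_REGISTER_IMM packets of at most 16 pairs each, mirroring the
     * chunking done by init_rc6_wa_bb().
     */
    #include <stdint.h>

    #define MI_NOOP                  0x00000000u
    #define MI_LOAD_REGISTER_IMM(n)  ((0x22u << 23) | (2u * (n) - 1))

    struct reg_val {
            uint32_t addr;
            uint32_t value;
    };

    static int emit_lri_chunked(uint32_t *batch, int index,
                                const struct reg_val *regs, int count)
    {
            int i, num_regs;

            for (i = 0; i < count; i++) {
                    if (i % 16 == 0) {
                            num_regs = count - i < 16 ? count - i : 16;
                            batch[index++] = MI_NOOP;
                            batch[index++] = MI_LOAD_REGISTER_IMM(num_regs);
                    }
                    batch[index++] = regs[i].addr;   /* register offset */
                    batch[index++] = regs[i].value;  /* value to write  */
            }

            return index; /* next free DWORD in the batch */
    }

For N registers this costs 2*N payload DWORDs plus 2 header DWORDs per group of 16, which is the arithmetic the placeholder batch in intel_lrc.c below ultimately has to leave room for.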
@@ -912,12 +1027,38 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
         *
         * In our case we are expected that taking pm + FORCEWAKE
         * references will effectively disable RC6.
+        *
+        * For BDW+, RC6 + OA is not plagued by this issue, so we instead
+        * try to leave RC6 enabled. One caveat though is that we now need
+        * to restore the NOA MUX configuration upon exiting RC6.
         */
-        intel_runtime_pm_get(dev_priv);
-        intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+        /* We must disable RC6 until we are able to correctly set up the RC6 WA
+         * BB, if requested, otherwise we could potentially lose some OA state
+         * which is not automatically restored as part of the OA power context */
+        intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+        intel_runtime_pm_put(dev_priv);
 
         dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
 
+        if (props->enable_rc6) {
+                if (IS_BROADWELL(dev_priv->dev)) {
+                        ret = init_rc6_wa_bb(dev_priv);
+                        if (ret)
+                                DRM_ERROR("Failed to enable RC6 with OA\n");
+                } else {
+                        DRM_ERROR("OA with RC6 enabled is not supported on this"
+                                  " platform\n");
+                        ret = -EINVAL;
+                }
+
+                intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+                intel_runtime_pm_put(dev_priv);
+
+                if (ret)
+                        return ret;
+        }
+
         stream->destroy = i915_oa_stream_destroy;
         stream->enable = i915_oa_stream_enable;
         stream->disable = i915_oa_stream_disable;
@@ -1665,6 +1806,10 @@ void i915_perf_init(struct drm_device *dev)
 
         dev_priv->perf.initialized = true;
 
+        dev_priv->perf.oa.ctx_wa_obj_buf[0] = NULL;
+        dev_priv->perf.oa.ctx_wa_idx = 0;
+        dev_priv->perf.oa.dirty_wa_obj = false;
+
         return;
 
 sysfs_error:
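
Taken together, the ctx_wa_obj_buf[2], ctx_wa_idx and dirty_wa_obj fields initialised here form a small double buffer: init_rc6_wa_bb() rewrites the object currently selected by ctx_wa_idx and marks it dirty, while i915_oa_ctx_wa_obj() hands that object out to context population in intel_lrc.c and, if it was just rewritten, flips the index so the next reconfiguration targets the other slot rather than a batch that may still be referenced by an in-flight context restore. A self-contained model of that handshake, with simplified stand-in types rather than driver code:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-in for the two GEM objects and their bookkeeping. */
    struct wa_state {
            const char *buf[2];
            int idx;
            bool dirty;
    };

    /* Mirrors init_rc6_wa_bb(): rewrite the currently selected slot. */
    static void writer_update(struct wa_state *s, const char *new_config)
    {
            s->buf[s->idx] = new_config;
            s->dirty = true;
    }

    /* Mirrors i915_oa_ctx_wa_obj(): return the current slot and, if it was
     * just rewritten, flip the index so the next rewrite targets the other
     * slot, leaving this one intact while a restore may still be using it.
     */
    static const char *reader_get(struct wa_state *s)
    {
            const char *cur = s->buf[s->idx];

            if (s->dirty) {
                    s->dirty = false;
                    s->idx = !s->idx;
            }
            return cur;
    }

    int main(void)
    {
            struct wa_state s = { { "default bb", "default bb" }, 0, false };

            writer_update(&s, "mux config A");
            printf("%s\n", reader_get(&s));  /* mux config A, from slot 0 */
            writer_update(&s, "mux config B");
            printf("%s\n", reader_get(&s));  /* mux config B, from slot 1 */
            return 0;
    }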

drivers/gpu/drm/i915/intel_lrc.c

Lines changed: 55 additions & 4 deletions
@@ -1286,8 +1286,6 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
  *
  * The number of DWORDS written are returned using this field.
  *
- * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
- * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
  */
 static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
                                struct i915_wa_ctx_bb *wa_ctx,
@@ -1299,7 +1297,9 @@ static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
         /* WaDisableCtxRestoreArbitration:bdw,chv */
         wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
 
-        wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
+        /* Pad to end of cacheline */
+        while (index % CACHELINE_DWORDS)
+                wa_ctx_emit(batch, index, MI_NOOP);
 
         return wa_ctx_end(wa_ctx, *offset = index, 1);
 }
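
Ending per_ctx on a cacheline boundary, instead of on MI_BATCH_BUFFER_END as before, is what lets the per_ctx_rc6 batch introduced in the next hunk start aligned with no extra bookkeeping: wa_ctx_start() only has to round the running offset up to CACHELINE_DWORDS. A rough sketch of the padding arithmetic, assuming CACHELINE_DWORDS is 16 (64-byte cachelines of 4-byte DWORDs) as in the driver:

    #include <assert.h>
    #include <stdint.h>

    #define CACHELINE_DWORDS 16          /* 64-byte cacheline / 4-byte DWORDs */
    #define MI_NOOP          0x00000000u

    /* Pad a batch with MI_NOOPs until the write index reaches a cacheline
     * boundary, as the gen8/gen9 *_init_perctx_bb() now do before returning.
     */
    static uint32_t pad_to_cacheline(uint32_t *batch, uint32_t index)
    {
            while (index % CACHELINE_DWORDS)
                    batch[index++] = MI_NOOP;
            return index;
    }

    int main(void)
    {
            uint32_t batch[64] = { 0 };
            /* e.g. per_ctx wrote 13 DWORDs: three NOOPs round it up to 16 */
            uint32_t end = pad_to_cacheline(batch, 13);

            assert(end == 16);  /* the following batch starts cacheline-aligned */
            return 0;
    }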
@@ -1354,6 +1354,33 @@ static int gen9_init_perctx_bb(struct intel_engine_cs *ring,
             (IS_BROXTON(dev) && (INTEL_REVID(dev) == BXT_REVID_A0)))
                 wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE);
 
+        /* Pad to end of cacheline */
+        while (index % CACHELINE_DWORDS)
+                wa_ctx_emit(batch, index, MI_NOOP);
+
+        return wa_ctx_end(wa_ctx, *offset = index, 1);
+}
+
+
+/*
+ * This batch is started immediately after the per_ctx batch. Since we ensure
+ * that per_ctx ends on a cacheline, this batch is aligned automatically.
+ *
+ * This batch is a placeholder which we can potentially later use to restore
+ * the NOA MUX configuration, the size and contents of which we do not know
+ * until we are able to configure the OA unit.
+ *
+ * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add
+ * padding to align it with a cacheline, as padding after MI_BATCH_BUFFER_END
+ * is redundant.
+ */
+static int init_oa_rc6_wa_bb(struct intel_engine_cs *ring,
+                             struct i915_wa_ctx_bb *wa_ctx,
+                             uint32_t *const batch,
+                             uint32_t *offset)
+{
+        uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
+
         wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
 
         return wa_ctx_end(wa_ctx, *offset = index, 1);
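
The placeholder reserved here is only a single cacheline ending in MI_BATCH_BUFFER_END; the real contents are written at stream-open time by init_rc6_wa_bb() in i915_perf.c and must fit between per_ctx_rc6.offset and the end of the page-sized copy made by init_ctx_wa_obj_buf(). A back-of-the-envelope DWORD budget for that batch, purely illustrative and following the layout init_rc6_wa_bb() emits (GDT_CHICKEN_BITS write, MUX list, GDT_CHICKEN_BITS write, boolean-counter list, terminator):

    #include <stdio.h>

    /* Rough DWORD budget for the RC6 WA BB built by init_rc6_wa_bb(): each
     * group of up to 16 registers costs MI_NOOP + MI_LOAD_REGISTER_IMM
     * (2 DWORDs) plus 2 DWORDs per register; the two GDT_CHICKEN_BITS
     * writes cost 4 DWORDs each; one MI_BATCH_BUFFER_END terminates it.
     * Illustrative arithmetic only, not code from the patch.
     */
    static int rc6_wa_bb_dwords(int n_mux, int n_bool)
    {
            int mux_hdrs  = (n_mux  + 15) / 16;
            int bool_hdrs = (n_bool + 15) / 16;

            return 4 +                              /* GDT_CHICKEN_BITS, 0xA0 */
                   2 * mux_hdrs  + 2 * n_mux +      /* NOA MUX programming    */
                   4 +                              /* GDT_CHICKEN_BITS, 0x80 */
                   2 * bool_hdrs + 2 * n_bool +     /* boolean counter regs   */
                   1;                               /* MI_BATCH_BUFFER_END    */
    }

    int main(void)
    {
            /* e.g. roughly 100 MUX registers and 20 boolean-counter registers */
            printf("%d DWORDs\n", rc6_wa_bb_dwords(100, 20));
            return 0;
    }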
@@ -1436,6 +1463,7 @@ static int intel_init_workaround_bb(struct intel_engine_cs *ring)
                                             &offset);
                 if (ret)
                         goto out;
+
         } else if (INTEL_INFO(ring->dev)->gen == 9) {
                 ret = gen9_init_indirectctx_bb(ring,
                                                &wa_ctx->indirect_ctx,
@@ -1452,6 +1480,11 @@ static int intel_init_workaround_bb(struct intel_engine_cs *ring)
                         goto out;
         }
 
+        ret = init_oa_rc6_wa_bb(ring,
+                                &wa_ctx->per_ctx_rc6,
+                                batch,
+                                &offset);
+
 out:
         kunmap_atomic(batch);
         if (ret)
@@ -2305,7 +2338,25 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
         reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0;
         if (ring->wa_ctx.obj) {
                 struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
-                uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);
+                struct drm_i915_gem_object *wa_ctx_obj = wa_ctx->obj;
+                uint32_t ggtt_offset;
+
+                /* To support RC6 while capturing metrics with the OA
+                 * unit, we need to restore the NOA Mux configuration
+                 * upon exiting RC6. While using CTX_WA_PTR would be a
+                 * better solution here, there is reportedly a HW bug on
+                 * SKL+ where that batch is not always executed, so using
+                 * the per-ctx batch seems to be the next best solution.
+                 *
+                 * Given that we need to be able to update the per-ctx
+                 * batch on the fly with the NOA Mux config, we
+                 * double-buffer the batch, as the batch could
+                 * potentially still be in use by a restore in progress.
+                 */
+                if (dev_priv->perf.oa.enable_rc6)
+                        wa_ctx_obj = i915_oa_ctx_wa_obj(dev_priv);
+
+                ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx_obj);
 
                 reg_state[CTX_RCS_INDIRECT_CTX+1] =
                         (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) |

drivers/gpu/drm/i915/intel_ringbuffer.h

Lines changed: 2 additions & 1 deletion
@@ -139,7 +139,8 @@ struct i915_ctx_workarounds {
         struct i915_wa_ctx_bb {
                 u32 offset;
                 u32 size;
-        } indirect_ctx, per_ctx;
+        } indirect_ctx, per_ctx, per_ctx_rc6;
+
         struct drm_i915_gem_object *obj;
 };
