author    Changbin Du <changbin.du@intel.com>    2017-05-02 21:20:10 -0400
committer Zhenyu Wang <zhenyuw@linux.intel.com>  2017-06-08 01:59:14 -0400
commit    ffc197763e636b928963c5dd9a3eaea8146345e3 (patch)
tree      7b047655b878a9741d24ae69d134f60eb6409da1
parent    cd9f4688a3297c0df0eecc2adaae5812d3e5b997 (diff)
drm/i915/gvt: rewrite the trace gvt:gvt_command using a trace-style approach
The gvt:gvt_command trace incurs unnecessary overhead even when the
trace is not enabled, so it needs improvement. The kernel trace
infrastructure already provides a full API for defining trace events,
and we should leverage it where possible. One important rule is that a
trace point should store raw data, not a formatted string.

This patch consists of two parts:

1) Refactor the gvt_command trace definition:
   o store only raw trace data.
   o use __dynamic_array() to declare a variable-size buffer.
   o use __print_array() to format the raw cmd data.
   o rename vm_id to vgpu_id.

2) Improve how the trace is invoked:
   o drop the cycles accounting around the command handler; the same
     data can be obtained with any perf tool.
   o do not make a backup copy of the raw cmd data, which is
     unnecessary.

With this patch, the trace adds no overhead when it is not enabled, and
the event now follows the standard trace style.

Example of the final output:

gvt workload 0-211 [000] ...1 120.555964: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e880, raw cmd {0x4000000}
gvt workload 0-211 [000] ...1 120.556014: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e884, raw cmd {0x7a000004,0x1004000,0xe1511018,0x0,0x7d,0x0}
gvt workload 0-211 [000] ...1 120.556062: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e89c, raw cmd {0x7a000004,0x140000,0x0,0x0,0x0,0x0}
gvt workload 0-211 [000] ...1 120.556110: gvt_command: vgpu1 ring 0: buf_type 0, ip_gma e161e8b4, raw cmd {0x10400002,0xe1511018,0x0,0x7d}

Signed-off-by: Changbin Du <changbin.du@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
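For reference, the reworked event can be exercised through the standard
ftrace interface. A minimal sketch, assuming tracefs is mounted at the
usual /sys/kernel/debug/tracing location and the event is registered
under the "gvt" trace system:

  # enable the event and stream its output
  echo 1 > /sys/kernel/debug/tracing/events/gvt/gvt_command/enable
  cat /sys/kernel/debug/tracing/trace_pipe

  # because the event now records raw fields rather than a pre-formatted
  # string, it can also be filtered on those fields, e.g. ring 0 only
  echo 'ring_id == 0' > /sys/kernel/debug/tracing/events/gvt/gvt_command/filter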
-rw-r--r--  drivers/gpu/drm/i915/gvt/cmd_parser.c |  50
-rw-r--r--  drivers/gpu/drm/i915/gvt/trace.h      |  78
2 files changed, 32 insertions(+), 96 deletions(-)
diff --git a/drivers/gpu/drm/i915/gvt/cmd_parser.c b/drivers/gpu/drm/i915/gvt/cmd_parser.c
index 41b2c3aaa04a..5634eb1fa24b 100644
--- a/drivers/gpu/drm/i915/gvt/cmd_parser.c
+++ b/drivers/gpu/drm/i915/gvt/cmd_parser.c
@@ -2414,53 +2414,13 @@ static void add_cmd_entry(struct intel_gvt *gvt, struct cmd_entry *e)
 	hash_add(gvt->cmd_table, &e->hlist, e->info->opcode);
 }
 
-#define GVT_MAX_CMD_LENGTH 20  /* In Dword */
-
-static void trace_cs_command(struct parser_exec_state *s,
-		cycles_t cost_pre_cmd_handler, cycles_t cost_cmd_handler)
-{
-	/* This buffer is used by ftrace to store all commands copied from
-	 * guest gma space. Sometimes commands can cross pages, this should
-	 * not be handled in ftrace logic. So this is just used as a
-	 * 'bounce buffer'
-	 */
-	u32 cmd_trace_buf[GVT_MAX_CMD_LENGTH];
-	int i;
-	u32 cmd_len = cmd_length(s);
-	/* The chosen value of GVT_MAX_CMD_LENGTH are just based on
-	 * following two considerations:
-	 * 1) From observation, most common ring commands is not that long.
-	 *    But there are execeptions. So it indeed makes sence to observe
-	 *    longer commands.
-	 * 2) From the performance and debugging point of view, dumping all
-	 *    contents of very commands is not necessary.
-	 * We mgith shrink GVT_MAX_CMD_LENGTH or remove this trace event in
-	 * future for performance considerations.
-	 */
-	if (unlikely(cmd_len > GVT_MAX_CMD_LENGTH)) {
-		gvt_dbg_cmd("cmd length exceed tracing limitation!\n");
-		cmd_len = GVT_MAX_CMD_LENGTH;
-	}
-
-	for (i = 0; i < cmd_len; i++)
-		cmd_trace_buf[i] = cmd_val(s, i);
-
-	trace_gvt_command(s->vgpu->id, s->ring_id, s->ip_gma, cmd_trace_buf,
-			cmd_len, s->buf_type == RING_BUFFER_INSTRUCTION,
-			cost_pre_cmd_handler, cost_cmd_handler);
-}
-
 /* call the cmd handler, and advance ip */
 static int cmd_parser_exec(struct parser_exec_state *s)
 {
+	struct intel_vgpu *vgpu = s->vgpu;
 	struct cmd_info *info;
 	u32 cmd;
 	int ret = 0;
-	cycles_t t0, t1, t2;
-	struct parser_exec_state s_before_advance_custom;
-	struct intel_vgpu *vgpu = s->vgpu;
-
-	t0 = get_cycles();
 
 	cmd = cmd_val(s, 0);
 
@@ -2475,9 +2435,8 @@ static int cmd_parser_exec(struct parser_exec_state *s)
 
 	s->info = info;
 
-	t1 = get_cycles();
-
-	s_before_advance_custom = *s;
+	trace_gvt_command(vgpu->id, s->ring_id, s->ip_gma, s->ip_va,
+			  cmd_length(s), s->buf_type);
 
 	if (info->handler) {
 		ret = info->handler(s);
@@ -2486,9 +2445,6 @@ static int cmd_parser_exec(struct parser_exec_state *s)
 			return ret;
 		}
 	}
-	t2 = get_cycles();
-
-	trace_cs_command(&s_before_advance_custom, t1 - t0, t2 - t1);
 
 	if (!(info->flag & F_IP_ADVANCE_CUSTOM)) {
 		ret = cmd_advance_default(s);
diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h
index 53a2d10cf3f1..9171291e36c6 100644
--- a/drivers/gpu/drm/i915/gvt/trace.h
+++ b/drivers/gpu/drm/i915/gvt/trace.h
@@ -224,57 +224,37 @@ TRACE_EVENT(oos_sync,
 	TP_printk("%s", __entry->buf)
 );
 
-#define MAX_CMD_STR_LEN 256
 TRACE_EVENT(gvt_command,
-	TP_PROTO(u8 vm_id, u8 ring_id, u32 ip_gma, u32 *cmd_va, u32 cmd_len, bool ring_buffer_cmd, cycles_t cost_pre_cmd_handler, cycles_t cost_cmd_handler),
-
-	TP_ARGS(vm_id, ring_id, ip_gma, cmd_va, cmd_len, ring_buffer_cmd, cost_pre_cmd_handler, cost_cmd_handler),
-
-	TP_STRUCT__entry(
-		__field(u8, vm_id)
-		__field(u8, ring_id)
-		__field(int, i)
-		__array(char, tmp_buf, MAX_CMD_STR_LEN)
-		__array(char, cmd_str, MAX_CMD_STR_LEN)
-	),
-
-	TP_fast_assign(
-		__entry->vm_id = vm_id;
-		__entry->ring_id = ring_id;
-		__entry->cmd_str[0] = '\0';
-		snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "VM(%d) Ring(%d): %s ip(%08x) pre handler cost (%llu), handler cost (%llu) ", vm_id, ring_id, ring_buffer_cmd ? "RB":"BB", ip_gma, cost_pre_cmd_handler, cost_cmd_handler);
-		strcat(__entry->cmd_str, __entry->tmp_buf);
-		entry->i = 0;
-		while (cmd_len > 0) {
-			if (cmd_len >= 8) {
-				snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x %08x %08x %08x %08x %08x %08x ",
-					cmd_va[__entry->i], cmd_va[__entry->i+1], cmd_va[__entry->i+2], cmd_va[__entry->i+3],
-					cmd_va[__entry->i+4], cmd_va[__entry->i+5], cmd_va[__entry->i+6], cmd_va[__entry->i+7]);
-				__entry->i += 8;
-				cmd_len -= 8;
-				strcat(__entry->cmd_str, __entry->tmp_buf);
-			} else if (cmd_len >= 4) {
-				snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x %08x %08x ",
-					cmd_va[__entry->i], cmd_va[__entry->i+1], cmd_va[__entry->i+2], cmd_va[__entry->i+3]);
-				__entry->i += 4;
-				cmd_len -= 4;
-				strcat(__entry->cmd_str, __entry->tmp_buf);
-			} else if (cmd_len >= 2) {
-				snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x %08x ", cmd_va[__entry->i], cmd_va[__entry->i+1]);
-				__entry->i += 2;
-				cmd_len -= 2;
-				strcat(__entry->cmd_str, __entry->tmp_buf);
-			} else if (cmd_len == 1) {
-				snprintf(__entry->tmp_buf, MAX_CMD_STR_LEN, "%08x ", cmd_va[__entry->i]);
-				__entry->i += 1;
-				cmd_len -= 1;
-				strcat(__entry->cmd_str, __entry->tmp_buf);
-			}
-		}
-		strcat(__entry->cmd_str, "\n");
-	),
+	TP_PROTO(u8 vgpu_id, u8 ring_id, u32 ip_gma, u32 *cmd_va, u32 cmd_len,
+		u32 buf_type),
+
+	TP_ARGS(vgpu_id, ring_id, ip_gma, cmd_va, cmd_len, buf_type),
+
+	TP_STRUCT__entry(
+		__field(u8, vgpu_id)
+		__field(u8, ring_id)
+		__field(u32, ip_gma)
+		__field(u32, buf_type)
+		__field(u32, cmd_len)
+		__dynamic_array(u32, raw_cmd, cmd_len)
+	),
+
+	TP_fast_assign(
+		__entry->vgpu_id = vgpu_id;
+		__entry->ring_id = ring_id;
+		__entry->ip_gma = ip_gma;
+		__entry->buf_type = buf_type;
+		__entry->cmd_len = cmd_len;
+		memcpy(__get_dynamic_array(raw_cmd), cmd_va, cmd_len * sizeof(*cmd_va));
+	),
+
 
-	TP_printk("%s", __entry->cmd_str)
+	TP_printk("vgpu%d ring %d: buf_type %u, ip_gma %08x, raw cmd %s",
+		__entry->vgpu_id,
+		__entry->ring_id,
+		__entry->buf_type,
+		__entry->ip_gma,
+		__print_array(__get_dynamic_array(raw_cmd), __entry->cmd_len, 4))
 );
 #endif /* _GVT_TRACE_H_ */
 