1 files changed, 72 insertions, 0 deletions
diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c
index cde91c34b101..90a19336310b 100644
--- a/tools/perf/examples/bpf/augmented_raw_syscalls.c
+++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -37,15 +37,87 @@ struct syscall_exit_args {
        long               ret;
 };
+struct augmented_filename { /* filename payload that sys_enter() places right after the raw syscall args */
+        unsigned int    size; /* assigned the return value of probe_read_str() in sys_enter() */
+        int             reserved; /* always written as 0 by sys_enter() */
+        char            value[256]; /* destination buffer for the filename string read by probe_read_str() */
+};
+#define SYS_OPEN 2 /* compared with syscall_nr in sys_enter(); filename pointer is taken from args[0]. NOTE(review): presumably the x86_64 syscall number - confirm for other arches */
+#define SYS_OPENAT 257 /* compared with syscall_nr in sys_enter(); filename pointer is taken from args[1]. NOTE(review): presumably the x86_64 syscall number - confirm for other arches */
 SEC("raw_syscalls:sys_enter")
+/*
+ * sys_enter - raw_syscalls:sys_enter handler that augments open/openat.
+ * @args: tracepoint context; it is copied to the start of the payload.
+ *
+ * Copies the raw syscall arguments and, when syscall_nr is SYS_OPEN or
+ * SYS_OPENAT, appends a struct augmented_filename holding the pathname read
+ * with probe_read_str(). The record is emitted with perf_event_output()
+ * using __augmented_syscalls__, with the unused tail of filename.value
+ * trimmed off. For every other syscall only the raw arguments are emitted.
+ *
+ * Always returns 0.
+ */
 int sys_enter(struct syscall_enter_args *args)
 {
        struct {
                struct syscall_enter_args args;
+                struct augmented_filename filename;
        } augmented_args;
        unsigned int len = sizeof(augmented_args);
+        const void *filename_arg = NULL;
        probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
+        /*
+         * Yonghong and Edward Cree sayz:
+         *
+         * https://www.spinics.net/lists/netdev/msg531645.html
+         *
+         * >>   R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
+         * >> 10: (bf) r1 = r6
+         * >> 11: (07) r1 += 16
+         * >> 12: (05) goto pc+2
+         * >> 15: (79) r3 = *(u64 *)(r1 +0)
+         * >> dereference of modified ctx ptr R1 off=16 disallowed
+         * > Aha, we at least got a different error message this time.
+         * > And indeed llvm has done that optimisation, rather than the more obvious
+         * > 11: r3 = *(u64 *)(r1 +16)
+         * > because it wants to have lots of reads share a single insn.  You may be able
+         * > to defeat that optimisation by adding compiler barriers, idk.  Maybe someone
+         * > with llvm knowledge can figure out how to stop it (ideally, llvm would know
+         * > when it's generating for bpf backend and not do that).  -O0?  ¯\_(ツ)_/¯
+         *
+         * The optimization mostly looks like the below:
+         *
+         *      br1:
+         *      ...
+         *      r1 += 16
+         *      goto merge
+         *      br2:
+         *      ...
+         *      r1 += 20
+         *      goto merge
+         *      merge:
+         *      *(u64 *)(r1 + 0)
+         *
+         * The compiler tries to merge common loads. There is no easy way to
+         * stop this compiler optimization without turning off a lot of other
+         * optimizations. The easiest way is to add barriers:
+         *
+         *       __asm__ __volatile__("": : :"memory")
+         *
+         *       after the ctx memory access to prevent their down stream merging.
+         */
+        switch (augmented_args.args.syscall_nr) {
+        case SYS_OPEN:   filename_arg = (const void *)args->args[0];
+                        __asm__ __volatile__("": : :"memory");
+                         break;
+        case SYS_OPENAT: filename_arg = (const void *)args->args[1];
+                         break;
+        }
+        if (filename_arg != NULL) {
+                unsigned int string_len = probe_read_str(&augmented_args.filename.value,
+                                                         sizeof(augmented_args.filename.value),
+                                                         filename_arg);
+                augmented_args.filename.reserved = 0;
+                /*
+                 * NOTE(review): the helper's return value is stored as is, so a
+                 * negative error would show up as a huge unsigned size and the
+                 * full-size record is emitted. Confirm this is what the
+                 * userspace consumer expects.
+                 */
+                augmented_args.filename.size = string_len;
+                if (string_len < sizeof(augmented_args.filename.value)) {
+                        /*
+                         * Trim the unused tail of filename.value. Only the
+                         * string length is masked: masking the total 'len' with
+                         * sizeof(value) - 1 wrapped any record that reached 256
+                         * bytes, truncating the payload for long pathnames.
+                         * The mask cannot change string_len in this branch, it
+                         * just states the upper bound explicitly.
+                         */
+                        len -= sizeof(augmented_args.filename.value);
+                        len += string_len & (sizeof(augmented_args.filename.value) - 1);
+                }
+        } else {
+                len = sizeof(augmented_args.args);
+        }
        perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
        return 0;
 }

diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c index cde91c34b101..90a19336310b 100644 --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c
@@ -37,15 +37,87 @@ struct syscall_exit_args {
37	long ret;	37	long ret;
38	};	38	};
39		39
		40	struct augmented_filename {
		41	unsigned int size;
		42	int reserved;
		43	char value[256];
		44	};
		45
		46	#define SYS_OPEN 2
		47	#define SYS_OPENAT 257
		48
40	SEC("raw_syscalls:sys_enter")	49	SEC("raw_syscalls:sys_enter")
41	int sys_enter(struct syscall_enter_args *args)	50	int sys_enter(struct syscall_enter_args *args)
42	{	51	{
43	struct {	52	struct {
44	struct syscall_enter_args args;	53	struct syscall_enter_args args;
		54	struct augmented_filename filename;
45	} augmented_args;	55	} augmented_args;
46	unsigned int len = sizeof(augmented_args);	56	unsigned int len = sizeof(augmented_args);
		57	const void *filename_arg = NULL;
47		58
48	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);	59	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);
		60	/*
		61	* Yonghong and Edward Cree sayz:
		62	*
		63	* https://www.spinics.net/lists/netdev/msg531645.html
		64	*
		65	* >> R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
		66	* >> 10: (bf) r1 = r6
		67	* >> 11: (07) r1 += 16
		68	* >> 12: (05) goto pc+2
		69	* >> 15: (79) r3 = *(u64 *)(r1 +0)
		70	* >> dereference of modified ctx ptr R1 off=16 disallowed
		71	* > Aha, we at least got a different error message this time.
		72	* > And indeed llvm has done that optimisation, rather than the more obvious
		73	* > 11: r3 = *(u64 *)(r1 +16)
		74	* > because it wants to have lots of reads share a single insn. You may be able
		75	* > to defeat that optimisation by adding compiler barriers, idk. Maybe someone
		76	* > with llvm knowledge can figure out how to stop it (ideally, llvm would know
		77	* > when it's generating for bpf backend and not do that). -O0? ¯\_(ツ)_/¯
		78	*
		79	* The optimization mostly looks like the below:
		80	*
		81	* br1:
		82	* ...
		83	* r1 += 16
		84	* goto merge
		85	* br2:
		86	* ...
		87	* r1 += 20
		88	* goto merge
		89	* merge:
		90	* *(u64 *)(r1 + 0)
		91	*
		92	* The compiler tries to merge common loads. There is no easy way to
		93	* stop this compiler optimization without turning off a lot of other
		94	* optimizations. The easiest way is to add barriers:
		95	*
		96	* __asm__ __volatile__("": : :"memory")
		97	*
		98	* after the ctx memory access to prevent their down stream merging.
		99	*/
		100	switch (augmented_args.args.syscall_nr) {
		101	case SYS_OPEN: filename_arg = (const void *)args->args[0];
		102	__asm__ __volatile__("": : :"memory");
		103	break;
		104	case SYS_OPENAT: filename_arg = (const void *)args->args[1];
		105	break;
		106	}
		107
		108	if (filename_arg != NULL) {
		109	augmented_args.filename.reserved = 0;
		110	augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
		111	sizeof(augmented_args.filename.value),
		112	filename_arg);
		113	if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {
		114	len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;
		115	len &= sizeof(augmented_args.filename.value) - 1;
		116	}
		117	} else {
		118	len = sizeof(augmented_args.args);
		119	}
		120
49	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);	121	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
50	return 0;	122	return 0;
51	}	123	}