author		Daniel Borkmann <daniel@iogearbox.net>		2018-06-28 17:34:59 -0400
committer	Alexei Starovoitov <ast@kernel.org>		2018-06-29 13:47:35 -0400
commit		85782e037f8aba8922dadb24a1523ca0b82ab8bc (patch)
tree		3d2eec5426e774dc0824a2bb3ebc883cadb642d6
parent		f605ce5eb26ac934fb8106d75d46a2c875a2bf23 (diff)
bpf: undo prog rejection on read-only lock failure
Partially undo commit 9facc336876f ("bpf: reject any prog that failed read-only lock") since it caused a regression: syzkaller managed to cause a panic via fault injection deep in the set_memory_ro() path by letting an allocation fail. In x86's __change_page_attr_set_clr() it was able to change the attributes of the primary mapping but not of the alias mapping via cpa_process_alias(), so the second, inner call to __change_page_attr() via __change_page_attr_set_clr() had to split a larger page and failed in alloc_pages() with the artificially triggered allocation error, which is then propagated down to the call site.

Thus, for set_memory_ro() this means that it returned with an error, but from debugging, a probe_kernel_write() revealed EFAULT on that memory since the primary mapping did get changed. Therefore the subsequent hdr->locked = 0 reset triggered the panic, as it was performed on read-only memory. The call-site assumption that set_memory_ro() would either fully succeed /or/ not succeed at all was in fact wrong, since there is no rollback in set_memory_*() calls after a partial change of mappings; in other words, we are left in a state that is "half done". A later undo via set_memory_rw() does succeed, though, due to matching permissions on that part (aka due to try_preserve_large_page() succeeding).

While reproducing locally with this error triggered explicitly, the initial splitting only happens on rare occasions, and in the real world it would additionally need OOM conditions; but, that said, it could partially fail. Therefore, it is definitely wrong to bail out on a set_memory_ro() error and reject the program with the set_memory_*() semantics we have today. We shouldn't have gone the extra mile, since no other user in tree today in fact checks for any set_memory_*() errors, e.g. neither module_enable_ro() / module_disable_ro() for module RO/NX handling, which is mostly default these days, nor the kprobes core with alloc_insn_page() / free_insn_page(), as examples that could be invoked long after bootup; and the original 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") did not check either when it first got introduced to BPF. So "improving" by bailing out was clearly not right when set_memory_*() cannot handle it today.

Kees suggested that if set_memory_*() can fail, we should annotate it with __must_check, and all callers need to deal with it gracefully, given those set_memory_*() markings aren't "advisory" but are expected to actually do what they say. This might be an option worth moving forward with in future, but it would at the same time require that set_memory_*() calls from supporting archs are guaranteed to be "atomic" in the sense that they provide rollback if part of the range fails. Once that happened, the RW -> RO transition could be made more robust that way, while the subsequent RO -> RW transition /must/ continue to guarantee that the undo part always succeeds.

Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com
Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com
Fixes: 9facc336876f ("bpf: reject any prog that failed read-only lock")
Cc: Laura Abbott <labbott@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
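The sequence that blows up is visible in the pre-patch locking helpers (removed in the diff below): when set_memory_ro() returns an error even though the primary mapping already became read-only, the rollback store lands on a read-only page:

	/* Pre-patch bpf_jit_binary_lock_ro(); the rollback store is what
	 * panics when set_memory_ro() failed only partially.
	 */
	hdr->locked = 1;
	if (set_memory_ro((unsigned long)hdr, hdr->pages))
		hdr->locked = 0;	/* hdr may already be read-only here */

If set_memory_*() were annotated as Kees suggested, the interface would look roughly like the following; this is only a sketch of the suggestion assuming today's prototypes, not part of this patch:

	/* Hypothetical annotation; would additionally require set_memory_*()
	 * to roll back the whole range on partial failure before callers
	 * could rely on it being all-or-nothing.
	 */
	int __must_check set_memory_ro(unsigned long addr, int numpages);
	int __must_check set_memory_rw(unsigned long addr, int numpages);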
-rw-r--r--	include/linux/filter.h	56
-rw-r--r--	kernel/bpf/core.c	30
2 files changed, 9 insertions, 77 deletions
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 20f2659dd829..300baad62c88 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -470,9 +470,7 @@ struct sock_fprog_kern {
 };
 
 struct bpf_binary_header {
-	u16 pages;
-	u16 locked:1;
-
+	u32 pages;
 	/* Some arches need word alignment for their instructions */
 	u8 image[] __aligned(4);
 };
@@ -481,7 +479,7 @@ struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				jit_requested:1,/* archs need to JIT the prog */
-				locked:1,	/* Program image locked? */
+				undo_set_mem:1,	/* Passed set_memory_ro() checkpoint */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1,	/* Do we need dst entry? */
@@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	fp->locked = 1;
-	if (set_memory_ro((unsigned long)fp, fp->pages))
-		fp->locked = 0;
-#endif
+	fp->undo_set_mem = 1;
+	set_memory_ro((unsigned long)fp, fp->pages);
 }
 
 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	if (fp->locked) {
-		WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
-		/* In case set_memory_rw() fails, we want to be the first
-		 * to crash here instead of some random place later on.
-		 */
-		fp->locked = 0;
-	}
-#endif
+	if (fp->undo_set_mem)
+		set_memory_rw((unsigned long)fp, fp->pages);
 }
 
 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	hdr->locked = 1;
-	if (set_memory_ro((unsigned long)hdr, hdr->pages))
-		hdr->locked = 0;
-#endif
+	set_memory_ro((unsigned long)hdr, hdr->pages);
 }
 
 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	if (hdr->locked) {
-		WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages));
-		/* In case set_memory_rw() fails, we want to be the first
-		 * to crash here instead of some random place later on.
-		 */
-		hdr->locked = 0;
-	}
-#endif
+	set_memory_rw((unsigned long)hdr, hdr->pages);
 }
 
 static inline struct bpf_binary_header *
@@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp)
 	return (void *)addr;
 }
 
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp)
-{
-	if (!fp->locked)
-		return -ENOLCK;
-	if (fp->jited) {
-		const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
-
-		if (!hdr->locked)
-			return -ENOLCK;
-	}
-
-	return 0;
-}
-#endif
-
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	bpf_fill_ill_insns(hdr, size);
 
 	hdr->pages = size / PAGE_SIZE;
-	hdr->locked = 0;
-
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 	return 0;
 }
 
-static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
-{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	int i, err;
-
-	for (i = 0; i < fp->aux->func_cnt; i++) {
-		err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
-		if (err)
-			return err;
-	}
-
-	return bpf_prog_check_pages_ro_single(fp);
-#endif
-	return 0;
-}
-
 static void bpf_prog_select_func(struct bpf_prog *fp)
 {
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
 	 * all eBPF JITs might immediately support all features.
 	 */
 	*err = bpf_check_tail_call(fp);
-	if (*err)
-		return fp;
-
-	/* Checkpoint: at this point onwards any cBPF -> eBPF or
-	 * native eBPF program is read-only. If we failed to change
-	 * the page attributes (e.g. allocation failure from
-	 * splitting large pages), then reject the whole program
-	 * in order to guarantee not ending up with any W+X pages
-	 * from BPF side in kernel.
-	 */
-	*err = bpf_prog_check_pages_ro_locked(fp);
+
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
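For context, a condensed sketch of how an arch JIT ends up using these helpers, loosely modeled on x86's bpf_int_jit_compile(); the surrounding code and the alignment value are simplified placeholders, not the actual implementation:

	struct bpf_binary_header *header;
	u8 *image;

	/* Allocate the JIT image plus header; alignment is arch specific. */
	header = bpf_jit_binary_alloc(proglen, &image, 4, jit_fill_hole);
	if (!header)
		return prog;	/* fall back to the original prog / interpreter */

	/* ... emit the final instruction stream into image ... */

	bpf_jit_binary_lock_ro(header);	/* best effort again: no rejection on failure */
	prog->bpf_func = (void *)image;
	prog->jited = 1;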