author		Daniel Borkmann <daniel@iogearbox.net>		2018-06-28 17:34:59 -0400
committer	Alexei Starovoitov <ast@kernel.org>		2018-06-29 13:47:35 -0400
commit		85782e037f8aba8922dadb24a1523ca0b82ab8bc (patch)
tree		3d2eec5426e774dc0824a2bb3ebc883cadb642d6
parent		f605ce5eb26ac934fb8106d75d46a2c875a2bf23 (diff)
bpf: undo prog rejection on read-only lock failure
Partially undo commit 9facc336876f ("bpf: reject any prog that failed read-only lock") since it caused a regression: syzkaller managed to cause a panic via fault injection deep in the set_memory_ro() path by letting an allocation fail. In x86's __change_page_attr_set_clr() it was able to change the attributes of the primary mapping but not of the alias mapping via cpa_process_alias(), so the second, inner call to __change_page_attr() via __change_page_attr_set_clr() had to split a larger page and failed in alloc_pages() with the artificially triggered allocation error, which is then propagated down to the call site.

Thus, for set_memory_ro() this means that it returned with an error, but from debugging, a probe_kernel_write() revealed EFAULT on that memory since the primary mapping did get changed. Therefore the subsequent hdr->locked = 0 reset triggered the panic, as it was performed on read-only memory. The call-site assumption that set_memory_ro() would either fully succeed /or/ not succeed at all was in fact wrong, since there is no rollback in set_memory_*() calls after a partial change of mappings; in other words, we are left in a state that is "half done". A later undo via set_memory_rw() does succeed, though, due to matching permissions on that part (aka due to try_preserve_large_page() succeeding).

While reproducing locally with this error triggered explicitly, the initial splitting only happens on rare occasions, and in the real world it would additionally need OOM conditions; but, that said, it could partially fail. Therefore, it is definitely wrong to bail out on a set_memory_ro() error and reject the program with the set_memory_*() semantics we have today. We shouldn't have gone the extra mile, since no other user in tree today in fact checks for any set_memory_*() errors, e.g. neither module_enable_ro() / module_disable_ro() for module RO/NX handling, which is mostly default these days, nor the kprobes core with alloc_insn_page() / free_insn_page(), as examples that could be invoked long after bootup; and the original 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit against spraying attacks") did not check either when it first got introduced to BPF. So "improving" by bailing out was clearly not right when set_memory_*() cannot handle it today.

Kees suggested that if set_memory_*() can fail, we should annotate it with __must_check, and all callers need to deal with it gracefully, given those set_memory_*() markings aren't "advisory" but are expected to actually do what they say. This might be an option worth moving forward with in future, but it would at the same time require that set_memory_*() calls from supporting archs are guaranteed to be "atomic" in the sense that they provide rollback if part of the range fails. Once that happened, the RW -> RO transition could be made more robust that way, while the subsequent RO -> RW transition /must/ continue to guarantee that the undo part always succeeds.

Reported-by: syzbot+a4eb8c7766952a1ca872@syzkaller.appspotmail.com
Reported-by: syzbot+d866d1925855328eac3b@syzkaller.appspotmail.com
Fixes: 9facc336876f ("bpf: reject any prog that failed read-only lock")
Cc: Laura Abbott <labbott@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
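The sequence that blows up is visible in the pre-patch locking helpers (removed in the diff below): when set_memory_ro() returns an error even though the primary mapping already became read-only, the rollback store lands on a read-only page:

	/* Pre-patch bpf_jit_binary_lock_ro(); the rollback store is what
	 * panics when set_memory_ro() failed only partially.
	 */
	hdr->locked = 1;
	if (set_memory_ro((unsigned long)hdr, hdr->pages))
		hdr->locked = 0;	/* hdr may already be read-only here */

If set_memory_*() were annotated as Kees suggested, the interface would look roughly like the following; this is only a sketch of the suggestion assuming today's prototypes, not part of this patch:

	/* Hypothetical annotation; would additionally require set_memory_*()
	 * to roll back the whole range on partial failure before callers
	 * could rely on it being all-or-nothing.
	 */
	int __must_check set_memory_ro(unsigned long addr, int numpages);
	int __must_check set_memory_rw(unsigned long addr, int numpages);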
-rw-r--r--	include/linux/filter.h	56
-rw-r--r--	kernel/bpf/core.c	30
2 files changed, 9 insertions, 77 deletions
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 20f2659dd829..300baad62c88 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -470,9 +470,7 @@ struct sock_fprog_kern {
 };
 
 struct bpf_binary_header {
-	u16 pages;
-	u16 locked:1;
-
+	u32 pages;
 	/* Some arches need word alignment for their instructions */
 	u8 image[] __aligned(4);
 };
@@ -481,7 +479,7 @@ struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				jit_requested:1,/* archs need to JIT the prog */
-				locked:1,	/* Program image locked? */
+				undo_set_mem:1,	/* Passed set_memory_ro() checkpoint */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1,	/* Do we need dst entry? */
@@ -677,46 +675,24 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
 
 static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	fp->locked = 1;
-	if (set_memory_ro((unsigned long)fp, fp->pages))
-		fp->locked = 0;
-#endif
+	fp->undo_set_mem = 1;
+	set_memory_ro((unsigned long)fp, fp->pages);
 }
 
 static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	if (fp->locked) {
-		WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages));
-		/* In case set_memory_rw() fails, we want to be the first
-		 * to crash here instead of some random place later on.
-		 */
-		fp->locked = 0;
-	}
-#endif
+	if (fp->undo_set_mem)
+		set_memory_rw((unsigned long)fp, fp->pages);
 }
 
 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	hdr->locked = 1;
-	if (set_memory_ro((unsigned long)hdr, hdr->pages))
-		hdr->locked = 0;
-#endif
+	set_memory_ro((unsigned long)hdr, hdr->pages);
 }
 
 static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr)
 {
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	if (hdr->locked) {
-		WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages));
-		/* In case set_memory_rw() fails, we want to be the first
-		 * to crash here instead of some random place later on.
-		 */
-		hdr->locked = 0;
-	}
-#endif
+	set_memory_rw((unsigned long)hdr, hdr->pages);
 }
 
 static inline struct bpf_binary_header *
@@ -728,22 +704,6 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp)
 	return (void *)addr;
 }
 
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp)
-{
-	if (!fp->locked)
-		return -ENOLCK;
-	if (fp->jited) {
-		const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
-
-		if (!hdr->locked)
-			return -ENOLCK;
-	}
-
-	return 0;
-}
-#endif
-
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
 static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a9e6c04d0f4a..1e5625d46414 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -598,8 +598,6 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	bpf_fill_ill_insns(hdr, size);
 
 	hdr->pages = size / PAGE_SIZE;
-	hdr->locked = 0;
-
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
 	start = (get_random_int() % hole) & ~(alignment - 1);
@@ -1450,22 +1448,6 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 	return 0;
 }
 
-static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp)
-{
-#ifdef CONFIG_ARCH_HAS_SET_MEMORY
-	int i, err;
-
-	for (i = 0; i < fp->aux->func_cnt; i++) {
-		err = bpf_prog_check_pages_ro_single(fp->aux->func[i]);
-		if (err)
-			return err;
-	}
-
-	return bpf_prog_check_pages_ro_single(fp);
-#endif
-	return 0;
-}
-
 static void bpf_prog_select_func(struct bpf_prog *fp)
 {
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
@@ -1524,17 +1506,7 @@ finalize:
 	 * all eBPF JITs might immediately support all features.
 	 */
 	*err = bpf_check_tail_call(fp);
-	if (*err)
-		return fp;
-
-	/* Checkpoint: at this point onwards any cBPF -> eBPF or
-	 * native eBPF program is read-only. If we failed to change
-	 * the page attributes (e.g. allocation failure from
-	 * splitting large pages), then reject the whole program
-	 * in order to guarantee not ending up with any W+X pages
-	 * from BPF side in kernel.
-	 */
-	*err = bpf_prog_check_pages_ro_locked(fp);
+
 	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
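For context, a condensed sketch of how an arch JIT ends up using these helpers, loosely modeled on x86's bpf_int_jit_compile(); the surrounding code and the alignment value are simplified placeholders, not the actual implementation:

	struct bpf_binary_header *header;
	u8 *image;

	/* Allocate the JIT image plus header; alignment is arch specific. */
	header = bpf_jit_binary_alloc(proglen, &image, 4, jit_fill_hole);
	if (!header)
		return prog;	/* fall back to the original prog / interpreter */

	/* ... emit the final instruction stream into image ... */

	bpf_jit_binary_lock_ro(header);	/* best effort again: no rejection on failure */
	prog->bpf_func = (void *)image;
	prog->jited = 1;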