Diffstat (limited to 'kernel')
47 files changed, 1998 insertions, 847 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ef0d95a190b4..48c5376d290a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -64,6 +64,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o | |||
64 | obj-$(CONFIG_KEXEC_CORE) += kexec_core.o | 64 | obj-$(CONFIG_KEXEC_CORE) += kexec_core.o |
65 | obj-$(CONFIG_KEXEC) += kexec.o | 65 | obj-$(CONFIG_KEXEC) += kexec.o |
66 | obj-$(CONFIG_KEXEC_FILE) += kexec_file.o | 66 | obj-$(CONFIG_KEXEC_FILE) += kexec_file.o |
67 | obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o | ||
67 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o | 68 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o |
68 | obj-$(CONFIG_COMPAT) += compat.o | 69 | obj-$(CONFIG_COMPAT) += compat.o |
69 | obj-$(CONFIG_CGROUPS) += cgroup/ | 70 | obj-$(CONFIG_CGROUPS) += cgroup/ |
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8191a7db2777..66088a9e9b9e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
@@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, | |||
890 | 890 | ||
891 | static int bpf_jit_blind_insn(const struct bpf_insn *from, | 891 | static int bpf_jit_blind_insn(const struct bpf_insn *from, |
892 | const struct bpf_insn *aux, | 892 | const struct bpf_insn *aux, |
893 | struct bpf_insn *to_buff) | 893 | struct bpf_insn *to_buff, |
894 | bool emit_zext) | ||
894 | { | 895 | { |
895 | struct bpf_insn *to = to_buff; | 896 | struct bpf_insn *to = to_buff; |
896 | u32 imm_rnd = get_random_int(); | 897 | u32 imm_rnd = get_random_int(); |
@@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, | |||
1005 | case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ | 1006 | case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ |
1006 | *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); | 1007 | *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); |
1007 | *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); | 1008 | *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); |
1009 | if (emit_zext) | ||
1010 | *to++ = BPF_ZEXT_REG(BPF_REG_AX); | ||
1008 | *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); | 1011 | *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); |
1009 | break; | 1012 | break; |
1010 | 1013 | ||
@@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) | |||
1088 | insn[1].code == 0) | 1091 | insn[1].code == 0) |
1089 | memcpy(aux, insn, sizeof(aux)); | 1092 | memcpy(aux, insn, sizeof(aux)); |
1090 | 1093 | ||
1091 | rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); | 1094 | rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, |
1095 | clone->aux->verifier_zext); | ||
1092 | if (!rewritten) | 1096 | if (!rewritten) |
1093 | continue; | 1097 | continue; |
1094 | 1098 | ||
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5d141f16f6fa..272071e9112f 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
@@ -1707,20 +1707,26 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) | |||
1707 | if (err) | 1707 | if (err) |
1708 | goto free_used_maps; | 1708 | goto free_used_maps; |
1709 | 1709 | ||
1710 | err = bpf_prog_new_fd(prog); | 1710 | /* Upon success of bpf_prog_alloc_id(), the BPF prog is |
1711 | if (err < 0) { | 1711 | * effectively publicly exposed. However, retrieving via |
1712 | /* failed to allocate fd. | 1712 | * bpf_prog_get_fd_by_id() will take another reference, |
1713 | * bpf_prog_put() is needed because the above | 1713 | * therefore it cannot be gone underneath us. |
1714 | * bpf_prog_alloc_id() has published the prog | 1714 | * |
1715 | * to the userspace and the userspace may | 1715 | * Only for the time /after/ successful bpf_prog_new_fd() |
1716 | * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. | 1716 | * and before returning to userspace, we might just hold |
1717 | */ | 1717 | * one reference and any parallel close on that fd could |
1718 | bpf_prog_put(prog); | 1718 | * rip everything out. Hence, below notifications must |
1719 | return err; | 1719 | * happen before bpf_prog_new_fd(). |
1720 | } | 1720 | * |
1721 | 1721 | * Also, any failure handling from this point onwards must | |
1722 | * be using bpf_prog_put() given the program is exposed. | ||
1723 | */ | ||
1722 | bpf_prog_kallsyms_add(prog); | 1724 | bpf_prog_kallsyms_add(prog); |
1723 | perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); | 1725 | perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); |
1726 | |||
1727 | err = bpf_prog_new_fd(prog); | ||
1728 | if (err < 0) | ||
1729 | bpf_prog_put(prog); | ||
1724 | return err; | 1730 | return err; |
1725 | 1731 | ||
1726 | free_used_maps: | 1732 | free_used_maps: |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c84d83f86141..c36a719fee6d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
@@ -985,9 +985,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) | |||
985 | reg->smax_value = S64_MAX; | 985 | reg->smax_value = S64_MAX; |
986 | reg->umin_value = 0; | 986 | reg->umin_value = 0; |
987 | reg->umax_value = U64_MAX; | 987 | reg->umax_value = U64_MAX; |
988 | |||
989 | /* constant backtracking is enabled for root only for now */ | ||
990 | reg->precise = capable(CAP_SYS_ADMIN) ? false : true; | ||
991 | } | 988 | } |
992 | 989 | ||
993 | /* Mark a register as having a completely unknown (scalar) value. */ | 990 | /* Mark a register as having a completely unknown (scalar) value. */ |
@@ -1014,7 +1011,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, | |||
1014 | __mark_reg_not_init(regs + regno); | 1011 | __mark_reg_not_init(regs + regno); |
1015 | return; | 1012 | return; |
1016 | } | 1013 | } |
1017 | __mark_reg_unknown(regs + regno); | 1014 | regs += regno; |
1015 | __mark_reg_unknown(regs); | ||
1016 | /* constant backtracking is enabled for root without bpf2bpf calls */ | ||
1017 | regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? | ||
1018 | true : false; | ||
1018 | } | 1019 | } |
1019 | 1020 | ||
1020 | static void __mark_reg_not_init(struct bpf_reg_state *reg) | 1021 | static void __mark_reg_not_init(struct bpf_reg_state *reg) |
@@ -1771,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, | |||
1771 | bitmap_from_u64(mask, stack_mask); | 1772 | bitmap_from_u64(mask, stack_mask); |
1772 | for_each_set_bit(i, mask, 64) { | 1773 | for_each_set_bit(i, mask, 64) { |
1773 | if (i >= func->allocated_stack / BPF_REG_SIZE) { | 1774 | if (i >= func->allocated_stack / BPF_REG_SIZE) { |
1774 | /* This can happen if backtracking | 1775 | /* the sequence of instructions: |
1775 | * is propagating stack precision where | 1776 | * 2: (bf) r3 = r10 |
1776 | * caller has larger stack frame | 1777 | * 3: (7b) *(u64 *)(r3 -8) = r0 |
1777 | * than callee, but backtrack_insn() should | 1778 | * 4: (79) r4 = *(u64 *)(r10 -8) |
1778 | * have returned -ENOTSUPP. | 1779 | * doesn't contain jmps. It's backtracked |
1780 | * as a single block. | ||
1781 | * During backtracking insn 3 is not recognized as | ||
1782 | * stack access, so at the end of backtracking | ||
1783 | * stack slot fp-8 is still marked in stack_mask. | ||
1784 | * However the parent state may not have accessed | ||
1785 | * fp-8 and it's "unallocated" stack space. | ||
1786 | * In such case fallback to conservative. | ||
1779 | */ | 1787 | */ |
1780 | verbose(env, "BUG spi %d stack_size %d\n", | 1788 | mark_all_scalars_precise(env, st); |
1781 | i, func->allocated_stack); | 1789 | return 0; |
1782 | WARN_ONCE(1, "verifier backtracking bug"); | ||
1783 | return -EFAULT; | ||
1784 | } | 1790 | } |
1785 | 1791 | ||
1786 | if (func->stack[i].slot_type[0] != STACK_SPILL) { | 1792 | if (func->stack[i].slot_type[0] != STACK_SPILL) { |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 753afbca549f..8be1da1ebd9a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
@@ -5255,8 +5255,16 @@ static struct cgroup *cgroup_create(struct cgroup *parent) | |||
5255 | * if the parent has to be frozen, the child has too. | 5255 | * if the parent has to be frozen, the child has too. |
5256 | */ | 5256 | */ |
5257 | cgrp->freezer.e_freeze = parent->freezer.e_freeze; | 5257 | cgrp->freezer.e_freeze = parent->freezer.e_freeze; |
5258 | if (cgrp->freezer.e_freeze) | 5258 | if (cgrp->freezer.e_freeze) { |
5259 | /* | ||
5260 | * Set the CGRP_FREEZE flag, so when a process will be | ||
5261 | * attached to the child cgroup, it will become frozen. | ||
5262 | * At this point the new cgroup is unpopulated, so we can | ||
5263 | * consider it frozen immediately. | ||
5264 | */ | ||
5265 | set_bit(CGRP_FREEZE, &cgrp->flags); | ||
5259 | set_bit(CGRP_FROZEN, &cgrp->flags); | 5266 | set_bit(CGRP_FROZEN, &cgrp->flags); |
5267 | } | ||
5260 | 5268 | ||
5261 | spin_lock_irq(&css_set_lock); | 5269 | spin_lock_irq(&css_set_lock); |
5262 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { | 5270 | for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { |
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 2bd410f934b3..69cfb4345388 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c | |||
@@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, | |||
230 | */ | 230 | */ |
231 | struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) | 231 | struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) |
232 | { | 232 | { |
233 | int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; | 233 | size_t count = size >> PAGE_SHIFT; |
234 | size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
235 | size_t align = get_order(PAGE_ALIGN(size)); | ||
236 | struct page *page = NULL; | 234 | struct page *page = NULL; |
237 | struct cma *cma = NULL; | 235 | struct cma *cma = NULL; |
238 | 236 | ||
@@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) | |||
243 | 241 | ||
244 | /* CMA can be used only in the context which permits sleeping */ | 242 | /* CMA can be used only in the context which permits sleeping */ |
245 | if (cma && gfpflags_allow_blocking(gfp)) { | 243 | if (cma && gfpflags_allow_blocking(gfp)) { |
244 | size_t align = get_order(size); | ||
246 | size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); | 245 | size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); |
247 | 246 | ||
248 | page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); | 247 | page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); |
249 | } | 248 | } |
250 | 249 | ||
251 | /* Fallback allocation of normal pages */ | ||
252 | if (!page) | ||
253 | page = alloc_pages_node(node, gfp, align); | ||
254 | return page; | 250 | return page; |
255 | } | 251 | } |
256 | 252 | ||
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 795c9b095d75..8402b29c280f 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c | |||
@@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) | |||
85 | struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, | 85 | struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, |
86 | dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) | 86 | dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) |
87 | { | 87 | { |
88 | size_t alloc_size = PAGE_ALIGN(size); | ||
89 | int node = dev_to_node(dev); | ||
88 | struct page *page = NULL; | 90 | struct page *page = NULL; |
89 | u64 phys_mask; | 91 | u64 phys_mask; |
90 | 92 | ||
@@ -95,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, | |||
95 | gfp &= ~__GFP_ZERO; | 97 | gfp &= ~__GFP_ZERO; |
96 | gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, | 98 | gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, |
97 | &phys_mask); | 99 | &phys_mask); |
100 | page = dma_alloc_contiguous(dev, alloc_size, gfp); | ||
101 | if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { | ||
102 | dma_free_contiguous(dev, page, alloc_size); | ||
103 | page = NULL; | ||
104 | } | ||
98 | again: | 105 | again: |
99 | page = dma_alloc_contiguous(dev, size, gfp); | 106 | if (!page) |
107 | page = alloc_pages_node(node, gfp, get_order(alloc_size)); | ||
100 | if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { | 108 | if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { |
101 | dma_free_contiguous(dev, page, size); | 109 | dma_free_contiguous(dev, page, size); |
102 | page = NULL; | 110 | page = NULL; |
@@ -297,7 +305,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, | |||
297 | dma_direct_sync_single_for_cpu(dev, addr, size, dir); | 305 | dma_direct_sync_single_for_cpu(dev, addr, size, dir); |
298 | 306 | ||
299 | if (unlikely(is_swiotlb_buffer(phys))) | 307 | if (unlikely(is_swiotlb_buffer(phys))) |
300 | swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); | 308 | swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); |
301 | } | 309 | } |
302 | EXPORT_SYMBOL(dma_direct_unmap_page); | 310 | EXPORT_SYMBOL(dma_direct_unmap_page); |
303 | 311 | ||
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9de232229063..796a44f8ef5a 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c | |||
@@ -444,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, | |||
444 | 444 | ||
445 | phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, | 445 | phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, |
446 | dma_addr_t tbl_dma_addr, | 446 | dma_addr_t tbl_dma_addr, |
447 | phys_addr_t orig_addr, size_t size, | 447 | phys_addr_t orig_addr, |
448 | size_t mapping_size, | ||
449 | size_t alloc_size, | ||
448 | enum dma_data_direction dir, | 450 | enum dma_data_direction dir, |
449 | unsigned long attrs) | 451 | unsigned long attrs) |
450 | { | 452 | { |
@@ -464,6 +466,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, | |||
464 | pr_warn_once("%s is active and system is using DMA bounce buffers\n", | 466 | pr_warn_once("%s is active and system is using DMA bounce buffers\n", |
465 | sme_active() ? "SME" : "SEV"); | 467 | sme_active() ? "SME" : "SEV"); |
466 | 468 | ||
469 | if (mapping_size > alloc_size) { | ||
470 | dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", | ||
471 | mapping_size, alloc_size); | ||
472 | return (phys_addr_t)DMA_MAPPING_ERROR; | ||
473 | } | ||
474 | |||
467 | mask = dma_get_seg_boundary(hwdev); | 475 | mask = dma_get_seg_boundary(hwdev); |
468 | 476 | ||
469 | tbl_dma_addr &= mask; | 477 | tbl_dma_addr &= mask; |
@@ -471,8 +479,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, | |||
471 | offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | 479 | offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; |
472 | 480 | ||
473 | /* | 481 | /* |
474 | * Carefully handle integer overflow which can occur when mask == ~0UL. | 482 | * Carefully handle integer overflow which can occur when mask == ~0UL. |
475 | */ | 483 | */ |
476 | max_slots = mask + 1 | 484 | max_slots = mask + 1 |
477 | ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT | 485 | ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT |
478 | : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); | 486 | : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); |
@@ -481,8 +489,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, | |||
481 | * For mappings greater than or equal to a page, we limit the stride | 489 | * For mappings greater than or equal to a page, we limit the stride |
482 | * (and hence alignment) to a page size. | 490 | * (and hence alignment) to a page size. |
483 | */ | 491 | */ |
484 | nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | 492 | nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; |
485 | if (size >= PAGE_SIZE) | 493 | if (alloc_size >= PAGE_SIZE) |
486 | stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); | 494 | stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); |
487 | else | 495 | else |
488 | stride = 1; | 496 | stride = 1; |
@@ -547,7 +555,7 @@ not_found: | |||
547 | spin_unlock_irqrestore(&io_tlb_lock, flags); | 555 | spin_unlock_irqrestore(&io_tlb_lock, flags); |
548 | if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) | 556 | if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) |
549 | dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", | 557 | dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", |
550 | size, io_tlb_nslabs, tmp_io_tlb_used); | 558 | alloc_size, io_tlb_nslabs, tmp_io_tlb_used); |
551 | return (phys_addr_t)DMA_MAPPING_ERROR; | 559 | return (phys_addr_t)DMA_MAPPING_ERROR; |
552 | found: | 560 | found: |
553 | io_tlb_used += nslots; | 561 | io_tlb_used += nslots; |
@@ -562,7 +570,7 @@ found: | |||
562 | io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); | 570 | io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); |
563 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && | 571 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && |
564 | (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) | 572 | (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) |
565 | swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); | 573 | swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); |
566 | 574 | ||
567 | return tlb_addr; | 575 | return tlb_addr; |
568 | } | 576 | } |
@@ -571,11 +579,11 @@ found: | |||
571 | * tlb_addr is the physical address of the bounce buffer to unmap. | 579 | * tlb_addr is the physical address of the bounce buffer to unmap. |
572 | */ | 580 | */ |
573 | void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, | 581 | void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, |
574 | size_t size, enum dma_data_direction dir, | 582 | size_t mapping_size, size_t alloc_size, |
575 | unsigned long attrs) | 583 | enum dma_data_direction dir, unsigned long attrs) |
576 | { | 584 | { |
577 | unsigned long flags; | 585 | unsigned long flags; |
578 | int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; | 586 | int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; |
579 | int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; | 587 | int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; |
580 | phys_addr_t orig_addr = io_tlb_orig_addr[index]; | 588 | phys_addr_t orig_addr = io_tlb_orig_addr[index]; |
581 | 589 | ||
@@ -585,7 +593,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, | |||
585 | if (orig_addr != INVALID_PHYS_ADDR && | 593 | if (orig_addr != INVALID_PHYS_ADDR && |
586 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && | 594 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && |
587 | ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) | 595 | ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) |
588 | swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); | 596 | swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); |
589 | 597 | ||
590 | /* | 598 | /* |
591 | * Return the buffer to the free list by setting the corresponding | 599 | * Return the buffer to the free list by setting the corresponding |
@@ -665,14 +673,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, | |||
665 | 673 | ||
666 | /* Oh well, have to allocate and map a bounce buffer. */ | 674 | /* Oh well, have to allocate and map a bounce buffer. */ |
667 | *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), | 675 | *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), |
668 | *phys, size, dir, attrs); | 676 | *phys, size, size, dir, attrs); |
669 | if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) | 677 | if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) |
670 | return false; | 678 | return false; |
671 | 679 | ||
672 | /* Ensure that the address returned is DMA'ble */ | 680 | /* Ensure that the address returned is DMA'ble */ |
673 | *dma_addr = __phys_to_dma(dev, *phys); | 681 | *dma_addr = __phys_to_dma(dev, *phys); |
674 | if (unlikely(!dma_capable(dev, *dma_addr, size))) { | 682 | if (unlikely(!dma_capable(dev, *dma_addr, size))) { |
675 | swiotlb_tbl_unmap_single(dev, *phys, size, dir, | 683 | swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, |
676 | attrs | DMA_ATTR_SKIP_CPU_SYNC); | 684 | attrs | DMA_ATTR_SKIP_CPU_SYNC); |
677 | return false; | 685 | return false; |
678 | } | 686 | } |
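Note on the swiotlb_tbl_map_single()/swiotlb_tbl_unmap_single() signature change above: splitting the old size argument into mapping_size and alloc_size lets a caller reserve a larger (for example page-aligned) bounce slot while only bouncing the bytes that are actually mapped. A minimal sketch of such a caller, assuming the usual swiotlb globals (io_tlb_start) and a surrounding mapping path that supplies dev, paddr, size, dir and attrs; it is illustrative only and not code from this series:

/*
 * Sketch: bounce "size" bytes but reserve a page-aligned slot, the way an
 * IOMMU-style user of the two-size interface might.  The same alloc_size
 * must be passed back to swiotlb_tbl_unmap_single() on teardown.
 */
static dma_addr_t example_bounce_map(struct device *dev, phys_addr_t paddr,
				     size_t size, enum dma_data_direction dir,
				     unsigned long attrs)
{
	size_t alloc_size = ALIGN(size, PAGE_SIZE);	/* padded slot */
	phys_addr_t tlb_addr;

	tlb_addr = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
					  paddr, size, alloc_size, dir, attrs);
	if (tlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
		return DMA_MAPPING_ERROR;

	/* Only "size" bytes were copied in; the rest of the slot is padding. */
	return __phys_to_dma(dev, tlb_addr);
}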
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c5cd852fe86b..3cc8416ec844 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
@@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp, | |||
413 | 413 | ||
414 | int register_perf_hw_breakpoint(struct perf_event *bp) | 414 | int register_perf_hw_breakpoint(struct perf_event *bp) |
415 | { | 415 | { |
416 | struct arch_hw_breakpoint hw; | 416 | struct arch_hw_breakpoint hw = { }; |
417 | int err; | 417 | int err; |
418 | 418 | ||
419 | err = reserve_bp_slot(bp); | 419 | err = reserve_bp_slot(bp); |
@@ -461,7 +461,7 @@ int | |||
461 | modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, | 461 | modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, |
462 | bool check) | 462 | bool check) |
463 | { | 463 | { |
464 | struct arch_hw_breakpoint hw; | 464 | struct arch_hw_breakpoint hw = { }; |
465 | int err; | 465 | int err; |
466 | 466 | ||
467 | err = hw_breakpoint_parse(bp, attr, &hw); | 467 | err = hw_breakpoint_parse(bp, attr, &hw); |
diff --git a/kernel/exit.c b/kernel/exit.c index 5b4a5dcce8f8..22ab6a4bdc51 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -1554,6 +1554,23 @@ end: | |||
1554 | return retval; | 1554 | return retval; |
1555 | } | 1555 | } |
1556 | 1556 | ||
1557 | static struct pid *pidfd_get_pid(unsigned int fd) | ||
1558 | { | ||
1559 | struct fd f; | ||
1560 | struct pid *pid; | ||
1561 | |||
1562 | f = fdget(fd); | ||
1563 | if (!f.file) | ||
1564 | return ERR_PTR(-EBADF); | ||
1565 | |||
1566 | pid = pidfd_pid(f.file); | ||
1567 | if (!IS_ERR(pid)) | ||
1568 | get_pid(pid); | ||
1569 | |||
1570 | fdput(f); | ||
1571 | return pid; | ||
1572 | } | ||
1573 | |||
1557 | static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, | 1574 | static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, |
1558 | int options, struct rusage *ru) | 1575 | int options, struct rusage *ru) |
1559 | { | 1576 | { |
@@ -1576,19 +1593,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, | |||
1576 | type = PIDTYPE_PID; | 1593 | type = PIDTYPE_PID; |
1577 | if (upid <= 0) | 1594 | if (upid <= 0) |
1578 | return -EINVAL; | 1595 | return -EINVAL; |
1596 | |||
1597 | pid = find_get_pid(upid); | ||
1579 | break; | 1598 | break; |
1580 | case P_PGID: | 1599 | case P_PGID: |
1581 | type = PIDTYPE_PGID; | 1600 | type = PIDTYPE_PGID; |
1582 | if (upid <= 0) | 1601 | if (upid < 0) |
1602 | return -EINVAL; | ||
1603 | |||
1604 | if (upid) | ||
1605 | pid = find_get_pid(upid); | ||
1606 | else | ||
1607 | pid = get_task_pid(current, PIDTYPE_PGID); | ||
1608 | break; | ||
1609 | case P_PIDFD: | ||
1610 | type = PIDTYPE_PID; | ||
1611 | if (upid < 0) | ||
1583 | return -EINVAL; | 1612 | return -EINVAL; |
1613 | |||
1614 | pid = pidfd_get_pid(upid); | ||
1615 | if (IS_ERR(pid)) | ||
1616 | return PTR_ERR(pid); | ||
1584 | break; | 1617 | break; |
1585 | default: | 1618 | default: |
1586 | return -EINVAL; | 1619 | return -EINVAL; |
1587 | } | 1620 | } |
1588 | 1621 | ||
1589 | if (type < PIDTYPE_MAX) | ||
1590 | pid = find_get_pid(upid); | ||
1591 | |||
1592 | wo.wo_type = type; | 1622 | wo.wo_type = type; |
1593 | wo.wo_pid = pid; | 1623 | wo.wo_pid = pid; |
1594 | wo.wo_flags = options; | 1624 | wo.wo_flags = options; |
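The new P_PIDFD branch above is what exposes pidfd-based waiting to userspace. A small userspace sketch of the intended usage, hedged: it assumes a kernel with both pidfd_open() and this waitid() extension, and spells out P_PIDFD and __NR_pidfd_open only in case the toolchain headers predate them (the fallback values below match current UAPI headers):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef P_PIDFD
#define P_PIDFD 3			/* idtype added by this series */
#endif
#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434		/* asm-generic syscall number */
#endif

int main(void)
{
	siginfo_t info = { 0 };
	pid_t pid = fork();

	if (pid == 0)
		_exit(42);			/* child */

	int pidfd = syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return 1;
	}

	/* Wait for the child referred to by the pidfd, not by PID. */
	if (waitid(P_PIDFD, pidfd, &info, WEXITED) < 0) {
		perror("waitid");
		return 1;
	}
	printf("child %d exited with status %d\n", info.si_pid, info.si_status);
	return 0;
}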
diff --git a/kernel/fork.c b/kernel/fork.c index 2852d0e76ea3..1d1cd06edbc1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -768,6 +768,7 @@ static void set_max_threads(unsigned int max_threads_suggested) | |||
768 | int arch_task_struct_size __read_mostly; | 768 | int arch_task_struct_size __read_mostly; |
769 | #endif | 769 | #endif |
770 | 770 | ||
771 | #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR | ||
771 | static void task_struct_whitelist(unsigned long *offset, unsigned long *size) | 772 | static void task_struct_whitelist(unsigned long *offset, unsigned long *size) |
772 | { | 773 | { |
773 | /* Fetch thread_struct whitelist for the architecture. */ | 774 | /* Fetch thread_struct whitelist for the architecture. */ |
@@ -782,6 +783,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size) | |||
782 | else | 783 | else |
783 | *offset += offsetof(struct task_struct, thread); | 784 | *offset += offsetof(struct task_struct, thread); |
784 | } | 785 | } |
786 | #endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */ | ||
785 | 787 | ||
786 | void __init fork_init(void) | 788 | void __init fork_init(void) |
787 | { | 789 | { |
@@ -1690,6 +1692,14 @@ static inline void rcu_copy_process(struct task_struct *p) | |||
1690 | #endif /* #ifdef CONFIG_TASKS_RCU */ | 1692 | #endif /* #ifdef CONFIG_TASKS_RCU */ |
1691 | } | 1693 | } |
1692 | 1694 | ||
1695 | struct pid *pidfd_pid(const struct file *file) | ||
1696 | { | ||
1697 | if (file->f_op == &pidfd_fops) | ||
1698 | return file->private_data; | ||
1699 | |||
1700 | return ERR_PTR(-EBADF); | ||
1701 | } | ||
1702 | |||
1693 | static int pidfd_release(struct inode *inode, struct file *file) | 1703 | static int pidfd_release(struct inode *inode, struct file *file) |
1694 | { | 1704 | { |
1695 | struct pid *pid = file->private_data; | 1705 | struct pid *pid = file->private_data; |
@@ -2338,6 +2348,8 @@ struct mm_struct *copy_init_mm(void) | |||
2338 | * | 2348 | * |
2339 | * It copies the process, and if successful kick-starts | 2349 | * It copies the process, and if successful kick-starts |
2340 | * it and waits for it to finish using the VM if required. | 2350 | * it and waits for it to finish using the VM if required. |
2351 | * | ||
2352 | * args->exit_signal is expected to be checked for sanity by the caller. | ||
2341 | */ | 2353 | */ |
2342 | long _do_fork(struct kernel_clone_args *args) | 2354 | long _do_fork(struct kernel_clone_args *args) |
2343 | { | 2355 | { |
@@ -2562,6 +2574,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, | |||
2562 | if (copy_from_user(&args, uargs, size)) | 2574 | if (copy_from_user(&args, uargs, size)) |
2563 | return -EFAULT; | 2575 | return -EFAULT; |
2564 | 2576 | ||
2577 | /* | ||
2578 | * Verify that higher 32bits of exit_signal are unset and that | ||
2579 | * it is a valid signal | ||
2580 | */ | ||
2581 | if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || | ||
2582 | !valid_signal(args.exit_signal))) | ||
2583 | return -EINVAL; | ||
2584 | |||
2565 | *kargs = (struct kernel_clone_args){ | 2585 | *kargs = (struct kernel_clone_args){ |
2566 | .flags = args.flags, | 2586 | .flags = args.flags, |
2567 | .pidfd = u64_to_user_ptr(args.pidfd), | 2587 | .pidfd = u64_to_user_ptr(args.pidfd), |
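The exit_signal sanity check added to copy_clone_args_from_user() is observable through clone3(). A hedged userspace sketch (not part of this patch set), assuming UAPI headers that provide struct clone_args and __NR_clone3; with this change the first call is expected to fail with EINVAL because bits above the low 32 are set:

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long do_clone3(struct clone_args *args)
{
	return syscall(__NR_clone3, args, sizeof(*args));
}

int main(void)
{
	struct clone_args args;
	long pid;

	memset(&args, 0, sizeof(args));
	args.exit_signal = SIGCHLD | (1ULL << 32);	/* upper 32 bits set */
	if (do_clone3(&args) < 0)
		perror("clone3");			/* expected: EINVAL */

	memset(&args, 0, sizeof(args));
	args.exit_signal = SIGCHLD;			/* valid signal, accepted */
	pid = do_clone3(&args);
	if (pid == 0)
		_exit(0);				/* child */
	printf("clone3() returned %ld\n", pid);
	return 0;
}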
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9484e88dabc2..9be995fc3c5a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc) | |||
295 | } | 295 | } |
296 | } | 296 | } |
297 | 297 | ||
298 | static void irq_sysfs_del(struct irq_desc *desc) | ||
299 | { | ||
300 | /* | ||
301 | * If irq_sysfs_init() has not yet been invoked (early boot), then | ||
302 | * irq_kobj_base is NULL and the descriptor was never added. | ||
303 | * kobject_del() complains about an object with no parent, so make | ||
304 | * it conditional. | ||
305 | */ | ||
306 | if (irq_kobj_base) | ||
307 | kobject_del(&desc->kobj); | ||
308 | } | ||
309 | |||
298 | static int __init irq_sysfs_init(void) | 310 | static int __init irq_sysfs_init(void) |
299 | { | 311 | { |
300 | struct irq_desc *desc; | 312 | struct irq_desc *desc; |
@@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = { | |||
325 | }; | 337 | }; |
326 | 338 | ||
327 | static void irq_sysfs_add(int irq, struct irq_desc *desc) {} | 339 | static void irq_sysfs_add(int irq, struct irq_desc *desc) {} |
340 | static void irq_sysfs_del(struct irq_desc *desc) {} | ||
328 | 341 | ||
329 | #endif /* CONFIG_SYSFS */ | 342 | #endif /* CONFIG_SYSFS */ |
330 | 343 | ||
@@ -438,7 +451,7 @@ static void free_desc(unsigned int irq) | |||
438 | * The sysfs entry must be serialized against a concurrent | 451 | * The sysfs entry must be serialized against a concurrent |
439 | * irq_sysfs_init() as well. | 452 | * irq_sysfs_init() as well. |
440 | */ | 453 | */ |
441 | kobject_del(&desc->kobj); | 454 | irq_sysfs_del(desc); |
442 | delete_irq_desc(irq); | 455 | delete_irq_desc(irq); |
443 | 456 | ||
444 | /* | 457 | /* |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index da9addb8d655..cfc4f088a0e7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) | |||
100 | return 0; | 100 | return 0; |
101 | } | 101 | } |
102 | 102 | ||
103 | #ifndef is_affinity_mask_valid | ||
104 | #define is_affinity_mask_valid(val) 1 | ||
105 | #endif | ||
106 | |||
107 | int no_irq_affinity; | 103 | int no_irq_affinity; |
108 | static int irq_affinity_proc_show(struct seq_file *m, void *v) | 104 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
109 | { | 105 | { |
@@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file, | |||
136 | if (err) | 132 | if (err) |
137 | goto free_cpumask; | 133 | goto free_cpumask; |
138 | 134 | ||
139 | if (!is_affinity_mask_valid(new_value)) { | ||
140 | err = -EINVAL; | ||
141 | goto free_cpumask; | ||
142 | } | ||
143 | |||
144 | /* | 135 | /* |
145 | * Do not allow disabling IRQs completely - it's a too easy | 136 | * Do not allow disabling IRQs completely - it's a too easy |
146 | * way to make the system unusable accidentally :-) At least | 137 | * way to make the system unusable accidentally :-) At least |
@@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file, | |||
232 | if (err) | 223 | if (err) |
233 | goto out; | 224 | goto out; |
234 | 225 | ||
235 | if (!is_affinity_mask_valid(new_value)) { | ||
236 | err = -EINVAL; | ||
237 | goto out; | ||
238 | } | ||
239 | |||
240 | /* | 226 | /* |
241 | * Do not allow disabling IRQs completely - it's a too easy | 227 | * Do not allow disabling IRQs completely - it's a too easy |
242 | * way to make the system unusable accidentally :-) At least | 228 | * way to make the system unusable accidentally :-) At least |
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 95414ad3506a..98c04ca5fa43 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c | |||
@@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg) | |||
36 | irq = find_first_bit(irqs_resend, nr_irqs); | 36 | irq = find_first_bit(irqs_resend, nr_irqs); |
37 | clear_bit(irq, irqs_resend); | 37 | clear_bit(irq, irqs_resend); |
38 | desc = irq_to_desc(irq); | 38 | desc = irq_to_desc(irq); |
39 | if (!desc) | ||
40 | continue; | ||
39 | local_irq_disable(); | 41 | local_irq_disable(); |
40 | desc->handle_irq(desc); | 42 | desc->handle_irq(desc); |
41 | local_irq_enable(); | 43 | local_irq_enable(); |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index df3008419a1d..cdb3ffab128b 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init) | |||
407 | return false; | 407 | return false; |
408 | 408 | ||
409 | if (!kernel_text_address(jump_entry_code(entry))) { | 409 | if (!kernel_text_address(jump_entry_code(entry))) { |
410 | WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); | 410 | WARN_ONCE(!jump_entry_is_init(entry), |
411 | "can't patch jump_label at %pS", | ||
412 | (void *)jump_entry_code(entry)); | ||
411 | return false; | 413 | return false; |
412 | } | 414 | } |
413 | 415 | ||
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 95a260f9214b..136ce049c4ad 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -263,8 +263,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, | |||
263 | { | 263 | { |
264 | char namebuf[KSYM_NAME_LEN]; | 264 | char namebuf[KSYM_NAME_LEN]; |
265 | 265 | ||
266 | if (is_ksym_addr(addr)) | 266 | if (is_ksym_addr(addr)) { |
267 | return !!get_symbol_pos(addr, symbolsize, offset); | 267 | get_symbol_pos(addr, symbolsize, offset); |
268 | return 1; | ||
269 | } | ||
268 | return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || | 270 | return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || |
269 | !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); | 271 | !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); |
270 | } | 272 | } |
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c new file mode 100644 index 000000000000..d3689632e8b9 --- /dev/null +++ b/kernel/kexec_elf.c | |||
@@ -0,0 +1,430 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Load ELF vmlinux file for the kexec_file_load syscall. | ||
4 | * | ||
5 | * Copyright (C) 2004 Adam Litke (agl@us.ibm.com) | ||
6 | * Copyright (C) 2004 IBM Corp. | ||
7 | * Copyright (C) 2005 R Sharada (sharada@in.ibm.com) | ||
8 | * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com) | ||
9 | * Copyright (C) 2016 IBM Corporation | ||
10 | * | ||
11 | * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c. | ||
12 | * Heavily modified for the kernel by | ||
13 | * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>. | ||
14 | */ | ||
15 | |||
16 | #define pr_fmt(fmt) "kexec_elf: " fmt | ||
17 | |||
18 | #include <linux/elf.h> | ||
19 | #include <linux/kexec.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/types.h> | ||
23 | |||
24 | static inline bool elf_is_elf_file(const struct elfhdr *ehdr) | ||
25 | { | ||
26 | return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0; | ||
27 | } | ||
28 | |||
29 | static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value) | ||
30 | { | ||
31 | if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) | ||
32 | value = le64_to_cpu(value); | ||
33 | else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) | ||
34 | value = be64_to_cpu(value); | ||
35 | |||
36 | return value; | ||
37 | } | ||
38 | |||
39 | static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value) | ||
40 | { | ||
41 | if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) | ||
42 | value = le32_to_cpu(value); | ||
43 | else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) | ||
44 | value = be32_to_cpu(value); | ||
45 | |||
46 | return value; | ||
47 | } | ||
48 | |||
49 | static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value) | ||
50 | { | ||
51 | if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB) | ||
52 | value = le16_to_cpu(value); | ||
53 | else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB) | ||
54 | value = be16_to_cpu(value); | ||
55 | |||
56 | return value; | ||
57 | } | ||
58 | |||
59 | /** | ||
60 | * elf_is_ehdr_sane - check that it is safe to use the ELF header | ||
61 | * @buf_len: size of the buffer in which the ELF file is loaded. | ||
62 | */ | ||
63 | static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len) | ||
64 | { | ||
65 | if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) { | ||
66 | pr_debug("Bad program header size.\n"); | ||
67 | return false; | ||
68 | } else if (ehdr->e_shnum > 0 && | ||
69 | ehdr->e_shentsize != sizeof(struct elf_shdr)) { | ||
70 | pr_debug("Bad section header size.\n"); | ||
71 | return false; | ||
72 | } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT || | ||
73 | ehdr->e_version != EV_CURRENT) { | ||
74 | pr_debug("Unknown ELF version.\n"); | ||
75 | return false; | ||
76 | } | ||
77 | |||
78 | if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { | ||
79 | size_t phdr_size; | ||
80 | |||
81 | /* | ||
82 | * e_phnum is at most 65535 so calculating the size of the | ||
83 | * program header cannot overflow. | ||
84 | */ | ||
85 | phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; | ||
86 | |||
87 | /* Sanity check the program header table location. */ | ||
88 | if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) { | ||
89 | pr_debug("Program headers at invalid location.\n"); | ||
90 | return false; | ||
91 | } else if (ehdr->e_phoff + phdr_size > buf_len) { | ||
92 | pr_debug("Program headers truncated.\n"); | ||
93 | return false; | ||
94 | } | ||
95 | } | ||
96 | |||
97 | if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) { | ||
98 | size_t shdr_size; | ||
99 | |||
100 | /* | ||
101 | * e_shnum is at most 65536 so calculating | ||
102 | * the size of the section header cannot overflow. | ||
103 | */ | ||
104 | shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum; | ||
105 | |||
106 | /* Sanity check the section header table location. */ | ||
107 | if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) { | ||
108 | pr_debug("Section headers at invalid location.\n"); | ||
109 | return false; | ||
110 | } else if (ehdr->e_shoff + shdr_size > buf_len) { | ||
111 | pr_debug("Section headers truncated.\n"); | ||
112 | return false; | ||
113 | } | ||
114 | } | ||
115 | |||
116 | return true; | ||
117 | } | ||
118 | |||
119 | static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr) | ||
120 | { | ||
121 | struct elfhdr *buf_ehdr; | ||
122 | |||
123 | if (len < sizeof(*buf_ehdr)) { | ||
124 | pr_debug("Buffer is too small to hold ELF header.\n"); | ||
125 | return -ENOEXEC; | ||
126 | } | ||
127 | |||
128 | memset(ehdr, 0, sizeof(*ehdr)); | ||
129 | memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident)); | ||
130 | if (!elf_is_elf_file(ehdr)) { | ||
131 | pr_debug("No ELF header magic.\n"); | ||
132 | return -ENOEXEC; | ||
133 | } | ||
134 | |||
135 | if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) { | ||
136 | pr_debug("Not a supported ELF class.\n"); | ||
137 | return -ENOEXEC; | ||
138 | } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB && | ||
139 | ehdr->e_ident[EI_DATA] != ELFDATA2MSB) { | ||
140 | pr_debug("Not a supported ELF data format.\n"); | ||
141 | return -ENOEXEC; | ||
142 | } | ||
143 | |||
144 | buf_ehdr = (struct elfhdr *) buf; | ||
145 | if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) { | ||
146 | pr_debug("Bad ELF header size.\n"); | ||
147 | return -ENOEXEC; | ||
148 | } | ||
149 | |||
150 | ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type); | ||
151 | ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine); | ||
152 | ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version); | ||
153 | ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags); | ||
154 | ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize); | ||
155 | ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum); | ||
156 | ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize); | ||
157 | ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum); | ||
158 | ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx); | ||
159 | |||
160 | switch (ehdr->e_ident[EI_CLASS]) { | ||
161 | case ELFCLASS64: | ||
162 | ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry); | ||
163 | ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff); | ||
164 | ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff); | ||
165 | break; | ||
166 | |||
167 | case ELFCLASS32: | ||
168 | ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry); | ||
169 | ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff); | ||
170 | ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff); | ||
171 | break; | ||
172 | |||
173 | default: | ||
174 | pr_debug("Unknown ELF class.\n"); | ||
175 | return -EINVAL; | ||
176 | } | ||
177 | |||
178 | return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC; | ||
179 | } | ||
180 | |||
181 | /** | ||
182 | * elf_is_phdr_sane - check that it is safe to use the program header | ||
183 | * @buf_len: size of the buffer in which the ELF file is loaded. | ||
184 | */ | ||
185 | static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len) | ||
186 | { | ||
187 | |||
188 | if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) { | ||
189 | pr_debug("ELF segment location wraps around.\n"); | ||
190 | return false; | ||
191 | } else if (phdr->p_offset + phdr->p_filesz > buf_len) { | ||
192 | pr_debug("ELF segment not in file.\n"); | ||
193 | return false; | ||
194 | } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) { | ||
195 | pr_debug("ELF segment address wraps around.\n"); | ||
196 | return false; | ||
197 | } | ||
198 | |||
199 | return true; | ||
200 | } | ||
201 | |||
202 | static int elf_read_phdr(const char *buf, size_t len, | ||
203 | struct kexec_elf_info *elf_info, | ||
204 | int idx) | ||
205 | { | ||
206 | /* Override the const in proghdrs, we are the ones doing the loading. */ | ||
207 | struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx]; | ||
208 | const struct elfhdr *ehdr = elf_info->ehdr; | ||
209 | const char *pbuf; | ||
210 | struct elf_phdr *buf_phdr; | ||
211 | |||
212 | pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr)); | ||
213 | buf_phdr = (struct elf_phdr *) pbuf; | ||
214 | |||
215 | phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type); | ||
216 | phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags); | ||
217 | |||
218 | switch (ehdr->e_ident[EI_CLASS]) { | ||
219 | case ELFCLASS64: | ||
220 | phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset); | ||
221 | phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr); | ||
222 | phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr); | ||
223 | phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz); | ||
224 | phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz); | ||
225 | phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align); | ||
226 | break; | ||
227 | |||
228 | case ELFCLASS32: | ||
229 | phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset); | ||
230 | phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr); | ||
231 | phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr); | ||
232 | phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz); | ||
233 | phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz); | ||
234 | phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align); | ||
235 | break; | ||
236 | |||
237 | default: | ||
238 | pr_debug("Unknown ELF class.\n"); | ||
239 | return -EINVAL; | ||
240 | } | ||
241 | |||
242 | return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC; | ||
243 | } | ||
244 | |||
245 | /** | ||
246 | * elf_read_phdrs - read the program headers from the buffer | ||
247 | * | ||
248 | * This function assumes that the program header table was checked for sanity. | ||
249 | * Use elf_is_ehdr_sane() if it wasn't. | ||
250 | */ | ||
251 | static int elf_read_phdrs(const char *buf, size_t len, | ||
252 | struct kexec_elf_info *elf_info) | ||
253 | { | ||
254 | size_t phdr_size, i; | ||
255 | const struct elfhdr *ehdr = elf_info->ehdr; | ||
256 | |||
257 | /* | ||
258 | * e_phnum is at most 65535 so calculating the size of the | ||
259 | * program header cannot overflow. | ||
260 | */ | ||
261 | phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum; | ||
262 | |||
263 | elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL); | ||
264 | if (!elf_info->proghdrs) | ||
265 | return -ENOMEM; | ||
266 | |||
267 | for (i = 0; i < ehdr->e_phnum; i++) { | ||
268 | int ret; | ||
269 | |||
270 | ret = elf_read_phdr(buf, len, elf_info, i); | ||
271 | if (ret) { | ||
272 | kfree(elf_info->proghdrs); | ||
273 | elf_info->proghdrs = NULL; | ||
274 | return ret; | ||
275 | } | ||
276 | } | ||
277 | |||
278 | return 0; | ||
279 | } | ||
280 | |||
281 | /** | ||
282 | * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info | ||
283 | * @buf: Buffer to read ELF file from. | ||
284 | * @len: Size of @buf. | ||
285 | * @ehdr: Pointer to existing struct which will be populated. | ||
286 | * @elf_info: Pointer to existing struct which will be populated. | ||
287 | * | ||
288 | * This function allows reading ELF files with different byte order than | ||
289 | * the kernel, byte-swapping the fields as needed. | ||
290 | * | ||
291 | * Return: | ||
292 | * On success returns 0, and the caller should call | ||
293 | * kexec_free_elf_info(elf_info) to free the memory allocated for the section | ||
294 | * and program headers. | ||
295 | */ | ||
296 | static int elf_read_from_buffer(const char *buf, size_t len, | ||
297 | struct elfhdr *ehdr, | ||
298 | struct kexec_elf_info *elf_info) | ||
299 | { | ||
300 | int ret; | ||
301 | |||
302 | ret = elf_read_ehdr(buf, len, ehdr); | ||
303 | if (ret) | ||
304 | return ret; | ||
305 | |||
306 | elf_info->buffer = buf; | ||
307 | elf_info->ehdr = ehdr; | ||
308 | if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) { | ||
309 | ret = elf_read_phdrs(buf, len, elf_info); | ||
310 | if (ret) | ||
311 | return ret; | ||
312 | } | ||
313 | return 0; | ||
314 | } | ||
315 | |||
316 | /** | ||
317 | * kexec_free_elf_info - free memory allocated by elf_read_from_buffer | ||
318 | */ | ||
319 | void kexec_free_elf_info(struct kexec_elf_info *elf_info) | ||
320 | { | ||
321 | kfree(elf_info->proghdrs); | ||
322 | memset(elf_info, 0, sizeof(*elf_info)); | ||
323 | } | ||
324 | /** | ||
325 | * kexec_build_elf_info - read ELF executable and check that we can use it | ||
326 | */ | ||
327 | int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, | ||
328 | struct kexec_elf_info *elf_info) | ||
329 | { | ||
330 | int i; | ||
331 | int ret; | ||
332 | |||
333 | ret = elf_read_from_buffer(buf, len, ehdr, elf_info); | ||
334 | if (ret) | ||
335 | return ret; | ||
336 | |||
337 | /* Big endian vmlinux has type ET_DYN. */ | ||
338 | if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) { | ||
339 | pr_err("Not an ELF executable.\n"); | ||
340 | goto error; | ||
341 | } else if (!elf_info->proghdrs) { | ||
342 | pr_err("No ELF program header.\n"); | ||
343 | goto error; | ||
344 | } | ||
345 | |||
346 | for (i = 0; i < ehdr->e_phnum; i++) { | ||
347 | /* | ||
348 | * Kexec does not support loading interpreters. | ||
349 | * In addition this check keeps us from attempting | ||
350 | * to kexec ordinary executables. | ||
351 | */ | ||
352 | if (elf_info->proghdrs[i].p_type == PT_INTERP) { | ||
353 | pr_err("Requires an ELF interpreter.\n"); | ||
354 | goto error; | ||
355 | } | ||
356 | } | ||
357 | |||
358 | return 0; | ||
359 | error: | ||
360 | kexec_free_elf_info(elf_info); | ||
361 | return -ENOEXEC; | ||
362 | } | ||
363 | |||
364 | |||
365 | int kexec_elf_probe(const char *buf, unsigned long len) | ||
366 | { | ||
367 | struct elfhdr ehdr; | ||
368 | struct kexec_elf_info elf_info; | ||
369 | int ret; | ||
370 | |||
371 | ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info); | ||
372 | if (ret) | ||
373 | return ret; | ||
374 | |||
375 | kexec_free_elf_info(&elf_info); | ||
376 | |||
377 | return elf_check_arch(&ehdr) ? 0 : -ENOEXEC; | ||
378 | } | ||
379 | |||
380 | /** | ||
381 | * kexec_elf_load - load ELF executable image | ||
382 | * @lowest_load_addr: On return, will be the address where the first PT_LOAD | ||
383 | * section will be loaded in memory. | ||
384 | * | ||
385 | * Return: | ||
386 | * 0 on success, negative value on failure. | ||
387 | */ | ||
388 | int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, | ||
389 | struct kexec_elf_info *elf_info, | ||
390 | struct kexec_buf *kbuf, | ||
391 | unsigned long *lowest_load_addr) | ||
392 | { | ||
393 | unsigned long lowest_addr = UINT_MAX; | ||
394 | int ret; | ||
395 | size_t i; | ||
396 | |||
397 | /* Read in the PT_LOAD segments. */ | ||
398 | for (i = 0; i < ehdr->e_phnum; i++) { | ||
399 | unsigned long load_addr; | ||
400 | size_t size; | ||
401 | const struct elf_phdr *phdr; | ||
402 | |||
403 | phdr = &elf_info->proghdrs[i]; | ||
404 | if (phdr->p_type != PT_LOAD) | ||
405 | continue; | ||
406 | |||
407 | size = phdr->p_filesz; | ||
408 | if (size > phdr->p_memsz) | ||
409 | size = phdr->p_memsz; | ||
410 | |||
411 | kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset; | ||
412 | kbuf->bufsz = size; | ||
413 | kbuf->memsz = phdr->p_memsz; | ||
414 | kbuf->buf_align = phdr->p_align; | ||
415 | kbuf->buf_min = phdr->p_paddr; | ||
416 | kbuf->mem = KEXEC_BUF_MEM_UNKNOWN; | ||
417 | ret = kexec_add_buffer(kbuf); | ||
418 | if (ret) | ||
419 | goto out; | ||
420 | load_addr = kbuf->mem; | ||
421 | |||
422 | if (load_addr < lowest_addr) | ||
423 | lowest_addr = load_addr; | ||
424 | } | ||
425 | |||
426 | *lowest_load_addr = lowest_addr; | ||
427 | ret = 0; | ||
428 | out: | ||
429 | return ret; | ||
430 | } | ||
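To show how the helpers above fit together (this is how an architecture's kexec_file_load backend is expected to consume them), here is an illustrative sketch only: the function name is made up, the struct kexec_buf initializers follow include/linux/kexec.h, and a real loader would go on to set up the initrd, command line and purgatory:

static void *example_elf_load(struct kimage *image, char *kernel_buf,
			      unsigned long kernel_len)
{
	struct elfhdr ehdr;
	struct kexec_elf_info elf_info;
	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
				  .buf_max = ULONG_MAX, .top_down = false };
	unsigned long kernel_load_addr;
	int ret;

	ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info);
	if (ret)
		return ERR_PTR(ret);

	/* Place every PT_LOAD segment and remember the lowest address. */
	ret = kexec_elf_load(image, &ehdr, &elf_info, &kbuf, &kernel_load_addr);
	kexec_free_elf_info(&elf_info);
	if (ret)
		return ERR_PTR(ret);

	pr_debug("Loaded the kernel at 0x%lx\n", kernel_load_addr);
	/* ... arch-specific setup (initrd, command line, purgatory) ... */
	return NULL;
}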
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9873fc627d61..d9770a5393c8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); | |||
470 | */ | 470 | */ |
471 | static void do_optimize_kprobes(void) | 471 | static void do_optimize_kprobes(void) |
472 | { | 472 | { |
473 | lockdep_assert_held(&text_mutex); | ||
473 | /* | 474 | /* |
474 | * The optimization/unoptimization refers online_cpus via | 475 | * The optimization/unoptimization refers online_cpus via |
475 | * stop_machine() and cpu-hotplug modifies online_cpus. | 476 | * stop_machine() and cpu-hotplug modifies online_cpus. |
@@ -487,9 +488,7 @@ static void do_optimize_kprobes(void) | |||
487 | list_empty(&optimizing_list)) | 488 | list_empty(&optimizing_list)) |
488 | return; | 489 | return; |
489 | 490 | ||
490 | mutex_lock(&text_mutex); | ||
491 | arch_optimize_kprobes(&optimizing_list); | 491 | arch_optimize_kprobes(&optimizing_list); |
492 | mutex_unlock(&text_mutex); | ||
493 | } | 492 | } |
494 | 493 | ||
495 | /* | 494 | /* |
@@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void) | |||
500 | { | 499 | { |
501 | struct optimized_kprobe *op, *tmp; | 500 | struct optimized_kprobe *op, *tmp; |
502 | 501 | ||
502 | lockdep_assert_held(&text_mutex); | ||
503 | /* See comment in do_optimize_kprobes() */ | 503 | /* See comment in do_optimize_kprobes() */ |
504 | lockdep_assert_cpus_held(); | 504 | lockdep_assert_cpus_held(); |
505 | 505 | ||
@@ -507,7 +507,6 @@ static void do_unoptimize_kprobes(void) | |||
507 | if (list_empty(&unoptimizing_list)) | 507 | if (list_empty(&unoptimizing_list)) |
508 | return; | 508 | return; |
509 | 509 | ||
510 | mutex_lock(&text_mutex); | ||
511 | arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); | 510 | arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); |
512 | /* Loop free_list for disarming */ | 511 | /* Loop free_list for disarming */ |
513 | list_for_each_entry_safe(op, tmp, &freeing_list, list) { | 512 | list_for_each_entry_safe(op, tmp, &freeing_list, list) { |
@@ -524,7 +523,6 @@ static void do_unoptimize_kprobes(void) | |||
524 | } else | 523 | } else |
525 | list_del_init(&op->list); | 524 | list_del_init(&op->list); |
526 | } | 525 | } |
527 | mutex_unlock(&text_mutex); | ||
528 | } | 526 | } |
529 | 527 | ||
530 | /* Reclaim all kprobes on the free_list */ | 528 | /* Reclaim all kprobes on the free_list */ |
@@ -556,6 +554,7 @@ static void kprobe_optimizer(struct work_struct *work) | |||
556 | { | 554 | { |
557 | mutex_lock(&kprobe_mutex); | 555 | mutex_lock(&kprobe_mutex); |
558 | cpus_read_lock(); | 556 | cpus_read_lock(); |
557 | mutex_lock(&text_mutex); | ||
559 | /* Lock modules while optimizing kprobes */ | 558 | /* Lock modules while optimizing kprobes */ |
560 | mutex_lock(&module_mutex); | 559 | mutex_lock(&module_mutex); |
561 | 560 | ||
@@ -583,6 +582,7 @@ static void kprobe_optimizer(struct work_struct *work) | |||
583 | do_free_cleaned_kprobes(); | 582 | do_free_cleaned_kprobes(); |
584 | 583 | ||
585 | mutex_unlock(&module_mutex); | 584 | mutex_unlock(&module_mutex); |
585 | mutex_unlock(&text_mutex); | ||
586 | cpus_read_unlock(); | 586 | cpus_read_unlock(); |
587 | mutex_unlock(&kprobe_mutex); | 587 | mutex_unlock(&kprobe_mutex); |
588 | 588 | ||
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4861cf8e274b..4aca3f4379d2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -620,7 +620,7 @@ static void print_lock(struct held_lock *hlock) | |||
620 | return; | 620 | return; |
621 | } | 621 | } |
622 | 622 | ||
623 | printk(KERN_CONT "%p", hlock->instance); | 623 | printk(KERN_CONT "%px", hlock->instance); |
624 | print_lock_name(lock); | 624 | print_lock_name(lock); |
625 | printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); | 625 | printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); |
626 | } | 626 | } |
diff --git a/kernel/module.c b/kernel/module.c index 5933395af9a0..9ee93421269c 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -65,9 +65,9 @@ | |||
65 | /* | 65 | /* |
66 | * Modules' sections will be aligned on page boundaries | 66 | * Modules' sections will be aligned on page boundaries |
67 | * to ensure complete separation of code and data, but | 67 | * to ensure complete separation of code and data, but |
68 | * only when CONFIG_STRICT_MODULE_RWX=y | 68 | * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y |
69 | */ | 69 | */ |
70 | #ifdef CONFIG_STRICT_MODULE_RWX | 70 | #ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX |
71 | # define debug_align(X) ALIGN(X, PAGE_SIZE) | 71 | # define debug_align(X) ALIGN(X, PAGE_SIZE) |
72 | #else | 72 | #else |
73 | # define debug_align(X) (X) | 73 | # define debug_align(X) (X) |
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 5ec3ea4028e2..4aa02eee8f6c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug | |||
@@ -8,6 +8,17 @@ menu "RCU Debugging" | |||
8 | config PROVE_RCU | 8 | config PROVE_RCU |
9 | def_bool PROVE_LOCKING | 9 | def_bool PROVE_LOCKING |
10 | 10 | ||
11 | config PROVE_RCU_LIST | ||
12 | bool "RCU list lockdep debugging" | ||
13 | depends on PROVE_RCU && RCU_EXPERT | ||
14 | default n | ||
15 | help | ||
16 | Enable RCU lockdep checking for list usages. By default it is | ||
17 | turned off since there are several list RCU users that still | ||
18 | need to be converted to pass a lockdep expression. To prevent | ||
19 | false-positive splats, we keep it default disabled but once all | ||
20 | users are converted, we can remove this config option. | ||
21 | |||
11 | config TORTURE_TEST | 22 | config TORTURE_TEST |
12 | tristate | 23 | tristate |
13 | default n | 24 | default n |
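What PROVE_RCU_LIST actually verifies is the lockdep expression that RCU list iterators can carry as an optional last argument. A small hedged sketch of a traversal that is legal under a spinlock rather than rcu_read_lock(); struct foo, my_head and mylock are invented for the example:

struct foo {
	struct list_head list;
	int val;
};

static DEFINE_SPINLOCK(mylock);
static LIST_HEAD(my_head);

/*
 * Called with mylock held instead of rcu_read_lock(); the lockdep
 * expression tells the PROVE_RCU_LIST checking that this is legitimate.
 */
static int sum_foos_locked(void)
{
	struct foo *p;
	int sum = 0;

	lockdep_assert_held(&mylock);
	list_for_each_entry_rcu(p, &my_head, list, lockdep_is_held(&mylock))
		sum += p->val;
	return sum;
}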
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 5290b01de534..8fd4f82c9b3d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
@@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | |||
227 | 227 | ||
228 | #ifdef CONFIG_RCU_STALL_COMMON | 228 | #ifdef CONFIG_RCU_STALL_COMMON |
229 | 229 | ||
230 | extern int rcu_cpu_stall_ftrace_dump; | ||
230 | extern int rcu_cpu_stall_suppress; | 231 | extern int rcu_cpu_stall_suppress; |
231 | extern int rcu_cpu_stall_timeout; | 232 | extern int rcu_cpu_stall_timeout; |
232 | int rcu_jiffies_till_stall_check(void); | 233 | int rcu_jiffies_till_stall_check(void); |
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9bd5f6023c21..495c58ce1640 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c | |||
@@ -24,6 +24,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp) | |||
24 | } | 24 | } |
25 | 25 | ||
26 | /* | 26 | /* |
27 | * Enqueue an rcu_head structure onto the specified callback list. | ||
28 | * This function assumes that the callback is non-lazy because it | ||
29 | * is intended for use by no-CBs CPUs, which do not distinguish | ||
30 | * between lazy and non-lazy RCU callbacks. | ||
31 | */ | ||
32 | void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) | ||
33 | { | ||
34 | *rclp->tail = rhp; | ||
35 | rclp->tail = &rhp->next; | ||
36 | WRITE_ONCE(rclp->len, rclp->len + 1); | ||
37 | } | ||
38 | |||
39 | /* | ||
40 | * Flush the second rcu_cblist structure onto the first one, obliterating | ||
41 | * any contents of the first. If rhp is non-NULL, enqueue it as the sole | ||
42 | * element of the second rcu_cblist structure, but ensuring that the second | ||
43 | * rcu_cblist structure, if initially non-empty, always appears non-empty | ||
44 | * throughout the process. If rhp is NULL, the second rcu_cblist structure | ||
45 | * is instead initialized to empty. | ||
46 | */ | ||
47 | void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, | ||
48 | struct rcu_cblist *srclp, | ||
49 | struct rcu_head *rhp) | ||
50 | { | ||
51 | drclp->head = srclp->head; | ||
52 | if (drclp->head) | ||
53 | drclp->tail = srclp->tail; | ||
54 | else | ||
55 | drclp->tail = &drclp->head; | ||
56 | drclp->len = srclp->len; | ||
57 | drclp->len_lazy = srclp->len_lazy; | ||
58 | if (!rhp) { | ||
59 | rcu_cblist_init(srclp); | ||
60 | } else { | ||
61 | rhp->next = NULL; | ||
62 | srclp->head = rhp; | ||
63 | srclp->tail = &rhp->next; | ||
64 | WRITE_ONCE(srclp->len, 1); | ||
65 | srclp->len_lazy = 0; | ||
66 | } | ||
67 | } | ||
68 | |||
69 | /* | ||
27 | * Dequeue the oldest rcu_head structure from the specified callback | 70 | * Dequeue the oldest rcu_head structure from the specified callback |
28 | * list. This function assumes that the callback is non-lazy, but | 71 | * list. This function assumes that the callback is non-lazy, but |
29 | * the caller can later invoke rcu_cblist_dequeued_lazy() if it | 72 | * the caller can later invoke rcu_cblist_dequeued_lazy() if it |
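The flush-and-enqueue operation above is easiest to see on a plain tail-pointer list. A self-contained userspace model of the same pointer moves (field names mirror the kernel's; the lazy count is dropped for brevity):

    #include <stdio.h>
    #include <stddef.h>

    struct cb { struct cb *next; };

    struct cblist {
        struct cb *head;
        struct cb **tail;        /* points at the last ->next, or at ->head when empty */
        long len;
    };

    static void cblist_init(struct cblist *cl)
    {
        cl->head = NULL;
        cl->tail = &cl->head;
        cl->len = 0;
    }

    static void cblist_enqueue(struct cblist *cl, struct cb *p)
    {
        p->next = NULL;
        *cl->tail = p;           /* link after the current last element */
        cl->tail = &p->next;
        cl->len++;
    }

    /* Move everything from src onto dst, then leave src holding just p (or empty). */
    static void cblist_flush_enqueue(struct cblist *dst, struct cblist *src, struct cb *p)
    {
        dst->head = src->head;
        dst->tail = dst->head ? src->tail : &dst->head;
        dst->len = src->len;
        if (!p) {
            cblist_init(src);
        } else {
            p->next = NULL;
            src->head = p;
            src->tail = &p->next;
            src->len = 1;
        }
    }

    int main(void)
    {
        struct cb a, b, c;
        struct cblist src, dst;

        cblist_init(&src);
        cblist_init(&dst);
        cblist_enqueue(&src, &a);
        cblist_enqueue(&src, &b);
        cblist_flush_enqueue(&dst, &src, &c);
        printf("dst has %ld callbacks, src has %ld\n", dst.len, src.len);
        return 0;
    }

Note the empty-source case: dst->tail must point back at dst->head, exactly as the kernel version does.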
@@ -44,6 +87,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) | |||
44 | return rhp; | 87 | return rhp; |
45 | } | 88 | } |
46 | 89 | ||
90 | /* Set the length of an rcu_segcblist structure. */ | ||
91 | void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) | ||
92 | { | ||
93 | #ifdef CONFIG_RCU_NOCB_CPU | ||
94 | atomic_long_set(&rsclp->len, v); | ||
95 | #else | ||
96 | WRITE_ONCE(rsclp->len, v); | ||
97 | #endif | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Increase the numeric length of an rcu_segcblist structure by the | ||
102 | * specified amount, which can be negative. This can cause the ->len | ||
103 | * field to disagree with the actual number of callbacks on the structure. | ||
104 | * This increase is fully ordered with respect to the caller's accesses | ||
105 | * both before and after. | ||
106 | */ | ||
107 | void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) | ||
108 | { | ||
109 | #ifdef CONFIG_RCU_NOCB_CPU | ||
110 | smp_mb__before_atomic(); /* Up to the caller! */ | ||
111 | atomic_long_add(v, &rsclp->len); | ||
112 | smp_mb__after_atomic(); /* Up to the caller! */ | ||
113 | #else | ||
114 | smp_mb(); /* Up to the caller! */ | ||
115 | WRITE_ONCE(rsclp->len, rsclp->len + v); | ||
116 | smp_mb(); /* Up to the caller! */ | ||
117 | #endif | ||
118 | } | ||
119 | |||
120 | /* | ||
121 | * Increase the numeric length of an rcu_segcblist structure by one. | ||
122 | * This can cause the ->len field to disagree with the actual number of | ||
123 | * callbacks on the structure. This increase is fully ordered with respect | ||
124 | * to the caller's accesses both before and after. | ||
125 | */ | ||
126 | void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) | ||
127 | { | ||
128 | rcu_segcblist_add_len(rsclp, 1); | ||
129 | } | ||
130 | |||
131 | /* | ||
132 | * Exchange the numeric length of the specified rcu_segcblist structure | ||
133 | * with the specified value. This can cause the ->len field to disagree | ||
134 | * with the actual number of callbacks on the structure. This exchange is | ||
135 | * fully ordered with respect to the caller's accesses both before and after. | ||
136 | */ | ||
137 | long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) | ||
138 | { | ||
139 | #ifdef CONFIG_RCU_NOCB_CPU | ||
140 | return atomic_long_xchg(&rsclp->len, v); | ||
141 | #else | ||
142 | long ret = rsclp->len; | ||
143 | |||
144 | smp_mb(); /* Up to the caller! */ | ||
145 | WRITE_ONCE(rsclp->len, v); | ||
146 | smp_mb(); /* Up to the caller! */ | ||
147 | return ret; | ||
148 | #endif | ||
149 | } | ||
150 | |||
47 | /* | 151 | /* |
48 | * Initialize an rcu_segcblist structure. | 152 | * Initialize an rcu_segcblist structure. |
49 | */ | 153 | */ |
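The length helpers above pick their implementation at build time: an atomic_long_t when CONFIG_RCU_NOCB_CPU allows a no-CBs kthread to adjust the count concurrently, a plain lockless-sampled long otherwise. A userspace sketch of that split using C11 atomics (NOCB stands in for the config option; the kernel's explicit smp_mb() barriers are approximated by the default sequentially consistent atomics):

    #include <stdio.h>

    #ifdef NOCB
    #include <stdatomic.h>
    static atomic_long len;

    static void add_len(long v)  { atomic_fetch_add(&len, v); }      /* fully ordered */
    static long xchg_len(long v) { return atomic_exchange(&len, v); }
    static long read_len(void)   { return atomic_load(&len); }
    #else
    static long len;

    static void add_len(long v)  { len += v; }   /* single updater; readers sample racily */
    static long xchg_len(long v) { long old = len; len = v; return old; }
    static long read_len(void)   { return len; }
    #endif

    int main(void)
    {
        long drained;

        add_len(3);
        add_len(-1);
        printf("len=%ld\n", read_len());
        drained = xchg_len(0);
        printf("drained=%ld, len=%ld\n", drained, read_len());
        return 0;
    }

Either build prints len=2, then drained=2, len=0; only the NOCB build tolerates a second thread calling add_len() at the same time.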
@@ -56,8 +160,9 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) | |||
56 | rsclp->head = NULL; | 160 | rsclp->head = NULL; |
57 | for (i = 0; i < RCU_CBLIST_NSEGS; i++) | 161 | for (i = 0; i < RCU_CBLIST_NSEGS; i++) |
58 | rsclp->tails[i] = &rsclp->head; | 162 | rsclp->tails[i] = &rsclp->head; |
59 | rsclp->len = 0; | 163 | rcu_segcblist_set_len(rsclp, 0); |
60 | rsclp->len_lazy = 0; | 164 | rsclp->len_lazy = 0; |
165 | rsclp->enabled = 1; | ||
61 | } | 166 | } |
62 | 167 | ||
63 | /* | 168 | /* |
@@ -69,7 +174,16 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) | |||
69 | WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); | 174 | WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); |
70 | WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); | 175 | WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); |
71 | WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); | 176 | WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); |
72 | rsclp->tails[RCU_NEXT_TAIL] = NULL; | 177 | rsclp->enabled = 0; |
178 | } | ||
179 | |||
180 | /* | ||
181 | * Mark the specified rcu_segcblist structure as offloaded. This | ||
182 | * structure must be empty. | ||
183 | */ | ||
184 | void rcu_segcblist_offload(struct rcu_segcblist *rsclp) | ||
185 | { | ||
186 | rsclp->offloaded = 1; | ||
73 | } | 187 | } |
74 | 188 | ||
75 | /* | 189 | /* |
@@ -118,6 +232,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) | |||
118 | } | 232 | } |
119 | 233 | ||
120 | /* | 234 | /* |
235 | * Return false if there are no CBs awaiting grace periods; otherwise, | ||
236 | * return true and store the nearest waited-upon grace period into *lp. | ||
237 | */ | ||
238 | bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) | ||
239 | { | ||
240 | if (!rcu_segcblist_pend_cbs(rsclp)) | ||
241 | return false; | ||
242 | *lp = rsclp->gp_seq[RCU_WAIT_TAIL]; | ||
243 | return true; | ||
244 | } | ||
245 | |||
246 | /* | ||
121 | * Enqueue the specified callback onto the specified rcu_segcblist | 247 | * Enqueue the specified callback onto the specified rcu_segcblist |
122 | * structure, updating accounting as needed. Note that the ->len | 248 | * structure, updating accounting as needed. Note that the ->len |
123 | * field may be accessed locklessly, hence the WRITE_ONCE(). | 249 | * field may be accessed locklessly, hence the WRITE_ONCE(). |
@@ -129,13 +255,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) | |||
129 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, | 255 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, |
130 | struct rcu_head *rhp, bool lazy) | 256 | struct rcu_head *rhp, bool lazy) |
131 | { | 257 | { |
132 | WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ | 258 | rcu_segcblist_inc_len(rsclp); |
133 | if (lazy) | 259 | if (lazy) |
134 | rsclp->len_lazy++; | 260 | rsclp->len_lazy++; |
135 | smp_mb(); /* Ensure counts are updated before callback is enqueued. */ | 261 | smp_mb(); /* Ensure counts are updated before callback is enqueued. */ |
136 | rhp->next = NULL; | 262 | rhp->next = NULL; |
137 | *rsclp->tails[RCU_NEXT_TAIL] = rhp; | 263 | WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); |
138 | rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; | 264 | WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); |
139 | } | 265 | } |
140 | 266 | ||
141 | /* | 267 | /* |
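The new WRITE_ONCE() stores on ->head and ->tails[] above exist because an offloaded kthread may sample those pointers without holding the lock, so both sides need marked accesses to rule out torn or repeated loads. In userspace terms that is a relaxed atomic store paired with a relaxed atomic load; a rough equivalent of one such pair, assuming C11 atomics:

    #include <stdatomic.h>
    #include <stdio.h>

    struct cb { struct cb *next; };

    /* Shared with a lockless reader, so every access must be marked. */
    static _Atomic(struct cb *) list_head;

    static void publish(struct cb *p)        /* roughly WRITE_ONCE(head, p) */
    {
        atomic_store_explicit(&list_head, p, memory_order_relaxed);
    }

    static int peek_empty(void)              /* roughly !READ_ONCE(head) */
    {
        return !atomic_load_explicit(&list_head, memory_order_relaxed);
    }

    int main(void)
    {
        struct cb p = { 0 };

        printf("empty=%d\n", peek_empty());
        publish(&p);
        printf("empty=%d\n", peek_empty());
        return 0;
    }

The relaxed ordering only guarantees atomicity of the individual loads and stores; any ordering the callers need is still provided separately, as the smp_mb() calls in the enqueue paths above show.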
@@ -155,7 +281,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | |||
155 | 281 | ||
156 | if (rcu_segcblist_n_cbs(rsclp) == 0) | 282 | if (rcu_segcblist_n_cbs(rsclp) == 0) |
157 | return false; | 283 | return false; |
158 | WRITE_ONCE(rsclp->len, rsclp->len + 1); | 284 | rcu_segcblist_inc_len(rsclp); |
159 | if (lazy) | 285 | if (lazy) |
160 | rsclp->len_lazy++; | 286 | rsclp->len_lazy++; |
161 | smp_mb(); /* Ensure counts are updated before callback is entrained. */ | 287 | smp_mb(); /* Ensure counts are updated before callback is entrained. */ |
@@ -163,9 +289,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | |||
163 | for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) | 289 | for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) |
164 | if (rsclp->tails[i] != rsclp->tails[i - 1]) | 290 | if (rsclp->tails[i] != rsclp->tails[i - 1]) |
165 | break; | 291 | break; |
166 | *rsclp->tails[i] = rhp; | 292 | WRITE_ONCE(*rsclp->tails[i], rhp); |
167 | for (; i <= RCU_NEXT_TAIL; i++) | 293 | for (; i <= RCU_NEXT_TAIL; i++) |
168 | rsclp->tails[i] = &rhp->next; | 294 | WRITE_ONCE(rsclp->tails[i], &rhp->next); |
169 | return true; | 295 | return true; |
170 | } | 296 | } |
171 | 297 | ||
@@ -182,9 +308,8 @@ void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, | |||
182 | struct rcu_cblist *rclp) | 308 | struct rcu_cblist *rclp) |
183 | { | 309 | { |
184 | rclp->len_lazy += rsclp->len_lazy; | 310 | rclp->len_lazy += rsclp->len_lazy; |
185 | rclp->len += rsclp->len; | ||
186 | rsclp->len_lazy = 0; | 311 | rsclp->len_lazy = 0; |
187 | WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ | 312 | rclp->len = rcu_segcblist_xchg_len(rsclp, 0); |
188 | } | 313 | } |
189 | 314 | ||
190 | /* | 315 | /* |
@@ -200,12 +325,12 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, | |||
200 | if (!rcu_segcblist_ready_cbs(rsclp)) | 325 | if (!rcu_segcblist_ready_cbs(rsclp)) |
201 | return; /* Nothing to do. */ | 326 | return; /* Nothing to do. */ |
202 | *rclp->tail = rsclp->head; | 327 | *rclp->tail = rsclp->head; |
203 | rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; | 328 | WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); |
204 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | 329 | WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); |
205 | rclp->tail = rsclp->tails[RCU_DONE_TAIL]; | 330 | rclp->tail = rsclp->tails[RCU_DONE_TAIL]; |
206 | for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) | 331 | for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) |
207 | if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) | 332 | if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) |
208 | rsclp->tails[i] = &rsclp->head; | 333 | WRITE_ONCE(rsclp->tails[i], &rsclp->head); |
209 | } | 334 | } |
210 | 335 | ||
211 | /* | 336 | /* |
@@ -224,9 +349,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, | |||
224 | return; /* Nothing to do. */ | 349 | return; /* Nothing to do. */ |
225 | *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; | 350 | *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; |
226 | rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; | 351 | rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; |
227 | *rsclp->tails[RCU_DONE_TAIL] = NULL; | 352 | WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); |
228 | for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) | 353 | for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) |
229 | rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; | 354 | WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); |
230 | } | 355 | } |
231 | 356 | ||
232 | /* | 357 | /* |
@@ -237,8 +362,7 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, | |||
237 | struct rcu_cblist *rclp) | 362 | struct rcu_cblist *rclp) |
238 | { | 363 | { |
239 | rsclp->len_lazy += rclp->len_lazy; | 364 | rsclp->len_lazy += rclp->len_lazy; |
240 | /* ->len sampled locklessly. */ | 365 | rcu_segcblist_add_len(rsclp, rclp->len); |
241 | WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); | ||
242 | rclp->len_lazy = 0; | 366 | rclp->len_lazy = 0; |
243 | rclp->len = 0; | 367 | rclp->len = 0; |
244 | } | 368 | } |
@@ -255,10 +379,10 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, | |||
255 | if (!rclp->head) | 379 | if (!rclp->head) |
256 | return; /* No callbacks to move. */ | 380 | return; /* No callbacks to move. */ |
257 | *rclp->tail = rsclp->head; | 381 | *rclp->tail = rsclp->head; |
258 | rsclp->head = rclp->head; | 382 | WRITE_ONCE(rsclp->head, rclp->head); |
259 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) | 383 | for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) |
260 | if (&rsclp->head == rsclp->tails[i]) | 384 | if (&rsclp->head == rsclp->tails[i]) |
261 | rsclp->tails[i] = rclp->tail; | 385 | WRITE_ONCE(rsclp->tails[i], rclp->tail); |
262 | else | 386 | else |
263 | break; | 387 | break; |
264 | rclp->head = NULL; | 388 | rclp->head = NULL; |
@@ -274,8 +398,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, | |||
274 | { | 398 | { |
275 | if (!rclp->head) | 399 | if (!rclp->head) |
276 | return; /* Nothing to do. */ | 400 | return; /* Nothing to do. */ |
277 | *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; | 401 | WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); |
278 | rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; | 402 | WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); |
279 | rclp->head = NULL; | 403 | rclp->head = NULL; |
280 | rclp->tail = &rclp->head; | 404 | rclp->tail = &rclp->head; |
281 | } | 405 | } |
@@ -299,7 +423,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) | |||
299 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | 423 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { |
300 | if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) | 424 | if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) |
301 | break; | 425 | break; |
302 | rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; | 426 | WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); |
303 | } | 427 | } |
304 | 428 | ||
305 | /* If no callbacks moved, nothing more need be done. */ | 429 | /* If no callbacks moved, nothing more need be done. */ |
@@ -308,7 +432,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) | |||
308 | 432 | ||
309 | /* Clean up tail pointers that might have been misordered above. */ | 433 | /* Clean up tail pointers that might have been misordered above. */ |
310 | for (j = RCU_WAIT_TAIL; j < i; j++) | 434 | for (j = RCU_WAIT_TAIL; j < i; j++) |
311 | rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; | 435 | WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]); |
312 | 436 | ||
313 | /* | 437 | /* |
314 | * Callbacks moved, so clean up the misordered ->tails[] pointers | 438 | * Callbacks moved, so clean up the misordered ->tails[] pointers |
@@ -319,7 +443,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) | |||
319 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | 443 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { |
320 | if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) | 444 | if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) |
321 | break; /* No more callbacks. */ | 445 | break; /* No more callbacks. */ |
322 | rsclp->tails[j] = rsclp->tails[i]; | 446 | WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); |
323 | rsclp->gp_seq[j] = rsclp->gp_seq[i]; | 447 | rsclp->gp_seq[j] = rsclp->gp_seq[i]; |
324 | } | 448 | } |
325 | } | 449 | } |
@@ -384,7 +508,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) | |||
384 | * structure other than in the RCU_NEXT_TAIL segment. | 508 | * structure other than in the RCU_NEXT_TAIL segment. |
385 | */ | 509 | */ |
386 | for (; i < RCU_NEXT_TAIL; i++) { | 510 | for (; i < RCU_NEXT_TAIL; i++) { |
387 | rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; | 511 | WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]); |
388 | rsclp->gp_seq[i] = seq; | 512 | rsclp->gp_seq[i] = seq; |
389 | } | 513 | } |
390 | return true; | 514 | return true; |
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 71b64648464e..815c2fdd3fcc 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h | |||
@@ -9,6 +9,12 @@ | |||
9 | 9 | ||
10 | #include <linux/rcu_segcblist.h> | 10 | #include <linux/rcu_segcblist.h> |
11 | 11 | ||
12 | /* Return number of callbacks in the specified callback list. */ | ||
13 | static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) | ||
14 | { | ||
15 | return READ_ONCE(rclp->len); | ||
16 | } | ||
17 | |||
12 | /* | 18 | /* |
13 | * Account for the fact that a previously dequeued callback turned out | 19 | * Account for the fact that a previously dequeued callback turned out |
14 | * to be marked as lazy. | 20 | * to be marked as lazy. |
@@ -19,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) | |||
19 | } | 25 | } |
20 | 26 | ||
21 | void rcu_cblist_init(struct rcu_cblist *rclp); | 27 | void rcu_cblist_init(struct rcu_cblist *rclp); |
28 | void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); | ||
29 | void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, | ||
30 | struct rcu_cblist *srclp, | ||
31 | struct rcu_head *rhp); | ||
22 | struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); | 32 | struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); |
23 | 33 | ||
24 | /* | 34 | /* |
@@ -36,13 +46,17 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); | |||
36 | */ | 46 | */ |
37 | static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) | 47 | static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) |
38 | { | 48 | { |
39 | return !rsclp->head; | 49 | return !READ_ONCE(rsclp->head); |
40 | } | 50 | } |
41 | 51 | ||
42 | /* Return number of callbacks in segmented callback list. */ | 52 | /* Return number of callbacks in segmented callback list. */ |
43 | static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) | 53 | static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) |
44 | { | 54 | { |
55 | #ifdef CONFIG_RCU_NOCB_CPU | ||
56 | return atomic_long_read(&rsclp->len); | ||
57 | #else | ||
45 | return READ_ONCE(rsclp->len); | 58 | return READ_ONCE(rsclp->len); |
59 | #endif | ||
46 | } | 60 | } |
47 | 61 | ||
48 | /* Return number of lazy callbacks in segmented callback list. */ | 62 | /* Return number of lazy callbacks in segmented callback list. */ |
@@ -54,16 +68,22 @@ static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) | |||
54 | /* Return number of non-lazy callbacks in segmented callback list. */ | 68 | /* Return number of non-lazy callbacks in segmented callback list. */
55 | static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) | 69 | static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) |
56 | { | 70 | { |
57 | return rsclp->len - rsclp->len_lazy; | 71 | return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; |
58 | } | 72 | } |
59 | 73 | ||
60 | /* | 74 | /* |
61 | * Is the specified rcu_segcblist enabled, for example, not corresponding | 75 | * Is the specified rcu_segcblist enabled, for example, not corresponding |
62 | * to an offline or callback-offloaded CPU? | 76 | * to an offline CPU? |
63 | */ | 77 | */ |
64 | static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) | 78 | static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) |
65 | { | 79 | { |
66 | return !!rsclp->tails[RCU_NEXT_TAIL]; | 80 | return rsclp->enabled; |
81 | } | ||
82 | |||
83 | /* Is the specified rcu_segcblist offloaded? */ | ||
84 | static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) | ||
85 | { | ||
86 | return rsclp->offloaded; | ||
67 | } | 87 | } |
68 | 88 | ||
69 | /* | 89 | /* |
@@ -73,36 +93,18 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) | |||
73 | */ | 93 | */ |
74 | static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) | 94 | static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) |
75 | { | 95 | { |
76 | return !*rsclp->tails[seg]; | 96 | return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); |
77 | } | ||
78 | |||
79 | /* | ||
80 | * Interim function to return rcu_segcblist head pointer. Longer term, the | ||
81 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
82 | * function. | ||
83 | */ | ||
84 | static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) | ||
85 | { | ||
86 | return rsclp->head; | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * Interim function to return rcu_segcblist head pointer. Longer term, the | ||
91 | * rcu_segcblist will be used more pervasively, removing the need for this | ||
92 | * function. | ||
93 | */ | ||
94 | static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) | ||
95 | { | ||
96 | WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); | ||
97 | return rsclp->tails[RCU_NEXT_TAIL]; | ||
98 | } | 97 | } |
99 | 98 | ||
99 | void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); | ||
100 | void rcu_segcblist_init(struct rcu_segcblist *rsclp); | 100 | void rcu_segcblist_init(struct rcu_segcblist *rsclp); |
101 | void rcu_segcblist_disable(struct rcu_segcblist *rsclp); | 101 | void rcu_segcblist_disable(struct rcu_segcblist *rsclp); |
102 | void rcu_segcblist_offload(struct rcu_segcblist *rsclp); | ||
102 | bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); | 103 | bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); |
103 | bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); | 104 | bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); |
104 | struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); | 105 | struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); |
105 | struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); | 106 | struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); |
107 | bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); | ||
106 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, | 108 | void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, |
107 | struct rcu_head *rhp, bool lazy); | 109 | struct rcu_head *rhp, bool lazy); |
108 | bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, | 110 | bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, |
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 7a6890b23c5f..5a879d073c1c 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c | |||
@@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable | |||
89 | 89 | ||
90 | static char *perf_type = "rcu"; | 90 | static char *perf_type = "rcu"; |
91 | module_param(perf_type, charp, 0444); | 91 | module_param(perf_type, charp, 0444); |
92 | MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); | 92 | MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); |
93 | 93 | ||
94 | static int nrealreaders; | 94 | static int nrealreaders; |
95 | static int nrealwriters; | 95 | static int nrealwriters; |
@@ -375,6 +375,14 @@ rcu_perf_writer(void *arg) | |||
375 | if (holdoff) | 375 | if (holdoff) |
376 | schedule_timeout_uninterruptible(holdoff * HZ); | 376 | schedule_timeout_uninterruptible(holdoff * HZ); |
377 | 377 | ||
378 | /* | ||
379 | * For normal GP tests, wait until rcu_end_inkernel_boot() is called | ||
380 | * so that RCU is not always expedited for them. | ||
381 | * The system_state test is approximate, but works well in practice. | ||
382 | */ | ||
383 | while (!gp_exp && system_state != SYSTEM_RUNNING) | ||
384 | schedule_timeout_uninterruptible(1); | ||
385 | |||
378 | t = ktime_get_mono_fast_ns(); | 386 | t = ktime_get_mono_fast_ns(); |
379 | if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { | 387 | if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { |
380 | t_rcu_perf_writer_started = t; | 388 | t_rcu_perf_writer_started = t; |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index fce4e7e6f502..3c9feca1eab1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers; | |||
161 | static long n_barrier_attempts; | 161 | static long n_barrier_attempts; |
162 | static long n_barrier_successes; /* did rcu_barrier test succeed? */ | 162 | static long n_barrier_successes; /* did rcu_barrier test succeed? */ |
163 | static struct list_head rcu_torture_removed; | 163 | static struct list_head rcu_torture_removed; |
164 | static unsigned long shutdown_jiffies; | ||
164 | 165 | ||
165 | static int rcu_torture_writer_state; | 166 | static int rcu_torture_writer_state; |
166 | #define RTWS_FIXED_DELAY 0 | 167 | #define RTWS_FIXED_DELAY 0 |
@@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void) | |||
228 | } | 229 | } |
229 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 230 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
230 | 231 | ||
232 | /* | ||
233 | * Stop aggressive CPU-hog tests a bit before the end of the test in order | ||
234 | * to avoid interfering with test shutdown. | ||
235 | */ | ||
236 | static bool shutdown_time_arrived(void) | ||
237 | { | ||
238 | return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ); | ||
239 | } | ||
240 | |||
231 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 241 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
232 | static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 242 | static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
233 | /* and boost task create/destroy. */ | 243 | /* and boost task create/destroy. */ |
@@ -1713,12 +1723,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) | |||
1713 | } | 1723 | } |
1714 | 1724 | ||
1715 | // Give the scheduler a chance, even on nohz_full CPUs. | 1725 | // Give the scheduler a chance, even on nohz_full CPUs. |
1716 | static void rcu_torture_fwd_prog_cond_resched(void) | 1726 | static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) |
1717 | { | 1727 | { |
1718 | if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { | 1728 | if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { |
1719 | if (need_resched()) | 1729 | // Real call_rcu() floods hit userspace, so emulate that. |
1730 | if (need_resched() || (iter & 0xfff)) | ||
1720 | schedule(); | 1731 | schedule(); |
1721 | } else { | 1732 | } else { |
1733 | // No userspace emulation: CB invocation throttles call_rcu() | ||
1722 | cond_resched(); | 1734 | cond_resched(); |
1723 | } | 1735 | } |
1724 | } | 1736 | } |
@@ -1746,7 +1758,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) | |||
1746 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); | 1758 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); |
1747 | kfree(rfcp); | 1759 | kfree(rfcp); |
1748 | freed++; | 1760 | freed++; |
1749 | rcu_torture_fwd_prog_cond_resched(); | 1761 | rcu_torture_fwd_prog_cond_resched(freed); |
1750 | } | 1762 | } |
1751 | return freed; | 1763 | return freed; |
1752 | } | 1764 | } |
@@ -1785,15 +1797,17 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) | |||
1785 | WRITE_ONCE(rcu_fwd_startat, jiffies); | 1797 | WRITE_ONCE(rcu_fwd_startat, jiffies); |
1786 | stopat = rcu_fwd_startat + dur; | 1798 | stopat = rcu_fwd_startat + dur; |
1787 | while (time_before(jiffies, stopat) && | 1799 | while (time_before(jiffies, stopat) && |
1800 | !shutdown_time_arrived() && | ||
1788 | !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { | 1801 | !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { |
1789 | idx = cur_ops->readlock(); | 1802 | idx = cur_ops->readlock(); |
1790 | udelay(10); | 1803 | udelay(10); |
1791 | cur_ops->readunlock(idx); | 1804 | cur_ops->readunlock(idx); |
1792 | if (!fwd_progress_need_resched || need_resched()) | 1805 | if (!fwd_progress_need_resched || need_resched()) |
1793 | rcu_torture_fwd_prog_cond_resched(); | 1806 | rcu_torture_fwd_prog_cond_resched(1); |
1794 | } | 1807 | } |
1795 | (*tested_tries)++; | 1808 | (*tested_tries)++; |
1796 | if (!time_before(jiffies, stopat) && | 1809 | if (!time_before(jiffies, stopat) && |
1810 | !shutdown_time_arrived() && | ||
1797 | !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { | 1811 | !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { |
1798 | (*tested)++; | 1812 | (*tested)++; |
1799 | cver = READ_ONCE(rcu_torture_current_version) - cver; | 1813 | cver = READ_ONCE(rcu_torture_current_version) - cver; |
@@ -1852,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void) | |||
1852 | gps = cur_ops->get_gp_seq(); | 1866 | gps = cur_ops->get_gp_seq(); |
1853 | rcu_launder_gp_seq_start = gps; | 1867 | rcu_launder_gp_seq_start = gps; |
1854 | while (time_before(jiffies, stopat) && | 1868 | while (time_before(jiffies, stopat) && |
1869 | !shutdown_time_arrived() && | ||
1855 | !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { | 1870 | !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { |
1856 | rfcp = READ_ONCE(rcu_fwd_cb_head); | 1871 | rfcp = READ_ONCE(rcu_fwd_cb_head); |
1857 | rfcpn = NULL; | 1872 | rfcpn = NULL; |
@@ -1875,7 +1890,7 @@ static void rcu_torture_fwd_prog_cr(void) | |||
1875 | rfcp->rfc_gps = 0; | 1890 | rfcp->rfc_gps = 0; |
1876 | } | 1891 | } |
1877 | cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); | 1892 | cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); |
1878 | rcu_torture_fwd_prog_cond_resched(); | 1893 | rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); |
1879 | } | 1894 | } |
1880 | stoppedat = jiffies; | 1895 | stoppedat = jiffies; |
1881 | n_launders_cb_snap = READ_ONCE(n_launders_cb); | 1896 | n_launders_cb_snap = READ_ONCE(n_launders_cb); |
@@ -1884,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void) | |||
1884 | cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ | 1899 | cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ |
1885 | (void)rcu_torture_fwd_prog_cbfree(); | 1900 | (void)rcu_torture_fwd_prog_cbfree(); |
1886 | 1901 | ||
1887 | if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { | 1902 | if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && |
1903 | !shutdown_time_arrived()) { | ||
1888 | WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); | 1904 | WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); |
1889 | pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", | 1905 | pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", |
1890 | __func__, | 1906 | __func__, |
@@ -2160,6 +2176,7 @@ rcu_torture_cleanup(void) | |||
2160 | return; | 2176 | return; |
2161 | } | 2177 | } |
2162 | 2178 | ||
2179 | show_rcu_gp_kthreads(); | ||
2163 | rcu_torture_barrier_cleanup(); | 2180 | rcu_torture_barrier_cleanup(); |
2164 | torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); | 2181 | torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); |
2165 | torture_stop_kthread(rcu_torture_stall, stall_task); | 2182 | torture_stop_kthread(rcu_torture_stall, stall_task); |
@@ -2465,6 +2482,7 @@ rcu_torture_init(void) | |||
2465 | goto unwind; | 2482 | goto unwind; |
2466 | rcutor_hp = firsterr; | 2483 | rcutor_hp = firsterr; |
2467 | } | 2484 | } |
2485 | shutdown_jiffies = jiffies + shutdown_secs * HZ; | ||
2468 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); | 2486 | firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); |
2469 | if (firsterr) | 2487 | if (firsterr) |
2470 | goto unwind; | 2488 | goto unwind; |
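The shutdown_time_arrived() helper added above is a standard jiffies comparison: the forward-progress CPU hogs back off 30 seconds before the scheduled shutdown so cleanup is not starved. A userspace model of the same wraparound-safe comparison (jiffies, HZ and time_after() are redefined locally):

    #include <stdio.h>
    #include <stdbool.h>

    #define HZ 1000UL
    #define time_after(a, b) ((long)((b) - (a)) < 0)    /* wraparound-safe */

    static unsigned long jiffies;                /* pretend tick counter */
    static unsigned long shutdown_secs = 120;
    static unsigned long shutdown_jiffies;

    static bool shutdown_time_arrived(void)
    {
        return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ);
    }

    int main(void)
    {
        shutdown_jiffies = jiffies + shutdown_secs * HZ;

        jiffies += 60 * HZ;        /* one minute in: keep hogging */
        printf("at 60s: %d\n", shutdown_time_arrived());
        jiffies += 35 * HZ;        /* 95 seconds in: within 30s of shutdown */
        printf("at 95s: %d\n", shutdown_time_arrived());
        return 0;
    }

This prints 0 and then 1, matching the two new loop-exit checks in rcu_torture_fwd_prog_nr() and rcu_torture_fwd_prog_cr().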
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index cf0e886314f2..5dffade2d7cd 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c | |||
@@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) | |||
1279 | 1279 | ||
1280 | c0 = l0 - u0; | 1280 | c0 = l0 - u0; |
1281 | c1 = l1 - u1; | 1281 | c1 = l1 - u1; |
1282 | pr_cont(" %d(%ld,%ld %1p)", | 1282 | pr_cont(" %d(%ld,%ld %c)", |
1283 | cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); | 1283 | cpu, c0, c1, |
1284 | "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); | ||
1284 | s0 += c0; | 1285 | s0 += c0; |
1285 | s1 += c1; | 1286 | s1 += c1; |
1286 | } | 1287 | } |
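The replacement statistic prints a flag character rather than a kernel pointer. The "C."[cond] idiom indexes a two-character string literal: rcu_segcblist_empty() returning false (0) selects 'C' for callbacks present, true (1) selects '.'. A two-line userspace check of that idiom:

    #include <stdio.h>

    int main(void)
    {
        int empty = 0;        /* pretend the per-CPU srcu_cblist is non-empty */

        printf("non-empty prints '%c', empty prints '%c'\n", "C."[empty], "C."[!empty]);
        return 0;
    }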
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a14e5fbbea46..71395e91b876 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/smpboot.h> | 56 | #include <linux/smpboot.h> |
57 | #include <linux/jiffies.h> | 57 | #include <linux/jiffies.h> |
58 | #include <linux/sched/isolation.h> | 58 | #include <linux/sched/isolation.h> |
59 | #include <linux/sched/clock.h> | ||
59 | #include "../time/tick-internal.h" | 60 | #include "../time/tick-internal.h" |
60 | 61 | ||
61 | #include "tree.h" | 62 | #include "tree.h" |
@@ -210,9 +211,9 @@ static long rcu_get_n_cbs_cpu(int cpu) | |||
210 | { | 211 | { |
211 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); | 212 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); |
212 | 213 | ||
213 | if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ | 214 | if (rcu_segcblist_is_enabled(&rdp->cblist)) |
214 | return rcu_segcblist_n_cbs(&rdp->cblist); | 215 | return rcu_segcblist_n_cbs(&rdp->cblist); |
215 | return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ | 216 | return 0; |
216 | } | 217 | } |
217 | 218 | ||
218 | void rcu_softirq_qs(void) | 219 | void rcu_softirq_qs(void) |
@@ -416,6 +417,12 @@ module_param(qlowmark, long, 0444); | |||
416 | static ulong jiffies_till_first_fqs = ULONG_MAX; | 417 | static ulong jiffies_till_first_fqs = ULONG_MAX; |
417 | static ulong jiffies_till_next_fqs = ULONG_MAX; | 418 | static ulong jiffies_till_next_fqs = ULONG_MAX; |
418 | static bool rcu_kick_kthreads; | 419 | static bool rcu_kick_kthreads; |
420 | static int rcu_divisor = 7; | ||
421 | module_param(rcu_divisor, int, 0644); | ||
422 | |||
423 | /* Force an exit from rcu_do_batch() after 3 milliseconds. */ | ||
424 | static long rcu_resched_ns = 3 * NSEC_PER_MSEC; | ||
425 | module_param(rcu_resched_ns, long, 0644); | ||
419 | 426 | ||
420 | /* | 427 | /* |
421 | * How long the grace period must be before we start recruiting | 428 | * How long the grace period must be before we start recruiting |
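The two new module parameters feed rcu_do_batch() further down: the batch limit is raised to pending >> rcu_divisor when the backlog is large, and any batch whose limit exceeds 100 also gets a wall-clock budget of rcu_resched_ns. A self-contained model of the limit computation (rcu_divisor and rcu_resched_ns defaults are copied from the hunk above; the blimit default of 10 is an assumption, not shown in this diff):

    #include <stdio.h>

    #define NSEC_PER_MSEC 1000000L

    static long blimit = 10;                 /* assumed default rdp->blimit */
    static int rcu_divisor = 7;
    static long rcu_resched_ns = 3 * NSEC_PER_MSEC;

    static long batch_limit(long pending)
    {
        long bl = blimit;

        if ((pending >> rcu_divisor) > bl)
            bl = pending >> rcu_divisor;
        return bl;
    }

    int main(void)
    {
        long pending;

        for (pending = 100; pending <= 100000; pending *= 10)
            printf("pending=%6ld -> bl=%4ld%s\n", pending, batch_limit(pending),
                   batch_limit(pending) > 100 ? " (time-limited)" : "");
        printf("time budget for big batches: %ld ms\n", rcu_resched_ns / NSEC_PER_MSEC);
        return 0;
    }

So a backlog of 100,000 callbacks lets one pass invoke up to 781 of them, but never for more than about 3 milliseconds.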
@@ -1251,6 +1258,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1251 | unsigned long gp_seq_req; | 1258 | unsigned long gp_seq_req; |
1252 | bool ret = false; | 1259 | bool ret = false; |
1253 | 1260 | ||
1261 | rcu_lockdep_assert_cblist_protected(rdp); | ||
1254 | raw_lockdep_assert_held_rcu_node(rnp); | 1262 | raw_lockdep_assert_held_rcu_node(rnp); |
1255 | 1263 | ||
1256 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1264 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
@@ -1292,7 +1300,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, | |||
1292 | unsigned long c; | 1300 | unsigned long c; |
1293 | bool needwake; | 1301 | bool needwake; |
1294 | 1302 | ||
1295 | lockdep_assert_irqs_disabled(); | 1303 | rcu_lockdep_assert_cblist_protected(rdp); |
1296 | c = rcu_seq_snap(&rcu_state.gp_seq); | 1304 | c = rcu_seq_snap(&rcu_state.gp_seq); |
1297 | if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { | 1305 | if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { |
1298 | /* Old request still live, so mark recent callbacks. */ | 1306 | /* Old request still live, so mark recent callbacks. */ |
@@ -1318,6 +1326,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, | |||
1318 | */ | 1326 | */ |
1319 | static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) | 1327 | static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) |
1320 | { | 1328 | { |
1329 | rcu_lockdep_assert_cblist_protected(rdp); | ||
1321 | raw_lockdep_assert_held_rcu_node(rnp); | 1330 | raw_lockdep_assert_held_rcu_node(rnp); |
1322 | 1331 | ||
1323 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ | 1332 | /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ |
@@ -1335,6 +1344,21 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1335 | } | 1344 | } |
1336 | 1345 | ||
1337 | /* | 1346 | /* |
1347 | * Move and classify callbacks, but only if doing so won't require | ||
1348 | * that the RCU grace-period kthread be awakened. | ||
1349 | */ | ||
1350 | static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, | ||
1351 | struct rcu_data *rdp) | ||
1352 | { | ||
1353 | rcu_lockdep_assert_cblist_protected(rdp); | ||
1354 | if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || | ||
1355 | !raw_spin_trylock_rcu_node(rnp)) | ||
1356 | return; | ||
1357 | WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); | ||
1358 | raw_spin_unlock_rcu_node(rnp); | ||
1359 | } | ||
1360 | |||
1361 | /* | ||
1338 | * Update CPU-local rcu_data state to record the beginnings and ends of | 1362 | * Update CPU-local rcu_data state to record the beginnings and ends of |
1339 | * grace periods. The caller must hold the ->lock of the leaf rcu_node | 1363 | * grace periods. The caller must hold the ->lock of the leaf rcu_node |
1340 | * structure corresponding to the current CPU, and must have irqs disabled. | 1364 | * structure corresponding to the current CPU, and must have irqs disabled. |
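The _nowake variant above advances callbacks only when doing so cannot end up needing a grace-period-kthread wakeup: a grace period must already be in progress (so no new one has to be requested) and the node lock must be available without waiting. The same best-effort, never-wake, never-block shape in a userspace sketch using pthreads:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdbool.h>

    static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool gp_in_progress;        /* stands in for rcu_seq_state(...) != 0 */

    static void advance_cbs(void)
    {
        /* the real work, assumed cheap once the lock is held */
    }

    /* Opportunistic: do the work only if it is free to do right now. */
    static bool advance_cbs_nowake(void)
    {
        if (!gp_in_progress || pthread_mutex_trylock(&node_lock) != 0)
            return false;        /* might need a wakeup, or would have to wait */
        advance_cbs();
        pthread_mutex_unlock(&node_lock);
        return true;
    }

    int main(void)
    {
        printf("advanced: %d\n", advance_cbs_nowake());
        gp_in_progress = true;
        printf("advanced: %d\n", advance_cbs_nowake());
        return 0;
    }

The WARN_ON_ONCE() around rcu_advance_cbs() in the kernel version asserts exactly this: under those two preconditions the advance must never report that a wakeup is needed.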
@@ -1342,8 +1366,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1342 | */ | 1366 | */ |
1343 | static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) | 1367 | static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) |
1344 | { | 1368 | { |
1345 | bool ret; | 1369 | bool ret = false; |
1346 | bool need_gp; | 1370 | bool need_gp; |
1371 | const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && | ||
1372 | rcu_segcblist_is_offloaded(&rdp->cblist); | ||
1347 | 1373 | ||
1348 | raw_lockdep_assert_held_rcu_node(rnp); | 1374 | raw_lockdep_assert_held_rcu_node(rnp); |
1349 | 1375 | ||
@@ -1353,10 +1379,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) | |||
1353 | /* Handle the ends of any preceding grace periods first. */ | 1379 | /* Handle the ends of any preceding grace periods first. */ |
1354 | if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || | 1380 | if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || |
1355 | unlikely(READ_ONCE(rdp->gpwrap))) { | 1381 | unlikely(READ_ONCE(rdp->gpwrap))) { |
1356 | ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ | 1382 | if (!offloaded) |
1383 | ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ | ||
1357 | trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); | 1384 | trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); |
1358 | } else { | 1385 | } else { |
1359 | ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ | 1386 | if (!offloaded) |
1387 | ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ | ||
1360 | } | 1388 | } |
1361 | 1389 | ||
1362 | /* Now handle the beginnings of any new-to-this-CPU grace periods. */ | 1390 | /* Now handle the beginnings of any new-to-this-CPU grace periods. */ |
@@ -1657,6 +1685,7 @@ static void rcu_gp_cleanup(void) | |||
1657 | unsigned long gp_duration; | 1685 | unsigned long gp_duration; |
1658 | bool needgp = false; | 1686 | bool needgp = false; |
1659 | unsigned long new_gp_seq; | 1687 | unsigned long new_gp_seq; |
1688 | bool offloaded; | ||
1660 | struct rcu_data *rdp; | 1689 | struct rcu_data *rdp; |
1661 | struct rcu_node *rnp = rcu_get_root(); | 1690 | struct rcu_node *rnp = rcu_get_root(); |
1662 | struct swait_queue_head *sq; | 1691 | struct swait_queue_head *sq; |
@@ -1722,7 +1751,9 @@ static void rcu_gp_cleanup(void) | |||
1722 | needgp = true; | 1751 | needgp = true; |
1723 | } | 1752 | } |
1724 | /* Advance CBs to reduce false positives below. */ | 1753 | /* Advance CBs to reduce false positives below. */ |
1725 | if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { | 1754 | offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && |
1755 | rcu_segcblist_is_offloaded(&rdp->cblist); | ||
1756 | if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { | ||
1726 | WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); | 1757 | WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); |
1727 | rcu_state.gp_req_activity = jiffies; | 1758 | rcu_state.gp_req_activity = jiffies; |
1728 | trace_rcu_grace_period(rcu_state.name, | 1759 | trace_rcu_grace_period(rcu_state.name, |
@@ -1916,7 +1947,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) | |||
1916 | { | 1947 | { |
1917 | unsigned long flags; | 1948 | unsigned long flags; |
1918 | unsigned long mask; | 1949 | unsigned long mask; |
1919 | bool needwake; | 1950 | bool needwake = false; |
1951 | const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && | ||
1952 | rcu_segcblist_is_offloaded(&rdp->cblist); | ||
1920 | struct rcu_node *rnp; | 1953 | struct rcu_node *rnp; |
1921 | 1954 | ||
1922 | rnp = rdp->mynode; | 1955 | rnp = rdp->mynode; |
@@ -1943,7 +1976,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) | |||
1943 | * This GP can't end until cpu checks in, so all of our | 1976 | * This GP can't end until cpu checks in, so all of our |
1944 | * callbacks can be processed during the next GP. | 1977 | * callbacks can be processed during the next GP. |
1945 | */ | 1978 | */ |
1946 | needwake = rcu_accelerate_cbs(rnp, rdp); | 1979 | if (!offloaded) |
1980 | needwake = rcu_accelerate_cbs(rnp, rdp); | ||
1947 | 1981 | ||
1948 | rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); | 1982 | rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); |
1949 | /* ^^^ Released rnp->lock */ | 1983 | /* ^^^ Released rnp->lock */ |
@@ -2077,9 +2111,12 @@ int rcutree_dead_cpu(unsigned int cpu) | |||
2077 | static void rcu_do_batch(struct rcu_data *rdp) | 2111 | static void rcu_do_batch(struct rcu_data *rdp) |
2078 | { | 2112 | { |
2079 | unsigned long flags; | 2113 | unsigned long flags; |
2114 | const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && | ||
2115 | rcu_segcblist_is_offloaded(&rdp->cblist); | ||
2080 | struct rcu_head *rhp; | 2116 | struct rcu_head *rhp; |
2081 | struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); | 2117 | struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); |
2082 | long bl, count; | 2118 | long bl, count; |
2119 | long pending, tlimit = 0; | ||
2083 | 2120 | ||
2084 | /* If no callbacks are ready, just return. */ | 2121 | /* If no callbacks are ready, just return. */ |
2085 | if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { | 2122 | if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { |
@@ -2099,13 +2136,19 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
2099 | * callback counts, as rcu_barrier() needs to be conservative. | 2136 | * callback counts, as rcu_barrier() needs to be conservative. |
2100 | */ | 2137 | */ |
2101 | local_irq_save(flags); | 2138 | local_irq_save(flags); |
2139 | rcu_nocb_lock(rdp); | ||
2102 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | 2140 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); |
2103 | bl = rdp->blimit; | 2141 | pending = rcu_segcblist_n_cbs(&rdp->cblist); |
2142 | bl = max(rdp->blimit, pending >> rcu_divisor); | ||
2143 | if (unlikely(bl > 100)) | ||
2144 | tlimit = local_clock() + rcu_resched_ns; | ||
2104 | trace_rcu_batch_start(rcu_state.name, | 2145 | trace_rcu_batch_start(rcu_state.name, |
2105 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), | 2146 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), |
2106 | rcu_segcblist_n_cbs(&rdp->cblist), bl); | 2147 | rcu_segcblist_n_cbs(&rdp->cblist), bl); |
2107 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); | 2148 | rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); |
2108 | local_irq_restore(flags); | 2149 | if (offloaded) |
2150 | rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); | ||
2151 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
2109 | 2152 | ||
2110 | /* Invoke callbacks. */ | 2153 | /* Invoke callbacks. */ |
2111 | rhp = rcu_cblist_dequeue(&rcl); | 2154 | rhp = rcu_cblist_dequeue(&rcl); |
@@ -2117,13 +2160,29 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
2117 | * Stop only if limit reached and CPU has something to do. | 2160 | * Stop only if limit reached and CPU has something to do. |
2118 | * Note: The rcl structure counts down from zero. | 2161 | * Note: The rcl structure counts down from zero. |
2119 | */ | 2162 | */ |
2120 | if (-rcl.len >= bl && | 2163 | if (-rcl.len >= bl && !offloaded && |
2121 | (need_resched() || | 2164 | (need_resched() || |
2122 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | 2165 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) |
2123 | break; | 2166 | break; |
2167 | if (unlikely(tlimit)) { | ||
2168 | /* only call local_clock() every 32 callbacks */ | ||
2169 | if (likely((-rcl.len & 31) || local_clock() < tlimit)) | ||
2170 | continue; | ||
2171 | /* Exceeded the time limit, so leave. */ | ||
2172 | break; | ||
2173 | } | ||
2174 | if (offloaded) { | ||
2175 | WARN_ON_ONCE(in_serving_softirq()); | ||
2176 | local_bh_enable(); | ||
2177 | lockdep_assert_irqs_enabled(); | ||
2178 | cond_resched_tasks_rcu_qs(); | ||
2179 | lockdep_assert_irqs_enabled(); | ||
2180 | local_bh_disable(); | ||
2181 | } | ||
2124 | } | 2182 | } |
2125 | 2183 | ||
2126 | local_irq_save(flags); | 2184 | local_irq_save(flags); |
2185 | rcu_nocb_lock(rdp); | ||
2127 | count = -rcl.len; | 2186 | count = -rcl.len; |
2128 | trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), | 2187 | trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), |
2129 | is_idle_task(current), rcu_is_callbacks_kthread()); | 2188 | is_idle_task(current), rcu_is_callbacks_kthread()); |
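The new exit path in the loop above polls the clock only once every 32 callbacks so the common case stays cheap, and abandons the batch once the deadline derived from rcu_resched_ns has passed. A userspace model of that throttled deadline check, using clock_gettime() in place of local_clock():

    #define _POSIX_C_SOURCE 199309L
    #include <stdio.h>
    #include <time.h>

    static long long now_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main(void)
    {
        long long tlimit = now_ns() + 3000000;   /* 3 ms budget, as in the patch */
        long done = 0;

        for (;;) {
            done++;                              /* stands in for invoking one callback */
            /* Cheap path: skip the clock read on 31 of every 32 callbacks. */
            if ((done & 31) || now_ns() < tlimit)
                continue;
            break;                               /* time budget exhausted, leave the batch */
        }
        printf("invoked %ld callbacks before hitting the 3 ms budget\n", done);
        return 0;
    }

As in rcu_do_batch(), the deadline is only ever checked on the iterations where the low five bits of the count are zero, so the limit can be overshot by at most 31 callbacks.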
@@ -2149,12 +2208,14 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
2149 | * The following usually indicates a double call_rcu(). To track | 2208 | * The following usually indicates a double call_rcu(). To track |
2150 | * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. | 2209 | * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. |
2151 | */ | 2210 | */ |
2152 | WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); | 2211 | WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); |
2212 | WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && | ||
2213 | count != 0 && rcu_segcblist_empty(&rdp->cblist)); | ||
2153 | 2214 | ||
2154 | local_irq_restore(flags); | 2215 | rcu_nocb_unlock_irqrestore(rdp, flags); |
2155 | 2216 | ||
2156 | /* Re-invoke RCU core processing if there are callbacks remaining. */ | 2217 | /* Re-invoke RCU core processing if there are callbacks remaining. */ |
2157 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) | 2218 | if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) |
2158 | invoke_rcu_core(); | 2219 | invoke_rcu_core(); |
2159 | } | 2220 | } |
2160 | 2221 | ||
@@ -2280,6 +2341,8 @@ static __latent_entropy void rcu_core(void) | |||
2280 | unsigned long flags; | 2341 | unsigned long flags; |
2281 | struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); | 2342 | struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); |
2282 | struct rcu_node *rnp = rdp->mynode; | 2343 | struct rcu_node *rnp = rdp->mynode; |
2344 | const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && | ||
2345 | rcu_segcblist_is_offloaded(&rdp->cblist); | ||
2283 | 2346 | ||
2284 | if (cpu_is_offline(smp_processor_id())) | 2347 | if (cpu_is_offline(smp_processor_id())) |
2285 | return; | 2348 | return; |
@@ -2299,7 +2362,7 @@ static __latent_entropy void rcu_core(void) | |||
2299 | 2362 | ||
2300 | /* No grace period and unregistered callbacks? */ | 2363 | /* No grace period and unregistered callbacks? */ |
2301 | if (!rcu_gp_in_progress() && | 2364 | if (!rcu_gp_in_progress() && |
2302 | rcu_segcblist_is_enabled(&rdp->cblist)) { | 2365 | rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { |
2303 | local_irq_save(flags); | 2366 | local_irq_save(flags); |
2304 | if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) | 2367 | if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) |
2305 | rcu_accelerate_cbs_unlocked(rnp, rdp); | 2368 | rcu_accelerate_cbs_unlocked(rnp, rdp); |
@@ -2309,7 +2372,7 @@ static __latent_entropy void rcu_core(void) | |||
2309 | rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); | 2372 | rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); |
2310 | 2373 | ||
2311 | /* If there are callbacks ready, invoke them. */ | 2374 | /* If there are callbacks ready, invoke them. */ |
2312 | if (rcu_segcblist_ready_cbs(&rdp->cblist) && | 2375 | if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && |
2313 | likely(READ_ONCE(rcu_scheduler_fully_active))) | 2376 | likely(READ_ONCE(rcu_scheduler_fully_active))) |
2314 | rcu_do_batch(rdp); | 2377 | rcu_do_batch(rdp); |
2315 | 2378 | ||
@@ -2489,10 +2552,11 @@ static void rcu_leak_callback(struct rcu_head *rhp) | |||
2489 | * is expected to specify a CPU. | 2552 | * is expected to specify a CPU. |
2490 | */ | 2553 | */ |
2491 | static void | 2554 | static void |
2492 | __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) | 2555 | __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) |
2493 | { | 2556 | { |
2494 | unsigned long flags; | 2557 | unsigned long flags; |
2495 | struct rcu_data *rdp; | 2558 | struct rcu_data *rdp; |
2559 | bool was_alldone; | ||
2496 | 2560 | ||
2497 | /* Misaligned rcu_head! */ | 2561 | /* Misaligned rcu_head! */ |
2498 | WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); | 2562 | WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); |
@@ -2514,28 +2578,18 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) | |||
2514 | rdp = this_cpu_ptr(&rcu_data); | 2578 | rdp = this_cpu_ptr(&rcu_data); |
2515 | 2579 | ||
2516 | /* Add the callback to our list. */ | 2580 | /* Add the callback to our list. */ |
2517 | if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { | 2581 | if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { |
2518 | int offline; | 2582 | // This can trigger due to call_rcu() from offline CPU: |
2519 | 2583 | WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE); | |
2520 | if (cpu != -1) | ||
2521 | rdp = per_cpu_ptr(&rcu_data, cpu); | ||
2522 | if (likely(rdp->mynode)) { | ||
2523 | /* Post-boot, so this should be for a no-CBs CPU. */ | ||
2524 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); | ||
2525 | WARN_ON_ONCE(offline); | ||
2526 | /* Offline CPU, _call_rcu() illegal, leak callback. */ | ||
2527 | local_irq_restore(flags); | ||
2528 | return; | ||
2529 | } | ||
2530 | /* | ||
2531 | * Very early boot, before rcu_init(). Initialize if needed | ||
2532 | * and then drop through to queue the callback. | ||
2533 | */ | ||
2534 | WARN_ON_ONCE(cpu != -1); | ||
2535 | WARN_ON_ONCE(!rcu_is_watching()); | 2584 | WARN_ON_ONCE(!rcu_is_watching()); |
2585 | // Very early boot, before rcu_init(). Initialize if needed | ||
2586 | // and then drop through to queue the callback. | ||
2536 | if (rcu_segcblist_empty(&rdp->cblist)) | 2587 | if (rcu_segcblist_empty(&rdp->cblist)) |
2537 | rcu_segcblist_init(&rdp->cblist); | 2588 | rcu_segcblist_init(&rdp->cblist); |
2538 | } | 2589 | } |
2590 | if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) | ||
2591 | return; // Enqueued onto ->nocb_bypass, so just leave. | ||
2592 | /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ | ||
2539 | rcu_segcblist_enqueue(&rdp->cblist, head, lazy); | 2593 | rcu_segcblist_enqueue(&rdp->cblist, head, lazy); |
2540 | if (__is_kfree_rcu_offset((unsigned long)func)) | 2594 | if (__is_kfree_rcu_offset((unsigned long)func)) |
2541 | trace_rcu_kfree_callback(rcu_state.name, head, | 2595 | trace_rcu_kfree_callback(rcu_state.name, head, |
@@ -2548,8 +2602,13 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) | |||
2548 | rcu_segcblist_n_cbs(&rdp->cblist)); | 2602 | rcu_segcblist_n_cbs(&rdp->cblist)); |
2549 | 2603 | ||
2550 | /* Go handle any RCU core processing required. */ | 2604 | /* Go handle any RCU core processing required. */ |
2551 | __call_rcu_core(rdp, head, flags); | 2605 | if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && |
2552 | local_irq_restore(flags); | 2606 | unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { |
2607 | __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ | ||
2608 | } else { | ||
2609 | __call_rcu_core(rdp, head, flags); | ||
2610 | local_irq_restore(flags); | ||
2611 | } | ||
2553 | } | 2612 | } |
2554 | 2613 | ||
2555 | /** | 2614 | /** |
@@ -2589,7 +2648,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) | |||
2589 | */ | 2648 | */ |
2590 | void call_rcu(struct rcu_head *head, rcu_callback_t func) | 2649 | void call_rcu(struct rcu_head *head, rcu_callback_t func) |
2591 | { | 2650 | { |
2592 | __call_rcu(head, func, -1, 0); | 2651 | __call_rcu(head, func, 0); |
2593 | } | 2652 | } |
2594 | EXPORT_SYMBOL_GPL(call_rcu); | 2653 | EXPORT_SYMBOL_GPL(call_rcu); |
2595 | 2654 | ||
@@ -2602,7 +2661,7 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
2602 | */ | 2661 | */ |
2603 | void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) | 2662 | void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) |
2604 | { | 2663 | { |
2605 | __call_rcu(head, func, -1, 1); | 2664 | __call_rcu(head, func, 1); |
2606 | } | 2665 | } |
2607 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | 2666 | EXPORT_SYMBOL_GPL(kfree_call_rcu); |
2608 | 2667 | ||
@@ -2735,6 +2794,10 @@ static int rcu_pending(void) | |||
2735 | /* Check for CPU stalls, if enabled. */ | 2794 | /* Check for CPU stalls, if enabled. */ |
2736 | check_cpu_stall(rdp); | 2795 | check_cpu_stall(rdp); |
2737 | 2796 | ||
2797 | /* Does this CPU need a deferred NOCB wakeup? */ | ||
2798 | if (rcu_nocb_need_deferred_wakeup(rdp)) | ||
2799 | return 1; | ||
2800 | |||
2738 | /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ | 2801 | /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ |
2739 | if (rcu_nohz_full_cpu()) | 2802 | if (rcu_nohz_full_cpu()) |
2740 | return 0; | 2803 | return 0; |
@@ -2750,6 +2813,8 @@ static int rcu_pending(void) | |||
2750 | /* Has RCU gone idle with this CPU needing another grace period? */ | 2813 | /* Has RCU gone idle with this CPU needing another grace period? */ |
2751 | if (!rcu_gp_in_progress() && | 2814 | if (!rcu_gp_in_progress() && |
2752 | rcu_segcblist_is_enabled(&rdp->cblist) && | 2815 | rcu_segcblist_is_enabled(&rdp->cblist) && |
2816 | (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || | ||
2817 | !rcu_segcblist_is_offloaded(&rdp->cblist)) && | ||
2753 | !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) | 2818 | !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) |
2754 | return 1; | 2819 | return 1; |
2755 | 2820 | ||
@@ -2758,10 +2823,6 @@ static int rcu_pending(void) | |||
2758 | unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ | 2823 | unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ |
2759 | return 1; | 2824 | return 1; |
2760 | 2825 | ||
2761 | /* Does this CPU need a deferred NOCB wakeup? */ | ||
2762 | if (rcu_nocb_need_deferred_wakeup(rdp)) | ||
2763 | return 1; | ||
2764 | |||
2765 | /* nothing to do */ | 2826 | /* nothing to do */ |
2766 | return 0; | 2827 | return 0; |
2767 | } | 2828 | } |
@@ -2801,6 +2862,8 @@ static void rcu_barrier_func(void *unused) | |||
2801 | rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); | 2862 | rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); |
2802 | rdp->barrier_head.func = rcu_barrier_callback; | 2863 | rdp->barrier_head.func = rcu_barrier_callback; |
2803 | debug_rcu_head_queue(&rdp->barrier_head); | 2864 | debug_rcu_head_queue(&rdp->barrier_head); |
2865 | rcu_nocb_lock(rdp); | ||
2866 | WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); | ||
2804 | if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { | 2867 | if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { |
2805 | atomic_inc(&rcu_state.barrier_cpu_count); | 2868 | atomic_inc(&rcu_state.barrier_cpu_count); |
2806 | } else { | 2869 | } else { |
@@ -2808,6 +2871,7 @@ static void rcu_barrier_func(void *unused) | |||
2808 | rcu_barrier_trace(TPS("IRQNQ"), -1, | 2871 | rcu_barrier_trace(TPS("IRQNQ"), -1, |
2809 | rcu_state.barrier_sequence); | 2872 | rcu_state.barrier_sequence); |
2810 | } | 2873 | } |
2874 | rcu_nocb_unlock(rdp); | ||
2811 | } | 2875 | } |
2812 | 2876 | ||
2813 | /** | 2877 | /** |
@@ -2858,22 +2922,11 @@ void rcu_barrier(void) | |||
2858 | * corresponding CPU's preceding callbacks have been invoked. | 2922 | * corresponding CPU's preceding callbacks have been invoked. |
2859 | */ | 2923 | */ |
2860 | for_each_possible_cpu(cpu) { | 2924 | for_each_possible_cpu(cpu) { |
2861 | if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) | ||
2862 | continue; | ||
2863 | rdp = per_cpu_ptr(&rcu_data, cpu); | 2925 | rdp = per_cpu_ptr(&rcu_data, cpu); |
2864 | if (rcu_is_nocb_cpu(cpu)) { | 2926 | if (!cpu_online(cpu) && |
2865 | if (!rcu_nocb_cpu_needs_barrier(cpu)) { | 2927 | !rcu_segcblist_is_offloaded(&rdp->cblist)) |
2866 | rcu_barrier_trace(TPS("OfflineNoCB"), cpu, | 2928 | continue; |
2867 | rcu_state.barrier_sequence); | 2929 | if (rcu_segcblist_n_cbs(&rdp->cblist)) { |
2868 | } else { | ||
2869 | rcu_barrier_trace(TPS("OnlineNoCB"), cpu, | ||
2870 | rcu_state.barrier_sequence); | ||
2871 | smp_mb__before_atomic(); | ||
2872 | atomic_inc(&rcu_state.barrier_cpu_count); | ||
2873 | __call_rcu(&rdp->barrier_head, | ||
2874 | rcu_barrier_callback, cpu, 0); | ||
2875 | } | ||
2876 | } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { | ||
2877 | rcu_barrier_trace(TPS("OnlineQ"), cpu, | 2930 | rcu_barrier_trace(TPS("OnlineQ"), cpu, |
2878 | rcu_state.barrier_sequence); | 2931 | rcu_state.barrier_sequence); |
2879 | smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); | 2932 | smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); |
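The rewritten rcu_barrier() loop no longer special-cases no-CBs CPUs: it skips only CPUs that are both offline and not offloaded, then sends the barrier cross-call to any remaining CPU whose ->cblist is non-empty. A small model of that selection, with made-up fields standing in for the kernel predicates:

        #include <stdbool.h>
        #include <stdio.h>

        struct model_cpu {
                bool online;
                bool offloaded;       /* callbacks handled by rcuo kthreads */
                unsigned long n_cbs;  /* rcu_segcblist_n_cbs() stand-in */
        };

        static void barrier_select(const struct model_cpu *cpus, int ncpus)
        {
                for (int cpu = 0; cpu < ncpus; cpu++) {
                        /* Offline and not offloaded: nothing can be queued there. */
                        if (!cpus[cpu].online && !cpus[cpu].offloaded)
                                continue;
                        if (cpus[cpu].n_cbs)
                                printf("cpu %d: queue barrier callback\n", cpu);
                }
        }

        int main(void)
        {
                struct model_cpu cpus[] = {
                        { .online = true,  .offloaded = false, .n_cbs = 3 },
                        { .online = false, .offloaded = true,  .n_cbs = 1 },
                        { .online = false, .offloaded = false, .n_cbs = 0 },
                };

                barrier_select(cpus, 3);
                return 0;
        }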
@@ -2958,7 +3011,8 @@ rcu_boot_init_percpu_data(int cpu) | |||
2958 | * Initializes a CPU's per-CPU RCU data. Note that only one online or | 3011 | * Initializes a CPU's per-CPU RCU data. Note that only one online or |
2959 | * offline event can be happening at a given time. Note also that we can | 3012 | * offline event can be happening at a given time. Note also that we can |
2960 | * accept some slop in the rsp->gp_seq access due to the fact that this | 3013 | * accept some slop in the rsp->gp_seq access due to the fact that this |
2961 | * CPU cannot possibly have any RCU callbacks in flight yet. | 3014 | * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet. |
3015 | * And any offloaded callbacks are being numbered elsewhere. | ||
2962 | */ | 3016 | */ |
2963 | int rcutree_prepare_cpu(unsigned int cpu) | 3017 | int rcutree_prepare_cpu(unsigned int cpu) |
2964 | { | 3018 | { |
@@ -2972,7 +3026,7 @@ int rcutree_prepare_cpu(unsigned int cpu) | |||
2972 | rdp->n_force_qs_snap = rcu_state.n_force_qs; | 3026 | rdp->n_force_qs_snap = rcu_state.n_force_qs; |
2973 | rdp->blimit = blimit; | 3027 | rdp->blimit = blimit; |
2974 | if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ | 3028 | if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ |
2975 | !init_nocb_callback_list(rdp)) | 3029 | !rcu_segcblist_is_offloaded(&rdp->cblist)) |
2976 | rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ | 3030 | rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ |
2977 | rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ | 3031 | rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ |
2978 | rcu_dynticks_eqs_online(); | 3032 | rcu_dynticks_eqs_online(); |
@@ -3151,29 +3205,38 @@ void rcutree_migrate_callbacks(int cpu) | |||
3151 | { | 3205 | { |
3152 | unsigned long flags; | 3206 | unsigned long flags; |
3153 | struct rcu_data *my_rdp; | 3207 | struct rcu_data *my_rdp; |
3208 | struct rcu_node *my_rnp; | ||
3154 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); | 3209 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); |
3155 | struct rcu_node *rnp_root = rcu_get_root(); | ||
3156 | bool needwake; | 3210 | bool needwake; |
3157 | 3211 | ||
3158 | if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) | 3212 | if (rcu_segcblist_is_offloaded(&rdp->cblist) || |
3213 | rcu_segcblist_empty(&rdp->cblist)) | ||
3159 | return; /* No callbacks to migrate. */ | 3214 | return; /* No callbacks to migrate. */ |
3160 | 3215 | ||
3161 | local_irq_save(flags); | 3216 | local_irq_save(flags); |
3162 | my_rdp = this_cpu_ptr(&rcu_data); | 3217 | my_rdp = this_cpu_ptr(&rcu_data); |
3163 | if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { | 3218 | my_rnp = my_rdp->mynode; |
3164 | local_irq_restore(flags); | 3219 | rcu_nocb_lock(my_rdp); /* irqs already disabled. */ |
3165 | return; | 3220 | WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies)); |
3166 | } | 3221 | raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ |
3167 | raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ | ||
3168 | /* Leverage recent GPs and set GP for new callbacks. */ | 3222 | /* Leverage recent GPs and set GP for new callbacks. */ |
3169 | needwake = rcu_advance_cbs(rnp_root, rdp) || | 3223 | needwake = rcu_advance_cbs(my_rnp, rdp) || |
3170 | rcu_advance_cbs(rnp_root, my_rdp); | 3224 | rcu_advance_cbs(my_rnp, my_rdp); |
3171 | rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); | 3225 | rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); |
3226 | needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); | ||
3227 | rcu_segcblist_disable(&rdp->cblist); | ||
3172 | WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != | 3228 | WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != |
3173 | !rcu_segcblist_n_cbs(&my_rdp->cblist)); | 3229 | !rcu_segcblist_n_cbs(&my_rdp->cblist)); |
3174 | raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); | 3230 | if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { |
3231 | raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ | ||
3232 | __call_rcu_nocb_wake(my_rdp, true, flags); | ||
3233 | } else { | ||
3234 | rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ | ||
3235 | raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); | ||
3236 | } | ||
3175 | if (needwake) | 3237 | if (needwake) |
3176 | rcu_gp_kthread_wake(); | 3238 | rcu_gp_kthread_wake(); |
3239 | lockdep_assert_irqs_enabled(); | ||
3177 | WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || | 3240 | WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || |
3178 | !rcu_segcblist_empty(&rdp->cblist), | 3241 | !rcu_segcblist_empty(&rdp->cblist), |
3179 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", | 3242 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", |
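The reworked rcutree_migrate_callbacks() adopts a dead CPU's callbacks under the surviving CPU's ->nocb_lock and leaf rcu_node lock, flushes the survivor's bypass first, re-advances callbacks, and finally disables the donor's list. The sketch below models only the adopt-then-disable core under stated assumptions (flat lists, one mutex in place of the kernel locks); the grace-period bookkeeping is omitted.

        #include <pthread.h>
        #include <stdbool.h>
        #include <stddef.h>

        struct cb { struct cb *next; };

        struct model_rdp {
                pthread_mutex_t nocb_lock;
                bool offloaded;
                bool enabled;
                struct cb *cblist;
        };

        static void splice(struct cb **dst, struct cb **src)
        {
                while (*dst)
                        dst = &(*dst)->next;
                *dst = *src;
                *src = NULL;
        }

        /*
         * Model of the migration core: the surviving CPU adopts the dead CPU's
         * callbacks, then disables the donor list so nothing lands there later.
         */
        static void migrate_callbacks(struct model_rdp *my, struct model_rdp *dead)
        {
                if (dead->offloaded || !dead->cblist)
                        return;            /* rcuo kthreads own it, or nothing to move */
                pthread_mutex_lock(&my->nocb_lock);
                splice(&my->cblist, &dead->cblist);
                dead->enabled = false;     /* rcu_segcblist_disable() stand-in */
                pthread_mutex_unlock(&my->nocb_lock);
        }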
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7acaf3a62d39..c612f306fe89 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -194,29 +194,38 @@ struct rcu_data { | |||
194 | 194 | ||
195 | /* 5) Callback offloading. */ | 195 | /* 5) Callback offloading. */ |
196 | #ifdef CONFIG_RCU_NOCB_CPU | 196 | #ifdef CONFIG_RCU_NOCB_CPU |
197 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | 197 | struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ |
198 | struct rcu_head **nocb_tail; | 198 | struct task_struct *nocb_gp_kthread; |
199 | atomic_long_t nocb_q_count; /* # CBs waiting for nocb */ | ||
200 | atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ | ||
201 | struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ | ||
202 | struct rcu_head **nocb_follower_tail; | ||
203 | struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ | ||
204 | struct task_struct *nocb_kthread; | ||
205 | raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ | 199 | raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ |
200 | atomic_t nocb_lock_contended; /* Contention experienced. */ | ||
206 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ | 201 | int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ |
207 | struct timer_list nocb_timer; /* Enforce finite deferral. */ | 202 | struct timer_list nocb_timer; /* Enforce finite deferral. */ |
208 | 203 | unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */ | |
209 | /* The following fields are used by the leader, hence own cacheline. */ | 204 | |
210 | struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; | 205 | /* The following fields are used by call_rcu, hence own cacheline. */ |
211 | /* CBs waiting for GP. */ | 206 | raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp; |
212 | struct rcu_head **nocb_gp_tail; | 207 | struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */ |
213 | bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ | 208 | unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */ |
214 | struct rcu_data *nocb_next_follower; | 209 | unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */ |
215 | /* Next follower in wakeup chain. */ | 210 | int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */ |
216 | 211 | ||
217 | /* The following fields are used by the follower, hence new cachline. */ | 212 | /* The following fields are used by GP kthread, hence own cacheline. */ |
218 | struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; | 213 | raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp; |
219 | /* Leader CPU takes GP-end wakeups. */ | 214 | struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */ |
215 | u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */ | ||
216 | u8 nocb_gp_bypass; /* Found a bypass on last scan? */ | ||
217 | u8 nocb_gp_gp; /* GP to wait for on last scan? */ | ||
218 | unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */ | ||
219 | unsigned long nocb_gp_loops; /* # passes through wait code. */ | ||
220 | struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ | ||
221 | bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ | ||
222 | struct task_struct *nocb_cb_kthread; | ||
223 | struct rcu_data *nocb_next_cb_rdp; | ||
224 | /* Next rcu_data in wakeup chain. */ | ||
225 | |||
226 | /* The following fields are used by CB kthread, hence new cacheline. */ | ||
227 | struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; | ||
228 | /* GP rdp takes GP-end wakeups. */ | ||
220 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 229 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
221 | 230 | ||
222 | /* 6) RCU priority boosting. */ | 231 | /* 6) RCU priority boosting. */ |
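The restructured NOCB fields in struct rcu_data are grouped by writer: call_rcu() state, grace-period-kthread state, and callback-kthread state each start on their own internode-aligned cacheline to avoid false sharing. A rough standard-C model of that layout intent, using explicit padding and invented counter names rather than the real fields:

        #include <stdint.h>

        #define CACHELINE 64  /* assumed line size for this model */

        /*
         * Fields grouped by which context writes them, each group padded out
         * to a full cacheline to avoid false sharing -- the intent behind
         * ____cacheline_internodealigned_in_smp in the hunk above.
         */
        struct nocb_model {
                /* Written from call_rcu() on this CPU. */
                uint64_t bypass_enqueues;
                uint64_t bypass_first_jiffies;
                char pad0[CACHELINE - 2 * sizeof(uint64_t)];

                /* Written by the no-CBs grace-period kthread. */
                uint64_t gp_loops;
                uint8_t gp_sleep;
                char pad1[CACHELINE - sizeof(uint64_t) - sizeof(uint8_t)];

                /* Written by the callback-invocation kthread. */
                uint64_t cbs_invoked;
                char pad2[CACHELINE - sizeof(uint64_t)];
        };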
@@ -419,25 +428,39 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | |||
419 | static bool rcu_preempt_need_deferred_qs(struct task_struct *t); | 428 | static bool rcu_preempt_need_deferred_qs(struct task_struct *t); |
420 | static void rcu_preempt_deferred_qs(struct task_struct *t); | 429 | static void rcu_preempt_deferred_qs(struct task_struct *t); |
421 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 430 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
422 | static bool rcu_nocb_cpu_needs_barrier(int cpu); | ||
423 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); | 431 | static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); |
424 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); | 432 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); |
425 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 433 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
426 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 434 | static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, |
427 | bool lazy, unsigned long flags); | 435 | unsigned long j); |
428 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, | 436 | static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, |
429 | struct rcu_data *rdp, | 437 | bool *was_alldone, unsigned long flags); |
430 | unsigned long flags); | 438 | static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, |
439 | unsigned long flags); | ||
431 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); | 440 | static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); |
432 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); | 441 | static void do_nocb_deferred_wakeup(struct rcu_data *rdp); |
433 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 442 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
434 | static void rcu_spawn_cpu_nocb_kthread(int cpu); | 443 | static void rcu_spawn_cpu_nocb_kthread(int cpu); |
435 | static void __init rcu_spawn_nocb_kthreads(void); | 444 | static void __init rcu_spawn_nocb_kthreads(void); |
445 | static void show_rcu_nocb_state(struct rcu_data *rdp); | ||
446 | static void rcu_nocb_lock(struct rcu_data *rdp); | ||
447 | static void rcu_nocb_unlock(struct rcu_data *rdp); | ||
448 | static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, | ||
449 | unsigned long flags); | ||
450 | static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp); | ||
436 | #ifdef CONFIG_RCU_NOCB_CPU | 451 | #ifdef CONFIG_RCU_NOCB_CPU |
437 | static void __init rcu_organize_nocb_kthreads(void); | 452 | static void __init rcu_organize_nocb_kthreads(void); |
438 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 453 | #define rcu_nocb_lock_irqsave(rdp, flags) \ |
439 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 454 | do { \ |
440 | static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); | 455 | if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ |
456 | local_irq_save(flags); \ | ||
457 | else \ | ||
458 | raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \ | ||
459 | } while (0) | ||
460 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
461 | #define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) | ||
462 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
463 | |||
441 | static void rcu_bind_gp_kthread(void); | 464 | static void rcu_bind_gp_kthread(void); |
442 | static bool rcu_nohz_full_cpu(void); | 465 | static bool rcu_nohz_full_cpu(void); |
443 | static void rcu_dynticks_task_enter(void); | 466 | static void rcu_dynticks_task_enter(void); |
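The new rcu_nocb_lock_irqsave() wrapper only takes ->nocb_lock when the rdp's ->cblist is actually offloaded; for a non-offloaded CPU, disabling interrupts is already sufficient protection. A userspace approximation of the same pattern (a boolean plus a pthread mutex; the interrupt-disable half has no userspace analogue and is left implicit):

        #include <pthread.h>
        #include <stdbool.h>

        struct model_rdp {
                bool offloaded;              /* rcu_segcblist_is_offloaded() stand-in */
                pthread_mutex_t nocb_lock;
        };

        /* Lock only when other contexts (rcuo kthreads) can touch the list. */
        #define model_nocb_lock(rdp)                                    \
                do {                                                    \
                        if ((rdp)->offloaded)                           \
                                pthread_mutex_lock(&(rdp)->nocb_lock);  \
                } while (0)

        #define model_nocb_unlock(rdp)                                  \
                do {                                                    \
                        if ((rdp)->offloaded)                           \
                                pthread_mutex_unlock(&(rdp)->nocb_lock);\
                } while (0)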
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index af7e7b9c86af..d632cd019597 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
@@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) | |||
781 | * other hand, if the CPU is not in an RCU read-side critical section, | 781 | * other hand, if the CPU is not in an RCU read-side critical section, |
782 | * the IPI handler reports the quiescent state immediately. | 782 | * the IPI handler reports the quiescent state immediately. |
783 | * | 783 | * |
784 | * Although this is a greate improvement over previous expedited | 784 | * Although this is a great improvement over previous expedited |
785 | * implementations, it is still unfriendly to real-time workloads, so is | 785 | * implementations, it is still unfriendly to real-time workloads, so is |
786 | * thus not recommended for any sort of common-case code. In fact, if | 786 | * thus not recommended for any sort of common-case code. In fact, if |
787 | * you are using synchronize_rcu_expedited() in a loop, please restructure | 787 | * you are using synchronize_rcu_expedited() in a loop, please restructure |
@@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) | |||
792 | */ | 792 | */ |
793 | void synchronize_rcu_expedited(void) | 793 | void synchronize_rcu_expedited(void) |
794 | { | 794 | { |
795 | bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); | ||
795 | struct rcu_exp_work rew; | 796 | struct rcu_exp_work rew; |
796 | struct rcu_node *rnp; | 797 | struct rcu_node *rnp; |
797 | unsigned long s; | 798 | unsigned long s; |
@@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void) | |||
817 | return; /* Someone else did our work for us. */ | 818 | return; /* Someone else did our work for us. */ |
818 | 819 | ||
819 | /* Ensure that load happens before action based on it. */ | 820 | /* Ensure that load happens before action based on it. */ |
820 | if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { | 821 | if (unlikely(boottime)) { |
821 | /* Direct call during scheduler init and early_initcalls(). */ | 822 | /* Direct call during scheduler init and early_initcalls(). */ |
822 | rcu_exp_sel_wait_wake(s); | 823 | rcu_exp_sel_wait_wake(s); |
823 | } else { | 824 | } else { |
@@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void) | |||
835 | 836 | ||
836 | /* Let the next expedited grace period start. */ | 837 | /* Let the next expedited grace period start. */ |
837 | mutex_unlock(&rcu_state.exp_mutex); | 838 | mutex_unlock(&rcu_state.exp_mutex); |
839 | |||
840 | if (likely(!boottime)) | ||
841 | destroy_work_on_stack(&rew.rew_work); | ||
838 | } | 842 | } |
839 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 843 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
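Snapshotting the boot-time test into a local variable keeps the INIT_WORK_ONSTACK()/destroy_work_on_stack() pair balanced: the same decision that chose the workqueue path also chooses the teardown. A compact sketch of that snapshot pattern with placeholder helpers:

        #include <stdbool.h>
        #include <stdio.h>

        static bool scheduler_is_booting;    /* stand-in for RCU_SCHEDULER_INIT */

        static void do_direct_wait(void)    { puts("direct wait (early boot)"); }
        static void init_work_onstack(void) { puts("INIT_WORK_ONSTACK"); }
        static void queue_and_wait(void)    { puts("queue work and wait"); }
        static void destroy_onstack(void)   { puts("destroy_work_on_stack"); }

        /*
         * Snapshot the boot-time decision once so setup and teardown of the
         * on-stack work item stay paired even if the phase changes meanwhile.
         */
        static void synchronize_expedited_model(void)
        {
                bool boottime = scheduler_is_booting;

                if (boottime)
                        do_direct_wait();
                else {
                        init_work_onstack();
                        queue_and_wait();
                }
                if (!boottime)
                        destroy_onstack();
        }

        int main(void)
        {
                synchronize_expedited_model();
                scheduler_is_booting = true;
                synchronize_expedited_model();
                return 0;
        }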
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index acb225023ed1..2defc7fe74c3 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt) | |||
288 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); | 288 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); |
289 | struct rcu_node *rnp; | 289 | struct rcu_node *rnp; |
290 | 290 | ||
291 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | ||
292 | trace_rcu_utilization(TPS("Start context switch")); | 291 | trace_rcu_utilization(TPS("Start context switch")); |
293 | lockdep_assert_irqs_disabled(); | 292 | lockdep_assert_irqs_disabled(); |
294 | WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); | 293 | WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); |
@@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt) | |||
314 | ? rnp->gp_seq | 313 | ? rnp->gp_seq |
315 | : rcu_seq_snap(&rnp->gp_seq)); | 314 | : rcu_seq_snap(&rnp->gp_seq)); |
316 | rcu_preempt_ctxt_queue(rnp, rdp); | 315 | rcu_preempt_ctxt_queue(rnp, rdp); |
317 | } else if (t->rcu_read_lock_nesting < 0 && | ||
318 | t->rcu_read_unlock_special.s) { | ||
319 | |||
320 | /* | ||
321 | * Complete exit from RCU read-side critical section on | ||
322 | * behalf of preempted instance of __rcu_read_unlock(). | ||
323 | */ | ||
324 | rcu_read_unlock_special(t); | ||
325 | rcu_preempt_deferred_qs(t); | ||
326 | } else { | 316 | } else { |
327 | rcu_preempt_deferred_qs(t); | 317 | rcu_preempt_deferred_qs(t); |
328 | } | 318 | } |
@@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt) | |||
340 | if (rdp->exp_deferred_qs) | 330 | if (rdp->exp_deferred_qs) |
341 | rcu_report_exp_rdp(rdp); | 331 | rcu_report_exp_rdp(rdp); |
342 | trace_rcu_utilization(TPS("End context switch")); | 332 | trace_rcu_utilization(TPS("End context switch")); |
343 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | ||
344 | } | 333 | } |
345 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 334 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
346 | 335 | ||
@@ -626,22 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
626 | (rdp->grpmask & rnp->expmask) || | 615 | (rdp->grpmask & rnp->expmask) || |
627 | tick_nohz_full_cpu(rdp->cpu); | 616 | tick_nohz_full_cpu(rdp->cpu); |
628 | // Need to defer quiescent state until everything is enabled. | 617 | // Need to defer quiescent state until everything is enabled. |
629 | if ((exp || in_irq()) && irqs_were_disabled && use_softirq && | 618 | if (irqs_were_disabled && use_softirq && |
630 | (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { | 619 | (in_interrupt() || |
620 | (exp && !t->rcu_read_unlock_special.b.deferred_qs))) { | ||
631 | // Using softirq, safe to awaken, and we get | 621 | // Using softirq, safe to awaken, and we get |
632 | // no help from enabling irqs, unlike bh/preempt. | 622 | // no help from enabling irqs, unlike bh/preempt. |
633 | raise_softirq_irqoff(RCU_SOFTIRQ); | 623 | raise_softirq_irqoff(RCU_SOFTIRQ); |
634 | } else if (exp && irqs_were_disabled && !use_softirq && | ||
635 | !t->rcu_read_unlock_special.b.deferred_qs) { | ||
636 | // Safe to awaken and we get no help from enabling | ||
637 | // irqs, unlike bh/preempt. | ||
638 | invoke_rcu_core(); | ||
639 | } else { | 624 | } else { |
640 | // Enabling BH or preempt does reschedule, so... | 625 | // Enabling BH or preempt does reschedule, so... |
641 | // Also if no expediting or NO_HZ_FULL, slow is OK. | 626 | // Also if no expediting or NO_HZ_FULL, slow is OK. |
642 | set_tsk_need_resched(current); | 627 | set_tsk_need_resched(current); |
643 | set_preempt_need_resched(); | 628 | set_preempt_need_resched(); |
644 | if (IS_ENABLED(CONFIG_IRQ_WORK) && | 629 | if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && |
645 | !rdp->defer_qs_iw_pending && exp) { | 630 | !rdp->defer_qs_iw_pending && exp) { |
646 | // Get scheduler to re-evaluate and call hooks. | 631 | // Get scheduler to re-evaluate and call hooks. |
647 | // If !IRQ_WORK, FQS scan will eventually IPI. | 632 | // If !IRQ_WORK, FQS scan will eventually IPI. |
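The collapsed condition in rcu_read_unlock_special() now raises RCU_SOFTIRQ whenever interrupts were disabled, softirq is in use, and either the unlock happens in interrupt context or the grace period is expedited with no quiescent state already deferred; every other case falls back to need-resched plus optional IRQ work. The decision reduces to one boolean expression, modeled here:

        #include <stdbool.h>

        enum qs_action { RAISE_SOFTIRQ, SET_NEED_RESCHED_AND_MAYBE_IRQ_WORK };

        /* Model of the merged deferred-QS condition. */
        static enum qs_action deferred_qs_action(bool irqs_were_disabled,
                                                 bool use_softirq,
                                                 bool in_interrupt,
                                                 bool expedited,
                                                 bool already_deferred)
        {
                if (irqs_were_disabled && use_softirq &&
                    (in_interrupt || (expedited && !already_deferred)))
                        return RAISE_SOFTIRQ;
                return SET_NEED_RESCHED_AND_MAYBE_IRQ_WORK;
        }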
@@ -828,11 +813,6 @@ static void rcu_qs(void) | |||
828 | * dyntick-idle quiescent state visible to other CPUs, which will in | 813 | * dyntick-idle quiescent state visible to other CPUs, which will in |
829 | * some cases serve for expedited as well as normal grace periods. | 814 | * some cases serve for expedited as well as normal grace periods. |
830 | * Either way, register a lightweight quiescent state. | 815 | * Either way, register a lightweight quiescent state. |
831 | * | ||
832 | * The barrier() calls are redundant in the common case when this is | ||
833 | * called externally, but just in case this is called from within this | ||
834 | * file. | ||
835 | * | ||
836 | */ | 816 | */ |
837 | void rcu_all_qs(void) | 817 | void rcu_all_qs(void) |
838 | { | 818 | { |
@@ -847,14 +827,12 @@ void rcu_all_qs(void) | |||
847 | return; | 827 | return; |
848 | } | 828 | } |
849 | this_cpu_write(rcu_data.rcu_urgent_qs, false); | 829 | this_cpu_write(rcu_data.rcu_urgent_qs, false); |
850 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | ||
851 | if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { | 830 | if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { |
852 | local_irq_save(flags); | 831 | local_irq_save(flags); |
853 | rcu_momentary_dyntick_idle(); | 832 | rcu_momentary_dyntick_idle(); |
854 | local_irq_restore(flags); | 833 | local_irq_restore(flags); |
855 | } | 834 | } |
856 | rcu_qs(); | 835 | rcu_qs(); |
857 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | ||
858 | preempt_enable(); | 836 | preempt_enable(); |
859 | } | 837 | } |
860 | EXPORT_SYMBOL_GPL(rcu_all_qs); | 838 | EXPORT_SYMBOL_GPL(rcu_all_qs); |
@@ -864,7 +842,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs); | |||
864 | */ | 842 | */ |
865 | void rcu_note_context_switch(bool preempt) | 843 | void rcu_note_context_switch(bool preempt) |
866 | { | 844 | { |
867 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | ||
868 | trace_rcu_utilization(TPS("Start context switch")); | 845 | trace_rcu_utilization(TPS("Start context switch")); |
869 | rcu_qs(); | 846 | rcu_qs(); |
870 | /* Load rcu_urgent_qs before other flags. */ | 847 | /* Load rcu_urgent_qs before other flags. */ |
@@ -877,7 +854,6 @@ void rcu_note_context_switch(bool preempt) | |||
877 | rcu_tasks_qs(current); | 854 | rcu_tasks_qs(current); |
878 | out: | 855 | out: |
879 | trace_rcu_utilization(TPS("End context switch")); | 856 | trace_rcu_utilization(TPS("End context switch")); |
880 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | ||
881 | } | 857 | } |
882 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 858 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
883 | 859 | ||
@@ -1134,7 +1110,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
1134 | * already exist. We only create this kthread for preemptible RCU. | 1110 | * already exist. We only create this kthread for preemptible RCU. |
1135 | * Returns zero if all is well, a negated errno otherwise. | 1111 | * Returns zero if all is well, a negated errno otherwise. |
1136 | */ | 1112 | */ |
1137 | static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) | 1113 | static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) |
1138 | { | 1114 | { |
1139 | int rnp_index = rnp - rcu_get_root(); | 1115 | int rnp_index = rnp - rcu_get_root(); |
1140 | unsigned long flags; | 1116 | unsigned long flags; |
@@ -1142,25 +1118,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) | |||
1142 | struct task_struct *t; | 1118 | struct task_struct *t; |
1143 | 1119 | ||
1144 | if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) | 1120 | if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) |
1145 | return 0; | 1121 | return; |
1146 | 1122 | ||
1147 | if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) | 1123 | if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) |
1148 | return 0; | 1124 | return; |
1149 | 1125 | ||
1150 | rcu_state.boost = 1; | 1126 | rcu_state.boost = 1; |
1127 | |||
1151 | if (rnp->boost_kthread_task != NULL) | 1128 | if (rnp->boost_kthread_task != NULL) |
1152 | return 0; | 1129 | return; |
1130 | |||
1153 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | 1131 | t = kthread_create(rcu_boost_kthread, (void *)rnp, |
1154 | "rcub/%d", rnp_index); | 1132 | "rcub/%d", rnp_index); |
1155 | if (IS_ERR(t)) | 1133 | if (WARN_ON_ONCE(IS_ERR(t))) |
1156 | return PTR_ERR(t); | 1134 | return; |
1135 | |||
1157 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 1136 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
1158 | rnp->boost_kthread_task = t; | 1137 | rnp->boost_kthread_task = t; |
1159 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1138 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
1160 | sp.sched_priority = kthread_prio; | 1139 | sp.sched_priority = kthread_prio; |
1161 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | 1140 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); |
1162 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | 1141 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ |
1163 | return 0; | ||
1164 | } | 1142 | } |
1165 | 1143 | ||
1166 | /* | 1144 | /* |
@@ -1201,7 +1179,7 @@ static void __init rcu_spawn_boost_kthreads(void) | |||
1201 | struct rcu_node *rnp; | 1179 | struct rcu_node *rnp; |
1202 | 1180 | ||
1203 | rcu_for_each_leaf_node(rnp) | 1181 | rcu_for_each_leaf_node(rnp) |
1204 | (void)rcu_spawn_one_boost_kthread(rnp); | 1182 | rcu_spawn_one_boost_kthread(rnp); |
1205 | } | 1183 | } |
1206 | 1184 | ||
1207 | static void rcu_prepare_kthreads(int cpu) | 1185 | static void rcu_prepare_kthreads(int cpu) |
@@ -1211,7 +1189,7 @@ static void rcu_prepare_kthreads(int cpu) | |||
1211 | 1189 | ||
1212 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | 1190 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ |
1213 | if (rcu_scheduler_fully_active) | 1191 | if (rcu_scheduler_fully_active) |
1214 | (void)rcu_spawn_one_boost_kthread(rnp); | 1192 | rcu_spawn_one_boost_kthread(rnp); |
1215 | } | 1193 | } |
1216 | 1194 | ||
1217 | #else /* #ifdef CONFIG_RCU_BOOST */ | 1195 | #else /* #ifdef CONFIG_RCU_BOOST */ |
@@ -1248,10 +1226,10 @@ static void rcu_prepare_kthreads(int cpu) | |||
1248 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | 1226 | #if !defined(CONFIG_RCU_FAST_NO_HZ) |
1249 | 1227 | ||
1250 | /* | 1228 | /* |
1251 | * Check to see if any future RCU-related work will need to be done | 1229 | * Check to see if any future non-offloaded RCU-related work will need |
1252 | * by the current CPU, even if none need be done immediately, returning | 1230 | * to be done by the current CPU, even if none need be done immediately, |
1253 | * 1 if so. This function is part of the RCU implementation; it is -not- | 1231 | * returning 1 if so. This function is part of the RCU implementation; |
1254 | * an exported member of the RCU API. | 1232 | * it is -not- an exported member of the RCU API. |
1255 | * | 1233 | * |
1256 | * Because we do not have RCU_FAST_NO_HZ, just check whether or not this | 1234 | * Because we do not have RCU_FAST_NO_HZ, just check whether or not this |
1257 | * CPU has RCU callbacks queued. | 1235 | * CPU has RCU callbacks queued. |
@@ -1259,7 +1237,8 @@ static void rcu_prepare_kthreads(int cpu) | |||
1259 | int rcu_needs_cpu(u64 basemono, u64 *nextevt) | 1237 | int rcu_needs_cpu(u64 basemono, u64 *nextevt) |
1260 | { | 1238 | { |
1261 | *nextevt = KTIME_MAX; | 1239 | *nextevt = KTIME_MAX; |
1262 | return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); | 1240 | return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && |
1241 | !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist); | ||
1263 | } | 1242 | } |
1264 | 1243 | ||
1265 | /* | 1244 | /* |
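For the !CONFIG_RCU_FAST_NO_HZ case, rcu_needs_cpu() now reports that the tick is needed only when this CPU has callbacks that are not offloaded to an rcuo kthread. The whole check is one expression, shown as a standalone model:

        #include <stdbool.h>

        /*
         * Model of the !CONFIG_RCU_FAST_NO_HZ rcu_needs_cpu(): keep the tick
         * only if this CPU itself still has callbacks to drive.
         */
        static int needs_cpu(bool cblist_empty, bool cblist_offloaded)
        {
                return !cblist_empty && !cblist_offloaded;
        }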
@@ -1360,8 +1339,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) | |||
1360 | 1339 | ||
1361 | lockdep_assert_irqs_disabled(); | 1340 | lockdep_assert_irqs_disabled(); |
1362 | 1341 | ||
1363 | /* If no callbacks, RCU doesn't need the CPU. */ | 1342 | /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ |
1364 | if (rcu_segcblist_empty(&rdp->cblist)) { | 1343 | if (rcu_segcblist_empty(&rdp->cblist) || |
1344 | rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) { | ||
1365 | *nextevt = KTIME_MAX; | 1345 | *nextevt = KTIME_MAX; |
1366 | return 0; | 1346 | return 0; |
1367 | } | 1347 | } |
@@ -1404,7 +1384,7 @@ static void rcu_prepare_for_idle(void) | |||
1404 | int tne; | 1384 | int tne; |
1405 | 1385 | ||
1406 | lockdep_assert_irqs_disabled(); | 1386 | lockdep_assert_irqs_disabled(); |
1407 | if (rcu_is_nocb_cpu(smp_processor_id())) | 1387 | if (rcu_segcblist_is_offloaded(&rdp->cblist)) |
1408 | return; | 1388 | return; |
1409 | 1389 | ||
1410 | /* Handle nohz enablement switches conservatively. */ | 1390 | /* Handle nohz enablement switches conservatively. */ |
@@ -1453,8 +1433,10 @@ static void rcu_prepare_for_idle(void) | |||
1453 | */ | 1433 | */ |
1454 | static void rcu_cleanup_after_idle(void) | 1434 | static void rcu_cleanup_after_idle(void) |
1455 | { | 1435 | { |
1436 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); | ||
1437 | |||
1456 | lockdep_assert_irqs_disabled(); | 1438 | lockdep_assert_irqs_disabled(); |
1457 | if (rcu_is_nocb_cpu(smp_processor_id())) | 1439 | if (rcu_segcblist_is_offloaded(&rdp->cblist)) |
1458 | return; | 1440 | return; |
1459 | if (rcu_try_advance_all_cbs()) | 1441 | if (rcu_try_advance_all_cbs()) |
1460 | invoke_rcu_core(); | 1442 | invoke_rcu_core(); |
@@ -1469,10 +1451,10 @@ static void rcu_cleanup_after_idle(void) | |||
1469 | * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads | 1451 | * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads |
1470 | * created that pull the callbacks from the corresponding CPU, wait for | 1452 | * created that pull the callbacks from the corresponding CPU, wait for |
1471 | * a grace period to elapse, and invoke the callbacks. These kthreads | 1453 | * a grace period to elapse, and invoke the callbacks. These kthreads |
1472 | * are organized into leaders, which manage incoming callbacks, wait for | 1454 | * are organized into GP kthreads, which manage incoming callbacks, wait for |
1473 | * grace periods, and awaken followers, and the followers, which only | 1455 | * grace periods, and awaken CB kthreads, and the CB kthreads, which only |
1474 | * invoke callbacks. Each leader is its own follower. The no-CBs CPUs | 1456 | * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs |
1475 | * do a wake_up() on their kthread when they insert a callback into any | 1457 | * do a wake_up() on their GP kthread when they insert a callback into any |
1476 | * empty list, unless the rcu_nocb_poll boot parameter has been specified, | 1458 | * empty list, unless the rcu_nocb_poll boot parameter has been specified, |
1477 | * in which case each kthread actively polls its CPU. (Which isn't so great | 1459 | * in which case each kthread actively polls its CPU. (Which isn't so great |
1478 | * for energy efficiency, but which does reduce RCU's overhead on that CPU.) | 1460 | * for energy efficiency, but which does reduce RCU's overhead on that CPU.) |
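The renamed roles (GP kthreads in place of leaders, CB kthreads in place of followers) still form a chain: each GP kthread walks the rcu_data structures linked through ->nocb_next_cb_rdp and wakes the CB kthreads that have work, much as nocb_gp_wait() does further down. A toy model of that scan, with simplified structures:

        #include <stdio.h>

        /* One per-CPU structure in a no-CBs group; the first is the GP rdp. */
        struct model_rdp {
                int cpu;
                int pending_cbs;
                struct model_rdp *next_cb_rdp;   /* nocb_next_cb_rdp stand-in */
        };

        /* The GP kthread scans every rdp it is responsible for and decides
         * which CB kthreads need waking, as in nocb_gp_wait(). */
        static void gp_scan(struct model_rdp *gp_rdp)
        {
                for (struct model_rdp *rdp = gp_rdp; rdp; rdp = rdp->next_cb_rdp)
                        if (rdp->pending_cbs)
                                printf("wake CB kthread for cpu %d\n", rdp->cpu);
        }

        int main(void)
        {
                struct model_rdp c2 = { .cpu = 2, .pending_cbs = 5, .next_cb_rdp = NULL };
                struct model_rdp c1 = { .cpu = 1, .pending_cbs = 0, .next_cb_rdp = &c2 };
                struct model_rdp c0 = { .cpu = 0, .pending_cbs = 1, .next_cb_rdp = &c1 };

                gp_scan(&c0);
                return 0;
        }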
@@ -1515,6 +1497,116 @@ static int __init parse_rcu_nocb_poll(char *arg) | |||
1515 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 1497 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
1516 | 1498 | ||
1517 | /* | 1499 | /* |
1500 | * Don't bother bypassing ->cblist if the call_rcu() rate is low. | ||
1501 | * After all, the main point of bypassing is to avoid lock contention | ||
1502 | * on ->nocb_lock, which only can happen at high call_rcu() rates. | ||
1503 | */ | ||
1504 | int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; | ||
1505 | module_param(nocb_nobypass_lim_per_jiffy, int, 0); | ||
1506 | |||
1507 | /* | ||
1508 | * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the | ||
1509 | * lock isn't immediately available, increment ->nocb_lock_contended to | ||
1510 | * flag the contention. | ||
1511 | */ | ||
1512 | static void rcu_nocb_bypass_lock(struct rcu_data *rdp) | ||
1513 | { | ||
1514 | lockdep_assert_irqs_disabled(); | ||
1515 | if (raw_spin_trylock(&rdp->nocb_bypass_lock)) | ||
1516 | return; | ||
1517 | atomic_inc(&rdp->nocb_lock_contended); | ||
1518 | WARN_ON_ONCE(smp_processor_id() != rdp->cpu); | ||
1519 | smp_mb__after_atomic(); /* atomic_inc() before lock. */ | ||
1520 | raw_spin_lock(&rdp->nocb_bypass_lock); | ||
1521 | smp_mb__before_atomic(); /* atomic_dec() after lock. */ | ||
1522 | atomic_dec(&rdp->nocb_lock_contended); | ||
1523 | } | ||
1524 | |||
1525 | /* | ||
1526 | * Spinwait until the specified rcu_data structure's ->nocb_lock is | ||
1527 | * not contended. Please note that this is extremely special-purpose, | ||
1528 | * relying on the fact that at most two kthreads and one CPU contend for | ||
1529 | * this lock, and also that the two kthreads are guaranteed to have frequent | ||
1530 | * grace-period-duration time intervals between successive acquisitions | ||
1531 | * of the lock. This allows us to use an extremely simple throttling | ||
1532 | * mechanism, and further to apply it only to the CPU doing floods of | ||
1533 | * call_rcu() invocations. Don't try this at home! | ||
1534 | */ | ||
1535 | static void rcu_nocb_wait_contended(struct rcu_data *rdp) | ||
1536 | { | ||
1537 | WARN_ON_ONCE(smp_processor_id() != rdp->cpu); | ||
1538 | while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) | ||
1539 | cpu_relax(); | ||
1540 | } | ||
1541 | |||
1542 | /* | ||
1543 | * Conditionally acquire the specified rcu_data structure's | ||
1544 | * ->nocb_bypass_lock. | ||
1545 | */ | ||
1546 | static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) | ||
1547 | { | ||
1548 | lockdep_assert_irqs_disabled(); | ||
1549 | return raw_spin_trylock(&rdp->nocb_bypass_lock); | ||
1550 | } | ||
1551 | |||
1552 | /* | ||
1553 | * Release the specified rcu_data structure's ->nocb_bypass_lock. | ||
1554 | */ | ||
1555 | static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) | ||
1556 | { | ||
1557 | lockdep_assert_irqs_disabled(); | ||
1558 | raw_spin_unlock(&rdp->nocb_bypass_lock); | ||
1559 | } | ||
1560 | |||
1561 | /* | ||
1562 | * Acquire the specified rcu_data structure's ->nocb_lock, but only | ||
1563 | * if it corresponds to a no-CBs CPU. | ||
1564 | */ | ||
1565 | static void rcu_nocb_lock(struct rcu_data *rdp) | ||
1566 | { | ||
1567 | lockdep_assert_irqs_disabled(); | ||
1568 | if (!rcu_segcblist_is_offloaded(&rdp->cblist)) | ||
1569 | return; | ||
1570 | raw_spin_lock(&rdp->nocb_lock); | ||
1571 | } | ||
1572 | |||
1573 | /* | ||
1574 | * Release the specified rcu_data structure's ->nocb_lock, but only | ||
1575 | * if it corresponds to a no-CBs CPU. | ||
1576 | */ | ||
1577 | static void rcu_nocb_unlock(struct rcu_data *rdp) | ||
1578 | { | ||
1579 | if (rcu_segcblist_is_offloaded(&rdp->cblist)) { | ||
1580 | lockdep_assert_irqs_disabled(); | ||
1581 | raw_spin_unlock(&rdp->nocb_lock); | ||
1582 | } | ||
1583 | } | ||
1584 | |||
1585 | /* | ||
1586 | * Release the specified rcu_data structure's ->nocb_lock and restore | ||
1587 | * interrupts, but only if it corresponds to a no-CBs CPU. | ||
1588 | */ | ||
1589 | static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, | ||
1590 | unsigned long flags) | ||
1591 | { | ||
1592 | if (rcu_segcblist_is_offloaded(&rdp->cblist)) { | ||
1593 | lockdep_assert_irqs_disabled(); | ||
1594 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | ||
1595 | } else { | ||
1596 | local_irq_restore(flags); | ||
1597 | } | ||
1598 | } | ||
1599 | |||
1600 | /* Lockdep check that ->cblist may be safely accessed. */ | ||
1601 | static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) | ||
1602 | { | ||
1603 | lockdep_assert_irqs_disabled(); | ||
1604 | if (rcu_segcblist_is_offloaded(&rdp->cblist) && | ||
1605 | cpu_online(rdp->cpu)) | ||
1606 | lockdep_assert_held(&rdp->nocb_lock); | ||
1607 | } | ||
1608 | |||
1609 | /* | ||
1518 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | 1610 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended |
1519 | * grace period. | 1611 | * grace period. |
1520 | */ | 1612 | */ |
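rcu_nocb_bypass_lock() tries the lock first and, on failure, bumps ->nocb_lock_contended before blocking, while rcu_nocb_wait_contended() lets the call_rcu()-flooding CPU spin until that counter drains. A minimal userspace analogue of the trylock-and-flag pattern, assuming a pthread mutex and a C11 atomic in place of the raw spinlock and kernel atomics:

        #include <pthread.h>
        #include <stdatomic.h>

        struct bypass {
                pthread_mutex_t lock;
                atomic_int contended;   /* nocb_lock_contended stand-in */
        };

        /* Trylock first; on failure flag the contention so the flooding CPU
         * can throttle itself, then fall back to a blocking acquisition. */
        static void bypass_lock(struct bypass *b)
        {
                if (pthread_mutex_trylock(&b->lock) == 0)
                        return;
                atomic_fetch_add(&b->contended, 1);
                pthread_mutex_lock(&b->lock);
                atomic_fetch_sub(&b->contended, 1);
        }

        /* The call_rcu()-flooding CPU spins here until contention clears,
         * mirroring rcu_nocb_wait_contended(). */
        static void wait_uncontended(struct bypass *b)
        {
                while (atomic_load(&b->contended))
                        ;  /* cpu_relax() stand-in */
        }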
@@ -1543,440 +1635,514 @@ bool rcu_is_nocb_cpu(int cpu) | |||
1543 | } | 1635 | } |
1544 | 1636 | ||
1545 | /* | 1637 | /* |
1546 | * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock | 1638 | * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock |
1547 | * and this function releases it. | 1639 | * and this function releases it. |
1548 | */ | 1640 | */ |
1549 | static void __wake_nocb_leader(struct rcu_data *rdp, bool force, | 1641 | static void wake_nocb_gp(struct rcu_data *rdp, bool force, |
1550 | unsigned long flags) | 1642 | unsigned long flags) |
1551 | __releases(rdp->nocb_lock) | 1643 | __releases(rdp->nocb_lock) |
1552 | { | 1644 | { |
1553 | struct rcu_data *rdp_leader = rdp->nocb_leader; | 1645 | bool needwake = false; |
1646 | struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; | ||
1554 | 1647 | ||
1555 | lockdep_assert_held(&rdp->nocb_lock); | 1648 | lockdep_assert_held(&rdp->nocb_lock); |
1556 | if (!READ_ONCE(rdp_leader->nocb_kthread)) { | 1649 | if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { |
1557 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | 1650 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, |
1651 | TPS("AlreadyAwake")); | ||
1652 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1558 | return; | 1653 | return; |
1559 | } | 1654 | } |
1560 | if (rdp_leader->nocb_leader_sleep || force) { | 1655 | del_timer(&rdp->nocb_timer); |
1561 | /* Prior smp_mb__after_atomic() orders against prior enqueue. */ | 1656 | rcu_nocb_unlock_irqrestore(rdp, flags); |
1562 | WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); | 1657 | raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); |
1563 | del_timer(&rdp->nocb_timer); | 1658 | if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { |
1564 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | 1659 | WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); |
1565 | smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ | 1660 | needwake = true; |
1566 | swake_up_one(&rdp_leader->nocb_wq); | 1661 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); |
1567 | } else { | ||
1568 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | ||
1569 | } | 1662 | } |
1663 | raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); | ||
1664 | if (needwake) | ||
1665 | wake_up_process(rdp_gp->nocb_gp_kthread); | ||
1570 | } | 1666 | } |
1571 | 1667 | ||
1572 | /* | 1668 | /* |
1573 | * Kick the leader kthread for this NOCB group, but caller has not | 1669 | * Arrange to wake the GP kthread for this NOCB group at some future |
1574 | * acquired locks. | 1670 | * time when it is safe to do so. |
1575 | */ | 1671 | */ |
1576 | static void wake_nocb_leader(struct rcu_data *rdp, bool force) | 1672 | static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, |
1673 | const char *reason) | ||
1577 | { | 1674 | { |
1578 | unsigned long flags; | 1675 | if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) |
1676 | mod_timer(&rdp->nocb_timer, jiffies + 1); | ||
1677 | if (rdp->nocb_defer_wakeup < waketype) | ||
1678 | WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); | ||
1679 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); | ||
1680 | } | ||
1681 | |||
1682 | /* | ||
1683 | * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. | ||
1684 | * However, if there is a callback to be enqueued and if ->nocb_bypass | ||
1685 | * proves to be initially empty, just return false because the no-CB GP | ||
1686 | * kthread may need to be awakened in this case. | ||
1687 | * | ||
1688 | * Note that this function always returns true if rhp is NULL. | ||
1689 | */ | ||
1690 | static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, | ||
1691 | unsigned long j) | ||
1692 | { | ||
1693 | struct rcu_cblist rcl; | ||
1579 | 1694 | ||
1580 | raw_spin_lock_irqsave(&rdp->nocb_lock, flags); | 1695 | WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist)); |
1581 | __wake_nocb_leader(rdp, force, flags); | 1696 | rcu_lockdep_assert_cblist_protected(rdp); |
1697 | lockdep_assert_held(&rdp->nocb_bypass_lock); | ||
1698 | if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { | ||
1699 | raw_spin_unlock(&rdp->nocb_bypass_lock); | ||
1700 | return false; | ||
1701 | } | ||
1702 | /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ | ||
1703 | if (rhp) | ||
1704 | rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ | ||
1705 | rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); | ||
1706 | rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); | ||
1707 | WRITE_ONCE(rdp->nocb_bypass_first, j); | ||
1708 | rcu_nocb_bypass_unlock(rdp); | ||
1709 | return true; | ||
1582 | } | 1710 | } |
1583 | 1711 | ||
1584 | /* | 1712 | /* |
1585 | * Arrange to wake the leader kthread for this NOCB group at some | 1713 | * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. |
1586 | * future time when it is safe to do so. | 1714 | * However, if there is a callback to be enqueued and if ->nocb_bypass |
1715 | * proves to be initially empty, just return false because the no-CB GP | ||
1716 | * kthread may need to be awakened in this case. | ||
1717 | * | ||
1718 | * Note that this function always returns true if rhp is NULL. | ||
1587 | */ | 1719 | */ |
1588 | static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, | 1720 | static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, |
1589 | const char *reason) | 1721 | unsigned long j) |
1590 | { | 1722 | { |
1591 | unsigned long flags; | 1723 | if (!rcu_segcblist_is_offloaded(&rdp->cblist)) |
1724 | return true; | ||
1725 | rcu_lockdep_assert_cblist_protected(rdp); | ||
1726 | rcu_nocb_bypass_lock(rdp); | ||
1727 | return rcu_nocb_do_flush_bypass(rdp, rhp, j); | ||
1728 | } | ||
1592 | 1729 | ||
1593 | raw_spin_lock_irqsave(&rdp->nocb_lock, flags); | 1730 | /* |
1594 | if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) | 1731 | * If the ->nocb_bypass_lock is immediately available, flush the |
1595 | mod_timer(&rdp->nocb_timer, jiffies + 1); | 1732 | * ->nocb_bypass queue into ->cblist. |
1596 | WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); | 1733 | */ |
1597 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); | 1734 | static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) |
1598 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | 1735 | { |
1736 | rcu_lockdep_assert_cblist_protected(rdp); | ||
1737 | if (!rcu_segcblist_is_offloaded(&rdp->cblist) || | ||
1738 | !rcu_nocb_bypass_trylock(rdp)) | ||
1739 | return; | ||
1740 | WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); | ||
1599 | } | 1741 | } |
1600 | 1742 | ||
1601 | /* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ | 1743 | /* |
1602 | static bool rcu_nocb_cpu_needs_barrier(int cpu) | 1744 | * See whether it is appropriate to use the ->nocb_bypass list in order |
1745 | * to control contention on ->nocb_lock. A limited number of direct | ||
1746 | * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass | ||
1747 | * is non-empty, further callbacks must be placed into ->nocb_bypass, | ||
1748 | * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch | ||
1749 | * back to direct use of ->cblist. However, ->nocb_bypass should not be | ||
1750 | * used if ->cblist is empty, because otherwise callbacks can be stranded | ||
1751 | * on ->nocb_bypass because we cannot count on the current CPU ever again | ||
1752 | * invoking call_rcu(). The general rule is that if ->nocb_bypass is | ||
1753 | * non-empty, the corresponding no-CBs grace-period kthread must not be | ||
1754 | * in an indefinite sleep state. | ||
1755 | * | ||
1756 | * Finally, it is not permitted to use the bypass during early boot, | ||
1757 | * as doing so would confuse the auto-initialization code. Besides | ||
1758 | * which, there is no point in worrying about lock contention while | ||
1759 | * there is only one CPU in operation. | ||
1760 | */ | ||
1761 | static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, | ||
1762 | bool *was_alldone, unsigned long flags) | ||
1603 | { | 1763 | { |
1604 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); | 1764 | unsigned long c; |
1605 | unsigned long ret; | 1765 | unsigned long cur_gp_seq; |
1606 | #ifdef CONFIG_PROVE_RCU | 1766 | unsigned long j = jiffies; |
1607 | struct rcu_head *rhp; | 1767 | long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); |
1608 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
1609 | 1768 | ||
1610 | /* | 1769 | if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { |
1611 | * Check count of all no-CBs callbacks awaiting invocation. | 1770 | *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); |
1612 | * There needs to be a barrier before this function is called, | 1771 | return false; /* Not offloaded, no bypassing. */ |
1613 | * but associated with a prior determination that no more | 1772 | } |
1614 | * callbacks would be posted. In the worst case, the first | 1773 | lockdep_assert_irqs_disabled(); |
1615 | * barrier in rcu_barrier() suffices (but the caller cannot | 1774 | |
1616 | * necessarily rely on this, not a substitute for the caller | 1775 | // Don't use ->nocb_bypass during early boot. |
1617 | * getting the concurrency design right!). There must also be a | 1776 | if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { |
1618 | * barrier between the following load and posting of a callback | 1777 | rcu_nocb_lock(rdp); |
1619 | * (if a callback is in fact needed). This is associated with an | 1778 | WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); |
1620 | * atomic_inc() in the caller. | 1779 | *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); |
1621 | */ | 1780 | return false; |
1622 | ret = rcu_get_n_cbs_nocb_cpu(rdp); | 1781 | } |
1623 | 1782 | ||
1624 | #ifdef CONFIG_PROVE_RCU | 1783 | // If we have advanced to a new jiffy, reset counts to allow |
1625 | rhp = READ_ONCE(rdp->nocb_head); | 1784 | // moving back from ->nocb_bypass to ->cblist. |
1626 | if (!rhp) | 1785 | if (j == rdp->nocb_nobypass_last) { |
1627 | rhp = READ_ONCE(rdp->nocb_gp_head); | 1786 | c = rdp->nocb_nobypass_count + 1; |
1628 | if (!rhp) | 1787 | } else { |
1629 | rhp = READ_ONCE(rdp->nocb_follower_head); | 1788 | WRITE_ONCE(rdp->nocb_nobypass_last, j); |
1630 | 1789 | c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; | |
1631 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | 1790 | if (ULONG_CMP_LT(rdp->nocb_nobypass_count, |
1632 | if (!READ_ONCE(rdp->nocb_kthread) && rhp && | 1791 | nocb_nobypass_lim_per_jiffy)) |
1633 | rcu_scheduler_fully_active) { | 1792 | c = 0; |
1634 | /* RCU callback enqueued before CPU first came online??? */ | 1793 | else if (c > nocb_nobypass_lim_per_jiffy) |
1635 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | 1794 | c = nocb_nobypass_lim_per_jiffy; |
1636 | cpu, rhp->func); | 1795 | } |
1637 | WARN_ON_ONCE(1); | 1796 | WRITE_ONCE(rdp->nocb_nobypass_count, c); |
1797 | |||
1798 | // If there hasn't yet been all that many ->cblist enqueues | ||
1799 | // this jiffy, tell the caller to enqueue onto ->cblist. But flush | ||
1800 | // ->nocb_bypass first. | ||
1801 | if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) { | ||
1802 | rcu_nocb_lock(rdp); | ||
1803 | *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); | ||
1804 | if (*was_alldone) | ||
1805 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
1806 | TPS("FirstQ")); | ||
1807 | WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j)); | ||
1808 | WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); | ||
1809 | return false; // Caller must enqueue the callback. | ||
1810 | } | ||
1811 | |||
1812 | // If ->nocb_bypass has been used too long or is too full, | ||
1813 | // flush ->nocb_bypass to ->cblist. | ||
1814 | if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) || | ||
1815 | ncbs >= qhimark) { | ||
1816 | rcu_nocb_lock(rdp); | ||
1817 | if (!rcu_nocb_flush_bypass(rdp, rhp, j)) { | ||
1818 | *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); | ||
1819 | if (*was_alldone) | ||
1820 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
1821 | TPS("FirstQ")); | ||
1822 | WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); | ||
1823 | return false; // Caller must enqueue the callback. | ||
1824 | } | ||
1825 | if (j != rdp->nocb_gp_adv_time && | ||
1826 | rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && | ||
1827 | rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { | ||
1828 | rcu_advance_cbs_nowake(rdp->mynode, rdp); | ||
1829 | rdp->nocb_gp_adv_time = j; | ||
1830 | } | ||
1831 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1832 | return true; // Callback already enqueued. | ||
1638 | } | 1833 | } |
1639 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
1640 | 1834 | ||
1641 | return !!ret; | 1835 | // We need to use the bypass. |
1836 | rcu_nocb_wait_contended(rdp); | ||
1837 | rcu_nocb_bypass_lock(rdp); | ||
1838 | ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); | ||
1839 | rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ | ||
1840 | rcu_cblist_enqueue(&rdp->nocb_bypass, rhp); | ||
1841 | if (!ncbs) { | ||
1842 | WRITE_ONCE(rdp->nocb_bypass_first, j); | ||
1843 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ")); | ||
1844 | } | ||
1845 | rcu_nocb_bypass_unlock(rdp); | ||
1846 | smp_mb(); /* Order enqueue before wake. */ | ||
1847 | if (ncbs) { | ||
1848 | local_irq_restore(flags); | ||
1849 | } else { | ||
1850 | // No-CBs GP kthread might be indefinitely asleep, if so, wake. | ||
1851 | rcu_nocb_lock(rdp); // Rare during call_rcu() flood. | ||
1852 | if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { | ||
1853 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
1854 | TPS("FirstBQwake")); | ||
1855 | __call_rcu_nocb_wake(rdp, true, flags); | ||
1856 | } else { | ||
1857 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
1858 | TPS("FirstBQnoWake")); | ||
1859 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1860 | } | ||
1861 | } | ||
1862 | return true; // Callback already enqueued. | ||
1642 | } | 1863 | } |
1643 | 1864 | ||
1644 | /* | 1865 | /* |
1645 | * Enqueue the specified string of rcu_head structures onto the specified | 1866 | * Awaken the no-CBs grace-period kthread if needed, either due to it |
1646 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | 1867 | * legitimately being asleep or due to overload conditions. |
1647 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | ||
1648 | * counts are supplied by rhcount and rhcount_lazy. | ||
1649 | * | 1868 | * |
1650 | * If warranted, also wake up the kthread servicing this CPUs queues. | 1869 | * If warranted, also wake up the kthread servicing this CPUs queues. |
1651 | */ | 1870 | */ |
1652 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | 1871 | static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, |
1653 | struct rcu_head *rhp, | 1872 | unsigned long flags) |
1654 | struct rcu_head **rhtp, | 1873 | __releases(rdp->nocb_lock) |
1655 | int rhcount, int rhcount_lazy, | ||
1656 | unsigned long flags) | ||
1657 | { | 1874 | { |
1658 | int len; | 1875 | unsigned long cur_gp_seq; |
1659 | struct rcu_head **old_rhpp; | 1876 | unsigned long j; |
1877 | long len; | ||
1660 | struct task_struct *t; | 1878 | struct task_struct *t; |
1661 | 1879 | ||
1662 | /* Enqueue the callback on the nocb list and update counts. */ | 1880 | // If we are being polled or there is no kthread, just leave. |
1663 | atomic_long_add(rhcount, &rdp->nocb_q_count); | 1881 | t = READ_ONCE(rdp->nocb_gp_kthread); |
1664 | /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ | ||
1665 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | ||
1666 | WRITE_ONCE(*old_rhpp, rhp); | ||
1667 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | ||
1668 | smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ | ||
1669 | |||
1670 | /* If we are not being polled and there is a kthread, awaken it ... */ | ||
1671 | t = READ_ONCE(rdp->nocb_kthread); | ||
1672 | if (rcu_nocb_poll || !t) { | 1882 | if (rcu_nocb_poll || !t) { |
1673 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | 1883 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, |
1674 | TPS("WakeNotPoll")); | 1884 | TPS("WakeNotPoll")); |
1885 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1675 | return; | 1886 | return; |
1676 | } | 1887 | } |
1677 | len = rcu_get_n_cbs_nocb_cpu(rdp); | 1888 | // Need to actually do a wakeup. |
1678 | if (old_rhpp == &rdp->nocb_head) { | 1889 | len = rcu_segcblist_n_cbs(&rdp->cblist); |
1890 | if (was_alldone) { | ||
1891 | rdp->qlen_last_fqs_check = len; | ||
1679 | if (!irqs_disabled_flags(flags)) { | 1892 | if (!irqs_disabled_flags(flags)) { |
1680 | /* ... if queue was empty ... */ | 1893 | /* ... if queue was empty ... */ |
1681 | wake_nocb_leader(rdp, false); | 1894 | wake_nocb_gp(rdp, false, flags); |
1682 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | 1895 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, |
1683 | TPS("WakeEmpty")); | 1896 | TPS("WakeEmpty")); |
1684 | } else { | 1897 | } else { |
1685 | wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, | 1898 | wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, |
1686 | TPS("WakeEmptyIsDeferred")); | 1899 | TPS("WakeEmptyIsDeferred")); |
1900 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1687 | } | 1901 | } |
1688 | rdp->qlen_last_fqs_check = 0; | ||
1689 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | 1902 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { |
1690 | /* ... or if many callbacks queued. */ | 1903 | /* ... or if many callbacks queued. */ |
1691 | if (!irqs_disabled_flags(flags)) { | 1904 | rdp->qlen_last_fqs_check = len; |
1692 | wake_nocb_leader(rdp, true); | 1905 | j = jiffies; |
1693 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | 1906 | if (j != rdp->nocb_gp_adv_time && |
1694 | TPS("WakeOvf")); | 1907 | rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && |
1695 | } else { | 1908 | rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { |
1696 | wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, | 1909 | rcu_advance_cbs_nowake(rdp->mynode, rdp); |
1697 | TPS("WakeOvfIsDeferred")); | 1910 | rdp->nocb_gp_adv_time = j; |
1698 | } | 1911 | } |
1699 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | 1912 | smp_mb(); /* Enqueue before timer_pending(). */ |
1913 | if ((rdp->nocb_cb_sleep || | ||
1914 | !rcu_segcblist_ready_cbs(&rdp->cblist)) && | ||
1915 | !timer_pending(&rdp->nocb_bypass_timer)) | ||
1916 | wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, | ||
1917 | TPS("WakeOvfIsDeferred")); | ||
1918 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1700 | } else { | 1919 | } else { |
1701 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); | 1920 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); |
1921 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1702 | } | 1922 | } |
1703 | return; | 1923 | return; |
1704 | } | 1924 | } |
1705 | 1925 | ||
1706 | /* | 1926 | /* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */ |
1707 | * This is a helper for __call_rcu(), which invokes this when the normal | 1927 | static void do_nocb_bypass_wakeup_timer(struct timer_list *t) |
1708 | * callback queue is inoperable. If this is not a no-CBs CPU, this | ||
1709 | * function returns failure back to __call_rcu(), which can complain | ||
1710 | * appropriately. | ||
1711 | * | ||
1712 | * Otherwise, this function queues the callback where the corresponding | ||
1713 | * "rcuo" kthread can find it. | ||
1714 | */ | ||
1715 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
1716 | bool lazy, unsigned long flags) | ||
1717 | { | 1928 | { |
1929 | unsigned long flags; | ||
1930 | struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer); | ||
1718 | 1931 | ||
1719 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 1932 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); |
1720 | return false; | 1933 | rcu_nocb_lock_irqsave(rdp, flags); |
1721 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); | 1934 | smp_mb__after_spinlock(); /* Timer expire before wakeup. */ |
1722 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 1935 | __call_rcu_nocb_wake(rdp, true, flags); |
1723 | trace_rcu_kfree_callback(rcu_state.name, rhp, | ||
1724 | (unsigned long)rhp->func, | ||
1725 | -atomic_long_read(&rdp->nocb_q_count_lazy), | ||
1726 | -rcu_get_n_cbs_nocb_cpu(rdp)); | ||
1727 | else | ||
1728 | trace_rcu_callback(rcu_state.name, rhp, | ||
1729 | -atomic_long_read(&rdp->nocb_q_count_lazy), | ||
1730 | -rcu_get_n_cbs_nocb_cpu(rdp)); | ||
1731 | |||
1732 | /* | ||
1733 | * If called from an extended quiescent state with interrupts | ||
1734 | * disabled, invoke the RCU core in order to allow the idle-entry | ||
1735 | * deferred-wakeup check to function. | ||
1736 | */ | ||
1737 | if (irqs_disabled_flags(flags) && | ||
1738 | !rcu_is_watching() && | ||
1739 | cpu_online(smp_processor_id())) | ||
1740 | invoke_rcu_core(); | ||
1741 | |||
1742 | return true; | ||
1743 | } | ||
1744 | |||
1745 | /* | ||
1746 | * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is | ||
1747 | * not a no-CBs CPU. | ||
1748 | */ | ||
1749 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, | ||
1750 | struct rcu_data *rdp, | ||
1751 | unsigned long flags) | ||
1752 | { | ||
1753 | lockdep_assert_irqs_disabled(); | ||
1754 | if (!rcu_is_nocb_cpu(smp_processor_id())) | ||
1755 | return false; /* Not NOCBs CPU, caller must migrate CBs. */ | ||
1756 | __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), | ||
1757 | rcu_segcblist_tail(&rdp->cblist), | ||
1758 | rcu_segcblist_n_cbs(&rdp->cblist), | ||
1759 | rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); | ||
1760 | rcu_segcblist_init(&rdp->cblist); | ||
1761 | rcu_segcblist_disable(&rdp->cblist); | ||
1762 | return true; | ||
1763 | } | 1936 | } |
1764 | 1937 | ||
1765 | /* | 1938 | /* |
1766 | * If necessary, kick off a new grace period, and either way wait | 1939 | * No-CBs GP kthreads come here to wait for additional callbacks to show up |
1767 | * for a subsequent grace period to complete. | 1940 | * or for grace periods to end. |
1768 | */ | 1941 | */ |
1769 | static void rcu_nocb_wait_gp(struct rcu_data *rdp) | 1942 | static void nocb_gp_wait(struct rcu_data *my_rdp) |
1770 | { | 1943 | { |
1771 | unsigned long c; | 1944 | bool bypass = false; |
1772 | bool d; | 1945 | long bypass_ncbs; |
1946 | int __maybe_unused cpu = my_rdp->cpu; | ||
1947 | unsigned long cur_gp_seq; | ||
1773 | unsigned long flags; | 1948 | unsigned long flags; |
1949 | bool gotcbs; | ||
1950 | unsigned long j = jiffies; | ||
1951 | bool needwait_gp = false; // This prevents actual uninitialized use. | ||
1774 | bool needwake; | 1952 | bool needwake; |
1775 | struct rcu_node *rnp = rdp->mynode; | 1953 | bool needwake_gp; |
1954 | struct rcu_data *rdp; | ||
1955 | struct rcu_node *rnp; | ||
1956 | unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. | ||
1776 | 1957 | ||
1777 | local_irq_save(flags); | 1958 | /* |
1778 | c = rcu_seq_snap(&rcu_state.gp_seq); | 1959 | * Each pass through the following loop checks for CBs and for the |
1779 | if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { | 1960 | * nearest grace period (if any) to wait for next. The CB kthreads |
1780 | local_irq_restore(flags); | 1961 | * and the global grace-period kthread are awakened if needed. |
1781 | } else { | 1962 | */ |
1782 | raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ | 1963 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { |
1783 | needwake = rcu_start_this_gp(rnp, rdp, c); | 1964 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); |
1784 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1965 | rcu_nocb_lock_irqsave(rdp, flags); |
1785 | if (needwake) | 1966 | bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); |
1967 | if (bypass_ncbs && | ||
1968 | (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || | ||
1969 | bypass_ncbs > 2 * qhimark)) { | ||
1970 | // Bypass full or old, so flush it. | ||
1971 | (void)rcu_nocb_try_flush_bypass(rdp, j); | ||
1972 | bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); | ||
1973 | } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { | ||
1974 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
1975 | continue; /* No callbacks here, try next. */ | ||
1976 | } | ||
1977 | if (bypass_ncbs) { | ||
1978 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
1979 | TPS("Bypass")); | ||
1980 | bypass = true; | ||
1981 | } | ||
1982 | rnp = rdp->mynode; | ||
1983 | if (bypass) { // Avoid race with first bypass CB. | ||
1984 | WRITE_ONCE(my_rdp->nocb_defer_wakeup, | ||
1985 | RCU_NOCB_WAKE_NOT); | ||
1986 | del_timer(&my_rdp->nocb_timer); | ||
1987 | } | ||
1988 | // Advance callbacks if helpful and low contention. | ||
1989 | needwake_gp = false; | ||
1990 | if (!rcu_segcblist_restempty(&rdp->cblist, | ||
1991 | RCU_NEXT_READY_TAIL) || | ||
1992 | (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && | ||
1993 | rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { | ||
1994 | raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ | ||
1995 | needwake_gp = rcu_advance_cbs(rnp, rdp); | ||
1996 | raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ | ||
1997 | } | ||
1998 | // Need to wait on some grace period? | ||
1999 | WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist, | ||
2000 | RCU_NEXT_READY_TAIL)); | ||
2001 | if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { | ||
2002 | if (!needwait_gp || | ||
2003 | ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) | ||
2004 | wait_gp_seq = cur_gp_seq; | ||
2005 | needwait_gp = true; | ||
2006 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
2007 | TPS("NeedWaitGP")); | ||
2008 | } | ||
2009 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) { | ||
2010 | needwake = rdp->nocb_cb_sleep; | ||
2011 | WRITE_ONCE(rdp->nocb_cb_sleep, false); | ||
2012 | smp_mb(); /* CB invocation -after- GP end. */ | ||
2013 | } else { | ||
2014 | needwake = false; | ||
2015 | } | ||
2016 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
2017 | if (needwake) { | ||
2018 | swake_up_one(&rdp->nocb_cb_wq); | ||
2019 | gotcbs = true; | ||
2020 | } | ||
2021 | if (needwake_gp) | ||
1786 | rcu_gp_kthread_wake(); | 2022 | rcu_gp_kthread_wake(); |
1787 | } | 2023 | } |
1788 | 2024 | ||
1789 | /* | 2025 | my_rdp->nocb_gp_bypass = bypass; |
1790 | * Wait for the grace period. Do so interruptibly to avoid messing | 2026 | my_rdp->nocb_gp_gp = needwait_gp; |
1791 | * up the load average. | 2027 | my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; |
1792 | */ | 2028 | if (bypass && !rcu_nocb_poll) { |
1793 | trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); | 2029 | // At least one child with non-empty ->nocb_bypass, so set |
1794 | for (;;) { | 2030 | // timer in order to avoid stranding its callbacks. |
2031 | raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); | ||
2032 | mod_timer(&my_rdp->nocb_bypass_timer, j + 2); | ||
2033 | raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); | ||
2034 | } | ||
2035 | if (rcu_nocb_poll) { | ||
2036 | /* Polling, so trace if first poll in the series. */ | ||
2037 | if (gotcbs) | ||
2038 | trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll")); | ||
2039 | schedule_timeout_interruptible(1); | ||
2040 | } else if (!needwait_gp) { | ||
2041 | /* Wait for callbacks to appear. */ | ||
2042 | trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep")); | ||
2043 | swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq, | ||
2044 | !READ_ONCE(my_rdp->nocb_gp_sleep)); | ||
2045 | trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep")); | ||
2046 | } else { | ||
2047 | rnp = my_rdp->mynode; | ||
2048 | trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait")); | ||
1795 | swait_event_interruptible_exclusive( | 2049 | swait_event_interruptible_exclusive( |
1796 | rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], | 2050 | rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1], |
1797 | (d = rcu_seq_done(&rnp->gp_seq, c))); | 2051 | rcu_seq_done(&rnp->gp_seq, wait_gp_seq) || |
1798 | if (likely(d)) | 2052 | !READ_ONCE(my_rdp->nocb_gp_sleep)); |
1799 | break; | 2053 | trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait")); |
1800 | WARN_ON(signal_pending(current)); | ||
1801 | trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); | ||
1802 | } | 2054 | } |
1803 | trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); | 2055 | if (!rcu_nocb_poll) { |
1804 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ | 2056 | raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags); |
2057 | if (bypass) | ||
2058 | del_timer(&my_rdp->nocb_bypass_timer); | ||
2059 | WRITE_ONCE(my_rdp->nocb_gp_sleep, true); | ||
2060 | raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags); | ||
2061 | } | ||
2062 | my_rdp->nocb_gp_seq = -1; | ||
2063 | WARN_ON(signal_pending(current)); | ||
1805 | } | 2064 | } |
1806 | 2065 | ||
1807 | /* | 2066 | /* |
1808 | * Leaders come here to wait for additional callbacks to show up. | 2067 | * No-CBs grace-period-wait kthread. There is one of these per group |
1809 | * This function does not return until callbacks appear. | 2068 | * of CPUs, but only once at least one CPU in that group has come online |
2069 | * at least once since boot. This kthread checks for newly posted | ||
2070 | * callbacks from any of the CPUs it is responsible for, waits for a | ||
2071 | * grace period, then awakens all of the rcu_nocb_cb_kthread() instances | ||
2072 | * that then have callback-invocation work to do. | ||
1810 | */ | 2073 | */ |
1811 | static void nocb_leader_wait(struct rcu_data *my_rdp) | 2074 | static int rcu_nocb_gp_kthread(void *arg) |
1812 | { | 2075 | { |
1813 | bool firsttime = true; | 2076 | struct rcu_data *rdp = arg; |
1814 | unsigned long flags; | ||
1815 | bool gotcbs; | ||
1816 | struct rcu_data *rdp; | ||
1817 | struct rcu_head **tail; | ||
1818 | |||
1819 | wait_again: | ||
1820 | |||
1821 | /* Wait for callbacks to appear. */ | ||
1822 | if (!rcu_nocb_poll) { | ||
1823 | trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); | ||
1824 | swait_event_interruptible_exclusive(my_rdp->nocb_wq, | ||
1825 | !READ_ONCE(my_rdp->nocb_leader_sleep)); | ||
1826 | raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); | ||
1827 | my_rdp->nocb_leader_sleep = true; | ||
1828 | WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); | ||
1829 | del_timer(&my_rdp->nocb_timer); | ||
1830 | raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); | ||
1831 | } else if (firsttime) { | ||
1832 | firsttime = false; /* Don't drown trace log with "Poll"! */ | ||
1833 | trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); | ||
1834 | } | ||
1835 | |||
1836 | /* | ||
1837 | * Each pass through the following loop checks a follower for CBs. | ||
1838 | * We are our own first follower. Any CBs found are moved to | ||
1839 | * nocb_gp_head, where they await a grace period. | ||
1840 | */ | ||
1841 | gotcbs = false; | ||
1842 | smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ | ||
1843 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { | ||
1844 | rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); | ||
1845 | if (!rdp->nocb_gp_head) | ||
1846 | continue; /* No CBs here, try next follower. */ | ||
1847 | |||
1848 | /* Move callbacks to wait-for-GP list, which is empty. */ | ||
1849 | WRITE_ONCE(rdp->nocb_head, NULL); | ||
1850 | rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
1851 | gotcbs = true; | ||
1852 | } | ||
1853 | |||
1854 | /* No callbacks? Sleep a bit if polling, and go retry. */ | ||
1855 | if (unlikely(!gotcbs)) { | ||
1856 | WARN_ON(signal_pending(current)); | ||
1857 | if (rcu_nocb_poll) { | ||
1858 | schedule_timeout_interruptible(1); | ||
1859 | } else { | ||
1860 | trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, | ||
1861 | TPS("WokeEmpty")); | ||
1862 | } | ||
1863 | goto wait_again; | ||
1864 | } | ||
1865 | 2077 | ||
1866 | /* Wait for one grace period. */ | 2078 | for (;;) { |
1867 | rcu_nocb_wait_gp(my_rdp); | 2079 | WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1); |
1868 | 2080 | nocb_gp_wait(rdp); | |
1869 | /* Each pass through the following loop wakes a follower, if needed. */ | 2081 | cond_resched_tasks_rcu_qs(); |
1870 | for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { | ||
1871 | if (!rcu_nocb_poll && | ||
1872 | READ_ONCE(rdp->nocb_head) && | ||
1873 | READ_ONCE(my_rdp->nocb_leader_sleep)) { | ||
1874 | raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); | ||
1875 | my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ | ||
1876 | raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); | ||
1877 | } | ||
1878 | if (!rdp->nocb_gp_head) | ||
1879 | continue; /* No CBs, so no need to wake follower. */ | ||
1880 | |||
1881 | /* Append callbacks to follower's "done" list. */ | ||
1882 | raw_spin_lock_irqsave(&rdp->nocb_lock, flags); | ||
1883 | tail = rdp->nocb_follower_tail; | ||
1884 | rdp->nocb_follower_tail = rdp->nocb_gp_tail; | ||
1885 | *tail = rdp->nocb_gp_head; | ||
1886 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | ||
1887 | if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { | ||
1888 | /* List was empty, so wake up the follower. */ | ||
1889 | swake_up_one(&rdp->nocb_wq); | ||
1890 | } | ||
1891 | } | 2082 | } |
1892 | 2083 | return 0; | |
1893 | /* If we (the leader) don't have CBs, go wait some more. */ | ||
1894 | if (!my_rdp->nocb_follower_head) | ||
1895 | goto wait_again; | ||
1896 | } | 2084 | } |
1897 | 2085 | ||
1898 | /* | 2086 | /* |
1899 | * Followers come here to wait for additional callbacks to show up. | 2087 | * Invoke any ready callbacks from the corresponding no-CBs CPU, |
1900 | * This function does not return until callbacks appear. | 2088 | * then, if there are no more, wait for more to appear. |
1901 | */ | 2089 | */ |
1902 | static void nocb_follower_wait(struct rcu_data *rdp) | 2090 | static void nocb_cb_wait(struct rcu_data *rdp) |
1903 | { | 2091 | { |
1904 | for (;;) { | 2092 | unsigned long cur_gp_seq; |
1905 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); | 2093 | unsigned long flags; |
1906 | swait_event_interruptible_exclusive(rdp->nocb_wq, | 2094 | bool needwake_gp = false; |
1907 | READ_ONCE(rdp->nocb_follower_head)); | 2095 | struct rcu_node *rnp = rdp->mynode; |
1908 | if (smp_load_acquire(&rdp->nocb_follower_head)) { | 2096 | |
1909 | /* ^^^ Ensure CB invocation follows _head test. */ | 2097 | local_irq_save(flags); |
1910 | return; | 2098 | rcu_momentary_dyntick_idle(); |
1911 | } | 2099 | local_irq_restore(flags); |
1912 | WARN_ON(signal_pending(current)); | 2100 | local_bh_disable(); |
1913 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); | 2101 | rcu_do_batch(rdp); |
2102 | local_bh_enable(); | ||
2103 | lockdep_assert_irqs_enabled(); | ||
2104 | rcu_nocb_lock_irqsave(rdp, flags); | ||
2105 | if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && | ||
2106 | rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && | ||
2107 | raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ | ||
2108 | needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); | ||
2109 | raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ | ||
2110 | } | ||
2111 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) { | ||
2112 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
2113 | if (needwake_gp) | ||
2114 | rcu_gp_kthread_wake(); | ||
2115 | return; | ||
2116 | } | ||
2117 | |||
2118 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); | ||
2119 | WRITE_ONCE(rdp->nocb_cb_sleep, true); | ||
2120 | rcu_nocb_unlock_irqrestore(rdp, flags); | ||
2121 | if (needwake_gp) | ||
2122 | rcu_gp_kthread_wake(); | ||
2123 | swait_event_interruptible_exclusive(rdp->nocb_cb_wq, | ||
2124 | !READ_ONCE(rdp->nocb_cb_sleep)); | ||
2125 | if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ | ||
2126 | /* ^^^ Ensure CB invocation follows _sleep test. */ | ||
2127 | return; | ||
1914 | } | 2128 | } |
2129 | WARN_ON(signal_pending(current)); | ||
2130 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); | ||
1915 | } | 2131 | } |
1916 | 2132 | ||
1917 | /* | 2133 | /* |
1918 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes | 2134 | * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke |
1919 | * callbacks queued by the corresponding no-CBs CPU, however, there is | 2135 | * nocb_cb_wait() to do the dirty work. |
1920 | * an optional leader-follower relationship so that the grace-period | ||
1921 | * kthreads don't have to do quite so many wakeups. | ||
1922 | */ | 2136 | */ |
1923 | static int rcu_nocb_kthread(void *arg) | 2137 | static int rcu_nocb_cb_kthread(void *arg) |
1924 | { | 2138 | { |
1925 | int c, cl; | ||
1926 | unsigned long flags; | ||
1927 | struct rcu_head *list; | ||
1928 | struct rcu_head *next; | ||
1929 | struct rcu_head **tail; | ||
1930 | struct rcu_data *rdp = arg; | 2139 | struct rcu_data *rdp = arg; |
1931 | 2140 | ||
1932 | /* Each pass through this loop invokes one batch of callbacks */ | 2141 | // Each pass through this loop does one callback batch, and, |
2142 | // if there are no more ready callbacks, waits for them. | ||
1933 | for (;;) { | 2143 | for (;;) { |
1934 | /* Wait for callbacks. */ | 2144 | nocb_cb_wait(rdp); |
1935 | if (rdp->nocb_leader == rdp) | 2145 | cond_resched_tasks_rcu_qs(); |
1936 | nocb_leader_wait(rdp); | ||
1937 | else | ||
1938 | nocb_follower_wait(rdp); | ||
1939 | |||
1940 | /* Pull the ready-to-invoke callbacks onto local list. */ | ||
1941 | raw_spin_lock_irqsave(&rdp->nocb_lock, flags); | ||
1942 | list = rdp->nocb_follower_head; | ||
1943 | rdp->nocb_follower_head = NULL; | ||
1944 | tail = rdp->nocb_follower_tail; | ||
1945 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; | ||
1946 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | ||
1947 | if (WARN_ON_ONCE(!list)) | ||
1948 | continue; | ||
1949 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); | ||
1950 | |||
1951 | /* Each pass through the following loop invokes a callback. */ | ||
1952 | trace_rcu_batch_start(rcu_state.name, | ||
1953 | atomic_long_read(&rdp->nocb_q_count_lazy), | ||
1954 | rcu_get_n_cbs_nocb_cpu(rdp), -1); | ||
1955 | c = cl = 0; | ||
1956 | while (list) { | ||
1957 | next = list->next; | ||
1958 | /* Wait for enqueuing to complete, if needed. */ | ||
1959 | while (next == NULL && &list->next != tail) { | ||
1960 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
1961 | TPS("WaitQueue")); | ||
1962 | schedule_timeout_interruptible(1); | ||
1963 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, | ||
1964 | TPS("WokeQueue")); | ||
1965 | next = list->next; | ||
1966 | } | ||
1967 | debug_rcu_head_unqueue(list); | ||
1968 | local_bh_disable(); | ||
1969 | if (__rcu_reclaim(rcu_state.name, list)) | ||
1970 | cl++; | ||
1971 | c++; | ||
1972 | local_bh_enable(); | ||
1973 | cond_resched_tasks_rcu_qs(); | ||
1974 | list = next; | ||
1975 | } | ||
1976 | trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); | ||
1977 | smp_mb__before_atomic(); /* _add after CB invocation. */ | ||
1978 | atomic_long_add(-c, &rdp->nocb_q_count); | ||
1979 | atomic_long_add(-cl, &rdp->nocb_q_count_lazy); | ||
1980 | } | 2146 | } |
1981 | return 0; | 2147 | return 0; |
1982 | } | 2148 | } |
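
The hunk above replaces the leader/follower kthreads with a per-group grace-period ("rcuog") kthread that batches work for per-CPU callback ("rcuo") kthreads. The following is a rough user-space sketch of that division of labor, with pthreads and a toy counter standing in for the real segmented callback lists; none of the names below are kernel APIs, and it builds with cc -pthread.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gp_wq = PTHREAD_COND_INITIALIZER;	/* "rcuog" sleeps here */
static pthread_cond_t cb_wq = PTHREAD_COND_INITIALIZER;	/* "rcuo" sleeps here */
static int queued;	/* callbacks still waiting for a "grace period" */
static int ready;	/* callbacks whose "grace period" has elapsed */
static bool done;

/* Model of nocb_gp_wait(): notice queued work, "wait" for a grace
 * period, then hand the batch to the callback side and wake it. */
static void *gp_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done) {
		while (!queued && !done)
			pthread_cond_wait(&gp_wq, &lock);
		if (queued) {
			ready += queued;	/* grace period "ends" here */
			queued = 0;
			pthread_cond_signal(&cb_wq);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Model of nocb_cb_wait(): sleep until the GP side says callbacks are
 * ready, then invoke them. */
static void *cb_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!done) {
		while (!ready && !done)
			pthread_cond_wait(&cb_wq, &lock);
		if (ready) {
			printf("invoking %d callback(s)\n", ready);
			ready = 0;
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t gp, cb;

	pthread_create(&gp, NULL, gp_thread, NULL);
	pthread_create(&cb, NULL, cb_thread, NULL);

	pthread_mutex_lock(&lock);	/* a "call_rcu()" arriving */
	queued = 3;
	pthread_cond_signal(&gp_wq);
	pthread_mutex_unlock(&lock);

	sleep(1);
	pthread_mutex_lock(&lock);
	done = true;
	pthread_cond_broadcast(&gp_wq);
	pthread_cond_broadcast(&cb_wq);
	pthread_mutex_unlock(&lock);
	pthread_join(gp, NULL);
	pthread_join(cb, NULL);
	return 0;
}
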
@@ -1993,14 +2159,14 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) | |||
1993 | unsigned long flags; | 2159 | unsigned long flags; |
1994 | int ndw; | 2160 | int ndw; |
1995 | 2161 | ||
1996 | raw_spin_lock_irqsave(&rdp->nocb_lock, flags); | 2162 | rcu_nocb_lock_irqsave(rdp, flags); |
1997 | if (!rcu_nocb_need_deferred_wakeup(rdp)) { | 2163 | if (!rcu_nocb_need_deferred_wakeup(rdp)) { |
1998 | raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); | 2164 | rcu_nocb_unlock_irqrestore(rdp, flags); |
1999 | return; | 2165 | return; |
2000 | } | 2166 | } |
2001 | ndw = READ_ONCE(rdp->nocb_defer_wakeup); | 2167 | ndw = READ_ONCE(rdp->nocb_defer_wakeup); |
2002 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); | 2168 | WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); |
2003 | __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); | 2169 | wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); |
2004 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); | 2170 | trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); |
2005 | } | 2171 | } |
2006 | 2172 | ||
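
The deferred-wakeup path above reads the recorded wakeup level, clears it, and only then performs the wakeup. A standalone sketch of that bookkeeping follows; the WAKE_* names mirror the kernel's RCU_NOCB_WAKE_* levels, but everything here is illustrative.

#include <stdio.h>

enum { WAKE_NOT, WAKE, WAKE_FORCE };

static int defer_wakeup = WAKE_NOT;

static void wake_gp(int force)
{
	printf("wake GP kthread%s\n", force ? " (force)" : "");
}

/* Caller cannot wake directly (e.g. irqs disabled): record a request. */
static void defer_wake(int level)
{
	if (level > defer_wakeup)	/* keep the strongest request */
		defer_wakeup = level;
}

/* Later, from a safe context: act on whatever was deferred. */
static void do_deferred_wakeup(void)
{
	int ndw = defer_wakeup;

	if (ndw == WAKE_NOT)
		return;			/* nothing was deferred */
	defer_wakeup = WAKE_NOT;	/* clear before acting on it */
	wake_gp(ndw == WAKE_FORCE);
}

int main(void)
{
	defer_wake(WAKE);
	defer_wake(WAKE_FORCE);
	do_deferred_wakeup();		/* one forced wakeup */
	do_deferred_wakeup();		/* nothing left to do */
	return 0;
}
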
@@ -2027,6 +2193,7 @@ void __init rcu_init_nohz(void) | |||
2027 | { | 2193 | { |
2028 | int cpu; | 2194 | int cpu; |
2029 | bool need_rcu_nocb_mask = false; | 2195 | bool need_rcu_nocb_mask = false; |
2196 | struct rcu_data *rdp; | ||
2030 | 2197 | ||
2031 | #if defined(CONFIG_NO_HZ_FULL) | 2198 | #if defined(CONFIG_NO_HZ_FULL) |
2032 | if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) | 2199 | if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) |
@@ -2060,67 +2227,63 @@ void __init rcu_init_nohz(void) | |||
2060 | if (rcu_nocb_poll) | 2227 | if (rcu_nocb_poll) |
2061 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 2228 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); |
2062 | 2229 | ||
2063 | for_each_cpu(cpu, rcu_nocb_mask) | 2230 | for_each_cpu(cpu, rcu_nocb_mask) { |
2064 | init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); | 2231 | rdp = per_cpu_ptr(&rcu_data, cpu); |
2232 | if (rcu_segcblist_empty(&rdp->cblist)) | ||
2233 | rcu_segcblist_init(&rdp->cblist); | ||
2234 | rcu_segcblist_offload(&rdp->cblist); | ||
2235 | } | ||
2065 | rcu_organize_nocb_kthreads(); | 2236 | rcu_organize_nocb_kthreads(); |
2066 | } | 2237 | } |
2067 | 2238 | ||
2068 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | 2239 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ |
2069 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2240 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
2070 | { | 2241 | { |
2071 | rdp->nocb_tail = &rdp->nocb_head; | 2242 | init_swait_queue_head(&rdp->nocb_cb_wq); |
2072 | init_swait_queue_head(&rdp->nocb_wq); | 2243 | init_swait_queue_head(&rdp->nocb_gp_wq); |
2073 | rdp->nocb_follower_tail = &rdp->nocb_follower_head; | ||
2074 | raw_spin_lock_init(&rdp->nocb_lock); | 2244 | raw_spin_lock_init(&rdp->nocb_lock); |
2245 | raw_spin_lock_init(&rdp->nocb_bypass_lock); | ||
2246 | raw_spin_lock_init(&rdp->nocb_gp_lock); | ||
2075 | timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); | 2247 | timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); |
2248 | timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0); | ||
2249 | rcu_cblist_init(&rdp->nocb_bypass); | ||
2076 | } | 2250 | } |
2077 | 2251 | ||
2078 | /* | 2252 | /* |
2079 | * If the specified CPU is a no-CBs CPU that does not already have its | 2253 | * If the specified CPU is a no-CBs CPU that does not already have its |
2080 | * rcuo kthread, spawn it. If the CPUs are brought online out of order, | 2254 | * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread |
2081 | * this can require re-organizing the leader-follower relationships. | 2255 | * for this CPU's group has not yet been created, spawn it as well. |
2082 | */ | 2256 | */ |
2083 | static void rcu_spawn_one_nocb_kthread(int cpu) | 2257 | static void rcu_spawn_one_nocb_kthread(int cpu) |
2084 | { | 2258 | { |
2085 | struct rcu_data *rdp; | 2259 | struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); |
2086 | struct rcu_data *rdp_last; | 2260 | struct rcu_data *rdp_gp; |
2087 | struct rcu_data *rdp_old_leader; | ||
2088 | struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); | ||
2089 | struct task_struct *t; | 2261 | struct task_struct *t; |
2090 | 2262 | ||
2091 | /* | 2263 | /* |
2092 | * If this isn't a no-CBs CPU or if it already has an rcuo kthread, | 2264 | * If this isn't a no-CBs CPU or if it already has an rcuo kthread, |
2093 | * then nothing to do. | 2265 | * then nothing to do. |
2094 | */ | 2266 | */ |
2095 | if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) | 2267 | if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) |
2096 | return; | 2268 | return; |
2097 | 2269 | ||
2098 | /* If we didn't spawn the leader first, reorganize! */ | 2270 | /* If we didn't spawn the GP kthread first, reorganize! */ |
2099 | rdp_old_leader = rdp_spawn->nocb_leader; | 2271 | rdp_gp = rdp->nocb_gp_rdp; |
2100 | if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { | 2272 | if (!rdp_gp->nocb_gp_kthread) { |
2101 | rdp_last = NULL; | 2273 | t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, |
2102 | rdp = rdp_old_leader; | 2274 | "rcuog/%d", rdp_gp->cpu); |
2103 | do { | 2275 | if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) |
2104 | rdp->nocb_leader = rdp_spawn; | 2276 | return; |
2105 | if (rdp_last && rdp != rdp_spawn) | 2277 | WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); |
2106 | rdp_last->nocb_next_follower = rdp; | ||
2107 | if (rdp == rdp_spawn) { | ||
2108 | rdp = rdp->nocb_next_follower; | ||
2109 | } else { | ||
2110 | rdp_last = rdp; | ||
2111 | rdp = rdp->nocb_next_follower; | ||
2112 | rdp_last->nocb_next_follower = NULL; | ||
2113 | } | ||
2114 | } while (rdp); | ||
2115 | rdp_spawn->nocb_next_follower = rdp_old_leader; | ||
2116 | } | 2278 | } |
2117 | 2279 | ||
2118 | /* Spawn the kthread for this CPU. */ | 2280 | /* Spawn the kthread for this CPU. */ |
2119 | t = kthread_run(rcu_nocb_kthread, rdp_spawn, | 2281 | t = kthread_run(rcu_nocb_cb_kthread, rdp, |
2120 | "rcuo%c/%d", rcu_state.abbr, cpu); | 2282 | "rcuo%c/%d", rcu_state.abbr, cpu); |
2121 | if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) | 2283 | if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) |
2122 | return; | 2284 | return; |
2123 | WRITE_ONCE(rdp_spawn->nocb_kthread, t); | 2285 | WRITE_ONCE(rdp->nocb_cb_kthread, t); |
2286 | WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); | ||
2124 | } | 2287 | } |
2125 | 2288 | ||
2126 | /* | 2289 | /* |
@@ -2147,27 +2310,28 @@ static void __init rcu_spawn_nocb_kthreads(void) | |||
2147 | rcu_spawn_cpu_nocb_kthread(cpu); | 2310 | rcu_spawn_cpu_nocb_kthread(cpu); |
2148 | } | 2311 | } |
2149 | 2312 | ||
2150 | /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ | 2313 | /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ |
2151 | static int rcu_nocb_leader_stride = -1; | 2314 | static int rcu_nocb_gp_stride = -1; |
2152 | module_param(rcu_nocb_leader_stride, int, 0444); | 2315 | module_param(rcu_nocb_gp_stride, int, 0444); |
2153 | 2316 | ||
2154 | /* | 2317 | /* |
2155 | * Initialize leader-follower relationships for all no-CBs CPUs. | 2318 | * Initialize GP-CB relationships for all no-CBs CPUs. |
2156 | */ | 2319 | */ |
2157 | static void __init rcu_organize_nocb_kthreads(void) | 2320 | static void __init rcu_organize_nocb_kthreads(void) |
2158 | { | 2321 | { |
2159 | int cpu; | 2322 | int cpu; |
2160 | int ls = rcu_nocb_leader_stride; | 2323 | bool firsttime = true; |
2161 | int nl = 0; /* Next leader. */ | 2324 | int ls = rcu_nocb_gp_stride; |
2325 | int nl = 0; /* Next GP kthread. */ | ||
2162 | struct rcu_data *rdp; | 2326 | struct rcu_data *rdp; |
2163 | struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ | 2327 | struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ |
2164 | struct rcu_data *rdp_prev = NULL; | 2328 | struct rcu_data *rdp_prev = NULL; |
2165 | 2329 | ||
2166 | if (!cpumask_available(rcu_nocb_mask)) | 2330 | if (!cpumask_available(rcu_nocb_mask)) |
2167 | return; | 2331 | return; |
2168 | if (ls == -1) { | 2332 | if (ls == -1) { |
2169 | ls = int_sqrt(nr_cpu_ids); | 2333 | ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); |
2170 | rcu_nocb_leader_stride = ls; | 2334 | rcu_nocb_gp_stride = ls; |
2171 | } | 2335 | } |
2172 | 2336 | ||
2173 | /* | 2337 | /* |
@@ -2178,39 +2342,24 @@ static void __init rcu_organize_nocb_kthreads(void) | |||
2178 | for_each_cpu(cpu, rcu_nocb_mask) { | 2342 | for_each_cpu(cpu, rcu_nocb_mask) { |
2179 | rdp = per_cpu_ptr(&rcu_data, cpu); | 2343 | rdp = per_cpu_ptr(&rcu_data, cpu); |
2180 | if (rdp->cpu >= nl) { | 2344 | if (rdp->cpu >= nl) { |
2181 | /* New leader, set up for followers & next leader. */ | 2345 | /* New GP kthread, set up for CBs & next GP. */ |
2182 | nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; | 2346 | nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; |
2183 | rdp->nocb_leader = rdp; | 2347 | rdp->nocb_gp_rdp = rdp; |
2184 | rdp_leader = rdp; | 2348 | rdp_gp = rdp; |
2349 | if (!firsttime && dump_tree) | ||
2350 | pr_cont("\n"); | ||
2351 | firsttime = false; | ||
2352 | pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu); | ||
2185 | } else { | 2353 | } else { |
2186 | /* Another follower, link to previous leader. */ | 2354 | /* Another CB kthread, link to previous GP kthread. */ |
2187 | rdp->nocb_leader = rdp_leader; | 2355 | rdp->nocb_gp_rdp = rdp_gp; |
2188 | rdp_prev->nocb_next_follower = rdp; | 2356 | rdp_prev->nocb_next_cb_rdp = rdp; |
2357 | pr_alert(" %d", cpu); | ||
2189 | } | 2358 | } |
2190 | rdp_prev = rdp; | 2359 | rdp_prev = rdp; |
2191 | } | 2360 | } |
2192 | } | 2361 | } |
2193 | 2362 | ||
2194 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | ||
2195 | static bool init_nocb_callback_list(struct rcu_data *rdp) | ||
2196 | { | ||
2197 | if (!rcu_is_nocb_cpu(rdp->cpu)) | ||
2198 | return false; | ||
2199 | |||
2200 | /* If there are early-boot callbacks, move them to nocb lists. */ | ||
2201 | if (!rcu_segcblist_empty(&rdp->cblist)) { | ||
2202 | rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); | ||
2203 | rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); | ||
2204 | atomic_long_set(&rdp->nocb_q_count, | ||
2205 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
2206 | atomic_long_set(&rdp->nocb_q_count_lazy, | ||
2207 | rcu_segcblist_n_lazy_cbs(&rdp->cblist)); | ||
2208 | rcu_segcblist_init(&rdp->cblist); | ||
2209 | } | ||
2210 | rcu_segcblist_disable(&rdp->cblist); | ||
2211 | return true; | ||
2212 | } | ||
2213 | |||
2214 | /* | 2363 | /* |
2215 | * Bind the current task to the offloaded CPUs. If there are no offloaded | 2364 | * Bind the current task to the offloaded CPUs. If there are no offloaded |
2216 | * CPUs, leave the task unbound. Splat if the bind attempt fails. | 2365 | * CPUs, leave the task unbound. Splat if the bind attempt fails. |
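
The rcu_organize_nocb_kthreads() hunks above change the default stride and the naming, but the grouping arithmetic keeps its shape: each stride of CPUs shares one rcuog GP kthread, led by the first CPU in the stride. A standalone sketch of that arithmetic, with int_sqrt() and DIV_ROUND_UP() reimplemented here purely for illustration:

#include <stdio.h>

static int int_sqrt(int x)
{
	int r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpu_ids = 24;
	int ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);	/* new default stride */
	int nl = 0;				/* first CPU of the next group */
	int cpu, gp_cpu = -1;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		if (cpu >= nl) {		/* start a new group */
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			gp_cpu = cpu;
			printf("\nrcuog group led by CPU %d:", gp_cpu);
		}
		printf(" %d", cpu);
	}
	printf("\n");
	return 0;
}
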
@@ -2223,20 +2372,101 @@ void rcu_bind_current_to_nocb(void) | |||
2223 | EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); | 2372 | EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); |
2224 | 2373 | ||
2225 | /* | 2374 | /* |
2226 | * Return the number of RCU callbacks still queued from the specified | 2375 | * Dump out nocb grace-period kthread state for the specified rcu_data |
2227 | * CPU, which must be a nocbs CPU. | 2376 | * structure. |
2228 | */ | 2377 | */ |
2229 | static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) | 2378 | static void show_rcu_nocb_gp_state(struct rcu_data *rdp) |
2230 | { | 2379 | { |
2231 | return atomic_long_read(&rdp->nocb_q_count); | 2380 | struct rcu_node *rnp = rdp->mynode; |
2381 | |||
2382 | pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n", | ||
2383 | rdp->cpu, | ||
2384 | "kK"[!!rdp->nocb_gp_kthread], | ||
2385 | "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], | ||
2386 | "dD"[!!rdp->nocb_defer_wakeup], | ||
2387 | "tT"[timer_pending(&rdp->nocb_timer)], | ||
2388 | "bB"[timer_pending(&rdp->nocb_bypass_timer)], | ||
2389 | "sS"[!!rdp->nocb_gp_sleep], | ||
2390 | ".W"[swait_active(&rdp->nocb_gp_wq)], | ||
2391 | ".W"[swait_active(&rnp->nocb_gp_wq[0])], | ||
2392 | ".W"[swait_active(&rnp->nocb_gp_wq[1])], | ||
2393 | ".B"[!!rdp->nocb_gp_bypass], | ||
2394 | ".G"[!!rdp->nocb_gp_gp], | ||
2395 | (long)rdp->nocb_gp_seq, | ||
2396 | rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops)); | ||
2397 | } | ||
2398 | |||
2399 | /* Dump out nocb kthread state for the specified rcu_data structure. */ | ||
2400 | static void show_rcu_nocb_state(struct rcu_data *rdp) | ||
2401 | { | ||
2402 | struct rcu_segcblist *rsclp = &rdp->cblist; | ||
2403 | bool waslocked; | ||
2404 | bool wastimer; | ||
2405 | bool wassleep; | ||
2406 | |||
2407 | if (rdp->nocb_gp_rdp == rdp) | ||
2408 | show_rcu_nocb_gp_state(rdp); | ||
2409 | |||
2410 | pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n", | ||
2411 | rdp->cpu, rdp->nocb_gp_rdp->cpu, | ||
2412 | "kK"[!!rdp->nocb_cb_kthread], | ||
2413 | "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], | ||
2414 | "cC"[!!atomic_read(&rdp->nocb_lock_contended)], | ||
2415 | "lL"[raw_spin_is_locked(&rdp->nocb_lock)], | ||
2416 | "sS"[!!rdp->nocb_cb_sleep], | ||
2417 | ".W"[swait_active(&rdp->nocb_cb_wq)], | ||
2418 | jiffies - rdp->nocb_bypass_first, | ||
2419 | jiffies - rdp->nocb_nobypass_last, | ||
2420 | rdp->nocb_nobypass_count, | ||
2421 | ".D"[rcu_segcblist_ready_cbs(rsclp)], | ||
2422 | ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)], | ||
2423 | ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)], | ||
2424 | ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)], | ||
2425 | ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], | ||
2426 | rcu_segcblist_n_cbs(&rdp->cblist)); | ||
2427 | |||
2428 | /* It is OK for GP kthreads to have GP state. */ | ||
2429 | if (rdp->nocb_gp_rdp == rdp) | ||
2430 | return; | ||
2431 | |||
2432 | waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); | ||
2433 | wastimer = timer_pending(&rdp->nocb_timer); | ||
2434 | wassleep = swait_active(&rdp->nocb_gp_wq); | ||
2435 | if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && | ||
2436 | !waslocked && !wastimer && !wassleep) | ||
2437 | return; /* Nothing untoward. */ | ||
2438 | |||
2439 | pr_info(" !!! %c%c%c%c %c\n", | ||
2440 | "lL"[waslocked], | ||
2441 | "dD"[!!rdp->nocb_defer_wakeup], | ||
2442 | "tT"[wastimer], | ||
2443 | "sS"[!!rdp->nocb_gp_sleep], | ||
2444 | ".W"[wassleep]); | ||
2232 | } | 2445 | } |
2233 | 2446 | ||
2234 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 2447 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ |
2235 | 2448 | ||
2236 | static bool rcu_nocb_cpu_needs_barrier(int cpu) | 2449 | /* No ->nocb_lock to acquire. */ |
2450 | static void rcu_nocb_lock(struct rcu_data *rdp) | ||
2451 | { | ||
2452 | } | ||
2453 | |||
2454 | /* No ->nocb_lock to release. */ | ||
2455 | static void rcu_nocb_unlock(struct rcu_data *rdp) | ||
2237 | { | 2456 | { |
2238 | WARN_ON_ONCE(1); /* Should be dead code. */ | 2457 | } |
2239 | return false; | 2458 | |
2459 | /* No ->nocb_lock to release. */ | ||
2460 | static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, | ||
2461 | unsigned long flags) | ||
2462 | { | ||
2463 | local_irq_restore(flags); | ||
2464 | } | ||
2465 | |||
2466 | /* Lockdep check that ->cblist may be safely accessed. */ | ||
2467 | static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) | ||
2468 | { | ||
2469 | lockdep_assert_irqs_disabled(); | ||
2240 | } | 2470 | } |
2241 | 2471 | ||
2242 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) | 2472 | static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) |
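
The state-dump helpers above lean on the two-character string-indexing idiom, where "kK"[!!cond] picks the lowercase letter when the condition is false and the uppercase one when it is true. A tiny self-contained illustration of the idiom (the flags printed are invented):

#include <stdbool.h>
#include <stdio.h>

/* "kK"[!!cond] indexes a two-character string literal with a boolean:
 * 'k' means "no kthread", 'K' means "kthread present", and ".W" style
 * pairs print '.' for inactive and 'W' for waiting. */
static char flag(const char *pair, bool cond)
{
	return pair[!!cond];
}

int main(void)
{
	bool has_kthread = true;
	bool timer_pending = false;
	bool waiter_active = true;

	printf("nocb state: %c%c%c\n",
	       flag("kK", has_kthread),
	       flag("tT", timer_pending),
	       flag(".W", waiter_active));	/* prints "KtW" */
	return 0;
}
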
@@ -2252,19 +2482,24 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
2252 | { | 2482 | { |
2253 | } | 2483 | } |
2254 | 2484 | ||
2255 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 2485 | static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, |
2256 | bool lazy, unsigned long flags) | 2486 | unsigned long j) |
2257 | { | 2487 | { |
2258 | return false; | 2488 | return true; |
2259 | } | 2489 | } |
2260 | 2490 | ||
2261 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, | 2491 | static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, |
2262 | struct rcu_data *rdp, | 2492 | bool *was_alldone, unsigned long flags) |
2263 | unsigned long flags) | ||
2264 | { | 2493 | { |
2265 | return false; | 2494 | return false; |
2266 | } | 2495 | } |
2267 | 2496 | ||
2497 | static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, | ||
2498 | unsigned long flags) | ||
2499 | { | ||
2500 | WARN_ON_ONCE(1); /* Should be dead code! */ | ||
2501 | } | ||
2502 | |||
2268 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2503 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
2269 | { | 2504 | { |
2270 | } | 2505 | } |
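
The !CONFIG_RCU_NOCB_CPU stubs above follow the usual pattern of compiling the locking helpers down to no-ops so the common code can call them unconditionally. A small standalone illustration of that pattern (CONFIG_OFFLOAD and struct data are made up):

#include <stdio.h>

#define CONFIG_OFFLOAD 0

struct data { int lock_taken; };

#if CONFIG_OFFLOAD
static void data_lock(struct data *d)   { d->lock_taken = 1; }
static void data_unlock(struct data *d) { d->lock_taken = 0; }
#else
/* No lock to acquire or release in this configuration. */
static void data_lock(struct data *d)   { (void)d; }
static void data_unlock(struct data *d) { (void)d; }
#endif

int main(void)
{
	struct data d = { 0 };

	data_lock(&d);			/* common code never needs #ifdefs */
	printf("lock taken: %d\n", d.lock_taken);
	data_unlock(&d);
	return 0;
}
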
@@ -2286,14 +2521,8 @@ static void __init rcu_spawn_nocb_kthreads(void) | |||
2286 | { | 2521 | { |
2287 | } | 2522 | } |
2288 | 2523 | ||
2289 | static bool init_nocb_callback_list(struct rcu_data *rdp) | 2524 | static void show_rcu_nocb_state(struct rcu_data *rdp) |
2290 | { | ||
2291 | return false; | ||
2292 | } | ||
2293 | |||
2294 | static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) | ||
2295 | { | 2525 | { |
2296 | return 0; | ||
2297 | } | 2526 | } |
2298 | 2527 | ||
2299 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | 2528 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ |
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 065183391f75..841ab43f3e60 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h | |||
@@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp) | |||
527 | 527 | ||
528 | /* We haven't checked in, so go dump stack. */ | 528 | /* We haven't checked in, so go dump stack. */ |
529 | print_cpu_stall(); | 529 | print_cpu_stall(); |
530 | if (rcu_cpu_stall_ftrace_dump) | ||
531 | rcu_ftrace_dump(DUMP_ALL); | ||
530 | 532 | ||
531 | } else if (rcu_gp_in_progress() && | 533 | } else if (rcu_gp_in_progress() && |
532 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && | 534 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && |
@@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp) | |||
534 | 536 | ||
535 | /* They had a few time units to dump stack, so complain. */ | 537 | /* They had a few time units to dump stack, so complain. */ |
536 | print_other_cpu_stall(gs2); | 538 | print_other_cpu_stall(gs2); |
539 | if (rcu_cpu_stall_ftrace_dump) | ||
540 | rcu_ftrace_dump(DUMP_ALL); | ||
537 | } | 541 | } |
538 | } | 542 | } |
539 | 543 | ||
@@ -585,6 +589,11 @@ void show_rcu_gp_kthreads(void) | |||
585 | cpu, (long)rdp->gp_seq_needed); | 589 | cpu, (long)rdp->gp_seq_needed); |
586 | } | 590 | } |
587 | } | 591 | } |
592 | for_each_possible_cpu(cpu) { | ||
593 | rdp = per_cpu_ptr(&rcu_data, cpu); | ||
594 | if (rcu_segcblist_is_offloaded(&rdp->cblist)) | ||
595 | show_rcu_nocb_state(rdp); | ||
596 | } | ||
588 | /* sched_show_task(rcu_state.gp_kthread); */ | 597 | /* sched_show_task(rcu_state.gp_kthread); */ |
589 | } | 598 | } |
590 | EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); | 599 | EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 61df2bf08563..1861103662db 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0); | |||
61 | 61 | ||
62 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 62 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
63 | /** | 63 | /** |
64 | * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? | 64 | * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section? |
65 | * @ret: Best guess answer if lockdep cannot be relied on | ||
65 | * | 66 | * |
66 | * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an | 67 | * Returns true if lockdep must be ignored, in which case *ret contains |
68 | * the best guess described below. Otherwise returns false, in which | ||
69 | * case *ret tells the caller nothing and the caller should instead | ||
70 | * consult lockdep. | ||
71 | * | ||
72 | * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an | ||
67 | * RCU-sched read-side critical section. In absence of | 73 | * RCU-sched read-side critical section. In absence of |
68 | * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side | 74 | * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side |
69 | * critical section unless it can prove otherwise. Note that disabling | 75 | * critical section unless it can prove otherwise. Note that disabling |
@@ -75,35 +81,45 @@ module_param(rcu_normal_after_boot, int, 0); | |||
75 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot | 81 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot |
76 | * and while lockdep is disabled. | 82 | * and while lockdep is disabled. |
77 | * | 83 | * |
78 | * Note that if the CPU is in the idle loop from an RCU point of | 84 | * Note that if the CPU is in the idle loop from an RCU point of view (ie: |
79 | * view (ie: that we are in the section between rcu_idle_enter() and | 85 | * that we are in the section between rcu_idle_enter() and rcu_idle_exit()) |
80 | * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU | 86 | * then rcu_read_lock_held() sets *ret to false even if the CPU did an |
81 | * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs | 87 | * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are |
82 | * that are in such a section, considering these as in extended quiescent | 88 | * in such a section, considering these as in extended quiescent state, |
83 | * state, so such a CPU is effectively never in an RCU read-side critical | 89 | * so such a CPU is effectively never in an RCU read-side critical section |
84 | * section regardless of what RCU primitives it invokes. This state of | 90 | * regardless of what RCU primitives it invokes. This state of affairs is |
85 | * affairs is required --- we need to keep an RCU-free window in idle | 91 | * required --- we need to keep an RCU-free window in idle where the CPU may |
86 | * where the CPU may possibly enter into low power mode. This way we can | 92 | * possibly enter into low power mode. This way we can notice an extended |
87 | * notice an extended quiescent state to other CPUs that started a grace | 93 | * quiescent state to other CPUs that started a grace period. Otherwise |
88 | * period. Otherwise we would delay any grace period as long as we run in | 94 | * we would delay any grace period as long as we run in the idle task. |
89 | * the idle task. | ||
90 | * | 95 | * |
91 | * Similarly, we avoid claiming an SRCU read lock held if the current | 96 | * Similarly, we avoid claiming an RCU read lock held if the current |
92 | * CPU is offline. | 97 | * CPU is offline. |
93 | */ | 98 | */ |
99 | static bool rcu_read_lock_held_common(bool *ret) | ||
100 | { | ||
101 | if (!debug_lockdep_rcu_enabled()) { | ||
102 | *ret = 1; | ||
103 | return true; | ||
104 | } | ||
105 | if (!rcu_is_watching()) { | ||
106 | *ret = 0; | ||
107 | return true; | ||
108 | } | ||
109 | if (!rcu_lockdep_current_cpu_online()) { | ||
110 | *ret = 0; | ||
111 | return true; | ||
112 | } | ||
113 | return false; | ||
114 | } | ||
115 | |||
94 | int rcu_read_lock_sched_held(void) | 116 | int rcu_read_lock_sched_held(void) |
95 | { | 117 | { |
96 | int lockdep_opinion = 0; | 118 | bool ret; |
97 | 119 | ||
98 | if (!debug_lockdep_rcu_enabled()) | 120 | if (rcu_read_lock_held_common(&ret)) |
99 | return 1; | 121 | return ret; |
100 | if (!rcu_is_watching()) | 122 | return lock_is_held(&rcu_sched_lock_map) || !preemptible(); |
101 | return 0; | ||
102 | if (!rcu_lockdep_current_cpu_online()) | ||
103 | return 0; | ||
104 | if (debug_locks) | ||
105 | lockdep_opinion = lock_is_held(&rcu_sched_lock_map); | ||
106 | return lockdep_opinion || !preemptible(); | ||
107 | } | 123 | } |
108 | EXPORT_SYMBOL(rcu_read_lock_sched_held); | 124 | EXPORT_SYMBOL(rcu_read_lock_sched_held); |
109 | #endif | 125 | #endif |
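
The rcu_read_lock_held_common() refactoring above folds the shared early-out checks of the lockdep helpers into one function that reports a decided answer through an out-parameter. A standalone sketch of the same pattern, with plain booleans standing in for the real debug_lockdep_rcu_enabled(), rcu_is_watching(), and lockdep queries:

#include <stdbool.h>
#include <stdio.h>

static bool debug_enabled = true;
static bool watching = true;
static bool cpu_online_now = true;

/* Returns true if the answer is already decided and stored in *ret;
 * returns false if the caller must consult its own lock map. */
static bool lock_held_common(bool *ret)
{
	if (!debug_enabled) {
		*ret = true;	/* cannot tell, err on "held" */
		return true;
	}
	if (!watching || !cpu_online_now) {
		*ret = false;	/* definitely not in a read-side section */
		return true;
	}
	return false;
}

static int read_lock_held(bool lock_map_says_held)
{
	bool ret;

	if (lock_held_common(&ret))
		return ret;
	return lock_map_says_held;
}

int main(void)
{
	printf("held=%d\n", read_lock_held(true));
	watching = false;
	printf("held=%d\n", read_lock_held(true));
	return 0;
}
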
@@ -136,8 +152,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1); | |||
136 | */ | 152 | */ |
137 | bool rcu_gp_is_expedited(void) | 153 | bool rcu_gp_is_expedited(void) |
138 | { | 154 | { |
139 | return rcu_expedited || atomic_read(&rcu_expedited_nesting) || | 155 | return rcu_expedited || atomic_read(&rcu_expedited_nesting); |
140 | rcu_scheduler_active == RCU_SCHEDULER_INIT; | ||
141 | } | 156 | } |
142 | EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); | 157 | EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); |
143 | 158 | ||
@@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); | |||
261 | */ | 276 | */ |
262 | int rcu_read_lock_held(void) | 277 | int rcu_read_lock_held(void) |
263 | { | 278 | { |
264 | if (!debug_lockdep_rcu_enabled()) | 279 | bool ret; |
265 | return 1; | 280 | |
266 | if (!rcu_is_watching()) | 281 | if (rcu_read_lock_held_common(&ret)) |
267 | return 0; | 282 | return ret; |
268 | if (!rcu_lockdep_current_cpu_online()) | ||
269 | return 0; | ||
270 | return lock_is_held(&rcu_lock_map); | 283 | return lock_is_held(&rcu_lock_map); |
271 | } | 284 | } |
272 | EXPORT_SYMBOL_GPL(rcu_read_lock_held); | 285 | EXPORT_SYMBOL_GPL(rcu_read_lock_held); |
@@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held); | |||
288 | */ | 301 | */ |
289 | int rcu_read_lock_bh_held(void) | 302 | int rcu_read_lock_bh_held(void) |
290 | { | 303 | { |
291 | if (!debug_lockdep_rcu_enabled()) | 304 | bool ret; |
292 | return 1; | 305 | |
293 | if (!rcu_is_watching()) | 306 | if (rcu_read_lock_held_common(&ret)) |
294 | return 0; | 307 | return ret; |
295 | if (!rcu_lockdep_current_cpu_online()) | ||
296 | return 0; | ||
297 | return in_softirq() || irqs_disabled(); | 308 | return in_softirq() || irqs_disabled(); |
298 | } | 309 | } |
299 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 310 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
300 | 311 | ||
312 | int rcu_read_lock_any_held(void) | ||
313 | { | ||
314 | bool ret; | ||
315 | |||
316 | if (rcu_read_lock_held_common(&ret)) | ||
317 | return ret; | ||
318 | if (lock_is_held(&rcu_lock_map) || | ||
319 | lock_is_held(&rcu_bh_lock_map) || | ||
320 | lock_is_held(&rcu_sched_lock_map)) | ||
321 | return 1; | ||
322 | return !preemptible(); | ||
323 | } | ||
324 | EXPORT_SYMBOL_GPL(rcu_read_lock_any_held); | ||
325 | |||
301 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 326 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
302 | 327 | ||
303 | /** | 328 | /** |
@@ -437,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); | |||
437 | #endif | 462 | #endif |
438 | 463 | ||
439 | #ifdef CONFIG_RCU_STALL_COMMON | 464 | #ifdef CONFIG_RCU_STALL_COMMON |
465 | int rcu_cpu_stall_ftrace_dump __read_mostly; | ||
466 | module_param(rcu_cpu_stall_ftrace_dump, int, 0644); | ||
440 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 467 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
441 | EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); | 468 | EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); |
442 | module_param(rcu_cpu_stall_suppress, int, 0644); | 469 | module_param(rcu_cpu_stall_suppress, int, 0644); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..7fa8e74ad2ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -3486,8 +3486,36 @@ void scheduler_tick(void) | |||
3486 | 3486 | ||
3487 | struct tick_work { | 3487 | struct tick_work { |
3488 | int cpu; | 3488 | int cpu; |
3489 | atomic_t state; | ||
3489 | struct delayed_work work; | 3490 | struct delayed_work work; |
3490 | }; | 3491 | }; |
3492 | /* Values for ->state, see diagram below. */ | ||
3493 | #define TICK_SCHED_REMOTE_OFFLINE 0 | ||
3494 | #define TICK_SCHED_REMOTE_OFFLINING 1 | ||
3495 | #define TICK_SCHED_REMOTE_RUNNING 2 | ||
3496 | |||
3497 | /* | ||
3498 | * State diagram for ->state: | ||
3499 | * | ||
3500 | * | ||
3501 | * TICK_SCHED_REMOTE_OFFLINE | ||
3502 | * | ^ | ||
3503 | * | | | ||
3504 | * | | sched_tick_remote() | ||
3505 | * | | | ||
3506 | * | | | ||
3507 | * +--TICK_SCHED_REMOTE_OFFLINING | ||
3508 | * | ^ | ||
3509 | * | | | ||
3510 | * sched_tick_start() | | sched_tick_stop() | ||
3511 | * | | | ||
3512 | * V | | ||
3513 | * TICK_SCHED_REMOTE_RUNNING | ||
3514 | * | ||
3515 | * | ||
3516 | * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() | ||
3517 | * and sched_tick_start() are happy to leave the state in RUNNING. | ||
3518 | */ | ||
3491 | 3519 | ||
3492 | static struct tick_work __percpu *tick_work_cpu; | 3520 | static struct tick_work __percpu *tick_work_cpu; |
3493 | 3521 | ||
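
The remote-tick hunks in this file replace unconditional requeueing with the three-state machine shown in the diagram above; the later hunks drive it with atomic_xchg() and atomic_fetch_add_unless(). A user-space C11 model of those transitions (fetch_add_unless() below only emulates the kernel primitive, and the prints stand in for queueing or skipping the delayed work):

#include <stdatomic.h>
#include <stdio.h>

enum { TICK_OFFLINE, TICK_OFFLINING, TICK_RUNNING };

static atomic_int state = TICK_OFFLINE;

/* Emulation of atomic_fetch_add_unless(&state, -1, TICK_RUNNING). */
static int fetch_add_unless(atomic_int *v, int a, int unless)
{
	int old = atomic_load(v);

	while (old != unless &&
	       !atomic_compare_exchange_weak(v, &old, old + a))
		;
	return old;
}

static void tick_start(void)
{
	int os = atomic_exchange(&state, TICK_RUNNING);

	if (os == TICK_OFFLINE)
		printf("start: queue the delayed work\n");
	else
		printf("start: work still queued (was %d)\n", os);
}

static void tick_stop(void)
{
	atomic_exchange(&state, TICK_OFFLINING);
	printf("stop: let the work function observe OFFLINING\n");
}

static void tick_work(void)
{
	int os = fetch_add_unless(&state, -1, TICK_RUNNING);

	if (os == TICK_RUNNING)
		printf("work: still running, requeue\n");
	else
		printf("work: saw %d, now offline, do not requeue\n", os);
}

int main(void)
{
	tick_start();	/* OFFLINE   -> RUNNING, queues work       */
	tick_work();	/* RUNNING   stays RUNNING, requeues        */
	tick_stop();	/* RUNNING   -> OFFLINING                   */
	tick_work();	/* OFFLINING -> OFFLINE, stops requeueing   */
	return 0;
}
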
@@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work) | |||
3500 | struct task_struct *curr; | 3528 | struct task_struct *curr; |
3501 | struct rq_flags rf; | 3529 | struct rq_flags rf; |
3502 | u64 delta; | 3530 | u64 delta; |
3531 | int os; | ||
3503 | 3532 | ||
3504 | /* | 3533 | /* |
3505 | * Handle the tick only if it appears the remote CPU is running in full | 3534 | * Handle the tick only if it appears the remote CPU is running in full |
@@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work) | |||
3513 | 3542 | ||
3514 | rq_lock_irq(rq, &rf); | 3543 | rq_lock_irq(rq, &rf); |
3515 | curr = rq->curr; | 3544 | curr = rq->curr; |
3516 | if (is_idle_task(curr)) | 3545 | if (is_idle_task(curr) || cpu_is_offline(cpu)) |
3517 | goto out_unlock; | 3546 | goto out_unlock; |
3518 | 3547 | ||
3519 | update_rq_clock(rq); | 3548 | update_rq_clock(rq); |
@@ -3533,13 +3562,18 @@ out_requeue: | |||
3533 | /* | 3562 | /* |
3534 | * Run the remote tick once per second (1Hz). This arbitrary | 3563 | * Run the remote tick once per second (1Hz). This arbitrary |
3535 | * frequency is large enough to avoid overload but short enough | 3564 | * frequency is large enough to avoid overload but short enough |
3536 | * to keep scheduler internal stats reasonably up to date. | 3565 | * to keep scheduler internal stats reasonably up to date. But |
3566 | * first update state to reflect hotplug activity if required. | ||
3537 | */ | 3567 | */ |
3538 | queue_delayed_work(system_unbound_wq, dwork, HZ); | 3568 | os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); |
3569 | WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); | ||
3570 | if (os == TICK_SCHED_REMOTE_RUNNING) | ||
3571 | queue_delayed_work(system_unbound_wq, dwork, HZ); | ||
3539 | } | 3572 | } |
3540 | 3573 | ||
3541 | static void sched_tick_start(int cpu) | 3574 | static void sched_tick_start(int cpu) |
3542 | { | 3575 | { |
3576 | int os; | ||
3543 | struct tick_work *twork; | 3577 | struct tick_work *twork; |
3544 | 3578 | ||
3545 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | 3579 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) |
@@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu) | |||
3548 | WARN_ON_ONCE(!tick_work_cpu); | 3582 | WARN_ON_ONCE(!tick_work_cpu); |
3549 | 3583 | ||
3550 | twork = per_cpu_ptr(tick_work_cpu, cpu); | 3584 | twork = per_cpu_ptr(tick_work_cpu, cpu); |
3551 | twork->cpu = cpu; | 3585 | os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); |
3552 | INIT_DELAYED_WORK(&twork->work, sched_tick_remote); | 3586 | WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); |
3553 | queue_delayed_work(system_unbound_wq, &twork->work, HZ); | 3587 | if (os == TICK_SCHED_REMOTE_OFFLINE) { |
3588 | twork->cpu = cpu; | ||
3589 | INIT_DELAYED_WORK(&twork->work, sched_tick_remote); | ||
3590 | queue_delayed_work(system_unbound_wq, &twork->work, HZ); | ||
3591 | } | ||
3554 | } | 3592 | } |
3555 | 3593 | ||
3556 | #ifdef CONFIG_HOTPLUG_CPU | 3594 | #ifdef CONFIG_HOTPLUG_CPU |
3557 | static void sched_tick_stop(int cpu) | 3595 | static void sched_tick_stop(int cpu) |
3558 | { | 3596 | { |
3559 | struct tick_work *twork; | 3597 | struct tick_work *twork; |
3598 | int os; | ||
3560 | 3599 | ||
3561 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | 3600 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) |
3562 | return; | 3601 | return; |
@@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu) | |||
3564 | WARN_ON_ONCE(!tick_work_cpu); | 3603 | WARN_ON_ONCE(!tick_work_cpu); |
3565 | 3604 | ||
3566 | twork = per_cpu_ptr(tick_work_cpu, cpu); | 3605 | twork = per_cpu_ptr(tick_work_cpu, cpu); |
3567 | cancel_delayed_work_sync(&twork->work); | 3606 | /* There cannot be competing actions, but don't rely on stop-machine. */ |
3607 | os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); | ||
3608 | WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); | ||
3609 | /* Don't cancel, as this would mess up the state machine. */ | ||
3568 | } | 3610 | } |
3569 | #endif /* CONFIG_HOTPLUG_CPU */ | 3611 | #endif /* CONFIG_HOTPLUG_CPU */ |
3570 | 3612 | ||
@@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void) | |||
3572 | { | 3614 | { |
3573 | tick_work_cpu = alloc_percpu(struct tick_work); | 3615 | tick_work_cpu = alloc_percpu(struct tick_work); |
3574 | BUG_ON(!tick_work_cpu); | 3616 | BUG_ON(!tick_work_cpu); |
3575 | |||
3576 | return 0; | 3617 | return 0; |
3577 | } | 3618 | } |
3578 | 3619 | ||
@@ -3904,7 +3945,7 @@ void __noreturn do_task_dead(void) | |||
3904 | 3945 | ||
3905 | static inline void sched_submit_work(struct task_struct *tsk) | 3946 | static inline void sched_submit_work(struct task_struct *tsk) |
3906 | { | 3947 | { |
3907 | if (!tsk->state || tsk_is_pi_blocked(tsk)) | 3948 | if (!tsk->state) |
3908 | return; | 3949 | return; |
3909 | 3950 | ||
3910 | /* | 3951 | /* |
@@ -3920,6 +3961,9 @@ static inline void sched_submit_work(struct task_struct *tsk) | |||
3920 | preempt_enable_no_resched(); | 3961 | preempt_enable_no_resched(); |
3921 | } | 3962 | } |
3922 | 3963 | ||
3964 | if (tsk_is_pi_blocked(tsk)) | ||
3965 | return; | ||
3966 | |||
3923 | /* | 3967 | /* |
3924 | * If we are going to sleep and we have plugged IO queued, | 3968 | * If we are going to sleep and we have plugged IO queued, |
3925 | * make sure to submit it to avoid deadlocks. | 3969 | * make sure to submit it to avoid deadlocks. |
@@ -5102,37 +5146,40 @@ out_unlock: | |||
5102 | return retval; | 5146 | return retval; |
5103 | } | 5147 | } |
5104 | 5148 | ||
5105 | static int sched_read_attr(struct sched_attr __user *uattr, | 5149 | /* |
5106 | struct sched_attr *attr, | 5150 | * Copy the kernel size attribute structure (which might be larger |
5107 | unsigned int usize) | 5151 | * than what user-space knows about) to user-space. |
5152 | * | ||
5153 | * Note that all cases are valid: user-space buffer can be larger or | ||
5154 | * smaller than the kernel-space buffer. The usual case is that both | ||
5155 | * have the same size. | ||
5156 | */ | ||
5157 | static int | ||
5158 | sched_attr_copy_to_user(struct sched_attr __user *uattr, | ||
5159 | struct sched_attr *kattr, | ||
5160 | unsigned int usize) | ||
5108 | { | 5161 | { |
5109 | int ret; | 5162 | unsigned int ksize = sizeof(*kattr); |
5110 | 5163 | ||
5111 | if (!access_ok(uattr, usize)) | 5164 | if (!access_ok(uattr, usize)) |
5112 | return -EFAULT; | 5165 | return -EFAULT; |
5113 | 5166 | ||
5114 | /* | 5167 | /* |
5115 | * If we're handed a smaller struct than we know of, | 5168 | * sched_getattr() ABI forwards and backwards compatibility: |
5116 | * ensure all the unknown bits are 0 - i.e. old | 5169 | * |
5117 | * user-space does not get uncomplete information. | 5170 | * If usize == ksize then we just copy everything to user-space and all is good. |
5171 | * | ||
5172 | * If usize < ksize then we only copy as much as user-space has space for, | ||
5173 | * this keeps ABI compatibility as well. We skip the rest. | ||
5174 | * | ||
5175 | * If usize > ksize then user-space is using a newer version of the ABI, | ||
5176 | * which part the kernel doesn't know about. Just ignore it - tooling can | ||
5177 | * detect the kernel's knowledge of attributes from the attr->size value | ||
5178 | * which is set to ksize in this case. | ||
5118 | */ | 5179 | */ |
5119 | if (usize < sizeof(*attr)) { | 5180 | kattr->size = min(usize, ksize); |
5120 | unsigned char *addr; | ||
5121 | unsigned char *end; | ||
5122 | 5181 | ||
5123 | addr = (void *)attr + usize; | 5182 | if (copy_to_user(uattr, kattr, kattr->size)) |
5124 | end = (void *)attr + sizeof(*attr); | ||
5125 | |||
5126 | for (; addr < end; addr++) { | ||
5127 | if (*addr) | ||
5128 | return -EFBIG; | ||
5129 | } | ||
5130 | |||
5131 | attr->size = usize; | ||
5132 | } | ||
5133 | |||
5134 | ret = copy_to_user(uattr, attr, attr->size); | ||
5135 | if (ret) | ||
5136 | return -EFAULT; | 5183 | return -EFAULT; |
5137 | 5184 | ||
5138 | return 0; | 5185 | return 0; |
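
sched_attr_copy_to_user() above implements the forwards/backwards ABI rule by copying min(usize, ksize) bytes and recording that size in ->size. A user-space sketch of the same rule, with an invented struct layout and memcpy() standing in for copy_to_user():

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct kattr {
	uint32_t size;
	uint32_t policy;
	uint32_t util_min;	/* field an older user-space knows nothing about */
};

static unsigned int attr_copy_to_user(void *ubuf, unsigned int usize,
				      struct kattr *kattr)
{
	unsigned int ksize = sizeof(*kattr);

	kattr->size = usize < ksize ? usize : ksize;	/* min(usize, ksize) */
	memcpy(ubuf, kattr, kattr->size);	/* copy_to_user() stand-in */
	return kattr->size;
}

int main(void)
{
	struct kattr kattr = { .policy = 0, .util_min = 512 };
	unsigned char old_abi[8];			/* only size + policy */
	unsigned char new_abi[sizeof(struct kattr) + 8];/* newer, larger ABI */

	printf("old ABI receives %u bytes\n",
	       attr_copy_to_user(old_abi, sizeof(old_abi), &kattr));
	printf("new ABI receives %u bytes\n",
	       attr_copy_to_user(new_abi, sizeof(new_abi), &kattr));
	return 0;
}
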
@@ -5142,20 +5189,18 @@ static int sched_read_attr(struct sched_attr __user *uattr, | |||
5142 | * sys_sched_getattr - similar to sched_getparam, but with sched_attr | 5189 | * sys_sched_getattr - similar to sched_getparam, but with sched_attr |
5143 | * @pid: the pid in question. | 5190 | * @pid: the pid in question. |
5144 | * @uattr: structure containing the extended parameters. | 5191 | * @uattr: structure containing the extended parameters. |
5145 | * @size: sizeof(attr) for fwd/bwd comp. | 5192 | * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility. |
5146 | * @flags: for future extension. | 5193 | * @flags: for future extension. |
5147 | */ | 5194 | */ |
5148 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | 5195 | SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, |
5149 | unsigned int, size, unsigned int, flags) | 5196 | unsigned int, usize, unsigned int, flags) |
5150 | { | 5197 | { |
5151 | struct sched_attr attr = { | 5198 | struct sched_attr kattr = { }; |
5152 | .size = sizeof(struct sched_attr), | ||
5153 | }; | ||
5154 | struct task_struct *p; | 5199 | struct task_struct *p; |
5155 | int retval; | 5200 | int retval; |
5156 | 5201 | ||
5157 | if (!uattr || pid < 0 || size > PAGE_SIZE || | 5202 | if (!uattr || pid < 0 || usize > PAGE_SIZE || |
5158 | size < SCHED_ATTR_SIZE_VER0 || flags) | 5203 | usize < SCHED_ATTR_SIZE_VER0 || flags) |
5159 | return -EINVAL; | 5204 | return -EINVAL; |
5160 | 5205 | ||
5161 | rcu_read_lock(); | 5206 | rcu_read_lock(); |
@@ -5168,25 +5213,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
5168 | if (retval) | 5213 | if (retval) |
5169 | goto out_unlock; | 5214 | goto out_unlock; |
5170 | 5215 | ||
5171 | attr.sched_policy = p->policy; | 5216 | kattr.sched_policy = p->policy; |
5172 | if (p->sched_reset_on_fork) | 5217 | if (p->sched_reset_on_fork) |
5173 | attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; | 5218 | kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; |
5174 | if (task_has_dl_policy(p)) | 5219 | if (task_has_dl_policy(p)) |
5175 | __getparam_dl(p, &attr); | 5220 | __getparam_dl(p, &kattr); |
5176 | else if (task_has_rt_policy(p)) | 5221 | else if (task_has_rt_policy(p)) |
5177 | attr.sched_priority = p->rt_priority; | 5222 | kattr.sched_priority = p->rt_priority; |
5178 | else | 5223 | else |
5179 | attr.sched_nice = task_nice(p); | 5224 | kattr.sched_nice = task_nice(p); |
5180 | 5225 | ||
5181 | #ifdef CONFIG_UCLAMP_TASK | 5226 | #ifdef CONFIG_UCLAMP_TASK |
5182 | attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; | 5227 | kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; |
5183 | attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; | 5228 | kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; |
5184 | #endif | 5229 | #endif |
5185 | 5230 | ||
5186 | rcu_read_unlock(); | 5231 | rcu_read_unlock(); |
5187 | 5232 | ||
5188 | retval = sched_read_attr(uattr, &attr, size); | 5233 | return sched_attr_copy_to_user(uattr, &kattr, usize); |
5189 | return retval; | ||
5190 | 5234 | ||
5191 | out_unlock: | 5235 | out_unlock: |
5192 | rcu_read_unlock(); | 5236 | rcu_read_unlock(); |
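The size-compatibility rules in the new sched_attr_copy_to_user() are easiest to see from user-space. Below is a minimal sketch, assuming an x86_64 Linux system where sched_getattr has no glibc wrapper; struct sched_attr_v0 is a hand-rolled copy of the 48-byte VER0 uapi layout, not a header the kernel provides under that name.

/* Hedged sketch: pass a VER0-sized buffer to sched_getattr(). With the
 * change above the kernel copies min(usize, ksize) bytes and stores that
 * count in attr.size, instead of failing with -EFBIG when it knows about
 * more attributes (e.g. the uclamp fields) than this old layout carries.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr_v0 {          /* SCHED_ATTR_SIZE_VER0 == 48 bytes */
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
};

int main(void)
{
    struct sched_attr_v0 attr;

    memset(&attr, 0, sizeof(attr));
    /* pid 0 == calling thread; usize == sizeof(attr); flags must be 0 */
    if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) != 0) {
        perror("sched_getattr");
        return 1;
    }
    printf("policy=%u nice=%d bytes filled in=%u\n",
           attr.sched_policy, attr.sched_nice, attr.size);
    return 0;
}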
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bc9cfeaac8bd..500f5db0de0b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -4470,6 +4470,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) | |||
4470 | if (likely(cfs_rq->runtime_remaining > 0)) | 4470 | if (likely(cfs_rq->runtime_remaining > 0)) |
4471 | return; | 4471 | return; |
4472 | 4472 | ||
4473 | if (cfs_rq->throttled) | ||
4474 | return; | ||
4473 | /* | 4475 | /* |
4474 | * if we're unable to extend our runtime we resched so that the active | 4476 | * if we're unable to extend our runtime we resched so that the active |
4475 | * hierarchy can be throttled | 4477 | * hierarchy can be throttled |
@@ -4673,6 +4675,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | |||
4673 | if (!cfs_rq_throttled(cfs_rq)) | 4675 | if (!cfs_rq_throttled(cfs_rq)) |
4674 | goto next; | 4676 | goto next; |
4675 | 4677 | ||
4678 | /* By the above check, this should never be true */ | ||
4679 | SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); | ||
4680 | |||
4676 | runtime = -cfs_rq->runtime_remaining + 1; | 4681 | runtime = -cfs_rq->runtime_remaining + 1; |
4677 | if (runtime > remaining) | 4682 | if (runtime > remaining) |
4678 | runtime = remaining; | 4683 | runtime = remaining; |
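The top-up arithmetic in distribute_cfs_runtime() is compact but easy to misread: each throttled cfs_rq receives just enough to push runtime_remaining to +1, capped by what is left in the global pool. A stand-alone sketch of that clamping, with illustrative (non-kernel) names:

/* Sketch of the per-cfs_rq refill: throttled queues have
 * runtime_remaining <= 0 (the SCHED_WARN_ON above enforces this), so the
 * amount needed to reach +1 is -runtime_remaining + 1, capped by the pool.
 */
#include <stdio.h>

static long long distribute_one(long long *pool, long long *rq_remaining)
{
    long long want = -*rq_remaining + 1;    /* amount to reach +1 */

    if (want > *pool)
        want = *pool;                       /* cap by the global pool */

    *pool -= want;
    *rq_remaining += want;
    return want;
}

int main(void)
{
    long long pool = 5000, rq = -3000;

    distribute_one(&pool, &rq);
    printf("pool=%lld rq_remaining=%lld\n", pool, rq);  /* 1999, 1 */
    return 0;
}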
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80940939b733..e4bc4aa739b8 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -241,13 +241,14 @@ static void do_idle(void) | |||
241 | check_pgt_cache(); | 241 | check_pgt_cache(); |
242 | rmb(); | 242 | rmb(); |
243 | 243 | ||
244 | local_irq_disable(); | ||
245 | |||
244 | if (cpu_is_offline(cpu)) { | 246 | if (cpu_is_offline(cpu)) { |
245 | tick_nohz_idle_stop_tick_protected(); | 247 | tick_nohz_idle_stop_tick(); |
246 | cpuhp_report_idle_dead(); | 248 | cpuhp_report_idle_dead(); |
247 | arch_cpu_idle_dead(); | 249 | arch_cpu_idle_dead(); |
248 | } | 250 | } |
249 | 251 | ||
250 | local_irq_disable(); | ||
251 | arch_cpu_idle_enter(); | 252 | arch_cpu_idle_enter(); |
252 | 253 | ||
253 | /* | 254 | /* |
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 23fbbcc414d5..6e52b67b420e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c | |||
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref) | |||
1131 | * deadlock while waiting for psi_poll_work to acquire trigger_lock | 1131 | * deadlock while waiting for psi_poll_work to acquire trigger_lock |
1132 | */ | 1132 | */ |
1133 | if (kworker_to_destroy) { | 1133 | if (kworker_to_destroy) { |
1134 | /* | ||
1135 | * After the RCU grace period has expired, the worker | ||
1136 | * can no longer be found through group->poll_kworker. | ||
1137 | * But it might have been already scheduled before | ||
1138 | * that - deschedule it cleanly before destroying it. | ||
1139 | */ | ||
1134 | kthread_cancel_delayed_work_sync(&group->poll_work); | 1140 | kthread_cancel_delayed_work_sync(&group->poll_work); |
1141 | atomic_set(&group->poll_scheduled, 0); | ||
1142 | |||
1135 | kthread_destroy_worker(kworker_to_destroy); | 1143 | kthread_destroy_worker(kworker_to_destroy); |
1136 | } | 1144 | } |
1137 | kfree(t); | 1145 | kfree(t); |
diff --git a/kernel/signal.c b/kernel/signal.c index e667be6907d7..c4da1ef56fdf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) | |||
90 | handler == SIG_DFL && !(force && sig_kernel_only(sig))) | 90 | handler == SIG_DFL && !(force && sig_kernel_only(sig))) |
91 | return true; | 91 | return true; |
92 | 92 | ||
93 | /* Only allow kernel generated signals to this kthread */ | ||
94 | if (unlikely((t->flags & PF_KTHREAD) && | ||
95 | (handler == SIG_KTHREAD_KERNEL) && !force)) | ||
96 | return true; | ||
97 | |||
93 | return sig_handler_ignored(handler, sig); | 98 | return sig_handler_ignored(handler, sig); |
94 | } | 99 | } |
95 | 100 | ||
@@ -3673,8 +3678,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) | |||
3673 | 3678 | ||
3674 | static struct pid *pidfd_to_pid(const struct file *file) | 3679 | static struct pid *pidfd_to_pid(const struct file *file) |
3675 | { | 3680 | { |
3676 | if (file->f_op == &pidfd_fops) | 3681 | struct pid *pid; |
3677 | return file->private_data; | 3682 | |
3683 | pid = pidfd_pid(file); | ||
3684 | if (!IS_ERR(pid)) | ||
3685 | return pid; | ||
3678 | 3686 | ||
3679 | return tgid_pidfd_to_pid(file); | 3687 | return tgid_pidfd_to_pid(file); |
3680 | } | 3688 | } |
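pidfd_to_pid() now resolves through pidfd_pid(), which is the kernel-side path behind pidfd_send_signal(). A small user-space sketch of that path follows; the syscall numbers are the x86_64 ones and are only guarded fallbacks for older headers, so treat them as an assumption rather than portable values.

/* Hedged sketch: obtain a pidfd with pidfd_open() and use
 * pidfd_send_signal() with signal 0 as an existence/permission check
 * (analogous to kill(pid, 0)), so running the demo has no side effects.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434         /* x86_64; assumption for old headers */
#endif
#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal 424  /* x86_64; assumption for old headers */
#endif

int main(int argc, char **argv)
{
    pid_t target = argc > 1 ? (pid_t)atoi(argv[1]) : getpid();
    int pidfd = syscall(__NR_pidfd_open, target, 0);

    if (pidfd < 0) {
        perror("pidfd_open");
        return 1;
    }
    /* siginfo == NULL lets the kernel build a default one */
    if (syscall(__NR_pidfd_send_signal, pidfd, 0, NULL, 0) < 0) {
        perror("pidfd_send_signal");
        return 1;
    }
    printf("pid %d is signallable via its pidfd\n", (int)target);
    close(pidfd);
    return 0;
}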
diff --git a/kernel/sys.c b/kernel/sys.c index 2969304c29fe..ec48396b4943 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -124,6 +124,12 @@ | |||
124 | #ifndef PAC_RESET_KEYS | 124 | #ifndef PAC_RESET_KEYS |
125 | # define PAC_RESET_KEYS(a, b) (-EINVAL) | 125 | # define PAC_RESET_KEYS(a, b) (-EINVAL) |
126 | #endif | 126 | #endif |
127 | #ifndef SET_TAGGED_ADDR_CTRL | ||
128 | # define SET_TAGGED_ADDR_CTRL(a) (-EINVAL) | ||
129 | #endif | ||
130 | #ifndef GET_TAGGED_ADDR_CTRL | ||
131 | # define GET_TAGGED_ADDR_CTRL() (-EINVAL) | ||
132 | #endif | ||
127 | 133 | ||
128 | /* | 134 | /* |
129 | * this is where the system-wide overflow UID and GID are defined, for | 135 | * this is where the system-wide overflow UID and GID are defined, for |
@@ -2492,6 +2498,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
2492 | return -EINVAL; | 2498 | return -EINVAL; |
2493 | error = PAC_RESET_KEYS(me, arg2); | 2499 | error = PAC_RESET_KEYS(me, arg2); |
2494 | break; | 2500 | break; |
2501 | case PR_SET_TAGGED_ADDR_CTRL: | ||
2502 | if (arg3 || arg4 || arg5) | ||
2503 | return -EINVAL; | ||
2504 | error = SET_TAGGED_ADDR_CTRL(arg2); | ||
2505 | break; | ||
2506 | case PR_GET_TAGGED_ADDR_CTRL: | ||
2507 | if (arg2 || arg3 || arg4 || arg5) | ||
2508 | return -EINVAL; | ||
2509 | error = GET_TAGGED_ADDR_CTRL(); | ||
2510 | break; | ||
2495 | default: | 2511 | default: |
2496 | error = -EINVAL; | 2512 | error = -EINVAL; |
2497 | break; | 2513 | break; |
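From user-space the new prctl pair looks like the sketch below. The fallback #defines mirror the kernel's own #ifndef stubs above and match the uapi values; on architectures that do not wire up SET/GET_TAGGED_ADDR_CTRL the calls simply fail with EINVAL.

/* Hedged sketch: enable the tagged-address ABI (arm64 top-byte-ignore)
 * for this task and read the setting back.
 */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL 55
#define PR_GET_TAGGED_ADDR_CTRL 56
#define PR_TAGGED_ADDR_ENABLE   (1UL << 0)
#endif

int main(void)
{
    /* unused args must be zero or the kernel returns -EINVAL */
    if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0))
        perror("PR_SET_TAGGED_ADDR_CTRL");  /* EINVAL if unsupported */

    int ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
    if (ctrl < 0)
        perror("PR_GET_TAGGED_ADDR_CTRL");
    else
        printf("tagged address ABI %s\n",
               (ctrl & PR_TAGGED_ADDR_ENABLE) ? "enabled" : "disabled");
    return 0;
}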
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d911c8470149..ca69290bee2a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) | |||
146 | static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) | 146 | static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) |
147 | { | 147 | { |
148 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 148 | tk->offs_boot = ktime_add(tk->offs_boot, delta); |
149 | /* | ||
150 | * Timespec representation for VDSO update to avoid 64bit division | ||
151 | * on every update. | ||
152 | */ | ||
153 | tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); | ||
149 | } | 154 | } |
150 | 155 | ||
151 | /* | 156 | /* |
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 8cf3596a4ce6..4bc37ac3bb05 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c | |||
@@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, | |||
17 | struct timekeeper *tk) | 17 | struct timekeeper *tk) |
18 | { | 18 | { |
19 | struct vdso_timestamp *vdso_ts; | 19 | struct vdso_timestamp *vdso_ts; |
20 | u64 nsec; | 20 | u64 nsec, sec; |
21 | 21 | ||
22 | vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; | 22 | vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; |
23 | vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; | 23 | vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; |
@@ -45,23 +45,27 @@ static inline void update_vdso_data(struct vdso_data *vdata, | |||
45 | } | 45 | } |
46 | vdso_ts->nsec = nsec; | 46 | vdso_ts->nsec = nsec; |
47 | 47 | ||
48 | /* CLOCK_MONOTONIC_RAW */ | 48 | /* Copy MONOTONIC time for BOOTTIME */ |
49 | vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; | 49 | sec = vdso_ts->sec; |
50 | vdso_ts->sec = tk->raw_sec; | 50 | /* Add the boot offset */ |
51 | vdso_ts->nsec = tk->tkr_raw.xtime_nsec; | 51 | sec += tk->monotonic_to_boot.tv_sec; |
52 | nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; | ||
52 | 53 | ||
53 | /* CLOCK_BOOTTIME */ | 54 | /* CLOCK_BOOTTIME */ |
54 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; | 55 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; |
55 | vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; | 56 | vdso_ts->sec = sec; |
56 | nsec = tk->tkr_mono.xtime_nsec; | 57 | |
57 | nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + | ||
58 | ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); | ||
59 | while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { | 58 | while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { |
60 | nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); | 59 | nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); |
61 | vdso_ts->sec++; | 60 | vdso_ts->sec++; |
62 | } | 61 | } |
63 | vdso_ts->nsec = nsec; | 62 | vdso_ts->nsec = nsec; |
64 | 63 | ||
64 | /* CLOCK_MONOTONIC_RAW */ | ||
65 | vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; | ||
66 | vdso_ts->sec = tk->raw_sec; | ||
67 | vdso_ts->nsec = tk->tkr_raw.xtime_nsec; | ||
68 | |||
65 | /* CLOCK_TAI */ | 69 | /* CLOCK_TAI */ |
66 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; | 70 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; |
67 | vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; | 71 | vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; |
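The BOOTTIME rework keeps nanoseconds in the clocksource's shifted representation (hence the cached monotonic_to_boot timespec in timekeeping.c) and avoids a 64-bit division by carrying whole seconds out in a loop. A stand-alone sketch of that normalization with made-up values:

/* Sketch of the shifted-nanosecond carry used for CLOCK_BOOTTIME above:
 * nsec is stored as (ns << shift); whole seconds are peeled off one at a
 * time instead of dividing. Shift and input values are illustrative.
 */
#include <inttypes.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
    unsigned int shift = 8;                     /* clocksource shift */
    uint64_t sec = 100;                         /* monotonic + boot-offset seconds */
    uint64_t nsec = 2500000000ULL << shift;     /* 2.5 s worth, shifted */

    while (nsec >= (NSEC_PER_SEC << shift)) {
        nsec -= NSEC_PER_SEC << shift;
        sec++;
    }
    printf("sec=%" PRIu64 " nsec(shifted)=%" PRIu64 "\n", sec, nsec);
    /* -> sec=102, nsec == 0.5 s << shift */
    return 0;
}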
diff --git a/kernel/torture.c b/kernel/torture.c index a8d9bdfba7c3..7c13f5558b71 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
@@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void) | |||
263 | onoff_task = NULL; | 263 | onoff_task = NULL; |
264 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 264 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
265 | } | 265 | } |
266 | EXPORT_SYMBOL_GPL(torture_onoff_cleanup); | ||
267 | 266 | ||
268 | /* | 267 | /* |
269 | * Print online/offline testing statistics. | 268 | * Print online/offline testing statistics. |
@@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void) | |||
449 | } | 448 | } |
450 | shuffler_task = NULL; | 449 | shuffler_task = NULL; |
451 | } | 450 | } |
452 | EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); | ||
453 | 451 | ||
454 | /* | 452 | /* |
455 | * Variables for auto-shutdown. This allows "lights out" torture runs | 453 | * Variables for auto-shutdown. This allows "lights out" torture runs |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca34503f178..f9821a3374e9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -3095,6 +3095,14 @@ t_probe_next(struct seq_file *m, loff_t *pos) | |||
3095 | hnd = &iter->probe_entry->hlist; | 3095 | hnd = &iter->probe_entry->hlist; |
3096 | 3096 | ||
3097 | hash = iter->probe->ops.func_hash->filter_hash; | 3097 | hash = iter->probe->ops.func_hash->filter_hash; |
3098 | |||
3099 | /* | ||
3100 | * A probe being registered may temporarily have an empty hash | ||
3101 | * and it's at the end of the func_probes list. | ||
3102 | */ | ||
3103 | if (!hash || hash == EMPTY_HASH) | ||
3104 | return NULL; | ||
3105 | |||
3098 | size = 1 << hash->size_bits; | 3106 | size = 1 << hash->size_bits; |
3099 | 3107 | ||
3100 | retry: | 3108 | retry: |
@@ -4320,12 +4328,21 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr, | |||
4320 | 4328 | ||
4321 | mutex_unlock(&ftrace_lock); | 4329 | mutex_unlock(&ftrace_lock); |
4322 | 4330 | ||
4331 | /* | ||
4332 | * Note, there's a small window here that the func_hash->filter_hash | ||
4333 | * may be NULL or empty. Need to be careful when reading the loop. | ||
4334 | */ | ||
4323 | mutex_lock(&probe->ops.func_hash->regex_lock); | 4335 | mutex_lock(&probe->ops.func_hash->regex_lock); |
4324 | 4336 | ||
4325 | orig_hash = &probe->ops.func_hash->filter_hash; | 4337 | orig_hash = &probe->ops.func_hash->filter_hash; |
4326 | old_hash = *orig_hash; | 4338 | old_hash = *orig_hash; |
4327 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); | 4339 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); |
4328 | 4340 | ||
4341 | if (!hash) { | ||
4342 | ret = -ENOMEM; | ||
4343 | goto out; | ||
4344 | } | ||
4345 | |||
4329 | ret = ftrace_match_records(hash, glob, strlen(glob)); | 4346 | ret = ftrace_match_records(hash, glob, strlen(glob)); |
4330 | 4347 | ||
4331 | /* Nothing found? */ | 4348 | /* Nothing found? */ |
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0515a2096f90..0456e0a3dab1 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h | |||
@@ -6,22 +6,22 @@ | |||
6 | 6 | ||
7 | /* | 7 | /* |
8 | * Traverse the ftrace_global_list, invoking all entries. The reason that we | 8 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
9 | * can use rcu_dereference_raw_notrace() is that elements removed from this list | 9 | * can use rcu_dereference_raw_check() is that elements removed from this list |
10 | * are simply leaked, so there is no need to interact with a grace-period | 10 | * are simply leaked, so there is no need to interact with a grace-period |
11 | * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle | 11 | * mechanism. The rcu_dereference_raw_check() calls are needed to handle |
12 | * concurrent insertions into the ftrace_global_list. | 12 | * concurrent insertions into the ftrace_global_list. |
13 | * | 13 | * |
14 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 14 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
15 | */ | 15 | */ |
16 | #define do_for_each_ftrace_op(op, list) \ | 16 | #define do_for_each_ftrace_op(op, list) \ |
17 | op = rcu_dereference_raw_notrace(list); \ | 17 | op = rcu_dereference_raw_check(list); \ |
18 | do | 18 | do |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * Optimized for just a single item in the list (as that is the normal case). | 21 | * Optimized for just a single item in the list (as that is the normal case). |
22 | */ | 22 | */ |
23 | #define while_for_each_ftrace_op(op) \ | 23 | #define while_for_each_ftrace_op(op) \ |
24 | while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ | 24 | while (likely(op = rcu_dereference_raw_check((op)->next)) && \ |
25 | unlikely((op) != &ftrace_list_end)) | 25 | unlikely((op) != &ftrace_list_end)) |
26 | 26 | ||
27 | extern struct ftrace_ops __rcu *ftrace_ops_list; | 27 | extern struct ftrace_ops __rcu *ftrace_ops_list; |
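The two macros form an open/close pair around a plain do { } while loop: the first opens the loop on the list head, the second supplies the while() condition that advances to the next element. A user-space analogue of the same idiom is sketched below; the list type and sentinel are made up, and the plain pointer reads stand in for rcu_dereference_raw_check().

/* Sketch of the paired-macro loop idiom used by do_for_each_ftrace_op()
 * and while_for_each_ftrace_op(); callers write
 *     do_for_each_op(op, list) { ... } while_for_each_op(op);
 */
#include <stdio.h>

struct op {
    const char *name;
    struct op *next;
};

static struct op op_end = { "end", NULL };      /* list terminator sentinel */
static struct op op_b   = { "b", &op_end };
static struct op op_a   = { "a", &op_b };
static struct op *op_list = &op_a;

#define do_for_each_op(op, list)    \
    op = (list);                    \
    do

#define while_for_each_op(op)       \
    while ((op = (op)->next) && (op) != &op_end)

int main(void)
{
    struct op *op;

    do_for_each_op(op, op_list) {
        printf("%s\n", op->name);   /* prints "a" then "b" */
    } while_for_each_op(op);

    return 0;
}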
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 525a97fbbc60..947ba433865f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -1567,9 +1567,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, | |||
1567 | 1567 | ||
1568 | /** | 1568 | /** |
1569 | * update_max_tr_single - only copy one trace over, and reset the rest | 1569 | * update_max_tr_single - only copy one trace over, and reset the rest |
1570 | * @tr - tracer | 1570 | * @tr: tracer |
1571 | * @tsk - task with the latency | 1571 | * @tsk: task with the latency |
1572 | * @cpu - the cpu of the buffer to copy. | 1572 | * @cpu: the cpu of the buffer to copy. |
1573 | * | 1573 | * |
1574 | * Flip the trace of a single CPU buffer between the @tr and the max_tr. | 1574 | * Flip the trace of a single CPU buffer between the @tr and the max_tr. |
1575 | */ | 1575 | */ |
@@ -1767,7 +1767,7 @@ static void __init apply_trace_boot_options(void); | |||
1767 | 1767 | ||
1768 | /** | 1768 | /** |
1769 | * register_tracer - register a tracer with the ftrace system. | 1769 | * register_tracer - register a tracer with the ftrace system. |
1770 | * @type - the plugin for the tracer | 1770 | * @type: the plugin for the tracer |
1771 | * | 1771 | * |
1772 | * Register a new plugin tracer. | 1772 | * Register a new plugin tracer. |
1773 | */ | 1773 | */ |
@@ -2230,9 +2230,9 @@ static bool tracing_record_taskinfo_skip(int flags) | |||
2230 | /** | 2230 | /** |
2231 | * tracing_record_taskinfo - record the task info of a task | 2231 | * tracing_record_taskinfo - record the task info of a task |
2232 | * | 2232 | * |
2233 | * @task - task to record | 2233 | * @task: task to record |
2234 | * @flags - TRACE_RECORD_CMDLINE for recording comm | 2234 | * @flags: TRACE_RECORD_CMDLINE for recording comm |
2235 | * - TRACE_RECORD_TGID for recording tgid | 2235 | * TRACE_RECORD_TGID for recording tgid |
2236 | */ | 2236 | */ |
2237 | void tracing_record_taskinfo(struct task_struct *task, int flags) | 2237 | void tracing_record_taskinfo(struct task_struct *task, int flags) |
2238 | { | 2238 | { |
@@ -2258,10 +2258,10 @@ void tracing_record_taskinfo(struct task_struct *task, int flags) | |||
2258 | /** | 2258 | /** |
2259 | * tracing_record_taskinfo_sched_switch - record task info for sched_switch | 2259 | * tracing_record_taskinfo_sched_switch - record task info for sched_switch |
2260 | * | 2260 | * |
2261 | * @prev - previous task during sched_switch | 2261 | * @prev: previous task during sched_switch |
2262 | * @next - next task during sched_switch | 2262 | * @next: next task during sched_switch |
2263 | * @flags - TRACE_RECORD_CMDLINE for recording comm | 2263 | * @flags: TRACE_RECORD_CMDLINE for recording comm |
2264 | * TRACE_RECORD_TGID for recording tgid | 2264 | * TRACE_RECORD_TGID for recording tgid |
2265 | */ | 2265 | */ |
2266 | void tracing_record_taskinfo_sched_switch(struct task_struct *prev, | 2266 | void tracing_record_taskinfo_sched_switch(struct task_struct *prev, |
2267 | struct task_struct *next, int flags) | 2267 | struct task_struct *next, int flags) |
@@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event) | |||
2642 | 2642 | ||
2643 | preempt_disable_notrace(); | 2643 | preempt_disable_notrace(); |
2644 | 2644 | ||
2645 | export = rcu_dereference_raw_notrace(ftrace_exports_list); | 2645 | export = rcu_dereference_raw_check(ftrace_exports_list); |
2646 | while (export) { | 2646 | while (export) { |
2647 | trace_process_export(export, event); | 2647 | trace_process_export(export, event); |
2648 | export = rcu_dereference_raw_notrace(export->next); | 2648 | export = rcu_dereference_raw_check(export->next); |
2649 | } | 2649 | } |
2650 | 2650 | ||
2651 | preempt_enable_notrace(); | 2651 | preempt_enable_notrace(); |
@@ -3072,7 +3072,9 @@ static void trace_printk_start_stop_comm(int enabled) | |||
3072 | 3072 | ||
3073 | /** | 3073 | /** |
3074 | * trace_vbprintk - write binary msg to tracing buffer | 3074 | * trace_vbprintk - write binary msg to tracing buffer |
3075 | * | 3075 | * @ip: The address of the caller |
3076 | * @fmt: The string format to write to the buffer | ||
3077 | * @args: Arguments for @fmt | ||
3076 | */ | 3078 | */ |
3077 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) | 3079 | int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) |
3078 | { | 3080 | { |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c7506bc81b75..648930823b57 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, | |||
787 | return ret; | 787 | return ret; |
788 | } | 788 | } |
789 | 789 | ||
790 | static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) | 790 | int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) |
791 | { | 791 | { |
792 | char *event = NULL, *sub = NULL, *match; | 792 | char *event = NULL, *sub = NULL, *match; |
793 | int ret; | 793 | int ret; |
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index dbef0d135075..fb6bfbc5bf86 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
@@ -895,7 +895,8 @@ void trace_probe_cleanup(struct trace_probe *tp) | |||
895 | for (i = 0; i < tp->nr_args; i++) | 895 | for (i = 0; i < tp->nr_args; i++) |
896 | traceprobe_free_probe_arg(&tp->args[i]); | 896 | traceprobe_free_probe_arg(&tp->args[i]); |
897 | 897 | ||
898 | kfree(call->class->system); | 898 | if (call->class) |
899 | kfree(call->class->system); | ||
899 | kfree(call->name); | 900 | kfree(call->name); |
900 | kfree(call->print_fmt); | 901 | kfree(call->print_fmt); |
901 | } | 902 | } |