summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/bpf/core.c8
-rw-r--r--kernel/bpf/syscall.c30
-rw-r--r--kernel/bpf/verifier.c32
-rw-r--r--kernel/cgroup/cgroup.c10
-rw-r--r--kernel/dma/contiguous.c8
-rw-r--r--kernel/dma/direct.c12
-rw-r--r--kernel/dma/swiotlb.c34
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/exit.c38
-rw-r--r--kernel/fork.c20
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/proc.c14
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/jump_label.c4
-rw-r--r--kernel/kallsyms.c6
-rw-r--r--kernel/kexec_elf.c430
-rw-r--r--kernel/kprobes.c8
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/rcu/Kconfig.debug11
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcu_segcblist.c174
-rw-r--r--kernel/rcu/rcu_segcblist.h54
-rw-r--r--kernel/rcu/rcuperf.c10
-rw-r--r--kernel/rcu/rcutorture.c30
-rw-r--r--kernel/rcu/srcutree.c5
-rw-r--r--kernel/rcu/tree.c205
-rw-r--r--kernel/rcu/tree.h81
-rw-r--r--kernel/rcu/tree_exp.h8
-rw-r--r--kernel/rcu/tree_plugin.h1195
-rw-r--r--kernel/rcu/tree_stall.h9
-rw-r--r--kernel/rcu/update.c105
-rw-r--r--kernel/sched/core.c140
-rw-r--r--kernel/sched/fair.c5
-rw-r--r--kernel/sched/idle.c5
-rw-r--r--kernel/sched/psi.c8
-rw-r--r--kernel/signal.c12
-rw-r--r--kernel/sys.c16
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/vsyscall.c22
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/ftrace.c17
-rw-r--r--kernel/trace/ftrace_internal.h8
-rw-r--r--kernel/trace/trace.c30
-rw-r--r--kernel/trace/trace_events.c2
-rw-r--r--kernel/trace/trace_probe.c3
47 files changed, 1998 insertions, 847 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ef0d95a190b4..48c5376d290a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o
64obj-$(CONFIG_KEXEC_CORE) += kexec_core.o 64obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
65obj-$(CONFIG_KEXEC) += kexec.o 65obj-$(CONFIG_KEXEC) += kexec.o
66obj-$(CONFIG_KEXEC_FILE) += kexec_file.o 66obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
67obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
67obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 68obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
68obj-$(CONFIG_COMPAT) += compat.o 69obj-$(CONFIG_COMPAT) += compat.o
69obj-$(CONFIG_CGROUPS) += cgroup/ 70obj-$(CONFIG_CGROUPS) += cgroup/
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..66088a9e9b9e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
890 890
891static int bpf_jit_blind_insn(const struct bpf_insn *from, 891static int bpf_jit_blind_insn(const struct bpf_insn *from,
892 const struct bpf_insn *aux, 892 const struct bpf_insn *aux,
893 struct bpf_insn *to_buff) 893 struct bpf_insn *to_buff,
894 bool emit_zext)
894{ 895{
895 struct bpf_insn *to = to_buff; 896 struct bpf_insn *to = to_buff;
896 u32 imm_rnd = get_random_int(); 897 u32 imm_rnd = get_random_int();
@@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
1005 case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */ 1006 case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
1006 *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); 1007 *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
1007 *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); 1008 *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
1009 if (emit_zext)
1010 *to++ = BPF_ZEXT_REG(BPF_REG_AX);
1008 *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); 1011 *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
1009 break; 1012 break;
1010 1013
@@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
1088 insn[1].code == 0) 1091 insn[1].code == 0)
1089 memcpy(aux, insn, sizeof(aux)); 1092 memcpy(aux, insn, sizeof(aux));
1090 1093
1091 rewritten = bpf_jit_blind_insn(insn, aux, insn_buff); 1094 rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
1095 clone->aux->verifier_zext);
1092 if (!rewritten) 1096 if (!rewritten)
1093 continue; 1097 continue;
1094 1098
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d141f16f6fa..272071e9112f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1707,20 +1707,26 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
1707 if (err) 1707 if (err)
1708 goto free_used_maps; 1708 goto free_used_maps;
1709 1709
1710 err = bpf_prog_new_fd(prog); 1710 /* Upon success of bpf_prog_alloc_id(), the BPF prog is
1711 if (err < 0) { 1711 * effectively publicly exposed. However, retrieving via
1712 /* failed to allocate fd. 1712 * bpf_prog_get_fd_by_id() will take another reference,
1713 * bpf_prog_put() is needed because the above 1713 * therefore it cannot be gone underneath us.
1714 * bpf_prog_alloc_id() has published the prog 1714 *
1715 * to the userspace and the userspace may 1715 * Only for the time /after/ successful bpf_prog_new_fd()
1716 * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID. 1716 * and before returning to userspace, we might just hold
1717 */ 1717 * one reference and any parallel close on that fd could
1718 bpf_prog_put(prog); 1718 * rip everything out. Hence, below notifications must
1719 return err; 1719 * happen before bpf_prog_new_fd().
1720 } 1720 *
1721 1721 * Also, any failure handling from this point onwards must
1722 * be using bpf_prog_put() given the program is exposed.
1723 */
1722 bpf_prog_kallsyms_add(prog); 1724 bpf_prog_kallsyms_add(prog);
1723 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); 1725 perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
1726
1727 err = bpf_prog_new_fd(prog);
1728 if (err < 0)
1729 bpf_prog_put(prog);
1724 return err; 1730 return err;
1725 1731
1726free_used_maps: 1732free_used_maps:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c84d83f86141..c36a719fee6d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -985,9 +985,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
985 reg->smax_value = S64_MAX; 985 reg->smax_value = S64_MAX;
986 reg->umin_value = 0; 986 reg->umin_value = 0;
987 reg->umax_value = U64_MAX; 987 reg->umax_value = U64_MAX;
988
989 /* constant backtracking is enabled for root only for now */
990 reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
991} 988}
992 989
993/* Mark a register as having a completely unknown (scalar) value. */ 990/* Mark a register as having a completely unknown (scalar) value. */
@@ -1014,7 +1011,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
1014 __mark_reg_not_init(regs + regno); 1011 __mark_reg_not_init(regs + regno);
1015 return; 1012 return;
1016 } 1013 }
1017 __mark_reg_unknown(regs + regno); 1014 regs += regno;
1015 __mark_reg_unknown(regs);
1016 /* constant backtracking is enabled for root without bpf2bpf calls */
1017 regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
1018 true : false;
1018} 1019}
1019 1020
1020static void __mark_reg_not_init(struct bpf_reg_state *reg) 1021static void __mark_reg_not_init(struct bpf_reg_state *reg)
@@ -1771,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
1771 bitmap_from_u64(mask, stack_mask); 1772 bitmap_from_u64(mask, stack_mask);
1772 for_each_set_bit(i, mask, 64) { 1773 for_each_set_bit(i, mask, 64) {
1773 if (i >= func->allocated_stack / BPF_REG_SIZE) { 1774 if (i >= func->allocated_stack / BPF_REG_SIZE) {
1774 /* This can happen if backtracking 1775 /* the sequence of instructions:
1775 * is propagating stack precision where 1776 * 2: (bf) r3 = r10
1776 * caller has larger stack frame 1777 * 3: (7b) *(u64 *)(r3 -8) = r0
1777 * than callee, but backtrack_insn() should 1778 * 4: (79) r4 = *(u64 *)(r10 -8)
1778 * have returned -ENOTSUPP. 1779 * doesn't contain jmps. It's backtracked
1780 * as a single block.
1781 * During backtracking insn 3 is not recognized as
1782 * stack access, so at the end of backtracking
1783 * stack slot fp-8 is still marked in stack_mask.
1784 * However the parent state may not have accessed
1785 * fp-8 and it's "unallocated" stack space.
1786 * In such case fallback to conservative.
1779 */ 1787 */
1780 verbose(env, "BUG spi %d stack_size %d\n", 1788 mark_all_scalars_precise(env, st);
1781 i, func->allocated_stack); 1789 return 0;
1782 WARN_ONCE(1, "verifier backtracking bug");
1783 return -EFAULT;
1784 } 1790 }
1785 1791
1786 if (func->stack[i].slot_type[0] != STACK_SPILL) { 1792 if (func->stack[i].slot_type[0] != STACK_SPILL) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 753afbca549f..8be1da1ebd9a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5255,8 +5255,16 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
5255 * if the parent has to be frozen, the child has too. 5255 * if the parent has to be frozen, the child has too.
5256 */ 5256 */
5257 cgrp->freezer.e_freeze = parent->freezer.e_freeze; 5257 cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5258 if (cgrp->freezer.e_freeze) 5258 if (cgrp->freezer.e_freeze) {
5259 /*
5260 * Set the CGRP_FREEZE flag, so when a process will be
5261 * attached to the child cgroup, it will become frozen.
5262 * At this point the new cgroup is unpopulated, so we can
5263 * consider it frozen immediately.
5264 */
5265 set_bit(CGRP_FREEZE, &cgrp->flags);
5259 set_bit(CGRP_FROZEN, &cgrp->flags); 5266 set_bit(CGRP_FROZEN, &cgrp->flags);
5267 }
5260 5268
5261 spin_lock_irq(&css_set_lock); 5269 spin_lock_irq(&css_set_lock);
5262 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { 5270 for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 2bd410f934b3..69cfb4345388 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -230,9 +230,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
230 */ 230 */
231struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) 231struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
232{ 232{
233 int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; 233 size_t count = size >> PAGE_SHIFT;
234 size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT;
235 size_t align = get_order(PAGE_ALIGN(size));
236 struct page *page = NULL; 234 struct page *page = NULL;
237 struct cma *cma = NULL; 235 struct cma *cma = NULL;
238 236
@@ -243,14 +241,12 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
243 241
244 /* CMA can be used only in the context which permits sleeping */ 242 /* CMA can be used only in the context which permits sleeping */
245 if (cma && gfpflags_allow_blocking(gfp)) { 243 if (cma && gfpflags_allow_blocking(gfp)) {
244 size_t align = get_order(size);
246 size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); 245 size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
247 246
248 page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN); 247 page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
249 } 248 }
250 249
251 /* Fallback allocation of normal pages */
252 if (!page)
253 page = alloc_pages_node(node, gfp, align);
254 return page; 250 return page;
255} 251}
256 252
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 795c9b095d75..8402b29c280f 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -85,6 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
85struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, 85struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
86 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) 86 dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
87{ 87{
88 size_t alloc_size = PAGE_ALIGN(size);
89 int node = dev_to_node(dev);
88 struct page *page = NULL; 90 struct page *page = NULL;
89 u64 phys_mask; 91 u64 phys_mask;
90 92
@@ -95,8 +97,14 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
95 gfp &= ~__GFP_ZERO; 97 gfp &= ~__GFP_ZERO;
96 gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, 98 gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
97 &phys_mask); 99 &phys_mask);
100 page = dma_alloc_contiguous(dev, alloc_size, gfp);
101 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
102 dma_free_contiguous(dev, page, alloc_size);
103 page = NULL;
104 }
98again: 105again:
99 page = dma_alloc_contiguous(dev, size, gfp); 106 if (!page)
107 page = alloc_pages_node(node, gfp, get_order(alloc_size));
100 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { 108 if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
101 dma_free_contiguous(dev, page, size); 109 dma_free_contiguous(dev, page, size);
102 page = NULL; 110 page = NULL;
@@ -297,7 +305,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
297 dma_direct_sync_single_for_cpu(dev, addr, size, dir); 305 dma_direct_sync_single_for_cpu(dev, addr, size, dir);
298 306
299 if (unlikely(is_swiotlb_buffer(phys))) 307 if (unlikely(is_swiotlb_buffer(phys)))
300 swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); 308 swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
301} 309}
302EXPORT_SYMBOL(dma_direct_unmap_page); 310EXPORT_SYMBOL(dma_direct_unmap_page);
303 311
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 9de232229063..796a44f8ef5a 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -444,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
444 444
445phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, 445phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
446 dma_addr_t tbl_dma_addr, 446 dma_addr_t tbl_dma_addr,
447 phys_addr_t orig_addr, size_t size, 447 phys_addr_t orig_addr,
448 size_t mapping_size,
449 size_t alloc_size,
448 enum dma_data_direction dir, 450 enum dma_data_direction dir,
449 unsigned long attrs) 451 unsigned long attrs)
450{ 452{
@@ -464,6 +466,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
464 pr_warn_once("%s is active and system is using DMA bounce buffers\n", 466 pr_warn_once("%s is active and system is using DMA bounce buffers\n",
465 sme_active() ? "SME" : "SEV"); 467 sme_active() ? "SME" : "SEV");
466 468
469 if (mapping_size > alloc_size) {
470 dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
471 mapping_size, alloc_size);
472 return (phys_addr_t)DMA_MAPPING_ERROR;
473 }
474
467 mask = dma_get_seg_boundary(hwdev); 475 mask = dma_get_seg_boundary(hwdev);
468 476
469 tbl_dma_addr &= mask; 477 tbl_dma_addr &= mask;
@@ -471,8 +479,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
471 offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; 479 offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
472 480
473 /* 481 /*
474 * Carefully handle integer overflow which can occur when mask == ~0UL. 482 * Carefully handle integer overflow which can occur when mask == ~0UL.
475 */ 483 */
476 max_slots = mask + 1 484 max_slots = mask + 1
477 ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT 485 ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
478 : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); 486 : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
@@ -481,8 +489,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
481 * For mappings greater than or equal to a page, we limit the stride 489 * For mappings greater than or equal to a page, we limit the stride
482 * (and hence alignment) to a page size. 490 * (and hence alignment) to a page size.
483 */ 491 */
484 nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; 492 nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
485 if (size >= PAGE_SIZE) 493 if (alloc_size >= PAGE_SIZE)
486 stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); 494 stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
487 else 495 else
488 stride = 1; 496 stride = 1;
@@ -547,7 +555,7 @@ not_found:
547 spin_unlock_irqrestore(&io_tlb_lock, flags); 555 spin_unlock_irqrestore(&io_tlb_lock, flags);
548 if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) 556 if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
549 dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", 557 dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
550 size, io_tlb_nslabs, tmp_io_tlb_used); 558 alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
551 return (phys_addr_t)DMA_MAPPING_ERROR; 559 return (phys_addr_t)DMA_MAPPING_ERROR;
552found: 560found:
553 io_tlb_used += nslots; 561 io_tlb_used += nslots;
@@ -562,7 +570,7 @@ found:
562 io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); 570 io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
563 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && 571 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
564 (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) 572 (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
565 swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); 573 swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
566 574
567 return tlb_addr; 575 return tlb_addr;
568} 576}
@@ -571,11 +579,11 @@ found:
571 * tlb_addr is the physical address of the bounce buffer to unmap. 579 * tlb_addr is the physical address of the bounce buffer to unmap.
572 */ 580 */
573void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, 581void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
574 size_t size, enum dma_data_direction dir, 582 size_t mapping_size, size_t alloc_size,
575 unsigned long attrs) 583 enum dma_data_direction dir, unsigned long attrs)
576{ 584{
577 unsigned long flags; 585 unsigned long flags;
578 int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; 586 int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
579 int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; 587 int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
580 phys_addr_t orig_addr = io_tlb_orig_addr[index]; 588 phys_addr_t orig_addr = io_tlb_orig_addr[index];
581 589
@@ -585,7 +593,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
585 if (orig_addr != INVALID_PHYS_ADDR && 593 if (orig_addr != INVALID_PHYS_ADDR &&
586 !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && 594 !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
587 ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) 595 ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
588 swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); 596 swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
589 597
590 /* 598 /*
591 * Return the buffer to the free list by setting the corresponding 599 * Return the buffer to the free list by setting the corresponding
@@ -665,14 +673,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
665 673
666 /* Oh well, have to allocate and map a bounce buffer. */ 674 /* Oh well, have to allocate and map a bounce buffer. */
667 *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), 675 *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
668 *phys, size, dir, attrs); 676 *phys, size, size, dir, attrs);
669 if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) 677 if (*phys == (phys_addr_t)DMA_MAPPING_ERROR)
670 return false; 678 return false;
671 679
672 /* Ensure that the address returned is DMA'ble */ 680 /* Ensure that the address returned is DMA'ble */
673 *dma_addr = __phys_to_dma(dev, *phys); 681 *dma_addr = __phys_to_dma(dev, *phys);
674 if (unlikely(!dma_capable(dev, *dma_addr, size))) { 682 if (unlikely(!dma_capable(dev, *dma_addr, size))) {
675 swiotlb_tbl_unmap_single(dev, *phys, size, dir, 683 swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,
676 attrs | DMA_ATTR_SKIP_CPU_SYNC); 684 attrs | DMA_ATTR_SKIP_CPU_SYNC);
677 return false; 685 return false;
678 } 686 }
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c5cd852fe86b..3cc8416ec844 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp,
413 413
414int register_perf_hw_breakpoint(struct perf_event *bp) 414int register_perf_hw_breakpoint(struct perf_event *bp)
415{ 415{
416 struct arch_hw_breakpoint hw; 416 struct arch_hw_breakpoint hw = { };
417 int err; 417 int err;
418 418
419 err = reserve_bp_slot(bp); 419 err = reserve_bp_slot(bp);
@@ -461,7 +461,7 @@ int
461modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, 461modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
462 bool check) 462 bool check)
463{ 463{
464 struct arch_hw_breakpoint hw; 464 struct arch_hw_breakpoint hw = { };
465 int err; 465 int err;
466 466
467 err = hw_breakpoint_parse(bp, attr, &hw); 467 err = hw_breakpoint_parse(bp, attr, &hw);
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b4a5dcce8f8..22ab6a4bdc51 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1554,6 +1554,23 @@ end:
1554 return retval; 1554 return retval;
1555} 1555}
1556 1556
1557static struct pid *pidfd_get_pid(unsigned int fd)
1558{
1559 struct fd f;
1560 struct pid *pid;
1561
1562 f = fdget(fd);
1563 if (!f.file)
1564 return ERR_PTR(-EBADF);
1565
1566 pid = pidfd_pid(f.file);
1567 if (!IS_ERR(pid))
1568 get_pid(pid);
1569
1570 fdput(f);
1571 return pid;
1572}
1573
1557static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, 1574static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1558 int options, struct rusage *ru) 1575 int options, struct rusage *ru)
1559{ 1576{
@@ -1576,19 +1593,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
1576 type = PIDTYPE_PID; 1593 type = PIDTYPE_PID;
1577 if (upid <= 0) 1594 if (upid <= 0)
1578 return -EINVAL; 1595 return -EINVAL;
1596
1597 pid = find_get_pid(upid);
1579 break; 1598 break;
1580 case P_PGID: 1599 case P_PGID:
1581 type = PIDTYPE_PGID; 1600 type = PIDTYPE_PGID;
1582 if (upid <= 0) 1601 if (upid < 0)
1602 return -EINVAL;
1603
1604 if (upid)
1605 pid = find_get_pid(upid);
1606 else
1607 pid = get_task_pid(current, PIDTYPE_PGID);
1608 break;
1609 case P_PIDFD:
1610 type = PIDTYPE_PID;
1611 if (upid < 0)
1583 return -EINVAL; 1612 return -EINVAL;
1613
1614 pid = pidfd_get_pid(upid);
1615 if (IS_ERR(pid))
1616 return PTR_ERR(pid);
1584 break; 1617 break;
1585 default: 1618 default:
1586 return -EINVAL; 1619 return -EINVAL;
1587 } 1620 }
1588 1621
1589 if (type < PIDTYPE_MAX)
1590 pid = find_get_pid(upid);
1591
1592 wo.wo_type = type; 1622 wo.wo_type = type;
1593 wo.wo_pid = pid; 1623 wo.wo_pid = pid;
1594 wo.wo_flags = options; 1624 wo.wo_flags = options;
diff --git a/kernel/fork.c b/kernel/fork.c
index 2852d0e76ea3..1d1cd06edbc1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -768,6 +768,7 @@ static void set_max_threads(unsigned int max_threads_suggested)
768int arch_task_struct_size __read_mostly; 768int arch_task_struct_size __read_mostly;
769#endif 769#endif
770 770
771#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
771static void task_struct_whitelist(unsigned long *offset, unsigned long *size) 772static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
772{ 773{
773 /* Fetch thread_struct whitelist for the architecture. */ 774 /* Fetch thread_struct whitelist for the architecture. */
@@ -782,6 +783,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
782 else 783 else
783 *offset += offsetof(struct task_struct, thread); 784 *offset += offsetof(struct task_struct, thread);
784} 785}
786#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
785 787
786void __init fork_init(void) 788void __init fork_init(void)
787{ 789{
@@ -1690,6 +1692,14 @@ static inline void rcu_copy_process(struct task_struct *p)
1690#endif /* #ifdef CONFIG_TASKS_RCU */ 1692#endif /* #ifdef CONFIG_TASKS_RCU */
1691} 1693}
1692 1694
1695struct pid *pidfd_pid(const struct file *file)
1696{
1697 if (file->f_op == &pidfd_fops)
1698 return file->private_data;
1699
1700 return ERR_PTR(-EBADF);
1701}
1702
1693static int pidfd_release(struct inode *inode, struct file *file) 1703static int pidfd_release(struct inode *inode, struct file *file)
1694{ 1704{
1695 struct pid *pid = file->private_data; 1705 struct pid *pid = file->private_data;
@@ -2338,6 +2348,8 @@ struct mm_struct *copy_init_mm(void)
2338 * 2348 *
2339 * It copies the process, and if successful kick-starts 2349 * It copies the process, and if successful kick-starts
2340 * it and waits for it to finish using the VM if required. 2350 * it and waits for it to finish using the VM if required.
2351 *
2352 * args->exit_signal is expected to be checked for sanity by the caller.
2341 */ 2353 */
2342long _do_fork(struct kernel_clone_args *args) 2354long _do_fork(struct kernel_clone_args *args)
2343{ 2355{
@@ -2562,6 +2574,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
2562 if (copy_from_user(&args, uargs, size)) 2574 if (copy_from_user(&args, uargs, size))
2563 return -EFAULT; 2575 return -EFAULT;
2564 2576
2577 /*
2578 * Verify that higher 32bits of exit_signal are unset and that
2579 * it is a valid signal
2580 */
2581 if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
2582 !valid_signal(args.exit_signal)))
2583 return -EINVAL;
2584
2565 *kargs = (struct kernel_clone_args){ 2585 *kargs = (struct kernel_clone_args){
2566 .flags = args.flags, 2586 .flags = args.flags,
2567 .pidfd = u64_to_user_ptr(args.pidfd), 2587 .pidfd = u64_to_user_ptr(args.pidfd),
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9484e88dabc2..9be995fc3c5a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
295 } 295 }
296} 296}
297 297
298static void irq_sysfs_del(struct irq_desc *desc)
299{
300 /*
301 * If irq_sysfs_init() has not yet been invoked (early boot), then
302 * irq_kobj_base is NULL and the descriptor was never added.
303 * kobject_del() complains about a object with no parent, so make
304 * it conditional.
305 */
306 if (irq_kobj_base)
307 kobject_del(&desc->kobj);
308}
309
298static int __init irq_sysfs_init(void) 310static int __init irq_sysfs_init(void)
299{ 311{
300 struct irq_desc *desc; 312 struct irq_desc *desc;
@@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = {
325}; 337};
326 338
327static void irq_sysfs_add(int irq, struct irq_desc *desc) {} 339static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
340static void irq_sysfs_del(struct irq_desc *desc) {}
328 341
329#endif /* CONFIG_SYSFS */ 342#endif /* CONFIG_SYSFS */
330 343
@@ -438,7 +451,7 @@ static void free_desc(unsigned int irq)
438 * The sysfs entry must be serialized against a concurrent 451 * The sysfs entry must be serialized against a concurrent
439 * irq_sysfs_init() as well. 452 * irq_sysfs_init() as well.
440 */ 453 */
441 kobject_del(&desc->kobj); 454 irq_sysfs_del(desc);
442 delete_irq_desc(irq); 455 delete_irq_desc(irq);
443 456
444 /* 457 /*
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index da9addb8d655..cfc4f088a0e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
100 return 0; 100 return 0;
101} 101}
102 102
103#ifndef is_affinity_mask_valid
104#define is_affinity_mask_valid(val) 1
105#endif
106
107int no_irq_affinity; 103int no_irq_affinity;
108static int irq_affinity_proc_show(struct seq_file *m, void *v) 104static int irq_affinity_proc_show(struct seq_file *m, void *v)
109{ 105{
@@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file,
136 if (err) 132 if (err)
137 goto free_cpumask; 133 goto free_cpumask;
138 134
139 if (!is_affinity_mask_valid(new_value)) {
140 err = -EINVAL;
141 goto free_cpumask;
142 }
143
144 /* 135 /*
145 * Do not allow disabling IRQs completely - it's a too easy 136 * Do not allow disabling IRQs completely - it's a too easy
146 * way to make the system unusable accidentally :-) At least 137 * way to make the system unusable accidentally :-) At least
@@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file,
232 if (err) 223 if (err)
233 goto out; 224 goto out;
234 225
235 if (!is_affinity_mask_valid(new_value)) {
236 err = -EINVAL;
237 goto out;
238 }
239
240 /* 226 /*
241 * Do not allow disabling IRQs completely - it's a too easy 227 * Do not allow disabling IRQs completely - it's a too easy
242 * way to make the system unusable accidentally :-) At least 228 * way to make the system unusable accidentally :-) At least
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 95414ad3506a..98c04ca5fa43 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg)
36 irq = find_first_bit(irqs_resend, nr_irqs); 36 irq = find_first_bit(irqs_resend, nr_irqs);
37 clear_bit(irq, irqs_resend); 37 clear_bit(irq, irqs_resend);
38 desc = irq_to_desc(irq); 38 desc = irq_to_desc(irq);
39 if (!desc)
40 continue;
39 local_irq_disable(); 41 local_irq_disable();
40 desc->handle_irq(desc); 42 desc->handle_irq(desc);
41 local_irq_enable(); 43 local_irq_enable();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index df3008419a1d..cdb3ffab128b 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
407 return false; 407 return false;
408 408
409 if (!kernel_text_address(jump_entry_code(entry))) { 409 if (!kernel_text_address(jump_entry_code(entry))) {
410 WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); 410 WARN_ONCE(!jump_entry_is_init(entry),
411 "can't patch jump_label at %pS",
412 (void *)jump_entry_code(entry));
411 return false; 413 return false;
412 } 414 }
413 415
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 95a260f9214b..136ce049c4ad 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -263,8 +263,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
263{ 263{
264 char namebuf[KSYM_NAME_LEN]; 264 char namebuf[KSYM_NAME_LEN];
265 265
266 if (is_ksym_addr(addr)) 266 if (is_ksym_addr(addr)) {
267 return !!get_symbol_pos(addr, symbolsize, offset); 267 get_symbol_pos(addr, symbolsize, offset);
268 return 1;
269 }
268 return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || 270 return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
269 !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); 271 !!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
270} 272}
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c
new file mode 100644
index 000000000000..d3689632e8b9
--- /dev/null
+++ b/kernel/kexec_elf.c
@@ -0,0 +1,430 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Load ELF vmlinux file for the kexec_file_load syscall.
4 *
5 * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
6 * Copyright (C) 2004 IBM Corp.
7 * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
8 * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
9 * Copyright (C) 2016 IBM Corporation
10 *
11 * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c.
12 * Heavily modified for the kernel by
13 * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
14 */
15
16#define pr_fmt(fmt) "kexec_elf: " fmt
17
18#include <linux/elf.h>
19#include <linux/kexec.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/types.h>
23
24static inline bool elf_is_elf_file(const struct elfhdr *ehdr)
25{
26 return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0;
27}
28
29static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
30{
31 if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
32 value = le64_to_cpu(value);
33 else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
34 value = be64_to_cpu(value);
35
36 return value;
37}
38
39static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
40{
41 if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
42 value = le32_to_cpu(value);
43 else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
44 value = be32_to_cpu(value);
45
46 return value;
47}
48
49static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
50{
51 if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
52 value = le16_to_cpu(value);
53 else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
54 value = be16_to_cpu(value);
55
56 return value;
57}
58
59/**
60 * elf_is_ehdr_sane - check that it is safe to use the ELF header
61 * @buf_len: size of the buffer in which the ELF file is loaded.
62 */
63static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len)
64{
65 if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) {
66 pr_debug("Bad program header size.\n");
67 return false;
68 } else if (ehdr->e_shnum > 0 &&
69 ehdr->e_shentsize != sizeof(struct elf_shdr)) {
70 pr_debug("Bad section header size.\n");
71 return false;
72 } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
73 ehdr->e_version != EV_CURRENT) {
74 pr_debug("Unknown ELF version.\n");
75 return false;
76 }
77
78 if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
79 size_t phdr_size;
80
81 /*
82 * e_phnum is at most 65535 so calculating the size of the
83 * program header cannot overflow.
84 */
85 phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
86
87 /* Sanity check the program header table location. */
88 if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) {
89 pr_debug("Program headers at invalid location.\n");
90 return false;
91 } else if (ehdr->e_phoff + phdr_size > buf_len) {
92 pr_debug("Program headers truncated.\n");
93 return false;
94 }
95 }
96
97 if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
98 size_t shdr_size;
99
100 /*
101 * e_shnum is at most 65536 so calculating
102 * the size of the section header cannot overflow.
103 */
104 shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum;
105
106 /* Sanity check the section header table location. */
107 if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) {
108 pr_debug("Section headers at invalid location.\n");
109 return false;
110 } else if (ehdr->e_shoff + shdr_size > buf_len) {
111 pr_debug("Section headers truncated.\n");
112 return false;
113 }
114 }
115
116 return true;
117}
118
119static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr)
120{
121 struct elfhdr *buf_ehdr;
122
123 if (len < sizeof(*buf_ehdr)) {
124 pr_debug("Buffer is too small to hold ELF header.\n");
125 return -ENOEXEC;
126 }
127
128 memset(ehdr, 0, sizeof(*ehdr));
129 memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident));
130 if (!elf_is_elf_file(ehdr)) {
131 pr_debug("No ELF header magic.\n");
132 return -ENOEXEC;
133 }
134
135 if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) {
136 pr_debug("Not a supported ELF class.\n");
137 return -ENOEXEC;
138 } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB &&
139 ehdr->e_ident[EI_DATA] != ELFDATA2MSB) {
140 pr_debug("Not a supported ELF data format.\n");
141 return -ENOEXEC;
142 }
143
144 buf_ehdr = (struct elfhdr *) buf;
145 if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) {
146 pr_debug("Bad ELF header size.\n");
147 return -ENOEXEC;
148 }
149
150 ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type);
151 ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine);
152 ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version);
153 ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags);
154 ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize);
155 ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum);
156 ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize);
157 ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum);
158 ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx);
159
160 switch (ehdr->e_ident[EI_CLASS]) {
161 case ELFCLASS64:
162 ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry);
163 ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff);
164 ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff);
165 break;
166
167 case ELFCLASS32:
168 ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry);
169 ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff);
170 ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff);
171 break;
172
173 default:
174 pr_debug("Unknown ELF class.\n");
175 return -EINVAL;
176 }
177
178 return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC;
179}
180
181/**
182 * elf_is_phdr_sane - check that it is safe to use the program header
183 * @buf_len: size of the buffer in which the ELF file is loaded.
184 */
185static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len)
186{
187
188 if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) {
189 pr_debug("ELF segment location wraps around.\n");
190 return false;
191 } else if (phdr->p_offset + phdr->p_filesz > buf_len) {
192 pr_debug("ELF segment not in file.\n");
193 return false;
194 } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) {
195 pr_debug("ELF segment address wraps around.\n");
196 return false;
197 }
198
199 return true;
200}
201
202static int elf_read_phdr(const char *buf, size_t len,
203 struct kexec_elf_info *elf_info,
204 int idx)
205{
206 /* Override the const in proghdrs, we are the ones doing the loading. */
207 struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx];
208 const struct elfhdr *ehdr = elf_info->ehdr;
209 const char *pbuf;
210 struct elf_phdr *buf_phdr;
211
212 pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr));
213 buf_phdr = (struct elf_phdr *) pbuf;
214
215 phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type);
216 phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags);
217
218 switch (ehdr->e_ident[EI_CLASS]) {
219 case ELFCLASS64:
220 phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset);
221 phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr);
222 phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr);
223 phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz);
224 phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz);
225 phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align);
226 break;
227
228 case ELFCLASS32:
229 phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset);
230 phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr);
231 phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr);
232 phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz);
233 phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz);
234 phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align);
235 break;
236
237 default:
238 pr_debug("Unknown ELF class.\n");
239 return -EINVAL;
240 }
241
242 return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC;
243}
244
245/**
246 * elf_read_phdrs - read the program headers from the buffer
247 *
248 * This function assumes that the program header table was checked for sanity.
249 * Use elf_is_ehdr_sane() if it wasn't.
250 */
251static int elf_read_phdrs(const char *buf, size_t len,
252 struct kexec_elf_info *elf_info)
253{
254 size_t phdr_size, i;
255 const struct elfhdr *ehdr = elf_info->ehdr;
256
257 /*
258 * e_phnum is at most 65535 so calculating the size of the
259 * program header cannot overflow.
260 */
261 phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
262
263 elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL);
264 if (!elf_info->proghdrs)
265 return -ENOMEM;
266
267 for (i = 0; i < ehdr->e_phnum; i++) {
268 int ret;
269
270 ret = elf_read_phdr(buf, len, elf_info, i);
271 if (ret) {
272 kfree(elf_info->proghdrs);
273 elf_info->proghdrs = NULL;
274 return ret;
275 }
276 }
277
278 return 0;
279}
280
281/**
282 * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info
283 * @buf: Buffer to read ELF file from.
284 * @len: Size of @buf.
285 * @ehdr: Pointer to existing struct which will be populated.
286 * @elf_info: Pointer to existing struct which will be populated.
287 *
288 * This function allows reading ELF files with different byte order than
289 * the kernel, byte-swapping the fields as needed.
290 *
291 * Return:
292 * On success returns 0, and the caller should call
293 * kexec_free_elf_info(elf_info) to free the memory allocated for the section
294 * and program headers.
295 */
296static int elf_read_from_buffer(const char *buf, size_t len,
297 struct elfhdr *ehdr,
298 struct kexec_elf_info *elf_info)
299{
300 int ret;
301
302 ret = elf_read_ehdr(buf, len, ehdr);
303 if (ret)
304 return ret;
305
306 elf_info->buffer = buf;
307 elf_info->ehdr = ehdr;
308 if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
309 ret = elf_read_phdrs(buf, len, elf_info);
310 if (ret)
311 return ret;
312 }
313 return 0;
314}
315
316/**
317 * kexec_free_elf_info - free memory allocated by elf_read_from_buffer
318 */
319void kexec_free_elf_info(struct kexec_elf_info *elf_info)
320{
321 kfree(elf_info->proghdrs);
322 memset(elf_info, 0, sizeof(*elf_info));
323}
324/**
325 * kexec_build_elf_info - read ELF executable and check that we can use it
326 */
327int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
328 struct kexec_elf_info *elf_info)
329{
330 int i;
331 int ret;
332
333 ret = elf_read_from_buffer(buf, len, ehdr, elf_info);
334 if (ret)
335 return ret;
336
337 /* Big endian vmlinux has type ET_DYN. */
338 if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) {
339 pr_err("Not an ELF executable.\n");
340 goto error;
341 } else if (!elf_info->proghdrs) {
342 pr_err("No ELF program header.\n");
343 goto error;
344 }
345
346 for (i = 0; i < ehdr->e_phnum; i++) {
347 /*
348 * Kexec does not support loading interpreters.
349 * In addition this check keeps us from attempting
350 * to kexec ordinay executables.
351 */
352 if (elf_info->proghdrs[i].p_type == PT_INTERP) {
353 pr_err("Requires an ELF interpreter.\n");
354 goto error;
355 }
356 }
357
358 return 0;
359error:
360 kexec_free_elf_info(elf_info);
361 return -ENOEXEC;
362}
363
364
365int kexec_elf_probe(const char *buf, unsigned long len)
366{
367 struct elfhdr ehdr;
368 struct kexec_elf_info elf_info;
369 int ret;
370
371 ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info);
372 if (ret)
373 return ret;
374
375 kexec_free_elf_info(&elf_info);
376
377 return elf_check_arch(&ehdr) ? 0 : -ENOEXEC;
378}
379
380/**
381 * kexec_elf_load - load ELF executable image
382 * @lowest_load_addr: On return, will be the address where the first PT_LOAD
383 * section will be loaded in memory.
384 *
385 * Return:
386 * 0 on success, negative value on failure.
387 */
388int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
389 struct kexec_elf_info *elf_info,
390 struct kexec_buf *kbuf,
391 unsigned long *lowest_load_addr)
392{
393 unsigned long lowest_addr = UINT_MAX;
394 int ret;
395 size_t i;
396
397 /* Read in the PT_LOAD segments. */
398 for (i = 0; i < ehdr->e_phnum; i++) {
399 unsigned long load_addr;
400 size_t size;
401 const struct elf_phdr *phdr;
402
403 phdr = &elf_info->proghdrs[i];
404 if (phdr->p_type != PT_LOAD)
405 continue;
406
407 size = phdr->p_filesz;
408 if (size > phdr->p_memsz)
409 size = phdr->p_memsz;
410
411 kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset;
412 kbuf->bufsz = size;
413 kbuf->memsz = phdr->p_memsz;
414 kbuf->buf_align = phdr->p_align;
415 kbuf->buf_min = phdr->p_paddr;
416 kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
417 ret = kexec_add_buffer(kbuf);
418 if (ret)
419 goto out;
420 load_addr = kbuf->mem;
421
422 if (load_addr < lowest_addr)
423 lowest_addr = load_addr;
424 }
425
426 *lowest_load_addr = lowest_addr;
427 ret = 0;
428 out:
429 return ret;
430}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9873fc627d61..d9770a5393c8 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
470 */ 470 */
471static void do_optimize_kprobes(void) 471static void do_optimize_kprobes(void)
472{ 472{
473 lockdep_assert_held(&text_mutex);
473 /* 474 /*
474 * The optimization/unoptimization refers online_cpus via 475 * The optimization/unoptimization refers online_cpus via
475 * stop_machine() and cpu-hotplug modifies online_cpus. 476 * stop_machine() and cpu-hotplug modifies online_cpus.
@@ -487,9 +488,7 @@ static void do_optimize_kprobes(void)
487 list_empty(&optimizing_list)) 488 list_empty(&optimizing_list))
488 return; 489 return;
489 490
490 mutex_lock(&text_mutex);
491 arch_optimize_kprobes(&optimizing_list); 491 arch_optimize_kprobes(&optimizing_list);
492 mutex_unlock(&text_mutex);
493} 492}
494 493
495/* 494/*
@@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void)
500{ 499{
501 struct optimized_kprobe *op, *tmp; 500 struct optimized_kprobe *op, *tmp;
502 501
502 lockdep_assert_held(&text_mutex);
503 /* See comment in do_optimize_kprobes() */ 503 /* See comment in do_optimize_kprobes() */
504 lockdep_assert_cpus_held(); 504 lockdep_assert_cpus_held();
505 505
@@ -507,7 +507,6 @@ static void do_unoptimize_kprobes(void)
507 if (list_empty(&unoptimizing_list)) 507 if (list_empty(&unoptimizing_list))
508 return; 508 return;
509 509
510 mutex_lock(&text_mutex);
511 arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); 510 arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
512 /* Loop free_list for disarming */ 511 /* Loop free_list for disarming */
513 list_for_each_entry_safe(op, tmp, &freeing_list, list) { 512 list_for_each_entry_safe(op, tmp, &freeing_list, list) {
@@ -524,7 +523,6 @@ static void do_unoptimize_kprobes(void)
524 } else 523 } else
525 list_del_init(&op->list); 524 list_del_init(&op->list);
526 } 525 }
527 mutex_unlock(&text_mutex);
528} 526}
529 527
530/* Reclaim all kprobes on the free_list */ 528/* Reclaim all kprobes on the free_list */
@@ -556,6 +554,7 @@ static void kprobe_optimizer(struct work_struct *work)
556{ 554{
557 mutex_lock(&kprobe_mutex); 555 mutex_lock(&kprobe_mutex);
558 cpus_read_lock(); 556 cpus_read_lock();
557 mutex_lock(&text_mutex);
559 /* Lock modules while optimizing kprobes */ 558 /* Lock modules while optimizing kprobes */
560 mutex_lock(&module_mutex); 559 mutex_lock(&module_mutex);
561 560
@@ -583,6 +582,7 @@ static void kprobe_optimizer(struct work_struct *work)
583 do_free_cleaned_kprobes(); 582 do_free_cleaned_kprobes();
584 583
585 mutex_unlock(&module_mutex); 584 mutex_unlock(&module_mutex);
585 mutex_unlock(&text_mutex);
586 cpus_read_unlock(); 586 cpus_read_unlock();
587 mutex_unlock(&kprobe_mutex); 587 mutex_unlock(&kprobe_mutex);
588 588
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4861cf8e274b..4aca3f4379d2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -620,7 +620,7 @@ static void print_lock(struct held_lock *hlock)
620 return; 620 return;
621 } 621 }
622 622
623 printk(KERN_CONT "%p", hlock->instance); 623 printk(KERN_CONT "%px", hlock->instance);
624 print_lock_name(lock); 624 print_lock_name(lock);
625 printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); 625 printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
626} 626}
diff --git a/kernel/module.c b/kernel/module.c
index 5933395af9a0..9ee93421269c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -65,9 +65,9 @@
65/* 65/*
66 * Modules' sections will be aligned on page boundaries 66 * Modules' sections will be aligned on page boundaries
67 * to ensure complete separation of code and data, but 67 * to ensure complete separation of code and data, but
68 * only when CONFIG_STRICT_MODULE_RWX=y 68 * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
69 */ 69 */
70#ifdef CONFIG_STRICT_MODULE_RWX 70#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
71# define debug_align(X) ALIGN(X, PAGE_SIZE) 71# define debug_align(X) ALIGN(X, PAGE_SIZE)
72#else 72#else
73# define debug_align(X) (X) 73# define debug_align(X) (X)
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..4aa02eee8f6c 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
8config PROVE_RCU 8config PROVE_RCU
9 def_bool PROVE_LOCKING 9 def_bool PROVE_LOCKING
10 10
11config PROVE_RCU_LIST
12 bool "RCU list lockdep debugging"
13 depends on PROVE_RCU && RCU_EXPERT
14 default n
15 help
16 Enable RCU lockdep checking for list usages. By default it is
17 turned off since there are several list RCU users that still
18 need to be converted to pass a lockdep expression. To prevent
19 false-positive splats, we keep it default disabled but once all
20 users are converted, we can remove this config option.
21
11config TORTURE_TEST 22config TORTURE_TEST
12 tristate 23 tristate
13 default n 24 default n
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5290b01de534..8fd4f82c9b3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
227 227
228#ifdef CONFIG_RCU_STALL_COMMON 228#ifdef CONFIG_RCU_STALL_COMMON
229 229
230extern int rcu_cpu_stall_ftrace_dump;
230extern int rcu_cpu_stall_suppress; 231extern int rcu_cpu_stall_suppress;
231extern int rcu_cpu_stall_timeout; 232extern int rcu_cpu_stall_timeout;
232int rcu_jiffies_till_stall_check(void); 233int rcu_jiffies_till_stall_check(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 9bd5f6023c21..495c58ce1640 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -24,6 +24,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
24} 24}
25 25
26/* 26/*
27 * Enqueue an rcu_head structure onto the specified callback list.
28 * This function assumes that the callback is non-lazy because it
29 * is intended for use by no-CBs CPUs, which do not distinguish
30 * between lazy and non-lazy RCU callbacks.
31 */
32void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
33{
34 *rclp->tail = rhp;
35 rclp->tail = &rhp->next;
36 WRITE_ONCE(rclp->len, rclp->len + 1);
37}
38
39/*
40 * Flush the second rcu_cblist structure onto the first one, obliterating
41 * any contents of the first. If rhp is non-NULL, enqueue it as the sole
42 * element of the second rcu_cblist structure, but ensuring that the second
43 * rcu_cblist structure, if initially non-empty, always appears non-empty
44 * throughout the process. If rdp is NULL, the second rcu_cblist structure
45 * is instead initialized to empty.
46 */
47void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
48 struct rcu_cblist *srclp,
49 struct rcu_head *rhp)
50{
51 drclp->head = srclp->head;
52 if (drclp->head)
53 drclp->tail = srclp->tail;
54 else
55 drclp->tail = &drclp->head;
56 drclp->len = srclp->len;
57 drclp->len_lazy = srclp->len_lazy;
58 if (!rhp) {
59 rcu_cblist_init(srclp);
60 } else {
61 rhp->next = NULL;
62 srclp->head = rhp;
63 srclp->tail = &rhp->next;
64 WRITE_ONCE(srclp->len, 1);
65 srclp->len_lazy = 0;
66 }
67}
68
69/*
27 * Dequeue the oldest rcu_head structure from the specified callback 70 * Dequeue the oldest rcu_head structure from the specified callback
28 * list. This function assumes that the callback is non-lazy, but 71 * list. This function assumes that the callback is non-lazy, but
29 * the caller can later invoke rcu_cblist_dequeued_lazy() if it 72 * the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -44,6 +87,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
44 return rhp; 87 return rhp;
45} 88}
46 89
90/* Set the length of an rcu_segcblist structure. */
91void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
92{
93#ifdef CONFIG_RCU_NOCB_CPU
94 atomic_long_set(&rsclp->len, v);
95#else
96 WRITE_ONCE(rsclp->len, v);
97#endif
98}
99
100/*
101 * Increase the numeric length of an rcu_segcblist structure by the
102 * specified amount, which can be negative. This can cause the ->len
103 * field to disagree with the actual number of callbacks on the structure.
104 * This increase is fully ordered with respect to the callers accesses
105 * both before and after.
106 */
107void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
108{
109#ifdef CONFIG_RCU_NOCB_CPU
110 smp_mb__before_atomic(); /* Up to the caller! */
111 atomic_long_add(v, &rsclp->len);
112 smp_mb__after_atomic(); /* Up to the caller! */
113#else
114 smp_mb(); /* Up to the caller! */
115 WRITE_ONCE(rsclp->len, rsclp->len + v);
116 smp_mb(); /* Up to the caller! */
117#endif
118}
119
120/*
121 * Increase the numeric length of an rcu_segcblist structure by one.
122 * This can cause the ->len field to disagree with the actual number of
123 * callbacks on the structure. This increase is fully ordered with respect
124 * to the callers accesses both before and after.
125 */
126void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
127{
128 rcu_segcblist_add_len(rsclp, 1);
129}
130
131/*
132 * Exchange the numeric length of the specified rcu_segcblist structure
133 * with the specified value. This can cause the ->len field to disagree
134 * with the actual number of callbacks on the structure. This exchange is
135 * fully ordered with respect to the callers accesses both before and after.
136 */
137long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
138{
139#ifdef CONFIG_RCU_NOCB_CPU
140 return atomic_long_xchg(&rsclp->len, v);
141#else
142 long ret = rsclp->len;
143
144 smp_mb(); /* Up to the caller! */
145 WRITE_ONCE(rsclp->len, v);
146 smp_mb(); /* Up to the caller! */
147 return ret;
148#endif
149}
150
47/* 151/*
48 * Initialize an rcu_segcblist structure. 152 * Initialize an rcu_segcblist structure.
49 */ 153 */
@@ -56,8 +160,9 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
56 rsclp->head = NULL; 160 rsclp->head = NULL;
57 for (i = 0; i < RCU_CBLIST_NSEGS; i++) 161 for (i = 0; i < RCU_CBLIST_NSEGS; i++)
58 rsclp->tails[i] = &rsclp->head; 162 rsclp->tails[i] = &rsclp->head;
59 rsclp->len = 0; 163 rcu_segcblist_set_len(rsclp, 0);
60 rsclp->len_lazy = 0; 164 rsclp->len_lazy = 0;
165 rsclp->enabled = 1;
61} 166}
62 167
63/* 168/*
@@ -69,7 +174,16 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
69 WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); 174 WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
70 WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); 175 WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
71 WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); 176 WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
72 rsclp->tails[RCU_NEXT_TAIL] = NULL; 177 rsclp->enabled = 0;
178}
179
180/*
181 * Mark the specified rcu_segcblist structure as offloaded. This
182 * structure must be empty.
183 */
184void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
185{
186 rsclp->offloaded = 1;
73} 187}
74 188
75/* 189/*
@@ -118,6 +232,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
118} 232}
119 233
120/* 234/*
235 * Return false if there are no CBs awaiting grace periods, otherwise,
236 * return true and store the nearest waited-upon grace period into *lp.
237 */
238bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
239{
240 if (!rcu_segcblist_pend_cbs(rsclp))
241 return false;
242 *lp = rsclp->gp_seq[RCU_WAIT_TAIL];
243 return true;
244}
245
246/*
121 * Enqueue the specified callback onto the specified rcu_segcblist 247 * Enqueue the specified callback onto the specified rcu_segcblist
122 * structure, updating accounting as needed. Note that the ->len 248 * structure, updating accounting as needed. Note that the ->len
123 * field may be accessed locklessly, hence the WRITE_ONCE(). 249 * field may be accessed locklessly, hence the WRITE_ONCE().
@@ -129,13 +255,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
129void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, 255void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
130 struct rcu_head *rhp, bool lazy) 256 struct rcu_head *rhp, bool lazy)
131{ 257{
132 WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ 258 rcu_segcblist_inc_len(rsclp);
133 if (lazy) 259 if (lazy)
134 rsclp->len_lazy++; 260 rsclp->len_lazy++;
135 smp_mb(); /* Ensure counts are updated before callback is enqueued. */ 261 smp_mb(); /* Ensure counts are updated before callback is enqueued. */
136 rhp->next = NULL; 262 rhp->next = NULL;
137 *rsclp->tails[RCU_NEXT_TAIL] = rhp; 263 WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
138 rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; 264 WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
139} 265}
140 266
141/* 267/*
@@ -155,7 +281,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
155 281
156 if (rcu_segcblist_n_cbs(rsclp) == 0) 282 if (rcu_segcblist_n_cbs(rsclp) == 0)
157 return false; 283 return false;
158 WRITE_ONCE(rsclp->len, rsclp->len + 1); 284 rcu_segcblist_inc_len(rsclp);
159 if (lazy) 285 if (lazy)
160 rsclp->len_lazy++; 286 rsclp->len_lazy++;
161 smp_mb(); /* Ensure counts are updated before callback is entrained. */ 287 smp_mb(); /* Ensure counts are updated before callback is entrained. */
@@ -163,9 +289,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
163 for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) 289 for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
164 if (rsclp->tails[i] != rsclp->tails[i - 1]) 290 if (rsclp->tails[i] != rsclp->tails[i - 1])
165 break; 291 break;
166 *rsclp->tails[i] = rhp; 292 WRITE_ONCE(*rsclp->tails[i], rhp);
167 for (; i <= RCU_NEXT_TAIL; i++) 293 for (; i <= RCU_NEXT_TAIL; i++)
168 rsclp->tails[i] = &rhp->next; 294 WRITE_ONCE(rsclp->tails[i], &rhp->next);
169 return true; 295 return true;
170} 296}
171 297
@@ -182,9 +308,8 @@ void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
182 struct rcu_cblist *rclp) 308 struct rcu_cblist *rclp)
183{ 309{
184 rclp->len_lazy += rsclp->len_lazy; 310 rclp->len_lazy += rsclp->len_lazy;
185 rclp->len += rsclp->len;
186 rsclp->len_lazy = 0; 311 rsclp->len_lazy = 0;
187 WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ 312 rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
188} 313}
189 314
190/* 315/*
@@ -200,12 +325,12 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
200 if (!rcu_segcblist_ready_cbs(rsclp)) 325 if (!rcu_segcblist_ready_cbs(rsclp))
201 return; /* Nothing to do. */ 326 return; /* Nothing to do. */
202 *rclp->tail = rsclp->head; 327 *rclp->tail = rsclp->head;
203 rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; 328 WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
204 *rsclp->tails[RCU_DONE_TAIL] = NULL; 329 WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
205 rclp->tail = rsclp->tails[RCU_DONE_TAIL]; 330 rclp->tail = rsclp->tails[RCU_DONE_TAIL];
206 for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) 331 for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
207 if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) 332 if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
208 rsclp->tails[i] = &rsclp->head; 333 WRITE_ONCE(rsclp->tails[i], &rsclp->head);
209} 334}
210 335
211/* 336/*
@@ -224,9 +349,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
224 return; /* Nothing to do. */ 349 return; /* Nothing to do. */
225 *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; 350 *rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
226 rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; 351 rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
227 *rsclp->tails[RCU_DONE_TAIL] = NULL; 352 WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
228 for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) 353 for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
229 rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; 354 WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
230} 355}
231 356
232/* 357/*
@@ -237,8 +362,7 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
237 struct rcu_cblist *rclp) 362 struct rcu_cblist *rclp)
238{ 363{
239 rsclp->len_lazy += rclp->len_lazy; 364 rsclp->len_lazy += rclp->len_lazy;
240 /* ->len sampled locklessly. */ 365 rcu_segcblist_add_len(rsclp, rclp->len);
241 WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
242 rclp->len_lazy = 0; 366 rclp->len_lazy = 0;
243 rclp->len = 0; 367 rclp->len = 0;
244} 368}
@@ -255,10 +379,10 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
255 if (!rclp->head) 379 if (!rclp->head)
256 return; /* No callbacks to move. */ 380 return; /* No callbacks to move. */
257 *rclp->tail = rsclp->head; 381 *rclp->tail = rsclp->head;
258 rsclp->head = rclp->head; 382 WRITE_ONCE(rsclp->head, rclp->head);
259 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) 383 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
260 if (&rsclp->head == rsclp->tails[i]) 384 if (&rsclp->head == rsclp->tails[i])
261 rsclp->tails[i] = rclp->tail; 385 WRITE_ONCE(rsclp->tails[i], rclp->tail);
262 else 386 else
263 break; 387 break;
264 rclp->head = NULL; 388 rclp->head = NULL;
@@ -274,8 +398,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
274{ 398{
275 if (!rclp->head) 399 if (!rclp->head)
276 return; /* Nothing to do. */ 400 return; /* Nothing to do. */
277 *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; 401 WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
278 rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; 402 WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
279 rclp->head = NULL; 403 rclp->head = NULL;
280 rclp->tail = &rclp->head; 404 rclp->tail = &rclp->head;
281} 405}
@@ -299,7 +423,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
299 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 423 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
300 if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) 424 if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
301 break; 425 break;
302 rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; 426 WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
303 } 427 }
304 428
305 /* If no callbacks moved, nothing more need be done. */ 429 /* If no callbacks moved, nothing more need be done. */
@@ -308,7 +432,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
308 432
309 /* Clean up tail pointers that might have been misordered above. */ 433 /* Clean up tail pointers that might have been misordered above. */
310 for (j = RCU_WAIT_TAIL; j < i; j++) 434 for (j = RCU_WAIT_TAIL; j < i; j++)
311 rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; 435 WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
312 436
313 /* 437 /*
314 * Callbacks moved, so clean up the misordered ->tails[] pointers 438 * Callbacks moved, so clean up the misordered ->tails[] pointers
@@ -319,7 +443,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
319 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { 443 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
320 if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) 444 if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
321 break; /* No more callbacks. */ 445 break; /* No more callbacks. */
322 rsclp->tails[j] = rsclp->tails[i]; 446 WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
323 rsclp->gp_seq[j] = rsclp->gp_seq[i]; 447 rsclp->gp_seq[j] = rsclp->gp_seq[i];
324 } 448 }
325} 449}
@@ -384,7 +508,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
384 * structure other than in the RCU_NEXT_TAIL segment. 508 * structure other than in the RCU_NEXT_TAIL segment.
385 */ 509 */
386 for (; i < RCU_NEXT_TAIL; i++) { 510 for (; i < RCU_NEXT_TAIL; i++) {
387 rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; 511 WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
388 rsclp->gp_seq[i] = seq; 512 rsclp->gp_seq[i] = seq;
389 } 513 }
390 return true; 514 return true;
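
All ->len updates in rcu_segcblist.c now go through helpers (rcu_segcblist_set_len(), rcu_segcblist_inc_len(), rcu_segcblist_add_len(), rcu_segcblist_xchg_len()) whose definitions appear earlier in this patch, so the counter can be an atomic_long_t when CONFIG_RCU_NOCB_CPU=y and a plain lockless-sampled long otherwise. A compile-time-toggled userspace sketch of that dual representation (illustrative only, not the kernel implementation):

/* Sketch of a dual-representation length counter; build with
 * -DSKETCH_NOCB to select the atomic flavor. */
#include <stdatomic.h>
#include <stdio.h>

struct sketch_segcblist {
#ifdef SKETCH_NOCB
	atomic_long len;	/* kernel: atomic_long_t, updated cross-CPU */
#else
	long len;		/* kernel: plain long + READ_ONCE()/WRITE_ONCE() */
#endif
};

static void sketch_set_len(struct sketch_segcblist *rsclp, long v)
{
#ifdef SKETCH_NOCB
	atomic_store(&rsclp->len, v);
#else
	rsclp->len = v;		/* kernel would use WRITE_ONCE() here */
#endif
}

static void sketch_add_len(struct sketch_segcblist *rsclp, long v)
{
#ifdef SKETCH_NOCB
	atomic_fetch_add(&rsclp->len, v);
#else
	rsclp->len += v;	/* single updater, lockless readers */
#endif
}

static long sketch_xchg_len(struct sketch_segcblist *rsclp, long v)
{
#ifdef SKETCH_NOCB
	return atomic_exchange(&rsclp->len, v);
#else
	long old = rsclp->len;

	rsclp->len = v;
	return old;
#endif
}

int main(void)
{
	struct sketch_segcblist s;

	sketch_set_len(&s, 0);
	sketch_add_len(&s, 3);
	printf("drained %ld callbacks\n", sketch_xchg_len(&s, 0));
	return 0;
}

The kernel versions additionally order these updates against callback-list manipulation with memory barriers; the sketch leaves that out.
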
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 71b64648464e..815c2fdd3fcc 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -9,6 +9,12 @@
9 9
10#include <linux/rcu_segcblist.h> 10#include <linux/rcu_segcblist.h>
11 11
12/* Return number of callbacks in the specified callback list. */
13static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
14{
15 return READ_ONCE(rclp->len);
16}
17
12/* 18/*
13 * Account for the fact that a previously dequeued callback turned out 19 * Account for the fact that a previously dequeued callback turned out
14 * to be marked as lazy. 20 * to be marked as lazy.
@@ -19,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
19} 25}
20 26
21void rcu_cblist_init(struct rcu_cblist *rclp); 27void rcu_cblist_init(struct rcu_cblist *rclp);
28void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
29void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
30 struct rcu_cblist *srclp,
31 struct rcu_head *rhp);
22struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); 32struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
23 33
24/* 34/*
@@ -36,13 +46,17 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
36 */ 46 */
37static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) 47static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
38{ 48{
39 return !rsclp->head; 49 return !READ_ONCE(rsclp->head);
40} 50}
41 51
42/* Return number of callbacks in segmented callback list. */ 52/* Return number of callbacks in segmented callback list. */
43static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) 53static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
44{ 54{
55#ifdef CONFIG_RCU_NOCB_CPU
56 return atomic_long_read(&rsclp->len);
57#else
45 return READ_ONCE(rsclp->len); 58 return READ_ONCE(rsclp->len);
59#endif
46} 60}
47 61
48/* Return number of lazy callbacks in segmented callback list. */ 62/* Return number of lazy callbacks in segmented callback list. */
@@ -54,16 +68,22 @@ static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
54/* Return number of lazy callbacks in segmented callback list. */ 68/* Return number of lazy callbacks in segmented callback list. */
55static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) 69static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
56{ 70{
57 return rsclp->len - rsclp->len_lazy; 71 return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy;
58} 72}
59 73
60/* 74/*
61 * Is the specified rcu_segcblist enabled, for example, not corresponding 75 * Is the specified rcu_segcblist enabled, for example, not corresponding
62 * to an offline or callback-offloaded CPU? 76 * to an offline CPU?
63 */ 77 */
64static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) 78static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
65{ 79{
66 return !!rsclp->tails[RCU_NEXT_TAIL]; 80 return rsclp->enabled;
81}
82
83/* Is the specified rcu_segcblist offloaded? */
84static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
85{
86 return rsclp->offloaded;
67} 87}
68 88
69/* 89/*
@@ -73,36 +93,18 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
73 */ 93 */
74static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) 94static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
75{ 95{
76 return !*rsclp->tails[seg]; 96 return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
77}
78
79/*
80 * Interim function to return rcu_segcblist head pointer. Longer term, the
81 * rcu_segcblist will be used more pervasively, removing the need for this
82 * function.
83 */
84static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
85{
86 return rsclp->head;
87}
88
89/*
90 * Interim function to return rcu_segcblist head pointer. Longer term, the
91 * rcu_segcblist will be used more pervasively, removing the need for this
92 * function.
93 */
94static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
95{
96 WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
97 return rsclp->tails[RCU_NEXT_TAIL];
98} 97}
99 98
99void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
100void rcu_segcblist_init(struct rcu_segcblist *rsclp); 100void rcu_segcblist_init(struct rcu_segcblist *rsclp);
101void rcu_segcblist_disable(struct rcu_segcblist *rsclp); 101void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
102void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
102bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); 103bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
103bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); 104bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
104struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); 105struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
105struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); 106struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
107bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
106void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, 108void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
107 struct rcu_head *rhp, bool lazy); 109 struct rcu_head *rhp, bool lazy);
108bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, 110bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a6890b23c5f..5a879d073c1c 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable
89 89
90static char *perf_type = "rcu"; 90static char *perf_type = "rcu";
91module_param(perf_type, charp, 0444); 91module_param(perf_type, charp, 0444);
92MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)"); 92MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)");
93 93
94static int nrealreaders; 94static int nrealreaders;
95static int nrealwriters; 95static int nrealwriters;
@@ -375,6 +375,14 @@ rcu_perf_writer(void *arg)
375 if (holdoff) 375 if (holdoff)
376 schedule_timeout_uninterruptible(holdoff * HZ); 376 schedule_timeout_uninterruptible(holdoff * HZ);
377 377
378 /*
379 * Wait until rcu_end_inkernel_boot() is called for normal GP tests
380 * so that RCU is not always expedited for normal GP tests.
381 * The system_state test is approximate, but works well in practice.
382 */
383 while (!gp_exp && system_state != SYSTEM_RUNNING)
384 schedule_timeout_uninterruptible(1);
385
378 t = ktime_get_mono_fast_ns(); 386 t = ktime_get_mono_fast_ns();
379 if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { 387 if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
380 t_rcu_perf_writer_started = t; 388 t_rcu_perf_writer_started = t;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index fce4e7e6f502..3c9feca1eab1 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers;
161static long n_barrier_attempts; 161static long n_barrier_attempts;
162static long n_barrier_successes; /* did rcu_barrier test succeed? */ 162static long n_barrier_successes; /* did rcu_barrier test succeed? */
163static struct list_head rcu_torture_removed; 163static struct list_head rcu_torture_removed;
164static unsigned long shutdown_jiffies;
164 165
165static int rcu_torture_writer_state; 166static int rcu_torture_writer_state;
166#define RTWS_FIXED_DELAY 0 167#define RTWS_FIXED_DELAY 0
@@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void)
228} 229}
229#endif /* #else #ifdef CONFIG_RCU_TRACE */ 230#endif /* #else #ifdef CONFIG_RCU_TRACE */
230 231
232/*
233 * Stop aggressive CPU-hog tests a bit before the end of the test in order
234 * to avoid interfering with test shutdown.
235 */
236static bool shutdown_time_arrived(void)
237{
238 return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ);
239}
240
231static unsigned long boost_starttime; /* jiffies of next boost test start. */ 241static unsigned long boost_starttime; /* jiffies of next boost test start. */
232static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 242static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
233 /* and boost task create/destroy. */ 243 /* and boost task create/destroy. */
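
shutdown_jiffies is filled in by the rcu_torture_init() hunk further down, and shutdown_time_arrived() backs off 30 seconds before it so CPU-hogging forward-progress tests do not collide with test shutdown. The time_after() comparison is wrap-safe because it works on the signed difference; a minimal standalone version of that idiom (plain C, not the kernel macro):

/* Minimal illustration of the wrap-safe time_after() idiom behind
 * shutdown_time_arrived(); not the kernel macro itself. */
#include <stdio.h>

#define HZ 1000UL

/* Wrap-safe "a is after b" for free-running unsigned counters. */
static int sketch_time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

int main(void)
{
	unsigned long shutdown_jiffies = (unsigned long)-5000;	/* near wraparound */
	unsigned long jiffies = shutdown_jiffies - 20 * HZ;	/* 20s before shutdown */

	/* Stop aggressive testing once within 30s of shutdown. */
	if (sketch_time_after(jiffies, shutdown_jiffies - 30 * HZ))
		printf("within the 30s shutdown window, backing off\n");
	return 0;
}
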
@@ -1713,12 +1723,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
1713} 1723}
1714 1724
1715// Give the scheduler a chance, even on nohz_full CPUs. 1725// Give the scheduler a chance, even on nohz_full CPUs.
1716static void rcu_torture_fwd_prog_cond_resched(void) 1726static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
1717{ 1727{
1718 if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { 1728 if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
1719 if (need_resched()) 1729 // Real call_rcu() floods hit userspace, so emulate that.
1730 if (need_resched() || (iter & 0xfff))
1720 schedule(); 1731 schedule();
1721 } else { 1732 } else {
1733 // No userspace emulation: CB invocation throttles call_rcu()
1722 cond_resched(); 1734 cond_resched();
1723 } 1735 }
1724} 1736}
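
Passing the caller's loop counter into rcu_torture_fwd_prog_cond_resched() lets the PREEMPT/NO_HZ_FULL case schedule on nearly every pass, emulating a call_rcu() flood that keeps returning to userspace, while the low-bits mask keeps the decision cheap. The masking idiom on its own (a standalone illustration, not the rcutorture code):

/* Illustration of the (iter & 0xfff) masking idiom used above to key
 * periodic behavior off a loop counter. */
#include <stdio.h>

int main(void)
{
	unsigned long iter, common = 0, rare = 0;

	for (iter = 0; iter < (1UL << 14); iter++) {
		/* Low 12 bits are all zero only once every 4096 iterations. */
		if (iter & 0xfff)
			common++;	/* taken on 4095 of every 4096 passes */
		else
			rare++;		/* the once-per-4096 exception */
	}
	printf("common=%lu rare=%lu\n", common, rare);
	return 0;
}
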
@@ -1746,7 +1758,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
1746 spin_unlock_irqrestore(&rcu_fwd_lock, flags); 1758 spin_unlock_irqrestore(&rcu_fwd_lock, flags);
1747 kfree(rfcp); 1759 kfree(rfcp);
1748 freed++; 1760 freed++;
1749 rcu_torture_fwd_prog_cond_resched(); 1761 rcu_torture_fwd_prog_cond_resched(freed);
1750 } 1762 }
1751 return freed; 1763 return freed;
1752} 1764}
@@ -1785,15 +1797,17 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
1785 WRITE_ONCE(rcu_fwd_startat, jiffies); 1797 WRITE_ONCE(rcu_fwd_startat, jiffies);
1786 stopat = rcu_fwd_startat + dur; 1798 stopat = rcu_fwd_startat + dur;
1787 while (time_before(jiffies, stopat) && 1799 while (time_before(jiffies, stopat) &&
1800 !shutdown_time_arrived() &&
1788 !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { 1801 !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
1789 idx = cur_ops->readlock(); 1802 idx = cur_ops->readlock();
1790 udelay(10); 1803 udelay(10);
1791 cur_ops->readunlock(idx); 1804 cur_ops->readunlock(idx);
1792 if (!fwd_progress_need_resched || need_resched()) 1805 if (!fwd_progress_need_resched || need_resched())
1793 rcu_torture_fwd_prog_cond_resched(); 1806 rcu_torture_fwd_prog_cond_resched(1);
1794 } 1807 }
1795 (*tested_tries)++; 1808 (*tested_tries)++;
1796 if (!time_before(jiffies, stopat) && 1809 if (!time_before(jiffies, stopat) &&
1810 !shutdown_time_arrived() &&
1797 !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { 1811 !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
1798 (*tested)++; 1812 (*tested)++;
1799 cver = READ_ONCE(rcu_torture_current_version) - cver; 1813 cver = READ_ONCE(rcu_torture_current_version) - cver;
@@ -1852,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void)
1852 gps = cur_ops->get_gp_seq(); 1866 gps = cur_ops->get_gp_seq();
1853 rcu_launder_gp_seq_start = gps; 1867 rcu_launder_gp_seq_start = gps;
1854 while (time_before(jiffies, stopat) && 1868 while (time_before(jiffies, stopat) &&
1869 !shutdown_time_arrived() &&
1855 !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { 1870 !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
1856 rfcp = READ_ONCE(rcu_fwd_cb_head); 1871 rfcp = READ_ONCE(rcu_fwd_cb_head);
1857 rfcpn = NULL; 1872 rfcpn = NULL;
@@ -1875,7 +1890,7 @@ static void rcu_torture_fwd_prog_cr(void)
1875 rfcp->rfc_gps = 0; 1890 rfcp->rfc_gps = 0;
1876 } 1891 }
1877 cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); 1892 cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
1878 rcu_torture_fwd_prog_cond_resched(); 1893 rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
1879 } 1894 }
1880 stoppedat = jiffies; 1895 stoppedat = jiffies;
1881 n_launders_cb_snap = READ_ONCE(n_launders_cb); 1896 n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1884,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void)
1884 cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ 1899 cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
1885 (void)rcu_torture_fwd_prog_cbfree(); 1900 (void)rcu_torture_fwd_prog_cbfree();
1886 1901
1887 if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { 1902 if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
1903 !shutdown_time_arrived()) {
1888 WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); 1904 WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
1889 pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", 1905 pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
1890 __func__, 1906 __func__,
@@ -2160,6 +2176,7 @@ rcu_torture_cleanup(void)
2160 return; 2176 return;
2161 } 2177 }
2162 2178
2179 show_rcu_gp_kthreads();
2163 rcu_torture_barrier_cleanup(); 2180 rcu_torture_barrier_cleanup();
2164 torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); 2181 torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
2165 torture_stop_kthread(rcu_torture_stall, stall_task); 2182 torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2465,6 +2482,7 @@ rcu_torture_init(void)
2465 goto unwind; 2482 goto unwind;
2466 rcutor_hp = firsterr; 2483 rcutor_hp = firsterr;
2467 } 2484 }
2485 shutdown_jiffies = jiffies + shutdown_secs * HZ;
2468 firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); 2486 firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
2469 if (firsterr) 2487 if (firsterr)
2470 goto unwind; 2488 goto unwind;
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index cf0e886314f2..5dffade2d7cd 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
1279 1279
1280 c0 = l0 - u0; 1280 c0 = l0 - u0;
1281 c1 = l1 - u1; 1281 c1 = l1 - u1;
1282 pr_cont(" %d(%ld,%ld %1p)", 1282 pr_cont(" %d(%ld,%ld %c)",
1283 cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist)); 1283 cpu, c0, c1,
1284 "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
1284 s0 += c0; 1285 s0 += c0;
1285 s1 += c1; 1286 s1 += c1;
1286 } 1287 }
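
The srcu_torture_stats_print() change prints a one-character empty/non-empty flag by indexing a two-character string literal with a boolean, replacing the %p of the now-removed rcu_segcblist_head(). The idiom in isolation (illustrative, not the kernel code):

/* The "C."[flag] idiom: index a two-char string literal by a 0/1 flag. */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	bool empty = false;

	/* 'C' when callbacks are present (flag 0), '.' when empty (flag 1). */
	printf("cblist: %c\n", "C."[empty]);
	empty = true;
	printf("cblist: %c\n", "C."[empty]);
	return 0;
}
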
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a14e5fbbea46..71395e91b876 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -56,6 +56,7 @@
56#include <linux/smpboot.h> 56#include <linux/smpboot.h>
57#include <linux/jiffies.h> 57#include <linux/jiffies.h>
58#include <linux/sched/isolation.h> 58#include <linux/sched/isolation.h>
59#include <linux/sched/clock.h>
59#include "../time/tick-internal.h" 60#include "../time/tick-internal.h"
60 61
61#include "tree.h" 62#include "tree.h"
@@ -210,9 +211,9 @@ static long rcu_get_n_cbs_cpu(int cpu)
210{ 211{
211 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 212 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
212 213
213 if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */ 214 if (rcu_segcblist_is_enabled(&rdp->cblist))
214 return rcu_segcblist_n_cbs(&rdp->cblist); 215 return rcu_segcblist_n_cbs(&rdp->cblist);
215 return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */ 216 return 0;
216} 217}
217 218
218void rcu_softirq_qs(void) 219void rcu_softirq_qs(void)
@@ -416,6 +417,12 @@ module_param(qlowmark, long, 0444);
416static ulong jiffies_till_first_fqs = ULONG_MAX; 417static ulong jiffies_till_first_fqs = ULONG_MAX;
417static ulong jiffies_till_next_fqs = ULONG_MAX; 418static ulong jiffies_till_next_fqs = ULONG_MAX;
418static bool rcu_kick_kthreads; 419static bool rcu_kick_kthreads;
420static int rcu_divisor = 7;
421module_param(rcu_divisor, int, 0644);
422
423/* Force an exit from rcu_do_batch() after 3 milliseconds. */
424static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
425module_param(rcu_resched_ns, long, 0644);
419 426
420/* 427/*
421 * How long the grace period must be before we start recruiting 428 * How long the grace period must be before we start recruiting
@@ -1251,6 +1258,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1251 unsigned long gp_seq_req; 1258 unsigned long gp_seq_req;
1252 bool ret = false; 1259 bool ret = false;
1253 1260
1261 rcu_lockdep_assert_cblist_protected(rdp);
1254 raw_lockdep_assert_held_rcu_node(rnp); 1262 raw_lockdep_assert_held_rcu_node(rnp);
1255 1263
1256 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1264 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1292,7 +1300,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
1292 unsigned long c; 1300 unsigned long c;
1293 bool needwake; 1301 bool needwake;
1294 1302
1295 lockdep_assert_irqs_disabled(); 1303 rcu_lockdep_assert_cblist_protected(rdp);
1296 c = rcu_seq_snap(&rcu_state.gp_seq); 1304 c = rcu_seq_snap(&rcu_state.gp_seq);
1297 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { 1305 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
1298 /* Old request still live, so mark recent callbacks. */ 1306 /* Old request still live, so mark recent callbacks. */
@@ -1318,6 +1326,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
1318 */ 1326 */
1319static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) 1327static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1320{ 1328{
1329 rcu_lockdep_assert_cblist_protected(rdp);
1321 raw_lockdep_assert_held_rcu_node(rnp); 1330 raw_lockdep_assert_held_rcu_node(rnp);
1322 1331
1323 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1332 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1335,6 +1344,21 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1335} 1344}
1336 1345
1337/* 1346/*
1347 * Move and classify callbacks, but only if doing so won't require
1348 * that the RCU grace-period kthread be awakened.
1349 */
1350static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
1351 struct rcu_data *rdp)
1352{
1353 rcu_lockdep_assert_cblist_protected(rdp);
1354 if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
1355 !raw_spin_trylock_rcu_node(rnp))
1356 return;
1357 WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
1358 raw_spin_unlock_rcu_node(rnp);
1359}
1360
1361/*
1338 * Update CPU-local rcu_data state to record the beginnings and ends of 1362 * Update CPU-local rcu_data state to record the beginnings and ends of
1339 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1363 * grace periods. The caller must hold the ->lock of the leaf rcu_node
1340 * structure corresponding to the current CPU, and must have irqs disabled. 1364 * structure corresponding to the current CPU, and must have irqs disabled.
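
rcu_advance_cbs_nowake() above is deliberately opportunistic: if no grace period is in progress or the rcu_node lock cannot be taken uncontended, it does nothing rather than create work that might require waking the GP kthread. The general trylock-or-skip shape as a standalone illustration (pthreads sketch, not the kernel code):

/* Standalone illustration of the "trylock or skip" pattern behind
 * rcu_advance_cbs_nowake(); a pthreads sketch, not kernel code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static bool gp_in_progress = true;	/* models rcu_seq_state(...) != 0 */

static void advance_cbs_nowake(void)
{
	/* Bail out rather than block or create extra work. */
	if (!gp_in_progress || pthread_mutex_trylock(&node_lock) != 0)
		return;

	puts("advanced callbacks under the node lock, no wakeup needed");
	pthread_mutex_unlock(&node_lock);
}

int main(void)
{
	advance_cbs_nowake();			/* uncontended: does the work */
	pthread_mutex_lock(&node_lock);
	advance_cbs_nowake();			/* contended: silently skips */
	pthread_mutex_unlock(&node_lock);
	return 0;
}
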
@@ -1342,8 +1366,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
1342 */ 1366 */
1343static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) 1367static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
1344{ 1368{
1345 bool ret; 1369 bool ret = false;
1346 bool need_gp; 1370 bool need_gp;
1371 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1372 rcu_segcblist_is_offloaded(&rdp->cblist);
1347 1373
1348 raw_lockdep_assert_held_rcu_node(rnp); 1374 raw_lockdep_assert_held_rcu_node(rnp);
1349 1375
@@ -1353,10 +1379,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
1353 /* Handle the ends of any preceding grace periods first. */ 1379 /* Handle the ends of any preceding grace periods first. */
1354 if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || 1380 if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
1355 unlikely(READ_ONCE(rdp->gpwrap))) { 1381 unlikely(READ_ONCE(rdp->gpwrap))) {
1356 ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */ 1382 if (!offloaded)
1383 ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
1357 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); 1384 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
1358 } else { 1385 } else {
1359 ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */ 1386 if (!offloaded)
1387 ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
1360 } 1388 }
1361 1389
1362 /* Now handle the beginnings of any new-to-this-CPU grace periods. */ 1390 /* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@ -1657,6 +1685,7 @@ static void rcu_gp_cleanup(void)
1657 unsigned long gp_duration; 1685 unsigned long gp_duration;
1658 bool needgp = false; 1686 bool needgp = false;
1659 unsigned long new_gp_seq; 1687 unsigned long new_gp_seq;
1688 bool offloaded;
1660 struct rcu_data *rdp; 1689 struct rcu_data *rdp;
1661 struct rcu_node *rnp = rcu_get_root(); 1690 struct rcu_node *rnp = rcu_get_root();
1662 struct swait_queue_head *sq; 1691 struct swait_queue_head *sq;
@@ -1722,7 +1751,9 @@ static void rcu_gp_cleanup(void)
1722 needgp = true; 1751 needgp = true;
1723 } 1752 }
1724 /* Advance CBs to reduce false positives below. */ 1753 /* Advance CBs to reduce false positives below. */
1725 if (!rcu_accelerate_cbs(rnp, rdp) && needgp) { 1754 offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1755 rcu_segcblist_is_offloaded(&rdp->cblist);
1756 if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
1726 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); 1757 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
1727 rcu_state.gp_req_activity = jiffies; 1758 rcu_state.gp_req_activity = jiffies;
1728 trace_rcu_grace_period(rcu_state.name, 1759 trace_rcu_grace_period(rcu_state.name,
@@ -1916,7 +1947,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
1916{ 1947{
1917 unsigned long flags; 1948 unsigned long flags;
1918 unsigned long mask; 1949 unsigned long mask;
1919 bool needwake; 1950 bool needwake = false;
1951 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
1952 rcu_segcblist_is_offloaded(&rdp->cblist);
1920 struct rcu_node *rnp; 1953 struct rcu_node *rnp;
1921 1954
1922 rnp = rdp->mynode; 1955 rnp = rdp->mynode;
@@ -1943,7 +1976,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
1943 * This GP can't end until cpu checks in, so all of our 1976 * This GP can't end until cpu checks in, so all of our
1944 * callbacks can be processed during the next GP. 1977 * callbacks can be processed during the next GP.
1945 */ 1978 */
1946 needwake = rcu_accelerate_cbs(rnp, rdp); 1979 if (!offloaded)
1980 needwake = rcu_accelerate_cbs(rnp, rdp);
1947 1981
1948 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); 1982 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
1949 /* ^^^ Released rnp->lock */ 1983 /* ^^^ Released rnp->lock */
@@ -2077,9 +2111,12 @@ int rcutree_dead_cpu(unsigned int cpu)
2077static void rcu_do_batch(struct rcu_data *rdp) 2111static void rcu_do_batch(struct rcu_data *rdp)
2078{ 2112{
2079 unsigned long flags; 2113 unsigned long flags;
2114 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2115 rcu_segcblist_is_offloaded(&rdp->cblist);
2080 struct rcu_head *rhp; 2116 struct rcu_head *rhp;
2081 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); 2117 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
2082 long bl, count; 2118 long bl, count;
2119 long pending, tlimit = 0;
2083 2120
2084 /* If no callbacks are ready, just return. */ 2121 /* If no callbacks are ready, just return. */
2085 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { 2122 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2099,13 +2136,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
2099 * callback counts, as rcu_barrier() needs to be conservative. 2136 * callback counts, as rcu_barrier() needs to be conservative.
2100 */ 2137 */
2101 local_irq_save(flags); 2138 local_irq_save(flags);
2139 rcu_nocb_lock(rdp);
2102 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2140 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
2103 bl = rdp->blimit; 2141 pending = rcu_segcblist_n_cbs(&rdp->cblist);
2142 bl = max(rdp->blimit, pending >> rcu_divisor);
2143 if (unlikely(bl > 100))
2144 tlimit = local_clock() + rcu_resched_ns;
2104 trace_rcu_batch_start(rcu_state.name, 2145 trace_rcu_batch_start(rcu_state.name,
2105 rcu_segcblist_n_lazy_cbs(&rdp->cblist), 2146 rcu_segcblist_n_lazy_cbs(&rdp->cblist),
2106 rcu_segcblist_n_cbs(&rdp->cblist), bl); 2147 rcu_segcblist_n_cbs(&rdp->cblist), bl);
2107 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); 2148 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
2108 local_irq_restore(flags); 2149 if (offloaded)
2150 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
2151 rcu_nocb_unlock_irqrestore(rdp, flags);
2109 2152
2110 /* Invoke callbacks. */ 2153 /* Invoke callbacks. */
2111 rhp = rcu_cblist_dequeue(&rcl); 2154 rhp = rcu_cblist_dequeue(&rcl);
@@ -2117,13 +2160,29 @@ static void rcu_do_batch(struct rcu_data *rdp)
2117 * Stop only if limit reached and CPU has something to do. 2160 * Stop only if limit reached and CPU has something to do.
2118 * Note: The rcl structure counts down from zero. 2161 * Note: The rcl structure counts down from zero.
2119 */ 2162 */
2120 if (-rcl.len >= bl && 2163 if (-rcl.len >= bl && !offloaded &&
2121 (need_resched() || 2164 (need_resched() ||
2122 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2165 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
2123 break; 2166 break;
2167 if (unlikely(tlimit)) {
2168 /* only call local_clock() every 32 callbacks */
2169 if (likely((-rcl.len & 31) || local_clock() < tlimit))
2170 continue;
2171 /* Exceeded the time limit, so leave. */
2172 break;
2173 }
2174 if (offloaded) {
2175 WARN_ON_ONCE(in_serving_softirq());
2176 local_bh_enable();
2177 lockdep_assert_irqs_enabled();
2178 cond_resched_tasks_rcu_qs();
2179 lockdep_assert_irqs_enabled();
2180 local_bh_disable();
2181 }
2124 } 2182 }
2125 2183
2126 local_irq_save(flags); 2184 local_irq_save(flags);
2185 rcu_nocb_lock(rdp);
2127 count = -rcl.len; 2186 count = -rcl.len;
2128 trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), 2187 trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
2129 is_idle_task(current), rcu_is_callbacks_kthread()); 2188 is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2149,12 +2208,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
2149 * The following usually indicates a double call_rcu(). To track 2208 * The following usually indicates a double call_rcu(). To track
2150 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. 2209 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
2151 */ 2210 */
2152 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); 2211 WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
2212 WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2213 count != 0 && rcu_segcblist_empty(&rdp->cblist));
2153 2214
2154 local_irq_restore(flags); 2215 rcu_nocb_unlock_irqrestore(rdp, flags);
2155 2216
2156 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2217 /* Re-invoke RCU core processing if there are callbacks remaining. */
2157 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 2218 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
2158 invoke_rcu_core(); 2219 invoke_rcu_core();
2159} 2220}
2160 2221
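
With the hunks above, rcu_do_batch() scales its batch limit with the backlog (pending >> rcu_divisor) and, once that limit exceeds 100, arms a time budget of rcu_resched_ns, consulting local_clock() only every 32 callbacks to keep it off the fast path. A compressed standalone model of that loop shape (our names and clock source, not the kernel's):

/* Standalone model of the batch-limit / time-budget loop shape in the
 * reworked rcu_do_batch(); a sketch, not the kernel implementation. */
#include <stdio.h>
#include <time.h>

#define NSEC_PER_MSEC 1000000LL

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

int main(void)
{
	long blimit = 10, rcu_divisor = 7;
	long long rcu_resched_ns = 3 * NSEC_PER_MSEC;
	long pending = 100000, invoked = 0;
	long bl = blimit > (pending >> rcu_divisor) ? blimit : pending >> rcu_divisor;
	long long tlimit = bl > 100 ? now_ns() + rcu_resched_ns : 0;

	while (pending > 0) {
		pending--;			/* "invoke" one callback */
		invoked++;
		if (invoked >= bl)		/* batch limit reached */
			break;
		/* Consult the clock only once every 32 callbacks. */
		if (tlimit && !(invoked & 31) && now_ns() >= tlimit)
			break;			/* time budget exceeded */
	}
	printf("invoked %ld of %ld callbacks (bl=%ld)\n",
	       invoked, invoked + pending, bl);
	return 0;
}

The 32-callback stride keeps the comparatively expensive clock read off the common path while still bounding how long a single batch can monopolize the CPU.
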
@@ -2280,6 +2341,8 @@ static __latent_entropy void rcu_core(void)
2280 unsigned long flags; 2341 unsigned long flags;
2281 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); 2342 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
2282 struct rcu_node *rnp = rdp->mynode; 2343 struct rcu_node *rnp = rdp->mynode;
2344 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2345 rcu_segcblist_is_offloaded(&rdp->cblist);
2283 2346
2284 if (cpu_is_offline(smp_processor_id())) 2347 if (cpu_is_offline(smp_processor_id()))
2285 return; 2348 return;
@@ -2299,7 +2362,7 @@ static __latent_entropy void rcu_core(void)
2299 2362
2300 /* No grace period and unregistered callbacks? */ 2363 /* No grace period and unregistered callbacks? */
2301 if (!rcu_gp_in_progress() && 2364 if (!rcu_gp_in_progress() &&
2302 rcu_segcblist_is_enabled(&rdp->cblist)) { 2365 rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
2303 local_irq_save(flags); 2366 local_irq_save(flags);
2304 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) 2367 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2305 rcu_accelerate_cbs_unlocked(rnp, rdp); 2368 rcu_accelerate_cbs_unlocked(rnp, rdp);
@@ -2309,7 +2372,7 @@ static __latent_entropy void rcu_core(void)
2309 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); 2372 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
2310 2373
2311 /* If there are callbacks ready, invoke them. */ 2374 /* If there are callbacks ready, invoke them. */
2312 if (rcu_segcblist_ready_cbs(&rdp->cblist) && 2375 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
2313 likely(READ_ONCE(rcu_scheduler_fully_active))) 2376 likely(READ_ONCE(rcu_scheduler_fully_active)))
2314 rcu_do_batch(rdp); 2377 rcu_do_batch(rdp);
2315 2378
@@ -2489,10 +2552,11 @@ static void rcu_leak_callback(struct rcu_head *rhp)
2489 * is expected to specify a CPU. 2552 * is expected to specify a CPU.
2490 */ 2553 */
2491static void 2554static void
2492__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) 2555__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
2493{ 2556{
2494 unsigned long flags; 2557 unsigned long flags;
2495 struct rcu_data *rdp; 2558 struct rcu_data *rdp;
2559 bool was_alldone;
2496 2560
2497 /* Misaligned rcu_head! */ 2561 /* Misaligned rcu_head! */
2498 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); 2562 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2514,28 +2578,18 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
2514 rdp = this_cpu_ptr(&rcu_data); 2578 rdp = this_cpu_ptr(&rcu_data);
2515 2579
2516 /* Add the callback to our list. */ 2580 /* Add the callback to our list. */
2517 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { 2581 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
2518 int offline; 2582 // This can trigger due to call_rcu() from offline CPU:
2519 2583 WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
2520 if (cpu != -1)
2521 rdp = per_cpu_ptr(&rcu_data, cpu);
2522 if (likely(rdp->mynode)) {
2523 /* Post-boot, so this should be for a no-CBs CPU. */
2524 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2525 WARN_ON_ONCE(offline);
2526 /* Offline CPU, _call_rcu() illegal, leak callback. */
2527 local_irq_restore(flags);
2528 return;
2529 }
2530 /*
2531 * Very early boot, before rcu_init(). Initialize if needed
2532 * and then drop through to queue the callback.
2533 */
2534 WARN_ON_ONCE(cpu != -1);
2535 WARN_ON_ONCE(!rcu_is_watching()); 2584 WARN_ON_ONCE(!rcu_is_watching());
2585 // Very early boot, before rcu_init(). Initialize if needed
2586 // and then drop through to queue the callback.
2536 if (rcu_segcblist_empty(&rdp->cblist)) 2587 if (rcu_segcblist_empty(&rdp->cblist))
2537 rcu_segcblist_init(&rdp->cblist); 2588 rcu_segcblist_init(&rdp->cblist);
2538 } 2589 }
2590 if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
2591 return; // Enqueued onto ->nocb_bypass, so just leave.
2592 /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
2539 rcu_segcblist_enqueue(&rdp->cblist, head, lazy); 2593 rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
2540 if (__is_kfree_rcu_offset((unsigned long)func)) 2594 if (__is_kfree_rcu_offset((unsigned long)func))
2541 trace_rcu_kfree_callback(rcu_state.name, head, 2595 trace_rcu_kfree_callback(rcu_state.name, head,
@@ -2548,8 +2602,13 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
2548 rcu_segcblist_n_cbs(&rdp->cblist)); 2602 rcu_segcblist_n_cbs(&rdp->cblist));
2549 2603
2550 /* Go handle any RCU core processing required. */ 2604 /* Go handle any RCU core processing required. */
2551 __call_rcu_core(rdp, head, flags); 2605 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
2552 local_irq_restore(flags); 2606 unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
2607 __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
2608 } else {
2609 __call_rcu_core(rdp, head, flags);
2610 local_irq_restore(flags);
2611 }
2553} 2612}
2554 2613
2555/** 2614/**
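
After this rework the tail of __call_rcu() is a three-way dispatch: try the NOCB bypass, otherwise enqueue on ->cblist and then hand off either to the no-CBs wakeup path (offloaded ->cblist) or to the usual core processing. A stripped-down control-flow model with stubbed helpers (not the kernel functions themselves):

/* Control-flow model of the reworked __call_rcu() tail; every function
 * here is a stub standing in for the kernel code of the same purpose. */
#include <stdbool.h>
#include <stdio.h>

static bool nocb_try_bypass(void)	{ return false; }  /* pretend bypass declined */
static bool cblist_is_offloaded(void)	{ return true; }   /* pretend a no-CBs CPU */
static void cblist_enqueue(void)	{ puts("enqueue on ->cblist"); }
static void nocb_wake(void)		{ puts("nocb wake: kick the nocb GP kthread"); }
static void core_wake(void)		{ puts("core path: maybe raise RCU softirq"); }

int main(void)
{
	if (nocb_try_bypass())
		return 0;		/* enqueued onto ->nocb_bypass, done */

	cblist_enqueue();
	if (cblist_is_offloaded())
		nocb_wake();		/* offloaded: no softirq involvement */
	else
		core_wake();		/* not offloaded: classic path */
	return 0;
}
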
@@ -2589,7 +2648,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
2589 */ 2648 */
2590void call_rcu(struct rcu_head *head, rcu_callback_t func) 2649void call_rcu(struct rcu_head *head, rcu_callback_t func)
2591{ 2650{
2592 __call_rcu(head, func, -1, 0); 2651 __call_rcu(head, func, 0);
2593} 2652}
2594EXPORT_SYMBOL_GPL(call_rcu); 2653EXPORT_SYMBOL_GPL(call_rcu);
2595 2654
@@ -2602,7 +2661,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
2602 */ 2661 */
2603void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) 2662void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
2604{ 2663{
2605 __call_rcu(head, func, -1, 1); 2664 __call_rcu(head, func, 1);
2606} 2665}
2607EXPORT_SYMBOL_GPL(kfree_call_rcu); 2666EXPORT_SYMBOL_GPL(kfree_call_rcu);
2608 2667
@@ -2735,6 +2794,10 @@ static int rcu_pending(void)
2735 /* Check for CPU stalls, if enabled. */ 2794 /* Check for CPU stalls, if enabled. */
2736 check_cpu_stall(rdp); 2795 check_cpu_stall(rdp);
2737 2796
2797 /* Does this CPU need a deferred NOCB wakeup? */
2798 if (rcu_nocb_need_deferred_wakeup(rdp))
2799 return 1;
2800
2738 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ 2801 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
2739 if (rcu_nohz_full_cpu()) 2802 if (rcu_nohz_full_cpu())
2740 return 0; 2803 return 0;
@@ -2750,6 +2813,8 @@ static int rcu_pending(void)
2750 /* Has RCU gone idle with this CPU needing another grace period? */ 2813 /* Has RCU gone idle with this CPU needing another grace period? */
2751 if (!rcu_gp_in_progress() && 2814 if (!rcu_gp_in_progress() &&
2752 rcu_segcblist_is_enabled(&rdp->cblist) && 2815 rcu_segcblist_is_enabled(&rdp->cblist) &&
2816 (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
2817 !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
2753 !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) 2818 !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
2754 return 1; 2819 return 1;
2755 2820
@@ -2758,10 +2823,6 @@ static int rcu_pending(void)
2758 unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ 2823 unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
2759 return 1; 2824 return 1;
2760 2825
2761 /* Does this CPU need a deferred NOCB wakeup? */
2762 if (rcu_nocb_need_deferred_wakeup(rdp))
2763 return 1;
2764
2765 /* nothing to do */ 2826 /* nothing to do */
2766 return 0; 2827 return 0;
2767} 2828}
@@ -2801,6 +2862,8 @@ static void rcu_barrier_func(void *unused)
2801 rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); 2862 rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
2802 rdp->barrier_head.func = rcu_barrier_callback; 2863 rdp->barrier_head.func = rcu_barrier_callback;
2803 debug_rcu_head_queue(&rdp->barrier_head); 2864 debug_rcu_head_queue(&rdp->barrier_head);
2865 rcu_nocb_lock(rdp);
2866 WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
2804 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { 2867 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
2805 atomic_inc(&rcu_state.barrier_cpu_count); 2868 atomic_inc(&rcu_state.barrier_cpu_count);
2806 } else { 2869 } else {
@@ -2808,6 +2871,7 @@ static void rcu_barrier_func(void *unused)
2808 rcu_barrier_trace(TPS("IRQNQ"), -1, 2871 rcu_barrier_trace(TPS("IRQNQ"), -1,
2809 rcu_state.barrier_sequence); 2872 rcu_state.barrier_sequence);
2810 } 2873 }
2874 rcu_nocb_unlock(rdp);
2811} 2875}
2812 2876
2813/** 2877/**
@@ -2858,22 +2922,11 @@ void rcu_barrier(void)
2858 * corresponding CPU's preceding callbacks have been invoked. 2922 * corresponding CPU's preceding callbacks have been invoked.
2859 */ 2923 */
2860 for_each_possible_cpu(cpu) { 2924 for_each_possible_cpu(cpu) {
2861 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
2862 continue;
2863 rdp = per_cpu_ptr(&rcu_data, cpu); 2925 rdp = per_cpu_ptr(&rcu_data, cpu);
2864 if (rcu_is_nocb_cpu(cpu)) { 2926 if (!cpu_online(cpu) &&
2865 if (!rcu_nocb_cpu_needs_barrier(cpu)) { 2927 !rcu_segcblist_is_offloaded(&rdp->cblist))
2866 rcu_barrier_trace(TPS("OfflineNoCB"), cpu, 2928 continue;
2867 rcu_state.barrier_sequence); 2929 if (rcu_segcblist_n_cbs(&rdp->cblist)) {
2868 } else {
2869 rcu_barrier_trace(TPS("OnlineNoCB"), cpu,
2870 rcu_state.barrier_sequence);
2871 smp_mb__before_atomic();
2872 atomic_inc(&rcu_state.barrier_cpu_count);
2873 __call_rcu(&rdp->barrier_head,
2874 rcu_barrier_callback, cpu, 0);
2875 }
2876 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
2877 rcu_barrier_trace(TPS("OnlineQ"), cpu, 2930 rcu_barrier_trace(TPS("OnlineQ"), cpu,
2878 rcu_state.barrier_sequence); 2931 rcu_state.barrier_sequence);
2879 smp_call_function_single(cpu, rcu_barrier_func, NULL, 1); 2932 smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
@@ -2958,7 +3011,8 @@ rcu_boot_init_percpu_data(int cpu)
2958 * Initializes a CPU's per-CPU RCU data. Note that only one online or 3011 * Initializes a CPU's per-CPU RCU data. Note that only one online or
2959 * offline event can be happening at a given time. Note also that we can 3012 * offline event can be happening at a given time. Note also that we can
2960 * accept some slop in the rsp->gp_seq access due to the fact that this 3013 * accept some slop in the rsp->gp_seq access due to the fact that this
2961 * CPU cannot possibly have any RCU callbacks in flight yet. 3014 * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
3015 * And any offloaded callbacks are being numbered elsewhere.
2962 */ 3016 */
2963int rcutree_prepare_cpu(unsigned int cpu) 3017int rcutree_prepare_cpu(unsigned int cpu)
2964{ 3018{
@@ -2972,7 +3026,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
2972 rdp->n_force_qs_snap = rcu_state.n_force_qs; 3026 rdp->n_force_qs_snap = rcu_state.n_force_qs;
2973 rdp->blimit = blimit; 3027 rdp->blimit = blimit;
2974 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ 3028 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
2975 !init_nocb_callback_list(rdp)) 3029 !rcu_segcblist_is_offloaded(&rdp->cblist))
2976 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ 3030 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
2977 rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ 3031 rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
2978 rcu_dynticks_eqs_online(); 3032 rcu_dynticks_eqs_online();
@@ -3151,29 +3205,38 @@ void rcutree_migrate_callbacks(int cpu)
3151{ 3205{
3152 unsigned long flags; 3206 unsigned long flags;
3153 struct rcu_data *my_rdp; 3207 struct rcu_data *my_rdp;
3208 struct rcu_node *my_rnp;
3154 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 3209 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
3155 struct rcu_node *rnp_root = rcu_get_root();
3156 bool needwake; 3210 bool needwake;
3157 3211
3158 if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) 3212 if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
3213 rcu_segcblist_empty(&rdp->cblist))
3159 return; /* No callbacks to migrate. */ 3214 return; /* No callbacks to migrate. */
3160 3215
3161 local_irq_save(flags); 3216 local_irq_save(flags);
3162 my_rdp = this_cpu_ptr(&rcu_data); 3217 my_rdp = this_cpu_ptr(&rcu_data);
3163 if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { 3218 my_rnp = my_rdp->mynode;
3164 local_irq_restore(flags); 3219 rcu_nocb_lock(my_rdp); /* irqs already disabled. */
3165 return; 3220 WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
3166 } 3221 raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
3167 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
3168 /* Leverage recent GPs and set GP for new callbacks. */ 3222 /* Leverage recent GPs and set GP for new callbacks. */
3169 needwake = rcu_advance_cbs(rnp_root, rdp) || 3223 needwake = rcu_advance_cbs(my_rnp, rdp) ||
3170 rcu_advance_cbs(rnp_root, my_rdp); 3224 rcu_advance_cbs(my_rnp, my_rdp);
3171 rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); 3225 rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
3226 needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
3227 rcu_segcblist_disable(&rdp->cblist);
3172 WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != 3228 WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
3173 !rcu_segcblist_n_cbs(&my_rdp->cblist)); 3229 !rcu_segcblist_n_cbs(&my_rdp->cblist));
3174 raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); 3230 if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
3231 raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
3232 __call_rcu_nocb_wake(my_rdp, true, flags);
3233 } else {
3234 rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
3235 raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
3236 }
3175 if (needwake) 3237 if (needwake)
3176 rcu_gp_kthread_wake(); 3238 rcu_gp_kthread_wake();
3239 lockdep_assert_irqs_enabled();
3177 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || 3240 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
3178 !rcu_segcblist_empty(&rdp->cblist), 3241 !rcu_segcblist_empty(&rdp->cblist),
3179 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", 3242 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
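
The rcu_barrier() hunk above drops the special-cased no-CBs path: every CPU whose ->cblist holds callbacks, offloaded or not, gets a barrier callback entrained after its bypass list is flushed, with an atomic count of outstanding callbacks gating completion. The counting pattern reduced to a standalone sketch (single-threaded, not the kernel implementation):

/* Reduced model of the rcu_barrier() counting pattern: entrain one
 * callback per CPU that has callbacks, then wait for all of them. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int barrier_cpu_count = 1;	/* self-reference, dropped at the end */
static bool completed;

static void barrier_callback(void)
{
	/* Runs after all earlier callbacks on that CPU have been invoked. */
	if (atomic_fetch_sub(&barrier_cpu_count, 1) == 1)
		completed = true;
}

int main(void)
{
	bool cpu_has_cbs[NCPUS] = { true, false, true, true };
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (!cpu_has_cbs[cpu])
			continue;	/* nothing queued, nothing to wait for */
		atomic_fetch_add(&barrier_cpu_count, 1);
	}

	/* Later, as each entrained callback is invoked: */
	for (cpu = 0; cpu < NCPUS; cpu++)
		if (cpu_has_cbs[cpu])
			barrier_callback();

	/* Drop the initial self-reference, as rcu_barrier() does. */
	barrier_callback();
	printf("barrier complete: %s\n", completed ? "yes" : "no");
	return 0;
}
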
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7acaf3a62d39..c612f306fe89 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -194,29 +194,38 @@ struct rcu_data {
194 194
195 /* 5) Callback offloading. */ 195 /* 5) Callback offloading. */
196#ifdef CONFIG_RCU_NOCB_CPU 196#ifdef CONFIG_RCU_NOCB_CPU
197 struct rcu_head *nocb_head; /* CBs waiting for kthread. */ 197 struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
198 struct rcu_head **nocb_tail; 198 struct task_struct *nocb_gp_kthread;
199 atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
200 atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
201 struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
202 struct rcu_head **nocb_follower_tail;
203 struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
204 struct task_struct *nocb_kthread;
205 raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ 199 raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
200 atomic_t nocb_lock_contended; /* Contention experienced. */
206 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 201 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
207 struct timer_list nocb_timer; /* Enforce finite deferral. */ 202 struct timer_list nocb_timer; /* Enforce finite deferral. */
208 203 unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
209 /* The following fields are used by the leader, hence own cacheline. */ 204
210 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 205 /* The following fields are used by call_rcu, hence own cacheline. */
211 /* CBs waiting for GP. */ 206 raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
212 struct rcu_head **nocb_gp_tail; 207 struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
213 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */ 208 unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */
214 struct rcu_data *nocb_next_follower; 209 unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */
215 /* Next follower in wakeup chain. */ 210 int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */
216 211
217 /* The following fields are used by the follower, hence new cachline. */ 212 /* The following fields are used by GP kthread, hence own cacheline. */
218 struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; 213 raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
219 /* Leader CPU takes GP-end wakeups. */ 214 struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
215 u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
216 u8 nocb_gp_bypass; /* Found a bypass on last scan? */
217 u8 nocb_gp_gp; /* GP to wait for on last scan? */
218 unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */
219 unsigned long nocb_gp_loops; /* # passes through wait code. */
220 struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
221 bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
222 struct task_struct *nocb_cb_kthread;
223 struct rcu_data *nocb_next_cb_rdp;
224 /* Next rcu_data in wakeup chain. */
225
226 /* The following fields are used by CB kthread, hence new cacheline. */
227 struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
228 /* GP rdp takes GP-end wakeups. */
220#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 229#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
221 230
222 /* 6) RCU priority boosting. */ 231 /* 6) RCU priority boosting. */
@@ -419,25 +428,39 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
419static bool rcu_preempt_need_deferred_qs(struct task_struct *t); 428static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
420static void rcu_preempt_deferred_qs(struct task_struct *t); 429static void rcu_preempt_deferred_qs(struct task_struct *t);
421static void zero_cpu_stall_ticks(struct rcu_data *rdp); 430static void zero_cpu_stall_ticks(struct rcu_data *rdp);
422static bool rcu_nocb_cpu_needs_barrier(int cpu);
423static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); 431static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
424static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); 432static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
425static void rcu_init_one_nocb(struct rcu_node *rnp); 433static void rcu_init_one_nocb(struct rcu_node *rnp);
426static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 434static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
427 bool lazy, unsigned long flags); 435 unsigned long j);
428static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, 436static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
429 struct rcu_data *rdp, 437 bool *was_alldone, unsigned long flags);
430 unsigned long flags); 438static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
439 unsigned long flags);
431static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); 440static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
432static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 441static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
433static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 442static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
434static void rcu_spawn_cpu_nocb_kthread(int cpu); 443static void rcu_spawn_cpu_nocb_kthread(int cpu);
435static void __init rcu_spawn_nocb_kthreads(void); 444static void __init rcu_spawn_nocb_kthreads(void);
445static void show_rcu_nocb_state(struct rcu_data *rdp);
446static void rcu_nocb_lock(struct rcu_data *rdp);
447static void rcu_nocb_unlock(struct rcu_data *rdp);
448static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
449 unsigned long flags);
450static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
436#ifdef CONFIG_RCU_NOCB_CPU 451#ifdef CONFIG_RCU_NOCB_CPU
437static void __init rcu_organize_nocb_kthreads(void); 452static void __init rcu_organize_nocb_kthreads(void);
438#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 453#define rcu_nocb_lock_irqsave(rdp, flags) \
439static bool init_nocb_callback_list(struct rcu_data *rdp); 454do { \
440static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp); 455 if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
456 local_irq_save(flags); \
457 else \
458 raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
459} while (0)
460#else /* #ifdef CONFIG_RCU_NOCB_CPU */
461#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
462#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
463
441static void rcu_bind_gp_kthread(void); 464static void rcu_bind_gp_kthread(void);
442static bool rcu_nohz_full_cpu(void); 465static bool rcu_nohz_full_cpu(void);
443static void rcu_dynticks_task_enter(void); 466static void rcu_dynticks_task_enter(void);
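
The new rcu_nocb_lock_irqsave() compiles to plain local_irq_save() unless the ->cblist is offloaded, in which case ->nocb_lock is taken with interrupts disabled; with CONFIG_RCU_NOCB_CPU=n it is unconditionally the IRQ-only form. The shape of such a conditional-locking wrapper, modelled in userspace with our own names (a mutex stands in for the raw spinlock and a flag for the IRQ state):

/* Userspace model of a conditional-locking wrapper in the style of
 * rcu_nocb_lock_irqsave(); names and types are ours, not the kernel's. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_rdp {
	bool offloaded;			/* models rcu_segcblist_is_offloaded() */
	pthread_mutex_t nocb_lock;	/* models the raw ->nocb_lock */
};

#define fake_nocb_lock_irqsave(rdp, flags)			\
do {								\
	(flags) = true;		/* "interrupts disabled" */	\
	if ((rdp)->offloaded)					\
		pthread_mutex_lock(&(rdp)->nocb_lock);		\
} while (0)

#define fake_nocb_unlock_irqrestore(rdp, flags)			\
do {								\
	if ((rdp)->offloaded)					\
		pthread_mutex_unlock(&(rdp)->nocb_lock);	\
	(flags) = false;	/* "interrupts restored" */	\
} while (0)

int main(void)
{
	struct fake_rdp rdp = { .offloaded = true,
				.nocb_lock = PTHREAD_MUTEX_INITIALIZER };
	bool flags;

	fake_nocb_lock_irqsave(&rdp, flags);
	printf("critical section, offloaded=%d irqs_off=%d\n",
	       rdp.offloaded, flags);
	fake_nocb_unlock_irqrestore(&rdp, flags);
	return 0;
}
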
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index af7e7b9c86af..d632cd019597 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
781 * other hand, if the CPU is not in an RCU read-side critical section, 781 * other hand, if the CPU is not in an RCU read-side critical section,
782 * the IPI handler reports the quiescent state immediately. 782 * the IPI handler reports the quiescent state immediately.
783 * 783 *
784 * Although this is a greate improvement over previous expedited 784 * Although this is a great improvement over previous expedited
785 * implementations, it is still unfriendly to real-time workloads, so is 785 * implementations, it is still unfriendly to real-time workloads, so is
786 * thus not recommended for any sort of common-case code. In fact, if 786 * thus not recommended for any sort of common-case code. In fact, if
787 * you are using synchronize_rcu_expedited() in a loop, please restructure 787 * you are using synchronize_rcu_expedited() in a loop, please restructure
@@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
792 */ 792 */
793void synchronize_rcu_expedited(void) 793void synchronize_rcu_expedited(void)
794{ 794{
795 bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
795 struct rcu_exp_work rew; 796 struct rcu_exp_work rew;
796 struct rcu_node *rnp; 797 struct rcu_node *rnp;
797 unsigned long s; 798 unsigned long s;
@@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void)
817 return; /* Someone else did our work for us. */ 818 return; /* Someone else did our work for us. */
818 819
819 /* Ensure that load happens before action based on it. */ 820 /* Ensure that load happens before action based on it. */
820 if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { 821 if (unlikely(boottime)) {
821 /* Direct call during scheduler init and early_initcalls(). */ 822 /* Direct call during scheduler init and early_initcalls(). */
822 rcu_exp_sel_wait_wake(s); 823 rcu_exp_sel_wait_wake(s);
823 } else { 824 } else {
@@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void)
835 836
836 /* Let the next expedited grace period start. */ 837 /* Let the next expedited grace period start. */
837 mutex_unlock(&rcu_state.exp_mutex); 838 mutex_unlock(&rcu_state.exp_mutex);
839
840 if (likely(!boottime))
841 destroy_work_on_stack(&rew.rew_work);
838} 842}
839EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 843EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
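The hunk above caches the boot-time test in a local boottime flag so that the decision made at the top of synchronize_rcu_expedited() (direct call vs. on-stack work item handed to a workqueue) is guaranteed to match the cleanup at the bottom, where destroy_work_on_stack() is called only when the work item was actually used. A minimal sketch of that "sample the condition once" pattern, with hypothetical names and a heap buffer standing in for the on-stack work item:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static bool early_boot;			/* stands in for the scheduler-state test */

static void do_request(void)
{
	bool direct = early_boot;	/* sample the global exactly once */
	char *work = NULL;		/* stands in for the work item */

	if (direct) {
		puts("early boot: run the request inline");
	} else {
		work = malloc(64);	/* "init" the helper object */
		puts("normal path: hand the request to a helper");
	}

	/* ... request completes ... */

	if (!direct)			/* same flag, so init and teardown always pair up */
		free(work);
}

int main(void)
{
	early_boot = true;
	do_request();
	early_boot = false;
	do_request();
	return 0;
}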
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index acb225023ed1..2defc7fe74c3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt)
288 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 288 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
289 struct rcu_node *rnp; 289 struct rcu_node *rnp;
290 290
291 barrier(); /* Avoid RCU read-side critical sections leaking down. */
292 trace_rcu_utilization(TPS("Start context switch")); 291 trace_rcu_utilization(TPS("Start context switch"));
293 lockdep_assert_irqs_disabled(); 292 lockdep_assert_irqs_disabled();
294 WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); 293 WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
@@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt)
314 ? rnp->gp_seq 313 ? rnp->gp_seq
315 : rcu_seq_snap(&rnp->gp_seq)); 314 : rcu_seq_snap(&rnp->gp_seq));
316 rcu_preempt_ctxt_queue(rnp, rdp); 315 rcu_preempt_ctxt_queue(rnp, rdp);
317 } else if (t->rcu_read_lock_nesting < 0 &&
318 t->rcu_read_unlock_special.s) {
319
320 /*
321 * Complete exit from RCU read-side critical section on
322 * behalf of preempted instance of __rcu_read_unlock().
323 */
324 rcu_read_unlock_special(t);
325 rcu_preempt_deferred_qs(t);
326 } else { 316 } else {
327 rcu_preempt_deferred_qs(t); 317 rcu_preempt_deferred_qs(t);
328 } 318 }
@@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt)
340 if (rdp->exp_deferred_qs) 330 if (rdp->exp_deferred_qs)
341 rcu_report_exp_rdp(rdp); 331 rcu_report_exp_rdp(rdp);
342 trace_rcu_utilization(TPS("End context switch")); 332 trace_rcu_utilization(TPS("End context switch"));
343 barrier(); /* Avoid RCU read-side critical sections leaking up. */
344} 333}
345EXPORT_SYMBOL_GPL(rcu_note_context_switch); 334EXPORT_SYMBOL_GPL(rcu_note_context_switch);
346 335
@@ -626,22 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t)
626 (rdp->grpmask & rnp->expmask) || 615 (rdp->grpmask & rnp->expmask) ||
627 tick_nohz_full_cpu(rdp->cpu); 616 tick_nohz_full_cpu(rdp->cpu);
628 // Need to defer quiescent state until everything is enabled. 617 // Need to defer quiescent state until everything is enabled.
629 if ((exp || in_irq()) && irqs_were_disabled && use_softirq && 618 if (irqs_were_disabled && use_softirq &&
630 (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { 619 (in_interrupt() ||
620 (exp && !t->rcu_read_unlock_special.b.deferred_qs))) {
631 // Using softirq, safe to awaken, and we get 621 // Using softirq, safe to awaken, and we get
632 // no help from enabling irqs, unlike bh/preempt. 622 // no help from enabling irqs, unlike bh/preempt.
633 raise_softirq_irqoff(RCU_SOFTIRQ); 623 raise_softirq_irqoff(RCU_SOFTIRQ);
634 } else if (exp && irqs_were_disabled && !use_softirq &&
635 !t->rcu_read_unlock_special.b.deferred_qs) {
636 // Safe to awaken and we get no help from enabling
637 // irqs, unlike bh/preempt.
638 invoke_rcu_core();
639 } else { 624 } else {
640 // Enabling BH or preempt does reschedule, so... 625 // Enabling BH or preempt does reschedule, so...
641 // Also if no expediting or NO_HZ_FULL, slow is OK. 626 // Also if no expediting or NO_HZ_FULL, slow is OK.
642 set_tsk_need_resched(current); 627 set_tsk_need_resched(current);
643 set_preempt_need_resched(); 628 set_preempt_need_resched();
644 if (IS_ENABLED(CONFIG_IRQ_WORK) && 629 if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
645 !rdp->defer_qs_iw_pending && exp) { 630 !rdp->defer_qs_iw_pending && exp) {
646 // Get scheduler to re-evaluate and call hooks. 631 // Get scheduler to re-evaluate and call hooks.
647 // If !IRQ_WORK, FQS scan will eventually IPI. 632 // If !IRQ_WORK, FQS scan will eventually IPI.
@@ -828,11 +813,6 @@ static void rcu_qs(void)
828 * dyntick-idle quiescent state visible to other CPUs, which will in 813 * dyntick-idle quiescent state visible to other CPUs, which will in
829 * some cases serve for expedited as well as normal grace periods. 814 * some cases serve for expedited as well as normal grace periods.
830 * Either way, register a lightweight quiescent state. 815 * Either way, register a lightweight quiescent state.
831 *
832 * The barrier() calls are redundant in the common case when this is
833 * called externally, but just in case this is called from within this
834 * file.
835 *
836 */ 816 */
837void rcu_all_qs(void) 817void rcu_all_qs(void)
838{ 818{
@@ -847,14 +827,12 @@ void rcu_all_qs(void)
847 return; 827 return;
848 } 828 }
849 this_cpu_write(rcu_data.rcu_urgent_qs, false); 829 this_cpu_write(rcu_data.rcu_urgent_qs, false);
850 barrier(); /* Avoid RCU read-side critical sections leaking down. */
851 if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { 830 if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
852 local_irq_save(flags); 831 local_irq_save(flags);
853 rcu_momentary_dyntick_idle(); 832 rcu_momentary_dyntick_idle();
854 local_irq_restore(flags); 833 local_irq_restore(flags);
855 } 834 }
856 rcu_qs(); 835 rcu_qs();
857 barrier(); /* Avoid RCU read-side critical sections leaking up. */
858 preempt_enable(); 836 preempt_enable();
859} 837}
860EXPORT_SYMBOL_GPL(rcu_all_qs); 838EXPORT_SYMBOL_GPL(rcu_all_qs);
@@ -864,7 +842,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs);
864 */ 842 */
865void rcu_note_context_switch(bool preempt) 843void rcu_note_context_switch(bool preempt)
866{ 844{
867 barrier(); /* Avoid RCU read-side critical sections leaking down. */
868 trace_rcu_utilization(TPS("Start context switch")); 845 trace_rcu_utilization(TPS("Start context switch"));
869 rcu_qs(); 846 rcu_qs();
870 /* Load rcu_urgent_qs before other flags. */ 847 /* Load rcu_urgent_qs before other flags. */
@@ -877,7 +854,6 @@ void rcu_note_context_switch(bool preempt)
877 rcu_tasks_qs(current); 854 rcu_tasks_qs(current);
878out: 855out:
879 trace_rcu_utilization(TPS("End context switch")); 856 trace_rcu_utilization(TPS("End context switch"));
880 barrier(); /* Avoid RCU read-side critical sections leaking up. */
881} 857}
882EXPORT_SYMBOL_GPL(rcu_note_context_switch); 858EXPORT_SYMBOL_GPL(rcu_note_context_switch);
883 859
@@ -1134,7 +1110,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1134 * already exist. We only create this kthread for preemptible RCU. 1110 * already exist. We only create this kthread for preemptible RCU.
1135 * Returns zero if all is well, a negated errno otherwise. 1111 * Returns zero if all is well, a negated errno otherwise.
1136 */ 1112 */
1137static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) 1113static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1138{ 1114{
1139 int rnp_index = rnp - rcu_get_root(); 1115 int rnp_index = rnp - rcu_get_root();
1140 unsigned long flags; 1116 unsigned long flags;
@@ -1142,25 +1118,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1142 struct task_struct *t; 1118 struct task_struct *t;
1143 1119
1144 if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) 1120 if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
1145 return 0; 1121 return;
1146 1122
1147 if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) 1123 if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
1148 return 0; 1124 return;
1149 1125
1150 rcu_state.boost = 1; 1126 rcu_state.boost = 1;
1127
1151 if (rnp->boost_kthread_task != NULL) 1128 if (rnp->boost_kthread_task != NULL)
1152 return 0; 1129 return;
1130
1153 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1131 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1154 "rcub/%d", rnp_index); 1132 "rcub/%d", rnp_index);
1155 if (IS_ERR(t)) 1133 if (WARN_ON_ONCE(IS_ERR(t)))
1156 return PTR_ERR(t); 1134 return;
1135
1157 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1136 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1158 rnp->boost_kthread_task = t; 1137 rnp->boost_kthread_task = t;
1159 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1138 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1160 sp.sched_priority = kthread_prio; 1139 sp.sched_priority = kthread_prio;
1161 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1140 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1162 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1141 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1163 return 0;
1164} 1142}
1165 1143
1166/* 1144/*
@@ -1201,7 +1179,7 @@ static void __init rcu_spawn_boost_kthreads(void)
1201 struct rcu_node *rnp; 1179 struct rcu_node *rnp;
1202 1180
1203 rcu_for_each_leaf_node(rnp) 1181 rcu_for_each_leaf_node(rnp)
1204 (void)rcu_spawn_one_boost_kthread(rnp); 1182 rcu_spawn_one_boost_kthread(rnp);
1205} 1183}
1206 1184
1207static void rcu_prepare_kthreads(int cpu) 1185static void rcu_prepare_kthreads(int cpu)
@@ -1211,7 +1189,7 @@ static void rcu_prepare_kthreads(int cpu)
1211 1189
1212 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1190 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1213 if (rcu_scheduler_fully_active) 1191 if (rcu_scheduler_fully_active)
1214 (void)rcu_spawn_one_boost_kthread(rnp); 1192 rcu_spawn_one_boost_kthread(rnp);
1215} 1193}
1216 1194
1217#else /* #ifdef CONFIG_RCU_BOOST */ 1195#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1248,10 +1226,10 @@ static void rcu_prepare_kthreads(int cpu)
1248#if !defined(CONFIG_RCU_FAST_NO_HZ) 1226#if !defined(CONFIG_RCU_FAST_NO_HZ)
1249 1227
1250/* 1228/*
1251 * Check to see if any future RCU-related work will need to be done 1229 * Check to see if any future non-offloaded RCU-related work will need
1252 * by the current CPU, even if none need be done immediately, returning 1230 * to be done by the current CPU, even if none need be done immediately,
1253 * 1 if so. This function is part of the RCU implementation; it is -not- 1231 * returning 1 if so. This function is part of the RCU implementation;
1254 * an exported member of the RCU API. 1232 * it is -not- an exported member of the RCU API.
1255 * 1233 *
1256 * Because we do not have RCU_FAST_NO_HZ, just check whether or not this 1234 * Because we do not have RCU_FAST_NO_HZ, just check whether or not this
1257 * CPU has RCU callbacks queued. 1235 * CPU has RCU callbacks queued.
@@ -1259,7 +1237,8 @@ static void rcu_prepare_kthreads(int cpu)
1259int rcu_needs_cpu(u64 basemono, u64 *nextevt) 1237int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1260{ 1238{
1261 *nextevt = KTIME_MAX; 1239 *nextevt = KTIME_MAX;
1262 return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); 1240 return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
1241 !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
1263} 1242}
1264 1243
1265/* 1244/*
@@ -1360,8 +1339,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1360 1339
1361 lockdep_assert_irqs_disabled(); 1340 lockdep_assert_irqs_disabled();
1362 1341
1363 /* If no callbacks, RCU doesn't need the CPU. */ 1342 /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
1364 if (rcu_segcblist_empty(&rdp->cblist)) { 1343 if (rcu_segcblist_empty(&rdp->cblist) ||
1344 rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
1365 *nextevt = KTIME_MAX; 1345 *nextevt = KTIME_MAX;
1366 return 0; 1346 return 0;
1367 } 1347 }
@@ -1404,7 +1384,7 @@ static void rcu_prepare_for_idle(void)
1404 int tne; 1384 int tne;
1405 1385
1406 lockdep_assert_irqs_disabled(); 1386 lockdep_assert_irqs_disabled();
1407 if (rcu_is_nocb_cpu(smp_processor_id())) 1387 if (rcu_segcblist_is_offloaded(&rdp->cblist))
1408 return; 1388 return;
1409 1389
1410 /* Handle nohz enablement switches conservatively. */ 1390 /* Handle nohz enablement switches conservatively. */
@@ -1453,8 +1433,10 @@ static void rcu_prepare_for_idle(void)
1453 */ 1433 */
1454static void rcu_cleanup_after_idle(void) 1434static void rcu_cleanup_after_idle(void)
1455{ 1435{
1436 struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1437
1456 lockdep_assert_irqs_disabled(); 1438 lockdep_assert_irqs_disabled();
1457 if (rcu_is_nocb_cpu(smp_processor_id())) 1439 if (rcu_segcblist_is_offloaded(&rdp->cblist))
1458 return; 1440 return;
1459 if (rcu_try_advance_all_cbs()) 1441 if (rcu_try_advance_all_cbs())
1460 invoke_rcu_core(); 1442 invoke_rcu_core();
@@ -1469,10 +1451,10 @@ static void rcu_cleanup_after_idle(void)
1469 * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads 1451 * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
1470 * created that pull the callbacks from the corresponding CPU, wait for 1452 * created that pull the callbacks from the corresponding CPU, wait for
1471 * a grace period to elapse, and invoke the callbacks. These kthreads 1453 * a grace period to elapse, and invoke the callbacks. These kthreads
1472 * are organized into leaders, which manage incoming callbacks, wait for 1454 * are organized into GP kthreads, which manage incoming callbacks, wait for
1473 * grace periods, and awaken followers, and the followers, which only 1455 * grace periods, and awaken CB kthreads, and the CB kthreads, which only
1474 * invoke callbacks. Each leader is its own follower. The no-CBs CPUs 1456 * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
1475 * do a wake_up() on their kthread when they insert a callback into any 1457 * do a wake_up() on their GP kthread when they insert a callback into any
1476 * empty list, unless the rcu_nocb_poll boot parameter has been specified, 1458 * empty list, unless the rcu_nocb_poll boot parameter has been specified,
1477 * in which case each kthread actively polls its CPU. (Which isn't so great 1459 * in which case each kthread actively polls its CPU. (Which isn't so great
1478 * for energy efficiency, but which does reduce RCU's overhead on that CPU.) 1460 * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
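As the rewritten comment above describes, offloaded CPUs are now grouped, with one rcuog GP kthread per group and one per-CPU rcuo CB kthread. The grouping itself (performed later by rcu_organize_nocb_kthreads()) walks the CPUs with a stride that defaults to roughly sqrt(nr_cpu_ids). A simplified standalone sketch of that grouping arithmetic, with made-up names and a fixed CPU count (compile with -lm):

#include <math.h>
#include <stdio.h>

#define NR_CPUS 16

int main(void)
{
	int stride = -1;		/* -1: pick sqrt(NR_CPUS), like the module param */
	int group_leader = -1;

	if (stride <= 0)
		stride = (int)sqrt((double)NR_CPUS);
	if (stride < 1)
		stride = 1;

	/* Every stride-th CPU starts a new group and "owns" its GP kthread. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu % stride == 0)
			group_leader = cpu;
		printf("cpu %2d -> GP kthread of cpu %2d\n", cpu, group_leader);
	}
	return 0;
}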
@@ -1515,6 +1497,116 @@ static int __init parse_rcu_nocb_poll(char *arg)
1515early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 1497early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
1516 1498
1517/* 1499/*
1500 * Don't bother bypassing ->cblist if the call_rcu() rate is low.
1501 * After all, the main point of bypassing is to avoid lock contention
1502 * on ->nocb_lock, which only can happen at high call_rcu() rates.
1503 */
1504int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
1505module_param(nocb_nobypass_lim_per_jiffy, int, 0);
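nocb_nobypass_lim_per_jiffy above bounds how many call_rcu() invocations per jiffy a CPU may push straight into ->cblist before further callbacks are diverted to ->nocb_bypass. A simplified userspace model of that per-tick budget follows; the real logic in rcu_nocb_try_bypass() also decays the count across jiffies and flushes the bypass list back into ->cblist, and every name here is hypothetical.

#include <stdbool.h>
#include <stdio.h>

#define LIM_PER_TICK 16		/* stand-in for nocb_nobypass_lim_per_jiffy */

static unsigned long last_tick;
static int direct_enqueues;	/* direct enqueues seen during last_tick */

/* Return true if this enqueue may go straight to the main list. */
static bool may_enqueue_direct(unsigned long tick)
{
	if (tick != last_tick) {	/* new jiffy: start a fresh budget */
		last_tick = tick;
		direct_enqueues = 0;
	}
	return direct_enqueues++ < LIM_PER_TICK;
}

int main(void)
{
	int direct = 0, bypass = 0;

	/* 100 rapid enqueues within the same "jiffy". */
	for (int i = 0; i < 100; i++) {
		if (may_enqueue_direct(1))
			direct++;
		else
			bypass++;
	}
	printf("tick 1: %d direct, %d to bypass\n", direct, bypass);

	/* The rate drops on the next jiffy, so callbacks go direct again. */
	printf("tick 2: direct? %d\n", may_enqueue_direct(2));
	return 0;
}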
1506
1507/*
1508 * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
1509 * lock isn't immediately available, increment ->nocb_lock_contended to
1510 * flag the contention.
1511 */
1512static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
1513{
1514 lockdep_assert_irqs_disabled();
1515 if (raw_spin_trylock(&rdp->nocb_bypass_lock))
1516 return;
1517 atomic_inc(&rdp->nocb_lock_contended);
1518 WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
1519 smp_mb__after_atomic(); /* atomic_inc() before lock. */
1520 raw_spin_lock(&rdp->nocb_bypass_lock);
1521 smp_mb__before_atomic(); /* atomic_dec() after lock. */
1522 atomic_dec(&rdp->nocb_lock_contended);
1523}
1524
1525/*
1526 * Spinwait until the specified rcu_data structure's ->nocb_lock is
1527 * not contended. Please note that this is extremely special-purpose,
1528 * relying on the fact that at most two kthreads and one CPU contend for
1529 * this lock, and also that the two kthreads are guaranteed to have frequent
1530 * grace-period-duration time intervals between successive acquisitions
1531 * of the lock. This allows us to use an extremely simple throttling
1532 * mechanism, and further to apply it only to the CPU doing floods of
1533 * call_rcu() invocations. Don't try this at home!
1534 */
1535static void rcu_nocb_wait_contended(struct rcu_data *rdp)
1536{
1537 WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
1538 while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
1539 cpu_relax();
1540}
1541
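rcu_nocb_bypass_lock() above flags contention by bumping an atomic counter before falling back to the blocking acquisition, and rcu_nocb_wait_contended() lets the flooding CPU spin until that counter drains. Below is a userspace analogue using a pthread mutex in place of the raw spinlock and sched_yield() in place of cpu_relax(); it is illustrative only, not the kernel API.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t bypass_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int lock_contended;	/* nonzero while someone is waiting */

/* Try the fast path; on failure advertise contention, then block. */
static void bypass_lock_acquire(void)
{
	if (pthread_mutex_trylock(&bypass_lock) == 0)
		return;				/* uncontended fast path */
	atomic_fetch_add(&lock_contended, 1);
	pthread_mutex_lock(&bypass_lock);	/* slow path */
	atomic_fetch_sub(&lock_contended, 1);
}

static void bypass_lock_release(void)
{
	pthread_mutex_unlock(&bypass_lock);
}

/* A call_rcu() flooder would call this to back off while others catch up. */
static void wait_until_uncontended(void)
{
	while (atomic_load(&lock_contended))
		sched_yield();			/* cpu_relax() stand-in */
}

int main(void)
{
	bypass_lock_acquire();
	bypass_lock_release();
	wait_until_uncontended();
	puts("lock cycled without observed contention");
	return 0;
}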
1542/*
1543 * Conditionally acquire the specified rcu_data structure's
1544 * ->nocb_bypass_lock.
1545 */
1546static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
1547{
1548 lockdep_assert_irqs_disabled();
1549 return raw_spin_trylock(&rdp->nocb_bypass_lock);
1550}
1551
1552/*
1553 * Release the specified rcu_data structure's ->nocb_bypass_lock.
1554 */
1555static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
1556{
1557 lockdep_assert_irqs_disabled();
1558 raw_spin_unlock(&rdp->nocb_bypass_lock);
1559}
1560
1561/*
1562 * Acquire the specified rcu_data structure's ->nocb_lock, but only
1563 * if it corresponds to a no-CBs CPU.
1564 */
1565static void rcu_nocb_lock(struct rcu_data *rdp)
1566{
1567 lockdep_assert_irqs_disabled();
1568 if (!rcu_segcblist_is_offloaded(&rdp->cblist))
1569 return;
1570 raw_spin_lock(&rdp->nocb_lock);
1571}
1572
1573/*
1574 * Release the specified rcu_data structure's ->nocb_lock, but only
1575 * if it corresponds to a no-CBs CPU.
1576 */
1577static void rcu_nocb_unlock(struct rcu_data *rdp)
1578{
1579 if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
1580 lockdep_assert_irqs_disabled();
1581 raw_spin_unlock(&rdp->nocb_lock);
1582 }
1583}
1584
1585/*
1586 * Release the specified rcu_data structure's ->nocb_lock and restore
1587 * interrupts, but only if it corresponds to a no-CBs CPU.
1588 */
1589static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
1590 unsigned long flags)
1591{
1592 if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
1593 lockdep_assert_irqs_disabled();
1594 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1595 } else {
1596 local_irq_restore(flags);
1597 }
1598}
1599
1600/* Lockdep check that ->cblist may be safely accessed. */
1601static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
1602{
1603 lockdep_assert_irqs_disabled();
1604 if (rcu_segcblist_is_offloaded(&rdp->cblist) &&
1605 cpu_online(rdp->cpu))
1606 lockdep_assert_held(&rdp->nocb_lock);
1607}
1608
1609/*
1518 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 1610 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
1519 * grace period. 1611 * grace period.
1520 */ 1612 */
@@ -1543,440 +1635,514 @@ bool rcu_is_nocb_cpu(int cpu)
1543} 1635}
1544 1636
1545/* 1637/*
1546 * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock 1638 * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
1547 * and this function releases it. 1639 * and this function releases it.
1548 */ 1640 */
1549static void __wake_nocb_leader(struct rcu_data *rdp, bool force, 1641static void wake_nocb_gp(struct rcu_data *rdp, bool force,
1550 unsigned long flags) 1642 unsigned long flags)
1551 __releases(rdp->nocb_lock) 1643 __releases(rdp->nocb_lock)
1552{ 1644{
1553 struct rcu_data *rdp_leader = rdp->nocb_leader; 1645 bool needwake = false;
1646 struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
1554 1647
1555 lockdep_assert_held(&rdp->nocb_lock); 1648 lockdep_assert_held(&rdp->nocb_lock);
1556 if (!READ_ONCE(rdp_leader->nocb_kthread)) { 1649 if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
1557 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1650 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1651 TPS("AlreadyAwake"));
1652 rcu_nocb_unlock_irqrestore(rdp, flags);
1558 return; 1653 return;
1559 } 1654 }
1560 if (rdp_leader->nocb_leader_sleep || force) { 1655 del_timer(&rdp->nocb_timer);
1561 /* Prior smp_mb__after_atomic() orders against prior enqueue. */ 1656 rcu_nocb_unlock_irqrestore(rdp, flags);
1562 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1657 raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
1563 del_timer(&rdp->nocb_timer); 1658 if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
1564 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1659 WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
1565 smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ 1660 needwake = true;
1566 swake_up_one(&rdp_leader->nocb_wq); 1661 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
1567 } else {
1568 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1569 } 1662 }
1663 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
1664 if (needwake)
1665 wake_up_process(rdp_gp->nocb_gp_kthread);
1570} 1666}
1571 1667
1572/* 1668/*
1573 * Kick the leader kthread for this NOCB group, but caller has not 1669 * Arrange to wake the GP kthread for this NOCB group at some future
1574 * acquired locks. 1670 * time when it is safe to do so.
1575 */ 1671 */
1576static void wake_nocb_leader(struct rcu_data *rdp, bool force) 1672static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
1673 const char *reason)
1577{ 1674{
1578 unsigned long flags; 1675 if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
1676 mod_timer(&rdp->nocb_timer, jiffies + 1);
1677 if (rdp->nocb_defer_wakeup < waketype)
1678 WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
1679 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
1680}
1681
1682/*
1683 * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
1684 * However, if there is a callback to be enqueued and if ->nocb_bypass
1685 * proves to be initially empty, just return false because the no-CB GP
1686 * kthread may need to be awakened in this case.
1687 *
1688 * Note that this function always returns true if rhp is NULL.
1689 */
1690static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1691 unsigned long j)
1692{
1693 struct rcu_cblist rcl;
1579 1694
1580 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 1695 WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
1581 __wake_nocb_leader(rdp, force, flags); 1696 rcu_lockdep_assert_cblist_protected(rdp);
1697 lockdep_assert_held(&rdp->nocb_bypass_lock);
1698 if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
1699 raw_spin_unlock(&rdp->nocb_bypass_lock);
1700 return false;
1701 }
1702 /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
1703 if (rhp)
1704 rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
1705 rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
1706 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
1707 WRITE_ONCE(rdp->nocb_bypass_first, j);
1708 rcu_nocb_bypass_unlock(rdp);
1709 return true;
1582} 1710}
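rcu_nocb_do_flush_bypass() above drains ->nocb_bypass into the pending portion of ->cblist, first appending the incoming callback (if any) behind the backlog so ordering is preserved, and bails out early when there is nothing parked to flush. A toy singly-linked-list version of that splice, using plain C lists rather than rcu_cblist/rcu_segcblist and invented names throughout:

#include <stdbool.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

/* Append a whole "bypass" chain (and optionally one new cb) to "main". */
static bool flush_bypass(struct cb **main_tail, struct cb **bypass_head,
			 struct cb **bypass_tail, struct cb *new_cb)
{
	if (new_cb && !*bypass_head)
		return false;	/* nothing parked: caller should enqueue directly */
	if (new_cb) {		/* keep ordering: new cb goes behind the backlog */
		(*bypass_tail)->next = new_cb;
		*bypass_tail = new_cb;
	}
	(*main_tail)->next = *bypass_head;	/* splice the backlog over */
	while ((*main_tail)->next)
		*main_tail = (*main_tail)->next;
	*bypass_head = *bypass_tail = NULL;
	return true;
}

int main(void)
{
	struct cb head = { .id = 0 }, a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct cb *main_tail = &head, *bh = &a, *bt = &b;

	a.next = &b;				/* bypass backlog: 1 -> 2 */
	printf("flushed: %d\n", flush_bypass(&main_tail, &bh, &bt, &c));
	for (struct cb *p = head.next; p; p = p->next)
		printf("main list cb %d\n", p->id);
	return 0;
}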
1583 1711
1584/* 1712/*
1585 * Arrange to wake the leader kthread for this NOCB group at some 1713 * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
1586 * future time when it is safe to do so. 1714 * However, if there is a callback to be enqueued and if ->nocb_bypass
1715 * proves to be initially empty, just return false because the no-CB GP
1716 * kthread may need to be awakened in this case.
1717 *
1718 * Note that this function always returns true if rhp is NULL.
1587 */ 1719 */
1588static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, 1720static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1589 const char *reason) 1721 unsigned long j)
1590{ 1722{
1591 unsigned long flags; 1723 if (!rcu_segcblist_is_offloaded(&rdp->cblist))
1724 return true;
1725 rcu_lockdep_assert_cblist_protected(rdp);
1726 rcu_nocb_bypass_lock(rdp);
1727 return rcu_nocb_do_flush_bypass(rdp, rhp, j);
1728}
1592 1729
1593 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 1730/*
1594 if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) 1731 * If the ->nocb_bypass_lock is immediately available, flush the
1595 mod_timer(&rdp->nocb_timer, jiffies + 1); 1732 * ->nocb_bypass queue into ->cblist.
1596 WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); 1733 */
1597 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); 1734static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
1598 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1735{
1736 rcu_lockdep_assert_cblist_protected(rdp);
1737 if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
1738 !rcu_nocb_bypass_trylock(rdp))
1739 return;
1740 WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
1599} 1741}
1600 1742
1601/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ 1743/*
1602static bool rcu_nocb_cpu_needs_barrier(int cpu) 1744 * See whether it is appropriate to use the ->nocb_bypass list in order
1745 * to control contention on ->nocb_lock. A limited number of direct
1746 * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
1747 * is non-empty, further callbacks must be placed into ->nocb_bypass,
1748 * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
1749 * back to direct use of ->cblist. However, ->nocb_bypass should not be
1750 * used if ->cblist is empty, because otherwise callbacks can be stranded
1751 * on ->nocb_bypass because we cannot count on the current CPU ever again
1752 * invoking call_rcu(). The general rule is that if ->nocb_bypass is
1753 * non-empty, the corresponding no-CBs grace-period kthread must not be
1754 * in an indefinite sleep state.
1755 *
1756 * Finally, it is not permitted to use the bypass during early boot,
1757 * as doing so would confuse the auto-initialization code. Besides
1758 * which, there is no point in worrying about lock contention while
1759 * there is only one CPU in operation.
1760 */
1761static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1762 bool *was_alldone, unsigned long flags)
1603{ 1763{
1604 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 1764 unsigned long c;
1605 unsigned long ret; 1765 unsigned long cur_gp_seq;
1606#ifdef CONFIG_PROVE_RCU 1766 unsigned long j = jiffies;
1607 struct rcu_head *rhp; 1767 long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1608#endif /* #ifdef CONFIG_PROVE_RCU */
1609 1768
1610 /* 1769 if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
1611 * Check count of all no-CBs callbacks awaiting invocation. 1770 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1612 * There needs to be a barrier before this function is called, 1771 return false; /* Not offloaded, no bypassing. */
1613 * but associated with a prior determination that no more 1772 }
1614 * callbacks would be posted. In the worst case, the first 1773 lockdep_assert_irqs_disabled();
1615 * barrier in rcu_barrier() suffices (but the caller cannot 1774
1616 * necessarily rely on this, not a substitute for the caller 1775 // Don't use ->nocb_bypass during early boot.
1617 * getting the concurrency design right!). There must also be a 1776 if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
1618 * barrier between the following load and posting of a callback 1777 rcu_nocb_lock(rdp);
1619 * (if a callback is in fact needed). This is associated with an 1778 WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1620 * atomic_inc() in the caller. 1779 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1621 */ 1780 return false;
1622 ret = rcu_get_n_cbs_nocb_cpu(rdp); 1781 }
1623 1782
1624#ifdef CONFIG_PROVE_RCU 1783 // If we have advanced to a new jiffy, reset counts to allow
1625 rhp = READ_ONCE(rdp->nocb_head); 1784 // moving back from ->nocb_bypass to ->cblist.
1626 if (!rhp) 1785 if (j == rdp->nocb_nobypass_last) {
1627 rhp = READ_ONCE(rdp->nocb_gp_head); 1786 c = rdp->nocb_nobypass_count + 1;
1628 if (!rhp) 1787 } else {
1629 rhp = READ_ONCE(rdp->nocb_follower_head); 1788 WRITE_ONCE(rdp->nocb_nobypass_last, j);
1630 1789 c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
1631 /* Having no rcuo kthread but CBs after scheduler starts is bad! */ 1790 if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
1632 if (!READ_ONCE(rdp->nocb_kthread) && rhp && 1791 nocb_nobypass_lim_per_jiffy))
1633 rcu_scheduler_fully_active) { 1792 c = 0;
1634 /* RCU callback enqueued before CPU first came online??? */ 1793 else if (c > nocb_nobypass_lim_per_jiffy)
1635 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", 1794 c = nocb_nobypass_lim_per_jiffy;
1636 cpu, rhp->func); 1795 }
1637 WARN_ON_ONCE(1); 1796 WRITE_ONCE(rdp->nocb_nobypass_count, c);
1797
1798 // If there hasn't yet been all that many ->cblist enqueues
1799 // this jiffy, tell the caller to enqueue onto ->cblist. But flush
1800 // ->nocb_bypass first.
1801 if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
1802 rcu_nocb_lock(rdp);
1803 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1804 if (*was_alldone)
1805 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1806 TPS("FirstQ"));
1807 WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
1808 WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1809 return false; // Caller must enqueue the callback.
1810 }
1811
1812 // If ->nocb_bypass has been used too long or is too full,
1813 // flush ->nocb_bypass to ->cblist.
1814 if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
1815 ncbs >= qhimark) {
1816 rcu_nocb_lock(rdp);
1817 if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
1818 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1819 if (*was_alldone)
1820 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1821 TPS("FirstQ"));
1822 WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1823 return false; // Caller must enqueue the callback.
1824 }
1825 if (j != rdp->nocb_gp_adv_time &&
1826 rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1827 rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
1828 rcu_advance_cbs_nowake(rdp->mynode, rdp);
1829 rdp->nocb_gp_adv_time = j;
1830 }
1831 rcu_nocb_unlock_irqrestore(rdp, flags);
1832 return true; // Callback already enqueued.
1638 } 1833 }
1639#endif /* #ifdef CONFIG_PROVE_RCU */
1640 1834
1641 return !!ret; 1835 // We need to use the bypass.
1836 rcu_nocb_wait_contended(rdp);
1837 rcu_nocb_bypass_lock(rdp);
1838 ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1839 rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
1840 rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
1841 if (!ncbs) {
1842 WRITE_ONCE(rdp->nocb_bypass_first, j);
1843 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
1844 }
1845 rcu_nocb_bypass_unlock(rdp);
1846 smp_mb(); /* Order enqueue before wake. */
1847 if (ncbs) {
1848 local_irq_restore(flags);
1849 } else {
1850 // No-CBs GP kthread might be indefinitely asleep, if so, wake.
1851 rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
1852 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
1853 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1854 TPS("FirstBQwake"));
1855 __call_rcu_nocb_wake(rdp, true, flags);
1856 } else {
1857 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1858 TPS("FirstBQnoWake"));
1859 rcu_nocb_unlock_irqrestore(rdp, flags);
1860 }
1861 }
1862 return true; // Callback already enqueued.
1642} 1863}
1643 1864
1644/* 1865/*
1645 * Enqueue the specified string of rcu_head structures onto the specified 1866 * Awaken the no-CBs grace-period kthread if needed, either due to it
1646 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 1867 * legitimately being asleep or due to overload conditions.
1647 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
1648 * counts are supplied by rhcount and rhcount_lazy.
1649 * 1868 *
1650 * If warranted, also wake up the kthread servicing this CPUs queues. 1869 * If warranted, also wake up the kthread servicing this CPUs queues.
1651 */ 1870 */
1652static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 1871static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
1653 struct rcu_head *rhp, 1872 unsigned long flags)
1654 struct rcu_head **rhtp, 1873 __releases(rdp->nocb_lock)
1655 int rhcount, int rhcount_lazy,
1656 unsigned long flags)
1657{ 1874{
1658 int len; 1875 unsigned long cur_gp_seq;
1659 struct rcu_head **old_rhpp; 1876 unsigned long j;
1877 long len;
1660 struct task_struct *t; 1878 struct task_struct *t;
1661 1879
1662 /* Enqueue the callback on the nocb list and update counts. */ 1880 // If we are being polled or there is no kthread, just leave.
1663 atomic_long_add(rhcount, &rdp->nocb_q_count); 1881 t = READ_ONCE(rdp->nocb_gp_kthread);
1664 /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
1665 old_rhpp = xchg(&rdp->nocb_tail, rhtp);
1666 WRITE_ONCE(*old_rhpp, rhp);
1667 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
1668 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
1669
1670 /* If we are not being polled and there is a kthread, awaken it ... */
1671 t = READ_ONCE(rdp->nocb_kthread);
1672 if (rcu_nocb_poll || !t) { 1882 if (rcu_nocb_poll || !t) {
1673 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1883 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1674 TPS("WakeNotPoll")); 1884 TPS("WakeNotPoll"));
1885 rcu_nocb_unlock_irqrestore(rdp, flags);
1675 return; 1886 return;
1676 } 1887 }
1677 len = rcu_get_n_cbs_nocb_cpu(rdp); 1888 // Need to actually do a wakeup.
1678 if (old_rhpp == &rdp->nocb_head) { 1889 len = rcu_segcblist_n_cbs(&rdp->cblist);
1890 if (was_alldone) {
1891 rdp->qlen_last_fqs_check = len;
1679 if (!irqs_disabled_flags(flags)) { 1892 if (!irqs_disabled_flags(flags)) {
1680 /* ... if queue was empty ... */ 1893 /* ... if queue was empty ... */
1681 wake_nocb_leader(rdp, false); 1894 wake_nocb_gp(rdp, false, flags);
1682 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1895 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1683 TPS("WakeEmpty")); 1896 TPS("WakeEmpty"));
1684 } else { 1897 } else {
1685 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, 1898 wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
1686 TPS("WakeEmptyIsDeferred")); 1899 TPS("WakeEmptyIsDeferred"));
1900 rcu_nocb_unlock_irqrestore(rdp, flags);
1687 } 1901 }
1688 rdp->qlen_last_fqs_check = 0;
1689 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 1902 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
1690 /* ... or if many callbacks queued. */ 1903 /* ... or if many callbacks queued. */
1691 if (!irqs_disabled_flags(flags)) { 1904 rdp->qlen_last_fqs_check = len;
1692 wake_nocb_leader(rdp, true); 1905 j = jiffies;
1693 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1906 if (j != rdp->nocb_gp_adv_time &&
1694 TPS("WakeOvf")); 1907 rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1695 } else { 1908 rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
1696 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, 1909 rcu_advance_cbs_nowake(rdp->mynode, rdp);
1697 TPS("WakeOvfIsDeferred")); 1910 rdp->nocb_gp_adv_time = j;
1698 } 1911 }
1699 rdp->qlen_last_fqs_check = LONG_MAX / 2; 1912 smp_mb(); /* Enqueue before timer_pending(). */
1913 if ((rdp->nocb_cb_sleep ||
1914 !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
1915 !timer_pending(&rdp->nocb_bypass_timer))
1916 wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
1917 TPS("WakeOvfIsDeferred"));
1918 rcu_nocb_unlock_irqrestore(rdp, flags);
1700 } else { 1919 } else {
1701 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); 1920 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
1921 rcu_nocb_unlock_irqrestore(rdp, flags);
1702 } 1922 }
1703 return; 1923 return;
1704} 1924}
1705 1925
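__call_rcu_nocb_wake() above either wakes the GP kthread immediately (the queue just went non-empty with irqs enabled) or defers the wakeup through wake_nocb_gp_defer(), which arms a one-jiffy timer the first time and only ever raises the recorded wake type. A small model of that "remember the strongest pending request" pattern, with the timer replaced by a deadline variable and all names illustrative:

#include <stdio.h>

enum wake_type { WAKE_NOT, WAKE, WAKE_FORCE };	/* like RCU_NOCB_WAKE_* */

static enum wake_type pending = WAKE_NOT;
static unsigned long timer_deadline;		/* 0 means "timer not armed" */

/* Record a deferred wakeup request; later requests may only strengthen it. */
static void wake_defer(enum wake_type type, unsigned long now)
{
	if (pending == WAKE_NOT)
		timer_deadline = now + 1;	/* arm the one-shot "timer" */
	if (pending < type)
		pending = type;
}

/* Timer handler: perform the strongest wakeup requested, then reset. */
static void timer_fire(unsigned long now)
{
	if (!timer_deadline || now < timer_deadline)
		return;
	printf("wake GP kthread (%s)\n",
	       pending == WAKE_FORCE ? "forced" : "normal");
	pending = WAKE_NOT;
	timer_deadline = 0;
}

int main(void)
{
	wake_defer(WAKE, 100);		/* queue just became non-empty */
	wake_defer(WAKE_FORCE, 100);	/* overload: strengthen, do not re-arm */
	timer_fire(101);
	return 0;
}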
1706/* 1926/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
1707 * This is a helper for __call_rcu(), which invokes this when the normal 1927static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
1708 * callback queue is inoperable. If this is not a no-CBs CPU, this
1709 * function returns failure back to __call_rcu(), which can complain
1710 * appropriately.
1711 *
1712 * Otherwise, this function queues the callback where the corresponding
1713 * "rcuo" kthread can find it.
1714 */
1715static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
1716 bool lazy, unsigned long flags)
1717{ 1928{
1929 unsigned long flags;
1930 struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
1718 1931
1719 if (!rcu_is_nocb_cpu(rdp->cpu)) 1932 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
1720 return false; 1933 rcu_nocb_lock_irqsave(rdp, flags);
1721 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); 1934 smp_mb__after_spinlock(); /* Timer expire before wakeup. */
1722 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 1935 __call_rcu_nocb_wake(rdp, true, flags);
1723 trace_rcu_kfree_callback(rcu_state.name, rhp,
1724 (unsigned long)rhp->func,
1725 -atomic_long_read(&rdp->nocb_q_count_lazy),
1726 -rcu_get_n_cbs_nocb_cpu(rdp));
1727 else
1728 trace_rcu_callback(rcu_state.name, rhp,
1729 -atomic_long_read(&rdp->nocb_q_count_lazy),
1730 -rcu_get_n_cbs_nocb_cpu(rdp));
1731
1732 /*
1733 * If called from an extended quiescent state with interrupts
1734 * disabled, invoke the RCU core in order to allow the idle-entry
1735 * deferred-wakeup check to function.
1736 */
1737 if (irqs_disabled_flags(flags) &&
1738 !rcu_is_watching() &&
1739 cpu_online(smp_processor_id()))
1740 invoke_rcu_core();
1741
1742 return true;
1743}
1744
1745/*
1746 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
1747 * not a no-CBs CPU.
1748 */
1749static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
1750 struct rcu_data *rdp,
1751 unsigned long flags)
1752{
1753 lockdep_assert_irqs_disabled();
1754 if (!rcu_is_nocb_cpu(smp_processor_id()))
1755 return false; /* Not NOCBs CPU, caller must migrate CBs. */
1756 __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
1757 rcu_segcblist_tail(&rdp->cblist),
1758 rcu_segcblist_n_cbs(&rdp->cblist),
1759 rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
1760 rcu_segcblist_init(&rdp->cblist);
1761 rcu_segcblist_disable(&rdp->cblist);
1762 return true;
1763} 1936}
1764 1937
1765/* 1938/*
1766 * If necessary, kick off a new grace period, and either way wait 1939 * No-CBs GP kthreads come here to wait for additional callbacks to show up
1767 * for a subsequent grace period to complete. 1940 * or for grace periods to end.
1768 */ 1941 */
1769static void rcu_nocb_wait_gp(struct rcu_data *rdp) 1942static void nocb_gp_wait(struct rcu_data *my_rdp)
1770{ 1943{
1771 unsigned long c; 1944 bool bypass = false;
1772 bool d; 1945 long bypass_ncbs;
1946 int __maybe_unused cpu = my_rdp->cpu;
1947 unsigned long cur_gp_seq;
1773 unsigned long flags; 1948 unsigned long flags;
1949 bool gotcbs;
1950 unsigned long j = jiffies;
1951 bool needwait_gp = false; // This prevents actual uninitialized use.
1774 bool needwake; 1952 bool needwake;
1775 struct rcu_node *rnp = rdp->mynode; 1953 bool needwake_gp;
1954 struct rcu_data *rdp;
1955 struct rcu_node *rnp;
1956 unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
1776 1957
1777 local_irq_save(flags); 1958 /*
1778 c = rcu_seq_snap(&rcu_state.gp_seq); 1959 * Each pass through the following loop checks for CBs and for the
1779 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { 1960 * nearest grace period (if any) to wait for next. The CB kthreads
1780 local_irq_restore(flags); 1961 * and the global grace-period kthread are awakened if needed.
1781 } else { 1962 */
1782 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1963 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
1783 needwake = rcu_start_this_gp(rnp, rdp, c); 1964 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
1784 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1965 rcu_nocb_lock_irqsave(rdp, flags);
1785 if (needwake) 1966 bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1967 if (bypass_ncbs &&
1968 (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
1969 bypass_ncbs > 2 * qhimark)) {
1970 // Bypass full or old, so flush it.
1971 (void)rcu_nocb_try_flush_bypass(rdp, j);
1972 bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1973 } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
1974 rcu_nocb_unlock_irqrestore(rdp, flags);
1975 continue; /* No callbacks here, try next. */
1976 }
1977 if (bypass_ncbs) {
1978 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1979 TPS("Bypass"));
1980 bypass = true;
1981 }
1982 rnp = rdp->mynode;
1983 if (bypass) { // Avoid race with first bypass CB.
1984 WRITE_ONCE(my_rdp->nocb_defer_wakeup,
1985 RCU_NOCB_WAKE_NOT);
1986 del_timer(&my_rdp->nocb_timer);
1987 }
1988 // Advance callbacks if helpful and low contention.
1989 needwake_gp = false;
1990 if (!rcu_segcblist_restempty(&rdp->cblist,
1991 RCU_NEXT_READY_TAIL) ||
1992 (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1993 rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
1994 raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
1995 needwake_gp = rcu_advance_cbs(rnp, rdp);
1996 raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
1997 }
1998 // Need to wait on some grace period?
1999 WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist,
2000 RCU_NEXT_READY_TAIL));
2001 if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
2002 if (!needwait_gp ||
2003 ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
2004 wait_gp_seq = cur_gp_seq;
2005 needwait_gp = true;
2006 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
2007 TPS("NeedWaitGP"));
2008 }
2009 if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
2010 needwake = rdp->nocb_cb_sleep;
2011 WRITE_ONCE(rdp->nocb_cb_sleep, false);
2012 smp_mb(); /* CB invocation -after- GP end. */
2013 } else {
2014 needwake = false;
2015 }
2016 rcu_nocb_unlock_irqrestore(rdp, flags);
2017 if (needwake) {
2018 swake_up_one(&rdp->nocb_cb_wq);
2019 gotcbs = true;
2020 }
2021 if (needwake_gp)
1786 rcu_gp_kthread_wake(); 2022 rcu_gp_kthread_wake();
1787 } 2023 }
1788 2024
1789 /* 2025 my_rdp->nocb_gp_bypass = bypass;
1790 * Wait for the grace period. Do so interruptibly to avoid messing 2026 my_rdp->nocb_gp_gp = needwait_gp;
1791 * up the load average. 2027 my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
1792 */ 2028 if (bypass && !rcu_nocb_poll) {
1793 trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); 2029 // At least one child with non-empty ->nocb_bypass, so set
1794 for (;;) { 2030 // timer in order to avoid stranding its callbacks.
2031 raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
2032 mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
2033 raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
2034 }
2035 if (rcu_nocb_poll) {
2036 /* Polling, so trace if first poll in the series. */
2037 if (gotcbs)
2038 trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
2039 schedule_timeout_interruptible(1);
2040 } else if (!needwait_gp) {
2041 /* Wait for callbacks to appear. */
2042 trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
2043 swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
2044 !READ_ONCE(my_rdp->nocb_gp_sleep));
2045 trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
2046 } else {
2047 rnp = my_rdp->mynode;
2048 trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
1795 swait_event_interruptible_exclusive( 2049 swait_event_interruptible_exclusive(
1796 rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], 2050 rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
1797 (d = rcu_seq_done(&rnp->gp_seq, c))); 2051 rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
1798 if (likely(d)) 2052 !READ_ONCE(my_rdp->nocb_gp_sleep));
1799 break; 2053 trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
1800 WARN_ON(signal_pending(current));
1801 trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
1802 } 2054 }
1803 trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); 2055 if (!rcu_nocb_poll) {
1804 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2056 raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
2057 if (bypass)
2058 del_timer(&my_rdp->nocb_bypass_timer);
2059 WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
2060 raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
2061 }
2062 my_rdp->nocb_gp_seq = -1;
2063 WARN_ON(signal_pending(current));
1805} 2064}
1806 2065
1807/* 2066/*
1808 * Leaders come here to wait for additional callbacks to show up. 2067 * No-CBs grace-period-wait kthread. There is one of these per group
1809 * This function does not return until callbacks appear. 2068 * of CPUs, but only once at least one CPU in that group has come online
2069 * at least once since boot. This kthread checks for newly posted
2070 * callbacks from any of the CPUs it is responsible for, waits for a
2071 * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
2072 * that then have callback-invocation work to do.
1810 */ 2073 */
1811static void nocb_leader_wait(struct rcu_data *my_rdp) 2074static int rcu_nocb_gp_kthread(void *arg)
1812{ 2075{
1813 bool firsttime = true; 2076 struct rcu_data *rdp = arg;
1814 unsigned long flags;
1815 bool gotcbs;
1816 struct rcu_data *rdp;
1817 struct rcu_head **tail;
1818
1819wait_again:
1820
1821 /* Wait for callbacks to appear. */
1822 if (!rcu_nocb_poll) {
1823 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep"));
1824 swait_event_interruptible_exclusive(my_rdp->nocb_wq,
1825 !READ_ONCE(my_rdp->nocb_leader_sleep));
1826 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
1827 my_rdp->nocb_leader_sleep = true;
1828 WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
1829 del_timer(&my_rdp->nocb_timer);
1830 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
1831 } else if (firsttime) {
1832 firsttime = false; /* Don't drown trace log with "Poll"! */
1833 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll"));
1834 }
1835
1836 /*
1837 * Each pass through the following loop checks a follower for CBs.
1838 * We are our own first follower. Any CBs found are moved to
1839 * nocb_gp_head, where they await a grace period.
1840 */
1841 gotcbs = false;
1842 smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
1843 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
1844 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
1845 if (!rdp->nocb_gp_head)
1846 continue; /* No CBs here, try next follower. */
1847
1848 /* Move callbacks to wait-for-GP list, which is empty. */
1849 WRITE_ONCE(rdp->nocb_head, NULL);
1850 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
1851 gotcbs = true;
1852 }
1853
1854 /* No callbacks? Sleep a bit if polling, and go retry. */
1855 if (unlikely(!gotcbs)) {
1856 WARN_ON(signal_pending(current));
1857 if (rcu_nocb_poll) {
1858 schedule_timeout_interruptible(1);
1859 } else {
1860 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu,
1861 TPS("WokeEmpty"));
1862 }
1863 goto wait_again;
1864 }
1865 2077
1866 /* Wait for one grace period. */ 2078 for (;;) {
1867 rcu_nocb_wait_gp(my_rdp); 2079 WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
1868 2080 nocb_gp_wait(rdp);
1869 /* Each pass through the following loop wakes a follower, if needed. */ 2081 cond_resched_tasks_rcu_qs();
1870 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
1871 if (!rcu_nocb_poll &&
1872 READ_ONCE(rdp->nocb_head) &&
1873 READ_ONCE(my_rdp->nocb_leader_sleep)) {
1874 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
1875 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
1876 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
1877 }
1878 if (!rdp->nocb_gp_head)
1879 continue; /* No CBs, so no need to wake follower. */
1880
1881 /* Append callbacks to follower's "done" list. */
1882 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1883 tail = rdp->nocb_follower_tail;
1884 rdp->nocb_follower_tail = rdp->nocb_gp_tail;
1885 *tail = rdp->nocb_gp_head;
1886 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1887 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
1888 /* List was empty, so wake up the follower. */
1889 swake_up_one(&rdp->nocb_wq);
1890 }
1891 } 2082 }
1892 2083 return 0;
1893 /* If we (the leader) don't have CBs, go wait some more. */
1894 if (!my_rdp->nocb_follower_head)
1895 goto wait_again;
1896} 2084}
1897 2085
1898/* 2086/*
1899 * Followers come here to wait for additional callbacks to show up. 2087 * Invoke any ready callbacks from the corresponding no-CBs CPU,
1900 * This function does not return until callbacks appear. 2088 * then, if there are no more, wait for more to appear.
1901 */ 2089 */
1902static void nocb_follower_wait(struct rcu_data *rdp) 2090static void nocb_cb_wait(struct rcu_data *rdp)
1903{ 2091{
1904 for (;;) { 2092 unsigned long cur_gp_seq;
1905 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); 2093 unsigned long flags;
1906 swait_event_interruptible_exclusive(rdp->nocb_wq, 2094 bool needwake_gp = false;
1907 READ_ONCE(rdp->nocb_follower_head)); 2095 struct rcu_node *rnp = rdp->mynode;
1908 if (smp_load_acquire(&rdp->nocb_follower_head)) { 2096
1909 /* ^^^ Ensure CB invocation follows _head test. */ 2097 local_irq_save(flags);
1910 return; 2098 rcu_momentary_dyntick_idle();
1911 } 2099 local_irq_restore(flags);
1912 WARN_ON(signal_pending(current)); 2100 local_bh_disable();
1913 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); 2101 rcu_do_batch(rdp);
2102 local_bh_enable();
2103 lockdep_assert_irqs_enabled();
2104 rcu_nocb_lock_irqsave(rdp, flags);
2105 if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
2106 rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
2107 raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
2108 needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
2109 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2110 }
2111 if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
2112 rcu_nocb_unlock_irqrestore(rdp, flags);
2113 if (needwake_gp)
2114 rcu_gp_kthread_wake();
2115 return;
2116 }
2117
2118 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
2119 WRITE_ONCE(rdp->nocb_cb_sleep, true);
2120 rcu_nocb_unlock_irqrestore(rdp, flags);
2121 if (needwake_gp)
2122 rcu_gp_kthread_wake();
2123 swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
2124 !READ_ONCE(rdp->nocb_cb_sleep));
2125 if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
2126 /* ^^^ Ensure CB invocation follows _sleep test. */
2127 return;
1914 } 2128 }
2129 WARN_ON(signal_pending(current));
2130 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
1915} 2131}
1916 2132
1917/* 2133/*
1918 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 2134 * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
1919 * callbacks queued by the corresponding no-CBs CPU, however, there is 2135 * nocb_cb_wait() to do the dirty work.
1920 * an optional leader-follower relationship so that the grace-period
1921 * kthreads don't have to do quite so many wakeups.
1922 */ 2136 */
1923static int rcu_nocb_kthread(void *arg) 2137static int rcu_nocb_cb_kthread(void *arg)
1924{ 2138{
1925 int c, cl;
1926 unsigned long flags;
1927 struct rcu_head *list;
1928 struct rcu_head *next;
1929 struct rcu_head **tail;
1930 struct rcu_data *rdp = arg; 2139 struct rcu_data *rdp = arg;
1931 2140
1932 /* Each pass through this loop invokes one batch of callbacks */ 2141 // Each pass through this loop does one callback batch, and,
2142 // if there are no more ready callbacks, waits for them.
1933 for (;;) { 2143 for (;;) {
1934 /* Wait for callbacks. */ 2144 nocb_cb_wait(rdp);
1935 if (rdp->nocb_leader == rdp) 2145 cond_resched_tasks_rcu_qs();
1936 nocb_leader_wait(rdp);
1937 else
1938 nocb_follower_wait(rdp);
1939
1940 /* Pull the ready-to-invoke callbacks onto local list. */
1941 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1942 list = rdp->nocb_follower_head;
1943 rdp->nocb_follower_head = NULL;
1944 tail = rdp->nocb_follower_tail;
1945 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
1946 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1947 if (WARN_ON_ONCE(!list))
1948 continue;
1949 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty"));
1950
1951 /* Each pass through the following loop invokes a callback. */
1952 trace_rcu_batch_start(rcu_state.name,
1953 atomic_long_read(&rdp->nocb_q_count_lazy),
1954 rcu_get_n_cbs_nocb_cpu(rdp), -1);
1955 c = cl = 0;
1956 while (list) {
1957 next = list->next;
1958 /* Wait for enqueuing to complete, if needed. */
1959 while (next == NULL && &list->next != tail) {
1960 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1961 TPS("WaitQueue"));
1962 schedule_timeout_interruptible(1);
1963 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1964 TPS("WokeQueue"));
1965 next = list->next;
1966 }
1967 debug_rcu_head_unqueue(list);
1968 local_bh_disable();
1969 if (__rcu_reclaim(rcu_state.name, list))
1970 cl++;
1971 c++;
1972 local_bh_enable();
1973 cond_resched_tasks_rcu_qs();
1974 list = next;
1975 }
1976 trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1);
1977 smp_mb__before_atomic(); /* _add after CB invocation. */
1978 atomic_long_add(-c, &rdp->nocb_q_count);
1979 atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
1980 } 2146 }
1981 return 0; 2147 return 0;
1982} 2148}
@@ -1993,14 +2159,14 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
1993 unsigned long flags; 2159 unsigned long flags;
1994 int ndw; 2160 int ndw;
1995 2161
1996 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 2162 rcu_nocb_lock_irqsave(rdp, flags);
1997 if (!rcu_nocb_need_deferred_wakeup(rdp)) { 2163 if (!rcu_nocb_need_deferred_wakeup(rdp)) {
1998 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2164 rcu_nocb_unlock_irqrestore(rdp, flags);
1999 return; 2165 return;
2000 } 2166 }
2001 ndw = READ_ONCE(rdp->nocb_defer_wakeup); 2167 ndw = READ_ONCE(rdp->nocb_defer_wakeup);
2002 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2168 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2003 __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); 2169 wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
2004 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); 2170 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
2005} 2171}
2006 2172
@@ -2027,6 +2193,7 @@ void __init rcu_init_nohz(void)
2027{ 2193{
2028 int cpu; 2194 int cpu;
2029 bool need_rcu_nocb_mask = false; 2195 bool need_rcu_nocb_mask = false;
2196 struct rcu_data *rdp;
2030 2197
2031#if defined(CONFIG_NO_HZ_FULL) 2198#if defined(CONFIG_NO_HZ_FULL)
2032 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) 2199 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
@@ -2060,67 +2227,63 @@ void __init rcu_init_nohz(void)
2060 if (rcu_nocb_poll) 2227 if (rcu_nocb_poll)
2061 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2228 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2062 2229
2063 for_each_cpu(cpu, rcu_nocb_mask) 2230 for_each_cpu(cpu, rcu_nocb_mask) {
2064 init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); 2231 rdp = per_cpu_ptr(&rcu_data, cpu);
2232 if (rcu_segcblist_empty(&rdp->cblist))
2233 rcu_segcblist_init(&rdp->cblist);
2234 rcu_segcblist_offload(&rdp->cblist);
2235 }
2065 rcu_organize_nocb_kthreads(); 2236 rcu_organize_nocb_kthreads();
2066} 2237}
2067 2238
2068/* Initialize per-rcu_data variables for no-CBs CPUs. */ 2239/* Initialize per-rcu_data variables for no-CBs CPUs. */
2069static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2240static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2070{ 2241{
2071 rdp->nocb_tail = &rdp->nocb_head; 2242 init_swait_queue_head(&rdp->nocb_cb_wq);
2072 init_swait_queue_head(&rdp->nocb_wq); 2243 init_swait_queue_head(&rdp->nocb_gp_wq);
2073 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2074 raw_spin_lock_init(&rdp->nocb_lock); 2244 raw_spin_lock_init(&rdp->nocb_lock);
2245 raw_spin_lock_init(&rdp->nocb_bypass_lock);
2246 raw_spin_lock_init(&rdp->nocb_gp_lock);
2075 timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); 2247 timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
2248 timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
2249 rcu_cblist_init(&rdp->nocb_bypass);
2076} 2250}
2077 2251
2078/* 2252/*
2079 * If the specified CPU is a no-CBs CPU that does not already have its 2253 * If the specified CPU is a no-CBs CPU that does not already have its
2080 * rcuo kthread, spawn it. If the CPUs are brought online out of order, 2254 * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
2081 * this can require re-organizing the leader-follower relationships. 2255 * for this CPU's group has not yet been created, spawn it as well.
2082 */ 2256 */
2083static void rcu_spawn_one_nocb_kthread(int cpu) 2257static void rcu_spawn_one_nocb_kthread(int cpu)
2084{ 2258{
2085 struct rcu_data *rdp; 2259 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2086 struct rcu_data *rdp_last; 2260 struct rcu_data *rdp_gp;
2087 struct rcu_data *rdp_old_leader;
2088 struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu);
2089 struct task_struct *t; 2261 struct task_struct *t;
2090 2262
2091 /* 2263 /*
2092 * If this isn't a no-CBs CPU or if it already has an rcuo kthread, 2264 * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2093 * then nothing to do. 2265 * then nothing to do.
2094 */ 2266 */
2095 if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) 2267 if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
2096 return; 2268 return;
2097 2269
2098 /* If we didn't spawn the leader first, reorganize! */ 2270 /* If we didn't spawn the GP kthread first, reorganize! */
2099 rdp_old_leader = rdp_spawn->nocb_leader; 2271 rdp_gp = rdp->nocb_gp_rdp;
2100 if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { 2272 if (!rdp_gp->nocb_gp_kthread) {
2101 rdp_last = NULL; 2273 t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
2102 rdp = rdp_old_leader; 2274 "rcuog/%d", rdp_gp->cpu);
2103 do { 2275 if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
2104 rdp->nocb_leader = rdp_spawn; 2276 return;
2105 if (rdp_last && rdp != rdp_spawn) 2277 WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
2106 rdp_last->nocb_next_follower = rdp;
2107 if (rdp == rdp_spawn) {
2108 rdp = rdp->nocb_next_follower;
2109 } else {
2110 rdp_last = rdp;
2111 rdp = rdp->nocb_next_follower;
2112 rdp_last->nocb_next_follower = NULL;
2113 }
2114 } while (rdp);
2115 rdp_spawn->nocb_next_follower = rdp_old_leader;
2116 } 2278 }
2117 2279
2118 /* Spawn the kthread for this CPU. */ 2280 /* Spawn the kthread for this CPU. */
2119 t = kthread_run(rcu_nocb_kthread, rdp_spawn, 2281 t = kthread_run(rcu_nocb_cb_kthread, rdp,
2120 "rcuo%c/%d", rcu_state.abbr, cpu); 2282 "rcuo%c/%d", rcu_state.abbr, cpu);
2121 if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) 2283 if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
2122 return; 2284 return;
2123 WRITE_ONCE(rdp_spawn->nocb_kthread, t); 2285 WRITE_ONCE(rdp->nocb_cb_kthread, t);
2286 WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
2124} 2287}
2125 2288
2126/* 2289/*
@@ -2147,27 +2310,28 @@ static void __init rcu_spawn_nocb_kthreads(void)
2147 rcu_spawn_cpu_nocb_kthread(cpu); 2310 rcu_spawn_cpu_nocb_kthread(cpu);
2148} 2311}
2149 2312
2150/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ 2313/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
2151static int rcu_nocb_leader_stride = -1; 2314static int rcu_nocb_gp_stride = -1;
2152module_param(rcu_nocb_leader_stride, int, 0444); 2315module_param(rcu_nocb_gp_stride, int, 0444);
2153 2316
2154/* 2317/*
2155 * Initialize leader-follower relationships for all no-CBs CPU. 2318 * Initialize GP-CB relationships for all no-CBs CPUs.
2156 */ 2319 */
2157static void __init rcu_organize_nocb_kthreads(void) 2320static void __init rcu_organize_nocb_kthreads(void)
2158{ 2321{
2159 int cpu; 2322 int cpu;
2160 int ls = rcu_nocb_leader_stride; 2323 bool firsttime = true;
2161 int nl = 0; /* Next leader. */ 2324 int ls = rcu_nocb_gp_stride;
2325 int nl = 0; /* Next GP kthread. */
2162 struct rcu_data *rdp; 2326 struct rcu_data *rdp;
2163 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ 2327 struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
2164 struct rcu_data *rdp_prev = NULL; 2328 struct rcu_data *rdp_prev = NULL;
2165 2329
2166 if (!cpumask_available(rcu_nocb_mask)) 2330 if (!cpumask_available(rcu_nocb_mask))
2167 return; 2331 return;
2168 if (ls == -1) { 2332 if (ls == -1) {
2169 ls = int_sqrt(nr_cpu_ids); 2333 ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
2170 rcu_nocb_leader_stride = ls; 2334 rcu_nocb_gp_stride = ls;
2171 } 2335 }
2172 2336
2173 /* 2337 /*
@@ -2178,39 +2342,24 @@ static void __init rcu_organize_nocb_kthreads(void)
2178 for_each_cpu(cpu, rcu_nocb_mask) { 2342 for_each_cpu(cpu, rcu_nocb_mask) {
2179 rdp = per_cpu_ptr(&rcu_data, cpu); 2343 rdp = per_cpu_ptr(&rcu_data, cpu);
2180 if (rdp->cpu >= nl) { 2344 if (rdp->cpu >= nl) {
2181 /* New leader, set up for followers & next leader. */ 2345 /* New GP kthread, set up for CBs & next GP. */
2182 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; 2346 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2183 rdp->nocb_leader = rdp; 2347 rdp->nocb_gp_rdp = rdp;
2184 rdp_leader = rdp; 2348 rdp_gp = rdp;
2349 if (!firsttime && dump_tree)
2350 pr_cont("\n");
2351 firsttime = false;
2352 pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu);
2185 } else { 2353 } else {
2186 /* Another follower, link to previous leader. */ 2354 /* Another CB kthread, link to previous GP kthread. */
2187 rdp->nocb_leader = rdp_leader; 2355 rdp->nocb_gp_rdp = rdp_gp;
2188 rdp_prev->nocb_next_follower = rdp; 2356 rdp_prev->nocb_next_cb_rdp = rdp;
2357 pr_alert(" %d", cpu);
2189 } 2358 }
2190 rdp_prev = rdp; 2359 rdp_prev = rdp;
2191 } 2360 }
2192} 2361}
2193 2362
2194/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2195static bool init_nocb_callback_list(struct rcu_data *rdp)
2196{
2197 if (!rcu_is_nocb_cpu(rdp->cpu))
2198 return false;
2199
2200 /* If there are early-boot callbacks, move them to nocb lists. */
2201 if (!rcu_segcblist_empty(&rdp->cblist)) {
2202 rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
2203 rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
2204 atomic_long_set(&rdp->nocb_q_count,
2205 rcu_segcblist_n_cbs(&rdp->cblist));
2206 atomic_long_set(&rdp->nocb_q_count_lazy,
2207 rcu_segcblist_n_lazy_cbs(&rdp->cblist));
2208 rcu_segcblist_init(&rdp->cblist);
2209 }
2210 rcu_segcblist_disable(&rdp->cblist);
2211 return true;
2212}
2213
2214/* 2363/*
2215 * Bind the current task to the offloaded CPUs. If there are no offloaded 2364 * Bind the current task to the offloaded CPUs. If there are no offloaded
2216 * CPUs, leave the task unbound. Splat if the bind attempt fails. 2365 * CPUs, leave the task unbound. Splat if the bind attempt fails.
@@ -2223,20 +2372,101 @@ void rcu_bind_current_to_nocb(void)
2223EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); 2372EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
2224 2373
2225/* 2374/*
2226 * Return the number of RCU callbacks still queued from the specified 2375 * Dump out nocb grace-period kthread state for the specified rcu_data
2227 * CPU, which must be a nocbs CPU. 2376 * structure.
2228 */ 2377 */
2229static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) 2378static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
2230{ 2379{
2231 return atomic_long_read(&rdp->nocb_q_count); 2380 struct rcu_node *rnp = rdp->mynode;
2381
2382 pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
2383 rdp->cpu,
2384 "kK"[!!rdp->nocb_gp_kthread],
2385 "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
2386 "dD"[!!rdp->nocb_defer_wakeup],
2387 "tT"[timer_pending(&rdp->nocb_timer)],
2388 "bB"[timer_pending(&rdp->nocb_bypass_timer)],
2389 "sS"[!!rdp->nocb_gp_sleep],
2390 ".W"[swait_active(&rdp->nocb_gp_wq)],
2391 ".W"[swait_active(&rnp->nocb_gp_wq[0])],
2392 ".W"[swait_active(&rnp->nocb_gp_wq[1])],
2393 ".B"[!!rdp->nocb_gp_bypass],
2394 ".G"[!!rdp->nocb_gp_gp],
2395 (long)rdp->nocb_gp_seq,
2396 rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
2397}
2398
2399/* Dump out nocb kthread state for the specified rcu_data structure. */
2400static void show_rcu_nocb_state(struct rcu_data *rdp)
2401{
2402 struct rcu_segcblist *rsclp = &rdp->cblist;
2403 bool waslocked;
2404 bool wastimer;
2405 bool wassleep;
2406
2407 if (rdp->nocb_gp_rdp == rdp)
2408 show_rcu_nocb_gp_state(rdp);
2409
2410 pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
2411 rdp->cpu, rdp->nocb_gp_rdp->cpu,
2412 "kK"[!!rdp->nocb_cb_kthread],
2413 "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
2414 "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
2415 "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
2416 "sS"[!!rdp->nocb_cb_sleep],
2417 ".W"[swait_active(&rdp->nocb_cb_wq)],
2418 jiffies - rdp->nocb_bypass_first,
2419 jiffies - rdp->nocb_nobypass_last,
2420 rdp->nocb_nobypass_count,
2421 ".D"[rcu_segcblist_ready_cbs(rsclp)],
2422 ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
2423 ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
2424 ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
2425 ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
2426 rcu_segcblist_n_cbs(&rdp->cblist));
2427
2428 /* It is OK for GP kthreads to have GP state. */
2429 if (rdp->nocb_gp_rdp == rdp)
2430 return;
2431
2432 waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
2433 wastimer = timer_pending(&rdp->nocb_timer);
2434 wassleep = swait_active(&rdp->nocb_gp_wq);
2435 if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep &&
2436 !waslocked && !wastimer && !wassleep)
2437 return; /* Nothing untoward. */
2438
2439 pr_info(" !!! %c%c%c%c %c\n",
2440 "lL"[waslocked],
2441 "dD"[!!rdp->nocb_defer_wakeup],
2442 "tT"[wastimer],
2443 "sS"[!!rdp->nocb_gp_sleep],
2444 ".W"[wassleep]);
2232} 2445}
2233 2446
2234#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2447#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2235 2448
2236static bool rcu_nocb_cpu_needs_barrier(int cpu) 2449/* No ->nocb_lock to acquire. */
2450static void rcu_nocb_lock(struct rcu_data *rdp)
2451{
2452}
2453
2454/* No ->nocb_lock to release. */
2455static void rcu_nocb_unlock(struct rcu_data *rdp)
2237{ 2456{
2238 WARN_ON_ONCE(1); /* Should be dead code. */ 2457}
2239 return false; 2458
2459/* No ->nocb_lock to release. */
2460static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
2461 unsigned long flags)
2462{
2463 local_irq_restore(flags);
2464}
2465
2466/* Lockdep check that ->cblist may be safely accessed. */
2467static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
2468{
2469 lockdep_assert_irqs_disabled();
2240} 2470}
2241 2471
2242static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 2472static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
@@ -2252,19 +2482,24 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2252{ 2482{
2253} 2483}
2254 2484
2255static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2485static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
2256 bool lazy, unsigned long flags) 2486 unsigned long j)
2257{ 2487{
2258 return false; 2488 return true;
2259} 2489}
2260 2490
2261static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, 2491static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
2262 struct rcu_data *rdp, 2492 bool *was_alldone, unsigned long flags)
2263 unsigned long flags)
2264{ 2493{
2265 return false; 2494 return false;
2266} 2495}
2267 2496
2497static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
2498 unsigned long flags)
2499{
2500 WARN_ON_ONCE(1); /* Should be dead code! */
2501}
2502
2268static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2503static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2269{ 2504{
2270} 2505}
@@ -2286,14 +2521,8 @@ static void __init rcu_spawn_nocb_kthreads(void)
2286{ 2521{
2287} 2522}
2288 2523
2289static bool init_nocb_callback_list(struct rcu_data *rdp) 2524static void show_rcu_nocb_state(struct rcu_data *rdp)
2290{
2291 return false;
2292}
2293
2294static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
2295{ 2525{
2296 return 0;
2297} 2526}
2298 2527
2299#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2528#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
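
For reference, a minimal user-space sketch (not kernel code) of how rcu_organize_nocb_kthreads() above groups no-CBs CPUs under rcuog GP kthreads with the new default stride of nr_cpu_ids / int_sqrt(nr_cpu_ids). The CPU count and the printed kthread names are illustrative assumptions only:

#include <math.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        int nr_cpu_ids = 16;                            /* hypothetical CPU count */
        int ls = nr_cpu_ids / (int)sqrt(nr_cpu_ids);    /* default stride */
        int nl = 0;                                     /* first CPU of next GP group */
        int cpu;

        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
                if (cpu >= nl) {        /* this CPU's rdp also hosts the GP kthread */
                        nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
                        printf("\nrcuog/%d handles callbacks for CPUs:", cpu);
                }
                printf(" %d", cpu);     /* each CPU still gets its own rcuo CB kthread */
        }
        printf("\n");
        return 0;
}

With 16 CPUs this yields four groups of four, mirroring the "New GP kthread" / "Another CB kthread" branches in the loop above.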
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 065183391f75..841ab43f3e60 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
527 527
528 /* We haven't checked in, so go dump stack. */ 528 /* We haven't checked in, so go dump stack. */
529 print_cpu_stall(); 529 print_cpu_stall();
530 if (rcu_cpu_stall_ftrace_dump)
531 rcu_ftrace_dump(DUMP_ALL);
530 532
531 } else if (rcu_gp_in_progress() && 533 } else if (rcu_gp_in_progress() &&
532 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && 534 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
@@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
534 536
535 /* They had a few time units to dump stack, so complain. */ 537 /* They had a few time units to dump stack, so complain. */
536 print_other_cpu_stall(gs2); 538 print_other_cpu_stall(gs2);
539 if (rcu_cpu_stall_ftrace_dump)
540 rcu_ftrace_dump(DUMP_ALL);
537 } 541 }
538} 542}
539 543
@@ -585,6 +589,11 @@ void show_rcu_gp_kthreads(void)
585 cpu, (long)rdp->gp_seq_needed); 589 cpu, (long)rdp->gp_seq_needed);
586 } 590 }
587 } 591 }
592 for_each_possible_cpu(cpu) {
593 rdp = per_cpu_ptr(&rcu_data, cpu);
594 if (rcu_segcblist_is_offloaded(&rdp->cblist))
595 show_rcu_nocb_state(rdp);
596 }
588 /* sched_show_task(rcu_state.gp_kthread); */ 597 /* sched_show_task(rcu_state.gp_kthread); */
589} 598}
590EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); 599EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 61df2bf08563..1861103662db 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0);
61 61
62#ifdef CONFIG_DEBUG_LOCK_ALLOC 62#ifdef CONFIG_DEBUG_LOCK_ALLOC
63/** 63/**
64 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? 64 * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section?
65 * @ret: Best guess answer if lockdep cannot be relied on
65 * 66 *
66 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an 67 * Returns true if lockdep must be ignored, in which case *ret contains
68 * the best guess described below. Otherwise returns false, in which
69 * case *ret tells the caller nothing and the caller should instead
70 * consult lockdep.
71 *
72 * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an
67 * RCU-sched read-side critical section. In absence of 73 * RCU-sched read-side critical section. In absence of
68 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side 74 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
69 * critical section unless it can prove otherwise. Note that disabling 75 * critical section unless it can prove otherwise. Note that disabling
@@ -75,35 +81,45 @@ module_param(rcu_normal_after_boot, int, 0);
75 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot 81 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
76 * and while lockdep is disabled. 82 * and while lockdep is disabled.
77 * 83 *
78 * Note that if the CPU is in the idle loop from an RCU point of 84 * Note that if the CPU is in the idle loop from an RCU point of view (ie:
79 * view (ie: that we are in the section between rcu_idle_enter() and 85 * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
80 * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU 86 * then rcu_read_lock_held() sets *ret to false even if the CPU did an
81 * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs 87 * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
82 * that are in such a section, considering these as in extended quiescent 88 * in such a section, considering these as in extended quiescent state,
83 * state, so such a CPU is effectively never in an RCU read-side critical 89 * so such a CPU is effectively never in an RCU read-side critical section
84 * section regardless of what RCU primitives it invokes. This state of 90 * regardless of what RCU primitives it invokes. This state of affairs is
85 * affairs is required --- we need to keep an RCU-free window in idle 91 * required --- we need to keep an RCU-free window in idle where the CPU may
86 * where the CPU may possibly enter into low power mode. This way we can 92 * possibly enter into low power mode. This way we can notice an extended
87 * notice an extended quiescent state to other CPUs that started a grace 93 * quiescent state to other CPUs that started a grace period. Otherwise
88 * period. Otherwise we would delay any grace period as long as we run in 94 * we would delay any grace period as long as we run in the idle task.
89 * the idle task.
90 * 95 *
91 * Similarly, we avoid claiming an SRCU read lock held if the current 96 * Similarly, we avoid claiming an RCU read lock held if the current
92 * CPU is offline. 97 * CPU is offline.
93 */ 98 */
99static bool rcu_read_lock_held_common(bool *ret)
100{
101 if (!debug_lockdep_rcu_enabled()) {
102 *ret = 1;
103 return true;
104 }
105 if (!rcu_is_watching()) {
106 *ret = 0;
107 return true;
108 }
109 if (!rcu_lockdep_current_cpu_online()) {
110 *ret = 0;
111 return true;
112 }
113 return false;
114}
115
94int rcu_read_lock_sched_held(void) 116int rcu_read_lock_sched_held(void)
95{ 117{
96 int lockdep_opinion = 0; 118 bool ret;
97 119
98 if (!debug_lockdep_rcu_enabled()) 120 if (rcu_read_lock_held_common(&ret))
99 return 1; 121 return ret;
100 if (!rcu_is_watching()) 122 return lock_is_held(&rcu_sched_lock_map) || !preemptible();
101 return 0;
102 if (!rcu_lockdep_current_cpu_online())
103 return 0;
104 if (debug_locks)
105 lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
106 return lockdep_opinion || !preemptible();
107} 123}
108EXPORT_SYMBOL(rcu_read_lock_sched_held); 124EXPORT_SYMBOL(rcu_read_lock_sched_held);
109#endif 125#endif
@@ -136,8 +152,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
136 */ 152 */
137bool rcu_gp_is_expedited(void) 153bool rcu_gp_is_expedited(void)
138{ 154{
139 return rcu_expedited || atomic_read(&rcu_expedited_nesting) || 155 return rcu_expedited || atomic_read(&rcu_expedited_nesting);
140 rcu_scheduler_active == RCU_SCHEDULER_INIT;
141} 156}
142EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); 157EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
143 158
@@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
261 */ 276 */
262int rcu_read_lock_held(void) 277int rcu_read_lock_held(void)
263{ 278{
264 if (!debug_lockdep_rcu_enabled()) 279 bool ret;
265 return 1; 280
266 if (!rcu_is_watching()) 281 if (rcu_read_lock_held_common(&ret))
267 return 0; 282 return ret;
268 if (!rcu_lockdep_current_cpu_online())
269 return 0;
270 return lock_is_held(&rcu_lock_map); 283 return lock_is_held(&rcu_lock_map);
271} 284}
272EXPORT_SYMBOL_GPL(rcu_read_lock_held); 285EXPORT_SYMBOL_GPL(rcu_read_lock_held);
@@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
288 */ 301 */
289int rcu_read_lock_bh_held(void) 302int rcu_read_lock_bh_held(void)
290{ 303{
291 if (!debug_lockdep_rcu_enabled()) 304 bool ret;
292 return 1; 305
293 if (!rcu_is_watching()) 306 if (rcu_read_lock_held_common(&ret))
294 return 0; 307 return ret;
295 if (!rcu_lockdep_current_cpu_online())
296 return 0;
297 return in_softirq() || irqs_disabled(); 308 return in_softirq() || irqs_disabled();
298} 309}
299EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 310EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
300 311
312int rcu_read_lock_any_held(void)
313{
314 bool ret;
315
316 if (rcu_read_lock_held_common(&ret))
317 return ret;
318 if (lock_is_held(&rcu_lock_map) ||
319 lock_is_held(&rcu_bh_lock_map) ||
320 lock_is_held(&rcu_sched_lock_map))
321 return 1;
322 return !preemptible();
323}
324EXPORT_SYMBOL_GPL(rcu_read_lock_any_held);
325
301#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 326#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
302 327
303/** 328/**
@@ -437,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
437#endif 462#endif
438 463
439#ifdef CONFIG_RCU_STALL_COMMON 464#ifdef CONFIG_RCU_STALL_COMMON
465int rcu_cpu_stall_ftrace_dump __read_mostly;
466module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
440int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 467int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
441EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); 468EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
442module_param(rcu_cpu_stall_suppress, int, 0644); 469module_param(rcu_cpu_stall_suppress, int, 0644);
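
The update.c consolidation above follows a simple helper pattern: the common checks either decide the answer (filling *ret and returning true) or tell the caller to fall back to its own lock map. A stand-alone sketch with stand-in predicates, purely illustrative and not the kernel's implementation:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for debug_lockdep_rcu_enabled(), rcu_is_watching() and
 * rcu_lockdep_current_cpu_online(); flip them to explore the branches. */
static bool lockdep_enabled = true;
static bool cpu_watching = true;
static bool cpu_online = true;

/* Either decide the answer (return true, fill *ret) or defer to the caller. */
static bool read_lock_held_common(bool *ret)
{
        if (!lockdep_enabled) {
                *ret = true;
                return true;
        }
        if (!cpu_watching || !cpu_online) {
                *ret = false;
                return true;
        }
        return false;
}

/* Shape of rcu_read_lock_held() and friends after the consolidation. */
static int read_lock_held(bool map_says_held)
{
        bool ret;

        if (read_lock_held_common(&ret))
                return ret;
        return map_says_held;   /* e.g. lock_is_held(&rcu_lock_map) */
}

int main(void)
{
        printf("held: %d\n", read_lock_held(true));
        return 0;
}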
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..7fa8e74ad2ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3486,8 +3486,36 @@ void scheduler_tick(void)
3486 3486
3487struct tick_work { 3487struct tick_work {
3488 int cpu; 3488 int cpu;
3489 atomic_t state;
3489 struct delayed_work work; 3490 struct delayed_work work;
3490}; 3491};
3492/* Values for ->state, see diagram below. */
3493#define TICK_SCHED_REMOTE_OFFLINE 0
3494#define TICK_SCHED_REMOTE_OFFLINING 1
3495#define TICK_SCHED_REMOTE_RUNNING 2
3496
3497/*
3498 * State diagram for ->state:
3499 *
3500 *
3501 * TICK_SCHED_REMOTE_OFFLINE
3502 * | ^
3503 * | |
3504 * | | sched_tick_remote()
3505 * | |
3506 * | |
3507 * +--TICK_SCHED_REMOTE_OFFLINING
3508 * | ^
3509 * | |
3510 * sched_tick_start() | | sched_tick_stop()
3511 * | |
3512 * V |
3513 * TICK_SCHED_REMOTE_RUNNING
3514 *
3515 *
3516 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
3517 * and sched_tick_start() are happy to leave the state in RUNNING.
3518 */
3491 3519
3492static struct tick_work __percpu *tick_work_cpu; 3520static struct tick_work __percpu *tick_work_cpu;
3493 3521
@@ -3500,6 +3528,7 @@ static void sched_tick_remote(struct work_struct *work)
3500 struct task_struct *curr; 3528 struct task_struct *curr;
3501 struct rq_flags rf; 3529 struct rq_flags rf;
3502 u64 delta; 3530 u64 delta;
3531 int os;
3503 3532
3504 /* 3533 /*
3505 * Handle the tick only if it appears the remote CPU is running in full 3534 * Handle the tick only if it appears the remote CPU is running in full
@@ -3513,7 +3542,7 @@ static void sched_tick_remote(struct work_struct *work)
3513 3542
3514 rq_lock_irq(rq, &rf); 3543 rq_lock_irq(rq, &rf);
3515 curr = rq->curr; 3544 curr = rq->curr;
3516 if (is_idle_task(curr)) 3545 if (is_idle_task(curr) || cpu_is_offline(cpu))
3517 goto out_unlock; 3546 goto out_unlock;
3518 3547
3519 update_rq_clock(rq); 3548 update_rq_clock(rq);
@@ -3533,13 +3562,18 @@ out_requeue:
3533 /* 3562 /*
3534 * Run the remote tick once per second (1Hz). This arbitrary 3563 * Run the remote tick once per second (1Hz). This arbitrary
3535 * frequency is large enough to avoid overload but short enough 3564 * frequency is large enough to avoid overload but short enough
3536 * to keep scheduler internal stats reasonably up to date. 3565 * to keep scheduler internal stats reasonably up to date. But
3566 * first update state to reflect hotplug activity if required.
3537 */ 3567 */
3538 queue_delayed_work(system_unbound_wq, dwork, HZ); 3568 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
3569 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
3570 if (os == TICK_SCHED_REMOTE_RUNNING)
3571 queue_delayed_work(system_unbound_wq, dwork, HZ);
3539} 3572}
3540 3573
3541static void sched_tick_start(int cpu) 3574static void sched_tick_start(int cpu)
3542{ 3575{
3576 int os;
3543 struct tick_work *twork; 3577 struct tick_work *twork;
3544 3578
3545 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) 3579 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3548,15 +3582,20 @@ static void sched_tick_start(int cpu)
3548 WARN_ON_ONCE(!tick_work_cpu); 3582 WARN_ON_ONCE(!tick_work_cpu);
3549 3583
3550 twork = per_cpu_ptr(tick_work_cpu, cpu); 3584 twork = per_cpu_ptr(tick_work_cpu, cpu);
3551 twork->cpu = cpu; 3585 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
3552 INIT_DELAYED_WORK(&twork->work, sched_tick_remote); 3586 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
3553 queue_delayed_work(system_unbound_wq, &twork->work, HZ); 3587 if (os == TICK_SCHED_REMOTE_OFFLINE) {
3588 twork->cpu = cpu;
3589 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3590 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3591 }
3554} 3592}
3555 3593
3556#ifdef CONFIG_HOTPLUG_CPU 3594#ifdef CONFIG_HOTPLUG_CPU
3557static void sched_tick_stop(int cpu) 3595static void sched_tick_stop(int cpu)
3558{ 3596{
3559 struct tick_work *twork; 3597 struct tick_work *twork;
3598 int os;
3560 3599
3561 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) 3600 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3562 return; 3601 return;
@@ -3564,7 +3603,10 @@ static void sched_tick_stop(int cpu)
3564 WARN_ON_ONCE(!tick_work_cpu); 3603 WARN_ON_ONCE(!tick_work_cpu);
3565 3604
3566 twork = per_cpu_ptr(tick_work_cpu, cpu); 3605 twork = per_cpu_ptr(tick_work_cpu, cpu);
3567 cancel_delayed_work_sync(&twork->work); 3606 /* There cannot be competing actions, but don't rely on stop-machine. */
3607 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
3608 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
3609 /* Don't cancel, as this would mess up the state machine. */
3568} 3610}
3569#endif /* CONFIG_HOTPLUG_CPU */ 3611#endif /* CONFIG_HOTPLUG_CPU */
3570 3612
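
A self-contained C11 sketch of the three-state machine drawn in the comment above; atomic_int plus a hand-rolled fetch_add_unless() stand in for the kernel's atomic_t helpers, and the queued work is reduced to a printf. This is an illustration of the transitions, not the scheduler code itself:

#include <stdatomic.h>
#include <stdio.h>

enum { TICK_OFFLINE, TICK_OFFLINING, TICK_RUNNING };

static atomic_int state = TICK_OFFLINE;

/* Add @a to @v unless it equals @u; return the old value (kernel semantics). */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
        int c = atomic_load(v);

        while (c != u && !atomic_compare_exchange_weak(v, &c, c + a))
                ;
        return c;
}

static void tick_start(void)
{
        int os = atomic_exchange(&state, TICK_RUNNING);

        if (os == TICK_OFFLINE)
                printf("queue the remote tick work\n");
        /* os == TICK_OFFLINING: the work is still queued, nothing to do */
}

static void tick_remote(void)
{
        /* Step OFFLINING back to OFFLINE; keep re-queueing while RUNNING. */
        int os = fetch_add_unless(&state, -1, TICK_RUNNING);

        if (os == TICK_RUNNING)
                printf("re-queue the remote tick work\n");
}

static void tick_stop(void)
{
        /* Don't cancel the work; just ask it to stop re-queueing itself. */
        atomic_exchange(&state, TICK_OFFLINING);
}

int main(void)
{
        tick_start();   /* OFFLINE -> RUNNING, queue the work */
        tick_remote();  /* RUNNING: work re-queues itself */
        tick_stop();    /* RUNNING -> OFFLINING */
        tick_remote();  /* OFFLINING -> OFFLINE, no re-queue */
        tick_start();   /* OFFLINE -> RUNNING again */
        return 0;
}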
@@ -3572,7 +3614,6 @@ int __init sched_tick_offload_init(void)
3572{ 3614{
3573 tick_work_cpu = alloc_percpu(struct tick_work); 3615 tick_work_cpu = alloc_percpu(struct tick_work);
3574 BUG_ON(!tick_work_cpu); 3616 BUG_ON(!tick_work_cpu);
3575
3576 return 0; 3617 return 0;
3577} 3618}
3578 3619
@@ -3904,7 +3945,7 @@ void __noreturn do_task_dead(void)
3904 3945
3905static inline void sched_submit_work(struct task_struct *tsk) 3946static inline void sched_submit_work(struct task_struct *tsk)
3906{ 3947{
3907 if (!tsk->state || tsk_is_pi_blocked(tsk)) 3948 if (!tsk->state)
3908 return; 3949 return;
3909 3950
3910 /* 3951 /*
@@ -3920,6 +3961,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
3920 preempt_enable_no_resched(); 3961 preempt_enable_no_resched();
3921 } 3962 }
3922 3963
3964 if (tsk_is_pi_blocked(tsk))
3965 return;
3966
3923 /* 3967 /*
3924 * If we are going to sleep and we have plugged IO queued, 3968 * If we are going to sleep and we have plugged IO queued,
3925 * make sure to submit it to avoid deadlocks. 3969 * make sure to submit it to avoid deadlocks.
@@ -5102,37 +5146,40 @@ out_unlock:
5102 return retval; 5146 return retval;
5103} 5147}
5104 5148
5105static int sched_read_attr(struct sched_attr __user *uattr, 5149/*
5106 struct sched_attr *attr, 5150 * Copy the kernel size attribute structure (which might be larger
5107 unsigned int usize) 5151 * than what user-space knows about) to user-space.
5152 *
5153 * Note that all cases are valid: user-space buffer can be larger or
5154 * smaller than the kernel-space buffer. The usual case is that both
5155 * have the same size.
5156 */
5157static int
5158sched_attr_copy_to_user(struct sched_attr __user *uattr,
5159 struct sched_attr *kattr,
5160 unsigned int usize)
5108{ 5161{
5109 int ret; 5162 unsigned int ksize = sizeof(*kattr);
5110 5163
5111 if (!access_ok(uattr, usize)) 5164 if (!access_ok(uattr, usize))
5112 return -EFAULT; 5165 return -EFAULT;
5113 5166
5114 /* 5167 /*
5115 * If we're handed a smaller struct than we know of, 5168 * sched_getattr() ABI forwards and backwards compatibility:
5116 * ensure all the unknown bits are 0 - i.e. old 5169 *
5117 * user-space does not get uncomplete information. 5170 * If usize == ksize then we just copy everything to user-space and all is good.
5171 *
5172 * If usize < ksize then we only copy as much as user-space has space for,
5173 * this keeps ABI compatibility as well. We skip the rest.
5174 *
5175 * If usize > ksize then user-space is using a newer version of the ABI,
5176 * which part the kernel doesn't know about. Just ignore it - tooling can
5177 * detect the kernel's knowledge of attributes from the attr->size value
5178 * which is set to ksize in this case.
5118 */ 5179 */
5119 if (usize < sizeof(*attr)) { 5180 kattr->size = min(usize, ksize);
5120 unsigned char *addr;
5121 unsigned char *end;
5122 5181
5123 addr = (void *)attr + usize; 5182 if (copy_to_user(uattr, kattr, kattr->size))
5124 end = (void *)attr + sizeof(*attr);
5125
5126 for (; addr < end; addr++) {
5127 if (*addr)
5128 return -EFBIG;
5129 }
5130
5131 attr->size = usize;
5132 }
5133
5134 ret = copy_to_user(uattr, attr, attr->size);
5135 if (ret)
5136 return -EFAULT; 5183 return -EFAULT;
5137 5184
5138 return 0; 5185 return 0;
@@ -5142,20 +5189,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
5142 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 5189 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
5143 * @pid: the pid in question. 5190 * @pid: the pid in question.
5144 * @uattr: structure containing the extended parameters. 5191 * @uattr: structure containing the extended parameters.
5145 * @size: sizeof(attr) for fwd/bwd comp. 5192 * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
5146 * @flags: for future extension. 5193 * @flags: for future extension.
5147 */ 5194 */
5148SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 5195SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
5149 unsigned int, size, unsigned int, flags) 5196 unsigned int, usize, unsigned int, flags)
5150{ 5197{
5151 struct sched_attr attr = { 5198 struct sched_attr kattr = { };
5152 .size = sizeof(struct sched_attr),
5153 };
5154 struct task_struct *p; 5199 struct task_struct *p;
5155 int retval; 5200 int retval;
5156 5201
5157 if (!uattr || pid < 0 || size > PAGE_SIZE || 5202 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
5158 size < SCHED_ATTR_SIZE_VER0 || flags) 5203 usize < SCHED_ATTR_SIZE_VER0 || flags)
5159 return -EINVAL; 5204 return -EINVAL;
5160 5205
5161 rcu_read_lock(); 5206 rcu_read_lock();
@@ -5168,25 +5213,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
5168 if (retval) 5213 if (retval)
5169 goto out_unlock; 5214 goto out_unlock;
5170 5215
5171 attr.sched_policy = p->policy; 5216 kattr.sched_policy = p->policy;
5172 if (p->sched_reset_on_fork) 5217 if (p->sched_reset_on_fork)
5173 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 5218 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5174 if (task_has_dl_policy(p)) 5219 if (task_has_dl_policy(p))
5175 __getparam_dl(p, &attr); 5220 __getparam_dl(p, &kattr);
5176 else if (task_has_rt_policy(p)) 5221 else if (task_has_rt_policy(p))
5177 attr.sched_priority = p->rt_priority; 5222 kattr.sched_priority = p->rt_priority;
5178 else 5223 else
5179 attr.sched_nice = task_nice(p); 5224 kattr.sched_nice = task_nice(p);
5180 5225
5181#ifdef CONFIG_UCLAMP_TASK 5226#ifdef CONFIG_UCLAMP_TASK
5182 attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; 5227 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5183 attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; 5228 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5184#endif 5229#endif
5185 5230
5186 rcu_read_unlock(); 5231 rcu_read_unlock();
5187 5232
5188 retval = sched_read_attr(uattr, &attr, size); 5233 return sched_attr_copy_to_user(uattr, &kattr, usize);
5189 return retval;
5190 5234
5191out_unlock: 5235out_unlock:
5192 rcu_read_unlock(); 5236 rcu_read_unlock();
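
A user-space toy of the min(usize, ksize) rule that sched_attr_copy_to_user() implements above: older callers get a truncated copy, newer callers learn the kernel's size from the reported ->size. The struct layout and field names here are invented for illustration, not the real sched_attr:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct kattr {                  /* pretend kernel-side structure */
        uint32_t size;
        uint32_t policy;
        uint64_t flags;
        uint32_t util_min;      /* newer field old user-space doesn't know */
        uint32_t util_max;
};

/* Copy the kernel view into a user buffer of 'usize' bytes. */
static int attr_copy_to_user(void *ubuf, unsigned int usize, struct kattr *k)
{
        unsigned int ksize = sizeof(*k);

        k->size = usize < ksize ? usize : ksize;        /* min(usize, ksize) */
        memcpy(ubuf, k, k->size);                       /* copy_to_user() stand-in */
        return 0;
}

int main(void)
{
        struct kattr k = { .policy = 0, .util_min = 10, .util_max = 90 };
        unsigned char old_abi[16], new_abi[sizeof(struct kattr) + 8];

        attr_copy_to_user(old_abi, sizeof(old_abi), &k);        /* older user-space */
        printf("old ABI got %u bytes\n", (unsigned int)k.size);
        attr_copy_to_user(new_abi, sizeof(new_abi), &k);        /* newer user-space */
        printf("new ABI got %u bytes\n", (unsigned int)k.size);
        return 0;
}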
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc9cfeaac8bd..500f5db0de0b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4470,6 +4470,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4470 if (likely(cfs_rq->runtime_remaining > 0)) 4470 if (likely(cfs_rq->runtime_remaining > 0))
4471 return; 4471 return;
4472 4472
4473 if (cfs_rq->throttled)
4474 return;
4473 /* 4475 /*
4474 * if we're unable to extend our runtime we resched so that the active 4476 * if we're unable to extend our runtime we resched so that the active
4475 * hierarchy can be throttled 4477 * hierarchy can be throttled
@@ -4673,6 +4675,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4673 if (!cfs_rq_throttled(cfs_rq)) 4675 if (!cfs_rq_throttled(cfs_rq))
4674 goto next; 4676 goto next;
4675 4677
4678 /* By the above check, this should never be true */
4679 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4680
4676 runtime = -cfs_rq->runtime_remaining + 1; 4681 runtime = -cfs_rq->runtime_remaining + 1;
4677 if (runtime > remaining) 4682 if (runtime > remaining)
4678 runtime = remaining; 4683 runtime = remaining;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..e4bc4aa739b8 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -241,13 +241,14 @@ static void do_idle(void)
241 check_pgt_cache(); 241 check_pgt_cache();
242 rmb(); 242 rmb();
243 243
244 local_irq_disable();
245
244 if (cpu_is_offline(cpu)) { 246 if (cpu_is_offline(cpu)) {
245 tick_nohz_idle_stop_tick_protected(); 247 tick_nohz_idle_stop_tick();
246 cpuhp_report_idle_dead(); 248 cpuhp_report_idle_dead();
247 arch_cpu_idle_dead(); 249 arch_cpu_idle_dead();
248 } 250 }
249 251
250 local_irq_disable();
251 arch_cpu_idle_enter(); 252 arch_cpu_idle_enter();
252 253
253 /* 254 /*
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 23fbbcc414d5..6e52b67b420e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref)
1131 * deadlock while waiting for psi_poll_work to acquire trigger_lock 1131 * deadlock while waiting for psi_poll_work to acquire trigger_lock
1132 */ 1132 */
1133 if (kworker_to_destroy) { 1133 if (kworker_to_destroy) {
1134 /*
1135 * After the RCU grace period has expired, the worker
1136 * can no longer be found through group->poll_kworker.
1137 * But it might have been already scheduled before
1138 * that - deschedule it cleanly before destroying it.
1139 */
1134 kthread_cancel_delayed_work_sync(&group->poll_work); 1140 kthread_cancel_delayed_work_sync(&group->poll_work);
1141 atomic_set(&group->poll_scheduled, 0);
1142
1135 kthread_destroy_worker(kworker_to_destroy); 1143 kthread_destroy_worker(kworker_to_destroy);
1136 } 1144 }
1137 kfree(t); 1145 kfree(t);
diff --git a/kernel/signal.c b/kernel/signal.c
index e667be6907d7..c4da1ef56fdf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
90 handler == SIG_DFL && !(force && sig_kernel_only(sig))) 90 handler == SIG_DFL && !(force && sig_kernel_only(sig)))
91 return true; 91 return true;
92 92
93 /* Only allow kernel generated signals to this kthread */
94 if (unlikely((t->flags & PF_KTHREAD) &&
95 (handler == SIG_KTHREAD_KERNEL) && !force))
96 return true;
97
93 return sig_handler_ignored(handler, sig); 98 return sig_handler_ignored(handler, sig);
94} 99}
95 100
@@ -3673,8 +3678,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
3673 3678
3674static struct pid *pidfd_to_pid(const struct file *file) 3679static struct pid *pidfd_to_pid(const struct file *file)
3675{ 3680{
3676 if (file->f_op == &pidfd_fops) 3681 struct pid *pid;
3677 return file->private_data; 3682
3683 pid = pidfd_pid(file);
3684 if (!IS_ERR(pid))
3685 return pid;
3678 3686
3679 return tgid_pidfd_to_pid(file); 3687 return tgid_pidfd_to_pid(file);
3680} 3688}
diff --git a/kernel/sys.c b/kernel/sys.c
index 2969304c29fe..ec48396b4943 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -124,6 +124,12 @@
124#ifndef PAC_RESET_KEYS 124#ifndef PAC_RESET_KEYS
125# define PAC_RESET_KEYS(a, b) (-EINVAL) 125# define PAC_RESET_KEYS(a, b) (-EINVAL)
126#endif 126#endif
127#ifndef SET_TAGGED_ADDR_CTRL
128# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
129#endif
130#ifndef GET_TAGGED_ADDR_CTRL
131# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
132#endif
127 133
128/* 134/*
129 * this is where the system-wide overflow UID and GID are defined, for 135 * this is where the system-wide overflow UID and GID are defined, for
@@ -2492,6 +2498,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
2492 return -EINVAL; 2498 return -EINVAL;
2493 error = PAC_RESET_KEYS(me, arg2); 2499 error = PAC_RESET_KEYS(me, arg2);
2494 break; 2500 break;
2501 case PR_SET_TAGGED_ADDR_CTRL:
2502 if (arg3 || arg4 || arg5)
2503 return -EINVAL;
2504 error = SET_TAGGED_ADDR_CTRL(arg2);
2505 break;
2506 case PR_GET_TAGGED_ADDR_CTRL:
2507 if (arg2 || arg3 || arg4 || arg5)
2508 return -EINVAL;
2509 error = GET_TAGGED_ADDR_CTRL();
2510 break;
2495 default: 2511 default:
2496 error = -EINVAL; 2512 error = -EINVAL;
2497 break; 2513 break;
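
The PR_SET/GET_TAGGED_ADDR_CTRL hooks above use the same "default macro" pattern as the other arch-specific prctl()s: generic code supplies an -EINVAL fallback, and an architecture that supports the feature overrides the macro. A stand-alone sketch with an invented name (SET_EXAMPLE_CTRL is not a real kernel hook):

#include <errno.h>
#include <stdio.h>

/* An architecture that supports the feature would define this first, e.g.
 * #define SET_EXAMPLE_CTRL(a) arch_set_example_ctrl(a)                     */

#ifndef SET_EXAMPLE_CTRL
# define SET_EXAMPLE_CTRL(a) (-EINVAL)  /* generic fallback */
#endif

int main(void)
{
        long err = SET_EXAMPLE_CTRL(1);

        printf("SET_EXAMPLE_CTRL -> %ld (%s)\n", err,
               err == -EINVAL ? "not supported in this build" : "ok");
        return 0;
}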
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d911c8470149..ca69290bee2a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -146,6 +146,11 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
146static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) 146static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
147{ 147{
148 tk->offs_boot = ktime_add(tk->offs_boot, delta); 148 tk->offs_boot = ktime_add(tk->offs_boot, delta);
149 /*
150 * Timespec representation for VDSO update to avoid 64bit division
151 * on every update.
152 */
153 tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
149} 154}
150 155
151/* 156/*
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 8cf3596a4ce6..4bc37ac3bb05 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -17,7 +17,7 @@ static inline void update_vdso_data(struct vdso_data *vdata,
17 struct timekeeper *tk) 17 struct timekeeper *tk)
18{ 18{
19 struct vdso_timestamp *vdso_ts; 19 struct vdso_timestamp *vdso_ts;
20 u64 nsec; 20 u64 nsec, sec;
21 21
22 vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; 22 vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
23 vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; 23 vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
@@ -45,23 +45,27 @@ static inline void update_vdso_data(struct vdso_data *vdata,
45 } 45 }
46 vdso_ts->nsec = nsec; 46 vdso_ts->nsec = nsec;
47 47
48 /* CLOCK_MONOTONIC_RAW */ 48 /* Copy MONOTONIC time for BOOTTIME */
49 vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; 49 sec = vdso_ts->sec;
50 vdso_ts->sec = tk->raw_sec; 50 /* Add the boot offset */
51 vdso_ts->nsec = tk->tkr_raw.xtime_nsec; 51 sec += tk->monotonic_to_boot.tv_sec;
52 nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;
52 53
53 /* CLOCK_BOOTTIME */ 54 /* CLOCK_BOOTTIME */
54 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; 55 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
55 vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 56 vdso_ts->sec = sec;
56 nsec = tk->tkr_mono.xtime_nsec; 57
57 nsec += ((u64)(tk->wall_to_monotonic.tv_nsec +
58 ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift);
59 while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { 58 while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
60 nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); 59 nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
61 vdso_ts->sec++; 60 vdso_ts->sec++;
62 } 61 }
63 vdso_ts->nsec = nsec; 62 vdso_ts->nsec = nsec;
64 63
64 /* CLOCK_MONOTONIC_RAW */
65 vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
66 vdso_ts->sec = tk->raw_sec;
67 vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
68
65 /* CLOCK_TAI */ 69 /* CLOCK_TAI */
66 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; 70 vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
67 vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; 71 vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
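
A toy version of the CLOCK_BOOTTIME update above: take the already-computed MONOTONIC time (seconds plus shifted nanoseconds), add the precomputed monotonic_to_boot offset, then normalize the shifted nanoseconds. The shift and the time values are made-up example numbers:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        unsigned int shift = 8;                         /* tkr_mono.shift stand-in */
        uint64_t mono_sec  = 100;
        uint64_t mono_nsec = 900000000ULL << shift;     /* shifted, as in the vdso data */
        uint64_t boot_off_sec  = 4;                     /* monotonic_to_boot.tv_sec */
        uint64_t boot_off_nsec = 300000000ULL;          /* monotonic_to_boot.tv_nsec */

        uint64_t sec  = mono_sec + boot_off_sec;
        uint64_t nsec = mono_nsec + (boot_off_nsec << shift);

        while (nsec >= (NSEC_PER_SEC << shift)) {       /* carry into seconds */
                nsec -= NSEC_PER_SEC << shift;
                sec++;
        }
        printf("CLOCK_BOOTTIME = %llu s + %llu ns\n",
               (unsigned long long)sec,
               (unsigned long long)(nsec >> shift));
        return 0;
}

Here 0.9 s + 0.3 s of shifted nanoseconds carries once, giving 105 s and 200000000 ns, which is what the normalization loop in update_vdso_data() accomplishes without any 64-bit division.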
diff --git a/kernel/torture.c b/kernel/torture.c
index a8d9bdfba7c3..7c13f5558b71 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -263,7 +263,6 @@ static void torture_onoff_cleanup(void)
263 onoff_task = NULL; 263 onoff_task = NULL;
264#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 264#endif /* #ifdef CONFIG_HOTPLUG_CPU */
265} 265}
266EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
267 266
268/* 267/*
269 * Print online/offline testing statistics. 268 * Print online/offline testing statistics.
@@ -449,7 +448,6 @@ static void torture_shuffle_cleanup(void)
449 } 448 }
450 shuffler_task = NULL; 449 shuffler_task = NULL;
451} 450}
452EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
453 451
454/* 452/*
455 * Variables for auto-shutdown. This allows "lights out" torture runs 453 * Variables for auto-shutdown. This allows "lights out" torture runs
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eca34503f178..f9821a3374e9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3095,6 +3095,14 @@ t_probe_next(struct seq_file *m, loff_t *pos)
3095 hnd = &iter->probe_entry->hlist; 3095 hnd = &iter->probe_entry->hlist;
3096 3096
3097 hash = iter->probe->ops.func_hash->filter_hash; 3097 hash = iter->probe->ops.func_hash->filter_hash;
3098
3099 /*
3100 * A probe being registered may temporarily have an empty hash
3101 * and it's at the end of the func_probes list.
3102 */
3103 if (!hash || hash == EMPTY_HASH)
3104 return NULL;
3105
3098 size = 1 << hash->size_bits; 3106 size = 1 << hash->size_bits;
3099 3107
3100 retry: 3108 retry:
@@ -4320,12 +4328,21 @@ register_ftrace_function_probe(char *glob, struct trace_array *tr,
4320 4328
4321 mutex_unlock(&ftrace_lock); 4329 mutex_unlock(&ftrace_lock);
4322 4330
4331 /*
4332 * Note, there's a small window here that the func_hash->filter_hash
4333 * may be NULL or empty. Need to be careful when reading the loop.

4334 */
4323 mutex_lock(&probe->ops.func_hash->regex_lock); 4335 mutex_lock(&probe->ops.func_hash->regex_lock);
4324 4336
4325 orig_hash = &probe->ops.func_hash->filter_hash; 4337 orig_hash = &probe->ops.func_hash->filter_hash;
4326 old_hash = *orig_hash; 4338 old_hash = *orig_hash;
4327 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); 4339 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
4328 4340
4341 if (!hash) {
4342 ret = -ENOMEM;
4343 goto out;
4344 }
4345
4329 ret = ftrace_match_records(hash, glob, strlen(glob)); 4346 ret = ftrace_match_records(hash, glob, strlen(glob));
4330 4347
4331 /* Nothing found? */ 4348 /* Nothing found? */
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0515a2096f90..0456e0a3dab1 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -6,22 +6,22 @@
6 6
7/* 7/*
8 * Traverse the ftrace_global_list, invoking all entries. The reason that we 8 * Traverse the ftrace_global_list, invoking all entries. The reason that we
9 * can use rcu_dereference_raw_notrace() is that elements removed from this list 9 * can use rcu_dereference_raw_check() is that elements removed from this list
10 * are simply leaked, so there is no need to interact with a grace-period 10 * are simply leaked, so there is no need to interact with a grace-period
11 * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle 11 * mechanism. The rcu_dereference_raw_check() calls are needed to handle
12 * concurrent insertions into the ftrace_global_list. 12 * concurrent insertions into the ftrace_global_list.
13 * 13 *
14 * Silly Alpha and silly pointer-speculation compiler optimizations! 14 * Silly Alpha and silly pointer-speculation compiler optimizations!
15 */ 15 */
16#define do_for_each_ftrace_op(op, list) \ 16#define do_for_each_ftrace_op(op, list) \
17 op = rcu_dereference_raw_notrace(list); \ 17 op = rcu_dereference_raw_check(list); \
18 do 18 do
19 19
20/* 20/*
21 * Optimized for just a single item in the list (as that is the normal case). 21 * Optimized for just a single item in the list (as that is the normal case).
22 */ 22 */
23#define while_for_each_ftrace_op(op) \ 23#define while_for_each_ftrace_op(op) \
24 while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \ 24 while (likely(op = rcu_dereference_raw_check((op)->next)) && \
25 unlikely((op) != &ftrace_list_end)) 25 unlikely((op) != &ftrace_list_end))
26 26
27extern struct ftrace_ops __rcu *ftrace_ops_list; 27extern struct ftrace_ops __rcu *ftrace_ops_list;
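
For context, a self-contained illustration of the do/while pairing these macros rely on; the ops list, the end sentinel, and rcu_dereference_raw_check() are replaced by plain C stand-ins here, so this is only a sketch of the control flow:

#include <stdio.h>

struct op { int id; struct op *next; };

static struct op list_end = { -1, NULL };
static struct op op2 = { 2, &list_end };
static struct op op1 = { 1, &op2 };
static struct op *ops_list = &op1;

#define do_for_each_op(op, list) \
        op = (list);             \
        do

#define while_for_each_op(op) \
        while ((op = (op)->next) && (op) != &list_end)

int main(void)
{
        struct op *op;

        do_for_each_op(op, ops_list) {
                printf("visit op %d\n", op->id);
        } while_for_each_op(op);

        return 0;
}

The opening macro seeds the iterator and starts a do-block; the closing macro advances it and stops at the end sentinel, exactly the shape the ftrace callers use.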
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 525a97fbbc60..947ba433865f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1567,9 +1567,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
1567 1567
1568/** 1568/**
1569 * update_max_tr_single - only copy one trace over, and reset the rest 1569 * update_max_tr_single - only copy one trace over, and reset the rest
1570 * @tr - tracer 1570 * @tr: tracer
1571 * @tsk - task with the latency 1571 * @tsk: task with the latency
1572 * @cpu - the cpu of the buffer to copy. 1572 * @cpu: the cpu of the buffer to copy.
1573 * 1573 *
1574 * Flip the trace of a single CPU buffer between the @tr and the max_tr. 1574 * Flip the trace of a single CPU buffer between the @tr and the max_tr.
1575 */ 1575 */
@@ -1767,7 +1767,7 @@ static void __init apply_trace_boot_options(void);
1767 1767
1768/** 1768/**
1769 * register_tracer - register a tracer with the ftrace system. 1769 * register_tracer - register a tracer with the ftrace system.
1770 * @type - the plugin for the tracer 1770 * @type: the plugin for the tracer
1771 * 1771 *
1772 * Register a new plugin tracer. 1772 * Register a new plugin tracer.
1773 */ 1773 */
@@ -2230,9 +2230,9 @@ static bool tracing_record_taskinfo_skip(int flags)
2230/** 2230/**
2231 * tracing_record_taskinfo - record the task info of a task 2231 * tracing_record_taskinfo - record the task info of a task
2232 * 2232 *
2233 * @task - task to record 2233 * @task: task to record
2234 * @flags - TRACE_RECORD_CMDLINE for recording comm 2234 * @flags: TRACE_RECORD_CMDLINE for recording comm
2235 * - TRACE_RECORD_TGID for recording tgid 2235 * TRACE_RECORD_TGID for recording tgid
2236 */ 2236 */
2237void tracing_record_taskinfo(struct task_struct *task, int flags) 2237void tracing_record_taskinfo(struct task_struct *task, int flags)
2238{ 2238{
@@ -2258,10 +2258,10 @@ void tracing_record_taskinfo(struct task_struct *task, int flags)
2258/** 2258/**
2259 * tracing_record_taskinfo_sched_switch - record task info for sched_switch 2259 * tracing_record_taskinfo_sched_switch - record task info for sched_switch
2260 * 2260 *
2261 * @prev - previous task during sched_switch 2261 * @prev: previous task during sched_switch
2262 * @next - next task during sched_switch 2262 * @next: next task during sched_switch
2263 * @flags - TRACE_RECORD_CMDLINE for recording comm 2263 * @flags: TRACE_RECORD_CMDLINE for recording comm
2264 * TRACE_RECORD_TGID for recording tgid 2264 * TRACE_RECORD_TGID for recording tgid
2265 */ 2265 */
2266void tracing_record_taskinfo_sched_switch(struct task_struct *prev, 2266void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
2267 struct task_struct *next, int flags) 2267 struct task_struct *next, int flags)
@@ -2642,10 +2642,10 @@ static void ftrace_exports(struct ring_buffer_event *event)
2642 2642
2643 preempt_disable_notrace(); 2643 preempt_disable_notrace();
2644 2644
2645 export = rcu_dereference_raw_notrace(ftrace_exports_list); 2645 export = rcu_dereference_raw_check(ftrace_exports_list);
2646 while (export) { 2646 while (export) {
2647 trace_process_export(export, event); 2647 trace_process_export(export, event);
2648 export = rcu_dereference_raw_notrace(export->next); 2648 export = rcu_dereference_raw_check(export->next);
2649 } 2649 }
2650 2650
2651 preempt_enable_notrace(); 2651 preempt_enable_notrace();
@@ -3072,7 +3072,9 @@ static void trace_printk_start_stop_comm(int enabled)
3072 3072
3073/** 3073/**
3074 * trace_vbprintk - write binary msg to tracing buffer 3074 * trace_vbprintk - write binary msg to tracing buffer
3075 * 3075 * @ip: The address of the caller
3076 * @fmt: The string format to write to the buffer
3077 * @args: Arguments for @fmt
3076 */ 3078 */
3077int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 3079int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
3078{ 3080{
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c7506bc81b75..648930823b57 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
787 return ret; 787 return ret;
788} 788}
789 789
790static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) 790int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
791{ 791{
792 char *event = NULL, *sub = NULL, *match; 792 char *event = NULL, *sub = NULL, *match;
793 int ret; 793 int ret;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index dbef0d135075..fb6bfbc5bf86 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -895,7 +895,8 @@ void trace_probe_cleanup(struct trace_probe *tp)
895 for (i = 0; i < tp->nr_args; i++) 895 for (i = 0; i < tp->nr_args; i++)
896 traceprobe_free_probe_arg(&tp->args[i]); 896 traceprobe_free_probe_arg(&tp->args[i]);
897 897
898 kfree(call->class->system); 898 if (call->class)
899 kfree(call->class->system);
899 kfree(call->name); 900 kfree(call->name);
900 kfree(call->print_fmt); 901 kfree(call->print_fmt);
901} 902}