path: root/kernel
author    Jason Gunthorpe <jgg@mellanox.com>  2018-01-29 15:26:40 -0500
committer Jason Gunthorpe <jgg@mellanox.com>  2018-01-30 11:30:00 -0500
commit    e7996a9a77fc669387da43ff4823b91cc4872bd0 (patch)
tree      617f0a128e222539d67e8cccc359f1bc4b984900 /kernel
parent    b5fa635aab8f0d39a824c01991266a6d06f007fb (diff)
parent    d8a5b80568a9cb66810e75b182018e9edb68e8ff (diff)
Merge tag v4.15 of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
To resolve conflicts in:

	drivers/infiniband/hw/mlx5/main.c
	drivers/infiniband/hw/mlx5/qp.c

From patches merged into the -rc cycle. The conflict resolution matches
what linux-next has been carrying.

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/bpf/arraymap.c | 61
-rw-r--r--  kernel/bpf/core.c | 26
-rw-r--r--  kernel/bpf/hashtab.c | 2
-rw-r--r--  kernel/bpf/inode.c | 40
-rw-r--r--  kernel/bpf/offload.c | 15
-rw-r--r--  kernel/bpf/sockmap.c | 11
-rw-r--r--  kernel/bpf/syscall.c | 2
-rw-r--r--  kernel/bpf/verifier.c | 388
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 6
-rw-r--r--  kernel/cgroup/cgroup.c | 21
-rw-r--r--  kernel/cgroup/debug.c | 4
-rw-r--r--  kernel/cgroup/stat.c | 8
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/crash_core.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/delayacct.c | 42
-rw-r--r--  kernel/events/core.c | 54
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/fork.c | 3
-rw-r--r--  kernel/futex.c | 96
-rw-r--r--  kernel/groups.c | 5
-rw-r--r--  kernel/irq/debug.h | 5
-rw-r--r--  kernel/irq/debugfs.c | 1
-rw-r--r--  kernel/irq/generic-chip.c | 11
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/irqdomain.c | 13
-rw-r--r--  kernel/irq/matrix.c | 24
-rw-r--r--  kernel/irq/msi.c | 64
-rw-r--r--  kernel/jump_label.c | 12
-rw-r--r--  kernel/kcov.c | 4
-rw-r--r--  kernel/locking/lockdep.c | 653
-rw-r--r--  kernel/locking/rtmutex.c | 26
-rw-r--r--  kernel/locking/rtmutex_common.h | 1
-rw-r--r--  kernel/locking/spinlock.c | 13
-rw-r--r--  kernel/pid.c | 8
-rw-r--r--  kernel/printk/printk.c | 3
-rw-r--r--  kernel/sched/completion.c | 5
-rw-r--r--  kernel/sched/core.c | 28
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 2
-rw-r--r--  kernel/sched/fair.c | 106
-rw-r--r--  kernel/sched/membarrier.c | 2
-rw-r--r--  kernel/sched/rt.c | 8
-rw-r--r--  kernel/sched/wait.c | 2
-rw-r--r--  kernel/time/Kconfig | 1
-rw-r--r--  kernel/time/hrtimer.c | 3
-rw-r--r--  kernel/time/posix-timers.c | 29
-rw-r--r--  kernel/time/tick-sched.c | 32
-rw-r--r--  kernel/time/timer.c | 37
-rw-r--r--  kernel/trace/Kconfig | 3
-rw-r--r--  kernel/trace/bpf_trace.c | 27
-rw-r--r--  kernel/trace/ftrace.c | 29
-rw-r--r--  kernel/trace/ring_buffer.c | 79
-rw-r--r--  kernel/trace/trace.c | 88
-rw-r--r--  kernel/trace/trace_events.c | 16
-rw-r--r--  kernel/trace/trace_events_trigger.c | 13
-rw-r--r--  kernel/trace/trace_functions.c | 49
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/uid16.c | 1
-rw-r--r--  kernel/workqueue.c | 46
60 files changed, 1169 insertions, 1106 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
102{ 102{
103 struct kstatfs sbuf; 103 struct kstatfs sbuf;
104 104
105 if (time_is_before_jiffies(acct->needcheck)) 105 if (time_is_after_jiffies(acct->needcheck))
106 goto out; 106 goto out;
107 107
108 /* May block */ 108 /* May block */
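The acct.c hunk reverses the jiffies comparison in check_free_space(): the possibly-blocking statfs check must be skipped while acct->needcheck is still in the future, which is what time_is_after_jiffies() tests. A minimal userspace sketch of that comparison (not part of the patch; it re-implements the wrap-safe test that include/linux/jiffies.h provides as time_is_after_jiffies(a) == time_before(jiffies, a)):

#include <stdio.h>

typedef unsigned long jiffies_t;

/* Wrap-safe "deadline a is still ahead of now", as time_is_after_jiffies()
 * does against the global jiffies counter. */
static int time_is_after(jiffies_t now, jiffies_t a)
{
	return (long)(now - a) < 0;
}

int main(void)
{
	jiffies_t jiffies = 1000, needcheck = 1500;

	/* Too early: needcheck has not been reached, so check_free_space()
	 * skips the blocking statfs -- the behaviour the fix restores. */
	printf("skip check: %d\n", time_is_after(jiffies, needcheck));	/* 1 */

	jiffies = 2000;		/* deadline passed: run the real check */
	printf("skip check: %d\n", time_is_after(jiffies, needcheck));	/* 0 */
	return 0;
}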
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
53{ 53{
54 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; 54 bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
55 int numa_node = bpf_map_attr_numa_node(attr); 55 int numa_node = bpf_map_attr_numa_node(attr);
56 u32 elem_size, index_mask, max_entries;
57 bool unpriv = !capable(CAP_SYS_ADMIN);
56 struct bpf_array *array; 58 struct bpf_array *array;
57 u64 array_size; 59 u64 array_size, mask64;
58 u32 elem_size;
59 60
60 /* check sanity of attributes */ 61 /* check sanity of attributes */
61 if (attr->max_entries == 0 || attr->key_size != 4 || 62 if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
72 73
73 elem_size = round_up(attr->value_size, 8); 74 elem_size = round_up(attr->value_size, 8);
74 75
76 max_entries = attr->max_entries;
77
78 /* On 32 bit archs roundup_pow_of_two() with max_entries that has
79 * upper most bit set in u32 space is undefined behavior due to
80 * resulting 1U << 32, so do it manually here in u64 space.
81 */
82 mask64 = fls_long(max_entries - 1);
83 mask64 = 1ULL << mask64;
84 mask64 -= 1;
85
86 index_mask = mask64;
87 if (unpriv) {
88 /* round up array size to nearest power of 2,
89 * since cpu will speculate within index_mask limits
90 */
91 max_entries = index_mask + 1;
92 /* Check for overflows. */
93 if (max_entries < attr->max_entries)
94 return ERR_PTR(-E2BIG);
95 }
96
75 array_size = sizeof(*array); 97 array_size = sizeof(*array);
76 if (percpu) 98 if (percpu)
77 array_size += (u64) attr->max_entries * sizeof(void *); 99 array_size += (u64) max_entries * sizeof(void *);
78 else 100 else
79 array_size += (u64) attr->max_entries * elem_size; 101 array_size += (u64) max_entries * elem_size;
80 102
81 /* make sure there is no u32 overflow later in round_up() */ 103 /* make sure there is no u32 overflow later in round_up() */
82 if (array_size >= U32_MAX - PAGE_SIZE) 104 if (array_size >= U32_MAX - PAGE_SIZE)
@@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
86 array = bpf_map_area_alloc(array_size, numa_node); 108 array = bpf_map_area_alloc(array_size, numa_node);
87 if (!array) 109 if (!array)
88 return ERR_PTR(-ENOMEM); 110 return ERR_PTR(-ENOMEM);
111 array->index_mask = index_mask;
112 array->map.unpriv_array = unpriv;
89 113
90 /* copy mandatory map attributes */ 114 /* copy mandatory map attributes */
91 array->map.map_type = attr->map_type; 115 array->map.map_type = attr->map_type;
@@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
121 if (unlikely(index >= array->map.max_entries)) 145 if (unlikely(index >= array->map.max_entries))
122 return NULL; 146 return NULL;
123 147
124 return array->value + array->elem_size * index; 148 return array->value + array->elem_size * (index & array->index_mask);
125} 149}
126 150
127/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ 151/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
128static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) 152static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
129{ 153{
154 struct bpf_array *array = container_of(map, struct bpf_array, map);
130 struct bpf_insn *insn = insn_buf; 155 struct bpf_insn *insn = insn_buf;
131 u32 elem_size = round_up(map->value_size, 8); 156 u32 elem_size = round_up(map->value_size, 8);
132 const int ret = BPF_REG_0; 157 const int ret = BPF_REG_0;
@@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
135 160
136 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); 161 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
137 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); 162 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
138 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); 163 if (map->unpriv_array) {
164 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
165 *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
166 } else {
167 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
168 }
139 169
140 if (is_power_of_2(elem_size)) { 170 if (is_power_of_2(elem_size)) {
141 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); 171 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
@@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
157 if (unlikely(index >= array->map.max_entries)) 187 if (unlikely(index >= array->map.max_entries))
158 return NULL; 188 return NULL;
159 189
160 return this_cpu_ptr(array->pptrs[index]); 190 return this_cpu_ptr(array->pptrs[index & array->index_mask]);
161} 191}
162 192
163int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) 193int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
@@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
177 */ 207 */
178 size = round_up(map->value_size, 8); 208 size = round_up(map->value_size, 8);
179 rcu_read_lock(); 209 rcu_read_lock();
180 pptr = array->pptrs[index]; 210 pptr = array->pptrs[index & array->index_mask];
181 for_each_possible_cpu(cpu) { 211 for_each_possible_cpu(cpu) {
182 bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); 212 bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
183 off += size; 213 off += size;
@@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
225 return -EEXIST; 255 return -EEXIST;
226 256
227 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) 257 if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
228 memcpy(this_cpu_ptr(array->pptrs[index]), 258 memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
229 value, map->value_size); 259 value, map->value_size);
230 else 260 else
231 memcpy(array->value + array->elem_size * index, 261 memcpy(array->value +
262 array->elem_size * (index & array->index_mask),
232 value, map->value_size); 263 value, map->value_size);
233 return 0; 264 return 0;
234} 265}
@@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
262 */ 293 */
263 size = round_up(map->value_size, 8); 294 size = round_up(map->value_size, 8);
264 rcu_read_lock(); 295 rcu_read_lock();
265 pptr = array->pptrs[index]; 296 pptr = array->pptrs[index & array->index_mask];
266 for_each_possible_cpu(cpu) { 297 for_each_possible_cpu(cpu) {
267 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); 298 bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
268 off += size; 299 off += size;
@@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
613static u32 array_of_map_gen_lookup(struct bpf_map *map, 644static u32 array_of_map_gen_lookup(struct bpf_map *map,
614 struct bpf_insn *insn_buf) 645 struct bpf_insn *insn_buf)
615{ 646{
647 struct bpf_array *array = container_of(map, struct bpf_array, map);
616 u32 elem_size = round_up(map->value_size, 8); 648 u32 elem_size = round_up(map->value_size, 8);
617 struct bpf_insn *insn = insn_buf; 649 struct bpf_insn *insn = insn_buf;
618 const int ret = BPF_REG_0; 650 const int ret = BPF_REG_0;
@@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
621 653
622 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); 654 *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
623 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); 655 *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
624 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); 656 if (map->unpriv_array) {
657 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
658 *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
659 } else {
660 *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
661 }
625 if (is_power_of_2(elem_size)) 662 if (is_power_of_2(elem_size))
626 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); 663 *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
627 else 664 else
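The arraymap.c changes add an index_mask that is ANDed into every array index for unprivileged programs, so that even speculative loads stay inside the allocation, and the mask is computed in u64 space because roundup_pow_of_two() on a u32 with the top bit set would evaluate 1U << 32. A standalone sketch of that mask math (a demo under stated assumptions, not kernel code; fls_long() is modelled here with __builtin_clzll()):

#include <stdio.h>
#include <stdint.h>

/* fls_long() stand-in: 1-based index of the highest set bit, 0 for 0. */
static unsigned int fls64_sketch(unsigned long long x)
{
	return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
	uint32_t max_entries = 0x80000001u;	/* top bit set: 1U << 32 is UB */
	uint64_t mask64;
	uint32_t index_mask, rounded;

	/* Same steps as the patch: do the power-of-two rounding in u64. */
	mask64 = fls64_sketch(max_entries - 1);	/* 32 */
	mask64 = 1ULL << mask64;		/* 2^32, fine as a u64 */
	mask64 -= 1;				/* 0xffffffff */
	index_mask = mask64;

	/* Unprivileged maps are rounded up to index_mask + 1 entries so that
	 * "index & index_mask" can never point past the allocation; here the
	 * rounding overflows u32, which the kernel rejects with -E2BIG. */
	rounded = index_mask + 1;
	printf("index_mask=%#x rounded=%#x overflow=%d\n",
	       index_mask, rounded, rounded < max_entries);
	return 0;
}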
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b9f8686a84cf..7949e8b8f94e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
767} 767}
768EXPORT_SYMBOL_GPL(__bpf_call_base); 768EXPORT_SYMBOL_GPL(__bpf_call_base);
769 769
770#ifndef CONFIG_BPF_JIT_ALWAYS_ON
770/** 771/**
771 * __bpf_prog_run - run eBPF program on a given context 772 * __bpf_prog_run - run eBPF program on a given context
772 * @ctx: is the data we are operating on 773 * @ctx: is the data we are operating on
@@ -955,7 +956,7 @@ select_insn:
955 DST = tmp; 956 DST = tmp;
956 CONT; 957 CONT;
957 ALU_MOD_X: 958 ALU_MOD_X:
958 if (unlikely(SRC == 0)) 959 if (unlikely((u32)SRC == 0))
959 return 0; 960 return 0;
960 tmp = (u32) DST; 961 tmp = (u32) DST;
961 DST = do_div(tmp, (u32) SRC); 962 DST = do_div(tmp, (u32) SRC);
@@ -974,7 +975,7 @@ select_insn:
974 DST = div64_u64(DST, SRC); 975 DST = div64_u64(DST, SRC);
975 CONT; 976 CONT;
976 ALU_DIV_X: 977 ALU_DIV_X:
977 if (unlikely(SRC == 0)) 978 if (unlikely((u32)SRC == 0))
978 return 0; 979 return 0;
979 tmp = (u32) DST; 980 tmp = (u32) DST;
980 do_div(tmp, (u32) SRC); 981 do_div(tmp, (u32) SRC);
@@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
1317EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) 1318EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
1318}; 1319};
1319 1320
1321#else
1322static unsigned int __bpf_prog_ret0(const void *ctx,
1323 const struct bpf_insn *insn)
1324{
1325 return 0;
1326}
1327#endif
1328
1320bool bpf_prog_array_compatible(struct bpf_array *array, 1329bool bpf_prog_array_compatible(struct bpf_array *array,
1321 const struct bpf_prog *fp) 1330 const struct bpf_prog *fp)
1322{ 1331{
@@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
1364 */ 1373 */
1365struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) 1374struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
1366{ 1375{
1376#ifndef CONFIG_BPF_JIT_ALWAYS_ON
1367 u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); 1377 u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
1368 1378
1369 fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; 1379 fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
1380#else
1381 fp->bpf_func = __bpf_prog_ret0;
1382#endif
1370 1383
1371 /* eBPF JITs can rewrite the program in case constant 1384 /* eBPF JITs can rewrite the program in case constant
1372 * blinding is active. However, in case of error during 1385 * blinding is active. However, in case of error during
@@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
1376 */ 1389 */
1377 if (!bpf_prog_is_dev_bound(fp->aux)) { 1390 if (!bpf_prog_is_dev_bound(fp->aux)) {
1378 fp = bpf_int_jit_compile(fp); 1391 fp = bpf_int_jit_compile(fp);
1392#ifdef CONFIG_BPF_JIT_ALWAYS_ON
1393 if (!fp->jited) {
1394 *err = -ENOTSUPP;
1395 return fp;
1396 }
1397#endif
1379 } else { 1398 } else {
1380 *err = bpf_prog_offload_compile(fp); 1399 *err = bpf_prog_offload_compile(fp);
1381 if (*err) 1400 if (*err)
@@ -1447,7 +1466,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
1447 rcu_read_lock(); 1466 rcu_read_lock();
1448 prog = rcu_dereference(progs)->progs; 1467 prog = rcu_dereference(progs)->progs;
1449 for (; *prog; prog++) 1468 for (; *prog; prog++)
1450 cnt++; 1469 if (*prog != &dummy_bpf_prog.prog)
1470 cnt++;
1451 rcu_read_unlock(); 1471 rcu_read_unlock();
1452 return cnt; 1472 return cnt;
1453} 1473}
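Besides gating the interpreter behind CONFIG_BPF_JIT_ALWAYS_ON, the core.c hunk changes the 32-bit ALU_DIV_X/ALU_MOD_X zero checks to test (u32)SRC, since the divisor actually used is only the low 32 bits of the source register. A small standalone sketch of the case the fix catches (illustrative only, not the interpreter code):

#include <stdio.h>
#include <stdint.h>

/* The divisor of a 32-bit BPF div/mod is only the low 32 bits of SRC, so
 * the zero check has to look at (u32)SRC as well. */
static uint64_t alu32_div(uint64_t dst, uint64_t src)
{
	if ((uint32_t)src == 0)		/* old "src == 0" test would miss this */
		return 0;		/* interpreter aborts the program */
	return (uint32_t)dst / (uint32_t)src;
}

int main(void)
{
	/* Nonzero as a u64, zero as the 32-bit divisor actually used. */
	uint64_t src = 0x100000000ULL;

	printf("%llu\n", (unsigned long long)alu32_div(10, src));	/* 0 */
	printf("%llu\n", (unsigned long long)alu32_div(10, 3));	/* 3 */
	return 0;
}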
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e469e05c8e83..3905d4bc5b80 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab)
114 pptr = htab_elem_get_ptr(get_htab_elem(htab, i), 114 pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
115 htab->map.key_size); 115 htab->map.key_size);
116 free_percpu(pptr); 116 free_percpu(pptr);
117 cond_resched();
117 } 118 }
118free_elems: 119free_elems:
119 bpf_map_area_free(htab->elems); 120 bpf_map_area_free(htab->elems);
@@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab)
159 goto free_elems; 160 goto free_elems;
160 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, 161 htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
161 pptr); 162 pptr);
163 cond_resched();
162 } 164 }
163 165
164skip_percpu_elems: 166skip_percpu_elems:
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..5bb5e49ef4c3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -368,7 +368,45 @@ out:
368 putname(pname); 368 putname(pname);
369 return ret; 369 return ret;
370} 370}
371EXPORT_SYMBOL_GPL(bpf_obj_get_user); 371
372static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
373{
374 struct bpf_prog *prog;
375 int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
376 if (ret)
377 return ERR_PTR(ret);
378
379 if (inode->i_op == &bpf_map_iops)
380 return ERR_PTR(-EINVAL);
381 if (inode->i_op != &bpf_prog_iops)
382 return ERR_PTR(-EACCES);
383
384 prog = inode->i_private;
385
386 ret = security_bpf_prog(prog);
387 if (ret < 0)
388 return ERR_PTR(ret);
389
390 if (!bpf_prog_get_ok(prog, &type, false))
391 return ERR_PTR(-EINVAL);
392
393 return bpf_prog_inc(prog);
394}
395
396struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
397{
398 struct bpf_prog *prog;
399 struct path path;
400 int ret = kern_path(name, LOOKUP_FOLLOW, &path);
401 if (ret)
402 return ERR_PTR(ret);
403 prog = __get_prog_inode(d_backing_inode(path.dentry), type);
404 if (!IS_ERR(prog))
405 touch_atime(&path);
406 path_put(&path);
407 return prog;
408}
409EXPORT_SYMBOL(bpf_prog_get_type_path);
372 410
373static void bpf_evict_inode(struct inode *inode) 411static void bpf_evict_inode(struct inode *inode)
374{ 412{
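The inode.c hunk adds bpf_prog_get_type_path(), which resolves a bpffs path, verifies permissions and program type, and returns the pinned program with a reference held via bpf_prog_inc(). A hypothetical caller sketch (the surrounding function and the program type are assumptions; only bpf_prog_get_type_path() and bpf_prog_put() come from the kernel API), showing that the reference must be dropped when the caller is done:

/* Hypothetical caller; only bpf_prog_get_type_path() and bpf_prog_put()
 * are real API here, the rest is illustrative. */
static int example_attach_pinned_prog(const char *path)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	/* ... hand prog to the subsystem that will run it ... */

	bpf_prog_put(prog);	/* drop the reference taken by bpf_prog_inc() */
	return 0;
}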
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 68ec884440b7..8455b89d1bbf 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,3 +1,18 @@
1/*
2 * Copyright (C) 2017 Netronome Systems, Inc.
3 *
4 * This software is licensed under the GNU General License Version 2,
5 * June 1991 as shown in the file COPYING in the top-level directory of this
6 * source tree.
7 *
8 * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
9 * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
10 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
11 * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
12 * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
13 * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
14 */
15
1#include <linux/bpf.h> 16#include <linux/bpf.h>
2#include <linux/bpf_verifier.h> 17#include <linux/bpf_verifier.h>
3#include <linux/bug.h> 18#include <linux/bug.h>
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..1712d319c2d8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map)
591 591
592 write_lock_bh(&sock->sk_callback_lock); 592 write_lock_bh(&sock->sk_callback_lock);
593 psock = smap_psock_sk(sock); 593 psock = smap_psock_sk(sock);
594 smap_list_remove(psock, &stab->sock_map[i]); 594 /* This check handles a racing sock event that can get the
595 smap_release_sock(psock, sock); 595 * sk_callback_lock before this case but after xchg happens
596 * causing the refcnt to hit zero and sock user data (psock)
597 * to be null and queued for garbage collection.
598 */
599 if (likely(psock)) {
600 smap_list_remove(psock, &stab->sock_map[i]);
601 smap_release_sock(psock, sock);
602 }
596 write_unlock_bh(&sock->sk_callback_lock); 603 write_unlock_bh(&sock->sk_callback_lock);
597 } 604 }
598 rcu_read_unlock(); 605 rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
1057} 1057}
1058EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); 1058EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
1059 1059
1060static bool bpf_prog_get_ok(struct bpf_prog *prog, 1060bool bpf_prog_get_ok(struct bpf_prog *prog,
1061 enum bpf_prog_type *attach_type, bool attach_drv) 1061 enum bpf_prog_type *attach_type, bool attach_drv)
1062{ 1062{
1063 /* not an attachment, just a refcount inc, always allow */ 1063 /* not an attachment, just a refcount inc, always allow */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..13551e623501 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
978 return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); 978 return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
979} 979}
980 980
981static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
982{
983 const struct bpf_reg_state *reg = cur_regs(env) + regno;
984
985 return reg->type == PTR_TO_CTX;
986}
987
981static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, 988static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
982 const struct bpf_reg_state *reg, 989 const struct bpf_reg_state *reg,
983 int off, int size, bool strict) 990 int off, int size, bool strict)
@@ -1059,6 +1066,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1059 break; 1066 break;
1060 case PTR_TO_STACK: 1067 case PTR_TO_STACK:
1061 pointer_desc = "stack "; 1068 pointer_desc = "stack ";
1069 /* The stack spill tracking logic in check_stack_write()
1070 * and check_stack_read() relies on stack accesses being
1071 * aligned.
1072 */
1073 strict = true;
1062 break; 1074 break;
1063 default: 1075 default:
1064 break; 1076 break;
@@ -1067,6 +1079,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
1067 strict); 1079 strict);
1068} 1080}
1069 1081
1082/* truncate register to smaller size (in bytes)
1083 * must be called with size < BPF_REG_SIZE
1084 */
1085static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
1086{
1087 u64 mask;
1088
1089 /* clear high bits in bit representation */
1090 reg->var_off = tnum_cast(reg->var_off, size);
1091
1092 /* fix arithmetic bounds */
1093 mask = ((u64)1 << (size * 8)) - 1;
1094 if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) {
1095 reg->umin_value &= mask;
1096 reg->umax_value &= mask;
1097 } else {
1098 reg->umin_value = 0;
1099 reg->umax_value = mask;
1100 }
1101 reg->smin_value = reg->umin_value;
1102 reg->smax_value = reg->umax_value;
1103}
1104
1070/* check whether memory at (regno + off) is accessible for t = (read | write) 1105/* check whether memory at (regno + off) is accessible for t = (read | write)
1071 * if t==write, value_regno is a register which value is stored into memory 1106 * if t==write, value_regno is a register which value is stored into memory
1072 * if t==read, value_regno is a register which will receive the value from memory 1107 * if t==read, value_regno is a register which will receive the value from memory
@@ -1200,9 +1235,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
1200 if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && 1235 if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
1201 regs[value_regno].type == SCALAR_VALUE) { 1236 regs[value_regno].type == SCALAR_VALUE) {
1202 /* b/h/w load zero-extends, mark upper bits as known 0 */ 1237 /* b/h/w load zero-extends, mark upper bits as known 0 */
1203 regs[value_regno].var_off = 1238 coerce_reg_to_size(&regs[value_regno], size);
1204 tnum_cast(regs[value_regno].var_off, size);
1205 __update_reg_bounds(&regs[value_regno]);
1206 } 1239 }
1207 return err; 1240 return err;
1208} 1241}
@@ -1232,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
1232 return -EACCES; 1265 return -EACCES;
1233 } 1266 }
1234 1267
1268 if (is_ctx_reg(env, insn->dst_reg)) {
1269 verbose(env, "BPF_XADD stores into R%d context is not allowed\n",
1270 insn->dst_reg);
1271 return -EACCES;
1272 }
1273
1235 /* check whether atomic_add can read the memory */ 1274 /* check whether atomic_add can read the memory */
1236 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, 1275 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
1237 BPF_SIZE(insn->code), BPF_READ, -1); 1276 BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1282,6 +1321,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
1282 tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); 1321 tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
1283 verbose(env, "invalid variable stack read R%d var_off=%s\n", 1322 verbose(env, "invalid variable stack read R%d var_off=%s\n",
1284 regno, tn_buf); 1323 regno, tn_buf);
1324 return -EACCES;
1285 } 1325 }
1286 off = regs[regno].off + regs[regno].var_off.value; 1326 off = regs[regno].off + regs[regno].var_off.value;
1287 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || 1327 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -1674,7 +1714,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1674 return -EINVAL; 1714 return -EINVAL;
1675 } 1715 }
1676 1716
1717 /* With LD_ABS/IND some JITs save/restore skb from r1. */
1677 changes_data = bpf_helper_changes_pkt_data(fn->func); 1718 changes_data = bpf_helper_changes_pkt_data(fn->func);
1719 if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
1720 verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n",
1721 func_id_name(func_id), func_id);
1722 return -EINVAL;
1723 }
1678 1724
1679 memset(&meta, 0, sizeof(meta)); 1725 memset(&meta, 0, sizeof(meta));
1680 meta.pkt_access = fn->pkt_access; 1726 meta.pkt_access = fn->pkt_access;
@@ -1696,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1696 err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); 1742 err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
1697 if (err) 1743 if (err)
1698 return err; 1744 return err;
1745 if (func_id == BPF_FUNC_tail_call) {
1746 if (meta.map_ptr == NULL) {
1747 verbose(env, "verifier bug\n");
1748 return -EINVAL;
1749 }
1750 env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
1751 }
1699 err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); 1752 err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
1700 if (err) 1753 if (err)
1701 return err; 1754 return err;
@@ -1766,14 +1819,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
1766 return 0; 1819 return 0;
1767} 1820}
1768 1821
1769static void coerce_reg_to_32(struct bpf_reg_state *reg)
1770{
1771 /* clear high 32 bits */
1772 reg->var_off = tnum_cast(reg->var_off, 4);
1773 /* Update bounds */
1774 __update_reg_bounds(reg);
1775}
1776
1777static bool signed_add_overflows(s64 a, s64 b) 1822static bool signed_add_overflows(s64 a, s64 b)
1778{ 1823{
1779 /* Do the add in u64, where overflow is well-defined */ 1824 /* Do the add in u64, where overflow is well-defined */
@@ -1794,6 +1839,41 @@ static bool signed_sub_overflows(s64 a, s64 b)
1794 return res > a; 1839 return res > a;
1795} 1840}
1796 1841
1842static bool check_reg_sane_offset(struct bpf_verifier_env *env,
1843 const struct bpf_reg_state *reg,
1844 enum bpf_reg_type type)
1845{
1846 bool known = tnum_is_const(reg->var_off);
1847 s64 val = reg->var_off.value;
1848 s64 smin = reg->smin_value;
1849
1850 if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
1851 verbose(env, "math between %s pointer and %lld is not allowed\n",
1852 reg_type_str[type], val);
1853 return false;
1854 }
1855
1856 if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
1857 verbose(env, "%s pointer offset %d is not allowed\n",
1858 reg_type_str[type], reg->off);
1859 return false;
1860 }
1861
1862 if (smin == S64_MIN) {
1863 verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
1864 reg_type_str[type]);
1865 return false;
1866 }
1867
1868 if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
1869 verbose(env, "value %lld makes %s pointer be out of bounds\n",
1870 smin, reg_type_str[type]);
1871 return false;
1872 }
1873
1874 return true;
1875}
1876
1797/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. 1877/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
1798 * Caller should also handle BPF_MOV case separately. 1878 * Caller should also handle BPF_MOV case separately.
1799 * If we return -EACCES, caller may want to try again treating pointer as a 1879 * If we return -EACCES, caller may want to try again treating pointer as a
@@ -1815,44 +1895,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1815 1895
1816 dst_reg = &regs[dst]; 1896 dst_reg = &regs[dst];
1817 1897
1818 if (WARN_ON_ONCE(known && (smin_val != smax_val))) { 1898 if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
1819 print_verifier_state(env, env->cur_state); 1899 smin_val > smax_val || umin_val > umax_val) {
1820 verbose(env, 1900 /* Taint dst register if offset had invalid bounds derived from
1821 "verifier internal error: known but bad sbounds\n"); 1901 * e.g. dead branches.
1822 return -EINVAL; 1902 */
1823 } 1903 __mark_reg_unknown(dst_reg);
1824 if (WARN_ON_ONCE(known && (umin_val != umax_val))) { 1904 return 0;
1825 print_verifier_state(env, env->cur_state);
1826 verbose(env,
1827 "verifier internal error: known but bad ubounds\n");
1828 return -EINVAL;
1829 } 1905 }
1830 1906
1831 if (BPF_CLASS(insn->code) != BPF_ALU64) { 1907 if (BPF_CLASS(insn->code) != BPF_ALU64) {
1832 /* 32-bit ALU ops on pointers produce (meaningless) scalars */ 1908 /* 32-bit ALU ops on pointers produce (meaningless) scalars */
1833 if (!env->allow_ptr_leaks) 1909 verbose(env,
1834 verbose(env, 1910 "R%d 32-bit pointer arithmetic prohibited\n",
1835 "R%d 32-bit pointer arithmetic prohibited\n", 1911 dst);
1836 dst);
1837 return -EACCES; 1912 return -EACCES;
1838 } 1913 }
1839 1914
1840 if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { 1915 if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
1841 if (!env->allow_ptr_leaks) 1916 verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
1842 verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", 1917 dst);
1843 dst);
1844 return -EACCES; 1918 return -EACCES;
1845 } 1919 }
1846 if (ptr_reg->type == CONST_PTR_TO_MAP) { 1920 if (ptr_reg->type == CONST_PTR_TO_MAP) {
1847 if (!env->allow_ptr_leaks) 1921 verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
1848 verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", 1922 dst);
1849 dst);
1850 return -EACCES; 1923 return -EACCES;
1851 } 1924 }
1852 if (ptr_reg->type == PTR_TO_PACKET_END) { 1925 if (ptr_reg->type == PTR_TO_PACKET_END) {
1853 if (!env->allow_ptr_leaks) 1926 verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
1854 verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", 1927 dst);
1855 dst);
1856 return -EACCES; 1928 return -EACCES;
1857 } 1929 }
1858 1930
@@ -1862,6 +1934,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1862 dst_reg->type = ptr_reg->type; 1934 dst_reg->type = ptr_reg->type;
1863 dst_reg->id = ptr_reg->id; 1935 dst_reg->id = ptr_reg->id;
1864 1936
1937 if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) ||
1938 !check_reg_sane_offset(env, ptr_reg, ptr_reg->type))
1939 return -EINVAL;
1940
1865 switch (opcode) { 1941 switch (opcode) {
1866 case BPF_ADD: 1942 case BPF_ADD:
1867 /* We can take a fixed offset as long as it doesn't overflow 1943 /* We can take a fixed offset as long as it doesn't overflow
@@ -1915,9 +1991,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1915 case BPF_SUB: 1991 case BPF_SUB:
1916 if (dst_reg == off_reg) { 1992 if (dst_reg == off_reg) {
1917 /* scalar -= pointer. Creates an unknown scalar */ 1993 /* scalar -= pointer. Creates an unknown scalar */
1918 if (!env->allow_ptr_leaks) 1994 verbose(env, "R%d tried to subtract pointer from scalar\n",
1919 verbose(env, "R%d tried to subtract pointer from scalar\n", 1995 dst);
1920 dst);
1921 return -EACCES; 1996 return -EACCES;
1922 } 1997 }
1923 /* We don't allow subtraction from FP, because (according to 1998 /* We don't allow subtraction from FP, because (according to
@@ -1925,9 +2000,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1925 * be able to deal with it. 2000 * be able to deal with it.
1926 */ 2001 */
1927 if (ptr_reg->type == PTR_TO_STACK) { 2002 if (ptr_reg->type == PTR_TO_STACK) {
1928 if (!env->allow_ptr_leaks) 2003 verbose(env, "R%d subtraction from stack pointer prohibited\n",
1929 verbose(env, "R%d subtraction from stack pointer prohibited\n", 2004 dst);
1930 dst);
1931 return -EACCES; 2005 return -EACCES;
1932 } 2006 }
1933 if (known && (ptr_reg->off - smin_val == 2007 if (known && (ptr_reg->off - smin_val ==
@@ -1976,28 +2050,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
1976 case BPF_AND: 2050 case BPF_AND:
1977 case BPF_OR: 2051 case BPF_OR:
1978 case BPF_XOR: 2052 case BPF_XOR:
1979 /* bitwise ops on pointers are troublesome, prohibit for now. 2053 /* bitwise ops on pointers are troublesome, prohibit. */
1980 * (However, in principle we could allow some cases, e.g. 2054 verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
1981 * ptr &= ~3 which would reduce min_value by 3.) 2055 dst, bpf_alu_string[opcode >> 4]);
1982 */
1983 if (!env->allow_ptr_leaks)
1984 verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
1985 dst, bpf_alu_string[opcode >> 4]);
1986 return -EACCES; 2056 return -EACCES;
1987 default: 2057 default:
1988 /* other operators (e.g. MUL,LSH) produce non-pointer results */ 2058 /* other operators (e.g. MUL,LSH) produce non-pointer results */
1989 if (!env->allow_ptr_leaks) 2059 verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
1990 verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", 2060 dst, bpf_alu_string[opcode >> 4]);
1991 dst, bpf_alu_string[opcode >> 4]);
1992 return -EACCES; 2061 return -EACCES;
1993 } 2062 }
1994 2063
2064 if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type))
2065 return -EINVAL;
2066
1995 __update_reg_bounds(dst_reg); 2067 __update_reg_bounds(dst_reg);
1996 __reg_deduce_bounds(dst_reg); 2068 __reg_deduce_bounds(dst_reg);
1997 __reg_bound_offset(dst_reg); 2069 __reg_bound_offset(dst_reg);
1998 return 0; 2070 return 0;
1999} 2071}
2000 2072
2073/* WARNING: This function does calculations on 64-bit values, but the actual
2074 * execution may occur on 32-bit values. Therefore, things like bitshifts
2075 * need extra checks in the 32-bit case.
2076 */
2001static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, 2077static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2002 struct bpf_insn *insn, 2078 struct bpf_insn *insn,
2003 struct bpf_reg_state *dst_reg, 2079 struct bpf_reg_state *dst_reg,
@@ -2008,12 +2084,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2008 bool src_known, dst_known; 2084 bool src_known, dst_known;
2009 s64 smin_val, smax_val; 2085 s64 smin_val, smax_val;
2010 u64 umin_val, umax_val; 2086 u64 umin_val, umax_val;
2087 u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
2011 2088
2012 if (BPF_CLASS(insn->code) != BPF_ALU64) {
2013 /* 32-bit ALU ops are (32,32)->64 */
2014 coerce_reg_to_32(dst_reg);
2015 coerce_reg_to_32(&src_reg);
2016 }
2017 smin_val = src_reg.smin_value; 2089 smin_val = src_reg.smin_value;
2018 smax_val = src_reg.smax_value; 2090 smax_val = src_reg.smax_value;
2019 umin_val = src_reg.umin_value; 2091 umin_val = src_reg.umin_value;
@@ -2021,6 +2093,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2021 src_known = tnum_is_const(src_reg.var_off); 2093 src_known = tnum_is_const(src_reg.var_off);
2022 dst_known = tnum_is_const(dst_reg->var_off); 2094 dst_known = tnum_is_const(dst_reg->var_off);
2023 2095
2096 if ((src_known && (smin_val != smax_val || umin_val != umax_val)) ||
2097 smin_val > smax_val || umin_val > umax_val) {
2098 /* Taint dst register if offset had invalid bounds derived from
2099 * e.g. dead branches.
2100 */
2101 __mark_reg_unknown(dst_reg);
2102 return 0;
2103 }
2104
2105 if (!src_known &&
2106 opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) {
2107 __mark_reg_unknown(dst_reg);
2108 return 0;
2109 }
2110
2024 switch (opcode) { 2111 switch (opcode) {
2025 case BPF_ADD: 2112 case BPF_ADD:
2026 if (signed_add_overflows(dst_reg->smin_value, smin_val) || 2113 if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
@@ -2149,9 +2236,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2149 __update_reg_bounds(dst_reg); 2236 __update_reg_bounds(dst_reg);
2150 break; 2237 break;
2151 case BPF_LSH: 2238 case BPF_LSH:
2152 if (umax_val > 63) { 2239 if (umax_val >= insn_bitness) {
2153 /* Shifts greater than 63 are undefined. This includes 2240 /* Shifts greater than 31 or 63 are undefined.
2154 * shifts by a negative number. 2241 * This includes shifts by a negative number.
2155 */ 2242 */
2156 mark_reg_unknown(env, regs, insn->dst_reg); 2243 mark_reg_unknown(env, regs, insn->dst_reg);
2157 break; 2244 break;
@@ -2177,27 +2264,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2177 __update_reg_bounds(dst_reg); 2264 __update_reg_bounds(dst_reg);
2178 break; 2265 break;
2179 case BPF_RSH: 2266 case BPF_RSH:
2180 if (umax_val > 63) { 2267 if (umax_val >= insn_bitness) {
2181 /* Shifts greater than 63 are undefined. This includes 2268 /* Shifts greater than 31 or 63 are undefined.
2182 * shifts by a negative number. 2269 * This includes shifts by a negative number.
2183 */ 2270 */
2184 mark_reg_unknown(env, regs, insn->dst_reg); 2271 mark_reg_unknown(env, regs, insn->dst_reg);
2185 break; 2272 break;
2186 } 2273 }
2187 /* BPF_RSH is an unsigned shift, so make the appropriate casts */ 2274 /* BPF_RSH is an unsigned shift. If the value in dst_reg might
2188 if (dst_reg->smin_value < 0) { 2275 * be negative, then either:
2189 if (umin_val) { 2276 * 1) src_reg might be zero, so the sign bit of the result is
2190 /* Sign bit will be cleared */ 2277 * unknown, so we lose our signed bounds
2191 dst_reg->smin_value = 0; 2278 * 2) it's known negative, thus the unsigned bounds capture the
2192 } else { 2279 * signed bounds
2193 /* Lost sign bit information */ 2280 * 3) the signed bounds cross zero, so they tell us nothing
2194 dst_reg->smin_value = S64_MIN; 2281 * about the result
2195 dst_reg->smax_value = S64_MAX; 2282 * If the value in dst_reg is known nonnegative, then again the
2196 } 2283 * unsigned bounts capture the signed bounds.
2197 } else { 2284 * Thus, in all cases it suffices to blow away our signed bounds
2198 dst_reg->smin_value = 2285 * and rely on inferring new ones from the unsigned bounds and
2199 (u64)(dst_reg->smin_value) >> umax_val; 2286 * var_off of the result.
2200 } 2287 */
2288 dst_reg->smin_value = S64_MIN;
2289 dst_reg->smax_value = S64_MAX;
2201 if (src_known) 2290 if (src_known)
2202 dst_reg->var_off = tnum_rshift(dst_reg->var_off, 2291 dst_reg->var_off = tnum_rshift(dst_reg->var_off,
2203 umin_val); 2292 umin_val);
@@ -2213,6 +2302,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
2213 break; 2302 break;
2214 } 2303 }
2215 2304
2305 if (BPF_CLASS(insn->code) != BPF_ALU64) {
2306 /* 32-bit ALU ops are (32,32)->32 */
2307 coerce_reg_to_size(dst_reg, 4);
2308 coerce_reg_to_size(&src_reg, 4);
2309 }
2310
2216 __reg_deduce_bounds(dst_reg); 2311 __reg_deduce_bounds(dst_reg);
2217 __reg_bound_offset(dst_reg); 2312 __reg_bound_offset(dst_reg);
2218 return 0; 2313 return 0;
@@ -2227,7 +2322,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2227 struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; 2322 struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
2228 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; 2323 struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
2229 u8 opcode = BPF_OP(insn->code); 2324 u8 opcode = BPF_OP(insn->code);
2230 int rc;
2231 2325
2232 dst_reg = &regs[insn->dst_reg]; 2326 dst_reg = &regs[insn->dst_reg];
2233 src_reg = NULL; 2327 src_reg = NULL;
@@ -2238,43 +2332,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2238 if (src_reg->type != SCALAR_VALUE) { 2332 if (src_reg->type != SCALAR_VALUE) {
2239 if (dst_reg->type != SCALAR_VALUE) { 2333 if (dst_reg->type != SCALAR_VALUE) {
2240 /* Combining two pointers by any ALU op yields 2334 /* Combining two pointers by any ALU op yields
2241 * an arbitrary scalar. 2335 * an arbitrary scalar. Disallow all math except
2336 * pointer subtraction
2242 */ 2337 */
2243 if (!env->allow_ptr_leaks) { 2338 if (opcode == BPF_SUB){
2244 verbose(env, "R%d pointer %s pointer prohibited\n", 2339 mark_reg_unknown(env, regs, insn->dst_reg);
2245 insn->dst_reg, 2340 return 0;
2246 bpf_alu_string[opcode >> 4]);
2247 return -EACCES;
2248 } 2341 }
2249 mark_reg_unknown(env, regs, insn->dst_reg); 2342 verbose(env, "R%d pointer %s pointer prohibited\n",
2250 return 0; 2343 insn->dst_reg,
2344 bpf_alu_string[opcode >> 4]);
2345 return -EACCES;
2251 } else { 2346 } else {
2252 /* scalar += pointer 2347 /* scalar += pointer
2253 * This is legal, but we have to reverse our 2348 * This is legal, but we have to reverse our
2254 * src/dest handling in computing the range 2349 * src/dest handling in computing the range
2255 */ 2350 */
2256 rc = adjust_ptr_min_max_vals(env, insn, 2351 return adjust_ptr_min_max_vals(env, insn,
2257 src_reg, dst_reg); 2352 src_reg, dst_reg);
2258 if (rc == -EACCES && env->allow_ptr_leaks) {
2259 /* scalar += unknown scalar */
2260 __mark_reg_unknown(&off_reg);
2261 return adjust_scalar_min_max_vals(
2262 env, insn,
2263 dst_reg, off_reg);
2264 }
2265 return rc;
2266 } 2353 }
2267 } else if (ptr_reg) { 2354 } else if (ptr_reg) {
2268 /* pointer += scalar */ 2355 /* pointer += scalar */
2269 rc = adjust_ptr_min_max_vals(env, insn, 2356 return adjust_ptr_min_max_vals(env, insn,
2270 dst_reg, src_reg); 2357 dst_reg, src_reg);
2271 if (rc == -EACCES && env->allow_ptr_leaks) {
2272 /* unknown scalar += scalar */
2273 __mark_reg_unknown(dst_reg);
2274 return adjust_scalar_min_max_vals(
2275 env, insn, dst_reg, *src_reg);
2276 }
2277 return rc;
2278 } 2358 }
2279 } else { 2359 } else {
2280 /* Pretend the src is a reg with a known value, since we only 2360 /* Pretend the src is a reg with a known value, since we only
@@ -2283,17 +2363,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
2283 off_reg.type = SCALAR_VALUE; 2363 off_reg.type = SCALAR_VALUE;
2284 __mark_reg_known(&off_reg, insn->imm); 2364 __mark_reg_known(&off_reg, insn->imm);
2285 src_reg = &off_reg; 2365 src_reg = &off_reg;
2286 if (ptr_reg) { /* pointer += K */ 2366 if (ptr_reg) /* pointer += K */
2287 rc = adjust_ptr_min_max_vals(env, insn, 2367 return adjust_ptr_min_max_vals(env, insn,
2288 ptr_reg, src_reg); 2368 ptr_reg, src_reg);
2289 if (rc == -EACCES && env->allow_ptr_leaks) {
2290 /* unknown scalar += K */
2291 __mark_reg_unknown(dst_reg);
2292 return adjust_scalar_min_max_vals(
2293 env, insn, dst_reg, off_reg);
2294 }
2295 return rc;
2296 }
2297 } 2369 }
2298 2370
2299 /* Got here implies adding two SCALAR_VALUEs */ 2371 /* Got here implies adding two SCALAR_VALUEs */
@@ -2390,17 +2462,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
2390 return -EACCES; 2462 return -EACCES;
2391 } 2463 }
2392 mark_reg_unknown(env, regs, insn->dst_reg); 2464 mark_reg_unknown(env, regs, insn->dst_reg);
2393 /* high 32 bits are known zero. */ 2465 coerce_reg_to_size(&regs[insn->dst_reg], 4);
2394 regs[insn->dst_reg].var_off = tnum_cast(
2395 regs[insn->dst_reg].var_off, 4);
2396 __update_reg_bounds(&regs[insn->dst_reg]);
2397 } 2466 }
2398 } else { 2467 } else {
2399 /* case: R = imm 2468 /* case: R = imm
2400 * remember the value we stored into this reg 2469 * remember the value we stored into this reg
2401 */ 2470 */
2402 regs[insn->dst_reg].type = SCALAR_VALUE; 2471 regs[insn->dst_reg].type = SCALAR_VALUE;
2403 __mark_reg_known(regs + insn->dst_reg, insn->imm); 2472 if (BPF_CLASS(insn->code) == BPF_ALU64) {
2473 __mark_reg_known(regs + insn->dst_reg,
2474 insn->imm);
2475 } else {
2476 __mark_reg_known(regs + insn->dst_reg,
2477 (u32)insn->imm);
2478 }
2404 } 2479 }
2405 2480
2406 } else if (opcode > BPF_END) { 2481 } else if (opcode > BPF_END) {
@@ -2436,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
2436 return -EINVAL; 2511 return -EINVAL;
2437 } 2512 }
2438 2513
2514 if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
2515 verbose(env, "BPF_ARSH not supported for 32 bit ALU\n");
2516 return -EINVAL;
2517 }
2518
2439 if ((opcode == BPF_LSH || opcode == BPF_RSH || 2519 if ((opcode == BPF_LSH || opcode == BPF_RSH ||
2440 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { 2520 opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
2441 int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; 2521 int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -3431,15 +3511,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
3431 return range_within(rold, rcur) && 3511 return range_within(rold, rcur) &&
3432 tnum_in(rold->var_off, rcur->var_off); 3512 tnum_in(rold->var_off, rcur->var_off);
3433 } else { 3513 } else {
3434 /* if we knew anything about the old value, we're not 3514 /* We're trying to use a pointer in place of a scalar.
3435 * equal, because we can't know anything about the 3515 * Even if the scalar was unbounded, this could lead to
3436 * scalar value of the pointer in the new value. 3516 * pointer leaks because scalars are allowed to leak
3517 * while pointers are not. We could make this safe in
3518 * special cases if root is calling us, but it's
3519 * probably not worth the hassle.
3437 */ 3520 */
3438 return rold->umin_value == 0 && 3521 return false;
3439 rold->umax_value == U64_MAX &&
3440 rold->smin_value == S64_MIN &&
3441 rold->smax_value == S64_MAX &&
3442 tnum_is_unknown(rold->var_off);
3443 } 3522 }
3444 case PTR_TO_MAP_VALUE: 3523 case PTR_TO_MAP_VALUE:
3445 /* If the new min/max/var_off satisfy the old ones and 3524 /* If the new min/max/var_off satisfy the old ones and
@@ -3932,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env)
3932 if (err) 4011 if (err)
3933 return err; 4012 return err;
3934 4013
4014 if (is_ctx_reg(env, insn->dst_reg)) {
4015 verbose(env, "BPF_ST stores into R%d context is not allowed\n",
4016 insn->dst_reg);
4017 return -EACCES;
4018 }
4019
3935 /* check that memory (dst_reg + off) is writeable */ 4020 /* check that memory (dst_reg + off) is writeable */
3936 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, 4021 err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
3937 BPF_SIZE(insn->code), BPF_WRITE, 4022 BPF_SIZE(insn->code), BPF_WRITE,
@@ -4384,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4384 int i, cnt, delta = 0; 4469 int i, cnt, delta = 0;
4385 4470
4386 for (i = 0; i < insn_cnt; i++, insn++) { 4471 for (i = 0; i < insn_cnt; i++, insn++) {
4472 if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
4473 insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
4474 /* due to JIT bugs clear upper 32-bits of src register
4475 * before div/mod operation
4476 */
4477 insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
4478 insn_buf[1] = *insn;
4479 cnt = 2;
4480 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
4481 if (!new_prog)
4482 return -ENOMEM;
4483
4484 delta += cnt - 1;
4485 env->prog = prog = new_prog;
4486 insn = new_prog->insnsi + i + delta;
4487 continue;
4488 }
4489
4387 if (insn->code != (BPF_JMP | BPF_CALL)) 4490 if (insn->code != (BPF_JMP | BPF_CALL))
4388 continue; 4491 continue;
4389 4492
@@ -4407,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
4407 */ 4510 */
4408 insn->imm = 0; 4511 insn->imm = 0;
4409 insn->code = BPF_JMP | BPF_TAIL_CALL; 4512 insn->code = BPF_JMP | BPF_TAIL_CALL;
4513
4514 /* instead of changing every JIT dealing with tail_call
4515 * emit two extra insns:
4516 * if (index >= max_entries) goto out;
4517 * index &= array->index_mask;
4518 * to avoid out-of-bounds cpu speculation
4519 */
4520 map_ptr = env->insn_aux_data[i + delta].map_ptr;
4521 if (map_ptr == BPF_MAP_PTR_POISON) {
4522 verbose(env, "tail_call abusing map_ptr\n");
4523 return -EINVAL;
4524 }
4525 if (!map_ptr->unpriv_array)
4526 continue;
4527 insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
4528 map_ptr->max_entries, 2);
4529 insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
4530 container_of(map_ptr,
4531 struct bpf_array,
4532 map)->index_mask);
4533 insn_buf[2] = *insn;
4534 cnt = 3;
4535 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
4536 if (!new_prog)
4537 return -ENOMEM;
4538
4539 delta += cnt - 1;
4540 env->prog = prog = new_prog;
4541 insn = new_prog->insnsi + i + delta;
4410 continue; 4542 continue;
4411 } 4543 }
4412 4544
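Among the verifier.c changes, coerce_reg_to_size() replaces the old coerce_reg_to_32()/tnum_cast pattern: after truncating a register to a smaller width it also repairs the unsigned bounds, keeping them only when the cleared high bits of umin and umax agree and otherwise widening to the full [0, mask] range. A standalone sketch of just that bounds logic (assumption: it omits the var_off/tnum tracking and the signed-bound copy that the kernel helper also does):

#include <stdio.h>
#include <stdint.h>

struct bounds { uint64_t umin, umax; };

/* Truncate a tracked value to 'size' bytes and repair its unsigned bounds,
 * mirroring the bounds half of coerce_reg_to_size(). */
static void coerce_to_size(struct bounds *b, int size)
{
	uint64_t mask = ((uint64_t)1 << (size * 8)) - 1;

	if ((b->umin & ~mask) == (b->umax & ~mask)) {
		/* High bits agree: the truncated range is still meaningful. */
		b->umin &= mask;
		b->umax &= mask;
	} else {
		/* High bits differ: the low part could be anything. */
		b->umin = 0;
		b->umax = mask;
	}
}

int main(void)
{
	struct bounds a = { 0x10000000ff00ULL, 0x10000000ffffULL };
	struct bounds b = { 0x0000ff00ULL, 0x2000000000ULL };

	coerce_to_size(&a, 4);	/* -> [0xff00, 0xffff] */
	coerce_to_size(&b, 4);	/* -> [0x0, 0xffffffff] */
	printf("a: [%#llx, %#llx]\n",
	       (unsigned long long)a.umin, (unsigned long long)a.umax);
	printf("b: [%#llx, %#llx]\n",
	       (unsigned long long)b.umin, (unsigned long long)b.umax);
	return 0;
}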
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
123 */ 123 */
124 do { 124 do {
125 css_task_iter_start(&from->self, 0, &it); 125 css_task_iter_start(&from->self, 0, &it);
126 task = css_task_iter_next(&it); 126
127 do {
128 task = css_task_iter_next(&it);
129 } while (task && (task->flags & PF_EXITING));
130
127 if (task) 131 if (task)
128 get_task_struct(task); 132 get_task_struct(task);
129 css_task_iter_end(&it); 133 css_task_iter_end(&it);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, 1397 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1398 cft->name); 1398 cft->name);
1399 else 1399 else
1400 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 1400 strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1401 return buf; 1401 return buf;
1402} 1402}
1403 1403
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
1864 1864
1865 root->flags = opts->flags; 1865 root->flags = opts->flags;
1866 if (opts->release_agent) 1866 if (opts->release_agent)
1867 strcpy(root->release_agent_path, opts->release_agent); 1867 strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX);
1868 if (opts->name) 1868 if (opts->name)
1869 strcpy(root->name, opts->name); 1869 strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
1870 if (opts->cpuset_clone_children) 1870 if (opts->cpuset_clone_children)
1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1871 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1872} 1872}
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
4125 4125
4126static void css_task_iter_advance(struct css_task_iter *it) 4126static void css_task_iter_advance(struct css_task_iter *it)
4127{ 4127{
4128 struct list_head *l = it->task_pos; 4128 struct list_head *next;
4129 4129
4130 lockdep_assert_held(&css_set_lock); 4130 lockdep_assert_held(&css_set_lock);
4131 WARN_ON_ONCE(!l);
4132
4133repeat: 4131repeat:
4134 /* 4132 /*
4135 * Advance iterator to find next entry. cset->tasks is consumed 4133 * Advance iterator to find next entry. cset->tasks is consumed
4136 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 4134 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
4137 * next cset. 4135 * next cset.
4138 */ 4136 */
4139 l = l->next; 4137 next = it->task_pos->next;
4140 4138
4141 if (l == it->tasks_head) 4139 if (next == it->tasks_head)
4142 l = it->mg_tasks_head->next; 4140 next = it->mg_tasks_head->next;
4143 4141
4144 if (l == it->mg_tasks_head) 4142 if (next == it->mg_tasks_head)
4145 css_task_iter_advance_css_set(it); 4143 css_task_iter_advance_css_set(it);
4146 else 4144 else
4147 it->task_pos = l; 4145 it->task_pos = next;
4148 4146
4149 /* if PROCS, skip over tasks which aren't group leaders */ 4147 /* if PROCS, skip over tasks which aren't group leaders */
4150 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && 4148 if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = {
4449 }, 4447 },
4450 { 4448 {
4451 .name = "cgroup.threads", 4449 .name = "cgroup.threads",
4450 .flags = CFTYPE_NS_DELEGATABLE,
4452 .release = cgroup_procs_release, 4451 .release = cgroup_procs_release,
4453 .seq_start = cgroup_threads_start, 4452 .seq_start = cgroup_threads_start,
4454 .seq_next = cgroup_procs_next, 4453 .seq_next = cgroup_procs_next,
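The cgroup.c changes swap strcpy()/strncpy() for strlcpy() when filling fixed-size buffers such as root->name and root->release_agent_path, so an over-long source is truncated but the destination is always NUL-terminated. A small standalone demo of the difference (strlcpy() is a BSD/kernel helper and may be absent from glibc, so a local stand-in is used here by assumption):

#include <stdio.h>
#include <string.h>

/* Local stand-in for strlcpy(): copy at most size-1 bytes and always
 * NUL-terminate; returns the length of src. */
static size_t strlcpy_sketch(char *dst, const char *src, size_t size)
{
	size_t len = strlen(src);

	if (size) {
		size_t n = len >= size ? size - 1 : len;

		memcpy(dst, src, n);
		dst[n] = '\0';
	}
	return len;
}

int main(void)
{
	char a[8], b[8];
	const char *name = "a-much-too-long-name";

	strncpy(a, name, sizeof(a));		/* fills all 8 bytes, no '\0' */
	strlcpy_sketch(b, name, sizeof(b));	/* 7 chars + '\0' */

	printf("strlcpy result: \"%s\"\n", b);
	printf("strncpy left a terminator: %s\n",
	       memchr(a, '\0', sizeof(a)) ? "yes" : "no");	/* no */
	return 0;
}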
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f780d8f6a9d..9caeda610249 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v)
50 50
51 spin_lock_irq(&css_set_lock); 51 spin_lock_irq(&css_set_lock);
52 rcu_read_lock(); 52 rcu_read_lock();
53 cset = rcu_dereference(current->cgroups); 53 cset = task_css_set(current);
54 refcnt = refcount_read(&cset->refcount); 54 refcnt = refcount_read(&cset->refcount);
55 seq_printf(seq, "css_set %pK %d", cset, refcnt); 55 seq_printf(seq, "css_set %pK %d", cset, refcnt);
56 if (refcnt > cset->nr_tasks) 56 if (refcnt > cset->nr_tasks)
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
96 96
97 spin_lock_irq(&css_set_lock); 97 spin_lock_irq(&css_set_lock);
98 rcu_read_lock(); 98 rcu_read_lock();
99 cset = rcu_dereference(current->cgroups); 99 cset = task_css_set(current);
100 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 100 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
101 struct cgroup *c = link->cgrp; 101 struct cgroup *c = link->cgrp;
102 102
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 133b465691d6..1e111dd455c4 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp)
296 } 296 }
297 297
298 /* ->updated_children list is self terminated */ 298 /* ->updated_children list is self terminated */
299 for_each_possible_cpu(cpu) 299 for_each_possible_cpu(cpu) {
300 cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; 300 struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
301
302 cstat->updated_children = cgrp;
303 u64_stats_init(&cstat->sync);
304 }
301 305
302 prev_cputime_init(&cgrp->stat.prev_cputime); 306 prev_cputime_init(&cgrp->stat.prev_cputime);
303 307
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 04892a82f6ac..53f7dc65f9a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map =
80 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); 80 STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
81 81
82 82
83static void inline cpuhp_lock_acquire(bool bringup) 83static inline void cpuhp_lock_acquire(bool bringup)
84{ 84{
85 lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); 85 lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
86} 86}
87 87
88static void inline cpuhp_lock_release(bool bringup) 88static inline void cpuhp_lock_release(bool bringup)
89{ 89{
90 lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); 90 lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
91} 91}
92#else 92#else
93 93
94static void inline cpuhp_lock_acquire(bool bringup) { } 94static inline void cpuhp_lock_acquire(bool bringup) { }
95static void inline cpuhp_lock_release(bool bringup) { } 95static inline void cpuhp_lock_release(bool bringup) { }
96 96
97#endif 97#endif
98 98
@@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu)
780 BUG_ON(cpu_online(cpu)); 780 BUG_ON(cpu_online(cpu));
781 781
782 /* 782 /*
783 * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all 783 * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
784 * runnable tasks from the cpu, there's only the idle task left now 784 * all runnable tasks from the CPU, there's only the idle task left now
785 * that the migration thread is done doing the stop_machine thing. 785 * that the migration thread is done doing the stop_machine thing.
786 * 786 *
787 * Wait for the stop thread to go away. 787 * Wait for the stop thread to go away.
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1277 * before blk_mq_queue_reinit_notify() from notify_dead(), 1277 * before blk_mq_queue_reinit_notify() from notify_dead(),
1278 * otherwise a RCU stall occurs. 1278 * otherwise a RCU stall occurs.
1279 */ 1279 */
1280 [CPUHP_TIMERS_DEAD] = { 1280 [CPUHP_TIMERS_PREPARE] = {
1281 .name = "timers:dead", 1281 .name = "timers:dead",
1282 .startup.single = NULL, 1282 .startup.single = timers_prepare_cpu,
1283 .teardown.single = timers_dead_cpu, 1283 .teardown.single = timers_dead_cpu,
1284 }, 1284 },
1285 /* Kicks the plugged cpu into life */ 1285 /* Kicks the plugged cpu into life */
@@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
1289 .teardown.single = NULL, 1289 .teardown.single = NULL,
1290 .cant_stop = true, 1290 .cant_stop = true,
1291 }, 1291 },
1292 [CPUHP_AP_SMPCFD_DYING] = {
1293 .name = "smpcfd:dying",
1294 .startup.single = NULL,
1295 .teardown.single = smpcfd_dying_cpu,
1296 },
1297 /* 1292 /*
 1298 * Handled on control processor until the plugged processor manages 1293 * Handled on control processor until the plugged processor manages
1299 * this itself. 1294 * this itself.
@@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
1335 .startup.single = NULL, 1330 .startup.single = NULL,
1336 .teardown.single = rcutree_dying_cpu, 1331 .teardown.single = rcutree_dying_cpu,
1337 }, 1332 },
1333 [CPUHP_AP_SMPCFD_DYING] = {
1334 .name = "smpcfd:dying",
1335 .startup.single = NULL,
1336 .teardown.single = smpcfd_dying_cpu,
1337 },
1338 /* Entry state on starting. Interrupts enabled from here on. Transient 1338 /* Entry state on starting. Interrupts enabled from here on. Transient
 1339 * state for synchronization */ 1339 * state for synchronization */
1340 [CPUHP_AP_ONLINE] = { 1340 [CPUHP_AP_ONLINE] = {
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index b3663896278e..4f63597c824d 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
410 VMCOREINFO_SYMBOL(contig_page_data); 410 VMCOREINFO_SYMBOL(contig_page_data);
411#endif 411#endif
412#ifdef CONFIG_SPARSEMEM 412#ifdef CONFIG_SPARSEMEM
413 VMCOREINFO_SYMBOL(mem_section); 413 VMCOREINFO_SYMBOL_ARRAY(mem_section);
414 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 414 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
415 VMCOREINFO_STRUCT_SIZE(mem_section); 415 VMCOREINFO_STRUCT_SIZE(mem_section);
416 VMCOREINFO_OFFSET(mem_section, section_mem_map); 416 VMCOREINFO_OFFSET(mem_section, section_mem_map);
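The switch from VMCOREINFO_SYMBOL to VMCOREINFO_SYMBOL_ARRAY matters because mem_section may be a pointer to a runtime-allocated table rather than a static array (CONFIG_SPARSEMEM_EXTREME): VMCOREINFO_SYMBOL records &mem_section, which for a pointer variable is the address of the pointer itself, whereas VMCOREINFO_SYMBOL_ARRAY records its value, i.e. the table that dump tools actually need. A minimal standalone C sketch of the &x versus x distinction (illustrative names only, not the kernel macros):

#include <stdio.h>

static long table[4];            /* stands in for a static mem_section[] */
static long *table_ptr = table;  /* stands in for the runtime-allocated case */

int main(void)
{
        /* For an array, &table and table name the same address. */
        printf("array:   &table=%p     table=%p\n", (void *)&table, (void *)table);

        /* For a pointer, &table_ptr is the pointer variable itself;
         * the table it refers to is table_ptr's value. */
        printf("pointer: &table_ptr=%p table_ptr=%p\n",
               (void *)&table_ptr, (void *)table_ptr);
        return 0;
}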
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index e74be38245ad..ed5d34925ad0 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -350,7 +350,7 @@ poll_again:
350 } 350 }
351 kdb_printf("\n"); 351 kdb_printf("\n");
352 for (i = 0; i < count; i++) { 352 for (i = 0; i < count; i++) {
353 if (kallsyms_symbol_next(p_tmp, i) < 0) 353 if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
354 break; 354 break;
355 kdb_printf("%s ", p_tmp); 355 kdb_printf("%s ", p_tmp);
356 *(p_tmp + len) = '\0'; 356 *(p_tmp + len) = '\0';
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 4a1c33416b6a..e2764d767f18 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk)
51 * Finish delay accounting for a statistic using its timestamps (@start), 51 * Finish delay accounting for a statistic using its timestamps (@start),
 52 * accumulator (@total) and @count 52 * accumulator (@total) and @count
53 */ 53 */
54static void delayacct_end(u64 *start, u64 *total, u32 *count) 54static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count)
55{ 55{
56 s64 ns = ktime_get_ns() - *start; 56 s64 ns = ktime_get_ns() - *start;
57 unsigned long flags; 57 unsigned long flags;
58 58
59 if (ns > 0) { 59 if (ns > 0) {
60 spin_lock_irqsave(&current->delays->lock, flags); 60 spin_lock_irqsave(lock, flags);
61 *total += ns; 61 *total += ns;
62 (*count)++; 62 (*count)++;
63 spin_unlock_irqrestore(&current->delays->lock, flags); 63 spin_unlock_irqrestore(lock, flags);
64 } 64 }
65} 65}
66 66
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void)
69 current->delays->blkio_start = ktime_get_ns(); 69 current->delays->blkio_start = ktime_get_ns();
70} 70}
71 71
72void __delayacct_blkio_end(void) 72/*
73 * We cannot rely on the `current` macro, as we haven't yet switched back to
74 * the process being woken.
75 */
76void __delayacct_blkio_end(struct task_struct *p)
73{ 77{
74 if (current->delays->flags & DELAYACCT_PF_SWAPIN) 78 struct task_delay_info *delays = p->delays;
75 /* Swapin block I/O */ 79 u64 *total;
76 delayacct_end(&current->delays->blkio_start, 80 u32 *count;
77 &current->delays->swapin_delay, 81
78 &current->delays->swapin_count); 82 if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
79 else /* Other block I/O */ 83 total = &delays->swapin_delay;
80 delayacct_end(&current->delays->blkio_start, 84 count = &delays->swapin_count;
81 &current->delays->blkio_delay, 85 } else {
82 &current->delays->blkio_count); 86 total = &delays->blkio_delay;
87 count = &delays->blkio_count;
88 }
89
90 delayacct_end(&delays->lock, &delays->blkio_start, total, count);
83} 91}
84 92
85int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) 93int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void)
153 161
154void __delayacct_freepages_end(void) 162void __delayacct_freepages_end(void)
155{ 163{
156 delayacct_end(&current->delays->freepages_start, 164 delayacct_end(
157 &current->delays->freepages_delay, 165 &current->delays->lock,
158 &current->delays->freepages_count); 166 &current->delays->freepages_start,
167 &current->delays->freepages_delay,
168 &current->delays->freepages_count);
159} 169}
160 170
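The new comment above is the key to this hunk: the block-I/O delay is now charged on the wake-up path, which still runs in the waker's context, so relying on current would account the delay to the wrong task. A small userspace sketch of the same idea, with a thread-local variable standing in for the kernel's current (all names here are illustrative):

#include <stdio.h>

struct task {
        const char *name;
        unsigned long long blkio_delay_ns;
};

/* stand-in for the kernel's per-CPU 'current' */
static _Thread_local struct task *current_task;

/* Charge the delay to an explicit task, not to whoever happens to run this. */
static void blkio_end(struct task *t, unsigned long long ns)
{
        t->blkio_delay_ns += ns;
}

int main(void)
{
        struct task waker = { "waker", 0 };
        struct task sleeper = { "sleeper", 0 };

        current_task = &waker;          /* the wake-up runs in the waker's context */
        blkio_end(&sleeper, 1234);      /* so the sleeping task is passed explicitly */

        printf("current is %s, delay charged to %s: %llu ns\n",
               current_task->name, sleeper.name, sleeper.blkio_delay_ns);
        return 0;
}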
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4767e1..5d8f4031f8d5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx)
1231 * perf_event_context::lock 1231 * perf_event_context::lock
1232 * perf_event::mmap_mutex 1232 * perf_event::mmap_mutex
1233 * mmap_sem 1233 * mmap_sem
1234 *
1235 * cpu_hotplug_lock
1236 * pmus_lock
1237 * cpuctx->mutex / perf_event_context::mutex
1234 */ 1238 */
1235static struct perf_event_context * 1239static struct perf_event_context *
1236perf_event_ctx_lock_nested(struct perf_event *event, int nesting) 1240perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event)
4196{ 4200{
4197 struct perf_event_context *ctx = event->ctx; 4201 struct perf_event_context *ctx = event->ctx;
4198 struct perf_event *child, *tmp; 4202 struct perf_event *child, *tmp;
4203 LIST_HEAD(free_list);
4199 4204
4200 /* 4205 /*
4201 * If we got here through err_file: fput(event_file); we will not have 4206 * If we got here through err_file: fput(event_file); we will not have
@@ -4268,8 +4273,7 @@ again:
4268 struct perf_event, child_list); 4273 struct perf_event, child_list);
4269 if (tmp == child) { 4274 if (tmp == child) {
4270 perf_remove_from_context(child, DETACH_GROUP); 4275 perf_remove_from_context(child, DETACH_GROUP);
4271 list_del(&child->child_list); 4276 list_move(&child->child_list, &free_list);
4272 free_event(child);
4273 /* 4277 /*
4274 * This matches the refcount bump in inherit_event(); 4278 * This matches the refcount bump in inherit_event();
4275 * this can't be the last reference. 4279 * this can't be the last reference.
@@ -4284,6 +4288,11 @@ again:
4284 } 4288 }
4285 mutex_unlock(&event->child_mutex); 4289 mutex_unlock(&event->child_mutex);
4286 4290
4291 list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4292 list_del(&child->child_list);
4293 free_event(child);
4294 }
4295
4287no_ctx: 4296no_ctx:
4288 put_event(event); /* Must be the 'last' reference */ 4297 put_event(event); /* Must be the 'last' reference */
4289 return 0; 4298 return 0;
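The free_list change in the two hunks above is a common shape: children are unhooked onto a local list while event->child_mutex is held, and the actual free_event() calls happen only after the mutex is dropped, keeping the freeing work out of the critical section. A self-contained pthread sketch of the same collect-then-free pattern (illustrative, not the perf code):

#include <pthread.h>
#include <stdlib.h>

struct child {
        struct child *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct child *children;   /* protected by list_lock */

static void release_children(void)
{
        struct child *free_list = NULL, *c;

        /* Unhook everything while holding the lock... */
        pthread_mutex_lock(&list_lock);
        while ((c = children)) {
                children = c->next;
                c->next = free_list;
                free_list = c;
        }
        pthread_mutex_unlock(&list_lock);

        /* ...and do the potentially slow frees after dropping it. */
        while ((c = free_list)) {
                free_list = c->next;
                free(c);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct child *c = malloc(sizeof(*c));

                c->next = children;
                children = c;
        }
        release_children();
        return 0;
}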
@@ -6639,6 +6648,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
6639 struct perf_namespaces_event *namespaces_event = data; 6648 struct perf_namespaces_event *namespaces_event = data;
6640 struct perf_output_handle handle; 6649 struct perf_output_handle handle;
6641 struct perf_sample_data sample; 6650 struct perf_sample_data sample;
6651 u16 header_size = namespaces_event->event_id.header.size;
6642 int ret; 6652 int ret;
6643 6653
6644 if (!perf_event_namespaces_match(event)) 6654 if (!perf_event_namespaces_match(event))
@@ -6649,7 +6659,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
6649 ret = perf_output_begin(&handle, event, 6659 ret = perf_output_begin(&handle, event,
6650 namespaces_event->event_id.header.size); 6660 namespaces_event->event_id.header.size);
6651 if (ret) 6661 if (ret)
6652 return; 6662 goto out;
6653 6663
6654 namespaces_event->event_id.pid = perf_event_pid(event, 6664 namespaces_event->event_id.pid = perf_event_pid(event,
6655 namespaces_event->task); 6665 namespaces_event->task);
@@ -6661,6 +6671,8 @@ static void perf_event_namespaces_output(struct perf_event *event,
6661 perf_event__output_id_sample(event, &handle, &sample); 6671 perf_event__output_id_sample(event, &handle, &sample);
6662 6672
6663 perf_output_end(&handle); 6673 perf_output_end(&handle);
6674out:
6675 namespaces_event->event_id.header.size = header_size;
6664} 6676}
6665 6677
6666static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, 6678static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
@@ -7987,11 +7999,11 @@ static void bpf_overflow_handler(struct perf_event *event,
7987{ 7999{
7988 struct bpf_perf_event_data_kern ctx = { 8000 struct bpf_perf_event_data_kern ctx = {
7989 .data = data, 8001 .data = data,
7990 .regs = regs,
7991 .event = event, 8002 .event = event,
7992 }; 8003 };
7993 int ret = 0; 8004 int ret = 0;
7994 8005
8006 ctx.regs = perf_arch_bpf_user_pt_regs(regs);
7995 preempt_disable(); 8007 preempt_disable();
7996 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) 8008 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
7997 goto out; 8009 goto out;
@@ -8513,6 +8525,29 @@ fail_clear_files:
8513 return ret; 8525 return ret;
8514} 8526}
8515 8527
8528static int
8529perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
8530{
8531 struct perf_event_context *ctx = event->ctx;
8532 int ret;
8533
8534 /*
8535 * Beware, here be dragons!!
8536 *
8537 * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
8538 * stuff does not actually need it. So temporarily drop ctx->mutex. As per
8539 * perf_event_ctx_lock() we already have a reference on ctx.
8540 *
8541 * This can result in event getting moved to a different ctx, but that
8542 * does not affect the tracepoint state.
8543 */
8544 mutex_unlock(&ctx->mutex);
8545 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
8546 mutex_lock(&ctx->mutex);
8547
8548 return ret;
8549}
8550
8516static int perf_event_set_filter(struct perf_event *event, void __user *arg) 8551static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8517{ 8552{
8518 char *filter_str; 8553 char *filter_str;
@@ -8529,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8529 8564
8530 if (IS_ENABLED(CONFIG_EVENT_TRACING) && 8565 if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8531 event->attr.type == PERF_TYPE_TRACEPOINT) 8566 event->attr.type == PERF_TYPE_TRACEPOINT)
8532 ret = ftrace_profile_set_filter(event, event->attr.config, 8567 ret = perf_tracepoint_set_filter(event, filter_str);
8533 filter_str);
8534 else if (has_addr_filter(event)) 8568 else if (has_addr_filter(event))
8535 ret = perf_event_set_addr_filter(event, filter_str); 8569 ret = perf_event_set_addr_filter(event, filter_str);
8536 8570
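perf_tracepoint_set_filter(), added above, makes the lock juggling explicit: ctx->mutex is dropped around the ftrace call that would otherwise deadlock against it, which is safe only because the caller already holds a reference that keeps ctx alive. A pthread sketch of that drop-and-reacquire pattern under the same assumption (hypothetical names, not the perf code):

#include <pthread.h>

struct ctx {
        pthread_mutex_t mutex;
        /* refcounted elsewhere; the caller guarantees it stays alive */
};

/* Imagine this helper internally takes a lock that ranks above ctx->mutex. */
static int subsystem_call_with_conflicting_lock(void)
{
        return 0;
}

static int do_set_filter(struct ctx *ctx)
{
        int ret;

        /*
         * Called with ctx->mutex held. Drop it around the call that would
         * otherwise invert the lock order, then take it back. Only safe
         * because a reference pins ctx for the duration.
         */
        pthread_mutex_unlock(&ctx->mutex);
        ret = subsystem_call_with_conflicting_lock();
        pthread_mutex_lock(&ctx->mutex);

        return ret;
}

int main(void)
{
        struct ctx c = { .mutex = PTHREAD_MUTEX_INITIALIZER };

        pthread_mutex_lock(&c.mutex);
        do_set_filter(&c);
        pthread_mutex_unlock(&c.mutex);
        return 0;
}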
@@ -9165,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9165 if (!try_module_get(pmu->module)) 9199 if (!try_module_get(pmu->module))
9166 return -ENODEV; 9200 return -ENODEV;
9167 9201
9168 if (event->group_leader != event) { 9202 /*
9203 * A number of pmu->event_init() methods iterate the sibling_list to,
9204 * for example, validate if the group fits on the PMU. Therefore,
9205 * if this is a sibling event, acquire the ctx->mutex to protect
9206 * the sibling_list.
9207 */
9208 if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
9169 /* 9209 /*
9170 * This ctx->mutex can nest when we're called through 9210 * This ctx->mutex can nest when we're called through
9171 * inheritance. See the perf_event_ctx_lock_nested() comment. 9211 * inheritance. See the perf_event_ctx_lock_nested() comment.
diff --git a/kernel/exit.c b/kernel/exit.c
index 6b4298a41167..995453d9fb55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1755,3 +1755,12 @@ Efault:
1755 return -EFAULT; 1755 return -EFAULT;
1756} 1756}
1757#endif 1757#endif
1758
1759__weak void abort(void)
1760{
1761 BUG();
1762
1763 /* if that doesn't kill us, halt */
1764 panic("Oops failed to kill thread");
1765}
1766EXPORT_SYMBOL(abort);
diff --git a/kernel/fork.c b/kernel/fork.c
index 432eadf6b58c..2295fc69717f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
721 goto out; 721 goto out;
722 } 722 }
723 /* a new mm has just been created */ 723 /* a new mm has just been created */
724 arch_dup_mmap(oldmm, mm); 724 retval = arch_dup_mmap(oldmm, mm);
725 retval = 0;
726out: 725out:
727 up_write(&mm->mmap_sem); 726 up_write(&mm->mmap_sem);
728 flush_tlb_mm(oldmm); 727 flush_tlb_mm(oldmm);
diff --git a/kernel/futex.c b/kernel/futex.c
index 76ed5921117a..7f719d110908 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1582,8 +1582,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
1582{ 1582{
1583 unsigned int op = (encoded_op & 0x70000000) >> 28; 1583 unsigned int op = (encoded_op & 0x70000000) >> 28;
1584 unsigned int cmp = (encoded_op & 0x0f000000) >> 24; 1584 unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
1585 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); 1585 int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
1586 int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); 1586 int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
1587 int oldval, ret; 1587 int oldval, ret;
1588 1588
1589 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { 1589 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
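The sign_extend32() fix above is subtle: the second argument is the zero-based index of the sign bit, not the width of the field, so for the 12-bit oparg/cmparg fields the sign bit sits at index 11. A standalone sketch of the same semantics (a re-implementation for illustration, not the kernel header):

#include <stdio.h>
#include <stdint.h>

/* Sign-extend 'value' whose sign bit sits at zero-based position 'index'. */
static int32_t sign_extend32(uint32_t value, int index)
{
        uint8_t shift = 31 - index;

        return (int32_t)(value << shift) >> shift;
}

int main(void)
{
        uint32_t field = 0xfff;         /* 12-bit field holding -1 */

        /* Bit 11 is the sign bit of a 12-bit field... */
        printf("index 11: %d\n", sign_extend32(field, 11));    /* -1   */
        /* ...whereas passing 12 treats it as a wider, positive value. */
        printf("index 12: %d\n", sign_extend32(field, 12));    /* 4095 */
        return 0;
}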
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1878 struct futex_q *this, *next; 1878 struct futex_q *this, *next;
1879 DEFINE_WAKE_Q(wake_q); 1879 DEFINE_WAKE_Q(wake_q);
1880 1880
1881 if (nr_wake < 0 || nr_requeue < 0)
1882 return -EINVAL;
1883
1881 /* 1884 /*
1882 * When PI not supported: return -ENOSYS if requeue_pi is true, 1885 * When PI not supported: return -ENOSYS if requeue_pi is true,
1883 * consequently the compiler knows requeue_pi is always false past 1886 * consequently the compiler knows requeue_pi is always false past
@@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q)
2294 spin_unlock(q->lock_ptr); 2297 spin_unlock(q->lock_ptr);
2295} 2298}
2296 2299
2297/*
2298 * Fixup the pi_state owner with the new owner.
2299 *
2300 * Must be called with hash bucket lock held and mm->sem held for non
2301 * private futexes.
2302 */
2303static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 2300static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2304 struct task_struct *newowner) 2301 struct task_struct *argowner)
2305{ 2302{
2306 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2307 struct futex_pi_state *pi_state = q->pi_state; 2303 struct futex_pi_state *pi_state = q->pi_state;
2308 u32 uval, uninitialized_var(curval), newval; 2304 u32 uval, uninitialized_var(curval), newval;
2309 struct task_struct *oldowner; 2305 struct task_struct *oldowner, *newowner;
2306 u32 newtid;
2310 int ret; 2307 int ret;
2311 2308
2309 lockdep_assert_held(q->lock_ptr);
2310
2312 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); 2311 raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
2313 2312
2314 oldowner = pi_state->owner; 2313 oldowner = pi_state->owner;
2315 /* Owner died? */
2316 if (!pi_state->owner)
2317 newtid |= FUTEX_OWNER_DIED;
2318 2314
2319 /* 2315 /*
2320 * We are here either because we stole the rtmutex from the 2316 * We are here because either:
2321 * previous highest priority waiter or we are the highest priority 2317 *
2322 * waiter but have failed to get the rtmutex the first time. 2318 * - we stole the lock and pi_state->owner needs updating to reflect
2319 * that (@argowner == current),
2320 *
2321 * or:
2323 * 2322 *
2324 * We have to replace the newowner TID in the user space variable. 2323 * - someone stole our lock and we need to fix things to point to the
2324 * new owner (@argowner == NULL).
2325 *
2326 * Either way, we have to replace the TID in the user space variable.
2325 * This must be atomic as we have to preserve the owner died bit here. 2327 * This must be atomic as we have to preserve the owner died bit here.
2326 * 2328 *
2327 * Note: We write the user space value _before_ changing the pi_state 2329 * Note: We write the user space value _before_ changing the pi_state
@@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
2334 * in the PID check in lookup_pi_state. 2336 * in the PID check in lookup_pi_state.
2335 */ 2337 */
2336retry: 2338retry:
2339 if (!argowner) {
2340 if (oldowner != current) {
2341 /*
2342 * We raced against a concurrent self; things are
2343 * already fixed up. Nothing to do.
2344 */
2345 ret = 0;
2346 goto out_unlock;
2347 }
2348
2349 if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
2350 /* We got the lock after all, nothing to fix. */
2351 ret = 0;
2352 goto out_unlock;
2353 }
2354
2355 /*
2356 * Since we just failed the trylock; there must be an owner.
2357 */
2358 newowner = rt_mutex_owner(&pi_state->pi_mutex);
2359 BUG_ON(!newowner);
2360 } else {
2361 WARN_ON_ONCE(argowner != current);
2362 if (oldowner == current) {
2363 /*
2364 * We raced against a concurrent self; things are
2365 * already fixed up. Nothing to do.
2366 */
2367 ret = 0;
2368 goto out_unlock;
2369 }
2370 newowner = argowner;
2371 }
2372
2373 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
2374 /* Owner died? */
2375 if (!pi_state->owner)
2376 newtid |= FUTEX_OWNER_DIED;
2377
2337 if (get_futex_value_locked(&uval, uaddr)) 2378 if (get_futex_value_locked(&uval, uaddr))
2338 goto handle_fault; 2379 goto handle_fault;
2339 2380
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2434 * Got the lock. We might not be the anticipated owner if we 2475 * Got the lock. We might not be the anticipated owner if we
2435 * did a lock-steal - fix up the PI-state in that case: 2476 * did a lock-steal - fix up the PI-state in that case:
2436 * 2477 *
2437 * We can safely read pi_state->owner without holding wait_lock 2478 * Speculative pi_state->owner read (we don't hold wait_lock);
2438 * because we now own the rt_mutex, only the owner will attempt 2479 * since we own the lock pi_state->owner == current is the
2439 * to change it. 2480 * stable state, anything else needs more attention.
2440 */ 2481 */
2441 if (q->pi_state->owner != current) 2482 if (q->pi_state->owner != current)
2442 ret = fixup_pi_state_owner(uaddr, q, current); 2483 ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
2444 } 2485 }
2445 2486
2446 /* 2487 /*
2488 * If we didn't get the lock; check if anybody stole it from us. In
2489 * that case, we need to fix up the uval to point to them instead of
2490 * us, otherwise bad things happen. [10]
2491 *
2492 * Another speculative read; pi_state->owner == current is unstable
2493 * but needs our attention.
2494 */
2495 if (q->pi_state->owner == current) {
2496 ret = fixup_pi_state_owner(uaddr, q, NULL);
2497 goto out;
2498 }
2499
2500 /*
2447 * Paranoia check. If we did not take the lock, then we should not be 2501 * Paranoia check. If we did not take the lock, then we should not be
2448 * the owner of the rt_mutex. 2502 * the owner of the rt_mutex.
2449 */ 2503 */
diff --git a/kernel/groups.c b/kernel/groups.c
index e357bc800111..daae2f2dc6d4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b)
86 return gid_gt(a, b) - gid_lt(a, b); 86 return gid_gt(a, b) - gid_lt(a, b);
87} 87}
88 88
89static void groups_sort(struct group_info *group_info) 89void groups_sort(struct group_info *group_info)
90{ 90{
91 sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), 91 sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
92 gid_cmp, NULL); 92 gid_cmp, NULL);
93} 93}
94EXPORT_SYMBOL(groups_sort);
94 95
95/* a simple bsearch */ 96/* a simple bsearch */
96int groups_search(const struct group_info *group_info, kgid_t grp) 97int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
122void set_groups(struct cred *new, struct group_info *group_info) 123void set_groups(struct cred *new, struct group_info *group_info)
123{ 124{
124 put_group_info(new->group_info); 125 put_group_info(new->group_info);
125 groups_sort(group_info);
126 get_group_info(group_info); 126 get_group_info(group_info);
127 new->group_info = group_info; 127 new->group_info = group_info;
128} 128}
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
206 return retval; 206 return retval;
207 } 207 }
208 208
209 groups_sort(group_info);
209 retval = set_current_groups(group_info); 210 retval = set_current_groups(group_info);
210 put_group_info(group_info); 211 put_group_info(group_info);
211 212
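The groups_sort() change above shifts responsibility to the callers: set_groups() no longer sorts, so an already-installed and possibly shared group_info is never re-sorted behind readers, and each caller instead sorts the freshly built list once before installing it, keeping groups_search()'s binary search valid. A userspace sketch of the sort-once-then-bsearch discipline (plain ints standing in for gids):

#include <stdio.h>
#include <stdlib.h>

static int gid_cmp(const void *a, const void *b)
{
        int x = *(const int *)a, y = *(const int *)b;

        return (x > y) - (x < y);
}

int main(void)
{
        int gids[] = { 1000, 4, 24, 27, 30 };
        size_t n = sizeof(gids) / sizeof(gids[0]);
        int key = 27;

        /* Sort once, before the list is "installed" for lookups... */
        qsort(gids, n, sizeof(gids[0]), gid_cmp);

        /* ...then every lookup can rely on binary search. */
        int *hit = bsearch(&key, gids, n, sizeof(gids[0]), gid_cmp);

        printf("gid %d %sfound\n", key, hit ? "" : "not ");
        return 0;
}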
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 17f05ef8f575..e4d3819a91cc 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -12,6 +12,11 @@
12 12
13static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 13static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
14{ 14{
15 static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);
16
17 if (!__ratelimit(&ratelimit))
18 return;
19
15 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 20 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
16 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 21 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
17 printk("->handle_irq(): %p, ", desc->handle_irq); 22 printk("->handle_irq(): %p, ", desc->handle_irq);
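The print_irq_desc() hunk above rate-limits the dump so a misbehaving interrupt cannot flood the console: at most 5 reports per 5*HZ window. A self-contained sketch of the same throttling idea using wall-clock seconds (not the kernel's __ratelimit() implementation):

#include <stdio.h>
#include <time.h>

/* Allow at most 'burst' messages per 'interval' seconds. */
static int ratelimit(time_t interval, int burst)
{
        static time_t window_start;
        static int printed;
        time_t now = time(NULL);

        if (now - window_start >= interval) {
                window_start = now;
                printed = 0;
        }
        if (printed >= burst)
                return 0;
        printed++;
        return 1;
}

int main(void)
{
        for (int i = 0; i < 100; i++)
                if (ratelimit(5, 5))
                        printf("irq diagnostic %d\n", i);  /* at most 5 per window */
        return 0;
}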
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 7f608ac39653..acfaaef8672a 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = {
113 BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), 113 BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
114 BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), 114 BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
115 BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), 115 BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
116 BIT_MASK_DESCR(IRQD_CAN_RESERVE),
116 117
117 BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), 118 BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU),
118 119
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index c26c5bb6b491..508c03dfef25 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
364EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); 364EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
365 365
366/* 366/*
367 * Separate lockdep class for interrupt chip which can nest irq_desc 367 * Separate lockdep classes for interrupt chip which can nest irq_desc
368 * lock. 368 * lock and request mutex.
369 */ 369 */
370static struct lock_class_key irq_nested_lock_class; 370static struct lock_class_key irq_nested_lock_class;
371static struct lock_class_key irq_nested_request_class;
371 372
372/* 373/*
373 * irq_map_generic_chip - Map a generic chip for an irq domain 374 * irq_map_generic_chip - Map a generic chip for an irq domain
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
409 set_bit(idx, &gc->installed); 410 set_bit(idx, &gc->installed);
410 411
411 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) 412 if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
412 irq_set_lockdep_class(virq, &irq_nested_lock_class); 413 irq_set_lockdep_class(virq, &irq_nested_lock_class,
414 &irq_nested_request_class);
413 415
414 if (chip->irq_calc_mask) 416 if (chip->irq_calc_mask)
415 chip->irq_calc_mask(data); 417 chip->irq_calc_mask(data);
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
479 continue; 481 continue;
480 482
481 if (flags & IRQ_GC_INIT_NESTED_LOCK) 483 if (flags & IRQ_GC_INIT_NESTED_LOCK)
482 irq_set_lockdep_class(i, &irq_nested_lock_class); 484 irq_set_lockdep_class(i, &irq_nested_lock_class,
485 &irq_nested_request_class);
483 486
484 if (!(flags & IRQ_GC_NO_MASK)) { 487 if (!(flags & IRQ_GC_NO_MASK)) {
485 struct irq_data *d = irq_get_irq_data(i); 488 struct irq_data *d = irq_get_irq_data(i);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 07d08ca701ec..ab19371eab9b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
440#endif /* !CONFIG_GENERIC_PENDING_IRQ */ 440#endif /* !CONFIG_GENERIC_PENDING_IRQ */
441 441
442#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) 442#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
443static inline int irq_domain_activate_irq(struct irq_data *data, bool early) 443static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
444{ 444{
445 irqd_set_activated(data); 445 irqd_set_activated(data);
446 return 0; 446 return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4f4f60015e8a..62068ad46930 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
1693 } 1693 }
1694} 1694}
1695 1695
1696static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) 1696static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
1697{ 1697{
1698 int ret = 0; 1698 int ret = 0;
1699 1699
@@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
1702 1702
1703 if (irqd->parent_data) 1703 if (irqd->parent_data)
1704 ret = __irq_domain_activate_irq(irqd->parent_data, 1704 ret = __irq_domain_activate_irq(irqd->parent_data,
1705 early); 1705 reserve);
1706 if (!ret && domain->ops->activate) { 1706 if (!ret && domain->ops->activate) {
1707 ret = domain->ops->activate(domain, irqd, early); 1707 ret = domain->ops->activate(domain, irqd, reserve);
1708 /* Rollback in case of error */ 1708 /* Rollback in case of error */
1709 if (ret && irqd->parent_data) 1709 if (ret && irqd->parent_data)
1710 __irq_domain_deactivate_irq(irqd->parent_data); 1710 __irq_domain_deactivate_irq(irqd->parent_data);
@@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
1716/** 1716/**
1717 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate 1717 * irq_domain_activate_irq - Call domain_ops->activate recursively to activate
1718 * interrupt 1718 * interrupt
1719 * @irq_data: outermost irq_data associated with interrupt 1719 * @irq_data: Outermost irq_data associated with interrupt
1720 * @reserve: If set only reserve an interrupt vector instead of assigning one
1720 * 1721 *
1721 * This is the second step to call domain_ops->activate to program interrupt 1722 * This is the second step to call domain_ops->activate to program interrupt
1722 * controllers, so the interrupt could actually get delivered. 1723 * controllers, so the interrupt could actually get delivered.
1723 */ 1724 */
1724int irq_domain_activate_irq(struct irq_data *irq_data, bool early) 1725int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
1725{ 1726{
1726 int ret = 0; 1727 int ret = 0;
1727 1728
1728 if (!irqd_is_activated(irq_data)) 1729 if (!irqd_is_activated(irq_data))
1729 ret = __irq_domain_activate_irq(irq_data, early); 1730 ret = __irq_domain_activate_irq(irq_data, reserve);
1730 if (!ret) 1731 if (!ret)
1731 irqd_set_activated(irq_data); 1732 irqd_set_activated(irq_data);
1732 return ret; 1733 return ret;
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 7df2480005f8..5187dfe809ac 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m)
321int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, 321int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
322 bool reserved, unsigned int *mapped_cpu) 322 bool reserved, unsigned int *mapped_cpu)
323{ 323{
324 unsigned int cpu; 324 unsigned int cpu, best_cpu, maxavl = 0;
325 struct cpumap *cm;
326 unsigned int bit;
325 327
328 best_cpu = UINT_MAX;
326 for_each_cpu(cpu, msk) { 329 for_each_cpu(cpu, msk) {
327 struct cpumap *cm = per_cpu_ptr(m->maps, cpu); 330 cm = per_cpu_ptr(m->maps, cpu);
328 unsigned int bit;
329 331
330 if (!cm->online) 332 if (!cm->online || cm->available <= maxavl)
331 continue; 333 continue;
332 334
335 best_cpu = cpu;
336 maxavl = cm->available;
337 }
338
339 if (maxavl) {
340 cm = per_cpu_ptr(m->maps, best_cpu);
333 bit = matrix_alloc_area(m, cm, 1, false); 341 bit = matrix_alloc_area(m, cm, 1, false);
334 if (bit < m->alloc_end) { 342 if (bit < m->alloc_end) {
335 cm->allocated++; 343 cm->allocated++;
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
338 m->global_available--; 346 m->global_available--;
339 if (reserved) 347 if (reserved)
340 m->global_reserved--; 348 m->global_reserved--;
341 *mapped_cpu = cpu; 349 *mapped_cpu = best_cpu;
342 trace_irq_matrix_alloc(bit, cpu, m, cm); 350 trace_irq_matrix_alloc(bit, best_cpu, m, cm);
343 return bit; 351 return bit;
344 } 352 }
345 } 353 }
@@ -384,7 +392,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
384{ 392{
385 struct cpumap *cm = this_cpu_ptr(m->maps); 393 struct cpumap *cm = this_cpu_ptr(m->maps);
386 394
387 return (m->global_available - cpudown) ? cm->available : 0; 395 if (!cpudown)
396 return m->global_available;
397 return m->global_available - cm->available;
388} 398}
389 399
390/** 400/**
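The irq_matrix_alloc() change above turns allocation into two passes: first scan the whole mask for the online CPU with the most available vectors, then allocate from that CPU, rather than taking the first online CPU with any room. A tiny standalone sketch of that pick-the-best-then-allocate shape (made-up numbers, not the real cpumap):

#include <stdio.h>

#define NCPUS 4

struct cpumap {
        int online;
        int available;
        int allocated;
};

static struct cpumap maps[NCPUS] = {
        { 1, 3, 0 }, { 1, 17, 0 }, { 0, 50, 0 }, { 1, 9, 0 },
};

/* Pass 1: find the online CPU with the most free slots; pass 2: allocate there. */
static int alloc_from_best_cpu(void)
{
        int cpu, best_cpu = -1, maxavl = 0;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                if (!maps[cpu].online || maps[cpu].available <= maxavl)
                        continue;
                best_cpu = cpu;
                maxavl = maps[cpu].available;
        }
        if (!maxavl)
                return -1;

        maps[best_cpu].available--;
        maps[best_cpu].allocated++;
        return best_cpu;
}

int main(void)
{
        printf("allocated on cpu %d\n", alloc_from_best_cpu());  /* cpu 1 */
        return 0;
}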
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index edb987b2c58d..2f3c4f5382cc 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
339 return ret; 339 return ret;
340} 340}
341 341
342/*
343 * Carefully check whether the device can use reservation mode. If
344 * reservation mode is enabled then the early activation will assign a
345 * dummy vector to the device. If the PCI/MSI device does not support
346 * masking of the entry then this can result in spurious interrupts when
347 * the device driver is not absolutely careful. But even then a malfunction
348 * of the hardware could result in a spurious interrupt on the dummy vector
349 * and render the device unusable. If the entry can be masked then the core
350 * logic will prevent the spurious interrupt and reservation mode can be
351 * used. For now reservation mode is restricted to PCI/MSI.
352 */
353static bool msi_check_reservation_mode(struct irq_domain *domain,
354 struct msi_domain_info *info,
355 struct device *dev)
356{
357 struct msi_desc *desc;
358
359 if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
360 return false;
361
362 if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
363 return false;
364
365 if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
366 return false;
367
368 /*
369 * Checking the first MSI descriptor is sufficient. MSIX supports
370 * masking and MSI does so when the maskbit is set.
371 */
372 desc = first_msi_entry(dev);
373 return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
374}
375
342/** 376/**
343 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain 377 * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
344 * @domain: The domain to allocate from 378 * @domain: The domain to allocate from
@@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
353{ 387{
354 struct msi_domain_info *info = domain->host_data; 388 struct msi_domain_info *info = domain->host_data;
355 struct msi_domain_ops *ops = info->ops; 389 struct msi_domain_ops *ops = info->ops;
356 msi_alloc_info_t arg; 390 struct irq_data *irq_data;
357 struct msi_desc *desc; 391 struct msi_desc *desc;
392 msi_alloc_info_t arg;
358 int i, ret, virq; 393 int i, ret, virq;
394 bool can_reserve;
359 395
360 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); 396 ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
361 if (ret) 397 if (ret)
@@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
385 if (ops->msi_finish) 421 if (ops->msi_finish)
386 ops->msi_finish(&arg, 0); 422 ops->msi_finish(&arg, 0);
387 423
424 can_reserve = msi_check_reservation_mode(domain, info, dev);
425
388 for_each_msi_entry(desc, dev) { 426 for_each_msi_entry(desc, dev) {
389 virq = desc->irq; 427 virq = desc->irq;
390 if (desc->nvec_used == 1) 428 if (desc->nvec_used == 1)
@@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
397 * the MSI entries before the PCI layer enables MSI in the 435 * the MSI entries before the PCI layer enables MSI in the
398 * card. Otherwise the card latches a random msi message. 436 * card. Otherwise the card latches a random msi message.
399 */ 437 */
400 if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { 438 if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
401 struct irq_data *irq_data; 439 continue;
402 440
441 irq_data = irq_domain_get_irq_data(domain, desc->irq);
442 if (!can_reserve)
443 irqd_clr_can_reserve(irq_data);
444 ret = irq_domain_activate_irq(irq_data, can_reserve);
445 if (ret)
446 goto cleanup;
447 }
448
449 /*
450 * If these interrupts use reservation mode, clear the activated bit
451 * so request_irq() will assign the final vector.
452 */
453 if (can_reserve) {
454 for_each_msi_entry(desc, dev) {
403 irq_data = irq_domain_get_irq_data(domain, desc->irq); 455 irq_data = irq_domain_get_irq_data(domain, desc->irq);
404 ret = irq_domain_activate_irq(irq_data, true); 456 irqd_clr_activated(irq_data);
405 if (ret)
406 goto cleanup;
407 if (info->flags & MSI_FLAG_MUST_REACTIVATE)
408 irqd_clr_activated(irq_data);
409 } 457 }
410 } 458 }
411 return 0; 459 return 0;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8594d24e4adc..b4517095db6a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key)
79} 79}
80EXPORT_SYMBOL_GPL(static_key_count); 80EXPORT_SYMBOL_GPL(static_key_count);
81 81
82static void static_key_slow_inc_cpuslocked(struct static_key *key) 82void static_key_slow_inc_cpuslocked(struct static_key *key)
83{ 83{
84 int v, v1; 84 int v, v1;
85 85
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key)
180} 180}
181EXPORT_SYMBOL_GPL(static_key_disable); 181EXPORT_SYMBOL_GPL(static_key_disable);
182 182
183static void static_key_slow_dec_cpuslocked(struct static_key *key, 183static void __static_key_slow_dec_cpuslocked(struct static_key *key,
184 unsigned long rate_limit, 184 unsigned long rate_limit,
185 struct delayed_work *work) 185 struct delayed_work *work)
186{ 186{
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key,
211 struct delayed_work *work) 211 struct delayed_work *work)
212{ 212{
213 cpus_read_lock(); 213 cpus_read_lock();
214 static_key_slow_dec_cpuslocked(key, rate_limit, work); 214 __static_key_slow_dec_cpuslocked(key, rate_limit, work);
215 cpus_read_unlock(); 215 cpus_read_unlock();
216} 216}
217 217
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key)
229} 229}
230EXPORT_SYMBOL_GPL(static_key_slow_dec); 230EXPORT_SYMBOL_GPL(static_key_slow_dec);
231 231
232void static_key_slow_dec_cpuslocked(struct static_key *key)
233{
234 STATIC_KEY_CHECK_USE(key);
235 __static_key_slow_dec_cpuslocked(key, 0, NULL);
236}
237
232void static_key_slow_dec_deferred(struct static_key_deferred *key) 238void static_key_slow_dec_deferred(struct static_key_deferred *key)
233{ 239{
234 STATIC_KEY_CHECK_USE(key); 240 STATIC_KEY_CHECK_USE(key);
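The jump_label hunks above expose *_cpuslocked variants for callers that already hold cpus_read_lock(), while the plain entry points keep taking that lock themselves. The general split is sketched below with a pthread mutex (generic pattern only, not the static-key code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static int key_count;   /* protected by big_lock */

/* For callers that already hold big_lock. */
static void key_inc_locked(void)
{
        key_count++;
}

/* Convenience wrapper for everyone else: takes and drops the lock itself. */
static void key_inc(void)
{
        pthread_mutex_lock(&big_lock);
        key_inc_locked();
        pthread_mutex_unlock(&big_lock);
}

int main(void)
{
        key_inc();                      /* lock taken internally */

        pthread_mutex_lock(&big_lock);  /* caller already in a locked section */
        key_inc_locked();
        pthread_mutex_unlock(&big_lock);

        printf("count = %d\n", key_count);
        return 0;
}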
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 15f33faf4013..7594c033d98a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
157} 157}
158EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); 158EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
159 159
160void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) 160void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
161{ 161{
162 write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); 162 write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
163} 163}
@@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
183} 183}
184EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); 184EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
185 185
186void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) 186void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
187{ 187{
188 write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, 188 write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
189 _RET_IP_); 189 _RET_IP_);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 9776da8db180..521659044719 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -49,6 +49,7 @@
49#include <linux/gfp.h> 49#include <linux/gfp.h>
50#include <linux/random.h> 50#include <linux/random.h>
51#include <linux/jhash.h> 51#include <linux/jhash.h>
52#include <linux/nmi.h>
52 53
53#include <asm/sections.h> 54#include <asm/sections.h>
54 55
@@ -57,10 +58,6 @@
57#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
58#include <trace/events/lock.h> 59#include <trace/events/lock.h>
59 60
60#ifdef CONFIG_LOCKDEP_CROSSRELEASE
61#include <linux/slab.h>
62#endif
63
64#ifdef CONFIG_PROVE_LOCKING 61#ifdef CONFIG_PROVE_LOCKING
65int prove_locking = 1; 62int prove_locking = 1;
66module_param(prove_locking, int, 0644); 63module_param(prove_locking, int, 0644);
@@ -75,19 +72,6 @@ module_param(lock_stat, int, 0644);
75#define lock_stat 0 72#define lock_stat 0
76#endif 73#endif
77 74
78#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
79static int crossrelease_fullstack = 1;
80#else
81static int crossrelease_fullstack;
82#endif
83static int __init allow_crossrelease_fullstack(char *str)
84{
85 crossrelease_fullstack = 1;
86 return 0;
87}
88
89early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
90
91/* 75/*
92 * lockdep_lock: protects the lockdep graph, the hashes and the 76 * lockdep_lock: protects the lockdep graph, the hashes and the
93 * class/list/hash allocators. 77 * class/list/hash allocators.
@@ -740,18 +724,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
740 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); 724 return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
741} 725}
742 726
743#ifdef CONFIG_LOCKDEP_CROSSRELEASE
744static void cross_init(struct lockdep_map *lock, int cross);
745static int cross_lock(struct lockdep_map *lock);
746static int lock_acquire_crosslock(struct held_lock *hlock);
747static int lock_release_crosslock(struct lockdep_map *lock);
748#else
749static inline void cross_init(struct lockdep_map *lock, int cross) {}
750static inline int cross_lock(struct lockdep_map *lock) { return 0; }
751static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; }
752static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; }
753#endif
754
755/* 727/*
756 * Register a lock's class in the hash-table, if the class is not present 728 * Register a lock's class in the hash-table, if the class is not present
757 * yet. Otherwise we look it up. We cache the result in the lock object 729 * yet. Otherwise we look it up. We cache the result in the lock object
@@ -1151,41 +1123,22 @@ print_circular_lock_scenario(struct held_lock *src,
1151 printk(KERN_CONT "\n\n"); 1123 printk(KERN_CONT "\n\n");
1152 } 1124 }
1153 1125
1154 if (cross_lock(tgt->instance)) { 1126 printk(" Possible unsafe locking scenario:\n\n");
1155 printk(" Possible unsafe locking scenario by crosslock:\n\n"); 1127 printk(" CPU0 CPU1\n");
1156 printk(" CPU0 CPU1\n"); 1128 printk(" ---- ----\n");
1157 printk(" ---- ----\n"); 1129 printk(" lock(");
1158 printk(" lock("); 1130 __print_lock_name(target);
1159 __print_lock_name(parent); 1131 printk(KERN_CONT ");\n");
1160 printk(KERN_CONT ");\n"); 1132 printk(" lock(");
1161 printk(" lock("); 1133 __print_lock_name(parent);
1162 __print_lock_name(target); 1134 printk(KERN_CONT ");\n");
1163 printk(KERN_CONT ");\n"); 1135 printk(" lock(");
1164 printk(" lock("); 1136 __print_lock_name(target);
1165 __print_lock_name(source); 1137 printk(KERN_CONT ");\n");
1166 printk(KERN_CONT ");\n"); 1138 printk(" lock(");
1167 printk(" unlock("); 1139 __print_lock_name(source);
1168 __print_lock_name(target); 1140 printk(KERN_CONT ");\n");
1169 printk(KERN_CONT ");\n"); 1141 printk("\n *** DEADLOCK ***\n\n");
1170 printk("\n *** DEADLOCK ***\n\n");
1171 } else {
1172 printk(" Possible unsafe locking scenario:\n\n");
1173 printk(" CPU0 CPU1\n");
1174 printk(" ---- ----\n");
1175 printk(" lock(");
1176 __print_lock_name(target);
1177 printk(KERN_CONT ");\n");
1178 printk(" lock(");
1179 __print_lock_name(parent);
1180 printk(KERN_CONT ");\n");
1181 printk(" lock(");
1182 __print_lock_name(target);
1183 printk(KERN_CONT ");\n");
1184 printk(" lock(");
1185 __print_lock_name(source);
1186 printk(KERN_CONT ");\n");
1187 printk("\n *** DEADLOCK ***\n\n");
1188 }
1189} 1142}
1190 1143
1191/* 1144/*
@@ -1211,10 +1164,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1211 curr->comm, task_pid_nr(curr)); 1164 curr->comm, task_pid_nr(curr));
1212 print_lock(check_src); 1165 print_lock(check_src);
1213 1166
1214 if (cross_lock(check_tgt->instance)) 1167 pr_warn("\nbut task is already holding lock:\n");
1215 pr_warn("\nbut now in release context of a crosslock acquired at the following:\n");
1216 else
1217 pr_warn("\nbut task is already holding lock:\n");
1218 1168
1219 print_lock(check_tgt); 1169 print_lock(check_tgt);
1220 pr_warn("\nwhich lock already depends on the new lock.\n\n"); 1170 pr_warn("\nwhich lock already depends on the new lock.\n\n");
@@ -1244,9 +1194,7 @@ static noinline int print_circular_bug(struct lock_list *this,
1244 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1194 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1245 return 0; 1195 return 0;
1246 1196
1247 if (cross_lock(check_tgt->instance)) 1197 if (!save_trace(&this->trace))
1248 this->trace = *trace;
1249 else if (!save_trace(&this->trace))
1250 return 0; 1198 return 0;
1251 1199
1252 depth = get_lock_depth(target); 1200 depth = get_lock_depth(target);
@@ -1850,9 +1798,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1850 if (nest) 1798 if (nest)
1851 return 2; 1799 return 2;
1852 1800
1853 if (cross_lock(prev->instance))
1854 continue;
1855
1856 return print_deadlock_bug(curr, prev, next); 1801 return print_deadlock_bug(curr, prev, next);
1857 } 1802 }
1858 return 1; 1803 return 1;
@@ -2018,31 +1963,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
2018 for (;;) { 1963 for (;;) {
2019 int distance = curr->lockdep_depth - depth + 1; 1964 int distance = curr->lockdep_depth - depth + 1;
2020 hlock = curr->held_locks + depth - 1; 1965 hlock = curr->held_locks + depth - 1;
1966
2021 /* 1967 /*
2022 * Only non-crosslock entries get new dependencies added. 1968 * Only non-recursive-read entries get new dependencies
2023 * Crosslock entries will be added by commit later: 1969 * added:
2024 */ 1970 */
2025 if (!cross_lock(hlock->instance)) { 1971 if (hlock->read != 2 && hlock->check) {
1972 int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace);
1973 if (!ret)
1974 return 0;
1975
2026 /* 1976 /*
2027 * Only non-recursive-read entries get new dependencies 1977 * Stop after the first non-trylock entry,
2028 * added: 1978 * as non-trylock entries have added their
1979 * own direct dependencies already, so this
1980 * lock is connected to them indirectly:
2029 */ 1981 */
2030 if (hlock->read != 2 && hlock->check) { 1982 if (!hlock->trylock)
2031 int ret = check_prev_add(curr, hlock, next, 1983 break;
2032 distance, &trace, save_trace);
2033 if (!ret)
2034 return 0;
2035
2036 /*
2037 * Stop after the first non-trylock entry,
2038 * as non-trylock entries have added their
2039 * own direct dependencies already, so this
2040 * lock is connected to them indirectly:
2041 */
2042 if (!hlock->trylock)
2043 break;
2044 }
2045 } 1984 }
1985
2046 depth--; 1986 depth--;
2047 /* 1987 /*
2048 * End of lock-stack? 1988 * End of lock-stack?
@@ -3292,21 +3232,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
3292void lockdep_init_map(struct lockdep_map *lock, const char *name, 3232void lockdep_init_map(struct lockdep_map *lock, const char *name,
3293 struct lock_class_key *key, int subclass) 3233 struct lock_class_key *key, int subclass)
3294{ 3234{
3295 cross_init(lock, 0);
3296 __lockdep_init_map(lock, name, key, subclass); 3235 __lockdep_init_map(lock, name, key, subclass);
3297} 3236}
3298EXPORT_SYMBOL_GPL(lockdep_init_map); 3237EXPORT_SYMBOL_GPL(lockdep_init_map);
3299 3238
3300#ifdef CONFIG_LOCKDEP_CROSSRELEASE
3301void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name,
3302 struct lock_class_key *key, int subclass)
3303{
3304 cross_init(lock, 1);
3305 __lockdep_init_map(lock, name, key, subclass);
3306}
3307EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock);
3308#endif
3309
3310struct lock_class_key __lockdep_no_validate__; 3239struct lock_class_key __lockdep_no_validate__;
3311EXPORT_SYMBOL_GPL(__lockdep_no_validate__); 3240EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
3312 3241
@@ -3362,7 +3291,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3362 int chain_head = 0; 3291 int chain_head = 0;
3363 int class_idx; 3292 int class_idx;
3364 u64 chain_key; 3293 u64 chain_key;
3365 int ret;
3366 3294
3367 if (unlikely(!debug_locks)) 3295 if (unlikely(!debug_locks))
3368 return 0; 3296 return 0;
@@ -3411,8 +3339,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3411 3339
3412 class_idx = class - lock_classes + 1; 3340 class_idx = class - lock_classes + 1;
3413 3341
3414 /* TODO: nest_lock is not implemented for crosslock yet. */ 3342 if (depth) {
3415 if (depth && !cross_lock(lock)) {
3416 hlock = curr->held_locks + depth - 1; 3343 hlock = curr->held_locks + depth - 1;
3417 if (hlock->class_idx == class_idx && nest_lock) { 3344 if (hlock->class_idx == class_idx && nest_lock) {
3418 if (hlock->references) { 3345 if (hlock->references) {
@@ -3500,14 +3427,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3500 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3427 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
3501 return 0; 3428 return 0;
3502 3429
3503 ret = lock_acquire_crosslock(hlock);
3504 /*
3505 * 2 means normal acquire operations are needed. Otherwise, it's
3506 * ok just to return with '0:fail, 1:success'.
3507 */
3508 if (ret != 2)
3509 return ret;
3510
3511 curr->curr_chain_key = chain_key; 3430 curr->curr_chain_key = chain_key;
3512 curr->lockdep_depth++; 3431 curr->lockdep_depth++;
3513 check_chain_key(curr); 3432 check_chain_key(curr);
@@ -3745,19 +3664,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
3745 struct task_struct *curr = current; 3664 struct task_struct *curr = current;
3746 struct held_lock *hlock; 3665 struct held_lock *hlock;
3747 unsigned int depth; 3666 unsigned int depth;
3748 int ret, i; 3667 int i;
3749 3668
3750 if (unlikely(!debug_locks)) 3669 if (unlikely(!debug_locks))
3751 return 0; 3670 return 0;
3752 3671
3753 ret = lock_release_crosslock(lock);
3754 /*
3755 * 2 means normal release operations are needed. Otherwise, it's
3756 * ok just to return with '0:fail, 1:success'.
3757 */
3758 if (ret != 2)
3759 return ret;
3760
3761 depth = curr->lockdep_depth; 3672 depth = curr->lockdep_depth;
3762 /* 3673 /*
3763 * So we're all set to release this lock.. wait what lock? We don't 3674 * So we're all set to release this lock.. wait what lock? We don't
@@ -4580,6 +4491,7 @@ retry:
4580 if (!unlock) 4491 if (!unlock)
4581 if (read_trylock(&tasklist_lock)) 4492 if (read_trylock(&tasklist_lock))
4582 unlock = 1; 4493 unlock = 1;
4494 touch_nmi_watchdog();
4583 } while_each_thread(g, p); 4495 } while_each_thread(g, p);
4584 4496
4585 pr_warn("\n"); 4497 pr_warn("\n");
@@ -4675,494 +4587,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4675 dump_stack(); 4587 dump_stack();
4676} 4588}
4677EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); 4589EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
4678
4679#ifdef CONFIG_LOCKDEP_CROSSRELEASE
4680
4681/*
4682 * Crossrelease works by recording a lock history for each thread and
4683 * connecting those historic locks that were taken after the
4684 * wait_for_completion() in the complete() context.
4685 *
4686 * Task-A Task-B
4687 *
4688 * mutex_lock(&A);
4689 * mutex_unlock(&A);
4690 *
4691 * wait_for_completion(&C);
4692 * lock_acquire_crosslock();
4693 * atomic_inc_return(&cross_gen_id);
4694 * |
4695 * | mutex_lock(&B);
4696 * | mutex_unlock(&B);
4697 * |
4698 * | complete(&C);
4699 * `-- lock_commit_crosslock();
4700 *
4701 * Which will then add a dependency between B and C.
4702 */
4703
4704#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR])
4705
4706/*
4707 * Whenever a crosslock is held, cross_gen_id will be increased.
4708 */
4709static atomic_t cross_gen_id; /* Can be wrapped */
4710
4711/*
4712 * Make an entry of the ring buffer invalid.
4713 */
4714static inline void invalidate_xhlock(struct hist_lock *xhlock)
4715{
4716 /*
4717 * Normally, xhlock->hlock.instance must be !NULL.
4718 */
4719 xhlock->hlock.instance = NULL;
4720}
4721
4722/*
4723 * Lock history stacks; we have 2 nested lock history stacks:
4724 *
4725 * HARD(IRQ)
4726 * SOFT(IRQ)
4727 *
4728 * The thing is that once we complete a HARD/SOFT IRQ the future task locks
4729 * should not depend on any of the locks observed while running the IRQ. So
4730 * what we do is rewind the history buffer and erase all our knowledge of that
4731 * temporal event.
4732 */
4733
4734void crossrelease_hist_start(enum xhlock_context_t c)
4735{
4736 struct task_struct *cur = current;
4737
4738 if (!cur->xhlocks)
4739 return;
4740
4741 cur->xhlock_idx_hist[c] = cur->xhlock_idx;
4742 cur->hist_id_save[c] = cur->hist_id;
4743}
4744
4745void crossrelease_hist_end(enum xhlock_context_t c)
4746{
4747 struct task_struct *cur = current;
4748
4749 if (cur->xhlocks) {
4750 unsigned int idx = cur->xhlock_idx_hist[c];
4751 struct hist_lock *h = &xhlock(idx);
4752
4753 cur->xhlock_idx = idx;
4754
4755 /* Check if the ring was overwritten. */
4756 if (h->hist_id != cur->hist_id_save[c])
4757 invalidate_xhlock(h);
4758 }
4759}
4760
4761/*
4762 * lockdep_invariant_state() is used to annotate independence inside a task, to
4763 * make one task look like multiple independent 'tasks'.
4764 *
4765 * Take for instance workqueues; each work is independent of the last. The
4766 * completion of a future work does not depend on the completion of a past work
4767 * (in general). Therefore we must not carry that (lock) dependency across
4768 * works.
4769 *
4770 * This is true for many things; pretty much all kthreads fall into this
4771 * pattern, where they have an invariant state and future completions do not
4772 * depend on past completions. Its just that since they all have the 'same'
4773 * form -- the kthread does the same over and over -- it doesn't typically
4774 * matter.
4775 *
4776 * The same is true for system-calls, once a system call is completed (we've
4777 * returned to userspace) the next system call does not depend on the lock
4778 * history of the previous system call.
4779 *
4780 * They key property for independence, this invariant state, is that it must be
4781 * a point where we hold no locks and have no history. Because if we were to
4782 * hold locks, the restore at _end() would not necessarily recover it's history
4783 * entry. Similarly, independence per-definition means it does not depend on
4784 * prior state.
4785 */
4786void lockdep_invariant_state(bool force)
4787{
4788 /*
4789 * We call this at an invariant point, no current state, no history.
4790 * Verify the former, enforce the latter.
4791 */
4792 WARN_ON_ONCE(!force && current->lockdep_depth);
4793 invalidate_xhlock(&xhlock(current->xhlock_idx));
4794}
4795
4796static int cross_lock(struct lockdep_map *lock)
4797{
4798 return lock ? lock->cross : 0;
4799}
4800
4801/*
4802 * This is needed to decide the relationship between wrapable variables.
4803 */
4804static inline int before(unsigned int a, unsigned int b)
4805{
4806 return (int)(a - b) < 0;
4807}
4808
4809static inline struct lock_class *xhlock_class(struct hist_lock *xhlock)
4810{
4811 return hlock_class(&xhlock->hlock);
4812}
4813
4814static inline struct lock_class *xlock_class(struct cross_lock *xlock)
4815{
4816 return hlock_class(&xlock->hlock);
4817}
4818
4819/*
4820 * Should we check a dependency with previous one?
4821 */
4822static inline int depend_before(struct held_lock *hlock)
4823{
4824 return hlock->read != 2 && hlock->check && !hlock->trylock;
4825}
4826
4827/*
4828 * Should we check a dependency with next one?
4829 */
4830static inline int depend_after(struct held_lock *hlock)
4831{
4832 return hlock->read != 2 && hlock->check;
4833}
4834
4835/*
4836 * Check if the xhlock is valid, which would be false if,
4837 *
4838 * 1. Has not used after initializaion yet.
4839 * 2. Got invalidated.
4840 *
4841 * Remind hist_lock is implemented as a ring buffer.
4842 */
4843static inline int xhlock_valid(struct hist_lock *xhlock)
4844{
4845 /*
4846 * xhlock->hlock.instance must be !NULL.
4847 */
4848 return !!xhlock->hlock.instance;
4849}
4850
4851/*
4852 * Record a hist_lock entry.
4853 *
4854 * Irq disable is only required.
4855 */
4856static void add_xhlock(struct held_lock *hlock)
4857{
4858 unsigned int idx = ++current->xhlock_idx;
4859 struct hist_lock *xhlock = &xhlock(idx);
4860
4861#ifdef CONFIG_DEBUG_LOCKDEP
4862 /*
4863 * This can be done locklessly because they are all task-local
4864 * state, we must however ensure IRQs are disabled.
4865 */
4866 WARN_ON_ONCE(!irqs_disabled());
4867#endif
4868
4869 /* Initialize hist_lock's members */
4870 xhlock->hlock = *hlock;
4871 xhlock->hist_id = ++current->hist_id;
4872
4873 xhlock->trace.nr_entries = 0;
4874 xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
4875 xhlock->trace.entries = xhlock->trace_entries;
4876
4877 if (crossrelease_fullstack) {
4878 xhlock->trace.skip = 3;
4879 save_stack_trace(&xhlock->trace);
4880 } else {
4881 xhlock->trace.nr_entries = 1;
4882 xhlock->trace.entries[0] = hlock->acquire_ip;
4883 }
4884}
4885
4886static inline int same_context_xhlock(struct hist_lock *xhlock)
4887{
4888 return xhlock->hlock.irq_context == task_irq_context(current);
4889}
4890
4891/*
4892 * This should be lockless as far as possible because this would be
4893 * called very frequently.
4894 */
4895static void check_add_xhlock(struct held_lock *hlock)
4896{
4897 /*
4898 * Record a hist_lock, only in case that acquisitions ahead
4899 * could depend on the held_lock. For example, if the held_lock
4900 * is trylock then acquisitions ahead never depends on that.
4901 * In that case, we don't need to record it. Just return.
4902 */
4903 if (!current->xhlocks || !depend_before(hlock))
4904 return;
4905
4906 add_xhlock(hlock);
4907}
4908
4909/*
4910 * For crosslock.
4911 */
4912static int add_xlock(struct held_lock *hlock)
4913{
4914 struct cross_lock *xlock;
4915 unsigned int gen_id;
4916
4917 if (!graph_lock())
4918 return 0;
4919
4920 xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock;
4921
4922 /*
4923 * When acquisitions for a crosslock are overlapped, we use
4924 * nr_acquire to perform commit for them, based on cross_gen_id
4925 * of the first acquisition, which allows additional
4926 * dependencies to be added.
4927 *
4928 * Moreover, when no acquisition of a crosslock is in progress,
4929 * we should not perform commit because the lock might not exist
4930 * any more, which might cause incorrect memory access. So we
4931 * have to track the number of acquisitions of a crosslock.
4932 *
4933 * depend_after() is necessary to initialize only the first
4934 * valid xlock so that the xlock can be used on its commit.
4935 */
4936 if (xlock->nr_acquire++ && depend_after(&xlock->hlock))
4937 goto unlock;
4938
4939 gen_id = (unsigned int)atomic_inc_return(&cross_gen_id);
4940 xlock->hlock = *hlock;
4941 xlock->hlock.gen_id = gen_id;
4942unlock:
4943 graph_unlock();
4944 return 1;
4945}
4946
4947/*
4948 * Called for both normal and crosslock acquires. Normal locks will be
4949 * pushed on the hist_lock queue. Cross locks will record state and
4950 * stop regular lock_acquire() to avoid being placed on the held_lock
4951 * stack.
4952 *
4953 * Return: 0 - failure;
4954 * 1 - crosslock, done;
4955 * 2 - normal lock, continue to held_lock[] ops.
4956 */
4957static int lock_acquire_crosslock(struct held_lock *hlock)
4958{
4959 /*
4960 * CONTEXT 1 CONTEXT 2
4961 * --------- ---------
4962 * lock A (cross)
4963 * X = atomic_inc_return(&cross_gen_id)
4964 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4965 * Y = atomic_read_acquire(&cross_gen_id)
4966 * lock B
4967 *
4968 * atomic_read_acquire() is for ordering between A and B,
4969 * IOW, A happens before B, when CONTEXT 2 sees Y >= X.
4970 *
4971 * Pairs with atomic_inc_return() in add_xlock().
4972 */
4973 hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id);
4974
4975 if (cross_lock(hlock->instance))
4976 return add_xlock(hlock);
4977
4978 check_add_xhlock(hlock);
4979 return 2;
4980}
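The comment above relies on the pairing between the fully ordered atomic_inc_return() in add_xlock() and the acquire read here. A rough user-space analogue with C11 atomics, written as a sketch (the helper names are invented and seq_cst stands in for the kernel's full-barrier increment):

    #include <stdatomic.h>

    static atomic_uint cross_gen_id;

    /* Writer (CONTEXT 1, add_xlock() analogue): take a new generation. */
    static unsigned int xlock_take_gen(void)
    {
            return atomic_fetch_add_explicit(&cross_gen_id, 1,
                                             memory_order_seq_cst) + 1;
    }

    /* Reader (CONTEXT 2): snapshot the generation before recording a lock. */
    static unsigned int hlock_snapshot_gen(void)
    {
            /* acquire keeps the later acquisition ordered after the snapshot */
            return atomic_load_explicit(&cross_gen_id, memory_order_acquire);
    }

    int main(void)
    {
            unsigned int x = xlock_take_gen();      /* lock A (cross) */
            unsigned int y = hlock_snapshot_gen();  /* before lock B  */

            return y >= x ? 0 : 1;  /* y >= x means A is ordered before B */
    }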
4981
4982static int copy_trace(struct stack_trace *trace)
4983{
4984 unsigned long *buf = stack_trace + nr_stack_trace_entries;
4985 unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
4986 unsigned int nr = min(max_nr, trace->nr_entries);
4987
4988 trace->nr_entries = nr;
4989 memcpy(buf, trace->entries, nr * sizeof(trace->entries[0]));
4990 trace->entries = buf;
4991 nr_stack_trace_entries += nr;
4992
4993 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
4994 if (!debug_locks_off_graph_unlock())
4995 return 0;
4996
4997 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
4998 dump_stack();
4999
5000 return 0;
5001 }
5002
5003 return 1;
5004}
5005
5006static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock)
5007{
5008 unsigned int xid, pid;
5009 u64 chain_key;
5010
5011 xid = xlock_class(xlock) - lock_classes;
5012 chain_key = iterate_chain_key((u64)0, xid);
5013 pid = xhlock_class(xhlock) - lock_classes;
5014 chain_key = iterate_chain_key(chain_key, pid);
5015
5016 if (lookup_chain_cache(chain_key))
5017 return 1;
5018
5019 if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context,
5020 chain_key))
5021 return 0;
5022
5023 if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1,
5024 &xhlock->trace, copy_trace))
5025 return 0;
5026
5027 return 1;
5028}
5029
5030static void commit_xhlocks(struct cross_lock *xlock)
5031{
5032 unsigned int cur = current->xhlock_idx;
5033 unsigned int prev_hist_id = xhlock(cur).hist_id;
5034 unsigned int i;
5035
5036 if (!graph_lock())
5037 return;
5038
5039 if (xlock->nr_acquire) {
5040 for (i = 0; i < MAX_XHLOCKS_NR; i++) {
5041 struct hist_lock *xhlock = &xhlock(cur - i);
5042
5043 if (!xhlock_valid(xhlock))
5044 break;
5045
5046 if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id))
5047 break;
5048
5049 if (!same_context_xhlock(xhlock))
5050 break;
5051
5052 /*
5053 * Filter out the cases where the ring buffer was
5054 * overwritten and the current entry has a bigger
5055 * hist_id than the previous one, which is impossible
5056 * otherwise:
5057 */
5058 if (unlikely(before(prev_hist_id, xhlock->hist_id)))
5059 break;
5060
5061 prev_hist_id = xhlock->hist_id;
5062
5063 /*
5064 * commit_xhlock() returns 0 with graph_lock already
5065 * released if it fails.
5066 */
5067 if (!commit_xhlock(xlock, xhlock))
5068 return;
5069 }
5070 }
5071
5072 graph_unlock();
5073}
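The hist_id test above detects ring-buffer overwrite: walking backwards, every entry's hist_id must not be newer than the one walked before it, so seeing a newer one means the slot was recycled. A small stand-alone illustration of that invariant (the array and helper are invented for the example):

    #include <assert.h>

    static inline int before(unsigned int a, unsigned int b)
    {
            return (int)(a - b) < 0;
    }

    /* Walk ids[] backwards from 'cur' and count how many entries can be
     * trusted before the monotonicity check trips on an overwritten slot. */
    static unsigned int valid_history(const unsigned int *ids, unsigned int nr,
                                      unsigned int cur)
    {
            unsigned int prev = ids[cur], i;

            for (i = 0; i < nr; i++) {
                    unsigned int idx = (cur - i) & (nr - 1);  /* nr: power of 2 */

                    if (before(prev, ids[idx]))   /* newer than its successor? */
                            break;                /* the ring wrapped; stop */
                    prev = ids[idx];
            }
            return i;
    }

    int main(void)
    {
            /* Slots 0..2 hold old history; slot 3 was just overwritten with a
             * much newer id, so only three walked entries are valid. */
            unsigned int ids[4] = { 5, 6, 7, 100 };

            assert(valid_history(ids, 4, 2) == 3);
            return 0;
    }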
5074
5075void lock_commit_crosslock(struct lockdep_map *lock)
5076{
5077 struct cross_lock *xlock;
5078 unsigned long flags;
5079
5080 if (unlikely(!debug_locks || current->lockdep_recursion))
5081 return;
5082
5083 if (!current->xhlocks)
5084 return;
5085
5086 /*
5087 * Commit hist_locks with the cross_lock only if the
5088 * cross_lock could depend on acquisitions after it.
5089 *
5090 * For example, if the cross_lock does not have the 'check' flag
5091 * then we don't need to check dependencies and commit for that.
5092 * Just skip it. In that case, of course, the cross_lock does
5093 * not depend on acquisitions ahead, either.
5094 *
5095 * WARNING: Don't do that in add_xlock() in advance. When an
5096 * acquisition context is different from the commit context,
5097 * an invalid (skipped) cross_lock might be accessed.
5098 */
5099 if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock))
5100 return;
5101
5102 raw_local_irq_save(flags);
5103 check_flags(flags);
5104 current->lockdep_recursion = 1;
5105 xlock = &((struct lockdep_map_cross *)lock)->xlock;
5106 commit_xhlocks(xlock);
5107 current->lockdep_recursion = 0;
5108 raw_local_irq_restore(flags);
5109}
5110EXPORT_SYMBOL_GPL(lock_commit_crosslock);
5111
5112/*
5113 * Return: 0 - failure;
5114 * 1 - crosslock, done;
5115 * 2 - normal lock, continue to held_lock[] ops.
5116 */
5117static int lock_release_crosslock(struct lockdep_map *lock)
5118{
5119 if (cross_lock(lock)) {
5120 if (!graph_lock())
5121 return 0;
5122 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--;
5123 graph_unlock();
5124 return 1;
5125 }
5126 return 2;
5127}
5128
5129static void cross_init(struct lockdep_map *lock, int cross)
5130{
5131 if (cross)
5132 ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0;
5133
5134 lock->cross = cross;
5135
5136 /*
5137 * Crossrelease assumes that the ring buffer size of xhlocks
5138 * is a power of 2. So enforce it at build time.
5139 */
5140 BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1));
5141}
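The BUILD_BUG_ON() above uses the fact that x & (x - 1) is zero exactly when x is a power of two, which is what allows the xhlock ring index to be reduced with a cheap mask instead of a modulo. A hedged sketch of the same idiom (the ring and its size here are illustrative, not the kernel's):

    #include <assert.h>

    #define RING_NR 64      /* must be a power of two */

    struct entry { unsigned long ip; };
    static struct entry ring[RING_NR];

    /* x & (x - 1) == 0 only for powers of two (and zero). */
    _Static_assert((RING_NR & (RING_NR - 1)) == 0, "RING_NR must be a power of 2");

    static struct entry *ring_slot(unsigned int idx)
    {
            return &ring[idx & (RING_NR - 1)];      /* cheap wrap, no division */
    }

    int main(void)
    {
            unsigned int idx = (unsigned int)-1;    /* mirrors xhlock_idx = UINT_MAX */

            ring_slot(++idx)->ip = 1;               /* first store lands in slot 0 */
            assert(ring_slot(0)->ip == 1);
            return 0;
    }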
5142
5143void lockdep_init_task(struct task_struct *task)
5144{
5145 int i;
5146
5147 task->xhlock_idx = UINT_MAX;
5148 task->hist_id = 0;
5149
5150 for (i = 0; i < XHLOCK_CTX_NR; i++) {
5151 task->xhlock_idx_hist[i] = UINT_MAX;
5152 task->hist_id_save[i] = 0;
5153 }
5154
5155 task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR,
5156 GFP_KERNEL);
5157}
5158
5159void lockdep_free_task(struct task_struct *task)
5160{
5161 if (task->xhlocks) {
5162 void *tmp = task->xhlocks;
5163 /* Disable crossrelease for current */
5164 task->xhlocks = NULL;
5165 kfree(tmp);
5166 }
5167}
5168#endif
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 6f3dba6e4e9e..65cc0cb984e6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
1290 return ret; 1290 return ret;
1291} 1291}
1292 1292
1293static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
1294{
1295 int ret = try_to_take_rt_mutex(lock, current, NULL);
1296
1297 /*
1298 * try_to_take_rt_mutex() sets the lock waiters bit
1299 * unconditionally. Clean this up.
1300 */
1301 fixup_rt_mutex_waiters(lock);
1302
1303 return ret;
1304}
1305
1293/* 1306/*
1294 * Slow path try-lock function: 1307 * Slow path try-lock function:
1295 */ 1308 */
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
1312 */ 1325 */
1313 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1326 raw_spin_lock_irqsave(&lock->wait_lock, flags);
1314 1327
1315 ret = try_to_take_rt_mutex(lock, current, NULL); 1328 ret = __rt_mutex_slowtrylock(lock);
1316
1317 /*
1318 * try_to_take_rt_mutex() sets the lock waiters bit
1319 * unconditionally. Clean this up.
1320 */
1321 fixup_rt_mutex_waiters(lock);
1322 1329
1323 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1330 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1324 1331
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
1505 return rt_mutex_slowtrylock(lock); 1512 return rt_mutex_slowtrylock(lock);
1506} 1513}
1507 1514
1515int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
1516{
1517 return __rt_mutex_slowtrylock(lock);
1518}
1519
1508/** 1520/**
1509 * rt_mutex_timed_lock - lock a rt_mutex interruptible 1521 * rt_mutex_timed_lock - lock a rt_mutex interruptible
1510 * the timeout structure is provided 1522 * the timeout structure is provided
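The hunk above is the usual "split out a locked-context helper" refactor: __rt_mutex_slowtrylock() assumes wait_lock is already held, so a futex path that manages that lock itself can call it directly. A generic pthread-based sketch of the shape, purely illustrative and not the rtmutex code (build with -pthread):

    #include <stdbool.h>
    #include <pthread.h>

    struct obj {
            pthread_mutex_t wait_lock;
            bool owned;
    };

    /* Inner helper: the caller must already hold obj->wait_lock. */
    static bool __obj_trytake(struct obj *o)
    {
            if (o->owned)
                    return false;
            o->owned = true;
            return true;
    }

    /* Ordinary slow path: takes and drops wait_lock around the helper. */
    static bool obj_trytake(struct obj *o)
    {
            bool ret;

            pthread_mutex_lock(&o->wait_lock);
            ret = __obj_trytake(o);
            pthread_mutex_unlock(&o->wait_lock);
            return ret;
    }

    int main(void)
    {
            struct obj o = { .wait_lock = PTHREAD_MUTEX_INITIALIZER, .owned = false };

            /* the first trylock succeeds, the second one fails */
            return obj_trytake(&o) && !obj_trytake(&o) ? 0 : 1;
    }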
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 124e98ca0b17..68686b3ec3c1 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
148 struct rt_mutex_waiter *waiter); 148 struct rt_mutex_waiter *waiter);
149 149
150extern int rt_mutex_futex_trylock(struct rt_mutex *l); 150extern int rt_mutex_futex_trylock(struct rt_mutex *l);
151extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
151 152
152extern void rt_mutex_futex_unlock(struct rt_mutex *lock); 153extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
153extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, 154extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 1fd1a7543cdd..936f3d14dd6b 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
66 break; \ 66 break; \
67 preempt_enable(); \ 67 preempt_enable(); \
68 \ 68 \
69 if (!(lock)->break_lock) \ 69 arch_##op##_relax(&lock->raw_lock); \
70 (lock)->break_lock = 1; \
71 while ((lock)->break_lock) \
72 arch_##op##_relax(&lock->raw_lock); \
73 } \ 70 } \
74 (lock)->break_lock = 0; \
75} \ 71} \
76 \ 72 \
77unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ 73unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
86 local_irq_restore(flags); \ 82 local_irq_restore(flags); \
87 preempt_enable(); \ 83 preempt_enable(); \
88 \ 84 \
89 if (!(lock)->break_lock) \ 85 arch_##op##_relax(&lock->raw_lock); \
90 (lock)->break_lock = 1; \
91 while ((lock)->break_lock) \
92 arch_##op##_relax(&lock->raw_lock); \
93 } \ 86 } \
94 (lock)->break_lock = 0; \ 87 \
95 return flags; \ 88 return flags; \
96} \ 89} \
97 \ 90 \
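With break_lock gone, the generic slow path above reduces to "trylock; if that fails, relax and retry". A stand-alone sketch of that loop shape using C11 atomics (cpu_relax() is a placeholder for arch_*_relax(), and the preempt_disable()/enable() of the real macro is omitted):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_flag lock_word = ATOMIC_FLAG_INIT;

    static bool raw_trylock(void)
    {
            return !atomic_flag_test_and_set_explicit(&lock_word,
                                                      memory_order_acquire);
    }

    static void cpu_relax(void)
    {
            /* stand-in for arch_*_relax(); on x86 this would be a PAUSE hint */
    }

    /* Shape of the slow path after the break_lock removal. */
    static void raw_lock(void)
    {
            while (!raw_trylock())
                    cpu_relax();
    }

    static void raw_unlock(void)
    {
            atomic_flag_clear_explicit(&lock_word, memory_order_release);
    }

    int main(void)
    {
            raw_lock();
            raw_unlock();
            return 0;
    }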
diff --git a/kernel/pid.c b/kernel/pid.c
index b13b624e2c49..1e8bb6550ec4 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -193,10 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
193 } 193 }
194 194
195 if (unlikely(is_child_reaper(pid))) { 195 if (unlikely(is_child_reaper(pid))) {
196 if (pid_ns_prepare_proc(ns)) { 196 if (pid_ns_prepare_proc(ns))
197 disable_pid_allocation(ns);
198 goto out_free; 197 goto out_free;
199 }
200 } 198 }
201 199
202 get_pid_ns(ns); 200 get_pid_ns(ns);
@@ -226,6 +224,10 @@ out_free:
226 while (++i <= ns->level) 224 while (++i <= ns->level)
227 idr_remove(&ns->idr, (pid->numbers + i)->nr); 225 idr_remove(&ns->idr, (pid->numbers + i)->nr);
228 226
227 /* On failure to allocate the first pid, reset the state */
228 if (ns->pid_allocated == PIDNS_ADDING)
229 idr_set_cursor(&ns->idr, 0);
230
229 spin_unlock_irq(&pidmap_lock); 231 spin_unlock_irq(&pidmap_lock);
230 232
231 kmem_cache_free(ns->pid_cachep, pid); 233 kmem_cache_free(ns->pid_cachep, pid);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 5d81206a572d..b9006617710f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl)
3141void show_regs_print_info(const char *log_lvl) 3141void show_regs_print_info(const char *log_lvl)
3142{ 3142{
3143 dump_stack_print_info(log_lvl); 3143 dump_stack_print_info(log_lvl);
3144
3145 printk("%stask: %p task.stack: %p\n",
3146 log_lvl, current, task_stack_page(current));
3147} 3144}
3148 3145
3149#endif 3146#endif
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 2ddaec40956f..0926aef10dad 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -34,11 +34,6 @@ void complete(struct completion *x)
34 34
35 spin_lock_irqsave(&x->wait.lock, flags); 35 spin_lock_irqsave(&x->wait.lock, flags);
36 36
37 /*
38 * Perform commit of crossrelease here.
39 */
40 complete_release_commit(x);
41
42 if (x->done != UINT_MAX) 37 if (x->done != UINT_MAX)
43 x->done++; 38 x->done++;
44 __wake_up_locked(&x->wait, TASK_NORMAL, 1); 39 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 75554f366fd3..a7bf32aabfda 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2056 p->state = TASK_WAKING; 2056 p->state = TASK_WAKING;
2057 2057
2058 if (p->in_iowait) { 2058 if (p->in_iowait) {
2059 delayacct_blkio_end(); 2059 delayacct_blkio_end(p);
2060 atomic_dec(&task_rq(p)->nr_iowait); 2060 atomic_dec(&task_rq(p)->nr_iowait);
2061 } 2061 }
2062 2062
@@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2069#else /* CONFIG_SMP */ 2069#else /* CONFIG_SMP */
2070 2070
2071 if (p->in_iowait) { 2071 if (p->in_iowait) {
2072 delayacct_blkio_end(); 2072 delayacct_blkio_end(p);
2073 atomic_dec(&task_rq(p)->nr_iowait); 2073 atomic_dec(&task_rq(p)->nr_iowait);
2074 } 2074 }
2075 2075
@@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2122 2122
2123 if (!task_on_rq_queued(p)) { 2123 if (!task_on_rq_queued(p)) {
2124 if (p->in_iowait) { 2124 if (p->in_iowait) {
2125 delayacct_blkio_end(); 2125 delayacct_blkio_end(p);
2126 atomic_dec(&rq->nr_iowait); 2126 atomic_dec(&rq->nr_iowait);
2127 } 2127 }
2128 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); 2128 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
@@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5097 return ret; 5097 return ret;
5098} 5098}
5099 5099
5100/**
5101 * sys_sched_rr_get_interval - return the default timeslice of a process.
5102 * @pid: pid of the process.
5103 * @interval: userspace pointer to the timeslice value.
5104 *
5105 * this syscall writes the default timeslice value of a given process
5106 * into the user-space timespec buffer. A value of '0' means infinity.
5107 *
5108 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5109 * an error code.
5110 */
5111static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) 5100static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
5112{ 5101{
5113 struct task_struct *p; 5102 struct task_struct *p;
@@ -5144,6 +5133,17 @@ out_unlock:
5144 return retval; 5133 return retval;
5145} 5134}
5146 5135
5136/**
5137 * sys_sched_rr_get_interval - return the default timeslice of a process.
5138 * @pid: pid of the process.
5139 * @interval: userspace pointer to the timeslice value.
5140 *
5141 * this syscall writes the default timeslice value of a given process
5142 * into the user-space timespec buffer. A value of '0' means infinity.
5143 *
5144 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5145 * an error code.
5146 */
5147SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5147SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5148 struct timespec __user *, interval) 5148 struct timespec __user *, interval)
5149{ 5149{
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 2f52ec0f1539..d6717a3331a1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
244#ifdef CONFIG_NO_HZ_COMMON 244#ifdef CONFIG_NO_HZ_COMMON
245static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) 245static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
246{ 246{
247 unsigned long idle_calls = tick_nohz_get_idle_calls(); 247 unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
248 bool ret = idle_calls == sg_cpu->saved_idle_calls; 248 bool ret = idle_calls == sg_cpu->saved_idle_calls;
249 249
250 sg_cpu->saved_idle_calls = idle_calls; 250 sg_cpu->saved_idle_calls = idle_calls;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4037e19bbca2..26a71ebcd3c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se,
3413 * _IFF_ we look at the pure running and runnable sums. Because they 3413 * _IFF_ we look at the pure running and runnable sums. Because they
3414 * represent the very same entity, just at different points in the hierarchy. 3414 * represent the very same entity, just at different points in the hierarchy.
3415 * 3415 *
3416 * 3416 * Per the above update_tg_cfs_util() is trivial and simply copies the running
3417 * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and 3417 * sum over (but still wrong, because the group entity and group rq do not have
3418 * simply copies the running sum over. 3418 * their PELT windows aligned).
3419 * 3419 *
3420 * However, update_tg_cfs_runnable() is more complex. So we have: 3420 * However, update_tg_cfs_runnable() is more complex. So we have:
3421 * 3421 *
@@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se,
3424 * And since, like util, the runnable part should be directly transferable, 3424 * And since, like util, the runnable part should be directly transferable,
3425 * the following would _appear_ to be the straight forward approach: 3425 * the following would _appear_ to be the straight forward approach:
3426 * 3426 *
3427 * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) 3427 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
3428 * 3428 *
3429 * And per (1) we have: 3429 * And per (1) we have:
3430 * 3430 *
3431 * ge->avg.running_avg == grq->avg.running_avg 3431 * ge->avg.runnable_avg == grq->avg.runnable_avg
3432 * 3432 *
3433 * Which gives: 3433 * Which gives:
3434 * 3434 *
@@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se,
3447 * to (shortly) return to us. This only works by keeping the weights as 3447 * to (shortly) return to us. This only works by keeping the weights as
3448 * integral part of the sum. We therefore cannot decompose as per (3). 3448 * integral part of the sum. We therefore cannot decompose as per (3).
3449 * 3449 *
3450 * OK, so what then? 3450 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3451 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3452 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3453 * runnable section of these tasks overlap (or not). If they were to perfectly
3454 * align the rq as a whole would be runnable 2/3 of the time. If however we
3455 * always have at least 1 runnable task, the rq as a whole is always runnable.
3451 * 3456 *
3457 * So we'll have to approximate.. :/
3452 * 3458 *
3453 * Another way to look at things is: 3459 * Given the constraint:
3454 * 3460 *
3455 * grq->avg.load_avg = \Sum se->avg.load_avg 3461 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
3456 * 3462 *
3457 * Therefore, per (2): 3463 * We can construct a rule that adds runnable to a rq by assuming minimal
3464 * overlap.
3458 * 3465 *
3459 * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg 3466 * On removal, we'll assume each task is equally runnable; which yields:
3460 * 3467 *
3461 * And the very thing we're propagating is a change in that sum (someone 3468 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
3462 * joined/left). So we can easily know the runnable change, which would be, per
3463 * (2) the already tracked se->load_avg divided by the corresponding
3464 * se->weight.
3465 * 3469 *
3466 * Basically (4) but in differential form: 3470 * XXX: only do this for the part of runnable > running ?
3467 * 3471 *
3468 * d(runnable_avg) += se->avg.load_avg / se->load.weight
3469 * (5)
3470 * ge->avg.load_avg += ge->load.weight * d(runnable_avg)
3471 */ 3472 */
3472 3473
3473static inline void 3474static inline void
@@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
3479 if (!delta) 3480 if (!delta)
3480 return; 3481 return;
3481 3482
3483 /*
3484 * The relation between sum and avg is:
3485 *
3486 * LOAD_AVG_MAX - 1024 + sa->period_contrib
3487 *
3488 * however, the PELT windows are not aligned between grq and gse.
3489 */
3490
3482 /* Set new sched_entity's utilization */ 3491 /* Set new sched_entity's utilization */
3483 se->avg.util_avg = gcfs_rq->avg.util_avg; 3492 se->avg.util_avg = gcfs_rq->avg.util_avg;
3484 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; 3493 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
@@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
3491static inline void 3500static inline void
3492update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3501update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3493{ 3502{
3494 long runnable_sum = gcfs_rq->prop_runnable_sum; 3503 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3495 long runnable_load_avg, load_avg; 3504 unsigned long runnable_load_avg, load_avg;
3496 s64 runnable_load_sum, load_sum; 3505 u64 runnable_load_sum, load_sum = 0;
3506 s64 delta_sum;
3497 3507
3498 if (!runnable_sum) 3508 if (!runnable_sum)
3499 return; 3509 return;
3500 3510
3501 gcfs_rq->prop_runnable_sum = 0; 3511 gcfs_rq->prop_runnable_sum = 0;
3502 3512
3513 if (runnable_sum >= 0) {
3514 /*
3515 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3516 * the CPU is saturated running == runnable.
3517 */
3518 runnable_sum += se->avg.load_sum;
3519 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3520 } else {
3521 /*
3522 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3523 * assuming all tasks are equally runnable.
3524 */
3525 if (scale_load_down(gcfs_rq->load.weight)) {
3526 load_sum = div_s64(gcfs_rq->avg.load_sum,
3527 scale_load_down(gcfs_rq->load.weight));
3528 }
3529
3530 /* But make sure to not inflate se's runnable */
3531 runnable_sum = min(se->avg.load_sum, load_sum);
3532 }
3533
3534 /*
3535 * runnable_sum can't be lower than running_sum
3536 * As running_sum is scaled with CPU capacity whereas the runnable sum
3537 * is not, we rescale running_sum first
3538 */
3539 running_sum = se->avg.util_sum /
3540 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3541 runnable_sum = max(runnable_sum, running_sum);
3542
3503 load_sum = (s64)se_weight(se) * runnable_sum; 3543 load_sum = (s64)se_weight(se) * runnable_sum;
3504 load_avg = div_s64(load_sum, LOAD_AVG_MAX); 3544 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3505 3545
3506 add_positive(&se->avg.load_sum, runnable_sum); 3546 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3507 add_positive(&se->avg.load_avg, load_avg); 3547 delta_avg = load_avg - se->avg.load_avg;
3508 3548
3509 add_positive(&cfs_rq->avg.load_avg, load_avg); 3549 se->avg.load_sum = runnable_sum;
3510 add_positive(&cfs_rq->avg.load_sum, load_sum); 3550 se->avg.load_avg = load_avg;
3551 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3552 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3511 3553
3512 runnable_load_sum = (s64)se_runnable(se) * runnable_sum; 3554 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3513 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); 3555 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3556 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3557 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3514 3558
3515 add_positive(&se->avg.runnable_load_sum, runnable_sum); 3559 se->avg.runnable_load_sum = runnable_sum;
3516 add_positive(&se->avg.runnable_load_avg, runnable_load_avg); 3560 se->avg.runnable_load_avg = runnable_load_avg;
3517 3561
3518 if (se->on_rq) { 3562 if (se->on_rq) {
3519 add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); 3563 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3520 add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); 3564 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3521 } 3565 }
3522} 3566}
3523 3567
@@ -4321,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void)
4321 4365
4322void cfs_bandwidth_usage_inc(void) 4366void cfs_bandwidth_usage_inc(void)
4323{ 4367{
4324 static_key_slow_inc(&__cfs_bandwidth_used); 4368 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4325} 4369}
4326 4370
4327void cfs_bandwidth_usage_dec(void) 4371void cfs_bandwidth_usage_dec(void)
4328{ 4372{
4329 static_key_slow_dec(&__cfs_bandwidth_used); 4373 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4330} 4374}
4331#else /* HAVE_JUMP_LABEL */ 4375#else /* HAVE_JUMP_LABEL */
4332static bool cfs_bandwidth_used(void) 4376static bool cfs_bandwidth_used(void)
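A toy numeric walk-through of the two branches in update_tg_cfs_runnable() above; the values and the flat use of load_sum are made up purely for illustration (real PELT sums are decayed and weight-scaled):

    #include <stdio.h>

    #define LOAD_AVG_MAX 47742L

    static long min_l(long a, long b) { return a < b ? a : b; }

    int main(void)
    {
            /* Adding runnable: assume minimal overlap and clip at LOAD_AVG_MAX. */
            long se_load_sum = 30000, prop_runnable_sum = 25000;
            long added = min_l(prop_runnable_sum + se_load_sum, LOAD_AVG_MAX);

            /* Removing runnable: assume all tasks are equally runnable, so divide
             * the weighted group load_sum back out by the (scaled-down) weight. */
            long grq_load_sum = 40000, grq_weight = 2;
            long removed_est = grq_load_sum / grq_weight;

            printf("add:    runnable_sum = %ld (clipped at %ld)\n", added, LOAD_AVG_MAX);
            printf("remove: estimated runnable_sum = %ld\n", removed_est);
            return 0;
    }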
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index dd7908743dab..9bcbacba82a8 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void)
89 rcu_read_unlock(); 89 rcu_read_unlock();
90 } 90 }
91 if (!fallback) { 91 if (!fallback) {
92 preempt_disable();
92 smp_call_function_many(tmpmask, ipi_mb, NULL, 1); 93 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
94 preempt_enable();
93 free_cpumask_var(tmpmask); 95 free_cpumask_var(tmpmask);
94 } 96 }
95 cpus_read_unlock(); 97 cpus_read_unlock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4056c19ca3f0..665ace2fc558 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq)
2034 bool resched = false; 2034 bool resched = false;
2035 struct task_struct *p; 2035 struct task_struct *p;
2036 struct rq *src_rq; 2036 struct rq *src_rq;
2037 int rt_overload_count = rt_overloaded(this_rq);
2037 2038
2038 if (likely(!rt_overloaded(this_rq))) 2039 if (likely(!rt_overload_count))
2039 return; 2040 return;
2040 2041
2041 /* 2042 /*
@@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq)
2044 */ 2045 */
2045 smp_rmb(); 2046 smp_rmb();
2046 2047
2048 /* If we are the only overloaded CPU do nothing */
2049 if (rt_overload_count == 1 &&
2050 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2051 return;
2052
2047#ifdef HAVE_RT_PUSH_IPI 2053#ifdef HAVE_RT_PUSH_IPI
2048 if (sched_feat(RT_PUSH_IPI)) { 2054 if (sched_feat(RT_PUSH_IPI)) {
2049 tell_cpu_to_push(this_rq); 2055 tell_cpu_to_push(this_rq);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 98feab7933c7..929ecb7d6b78 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
27 27
28 wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; 28 wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE;
29 spin_lock_irqsave(&wq_head->lock, flags); 29 spin_lock_irqsave(&wq_head->lock, flags);
30 __add_wait_queue_entry_tail(wq_head, wq_entry); 30 __add_wait_queue(wq_head, wq_entry);
31 spin_unlock_irqrestore(&wq_head->lock, flags); 31 spin_unlock_irqrestore(&wq_head->lock, flags);
32} 32}
33EXPORT_SYMBOL(add_wait_queue); 33EXPORT_SYMBOL(add_wait_queue);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e776fc8cc1df..f6b5f19223d6 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -95,6 +95,7 @@ config NO_HZ_FULL
95 select RCU_NOCB_CPU 95 select RCU_NOCB_CPU
96 select VIRT_CPU_ACCOUNTING_GEN 96 select VIRT_CPU_ACCOUNTING_GEN
97 select IRQ_WORK 97 select IRQ_WORK
98 select CPU_ISOLATION
98 help 99 help
99 Adaptively try to shutdown the tick whenever possible, even when 100 Adaptively try to shutdown the tick whenever possible, even when
100 the CPU is running tasks. Typically this requires running a single 101 the CPU is running tasks. Typically this requires running a single
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index d32520840fde..aa9d2a2b1210 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer,
655static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) 655static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
656{ 656{
657 base->expires_next = KTIME_MAX; 657 base->expires_next = KTIME_MAX;
658 base->hang_detected = 0;
658 base->hres_active = 0; 659 base->hres_active = 0;
660 base->next_timer = NULL;
659} 661}
660 662
661/* 663/*
@@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
1589 timerqueue_init_head(&cpu_base->clock_base[i].active); 1591 timerqueue_init_head(&cpu_base->clock_base[i].active);
1590 } 1592 }
1591 1593
1594 cpu_base->active_bases = 0;
1592 cpu_base->cpu = cpu; 1595 cpu_base->cpu = cpu;
1593 hrtimer_init_hres(cpu_base); 1596 hrtimer_init_hres(cpu_base);
1594 return 0; 1597 return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 13d6881f908b..ec999f32c840 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event)
434{ 434{
435 struct task_struct *rtn = current->group_leader; 435 struct task_struct *rtn = current->group_leader;
436 436
437 if ((event->sigev_notify & SIGEV_THREAD_ID ) && 437 switch (event->sigev_notify) {
438 (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || 438 case SIGEV_SIGNAL | SIGEV_THREAD_ID:
439 !same_thread_group(rtn, current) || 439 rtn = find_task_by_vpid(event->sigev_notify_thread_id);
440 (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) 440 if (!rtn || !same_thread_group(rtn, current))
441 return NULL;
442 /* FALLTHRU */
443 case SIGEV_SIGNAL:
444 case SIGEV_THREAD:
445 if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX)
446 return NULL;
447 /* FALLTHRU */
448 case SIGEV_NONE:
449 return task_pid(rtn);
450 default:
441 return NULL; 451 return NULL;
442 452 }
443 if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) &&
444 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
445 return NULL;
446
447 return task_pid(rtn);
448} 453}
449 454
450static struct k_itimer * alloc_posix_timer(void) 455static struct k_itimer * alloc_posix_timer(void)
@@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
669 struct timespec64 ts64; 674 struct timespec64 ts64;
670 bool sig_none; 675 bool sig_none;
671 676
672 sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; 677 sig_none = timr->it_sigev_notify == SIGEV_NONE;
673 iv = timr->it_interval; 678 iv = timr->it_interval;
674 679
675 /* interval timer ? */ 680 /* interval timer ? */
@@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags,
856 861
857 timr->it_interval = timespec64_to_ktime(new_setting->it_interval); 862 timr->it_interval = timespec64_to_ktime(new_setting->it_interval);
858 expires = timespec64_to_ktime(new_setting->it_value); 863 expires = timespec64_to_ktime(new_setting->it_value);
859 sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; 864 sigev_none = timr->it_sigev_notify == SIGEV_NONE;
860 865
861 kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); 866 kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none);
862 timr->it_active = !sigev_none; 867 timr->it_active = !sigev_none;
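For reference, this is roughly what a valid SIGEV_SIGNAL sigevent looks like from user space; the reworked good_sigevent() switch is what rejects combinations outside the ones it lists. A minimal sketch (link with -lrt on older glibc):

    #include <signal.h>
    #include <time.h>
    #include <stdio.h>

    int main(void)
    {
            struct sigevent sev = { 0 };
            timer_t tid;

            /* SIGEV_SIGNAL must carry a valid signal number (0 < signo <= SIGRTMAX),
             * which is exactly what the reworked switch enforces. */
            sev.sigev_notify = SIGEV_SIGNAL;
            sev.sigev_signo = SIGRTMIN;

            if (timer_create(CLOCK_MONOTONIC, &sev, &tid)) {
                    perror("timer_create");
                    return 1;
            }
            timer_delete(tid);
            return 0;
    }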
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99578f06c8d4..f7cc7abfcf25 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
650 ts->next_tick = 0; 650 ts->next_tick = 0;
651} 651}
652 652
653static inline bool local_timer_softirq_pending(void)
654{
655 return local_softirq_pending() & TIMER_SOFTIRQ;
656}
657
653static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 658static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
654 ktime_t now, int cpu) 659 ktime_t now, int cpu)
655{ 660{
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
666 } while (read_seqretry(&jiffies_lock, seq)); 671 } while (read_seqretry(&jiffies_lock, seq));
667 ts->last_jiffies = basejiff; 672 ts->last_jiffies = basejiff;
668 673
669 if (rcu_needs_cpu(basemono, &next_rcu) || 674 /*
670 arch_needs_cpu() || irq_work_needs_cpu()) { 675 * Keep the periodic tick, when RCU, architecture or irq_work
676 * requests it.
677 * Apart from that, check whether the local timer softirq is
678 * pending. If so, it's a bad idea to call get_next_timer_interrupt()
679 * because there is an already expired timer, so it will request
680 * immediate expiry, which rearms the hardware timer with a
681 * minimal delta which brings us back to this place
682 * immediately. Lather, rinse and repeat...
683 */
684 if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
685 irq_work_needs_cpu() || local_timer_softirq_pending()) {
671 next_tick = basemono + TICK_NSEC; 686 next_tick = basemono + TICK_NSEC;
672 } else { 687 } else {
673 /* 688 /*
@@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void)
986} 1001}
987 1002
988/** 1003/**
1004 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
1005 * for a particular CPU.
1006 *
1007 * Called from the schedutil frequency scaling governor in scheduler context.
1008 */
1009unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
1010{
1011 struct tick_sched *ts = tick_get_tick_sched(cpu);
1012
1013 return ts->idle_calls;
1014}
1015
1016/**
989 * tick_nohz_get_idle_calls - return the current idle calls counter value 1017 * tick_nohz_get_idle_calls - return the current idle calls counter value
990 * 1018 *
991 * Called from the schedutil frequency scaling governor in scheduler context. 1019 * Called from the schedutil frequency scaling governor in scheduler context.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf878fba..0bcf00e3ce48 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
823 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); 823 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
824 824
825 /* 825 /*
826 * If the timer is deferrable and nohz is active then we need to use 826 * If the timer is deferrable and NO_HZ_COMMON is set then we need
827 * the deferrable base. 827 * to use the deferrable base.
828 */ 828 */
829 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && 829 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
830 (tflags & TIMER_DEFERRABLE))
831 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); 830 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
832 return base; 831 return base;
833} 832}
@@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
837 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); 836 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
838 837
839 /* 838 /*
840 * If the timer is deferrable and nohz is active then we need to use 839 * If the timer is deferrable and NO_HZ_COMMON is set then we need
841 * the deferrable base. 840 * to use the deferrable base.
842 */ 841 */
843 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && 842 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
844 (tflags & TIMER_DEFERRABLE))
845 base = this_cpu_ptr(&timer_bases[BASE_DEF]); 843 base = this_cpu_ptr(&timer_bases[BASE_DEF]);
846 return base; 844 return base;
847} 845}
@@ -1009,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
1009 if (!ret && (options & MOD_TIMER_PENDING_ONLY)) 1007 if (!ret && (options & MOD_TIMER_PENDING_ONLY))
1010 goto out_unlock; 1008 goto out_unlock;
1011 1009
1012 debug_activate(timer, expires);
1013
1014 new_base = get_target_base(base, timer->flags); 1010 new_base = get_target_base(base, timer->flags);
1015 1011
1016 if (base != new_base) { 1012 if (base != new_base) {
@@ -1034,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option
1034 } 1030 }
1035 } 1031 }
1036 1032
1033 debug_activate(timer, expires);
1034
1037 timer->expires = expires; 1035 timer->expires = expires;
1038 /* 1036 /*
1039 * If 'idx' was calculated above and the base time did not advance 1037 * If 'idx' was calculated above and the base time did not advance
@@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
1684 base->must_forward_clk = false; 1682 base->must_forward_clk = false;
1685 1683
1686 __run_timers(base); 1684 __run_timers(base);
1687 if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) 1685 if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
1688 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); 1686 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
1689} 1687}
1690 1688
@@ -1698,7 +1696,7 @@ void run_local_timers(void)
1698 hrtimer_run_queues(); 1696 hrtimer_run_queues();
1699 /* Raise the softirq only if required. */ 1697 /* Raise the softirq only if required. */
1700 if (time_before(jiffies, base->clk)) { 1698 if (time_before(jiffies, base->clk)) {
1701 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) 1699 if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
1702 return; 1700 return;
1703 /* CPU is awake, so check the deferrable base. */ 1701 /* CPU is awake, so check the deferrable base. */
1704 base++; 1702 base++;
@@ -1855,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
1855 } 1853 }
1856} 1854}
1857 1855
1856int timers_prepare_cpu(unsigned int cpu)
1857{
1858 struct timer_base *base;
1859 int b;
1860
1861 for (b = 0; b < NR_BASES; b++) {
1862 base = per_cpu_ptr(&timer_bases[b], cpu);
1863 base->clk = jiffies;
1864 base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
1865 base->is_idle = false;
1866 base->must_forward_clk = true;
1867 }
1868 return 0;
1869}
1870
1858int timers_dead_cpu(unsigned int cpu) 1871int timers_dead_cpu(unsigned int cpu)
1859{ 1872{
1860 struct timer_base *old_base; 1873 struct timer_base *old_base;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..f54dc62b599c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS
164 bool "Enable trace events for preempt and irq disable/enable" 164 bool "Enable trace events for preempt and irq disable/enable"
165 select TRACE_IRQFLAGS 165 select TRACE_IRQFLAGS
166 depends on DEBUG_PREEMPT || !PROVE_LOCKING 166 depends on DEBUG_PREEMPT || !PROVE_LOCKING
167 depends on TRACING
167 default n 168 default n
168 help 169 help
169 Enable tracing of disable and enable events for preemption and irqs. 170 Enable tracing of disable and enable events for preemption and irqs.
@@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
354 on if you need to profile the system's use of these macros. 355 on if you need to profile the system's use of these macros.
355 356
356config PROFILE_ALL_BRANCHES 357config PROFILE_ALL_BRANCHES
357 bool "Profile all if conditionals" 358 bool "Profile all if conditionals" if !FORTIFY_SOURCE
358 select TRACE_BRANCH_PROFILING 359 select TRACE_BRANCH_PROFILING
359 help 360 help
360 This tracer profiles all branch conditions. Every if () 361 This tracer profiles all branch conditions. Every if ()
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 27d1f4ffa3de..40207c2a4113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
343 .arg4_type = ARG_CONST_SIZE, 343 .arg4_type = ARG_CONST_SIZE,
344}; 344};
345 345
346static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); 346static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
347 347
348static __always_inline u64 348static __always_inline u64
349__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, 349__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
350 u64 flags, struct perf_raw_record *raw) 350 u64 flags, struct perf_sample_data *sd)
351{ 351{
352 struct bpf_array *array = container_of(map, struct bpf_array, map); 352 struct bpf_array *array = container_of(map, struct bpf_array, map);
353 struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
354 unsigned int cpu = smp_processor_id(); 353 unsigned int cpu = smp_processor_id();
355 u64 index = flags & BPF_F_INDEX_MASK; 354 u64 index = flags & BPF_F_INDEX_MASK;
356 struct bpf_event_entry *ee; 355 struct bpf_event_entry *ee;
@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
373 if (unlikely(event->oncpu != cpu)) 372 if (unlikely(event->oncpu != cpu))
374 return -EOPNOTSUPP; 373 return -EOPNOTSUPP;
375 374
376 perf_sample_data_init(sd, 0, 0);
377 sd->raw = raw;
378 perf_event_output(event, sd, regs); 375 perf_event_output(event, sd, regs);
379 return 0; 376 return 0;
380} 377}
@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
382BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, 379BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
383 u64, flags, void *, data, u64, size) 380 u64, flags, void *, data, u64, size)
384{ 381{
382 struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
385 struct perf_raw_record raw = { 383 struct perf_raw_record raw = {
386 .frag = { 384 .frag = {
387 .size = size, 385 .size = size,
@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
392 if (unlikely(flags & ~(BPF_F_INDEX_MASK))) 390 if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
393 return -EINVAL; 391 return -EINVAL;
394 392
395 return __bpf_perf_event_output(regs, map, flags, &raw); 393 perf_sample_data_init(sd, 0, 0);
394 sd->raw = &raw;
395
396 return __bpf_perf_event_output(regs, map, flags, sd);
396} 397}
397 398
398static const struct bpf_func_proto bpf_perf_event_output_proto = { 399static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
407}; 408};
408 409
409static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); 410static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
411static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
410 412
411u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, 413u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
412 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) 414 void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
413{ 415{
416 struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
414 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); 417 struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
415 struct perf_raw_frag frag = { 418 struct perf_raw_frag frag = {
416 .copy = ctx_copy, 419 .copy = ctx_copy,
@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
428 }; 431 };
429 432
430 perf_fetch_caller_regs(regs); 433 perf_fetch_caller_regs(regs);
434 perf_sample_data_init(sd, 0, 0);
435 sd->raw = &raw;
431 436
432 return __bpf_perf_event_output(regs, map, flags, &raw); 437 return __bpf_perf_event_output(regs, map, flags, sd);
433} 438}
434 439
435BPF_CALL_0(bpf_get_current_task) 440BPF_CALL_0(bpf_get_current_task)
@@ -759,6 +764,8 @@ const struct bpf_prog_ops perf_event_prog_ops = {
759 764
760static DEFINE_MUTEX(bpf_event_mutex); 765static DEFINE_MUTEX(bpf_event_mutex);
761 766
767#define BPF_TRACE_MAX_PROGS 64
768
762int perf_event_attach_bpf_prog(struct perf_event *event, 769int perf_event_attach_bpf_prog(struct perf_event *event,
763 struct bpf_prog *prog) 770 struct bpf_prog *prog)
764{ 771{
@@ -772,6 +779,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
772 goto unlock; 779 goto unlock;
773 780
774 old_array = event->tp_event->prog_array; 781 old_array = event->tp_event->prog_array;
782 if (old_array &&
783 bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
784 ret = -E2BIG;
785 goto unlock;
786 }
787
775 ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); 788 ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
776 if (ret < 0) 789 if (ret < 0)
777 goto unlock; 790 goto unlock;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ccdf3664e4a9..554b517c61a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = {
1119}; 1119};
1120 1120
1121/* 1121/*
1122 * This is used by __kernel_text_address() to return true if the 1122 * Used by the stack unwinder to know about dynamic ftrace trampolines.
1123 * address is on a dynamically allocated trampoline that would
1124 * not return true for either core_kernel_text() or
1125 * is_module_text_address().
1126 */ 1123 */
1127bool is_ftrace_trampoline(unsigned long addr) 1124struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
1128{ 1125{
1129 struct ftrace_ops *op; 1126 struct ftrace_ops *op = NULL;
1130 bool ret = false;
1131 1127
1132 /* 1128 /*
1133 * Some of the ops may be dynamically allocated, 1129 * Some of the ops may be dynamically allocated,
@@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr)
1144 if (op->trampoline && op->trampoline_size) 1140 if (op->trampoline && op->trampoline_size)
1145 if (addr >= op->trampoline && 1141 if (addr >= op->trampoline &&
1146 addr < op->trampoline + op->trampoline_size) { 1142 addr < op->trampoline + op->trampoline_size) {
1147 ret = true; 1143 preempt_enable_notrace();
1148 goto out; 1144 return op;
1149 } 1145 }
1150 } while_for_each_ftrace_op(op); 1146 } while_for_each_ftrace_op(op);
1151
1152 out:
1153 preempt_enable_notrace(); 1147 preempt_enable_notrace();
1154 1148
1155 return ret; 1149 return NULL;
1150}
1151
1152/*
1153 * This is used by __kernel_text_address() to return true if the
1154 * address is on a dynamically allocated trampoline that would
1155 * not return true for either core_kernel_text() or
1156 * is_module_text_address().
1157 */
1158bool is_ftrace_trampoline(unsigned long addr)
1159{
1160 return ftrace_ops_trampoline(addr) != NULL;
1156} 1161}
1157 1162
1158struct ftrace_page { 1163struct ftrace_page {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 91874a95060d..5af2842dea96 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
280/* Missed count stored at end */ 280/* Missed count stored at end */
281#define RB_MISSED_STORED (1 << 30) 281#define RB_MISSED_STORED (1 << 30)
282 282
283#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
284
283struct buffer_data_page { 285struct buffer_data_page {
284 u64 time_stamp; /* page time stamp */ 286 u64 time_stamp; /* page time stamp */
285 local_t commit; /* write committed index */ 287 local_t commit; /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
331 */ 333 */
332size_t ring_buffer_page_len(void *page) 334size_t ring_buffer_page_len(void *page)
333{ 335{
334 return local_read(&((struct buffer_data_page *)page)->commit) 336 struct buffer_data_page *bpage = page;
337
338 return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
335 + BUF_PAGE_HDR_SIZE; 339 + BUF_PAGE_HDR_SIZE;
336} 340}
337 341
@@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1799} 1803}
1800EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); 1804EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1801 1805
1802static __always_inline void *
1803__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1804{
1805 return bpage->data + index;
1806}
1807
1808static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) 1806static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
1809{ 1807{
1810 return bpage->page->data + index; 1808 return bpage->page->data + index;
@@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2536 * The lock and unlock are done within a preempt disable section. 2534 * The lock and unlock are done within a preempt disable section.
2537 * The current_context per_cpu variable can only be modified 2535 * The current_context per_cpu variable can only be modified
2538 * by the current task between lock and unlock. But it can 2536 * by the current task between lock and unlock. But it can
2539 * be modified more than once via an interrupt. There are four 2537 * be modified more than once via an interrupt. To pass this
2540 * different contexts that we need to consider. 2538 * information from the lock to the unlock without having to
2539 * access the 'in_interrupt()' functions again (which do show
2540 * a bit of overhead in something as critical as function tracing),
2541 * we use a bitmask trick.
2542 *
2543 * bit 0 = NMI context
2544 * bit 1 = IRQ context
2545 * bit 2 = SoftIRQ context
2546 * bit 3 = normal context.
2547 *
2548 * This works because this is the order of contexts that can
2549 * preempt other contexts. A SoftIRQ never preempts an IRQ
2550 * context.
2551 *
2552 * When the context is determined, the corresponding bit is
2553 * checked and set (if it was set, then a recursion of that context
2554 * happened).
2555 *
2556 * On unlock, we need to clear this bit. To do so, just subtract
2557 * 1 from the current_context and AND it to itself.
2541 * 2558 *
2542 * Normal context. 2559 * (binary)
2543 * SoftIRQ context 2560 * 101 - 1 = 100
2544 * IRQ context 2561 * 101 & 100 = 100 (clearing bit zero)
2545 * NMI context
2546 * 2562 *
2547 * If for some reason the ring buffer starts to recurse, we 2563 * 1010 - 1 = 1001
2548 * only allow that to happen at most 4 times (one for each 2564 * 1010 & 1001 = 1000 (clearing bit 1)
2549 * context). If it happens 5 times, then we consider this a 2565 *
2550 * recusive loop and do not let it go further. 2566 * The least significant bit can be cleared this way, and it
2567 * just so happens that it is the same bit corresponding to
2568 * the current context.
2551 */ 2569 */
2552 2570
2553static __always_inline int 2571static __always_inline int
2554trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) 2572trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
2555{ 2573{
2556 if (cpu_buffer->current_context >= 4) 2574 unsigned int val = cpu_buffer->current_context;
2575 unsigned long pc = preempt_count();
2576 int bit;
2577
2578 if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
2579 bit = RB_CTX_NORMAL;
2580 else
2581 bit = pc & NMI_MASK ? RB_CTX_NMI :
2582 pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
2583
2584 if (unlikely(val & (1 << bit)))
2557 return 1; 2585 return 1;
2558 2586
2559 cpu_buffer->current_context++; 2587 val |= (1 << bit);
2560 /* Interrupts must see this update */ 2588 cpu_buffer->current_context = val;
2561 barrier();
2562 2589
2563 return 0; 2590 return 0;
2564} 2591}
@@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
2566static __always_inline void 2593static __always_inline void
2567trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) 2594trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
2568{ 2595{
2569 /* Don't let the dec leak out */ 2596 cpu_buffer->current_context &= cpu_buffer->current_context - 1;
2570 barrier();
2571 cpu_buffer->current_context--;
2572} 2597}
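The unlock above leans on two facts: x & (x - 1) clears only the lowest set bit, and contexts nest so that the lowest set bit is always the current context. A quick stand-alone check of the identity, matching the worked examples in the comment (illustrative only):

    #include <assert.h>

    static unsigned int clear_lowest_bit(unsigned int x)
    {
            return x & (x - 1);
    }

    int main(void)
    {
            /* 0b0101 -> 0b0100: clears bit 0 (the NMI bit) */
            assert(clear_lowest_bit(0x5u) == 0x4u);
            /* 0b1010 -> 0b1000: clears bit 1 (the IRQ bit) */
            assert(clear_lowest_bit(0xau) == 0x8u);
            return 0;
    }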
2573 2598
2574/** 2599/**
@@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
4406{ 4431{
4407 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 4432 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
4408 struct buffer_data_page *bpage = data; 4433 struct buffer_data_page *bpage = data;
4434 struct page *page = virt_to_page(bpage);
4409 unsigned long flags; 4435 unsigned long flags;
4410 4436
4437 /* If the page is still in use someplace else, we can't reuse it */
4438 if (page_ref_count(page) > 1)
4439 goto out;
4440
4411 local_irq_save(flags); 4441 local_irq_save(flags);
4412 arch_spin_lock(&cpu_buffer->lock); 4442 arch_spin_lock(&cpu_buffer->lock);
4413 4443
@@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
4419 arch_spin_unlock(&cpu_buffer->lock); 4449 arch_spin_unlock(&cpu_buffer->lock);
4420 local_irq_restore(flags); 4450 local_irq_restore(flags);
4421 4451
4452 out:
4422 free_page((unsigned long)bpage); 4453 free_page((unsigned long)bpage);
4423} 4454}
4424EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); 4455EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
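The commit field of a buffer data page doubles as storage for the RB_MISSED_* flags in its top bits, which is why ring_buffer_page_len() above now masks them off before adding the header size. A small stand-alone illustration of the pack/mask; the flag values are assumed to mirror the kernel's (1 << 31) and (1 << 30):

    #include <assert.h>

    #define RB_MISSED_EVENTS  (1u << 31)
    #define RB_MISSED_STORED  (1u << 30)
    #define RB_MISSED_FLAGS   (RB_MISSED_EVENTS | RB_MISSED_STORED)

    int main(void)
    {
            unsigned int commit = 4096;             /* bytes committed to the page */

            commit |= RB_MISSED_EVENTS;             /* reader missed events */

            assert(commit != 4096);                          /* raw value is corrupted */
            assert((commit & ~RB_MISSED_FLAGS) == 4096);     /* masked length is right */
            return 0;
    }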
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 73e67b68c53b..8e3f20a18a06 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
362} 362}
363 363
364/** 364/**
365 * trace_pid_filter_add_remove - Add or remove a task from a pid_list 365 * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
366 * @pid_list: The list to modify 366 * @pid_list: The list to modify
367 * @self: The current task for fork or NULL for exit 367 * @self: The current task for fork or NULL for exit
368 * @task: The task to add or remove 368 * @task: The task to add or remove
@@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)
925} 925}
926 926
927/** 927/**
928 * trace_snapshot - take a snapshot of the current buffer. 928 * tracing_snapshot - take a snapshot of the current buffer.
929 * 929 *
930 * This causes a swap between the snapshot buffer and the current live 930 * This causes a swap between the snapshot buffer and the current live
931 * tracing buffer. You can use this to take snapshots of the live 931 * tracing buffer. You can use this to take snapshots of the live
@@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void)
1004EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); 1004EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
1005 1005
1006/** 1006/**
1007 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. 1007 * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
1008 * 1008 *
1009 * This is similar to trace_snapshot(), but it will allocate the 1009 * This is similar to tracing_snapshot(), but it will allocate the
1010 * snapshot buffer if it isn't already allocated. Use this only 1010 * snapshot buffer if it isn't already allocated. Use this only
1011 * where it is safe to sleep, as the allocation may sleep. 1011 * where it is safe to sleep, as the allocation may sleep.
1012 * 1012 *
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh;
1303/* 1303/*
1304 * Copy the new maximum trace into the separate maximum-trace 1304 * Copy the new maximum trace into the separate maximum-trace
1305 * structure. (this way the maximum trace is permanently saved, 1305 * structure. (this way the maximum trace is permanently saved,
1306 * for later retrieval via /sys/kernel/debug/tracing/latency_trace) 1306 * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
1307 */ 1307 */
1308static void 1308static void
1309__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 1309__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
2374} 2374}
2375EXPORT_SYMBOL_GPL(trace_event_buffer_commit); 2375EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
2376 2376
2377/*
2378 * Skip 3:
2379 *
2380 * trace_buffer_unlock_commit_regs()
2381 * trace_event_buffer_commit()
2382 * trace_event_raw_event_xxx()
2383*/
2384# define STACK_SKIP 3
2385
2377void trace_buffer_unlock_commit_regs(struct trace_array *tr, 2386void trace_buffer_unlock_commit_regs(struct trace_array *tr,
2378 struct ring_buffer *buffer, 2387 struct ring_buffer *buffer,
2379 struct ring_buffer_event *event, 2388 struct ring_buffer_event *event,
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
2383 __buffer_unlock_commit(buffer, event); 2392 __buffer_unlock_commit(buffer, event);
2384 2393
2385 /* 2394 /*
2386 * If regs is not set, then skip the following callers: 2395 * If regs is not set, then skip the necessary functions.
2387 * trace_buffer_unlock_commit_regs
2388 * event_trigger_unlock_commit
2389 * trace_event_buffer_commit
2390 * trace_event_raw_event_sched_switch
2391 * Note, we can still get here via blktrace, wakeup tracer 2396 * Note, we can still get here via blktrace, wakeup tracer
2392 * and mmiotrace, but that's ok if they lose a function or 2397 * and mmiotrace, but that's ok if they lose a function or
2393 * two. They are that meaningful. 2398 * two. They are not that meaningful.
2394 */ 2399 */
2395 ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); 2400 ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
2396 ftrace_trace_userstack(buffer, flags, pc); 2401 ftrace_trace_userstack(buffer, flags, pc);
2397} 2402}
2398 2403
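
The hunk above replaces a hard-coded frame count with a STACK_SKIP constant whose comment spells out exactly which internal callers are being skipped, so the recorded stack starts at the event's real caller instead of inside tracing infrastructure. A userspace analogue of the idea using glibc's backtrace(); the helper names and the skip count of 2 are illustrative only:

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

#define STACK_SKIP 2	/* skip capture_stack() and report() themselves */

static void __attribute__((noinline)) capture_stack(void)
{
	void *frames[32];
	int n = backtrace(frames, 32);
	char **syms = backtrace_symbols(frames, n);	/* build with -rdynamic for names */

	if (!syms)
		return;
	/* Print only the frames above the two helper functions. */
	for (int i = STACK_SKIP; i < n; i++)
		printf("%s\n", syms[i]);
	free(syms);
}

static void __attribute__((noinline)) report(void)
{
	capture_stack();
}

int main(void)
{
	report();
	return 0;
}

Whether the helper frames actually appear depends on inlining and on the unwinder in use, which is why the hunks below end up with different skip values under CONFIG_UNWINDER_ORC.
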
@@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export,
2415 2420
2416 entry = ring_buffer_event_data(event); 2421 entry = ring_buffer_event_data(event);
2417 size = ring_buffer_event_length(event); 2422 size = ring_buffer_event_length(event);
2418 export->write(entry, size); 2423 export->write(export, entry, size);
2419} 2424}
2420 2425
2421static DEFINE_MUTEX(ftrace_export_lock); 2426static DEFINE_MUTEX(ftrace_export_lock);
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
2579 trace.skip = skip; 2584 trace.skip = skip;
2580 2585
2581 /* 2586 /*
2582 * Add two, for this function and the call to save_stack_trace() 2587 * Add one, for this function and the call to save_stack_trace()
2583 * If regs is set, then these functions will not be in the way. 2588 * If regs is set, then these functions will not be in the way.
2584 */ 2589 */
2590#ifndef CONFIG_UNWINDER_ORC
2585 if (!regs) 2591 if (!regs)
2586 trace.skip += 2; 2592 trace.skip++;
2593#endif
2587 2594
2588 /* 2595 /*
2589 * Since events can happen in NMIs there's no safe way to 2596 * Since events can happen in NMIs there's no safe way to
@@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip)
2711 2718
2712 local_save_flags(flags); 2719 local_save_flags(flags);
2713 2720
2714 /* 2721#ifndef CONFIG_UNWINDER_ORC
2715 * Skip 3 more, seems to get us at the caller of 2722 /* Skip 1 to skip this function. */
2716 * this function. 2723 skip++;
2717 */ 2724#endif
2718 skip += 3;
2719 __ftrace_trace_stack(global_trace.trace_buffer.buffer, 2725 __ftrace_trace_stack(global_trace.trace_buffer.buffer,
2720 flags, skip, preempt_count(), NULL); 2726 flags, skip, preempt_count(), NULL);
2721} 2727}
@@ -4178,37 +4184,30 @@ static const struct file_operations show_traces_fops = {
4178 .llseek = seq_lseek, 4184 .llseek = seq_lseek,
4179}; 4185};
4180 4186
4181/*
4182 * The tracer itself will not take this lock, but still we want
4183 * to provide a consistent cpumask to user-space:
4184 */
4185static DEFINE_MUTEX(tracing_cpumask_update_lock);
4186
4187/*
4188 * Temporary storage for the character representation of the
4189 * CPU bitmask (and one more byte for the newline):
4190 */
4191static char mask_str[NR_CPUS + 1];
4192
4193static ssize_t 4187static ssize_t
4194tracing_cpumask_read(struct file *filp, char __user *ubuf, 4188tracing_cpumask_read(struct file *filp, char __user *ubuf,
4195 size_t count, loff_t *ppos) 4189 size_t count, loff_t *ppos)
4196{ 4190{
4197 struct trace_array *tr = file_inode(filp)->i_private; 4191 struct trace_array *tr = file_inode(filp)->i_private;
4192 char *mask_str;
4198 int len; 4193 int len;
4199 4194
4200 mutex_lock(&tracing_cpumask_update_lock); 4195 len = snprintf(NULL, 0, "%*pb\n",
4196 cpumask_pr_args(tr->tracing_cpumask)) + 1;
4197 mask_str = kmalloc(len, GFP_KERNEL);
4198 if (!mask_str)
4199 return -ENOMEM;
4201 4200
4202 len = snprintf(mask_str, count, "%*pb\n", 4201 len = snprintf(mask_str, len, "%*pb\n",
4203 cpumask_pr_args(tr->tracing_cpumask)); 4202 cpumask_pr_args(tr->tracing_cpumask));
4204 if (len >= count) { 4203 if (len >= count) {
4205 count = -EINVAL; 4204 count = -EINVAL;
4206 goto out_err; 4205 goto out_err;
4207 } 4206 }
4208 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); 4207 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
4209 4208
4210out_err: 4209out_err:
4211 mutex_unlock(&tracing_cpumask_update_lock); 4210 kfree(mask_str);
4212 4211
4213 return count; 4212 return count;
4214} 4213}
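
tracing_cpumask_read() above drops the static NR_CPUS+1 buffer and its protecting mutex in favour of a buffer sized at run time: snprintf(NULL, 0, ...) returns the length the formatted output would need, so the allocation fits exactly. A plain-C sketch of the same two-pass pattern (userspace code with a hypothetical format_alloc() helper, not the kernel function):

#include <stdio.h>
#include <stdlib.h>

/* Format a message into a buffer sized exactly for it. */
static char *format_alloc(const char *name, int value)
{
	/* First pass: snprintf with a NULL buffer only computes the length. */
	int len = snprintf(NULL, 0, "%s=%d\n", name, value) + 1;	/* +1 for '\0' */
	char *buf = malloc(len);

	if (!buf)
		return NULL;
	/* Second pass: actually format into the exact-sized buffer. */
	snprintf(buf, len, "%s=%d\n", name, value);
	return buf;
}

int main(void)
{
	char *s = format_alloc("cpumask_bits", 255);

	if (s) {
		fputs(s, stdout);
		free(s);
	}
	return 0;
}
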
@@ -4228,8 +4227,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
4228 if (err) 4227 if (err)
4229 goto err_unlock; 4228 goto err_unlock;
4230 4229
4231 mutex_lock(&tracing_cpumask_update_lock);
4232
4233 local_irq_disable(); 4230 local_irq_disable();
4234 arch_spin_lock(&tr->max_lock); 4231 arch_spin_lock(&tr->max_lock);
4235 for_each_tracing_cpu(cpu) { 4232 for_each_tracing_cpu(cpu) {
@@ -4252,8 +4249,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
4252 local_irq_enable(); 4249 local_irq_enable();
4253 4250
4254 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); 4251 cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
4255
4256 mutex_unlock(&tracing_cpumask_update_lock);
4257 free_cpumask_var(tracing_cpumask_new); 4252 free_cpumask_var(tracing_cpumask_new);
4258 4253
4259 return count; 4254 return count;
@@ -6780,7 +6775,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6780 .spd_release = buffer_spd_release, 6775 .spd_release = buffer_spd_release,
6781 }; 6776 };
6782 struct buffer_ref *ref; 6777 struct buffer_ref *ref;
6783 int entries, size, i; 6778 int entries, i;
6784 ssize_t ret = 0; 6779 ssize_t ret = 0;
6785 6780
6786#ifdef CONFIG_TRACER_MAX_TRACE 6781#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6834,14 +6829,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
6834 break; 6829 break;
6835 } 6830 }
6836 6831
6837 /*
6838 * zero out any left over data, this is going to
6839 * user land.
6840 */
6841 size = ring_buffer_page_len(ref->page);
6842 if (size < PAGE_SIZE)
6843 memset(ref->page + size, 0, PAGE_SIZE - size);
6844
6845 page = virt_to_page(ref->page); 6832 page = virt_to_page(ref->page);
6846 6833
6847 spd.pages[i] = page; 6834 spd.pages[i] = page;
@@ -7599,6 +7586,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
7599 buf->data = alloc_percpu(struct trace_array_cpu); 7586 buf->data = alloc_percpu(struct trace_array_cpu);
7600 if (!buf->data) { 7587 if (!buf->data) {
7601 ring_buffer_free(buf->buffer); 7588 ring_buffer_free(buf->buffer);
7589 buf->buffer = NULL;
7602 return -ENOMEM; 7590 return -ENOMEM;
7603 } 7591 }
7604 7592
@@ -7622,7 +7610,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
7622 allocate_snapshot ? size : 1); 7610 allocate_snapshot ? size : 1);
7623 if (WARN_ON(ret)) { 7611 if (WARN_ON(ret)) {
7624 ring_buffer_free(tr->trace_buffer.buffer); 7612 ring_buffer_free(tr->trace_buffer.buffer);
7613 tr->trace_buffer.buffer = NULL;
7625 free_percpu(tr->trace_buffer.data); 7614 free_percpu(tr->trace_buffer.data);
7615 tr->trace_buffer.data = NULL;
7626 return -ENOMEM; 7616 return -ENOMEM;
7627 } 7617 }
7628 tr->allocated_snapshot = allocate_snapshot; 7618 tr->allocated_snapshot = allocate_snapshot;
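
Both error paths above now clear the pointer right after freeing it, so a later teardown pass that walks the same structure sees NULL instead of a dangling pointer. A minimal sketch of that defensive pattern in plain C (hypothetical struct buf and buf_release(), not the trace code):

#include <stdlib.h>

struct buf {
	void *buffer;
	void *data;
};

/* Free both members; safe to call more than once on the same struct. */
static void buf_release(struct buf *b)
{
	free(b->buffer);
	b->buffer = NULL;	/* later release paths see NULL, not a stale pointer */
	free(b->data);
	b->data = NULL;
}

int main(void)
{
	struct buf b = { malloc(64), malloc(64) };

	buf_release(&b);
	buf_release(&b);	/* second call is harmless: free(NULL) is a no-op */
	return 0;
}
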
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
2213{ 2213{
2214 struct trace_event_call *call, *p; 2214 struct trace_event_call *call, *p;
2215 const char *last_system = NULL; 2215 const char *last_system = NULL;
2216 bool first = false;
2216 int last_i; 2217 int last_i;
2217 int i; 2218 int i;
2218 2219
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
2220 list_for_each_entry_safe(call, p, &ftrace_events, list) { 2221 list_for_each_entry_safe(call, p, &ftrace_events, list) {
2221 /* events are usually grouped together with systems */ 2222 /* events are usually grouped together with systems */
2222 if (!last_system || call->class->system != last_system) { 2223 if (!last_system || call->class->system != last_system) {
2224 first = true;
2223 last_i = 0; 2225 last_i = 0;
2224 last_system = call->class->system; 2226 last_system = call->class->system;
2225 } 2227 }
2226 2228
2229 /*
 2230 * Since calls are grouped by systems, the likelihood that the
2231 * next call in the iteration belongs to the same system as the
 2232 * previous call is high. As an optimization, we skip searching
2233 * for a map[] that matches the call's system if the last call
2234 * was from the same system. That's what last_i is for. If the
2235 * call has the same system as the previous call, then last_i
2236 * will be the index of the first map[] that has a matching
2237 * system.
2238 */
2227 for (i = last_i; i < len; i++) { 2239 for (i = last_i; i < len; i++) {
2228 if (call->class->system == map[i]->system) { 2240 if (call->class->system == map[i]->system) {
2229 /* Save the first system if need be */ 2241 /* Save the first system if need be */
2230 if (!last_i) 2242 if (first) {
2231 last_i = i; 2243 last_i = i;
2244 first = false;
2245 }
2232 update_event_printk(call, map[i]); 2246 update_event_printk(call, map[i]);
2233 } 2247 }
2234 } 2248 }
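
The comment added above documents the last_i optimization: because calls are grouped by system, the scan over map[] can resume at the first entry that matched the previous call's system instead of restarting from zero, and the new first flag records that starting index once per system. A self-contained plain-C sketch of the same scan, with made-up systems, calls[] and map[] entries standing in for the trace eval maps:

#include <stdio.h>
#include <string.h>

struct map_entry { const char *system; int data; };

/* map[] entries are grouped by system, like the trace map chunks. */
static const struct map_entry map[] = {
	{ "sched", 1 }, { "sched", 2 },
	{ "irq",   3 }, { "irq",   4 }, { "irq", 5 },
};

int main(void)
{
	const char *calls[] = { "sched", "sched", "irq", "irq" };
	const char *last_system = NULL;
	int last_i = 0, first = 0;

	for (unsigned c = 0; c < sizeof(calls) / sizeof(calls[0]); c++) {
		/* New system: forget the cached start index. */
		if (!last_system || strcmp(calls[c], last_system) != 0) {
			first = 1;
			last_i = 0;
			last_system = calls[c];
		}
		/* Resume the scan at the first entry of the current system. */
		for (unsigned i = last_i; i < sizeof(map) / sizeof(map[0]); i++) {
			if (strcmp(calls[c], map[i].system) == 0) {
				if (first) {	/* remember where this system starts */
					last_i = i;
					first = 0;
				}
				printf("call %u matches map[%u] (%d)\n", c, i, map[i].data);
			}
		}
	}
	return 0;
}
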
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f2ac9d44f6c4..87411482a46f 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
1123#endif /* CONFIG_TRACER_SNAPSHOT */ 1123#endif /* CONFIG_TRACER_SNAPSHOT */
1124 1124
1125#ifdef CONFIG_STACKTRACE 1125#ifdef CONFIG_STACKTRACE
1126#ifdef CONFIG_UNWINDER_ORC
1127/* Skip 2:
1128 * event_triggers_post_call()
1129 * trace_event_raw_event_xxx()
1130 */
1131# define STACK_SKIP 2
1132#else
1126/* 1133/*
1127 * Skip 3: 1134 * Skip 4:
1128 * stacktrace_trigger() 1135 * stacktrace_trigger()
1129 * event_triggers_post_call() 1136 * event_triggers_post_call()
1137 * trace_event_buffer_commit()
1130 * trace_event_raw_event_xxx() 1138 * trace_event_raw_event_xxx()
1131 */ 1139 */
1132#define STACK_SKIP 3 1140#define STACK_SKIP 4
1141#endif
1133 1142
1134static void 1143static void
1135stacktrace_trigger(struct event_trigger_data *data, void *rec) 1144stacktrace_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 27f7ad12c4b1..b611cd36e22d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
154 preempt_enable_notrace(); 154 preempt_enable_notrace();
155} 155}
156 156
157#ifdef CONFIG_UNWINDER_ORC
158/*
159 * Skip 2:
160 *
161 * function_stack_trace_call()
162 * ftrace_call()
163 */
164#define STACK_SKIP 2
165#else
166/*
167 * Skip 3:
168 * __trace_stack()
169 * function_stack_trace_call()
170 * ftrace_call()
171 */
172#define STACK_SKIP 3
173#endif
174
157static void 175static void
158function_stack_trace_call(unsigned long ip, unsigned long parent_ip, 176function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
159 struct ftrace_ops *op, struct pt_regs *pt_regs) 177 struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
180 if (likely(disabled == 1)) { 198 if (likely(disabled == 1)) {
181 pc = preempt_count(); 199 pc = preempt_count();
182 trace_function(tr, ip, parent_ip, flags, pc); 200 trace_function(tr, ip, parent_ip, flags, pc);
183 /* 201 __trace_stack(tr, flags, STACK_SKIP, pc);
184 * skip over 5 funcs:
185 * __ftrace_trace_stack,
186 * __trace_stack,
187 * function_stack_trace_call
188 * ftrace_list_func
189 * ftrace_call
190 */
191 __trace_stack(tr, flags, 5, pc);
192 } 202 }
193 203
194 atomic_dec(&data->disabled); 204 atomic_dec(&data->disabled);
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
367 tracer_tracing_off(tr); 377 tracer_tracing_off(tr);
368} 378}
369 379
380#ifdef CONFIG_UNWINDER_ORC
370/* 381/*
371 * Skip 4: 382 * Skip 3:
383 *
384 * function_trace_probe_call()
385 * ftrace_ops_assist_func()
386 * ftrace_call()
387 */
388#define FTRACE_STACK_SKIP 3
389#else
390/*
391 * Skip 5:
392 *
393 * __trace_stack()
372 * ftrace_stacktrace() 394 * ftrace_stacktrace()
373 * function_trace_probe_call() 395 * function_trace_probe_call()
374 * ftrace_ops_list_func() 396 * ftrace_ops_assist_func()
375 * ftrace_call() 397 * ftrace_call()
376 */ 398 */
377#define STACK_SKIP 4 399#define FTRACE_STACK_SKIP 5
400#endif
378 401
379static __always_inline void trace_stack(struct trace_array *tr) 402static __always_inline void trace_stack(struct trace_array *tr)
380{ 403{
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr)
384 local_save_flags(flags); 407 local_save_flags(flags);
385 pc = preempt_count(); 408 pc = preempt_count();
386 409
387 __trace_stack(tr, flags, STACK_SKIP, pc); 410 __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
388} 411}
389 412
390static void 413static void
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 734accc02418..3c7bfc4bf5e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
209 if (__this_cpu_read(disable_stack_tracer) != 1) 209 if (__this_cpu_read(disable_stack_tracer) != 1)
210 goto out; 210 goto out;
211 211
 212 /* If RCU is not watching, then saving a stack trace can fail */
213 if (!rcu_is_watching())
214 goto out;
215
212 ip += MCOUNT_INSN_SIZE; 216 ip += MCOUNT_INSN_SIZE;
213 217
214 check_stack(ip, &stack); 218 check_stack(ip, &stack);
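
The added check above makes the stack tracer bail out when RCU is not watching, because saving a stack trace from such a context can fail. A minimal kernel-style sketch of the same early-return guard; maybe_record_stack() and demo_entries are hypothetical, not the stack tracer itself:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>

static unsigned long demo_entries[32];

static void maybe_record_stack(void)
{
	struct stack_trace trace = {
		.entries	= demo_entries,
		.max_entries	= ARRAY_SIZE(demo_entries),
		.skip		= 1,	/* skip maybe_record_stack() itself */
	};

	/*
	 * Mirroring the check above: saving a stack trace from a context
	 * where RCU is not watching can fail, so simply skip recording.
	 */
	if (!rcu_is_watching())
		return;

	save_stack_trace(&trace);
}
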
diff --git a/kernel/uid16.c b/kernel/uid16.c
index ce74a4901d2b..ef1da2a5f9bd 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
192 return retval; 192 return retval;
193 } 193 }
194 194
195 groups_sort(group_info);
195 retval = set_current_groups(group_info); 196 retval = set_current_groups(group_info);
196 put_group_info(group_info); 197 put_group_info(group_info);
197 198
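
setgroups16() above gains a groups_sort() call before the list is installed with set_current_groups(); the group list has to be sorted because later membership checks search it with a binary search. A userspace sketch of why the sort matters, using qsort()/bsearch() on a plain array of gids (the array and cmp_gid() are made up for illustration):

#include <stdio.h>
#include <stdlib.h>

static int cmp_gid(const void *a, const void *b)
{
	unsigned int x = *(const unsigned int *)a;
	unsigned int y = *(const unsigned int *)b;

	return (x > y) - (x < y);
}

int main(void)
{
	unsigned int groups[] = { 1000, 20, 4, 27, 100 };
	size_t n = sizeof(groups) / sizeof(groups[0]);
	unsigned int key = 27;

	/* Without this sort, the bsearch() below may miss members entirely. */
	qsort(groups, n, sizeof(groups[0]), cmp_gid);

	if (bsearch(&key, groups, n, sizeof(groups[0]), cmp_gid))
		printf("gid %u is a member\n", key);
	return 0;
}
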
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8fdb710bfdd7..f699122dab32 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -38,7 +38,6 @@
38#include <linux/hardirq.h> 38#include <linux/hardirq.h>
39#include <linux/mempolicy.h> 39#include <linux/mempolicy.h>
40#include <linux/freezer.h> 40#include <linux/freezer.h>
41#include <linux/kallsyms.h>
42#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
43#include <linux/lockdep.h> 42#include <linux/lockdep.h>
44#include <linux/idr.h> 43#include <linux/idr.h>
@@ -48,6 +47,8 @@
48#include <linux/nodemask.h> 47#include <linux/nodemask.h>
49#include <linux/moduleparam.h> 48#include <linux/moduleparam.h>
50#include <linux/uaccess.h> 49#include <linux/uaccess.h>
50#include <linux/sched/isolation.h>
51#include <linux/nmi.h>
51 52
52#include "workqueue_internal.h" 53#include "workqueue_internal.h"
53 54
@@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker)
1634 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); 1635 mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
1635 1636
1636 /* 1637 /*
1637 * Sanity check nr_running. Because wq_unbind_fn() releases 1638 * Sanity check nr_running. Because unbind_workers() releases
1638 * pool->lock between setting %WORKER_UNBOUND and zapping 1639 * pool->lock between setting %WORKER_UNBOUND and zapping
1639 * nr_running, the warning may trigger spuriously. Check iff 1640 * nr_running, the warning may trigger spuriously. Check iff
1640 * unbind is not in progress. 1641 * unbind is not in progress.
@@ -4463,6 +4464,12 @@ void show_workqueue_state(void)
4463 if (pwq->nr_active || !list_empty(&pwq->delayed_works)) 4464 if (pwq->nr_active || !list_empty(&pwq->delayed_works))
4464 show_pwq(pwq); 4465 show_pwq(pwq);
4465 spin_unlock_irqrestore(&pwq->pool->lock, flags); 4466 spin_unlock_irqrestore(&pwq->pool->lock, flags);
4467 /*
4468 * We could be printing a lot from atomic context, e.g.
4469 * sysrq-t -> show_workqueue_state(). Avoid triggering
4470 * hard lockup.
4471 */
4472 touch_nmi_watchdog();
4466 } 4473 }
4467 } 4474 }
4468 4475
@@ -4490,6 +4497,12 @@ void show_workqueue_state(void)
4490 pr_cont("\n"); 4497 pr_cont("\n");
4491 next_pool: 4498 next_pool:
4492 spin_unlock_irqrestore(&pool->lock, flags); 4499 spin_unlock_irqrestore(&pool->lock, flags);
4500 /*
4501 * We could be printing a lot from atomic context, e.g.
4502 * sysrq-t -> show_workqueue_state(). Avoid triggering
4503 * hard lockup.
4504 */
4505 touch_nmi_watchdog();
4493 } 4506 }
4494 4507
4495 rcu_read_unlock_sched(); 4508 rcu_read_unlock_sched();
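
Both hunks above add a touch_nmi_watchdog() call to the loops that dump workqueue state: the dump can be triggered from atomic context (e.g. sysrq-t) and print for a long time, which would otherwise look like a hard lockup. A hedged kernel-style sketch of the pattern; struct item and dump_items() are hypothetical, not the workqueue code:

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/nmi.h>

struct item {			/* hypothetical per-entry structure */
	struct list_head node;
	int id;
	int state;
};

static void dump_items(struct list_head *items)
{
	struct item *it;

	list_for_each_entry(it, items, node) {
		pr_info("item %d state %d\n", it->id, it->state);
		/*
		 * A long dump from atomic context keeps this CPU busy, possibly
		 * with interrupts disabled; poke the hard-lockup detector so it
		 * knows the CPU is still making progress.
		 */
		touch_nmi_watchdog();
	}
}
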
@@ -4510,9 +4523,8 @@ void show_workqueue_state(void)
4510 * cpu comes back online. 4523 * cpu comes back online.
4511 */ 4524 */
4512 4525
4513static void wq_unbind_fn(struct work_struct *work) 4526static void unbind_workers(int cpu)
4514{ 4527{
4515 int cpu = smp_processor_id();
4516 struct worker_pool *pool; 4528 struct worker_pool *pool;
4517 struct worker *worker; 4529 struct worker *worker;
4518 4530
@@ -4589,16 +4601,6 @@ static void rebind_workers(struct worker_pool *pool)
4589 4601
4590 spin_lock_irq(&pool->lock); 4602 spin_lock_irq(&pool->lock);
4591 4603
4592 /*
4593 * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
4594 * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is
4595 * being reworked and this can go away in time.
4596 */
4597 if (!(pool->flags & POOL_DISASSOCIATED)) {
4598 spin_unlock_irq(&pool->lock);
4599 return;
4600 }
4601
4602 pool->flags &= ~POOL_DISASSOCIATED; 4604 pool->flags &= ~POOL_DISASSOCIATED;
4603 4605
4604 for_each_pool_worker(worker, pool) { 4606 for_each_pool_worker(worker, pool) {
@@ -4709,12 +4711,13 @@ int workqueue_online_cpu(unsigned int cpu)
4709 4711
4710int workqueue_offline_cpu(unsigned int cpu) 4712int workqueue_offline_cpu(unsigned int cpu)
4711{ 4713{
4712 struct work_struct unbind_work;
4713 struct workqueue_struct *wq; 4714 struct workqueue_struct *wq;
4714 4715
4715 /* unbinding per-cpu workers should happen on the local CPU */ 4716 /* unbinding per-cpu workers should happen on the local CPU */
4716 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4717 if (WARN_ON(cpu != smp_processor_id()))
4717 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4718 return -1;
4719
4720 unbind_workers(cpu);
4718 4721
4719 /* update NUMA affinity of unbound workqueues */ 4722 /* update NUMA affinity of unbound workqueues */
4720 mutex_lock(&wq_pool_mutex); 4723 mutex_lock(&wq_pool_mutex);
@@ -4722,9 +4725,6 @@ int workqueue_offline_cpu(unsigned int cpu)
4722 wq_update_unbound_numa(wq, cpu, false); 4725 wq_update_unbound_numa(wq, cpu, false);
4723 mutex_unlock(&wq_pool_mutex); 4726 mutex_unlock(&wq_pool_mutex);
4724 4727
4725 /* wait for per-cpu unbinding to finish */
4726 flush_work(&unbind_work);
4727 destroy_work_on_stack(&unbind_work);
4728 return 0; 4728 return 0;
4729} 4729}
4730 4730
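
workqueue_offline_cpu() above stops bouncing the unbind through a work item queued on the outgoing CPU and calls unbind_workers() directly, relying on the hotplug teardown callback already running on that CPU (hence the WARN_ON(cpu != smp_processor_id())). A hypothetical module sketch of registering such per-CPU hotplug callbacks with cpuhp_setup_state(); all "demo" names are made up:

#include <linux/module.h>
#include <linux/cpuhotplug.h>
#include <linux/smp.h>

static enum cpuhp_state demo_state;

static int demo_cpu_online(unsigned int cpu)
{
	/* Callbacks registered in the AP range run on the hotplugged CPU itself. */
	pr_info("demo: online(%u) running on cpu %d\n", cpu, smp_processor_id());
	return 0;
}

static int demo_cpu_offline(unsigned int cpu)
{
	/* ... and the teardown runs on the CPU going down, before it is dead. */
	pr_info("demo: offline(%u) running on cpu %d\n", cpu, smp_processor_id());
	return 0;
}

static int __init demo_init(void)
{
	int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
				    demo_cpu_online, demo_cpu_offline);
	if (ret < 0)
		return ret;
	demo_state = ret;	/* dynamic states return the allocated state number */
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state(demo_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
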
@@ -4957,6 +4957,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
4957 if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) 4957 if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
4958 return -ENOMEM; 4958 return -ENOMEM;
4959 4959
4960 /*
4961 * Not excluding isolated cpus on purpose.
4962 * If the user wishes to include them, we allow that.
4963 */
4960 cpumask_and(cpumask, cpumask, cpu_possible_mask); 4964 cpumask_and(cpumask, cpumask, cpu_possible_mask);
4961 if (!cpumask_empty(cpumask)) { 4965 if (!cpumask_empty(cpumask)) {
4962 apply_wqattrs_lock(); 4966 apply_wqattrs_lock();
@@ -5555,7 +5559,7 @@ int __init workqueue_init_early(void)
5555 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5559 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5556 5560
5557 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 5561 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
5558 cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); 5562 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
5559 5563
5560 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5564 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5561 5565