author    Jason Gunthorpe <jgg@mellanox.com>    2018-01-29 15:26:40 -0500
committer Jason Gunthorpe <jgg@mellanox.com>    2018-01-30 11:30:00 -0500
commit    e7996a9a77fc669387da43ff4823b91cc4872bd0 (patch)
tree      617f0a128e222539d67e8cccc359f1bc4b984900 /kernel
parent    b5fa635aab8f0d39a824c01991266a6d06f007fb (diff)
parent    d8a5b80568a9cb66810e75b182018e9edb68e8ff (diff)
Merge tag v4.15 of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
To resolve conflicts in:
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/qp.c
From patches merged into the -rc cycle. The conflict resolution matches
what linux-next has been carrying.
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
Diffstat (limited to 'kernel')
60 files changed, 1169 insertions, 1106 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index d15c0ee4d955..addf7732fb56 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct) | |||
102 | { | 102 | { |
103 | struct kstatfs sbuf; | 103 | struct kstatfs sbuf; |
104 | 104 | ||
105 | if (time_is_before_jiffies(acct->needcheck)) | 105 | if (time_is_after_jiffies(acct->needcheck)) |
106 | goto out; | 106 | goto out; |
107 | 107 | ||
108 | /* May block */ | 108 | /* May block */ |
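The one-character fix above hinges on the polarity of the jiffies helpers: time_is_after_jiffies(x) is true while x is still in the future, time_is_before_jiffies(x) only once it has passed. The old test had it backwards, so check_free_space() ran the expensive statfs on every call until needcheck expired and then never again. A minimal user-space sketch of the intended behaviour, with the kernel macros replaced by illustrative stand-ins (the stand-ins are assumptions, not the kernel definitions):

#include <stdio.h>

/* Stand-ins for the kernel's jiffies helpers (illustrative only):
 * time_is_after_jiffies(a)  -> "a is still in the future"
 * time_is_before_jiffies(a) -> "a has already passed"
 */
static unsigned long jiffies;
#define time_is_after_jiffies(a)  ((long)((a) - jiffies) > 0)
#define time_is_before_jiffies(a) ((long)(jiffies - (a)) > 0)

int main(void)
{
	unsigned long needcheck = 100;	/* next time free space must be re-checked */

	jiffies = 50;	/* before the deadline: the statfs should be skipped */
	printf("skip (fixed): %d\n", time_is_after_jiffies(needcheck));   /* 1 */
	printf("skip (buggy): %d\n", time_is_before_jiffies(needcheck));  /* 0, so it checked every call */

	jiffies = 150;	/* deadline passed: now the check should run */
	printf("check due:    %d\n", !time_is_after_jiffies(needcheck));  /* 1 */
	return 0;
}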
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 7c25426d3cf5..ab94d304a634 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
53 | { | 53 | { |
54 | bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; | 54 | bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; |
55 | int numa_node = bpf_map_attr_numa_node(attr); | 55 | int numa_node = bpf_map_attr_numa_node(attr); |
56 | u32 elem_size, index_mask, max_entries; | ||
57 | bool unpriv = !capable(CAP_SYS_ADMIN); | ||
56 | struct bpf_array *array; | 58 | struct bpf_array *array; |
57 | u64 array_size; | 59 | u64 array_size, mask64; |
58 | u32 elem_size; | ||
59 | 60 | ||
60 | /* check sanity of attributes */ | 61 | /* check sanity of attributes */ |
61 | if (attr->max_entries == 0 || attr->key_size != 4 || | 62 | if (attr->max_entries == 0 || attr->key_size != 4 || |
@@ -72,11 +73,32 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
72 | 73 | ||
73 | elem_size = round_up(attr->value_size, 8); | 74 | elem_size = round_up(attr->value_size, 8); |
74 | 75 | ||
76 | max_entries = attr->max_entries; | ||
77 | |||
78 | /* On 32 bit archs roundup_pow_of_two() with max_entries that has | ||
79 | * upper most bit set in u32 space is undefined behavior due to | ||
80 | * resulting 1U << 32, so do it manually here in u64 space. | ||
81 | */ | ||
82 | mask64 = fls_long(max_entries - 1); | ||
83 | mask64 = 1ULL << mask64; | ||
84 | mask64 -= 1; | ||
85 | |||
86 | index_mask = mask64; | ||
87 | if (unpriv) { | ||
88 | /* round up array size to nearest power of 2, | ||
89 | * since cpu will speculate within index_mask limits | ||
90 | */ | ||
91 | max_entries = index_mask + 1; | ||
92 | /* Check for overflows. */ | ||
93 | if (max_entries < attr->max_entries) | ||
94 | return ERR_PTR(-E2BIG); | ||
95 | } | ||
96 | |||
75 | array_size = sizeof(*array); | 97 | array_size = sizeof(*array); |
76 | if (percpu) | 98 | if (percpu) |
77 | array_size += (u64) attr->max_entries * sizeof(void *); | 99 | array_size += (u64) max_entries * sizeof(void *); |
78 | else | 100 | else |
79 | array_size += (u64) attr->max_entries * elem_size; | 101 | array_size += (u64) max_entries * elem_size; |
80 | 102 | ||
81 | /* make sure there is no u32 overflow later in round_up() */ | 103 | /* make sure there is no u32 overflow later in round_up() */ |
82 | if (array_size >= U32_MAX - PAGE_SIZE) | 104 | if (array_size >= U32_MAX - PAGE_SIZE) |
@@ -86,6 +108,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
86 | array = bpf_map_area_alloc(array_size, numa_node); | 108 | array = bpf_map_area_alloc(array_size, numa_node); |
87 | if (!array) | 109 | if (!array) |
88 | return ERR_PTR(-ENOMEM); | 110 | return ERR_PTR(-ENOMEM); |
111 | array->index_mask = index_mask; | ||
112 | array->map.unpriv_array = unpriv; | ||
89 | 113 | ||
90 | /* copy mandatory map attributes */ | 114 | /* copy mandatory map attributes */ |
91 | array->map.map_type = attr->map_type; | 115 | array->map.map_type = attr->map_type; |
@@ -121,12 +145,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) | |||
121 | if (unlikely(index >= array->map.max_entries)) | 145 | if (unlikely(index >= array->map.max_entries)) |
122 | return NULL; | 146 | return NULL; |
123 | 147 | ||
124 | return array->value + array->elem_size * index; | 148 | return array->value + array->elem_size * (index & array->index_mask); |
125 | } | 149 | } |
126 | 150 | ||
127 | /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ | 151 | /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ |
128 | static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) | 152 | static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) |
129 | { | 153 | { |
154 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
130 | struct bpf_insn *insn = insn_buf; | 155 | struct bpf_insn *insn = insn_buf; |
131 | u32 elem_size = round_up(map->value_size, 8); | 156 | u32 elem_size = round_up(map->value_size, 8); |
132 | const int ret = BPF_REG_0; | 157 | const int ret = BPF_REG_0; |
@@ -135,7 +160,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) | |||
135 | 160 | ||
136 | *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); | 161 | *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); |
137 | *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); | 162 | *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); |
138 | *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); | 163 | if (map->unpriv_array) { |
164 | *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); | ||
165 | *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); | ||
166 | } else { | ||
167 | *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); | ||
168 | } | ||
139 | 169 | ||
140 | if (is_power_of_2(elem_size)) { | 170 | if (is_power_of_2(elem_size)) { |
141 | *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); | 171 | *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); |
@@ -157,7 +187,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) | |||
157 | if (unlikely(index >= array->map.max_entries)) | 187 | if (unlikely(index >= array->map.max_entries)) |
158 | return NULL; | 188 | return NULL; |
159 | 189 | ||
160 | return this_cpu_ptr(array->pptrs[index]); | 190 | return this_cpu_ptr(array->pptrs[index & array->index_mask]); |
161 | } | 191 | } |
162 | 192 | ||
163 | int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) | 193 | int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) |
@@ -177,7 +207,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) | |||
177 | */ | 207 | */ |
178 | size = round_up(map->value_size, 8); | 208 | size = round_up(map->value_size, 8); |
179 | rcu_read_lock(); | 209 | rcu_read_lock(); |
180 | pptr = array->pptrs[index]; | 210 | pptr = array->pptrs[index & array->index_mask]; |
181 | for_each_possible_cpu(cpu) { | 211 | for_each_possible_cpu(cpu) { |
182 | bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); | 212 | bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); |
183 | off += size; | 213 | off += size; |
@@ -225,10 +255,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
225 | return -EEXIST; | 255 | return -EEXIST; |
226 | 256 | ||
227 | if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) | 257 | if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) |
228 | memcpy(this_cpu_ptr(array->pptrs[index]), | 258 | memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), |
229 | value, map->value_size); | 259 | value, map->value_size); |
230 | else | 260 | else |
231 | memcpy(array->value + array->elem_size * index, | 261 | memcpy(array->value + |
262 | array->elem_size * (index & array->index_mask), | ||
232 | value, map->value_size); | 263 | value, map->value_size); |
233 | return 0; | 264 | return 0; |
234 | } | 265 | } |
@@ -262,7 +293,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, | |||
262 | */ | 293 | */ |
263 | size = round_up(map->value_size, 8); | 294 | size = round_up(map->value_size, 8); |
264 | rcu_read_lock(); | 295 | rcu_read_lock(); |
265 | pptr = array->pptrs[index]; | 296 | pptr = array->pptrs[index & array->index_mask]; |
266 | for_each_possible_cpu(cpu) { | 297 | for_each_possible_cpu(cpu) { |
267 | bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); | 298 | bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); |
268 | off += size; | 299 | off += size; |
@@ -613,6 +644,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) | |||
613 | static u32 array_of_map_gen_lookup(struct bpf_map *map, | 644 | static u32 array_of_map_gen_lookup(struct bpf_map *map, |
614 | struct bpf_insn *insn_buf) | 645 | struct bpf_insn *insn_buf) |
615 | { | 646 | { |
647 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
616 | u32 elem_size = round_up(map->value_size, 8); | 648 | u32 elem_size = round_up(map->value_size, 8); |
617 | struct bpf_insn *insn = insn_buf; | 649 | struct bpf_insn *insn = insn_buf; |
618 | const int ret = BPF_REG_0; | 650 | const int ret = BPF_REG_0; |
@@ -621,7 +653,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, | |||
621 | 653 | ||
622 | *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); | 654 | *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); |
623 | *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); | 655 | *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); |
624 | *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); | 656 | if (map->unpriv_array) { |
657 | *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); | ||
658 | *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); | ||
659 | } else { | ||
660 | *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); | ||
661 | } | ||
625 | if (is_power_of_2(elem_size)) | 662 | if (is_power_of_2(elem_size)) |
626 | *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); | 663 | *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); |
627 | else | 664 | else |
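The two comments added in this file call out why the mask is computed the way it is: on 32-bit architectures roundup_pow_of_two() on a u32 whose top bit is set would evaluate 1U << 32, which is undefined behaviour, so the rounding is done by hand in u64 space, and for unprivileged users the resulting index_mask is applied on every lookup so that even speculatively executed loads stay inside the allocation. A self-contained user-space sketch of that computation and its use (the fls_long() stand-in is an assumption, not the kernel helper):

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the kernel's fls_long(): 1-based position of
 * the highest set bit, 0 for an input of 0.
 */
static unsigned int fls_long(unsigned long x)
{
	return x ? 64 - __builtin_clzl(x) : 0;
}

int main(void)
{
	uint32_t max_entries = 0x80000001u;	/* top bit set: a u32 roundup would overflow */
	uint64_t mask64;
	uint32_t index_mask;

	/* Round up to the next power of two, minus one, entirely in u64 space. */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;		/* 0x100000000, only representable in 64 bits */
	mask64 -= 1;
	index_mask = mask64;			/* 0xffffffff */

	printf("index_mask = %#x\n", index_mask);

	/* Any index, in or out of bounds, stays inside the rounded-up array
	 * even if it is consumed under speculation before the bounds check retires.
	 */
	uint32_t user_index = 0xdeadbeef;
	printf("clamped    = %#x\n", user_index & index_mask);
	return 0;
}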
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b9f8686a84cf..7949e8b8f94e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
767 | } | 767 | } |
768 | EXPORT_SYMBOL_GPL(__bpf_call_base); | 768 | EXPORT_SYMBOL_GPL(__bpf_call_base); |
769 | 769 | ||
770 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON | ||
770 | /** | 771 | /** |
771 | * __bpf_prog_run - run eBPF program on a given context | 772 | * __bpf_prog_run - run eBPF program on a given context |
772 | * @ctx: is the data we are operating on | 773 | * @ctx: is the data we are operating on |
@@ -955,7 +956,7 @@ select_insn: | |||
955 | DST = tmp; | 956 | DST = tmp; |
956 | CONT; | 957 | CONT; |
957 | ALU_MOD_X: | 958 | ALU_MOD_X: |
958 | if (unlikely(SRC == 0)) | 959 | if (unlikely((u32)SRC == 0)) |
959 | return 0; | 960 | return 0; |
960 | tmp = (u32) DST; | 961 | tmp = (u32) DST; |
961 | DST = do_div(tmp, (u32) SRC); | 962 | DST = do_div(tmp, (u32) SRC); |
@@ -974,7 +975,7 @@ select_insn: | |||
974 | DST = div64_u64(DST, SRC); | 975 | DST = div64_u64(DST, SRC); |
975 | CONT; | 976 | CONT; |
976 | ALU_DIV_X: | 977 | ALU_DIV_X: |
977 | if (unlikely(SRC == 0)) | 978 | if (unlikely((u32)SRC == 0)) |
978 | return 0; | 979 | return 0; |
979 | tmp = (u32) DST; | 980 | tmp = (u32) DST; |
980 | do_div(tmp, (u32) SRC); | 981 | do_div(tmp, (u32) SRC); |
@@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) | |||
1317 | EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) | 1318 | EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) |
1318 | }; | 1319 | }; |
1319 | 1320 | ||
1321 | #else | ||
1322 | static unsigned int __bpf_prog_ret0(const void *ctx, | ||
1323 | const struct bpf_insn *insn) | ||
1324 | { | ||
1325 | return 0; | ||
1326 | } | ||
1327 | #endif | ||
1328 | |||
1320 | bool bpf_prog_array_compatible(struct bpf_array *array, | 1329 | bool bpf_prog_array_compatible(struct bpf_array *array, |
1321 | const struct bpf_prog *fp) | 1330 | const struct bpf_prog *fp) |
1322 | { | 1331 | { |
@@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) | |||
1364 | */ | 1373 | */ |
1365 | struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) | 1374 | struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) |
1366 | { | 1375 | { |
1376 | #ifndef CONFIG_BPF_JIT_ALWAYS_ON | ||
1367 | u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); | 1377 | u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); |
1368 | 1378 | ||
1369 | fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; | 1379 | fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; |
1380 | #else | ||
1381 | fp->bpf_func = __bpf_prog_ret0; | ||
1382 | #endif | ||
1370 | 1383 | ||
1371 | /* eBPF JITs can rewrite the program in case constant | 1384 | /* eBPF JITs can rewrite the program in case constant |
1372 | * blinding is active. However, in case of error during | 1385 | * blinding is active. However, in case of error during |
@@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) | |||
1376 | */ | 1389 | */ |
1377 | if (!bpf_prog_is_dev_bound(fp->aux)) { | 1390 | if (!bpf_prog_is_dev_bound(fp->aux)) { |
1378 | fp = bpf_int_jit_compile(fp); | 1391 | fp = bpf_int_jit_compile(fp); |
1392 | #ifdef CONFIG_BPF_JIT_ALWAYS_ON | ||
1393 | if (!fp->jited) { | ||
1394 | *err = -ENOTSUPP; | ||
1395 | return fp; | ||
1396 | } | ||
1397 | #endif | ||
1379 | } else { | 1398 | } else { |
1380 | *err = bpf_prog_offload_compile(fp); | 1399 | *err = bpf_prog_offload_compile(fp); |
1381 | if (*err) | 1400 | if (*err) |
@@ -1447,7 +1466,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) | |||
1447 | rcu_read_lock(); | 1466 | rcu_read_lock(); |
1448 | prog = rcu_dereference(progs)->progs; | 1467 | prog = rcu_dereference(progs)->progs; |
1449 | for (; *prog; prog++) | 1468 | for (; *prog; prog++) |
1450 | cnt++; | 1469 | if (*prog != &dummy_bpf_prog.prog) |
1470 | cnt++; | ||
1451 | rcu_read_unlock(); | 1471 | rcu_read_unlock(); |
1452 | return cnt; | 1472 | return cnt; |
1453 | } | 1473 | } |
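The interpreter hunks above tighten the 32-bit divide and modulo guards: the operands are consumed as (u32), so the zero check has to look at the low 32 bits only, otherwise a source register such as 0x100000000 would pass the old "SRC == 0" test and still hand do_div() a zero divisor. A tiny sketch of the distinction (plain user-space C, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t DST = 100;
	uint64_t SRC = 0x100000000ull;	/* non-zero as u64, zero in the low 32 bits */

	/* The old check only looked at the full 64-bit value: it passes here. */
	printf("SRC == 0       -> %d\n", SRC == 0);

	/* The new check looks at the low 32 bits that the division actually uses. */
	printf("(u32)SRC == 0  -> %d\n", (uint32_t)SRC == 0);

	if ((uint32_t)SRC != 0)
		printf("quotient = %llu\n",
		       (unsigned long long)((uint32_t)DST / (uint32_t)SRC));
	else
		printf("division skipped, the program would return 0\n");
	return 0;
}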
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index e469e05c8e83..3905d4bc5b80 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) | |||
114 | pptr = htab_elem_get_ptr(get_htab_elem(htab, i), | 114 | pptr = htab_elem_get_ptr(get_htab_elem(htab, i), |
115 | htab->map.key_size); | 115 | htab->map.key_size); |
116 | free_percpu(pptr); | 116 | free_percpu(pptr); |
117 | cond_resched(); | ||
117 | } | 118 | } |
118 | free_elems: | 119 | free_elems: |
119 | bpf_map_area_free(htab->elems); | 120 | bpf_map_area_free(htab->elems); |
@@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) | |||
159 | goto free_elems; | 160 | goto free_elems; |
160 | htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, | 161 | htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, |
161 | pptr); | 162 | pptr); |
163 | cond_resched(); | ||
162 | } | 164 | } |
163 | 165 | ||
164 | skip_percpu_elems: | 166 | skip_percpu_elems: |
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 01aaef1a77c5..5bb5e49ef4c3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -368,7 +368,45 @@ out: | |||
368 | putname(pname); | 368 | putname(pname); |
369 | return ret; | 369 | return ret; |
370 | } | 370 | } |
371 | EXPORT_SYMBOL_GPL(bpf_obj_get_user); | 371 | |
372 | static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) | ||
373 | { | ||
374 | struct bpf_prog *prog; | ||
375 | int ret = inode_permission(inode, MAY_READ | MAY_WRITE); | ||
376 | if (ret) | ||
377 | return ERR_PTR(ret); | ||
378 | |||
379 | if (inode->i_op == &bpf_map_iops) | ||
380 | return ERR_PTR(-EINVAL); | ||
381 | if (inode->i_op != &bpf_prog_iops) | ||
382 | return ERR_PTR(-EACCES); | ||
383 | |||
384 | prog = inode->i_private; | ||
385 | |||
386 | ret = security_bpf_prog(prog); | ||
387 | if (ret < 0) | ||
388 | return ERR_PTR(ret); | ||
389 | |||
390 | if (!bpf_prog_get_ok(prog, &type, false)) | ||
391 | return ERR_PTR(-EINVAL); | ||
392 | |||
393 | return bpf_prog_inc(prog); | ||
394 | } | ||
395 | |||
396 | struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) | ||
397 | { | ||
398 | struct bpf_prog *prog; | ||
399 | struct path path; | ||
400 | int ret = kern_path(name, LOOKUP_FOLLOW, &path); | ||
401 | if (ret) | ||
402 | return ERR_PTR(ret); | ||
403 | prog = __get_prog_inode(d_backing_inode(path.dentry), type); | ||
404 | if (!IS_ERR(prog)) | ||
405 | touch_atime(&path); | ||
406 | path_put(&path); | ||
407 | return prog; | ||
408 | } | ||
409 | EXPORT_SYMBOL(bpf_prog_get_type_path); | ||
372 | 410 | ||
373 | static void bpf_evict_inode(struct inode *inode) | 411 | static void bpf_evict_inode(struct inode *inode) |
374 | { | 412 | { |
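The new bpf_prog_get_type_path() walks a bpffs path, verifies that the inode pins a program rather than a map, checks the expected program type, and returns a reference the caller must drop. A hedged sketch of how an in-kernel caller might use it; the surrounding function and its purpose are hypothetical, only the two BPF calls come from the diff above:

#include <linux/bpf.h>
#include <linux/err.h>

/* Hypothetical caller: look up a pinned socket-filter program by path.
 * bpf_prog_put() releases the reference taken by bpf_prog_get_type_path().
 */
static int attach_pinned_filter(const char *path)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type_path(path, BPF_PROG_TYPE_SOCKET_FILTER);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	/* ... install prog somewhere ... */

	bpf_prog_put(prog);
	return 0;
}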
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 68ec884440b7..8455b89d1bbf 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -1,3 +1,18 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2017 Netronome Systems, Inc. | ||
3 | * | ||
4 | * This software is licensed under the GNU General License Version 2, | ||
5 | * June 1991 as shown in the file COPYING in the top-level directory of this | ||
6 | * source tree. | ||
7 | * | ||
8 | * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" | ||
9 | * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, | ||
10 | * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | ||
11 | * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE | ||
12 | * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME | ||
13 | * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. | ||
14 | */ | ||
15 | |||
1 | #include <linux/bpf.h> | 16 | #include <linux/bpf.h> |
2 | #include <linux/bpf_verifier.h> | 17 | #include <linux/bpf_verifier.h> |
3 | #include <linux/bug.h> | 18 | #include <linux/bug.h> |
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 5ee2e41893d9..1712d319c2d8 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) | |||
591 | 591 | ||
592 | write_lock_bh(&sock->sk_callback_lock); | 592 | write_lock_bh(&sock->sk_callback_lock); |
593 | psock = smap_psock_sk(sock); | 593 | psock = smap_psock_sk(sock); |
594 | smap_list_remove(psock, &stab->sock_map[i]); | 594 | /* This check handles a racing sock event that can get the |
595 | smap_release_sock(psock, sock); | 595 | * sk_callback_lock before this case but after xchg happens |
596 | * causing the refcnt to hit zero and sock user data (psock) | ||
597 | * to be null and queued for garbage collection. | ||
598 | */ | ||
599 | if (likely(psock)) { | ||
600 | smap_list_remove(psock, &stab->sock_map[i]); | ||
601 | smap_release_sock(psock, sock); | ||
602 | } | ||
596 | write_unlock_bh(&sock->sk_callback_lock); | 603 | write_unlock_bh(&sock->sk_callback_lock); |
597 | } | 604 | } |
598 | rcu_read_unlock(); | 605 | rcu_read_unlock(); |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c4cfeaa8d5e..5cb783fc8224 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) | |||
1057 | } | 1057 | } |
1058 | EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); | 1058 | EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); |
1059 | 1059 | ||
1060 | static bool bpf_prog_get_ok(struct bpf_prog *prog, | 1060 | bool bpf_prog_get_ok(struct bpf_prog *prog, |
1061 | enum bpf_prog_type *attach_type, bool attach_drv) | 1061 | enum bpf_prog_type *attach_type, bool attach_drv) |
1062 | { | 1062 | { |
1063 | /* not an attachment, just a refcount inc, always allow */ | 1063 | /* not an attachment, just a refcount inc, always allow */ |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d4593571c404..13551e623501 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -978,6 +978,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) | |||
978 | return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); | 978 | return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); |
979 | } | 979 | } |
980 | 980 | ||
981 | static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) | ||
982 | { | ||
983 | const struct bpf_reg_state *reg = cur_regs(env) + regno; | ||
984 | |||
985 | return reg->type == PTR_TO_CTX; | ||
986 | } | ||
987 | |||
981 | static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, | 988 | static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, |
982 | const struct bpf_reg_state *reg, | 989 | const struct bpf_reg_state *reg, |
983 | int off, int size, bool strict) | 990 | int off, int size, bool strict) |
@@ -1059,6 +1066,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, | |||
1059 | break; | 1066 | break; |
1060 | case PTR_TO_STACK: | 1067 | case PTR_TO_STACK: |
1061 | pointer_desc = "stack "; | 1068 | pointer_desc = "stack "; |
1069 | /* The stack spill tracking logic in check_stack_write() | ||
1070 | * and check_stack_read() relies on stack accesses being | ||
1071 | * aligned. | ||
1072 | */ | ||
1073 | strict = true; | ||
1062 | break; | 1074 | break; |
1063 | default: | 1075 | default: |
1064 | break; | 1076 | break; |
@@ -1067,6 +1079,29 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, | |||
1067 | strict); | 1079 | strict); |
1068 | } | 1080 | } |
1069 | 1081 | ||
1082 | /* truncate register to smaller size (in bytes) | ||
1083 | * must be called with size < BPF_REG_SIZE | ||
1084 | */ | ||
1085 | static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) | ||
1086 | { | ||
1087 | u64 mask; | ||
1088 | |||
1089 | /* clear high bits in bit representation */ | ||
1090 | reg->var_off = tnum_cast(reg->var_off, size); | ||
1091 | |||
1092 | /* fix arithmetic bounds */ | ||
1093 | mask = ((u64)1 << (size * 8)) - 1; | ||
1094 | if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { | ||
1095 | reg->umin_value &= mask; | ||
1096 | reg->umax_value &= mask; | ||
1097 | } else { | ||
1098 | reg->umin_value = 0; | ||
1099 | reg->umax_value = mask; | ||
1100 | } | ||
1101 | reg->smin_value = reg->umin_value; | ||
1102 | reg->smax_value = reg->umax_value; | ||
1103 | } | ||
1104 | |||
1070 | /* check whether memory at (regno + off) is accessible for t = (read | write) | 1105 | /* check whether memory at (regno + off) is accessible for t = (read | write) |
1071 | * if t==write, value_regno is a register which value is stored into memory | 1106 | * if t==write, value_regno is a register which value is stored into memory |
1072 | * if t==read, value_regno is a register which will receive the value from memory | 1107 | * if t==read, value_regno is a register which will receive the value from memory |
@@ -1200,9 +1235,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn | |||
1200 | if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && | 1235 | if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && |
1201 | regs[value_regno].type == SCALAR_VALUE) { | 1236 | regs[value_regno].type == SCALAR_VALUE) { |
1202 | /* b/h/w load zero-extends, mark upper bits as known 0 */ | 1237 | /* b/h/w load zero-extends, mark upper bits as known 0 */ |
1203 | regs[value_regno].var_off = | 1238 | coerce_reg_to_size(®s[value_regno], size); |
1204 | tnum_cast(regs[value_regno].var_off, size); | ||
1205 | __update_reg_bounds(®s[value_regno]); | ||
1206 | } | 1239 | } |
1207 | return err; | 1240 | return err; |
1208 | } | 1241 | } |
@@ -1232,6 +1265,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins | |||
1232 | return -EACCES; | 1265 | return -EACCES; |
1233 | } | 1266 | } |
1234 | 1267 | ||
1268 | if (is_ctx_reg(env, insn->dst_reg)) { | ||
1269 | verbose(env, "BPF_XADD stores into R%d context is not allowed\n", | ||
1270 | insn->dst_reg); | ||
1271 | return -EACCES; | ||
1272 | } | ||
1273 | |||
1235 | /* check whether atomic_add can read the memory */ | 1274 | /* check whether atomic_add can read the memory */ |
1236 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, | 1275 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
1237 | BPF_SIZE(insn->code), BPF_READ, -1); | 1276 | BPF_SIZE(insn->code), BPF_READ, -1); |
@@ -1282,6 +1321,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
1282 | tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); | 1321 | tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); |
1283 | verbose(env, "invalid variable stack read R%d var_off=%s\n", | 1322 | verbose(env, "invalid variable stack read R%d var_off=%s\n", |
1284 | regno, tn_buf); | 1323 | regno, tn_buf); |
1324 | return -EACCES; | ||
1285 | } | 1325 | } |
1286 | off = regs[regno].off + regs[regno].var_off.value; | 1326 | off = regs[regno].off + regs[regno].var_off.value; |
1287 | if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || | 1327 | if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || |
@@ -1674,7 +1714,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
1674 | return -EINVAL; | 1714 | return -EINVAL; |
1675 | } | 1715 | } |
1676 | 1716 | ||
1717 | /* With LD_ABS/IND some JITs save/restore skb from r1. */ | ||
1677 | changes_data = bpf_helper_changes_pkt_data(fn->func); | 1718 | changes_data = bpf_helper_changes_pkt_data(fn->func); |
1719 | if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { | ||
1720 | verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", | ||
1721 | func_id_name(func_id), func_id); | ||
1722 | return -EINVAL; | ||
1723 | } | ||
1678 | 1724 | ||
1679 | memset(&meta, 0, sizeof(meta)); | 1725 | memset(&meta, 0, sizeof(meta)); |
1680 | meta.pkt_access = fn->pkt_access; | 1726 | meta.pkt_access = fn->pkt_access; |
@@ -1696,6 +1742,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
1696 | err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); | 1742 | err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); |
1697 | if (err) | 1743 | if (err) |
1698 | return err; | 1744 | return err; |
1745 | if (func_id == BPF_FUNC_tail_call) { | ||
1746 | if (meta.map_ptr == NULL) { | ||
1747 | verbose(env, "verifier bug\n"); | ||
1748 | return -EINVAL; | ||
1749 | } | ||
1750 | env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; | ||
1751 | } | ||
1699 | err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); | 1752 | err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); |
1700 | if (err) | 1753 | if (err) |
1701 | return err; | 1754 | return err; |
@@ -1766,14 +1819,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) | |||
1766 | return 0; | 1819 | return 0; |
1767 | } | 1820 | } |
1768 | 1821 | ||
1769 | static void coerce_reg_to_32(struct bpf_reg_state *reg) | ||
1770 | { | ||
1771 | /* clear high 32 bits */ | ||
1772 | reg->var_off = tnum_cast(reg->var_off, 4); | ||
1773 | /* Update bounds */ | ||
1774 | __update_reg_bounds(reg); | ||
1775 | } | ||
1776 | |||
1777 | static bool signed_add_overflows(s64 a, s64 b) | 1822 | static bool signed_add_overflows(s64 a, s64 b) |
1778 | { | 1823 | { |
1779 | /* Do the add in u64, where overflow is well-defined */ | 1824 | /* Do the add in u64, where overflow is well-defined */ |
@@ -1794,6 +1839,41 @@ static bool signed_sub_overflows(s64 a, s64 b) | |||
1794 | return res > a; | 1839 | return res > a; |
1795 | } | 1840 | } |
1796 | 1841 | ||
1842 | static bool check_reg_sane_offset(struct bpf_verifier_env *env, | ||
1843 | const struct bpf_reg_state *reg, | ||
1844 | enum bpf_reg_type type) | ||
1845 | { | ||
1846 | bool known = tnum_is_const(reg->var_off); | ||
1847 | s64 val = reg->var_off.value; | ||
1848 | s64 smin = reg->smin_value; | ||
1849 | |||
1850 | if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { | ||
1851 | verbose(env, "math between %s pointer and %lld is not allowed\n", | ||
1852 | reg_type_str[type], val); | ||
1853 | return false; | ||
1854 | } | ||
1855 | |||
1856 | if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { | ||
1857 | verbose(env, "%s pointer offset %d is not allowed\n", | ||
1858 | reg_type_str[type], reg->off); | ||
1859 | return false; | ||
1860 | } | ||
1861 | |||
1862 | if (smin == S64_MIN) { | ||
1863 | verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", | ||
1864 | reg_type_str[type]); | ||
1865 | return false; | ||
1866 | } | ||
1867 | |||
1868 | if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { | ||
1869 | verbose(env, "value %lld makes %s pointer be out of bounds\n", | ||
1870 | smin, reg_type_str[type]); | ||
1871 | return false; | ||
1872 | } | ||
1873 | |||
1874 | return true; | ||
1875 | } | ||
1876 | |||
1797 | /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. | 1877 | /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. |
1798 | * Caller should also handle BPF_MOV case separately. | 1878 | * Caller should also handle BPF_MOV case separately. |
1799 | * If we return -EACCES, caller may want to try again treating pointer as a | 1879 | * If we return -EACCES, caller may want to try again treating pointer as a |
@@ -1815,44 +1895,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, | |||
1815 | 1895 | ||
1816 | dst_reg = ®s[dst]; | 1896 | dst_reg = ®s[dst]; |
1817 | 1897 | ||
1818 | if (WARN_ON_ONCE(known && (smin_val != smax_val))) { | 1898 | if ((known && (smin_val != smax_val || umin_val != umax_val)) || |
1819 | print_verifier_state(env, env->cur_state); | 1899 | smin_val > smax_val || umin_val > umax_val) { |
1820 | verbose(env, | 1900 | /* Taint dst register if offset had invalid bounds derived from |
1821 | "verifier internal error: known but bad sbounds\n"); | 1901 | * e.g. dead branches. |
1822 | return -EINVAL; | 1902 | */ |
1823 | } | 1903 | __mark_reg_unknown(dst_reg); |
1824 | if (WARN_ON_ONCE(known && (umin_val != umax_val))) { | 1904 | return 0; |
1825 | print_verifier_state(env, env->cur_state); | ||
1826 | verbose(env, | ||
1827 | "verifier internal error: known but bad ubounds\n"); | ||
1828 | return -EINVAL; | ||
1829 | } | 1905 | } |
1830 | 1906 | ||
1831 | if (BPF_CLASS(insn->code) != BPF_ALU64) { | 1907 | if (BPF_CLASS(insn->code) != BPF_ALU64) { |
1832 | /* 32-bit ALU ops on pointers produce (meaningless) scalars */ | 1908 | /* 32-bit ALU ops on pointers produce (meaningless) scalars */ |
1833 | if (!env->allow_ptr_leaks) | 1909 | verbose(env, |
1834 | verbose(env, | 1910 | "R%d 32-bit pointer arithmetic prohibited\n", |
1835 | "R%d 32-bit pointer arithmetic prohibited\n", | 1911 | dst); |
1836 | dst); | ||
1837 | return -EACCES; | 1912 | return -EACCES; |
1838 | } | 1913 | } |
1839 | 1914 | ||
1840 | if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { | 1915 | if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { |
1841 | if (!env->allow_ptr_leaks) | 1916 | verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", |
1842 | verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", | 1917 | dst); |
1843 | dst); | ||
1844 | return -EACCES; | 1918 | return -EACCES; |
1845 | } | 1919 | } |
1846 | if (ptr_reg->type == CONST_PTR_TO_MAP) { | 1920 | if (ptr_reg->type == CONST_PTR_TO_MAP) { |
1847 | if (!env->allow_ptr_leaks) | 1921 | verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", |
1848 | verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", | 1922 | dst); |
1849 | dst); | ||
1850 | return -EACCES; | 1923 | return -EACCES; |
1851 | } | 1924 | } |
1852 | if (ptr_reg->type == PTR_TO_PACKET_END) { | 1925 | if (ptr_reg->type == PTR_TO_PACKET_END) { |
1853 | if (!env->allow_ptr_leaks) | 1926 | verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", |
1854 | verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", | 1927 | dst); |
1855 | dst); | ||
1856 | return -EACCES; | 1928 | return -EACCES; |
1857 | } | 1929 | } |
1858 | 1930 | ||
@@ -1862,6 +1934,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, | |||
1862 | dst_reg->type = ptr_reg->type; | 1934 | dst_reg->type = ptr_reg->type; |
1863 | dst_reg->id = ptr_reg->id; | 1935 | dst_reg->id = ptr_reg->id; |
1864 | 1936 | ||
1937 | if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || | ||
1938 | !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) | ||
1939 | return -EINVAL; | ||
1940 | |||
1865 | switch (opcode) { | 1941 | switch (opcode) { |
1866 | case BPF_ADD: | 1942 | case BPF_ADD: |
1867 | /* We can take a fixed offset as long as it doesn't overflow | 1943 | /* We can take a fixed offset as long as it doesn't overflow |
@@ -1915,9 +1991,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, | |||
1915 | case BPF_SUB: | 1991 | case BPF_SUB: |
1916 | if (dst_reg == off_reg) { | 1992 | if (dst_reg == off_reg) { |
1917 | /* scalar -= pointer. Creates an unknown scalar */ | 1993 | /* scalar -= pointer. Creates an unknown scalar */ |
1918 | if (!env->allow_ptr_leaks) | 1994 | verbose(env, "R%d tried to subtract pointer from scalar\n", |
1919 | verbose(env, "R%d tried to subtract pointer from scalar\n", | 1995 | dst); |
1920 | dst); | ||
1921 | return -EACCES; | 1996 | return -EACCES; |
1922 | } | 1997 | } |
1923 | /* We don't allow subtraction from FP, because (according to | 1998 | /* We don't allow subtraction from FP, because (according to |
@@ -1925,9 +2000,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, | |||
1925 | * be able to deal with it. | 2000 | * be able to deal with it. |
1926 | */ | 2001 | */ |
1927 | if (ptr_reg->type == PTR_TO_STACK) { | 2002 | if (ptr_reg->type == PTR_TO_STACK) { |
1928 | if (!env->allow_ptr_leaks) | 2003 | verbose(env, "R%d subtraction from stack pointer prohibited\n", |
1929 | verbose(env, "R%d subtraction from stack pointer prohibited\n", | 2004 | dst); |
1930 | dst); | ||
1931 | return -EACCES; | 2005 | return -EACCES; |
1932 | } | 2006 | } |
1933 | if (known && (ptr_reg->off - smin_val == | 2007 | if (known && (ptr_reg->off - smin_val == |
@@ -1976,28 +2050,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, | |||
1976 | case BPF_AND: | 2050 | case BPF_AND: |
1977 | case BPF_OR: | 2051 | case BPF_OR: |
1978 | case BPF_XOR: | 2052 | case BPF_XOR: |
1979 | /* bitwise ops on pointers are troublesome, prohibit for now. | 2053 | /* bitwise ops on pointers are troublesome, prohibit. */ |
1980 | * (However, in principle we could allow some cases, e.g. | 2054 | verbose(env, "R%d bitwise operator %s on pointer prohibited\n", |
1981 | * ptr &= ~3 which would reduce min_value by 3.) | 2055 | dst, bpf_alu_string[opcode >> 4]); |
1982 | */ | ||
1983 | if (!env->allow_ptr_leaks) | ||
1984 | verbose(env, "R%d bitwise operator %s on pointer prohibited\n", | ||
1985 | dst, bpf_alu_string[opcode >> 4]); | ||
1986 | return -EACCES; | 2056 | return -EACCES; |
1987 | default: | 2057 | default: |
1988 | /* other operators (e.g. MUL,LSH) produce non-pointer results */ | 2058 | /* other operators (e.g. MUL,LSH) produce non-pointer results */ |
1989 | if (!env->allow_ptr_leaks) | 2059 | verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", |
1990 | verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", | 2060 | dst, bpf_alu_string[opcode >> 4]); |
1991 | dst, bpf_alu_string[opcode >> 4]); | ||
1992 | return -EACCES; | 2061 | return -EACCES; |
1993 | } | 2062 | } |
1994 | 2063 | ||
2064 | if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) | ||
2065 | return -EINVAL; | ||
2066 | |||
1995 | __update_reg_bounds(dst_reg); | 2067 | __update_reg_bounds(dst_reg); |
1996 | __reg_deduce_bounds(dst_reg); | 2068 | __reg_deduce_bounds(dst_reg); |
1997 | __reg_bound_offset(dst_reg); | 2069 | __reg_bound_offset(dst_reg); |
1998 | return 0; | 2070 | return 0; |
1999 | } | 2071 | } |
2000 | 2072 | ||
2073 | /* WARNING: This function does calculations on 64-bit values, but the actual | ||
2074 | * execution may occur on 32-bit values. Therefore, things like bitshifts | ||
2075 | * need extra checks in the 32-bit case. | ||
2076 | */ | ||
2001 | static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | 2077 | static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, |
2002 | struct bpf_insn *insn, | 2078 | struct bpf_insn *insn, |
2003 | struct bpf_reg_state *dst_reg, | 2079 | struct bpf_reg_state *dst_reg, |
@@ -2008,12 +2084,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
2008 | bool src_known, dst_known; | 2084 | bool src_known, dst_known; |
2009 | s64 smin_val, smax_val; | 2085 | s64 smin_val, smax_val; |
2010 | u64 umin_val, umax_val; | 2086 | u64 umin_val, umax_val; |
2087 | u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; | ||
2011 | 2088 | ||
2012 | if (BPF_CLASS(insn->code) != BPF_ALU64) { | ||
2013 | /* 32-bit ALU ops are (32,32)->64 */ | ||
2014 | coerce_reg_to_32(dst_reg); | ||
2015 | coerce_reg_to_32(&src_reg); | ||
2016 | } | ||
2017 | smin_val = src_reg.smin_value; | 2089 | smin_val = src_reg.smin_value; |
2018 | smax_val = src_reg.smax_value; | 2090 | smax_val = src_reg.smax_value; |
2019 | umin_val = src_reg.umin_value; | 2091 | umin_val = src_reg.umin_value; |
@@ -2021,6 +2093,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
2021 | src_known = tnum_is_const(src_reg.var_off); | 2093 | src_known = tnum_is_const(src_reg.var_off); |
2022 | dst_known = tnum_is_const(dst_reg->var_off); | 2094 | dst_known = tnum_is_const(dst_reg->var_off); |
2023 | 2095 | ||
2096 | if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || | ||
2097 | smin_val > smax_val || umin_val > umax_val) { | ||
2098 | /* Taint dst register if offset had invalid bounds derived from | ||
2099 | * e.g. dead branches. | ||
2100 | */ | ||
2101 | __mark_reg_unknown(dst_reg); | ||
2102 | return 0; | ||
2103 | } | ||
2104 | |||
2105 | if (!src_known && | ||
2106 | opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { | ||
2107 | __mark_reg_unknown(dst_reg); | ||
2108 | return 0; | ||
2109 | } | ||
2110 | |||
2024 | switch (opcode) { | 2111 | switch (opcode) { |
2025 | case BPF_ADD: | 2112 | case BPF_ADD: |
2026 | if (signed_add_overflows(dst_reg->smin_value, smin_val) || | 2113 | if (signed_add_overflows(dst_reg->smin_value, smin_val) || |
@@ -2149,9 +2236,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
2149 | __update_reg_bounds(dst_reg); | 2236 | __update_reg_bounds(dst_reg); |
2150 | break; | 2237 | break; |
2151 | case BPF_LSH: | 2238 | case BPF_LSH: |
2152 | if (umax_val > 63) { | 2239 | if (umax_val >= insn_bitness) { |
2153 | /* Shifts greater than 63 are undefined. This includes | 2240 | /* Shifts greater than 31 or 63 are undefined. |
2154 | * shifts by a negative number. | 2241 | * This includes shifts by a negative number. |
2155 | */ | 2242 | */ |
2156 | mark_reg_unknown(env, regs, insn->dst_reg); | 2243 | mark_reg_unknown(env, regs, insn->dst_reg); |
2157 | break; | 2244 | break; |
@@ -2177,27 +2264,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
2177 | __update_reg_bounds(dst_reg); | 2264 | __update_reg_bounds(dst_reg); |
2178 | break; | 2265 | break; |
2179 | case BPF_RSH: | 2266 | case BPF_RSH: |
2180 | if (umax_val > 63) { | 2267 | if (umax_val >= insn_bitness) { |
2181 | /* Shifts greater than 63 are undefined. This includes | 2268 | /* Shifts greater than 31 or 63 are undefined. |
2182 | * shifts by a negative number. | 2269 | * This includes shifts by a negative number. |
2183 | */ | 2270 | */ |
2184 | mark_reg_unknown(env, regs, insn->dst_reg); | 2271 | mark_reg_unknown(env, regs, insn->dst_reg); |
2185 | break; | 2272 | break; |
2186 | } | 2273 | } |
2187 | /* BPF_RSH is an unsigned shift, so make the appropriate casts */ | 2274 | /* BPF_RSH is an unsigned shift. If the value in dst_reg might |
2188 | if (dst_reg->smin_value < 0) { | 2275 | * be negative, then either: |
2189 | if (umin_val) { | 2276 | * 1) src_reg might be zero, so the sign bit of the result is |
2190 | /* Sign bit will be cleared */ | 2277 | * unknown, so we lose our signed bounds |
2191 | dst_reg->smin_value = 0; | 2278 | * 2) it's known negative, thus the unsigned bounds capture the |
2192 | } else { | 2279 | * signed bounds |
2193 | /* Lost sign bit information */ | 2280 | * 3) the signed bounds cross zero, so they tell us nothing |
2194 | dst_reg->smin_value = S64_MIN; | 2281 | * about the result |
2195 | dst_reg->smax_value = S64_MAX; | 2282 | * If the value in dst_reg is known nonnegative, then again the |
2196 | } | 2283 | * unsigned bounts capture the signed bounds. |
2197 | } else { | 2284 | * Thus, in all cases it suffices to blow away our signed bounds |
2198 | dst_reg->smin_value = | 2285 | * and rely on inferring new ones from the unsigned bounds and |
2199 | (u64)(dst_reg->smin_value) >> umax_val; | 2286 | * var_off of the result. |
2200 | } | 2287 | */ |
2288 | dst_reg->smin_value = S64_MIN; | ||
2289 | dst_reg->smax_value = S64_MAX; | ||
2201 | if (src_known) | 2290 | if (src_known) |
2202 | dst_reg->var_off = tnum_rshift(dst_reg->var_off, | 2291 | dst_reg->var_off = tnum_rshift(dst_reg->var_off, |
2203 | umin_val); | 2292 | umin_val); |
@@ -2213,6 +2302,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, | |||
2213 | break; | 2302 | break; |
2214 | } | 2303 | } |
2215 | 2304 | ||
2305 | if (BPF_CLASS(insn->code) != BPF_ALU64) { | ||
2306 | /* 32-bit ALU ops are (32,32)->32 */ | ||
2307 | coerce_reg_to_size(dst_reg, 4); | ||
2308 | coerce_reg_to_size(&src_reg, 4); | ||
2309 | } | ||
2310 | |||
2216 | __reg_deduce_bounds(dst_reg); | 2311 | __reg_deduce_bounds(dst_reg); |
2217 | __reg_bound_offset(dst_reg); | 2312 | __reg_bound_offset(dst_reg); |
2218 | return 0; | 2313 | return 0; |
@@ -2227,7 +2322,6 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
2227 | struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; | 2322 | struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; |
2228 | struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; | 2323 | struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; |
2229 | u8 opcode = BPF_OP(insn->code); | 2324 | u8 opcode = BPF_OP(insn->code); |
2230 | int rc; | ||
2231 | 2325 | ||
2232 | dst_reg = ®s[insn->dst_reg]; | 2326 | dst_reg = ®s[insn->dst_reg]; |
2233 | src_reg = NULL; | 2327 | src_reg = NULL; |
@@ -2238,43 +2332,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
2238 | if (src_reg->type != SCALAR_VALUE) { | 2332 | if (src_reg->type != SCALAR_VALUE) { |
2239 | if (dst_reg->type != SCALAR_VALUE) { | 2333 | if (dst_reg->type != SCALAR_VALUE) { |
2240 | /* Combining two pointers by any ALU op yields | 2334 | /* Combining two pointers by any ALU op yields |
2241 | * an arbitrary scalar. | 2335 | * an arbitrary scalar. Disallow all math except |
2336 | * pointer subtraction | ||
2242 | */ | 2337 | */ |
2243 | if (!env->allow_ptr_leaks) { | 2338 | if (opcode == BPF_SUB){ |
2244 | verbose(env, "R%d pointer %s pointer prohibited\n", | 2339 | mark_reg_unknown(env, regs, insn->dst_reg); |
2245 | insn->dst_reg, | 2340 | return 0; |
2246 | bpf_alu_string[opcode >> 4]); | ||
2247 | return -EACCES; | ||
2248 | } | 2341 | } |
2249 | mark_reg_unknown(env, regs, insn->dst_reg); | 2342 | verbose(env, "R%d pointer %s pointer prohibited\n", |
2250 | return 0; | 2343 | insn->dst_reg, |
2344 | bpf_alu_string[opcode >> 4]); | ||
2345 | return -EACCES; | ||
2251 | } else { | 2346 | } else { |
2252 | /* scalar += pointer | 2347 | /* scalar += pointer |
2253 | * This is legal, but we have to reverse our | 2348 | * This is legal, but we have to reverse our |
2254 | * src/dest handling in computing the range | 2349 | * src/dest handling in computing the range |
2255 | */ | 2350 | */ |
2256 | rc = adjust_ptr_min_max_vals(env, insn, | 2351 | return adjust_ptr_min_max_vals(env, insn, |
2257 | src_reg, dst_reg); | 2352 | src_reg, dst_reg); |
2258 | if (rc == -EACCES && env->allow_ptr_leaks) { | ||
2259 | /* scalar += unknown scalar */ | ||
2260 | __mark_reg_unknown(&off_reg); | ||
2261 | return adjust_scalar_min_max_vals( | ||
2262 | env, insn, | ||
2263 | dst_reg, off_reg); | ||
2264 | } | ||
2265 | return rc; | ||
2266 | } | 2353 | } |
2267 | } else if (ptr_reg) { | 2354 | } else if (ptr_reg) { |
2268 | /* pointer += scalar */ | 2355 | /* pointer += scalar */ |
2269 | rc = adjust_ptr_min_max_vals(env, insn, | 2356 | return adjust_ptr_min_max_vals(env, insn, |
2270 | dst_reg, src_reg); | 2357 | dst_reg, src_reg); |
2271 | if (rc == -EACCES && env->allow_ptr_leaks) { | ||
2272 | /* unknown scalar += scalar */ | ||
2273 | __mark_reg_unknown(dst_reg); | ||
2274 | return adjust_scalar_min_max_vals( | ||
2275 | env, insn, dst_reg, *src_reg); | ||
2276 | } | ||
2277 | return rc; | ||
2278 | } | 2358 | } |
2279 | } else { | 2359 | } else { |
2280 | /* Pretend the src is a reg with a known value, since we only | 2360 | /* Pretend the src is a reg with a known value, since we only |
@@ -2283,17 +2363,9 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
2283 | off_reg.type = SCALAR_VALUE; | 2363 | off_reg.type = SCALAR_VALUE; |
2284 | __mark_reg_known(&off_reg, insn->imm); | 2364 | __mark_reg_known(&off_reg, insn->imm); |
2285 | src_reg = &off_reg; | 2365 | src_reg = &off_reg; |
2286 | if (ptr_reg) { /* pointer += K */ | 2366 | if (ptr_reg) /* pointer += K */ |
2287 | rc = adjust_ptr_min_max_vals(env, insn, | 2367 | return adjust_ptr_min_max_vals(env, insn, |
2288 | ptr_reg, src_reg); | 2368 | ptr_reg, src_reg); |
2289 | if (rc == -EACCES && env->allow_ptr_leaks) { | ||
2290 | /* unknown scalar += K */ | ||
2291 | __mark_reg_unknown(dst_reg); | ||
2292 | return adjust_scalar_min_max_vals( | ||
2293 | env, insn, dst_reg, off_reg); | ||
2294 | } | ||
2295 | return rc; | ||
2296 | } | ||
2297 | } | 2369 | } |
2298 | 2370 | ||
2299 | /* Got here implies adding two SCALAR_VALUEs */ | 2371 | /* Got here implies adding two SCALAR_VALUEs */ |
@@ -2390,17 +2462,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
2390 | return -EACCES; | 2462 | return -EACCES; |
2391 | } | 2463 | } |
2392 | mark_reg_unknown(env, regs, insn->dst_reg); | 2464 | mark_reg_unknown(env, regs, insn->dst_reg); |
2393 | /* high 32 bits are known zero. */ | 2465 | coerce_reg_to_size(®s[insn->dst_reg], 4); |
2394 | regs[insn->dst_reg].var_off = tnum_cast( | ||
2395 | regs[insn->dst_reg].var_off, 4); | ||
2396 | __update_reg_bounds(®s[insn->dst_reg]); | ||
2397 | } | 2466 | } |
2398 | } else { | 2467 | } else { |
2399 | /* case: R = imm | 2468 | /* case: R = imm |
2400 | * remember the value we stored into this reg | 2469 | * remember the value we stored into this reg |
2401 | */ | 2470 | */ |
2402 | regs[insn->dst_reg].type = SCALAR_VALUE; | 2471 | regs[insn->dst_reg].type = SCALAR_VALUE; |
2403 | __mark_reg_known(regs + insn->dst_reg, insn->imm); | 2472 | if (BPF_CLASS(insn->code) == BPF_ALU64) { |
2473 | __mark_reg_known(regs + insn->dst_reg, | ||
2474 | insn->imm); | ||
2475 | } else { | ||
2476 | __mark_reg_known(regs + insn->dst_reg, | ||
2477 | (u32)insn->imm); | ||
2478 | } | ||
2404 | } | 2479 | } |
2405 | 2480 | ||
2406 | } else if (opcode > BPF_END) { | 2481 | } else if (opcode > BPF_END) { |
@@ -2436,6 +2511,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
2436 | return -EINVAL; | 2511 | return -EINVAL; |
2437 | } | 2512 | } |
2438 | 2513 | ||
2514 | if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { | ||
2515 | verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); | ||
2516 | return -EINVAL; | ||
2517 | } | ||
2518 | |||
2439 | if ((opcode == BPF_LSH || opcode == BPF_RSH || | 2519 | if ((opcode == BPF_LSH || opcode == BPF_RSH || |
2440 | opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { | 2520 | opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { |
2441 | int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; | 2521 | int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; |
@@ -3431,15 +3511,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | |||
3431 | return range_within(rold, rcur) && | 3511 | return range_within(rold, rcur) && |
3432 | tnum_in(rold->var_off, rcur->var_off); | 3512 | tnum_in(rold->var_off, rcur->var_off); |
3433 | } else { | 3513 | } else { |
3434 | /* if we knew anything about the old value, we're not | 3514 | /* We're trying to use a pointer in place of a scalar. |
3435 | * equal, because we can't know anything about the | 3515 | * Even if the scalar was unbounded, this could lead to |
3436 | * scalar value of the pointer in the new value. | 3516 | * pointer leaks because scalars are allowed to leak |
3517 | * while pointers are not. We could make this safe in | ||
3518 | * special cases if root is calling us, but it's | ||
3519 | * probably not worth the hassle. | ||
3437 | */ | 3520 | */ |
3438 | return rold->umin_value == 0 && | 3521 | return false; |
3439 | rold->umax_value == U64_MAX && | ||
3440 | rold->smin_value == S64_MIN && | ||
3441 | rold->smax_value == S64_MAX && | ||
3442 | tnum_is_unknown(rold->var_off); | ||
3443 | } | 3522 | } |
3444 | case PTR_TO_MAP_VALUE: | 3523 | case PTR_TO_MAP_VALUE: |
3445 | /* If the new min/max/var_off satisfy the old ones and | 3524 | /* If the new min/max/var_off satisfy the old ones and |
@@ -3932,6 +4011,12 @@ static int do_check(struct bpf_verifier_env *env) | |||
3932 | if (err) | 4011 | if (err) |
3933 | return err; | 4012 | return err; |
3934 | 4013 | ||
4014 | if (is_ctx_reg(env, insn->dst_reg)) { | ||
4015 | verbose(env, "BPF_ST stores into R%d context is not allowed\n", | ||
4016 | insn->dst_reg); | ||
4017 | return -EACCES; | ||
4018 | } | ||
4019 | |||
3935 | /* check that memory (dst_reg + off) is writeable */ | 4020 | /* check that memory (dst_reg + off) is writeable */ |
3936 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, | 4021 | err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, |
3937 | BPF_SIZE(insn->code), BPF_WRITE, | 4022 | BPF_SIZE(insn->code), BPF_WRITE, |
@@ -4384,6 +4469,24 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
4384 | int i, cnt, delta = 0; | 4469 | int i, cnt, delta = 0; |
4385 | 4470 | ||
4386 | for (i = 0; i < insn_cnt; i++, insn++) { | 4471 | for (i = 0; i < insn_cnt; i++, insn++) { |
4472 | if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || | ||
4473 | insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { | ||
4474 | /* due to JIT bugs clear upper 32-bits of src register | ||
4475 | * before div/mod operation | ||
4476 | */ | ||
4477 | insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); | ||
4478 | insn_buf[1] = *insn; | ||
4479 | cnt = 2; | ||
4480 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); | ||
4481 | if (!new_prog) | ||
4482 | return -ENOMEM; | ||
4483 | |||
4484 | delta += cnt - 1; | ||
4485 | env->prog = prog = new_prog; | ||
4486 | insn = new_prog->insnsi + i + delta; | ||
4487 | continue; | ||
4488 | } | ||
4489 | |||
4387 | if (insn->code != (BPF_JMP | BPF_CALL)) | 4490 | if (insn->code != (BPF_JMP | BPF_CALL)) |
4388 | continue; | 4491 | continue; |
4389 | 4492 | ||
@@ -4407,6 +4510,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) | |||
4407 | */ | 4510 | */ |
4408 | insn->imm = 0; | 4511 | insn->imm = 0; |
4409 | insn->code = BPF_JMP | BPF_TAIL_CALL; | 4512 | insn->code = BPF_JMP | BPF_TAIL_CALL; |
4513 | |||
4514 | /* instead of changing every JIT dealing with tail_call | ||
4515 | * emit two extra insns: | ||
4516 | * if (index >= max_entries) goto out; | ||
4517 | * index &= array->index_mask; | ||
4518 | * to avoid out-of-bounds cpu speculation | ||
4519 | */ | ||
4520 | map_ptr = env->insn_aux_data[i + delta].map_ptr; | ||
4521 | if (map_ptr == BPF_MAP_PTR_POISON) { | ||
4522 | verbose(env, "tail_call abusing map_ptr\n"); | ||
4523 | return -EINVAL; | ||
4524 | } | ||
4525 | if (!map_ptr->unpriv_array) | ||
4526 | continue; | ||
4527 | insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, | ||
4528 | map_ptr->max_entries, 2); | ||
4529 | insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, | ||
4530 | container_of(map_ptr, | ||
4531 | struct bpf_array, | ||
4532 | map)->index_mask); | ||
4533 | insn_buf[2] = *insn; | ||
4534 | cnt = 3; | ||
4535 | new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); | ||
4536 | if (!new_prog) | ||
4537 | return -ENOMEM; | ||
4538 | |||
4539 | delta += cnt - 1; | ||
4540 | env->prog = prog = new_prog; | ||
4541 | insn = new_prog->insnsi + i + delta; | ||
4410 | continue; | 4542 | continue; |
4411 | } | 4543 | } |
4412 | 4544 | ||
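Several of the verifier hunks revolve around coerce_reg_to_size(): when a register is truncated (a narrow load, or the result of a 32-bit ALU op), its unsigned bounds survive the truncation only if the discarded upper bits are identical at both ends of the range; otherwise the truncated range wraps and has to be widened to the full [0, mask]. A small user-space illustration of that rule; it mirrors the logic shown in the hunk but is a sketch, not the kernel code:

#include <stdint.h>
#include <stdio.h>

/* Truncate a [umin, umax] range to 'size' bytes the way coerce_reg_to_size()
 * does: keep the bounds when both ends share the same upper bits, otherwise
 * fall back to the full truncated range. Assumes size < 8.
 */
static void truncate_bounds(uint64_t *umin, uint64_t *umax, int size)
{
	uint64_t mask = ((uint64_t)1 << (size * 8)) - 1;

	if ((*umin & ~mask) == (*umax & ~mask)) {
		*umin &= mask;
		*umax &= mask;
	} else {
		*umin = 0;
		*umax = mask;
	}
}

int main(void)
{
	/* Same upper bits: [0x1_0000_0010, 0x1_0000_0020] -> [0x10, 0x20]. */
	uint64_t a_min = 0x100000010ull, a_max = 0x100000020ull;
	truncate_bounds(&a_min, &a_max, 4);
	printf("kept:    [%#llx, %#llx]\n",
	       (unsigned long long)a_min, (unsigned long long)a_max);

	/* Crosses a 2^32 boundary, so the truncated range would wrap:
	 * the bounds are blown out to [0, 0xffffffff] instead.
	 */
	uint64_t b_min = 0xfffffff0ull, b_max = 0x100000010ull;
	truncate_bounds(&b_min, &b_max, 4);
	printf("widened: [%#llx, %#llx]\n",
	       (unsigned long long)b_min, (unsigned long long)b_max);
	return 0;
}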
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085daab1a..a2c05d2476ac 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
123 | */ | 123 | */ |
124 | do { | 124 | do { |
125 | css_task_iter_start(&from->self, 0, &it); | 125 | css_task_iter_start(&from->self, 0, &it); |
126 | task = css_task_iter_next(&it); | 126 | |
127 | do { | ||
128 | task = css_task_iter_next(&it); | ||
129 | } while (task && (task->flags & PF_EXITING)); | ||
130 | |||
127 | if (task) | 131 | if (task) |
128 | get_task_struct(task); | 132 | get_task_struct(task); |
129 | css_task_iter_end(&it); | 133 | css_task_iter_end(&it); |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe147f24..7e4c44538119 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, | |||
1397 | cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, | 1397 | cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, |
1398 | cft->name); | 1398 | cft->name); |
1399 | else | 1399 | else |
1400 | strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); | 1400 | strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); |
1401 | return buf; | 1401 | return buf; |
1402 | } | 1402 | } |
1403 | 1403 | ||
@@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) | |||
1864 | 1864 | ||
1865 | root->flags = opts->flags; | 1865 | root->flags = opts->flags; |
1866 | if (opts->release_agent) | 1866 | if (opts->release_agent) |
1867 | strcpy(root->release_agent_path, opts->release_agent); | 1867 | strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); |
1868 | if (opts->name) | 1868 | if (opts->name) |
1869 | strcpy(root->name, opts->name); | 1869 | strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); |
1870 | if (opts->cpuset_clone_children) | 1870 | if (opts->cpuset_clone_children) |
1871 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); | 1871 | set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); |
1872 | } | 1872 | } |
@@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) | |||
4125 | 4125 | ||
4126 | static void css_task_iter_advance(struct css_task_iter *it) | 4126 | static void css_task_iter_advance(struct css_task_iter *it) |
4127 | { | 4127 | { |
4128 | struct list_head *l = it->task_pos; | 4128 | struct list_head *next; |
4129 | 4129 | ||
4130 | lockdep_assert_held(&css_set_lock); | 4130 | lockdep_assert_held(&css_set_lock); |
4131 | WARN_ON_ONCE(!l); | ||
4132 | |||
4133 | repeat: | 4131 | repeat: |
4134 | /* | 4132 | /* |
4135 | * Advance iterator to find next entry. cset->tasks is consumed | 4133 | * Advance iterator to find next entry. cset->tasks is consumed |
4136 | * first and then ->mg_tasks. After ->mg_tasks, we move onto the | 4134 | * first and then ->mg_tasks. After ->mg_tasks, we move onto the |
4137 | * next cset. | 4135 | * next cset. |
4138 | */ | 4136 | */ |
4139 | l = l->next; | 4137 | next = it->task_pos->next; |
4140 | 4138 | ||
4141 | if (l == it->tasks_head) | 4139 | if (next == it->tasks_head) |
4142 | l = it->mg_tasks_head->next; | 4140 | next = it->mg_tasks_head->next; |
4143 | 4141 | ||
4144 | if (l == it->mg_tasks_head) | 4142 | if (next == it->mg_tasks_head) |
4145 | css_task_iter_advance_css_set(it); | 4143 | css_task_iter_advance_css_set(it); |
4146 | else | 4144 | else |
4147 | it->task_pos = l; | 4145 | it->task_pos = next; |
4148 | 4146 | ||
4149 | /* if PROCS, skip over tasks which aren't group leaders */ | 4147 | /* if PROCS, skip over tasks which aren't group leaders */ |
4150 | if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && | 4148 | if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && |
@@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = { | |||
4449 | }, | 4447 | }, |
4450 | { | 4448 | { |
4451 | .name = "cgroup.threads", | 4449 | .name = "cgroup.threads", |
4450 | .flags = CFTYPE_NS_DELEGATABLE, | ||
4452 | .release = cgroup_procs_release, | 4451 | .release = cgroup_procs_release, |
4453 | .seq_start = cgroup_threads_start, | 4452 | .seq_start = cgroup_threads_start, |
4454 | .seq_next = cgroup_procs_next, | 4453 | .seq_next = cgroup_procs_next, |
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f780d8f6a9d..9caeda610249 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c | |||
@@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) | |||
50 | 50 | ||
51 | spin_lock_irq(&css_set_lock); | 51 | spin_lock_irq(&css_set_lock); |
52 | rcu_read_lock(); | 52 | rcu_read_lock(); |
53 | cset = rcu_dereference(current->cgroups); | 53 | cset = task_css_set(current); |
54 | refcnt = refcount_read(&cset->refcount); | 54 | refcnt = refcount_read(&cset->refcount); |
55 | seq_printf(seq, "css_set %pK %d", cset, refcnt); | 55 | seq_printf(seq, "css_set %pK %d", cset, refcnt); |
56 | if (refcnt > cset->nr_tasks) | 56 | if (refcnt > cset->nr_tasks) |
@@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | |||
96 | 96 | ||
97 | spin_lock_irq(&css_set_lock); | 97 | spin_lock_irq(&css_set_lock); |
98 | rcu_read_lock(); | 98 | rcu_read_lock(); |
99 | cset = rcu_dereference(current->cgroups); | 99 | cset = task_css_set(current); |
100 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | 100 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { |
101 | struct cgroup *c = link->cgrp; | 101 | struct cgroup *c = link->cgrp; |
102 | 102 | ||
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 133b465691d6..1e111dd455c4 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c | |||
@@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp) | |||
296 | } | 296 | } |
297 | 297 | ||
298 | /* ->updated_children list is self terminated */ | 298 | /* ->updated_children list is self terminated */ |
299 | for_each_possible_cpu(cpu) | 299 | for_each_possible_cpu(cpu) { |
300 | cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; | 300 | struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); |
301 | |||
302 | cstat->updated_children = cgrp; | ||
303 | u64_stats_init(&cstat->sync); | ||
304 | } | ||
301 | 305 | ||
302 | prev_cputime_init(&cgrp->stat.prev_cputime); | 306 | prev_cputime_init(&cgrp->stat.prev_cputime); |
303 | 307 | ||
diff --git a/kernel/cpu.c b/kernel/cpu.c index 04892a82f6ac..53f7dc65f9a3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map = | |||
80 | STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); | 80 | STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); |
81 | 81 | ||
82 | 82 | ||
83 | static void inline cpuhp_lock_acquire(bool bringup) | 83 | static inline void cpuhp_lock_acquire(bool bringup) |
84 | { | 84 | { |
85 | lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); | 85 | lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); |
86 | } | 86 | } |
87 | 87 | ||
88 | static void inline cpuhp_lock_release(bool bringup) | 88 | static inline void cpuhp_lock_release(bool bringup) |
89 | { | 89 | { |
90 | lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); | 90 | lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); |
91 | } | 91 | } |
92 | #else | 92 | #else |
93 | 93 | ||
94 | static void inline cpuhp_lock_acquire(bool bringup) { } | 94 | static inline void cpuhp_lock_acquire(bool bringup) { } |
95 | static void inline cpuhp_lock_release(bool bringup) { } | 95 | static inline void cpuhp_lock_release(bool bringup) { } |
96 | 96 | ||
97 | #endif | 97 | #endif |
98 | 98 | ||
@@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu) | |||
780 | BUG_ON(cpu_online(cpu)); | 780 | BUG_ON(cpu_online(cpu)); |
781 | 781 | ||
782 | /* | 782 | /* |
783 | * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all | 783 | * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed |
784 | * runnable tasks from the cpu, there's only the idle task left now | 784 | * all runnable tasks from the CPU, there's only the idle task left now |
785 | * that the migration thread is done doing the stop_machine thing. | 785 | * that the migration thread is done doing the stop_machine thing. |
786 | * | 786 | * |
787 | * Wait for the stop thread to go away. | 787 | * Wait for the stop thread to go away. |
@@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1277 | * before blk_mq_queue_reinit_notify() from notify_dead(), | 1277 | * before blk_mq_queue_reinit_notify() from notify_dead(), |
1278 | * otherwise a RCU stall occurs. | 1278 | * otherwise a RCU stall occurs. |
1279 | */ | 1279 | */ |
1280 | [CPUHP_TIMERS_DEAD] = { | 1280 | [CPUHP_TIMERS_PREPARE] = { |
1281 | .name = "timers:dead", | 1281 | .name = "timers:dead", |
1282 | .startup.single = NULL, | 1282 | .startup.single = timers_prepare_cpu, |
1283 | .teardown.single = timers_dead_cpu, | 1283 | .teardown.single = timers_dead_cpu, |
1284 | }, | 1284 | }, |
1285 | /* Kicks the plugged cpu into life */ | 1285 | /* Kicks the plugged cpu into life */ |
@@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { | |||
1289 | .teardown.single = NULL, | 1289 | .teardown.single = NULL, |
1290 | .cant_stop = true, | 1290 | .cant_stop = true, |
1291 | }, | 1291 | }, |
1292 | [CPUHP_AP_SMPCFD_DYING] = { | ||
1293 | .name = "smpcfd:dying", | ||
1294 | .startup.single = NULL, | ||
1295 | .teardown.single = smpcfd_dying_cpu, | ||
1296 | }, | ||
1297 | /* | 1292 | /* |
1298 | * Handled on controll processor until the plugged processor manages | 1293 | * Handled on controll processor until the plugged processor manages |
1299 | * this itself. | 1294 | * this itself. |
@@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { | |||
1335 | .startup.single = NULL, | 1330 | .startup.single = NULL, |
1336 | .teardown.single = rcutree_dying_cpu, | 1331 | .teardown.single = rcutree_dying_cpu, |
1337 | }, | 1332 | }, |
1333 | [CPUHP_AP_SMPCFD_DYING] = { | ||
1334 | .name = "smpcfd:dying", | ||
1335 | .startup.single = NULL, | ||
1336 | .teardown.single = smpcfd_dying_cpu, | ||
1337 | }, | ||
1338 | /* Entry state on starting. Interrupts enabled from here on. Transient | 1338 | /* Entry state on starting. Interrupts enabled from here on. Transient |
1339 | * state for synchronsization */ | 1339 | * state for synchronsization */ |
1340 | [CPUHP_AP_ONLINE] = { | 1340 | [CPUHP_AP_ONLINE] = { |
diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c | |||
@@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
410 | VMCOREINFO_SYMBOL(contig_page_data); | 410 | VMCOREINFO_SYMBOL(contig_page_data); |
411 | #endif | 411 | #endif |
412 | #ifdef CONFIG_SPARSEMEM | 412 | #ifdef CONFIG_SPARSEMEM |
413 | VMCOREINFO_SYMBOL(mem_section); | 413 | VMCOREINFO_SYMBOL_ARRAY(mem_section); |
414 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); | 414 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); |
415 | VMCOREINFO_STRUCT_SIZE(mem_section); | 415 | VMCOREINFO_STRUCT_SIZE(mem_section); |
416 | VMCOREINFO_OFFSET(mem_section, section_mem_map); | 416 | VMCOREINFO_OFFSET(mem_section, section_mem_map); |
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index e74be38245ad..ed5d34925ad0 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -350,7 +350,7 @@ poll_again: | |||
350 | } | 350 | } |
351 | kdb_printf("\n"); | 351 | kdb_printf("\n"); |
352 | for (i = 0; i < count; i++) { | 352 | for (i = 0; i < count; i++) { |
353 | if (kallsyms_symbol_next(p_tmp, i) < 0) | 353 | if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) |
354 | break; | 354 | break; |
355 | kdb_printf("%s ", p_tmp); | 355 | kdb_printf("%s ", p_tmp); |
356 | *(p_tmp + len) = '\0'; | 356 | *(p_tmp + len) = '\0'; |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c33416b6a..e2764d767f18 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
@@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) | |||
51 | * Finish delay accounting for a statistic using its timestamps (@start), | 51 | * Finish delay accounting for a statistic using its timestamps (@start), |
52 | * accumalator (@total) and @count | 52 | * accumalator (@total) and @count |
53 | */ | 53 | */ |
54 | static void delayacct_end(u64 *start, u64 *total, u32 *count) | 54 | static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) |
55 | { | 55 | { |
56 | s64 ns = ktime_get_ns() - *start; | 56 | s64 ns = ktime_get_ns() - *start; |
57 | unsigned long flags; | 57 | unsigned long flags; |
58 | 58 | ||
59 | if (ns > 0) { | 59 | if (ns > 0) { |
60 | spin_lock_irqsave(¤t->delays->lock, flags); | 60 | spin_lock_irqsave(lock, flags); |
61 | *total += ns; | 61 | *total += ns; |
62 | (*count)++; | 62 | (*count)++; |
63 | spin_unlock_irqrestore(¤t->delays->lock, flags); | 63 | spin_unlock_irqrestore(lock, flags); |
64 | } | 64 | } |
65 | } | 65 | } |
66 | 66 | ||
@@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) | |||
69 | current->delays->blkio_start = ktime_get_ns(); | 69 | current->delays->blkio_start = ktime_get_ns(); |
70 | } | 70 | } |
71 | 71 | ||
72 | void __delayacct_blkio_end(void) | 72 | /* |
73 | * We cannot rely on the `current` macro, as we haven't yet switched back to | ||
74 | * the process being woken. | ||
75 | */ | ||
76 | void __delayacct_blkio_end(struct task_struct *p) | ||
73 | { | 77 | { |
74 | if (current->delays->flags & DELAYACCT_PF_SWAPIN) | 78 | struct task_delay_info *delays = p->delays; |
75 | /* Swapin block I/O */ | 79 | u64 *total; |
76 | delayacct_end(¤t->delays->blkio_start, | 80 | u32 *count; |
77 | ¤t->delays->swapin_delay, | 81 | |
78 | ¤t->delays->swapin_count); | 82 | if (p->delays->flags & DELAYACCT_PF_SWAPIN) { |
79 | else /* Other block I/O */ | 83 | total = &delays->swapin_delay; |
80 | delayacct_end(¤t->delays->blkio_start, | 84 | count = &delays->swapin_count; |
81 | ¤t->delays->blkio_delay, | 85 | } else { |
82 | ¤t->delays->blkio_count); | 86 | total = &delays->blkio_delay; |
87 | count = &delays->blkio_count; | ||
88 | } | ||
89 | |||
90 | delayacct_end(&delays->lock, &delays->blkio_start, total, count); | ||
83 | } | 91 | } |
84 | 92 | ||
85 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | 93 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) |
@@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) | |||
153 | 161 | ||
154 | void __delayacct_freepages_end(void) | 162 | void __delayacct_freepages_end(void) |
155 | { | 163 | { |
156 | delayacct_end(¤t->delays->freepages_start, | 164 | delayacct_end( |
157 | ¤t->delays->freepages_delay, | 165 | ¤t->delays->lock, |
158 | ¤t->delays->freepages_count); | 166 | ¤t->delays->freepages_start, |
167 | ¤t->delays->freepages_delay, | ||
168 | ¤t->delays->freepages_count); | ||
159 | } | 169 | } |
160 | 170 | ||
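After this change delayacct_end() no longer assumes current: it takes the lock and the start/total/count pointers explicitly, which lets __delayacct_blkio_end() account against the task being woken. A simplified userspace sketch of that helper shape, with a pthread mutex standing in for the spinlock and a monotonic clock for ktime; all names here are illustrative:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct delays {
        pthread_mutex_t lock;
        uint64_t blkio_start, blkio_delay;
        uint32_t blkio_count;
};

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* generic "end" helper: the caller picks which lock/total/count to use */
static void delay_end(pthread_mutex_t *lock, uint64_t *start,
                      uint64_t *total, uint32_t *count)
{
        int64_t ns = (int64_t)(now_ns() - *start);

        if (ns > 0) {
                pthread_mutex_lock(lock);
                *total += (uint64_t)ns;
                (*count)++;
                pthread_mutex_unlock(lock);
        }
}

int main(void)
{
        struct delays d = { .lock = PTHREAD_MUTEX_INITIALIZER };

        d.blkio_start = now_ns();
        /* ... the "blocked on I/O" window would be here ... */
        delay_end(&d.lock, &d.blkio_start, &d.blkio_delay, &d.blkio_count);
        printf("count=%u delay=%llu ns\n", d.blkio_count,
               (unsigned long long)d.blkio_delay);
        return 0;
}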
diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..5d8f4031f8d5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx) | |||
1231 | * perf_event_context::lock | 1231 | * perf_event_context::lock |
1232 | * perf_event::mmap_mutex | 1232 | * perf_event::mmap_mutex |
1233 | * mmap_sem | 1233 | * mmap_sem |
1234 | * | ||
1235 | * cpu_hotplug_lock | ||
1236 | * pmus_lock | ||
1237 | * cpuctx->mutex / perf_event_context::mutex | ||
1234 | */ | 1238 | */ |
1235 | static struct perf_event_context * | 1239 | static struct perf_event_context * |
1236 | perf_event_ctx_lock_nested(struct perf_event *event, int nesting) | 1240 | perf_event_ctx_lock_nested(struct perf_event *event, int nesting) |
@@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event) | |||
4196 | { | 4200 | { |
4197 | struct perf_event_context *ctx = event->ctx; | 4201 | struct perf_event_context *ctx = event->ctx; |
4198 | struct perf_event *child, *tmp; | 4202 | struct perf_event *child, *tmp; |
4203 | LIST_HEAD(free_list); | ||
4199 | 4204 | ||
4200 | /* | 4205 | /* |
4201 | * If we got here through err_file: fput(event_file); we will not have | 4206 | * If we got here through err_file: fput(event_file); we will not have |
@@ -4268,8 +4273,7 @@ again: | |||
4268 | struct perf_event, child_list); | 4273 | struct perf_event, child_list); |
4269 | if (tmp == child) { | 4274 | if (tmp == child) { |
4270 | perf_remove_from_context(child, DETACH_GROUP); | 4275 | perf_remove_from_context(child, DETACH_GROUP); |
4271 | list_del(&child->child_list); | 4276 | list_move(&child->child_list, &free_list); |
4272 | free_event(child); | ||
4273 | /* | 4277 | /* |
4274 | * This matches the refcount bump in inherit_event(); | 4278 | * This matches the refcount bump in inherit_event(); |
4275 | * this can't be the last reference. | 4279 | * this can't be the last reference. |
@@ -4284,6 +4288,11 @@ again: | |||
4284 | } | 4288 | } |
4285 | mutex_unlock(&event->child_mutex); | 4289 | mutex_unlock(&event->child_mutex); |
4286 | 4290 | ||
4291 | list_for_each_entry_safe(child, tmp, &free_list, child_list) { | ||
4292 | list_del(&child->child_list); | ||
4293 | free_event(child); | ||
4294 | } | ||
4295 | |||
4287 | no_ctx: | 4296 | no_ctx: |
4288 | put_event(event); /* Must be the 'last' reference */ | 4297 | put_event(event); /* Must be the 'last' reference */ |
4289 | return 0; | 4298 | return 0; |
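list_move() parks each child on a local free_list while child_mutex is held, and the actual free_event() calls run only after the mutex is dropped, so nothing is freed under the lock. A toy userspace sketch of the same defer-then-free pattern with a singly linked list; the structure and names are invented for illustration:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct child {
        int id;
        struct child *next;
};

static pthread_mutex_t child_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct child *children;             /* list protected by child_mutex */

int main(void)
{
        struct child *free_list = NULL, *c, *tmp;
        int i;

        for (i = 0; i < 3; i++) {          /* build a small protected list */
                c = malloc(sizeof(*c));
                c->id = i;
                c->next = children;
                children = c;
        }

        pthread_mutex_lock(&child_mutex);
        while ((c = children)) {           /* detach onto a local list under the lock ... */
                children = c->next;
                c->next = free_list;
                free_list = c;
        }
        pthread_mutex_unlock(&child_mutex);

        for (c = free_list; c; c = tmp) {  /* ... free only after the lock is dropped */
                tmp = c->next;
                printf("freeing child %d\n", c->id);
                free(c);
        }
        return 0;
}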
@@ -6639,6 +6648,7 @@ static void perf_event_namespaces_output(struct perf_event *event, | |||
6639 | struct perf_namespaces_event *namespaces_event = data; | 6648 | struct perf_namespaces_event *namespaces_event = data; |
6640 | struct perf_output_handle handle; | 6649 | struct perf_output_handle handle; |
6641 | struct perf_sample_data sample; | 6650 | struct perf_sample_data sample; |
6651 | u16 header_size = namespaces_event->event_id.header.size; | ||
6642 | int ret; | 6652 | int ret; |
6643 | 6653 | ||
6644 | if (!perf_event_namespaces_match(event)) | 6654 | if (!perf_event_namespaces_match(event)) |
@@ -6649,7 +6659,7 @@ static void perf_event_namespaces_output(struct perf_event *event, | |||
6649 | ret = perf_output_begin(&handle, event, | 6659 | ret = perf_output_begin(&handle, event, |
6650 | namespaces_event->event_id.header.size); | 6660 | namespaces_event->event_id.header.size); |
6651 | if (ret) | 6661 | if (ret) |
6652 | return; | 6662 | goto out; |
6653 | 6663 | ||
6654 | namespaces_event->event_id.pid = perf_event_pid(event, | 6664 | namespaces_event->event_id.pid = perf_event_pid(event, |
6655 | namespaces_event->task); | 6665 | namespaces_event->task); |
@@ -6661,6 +6671,8 @@ static void perf_event_namespaces_output(struct perf_event *event, | |||
6661 | perf_event__output_id_sample(event, &handle, &sample); | 6671 | perf_event__output_id_sample(event, &handle, &sample); |
6662 | 6672 | ||
6663 | perf_output_end(&handle); | 6673 | perf_output_end(&handle); |
6674 | out: | ||
6675 | namespaces_event->event_id.header.size = header_size; | ||
6664 | } | 6676 | } |
6665 | 6677 | ||
6666 | static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, | 6678 | static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, |
@@ -7987,11 +7999,11 @@ static void bpf_overflow_handler(struct perf_event *event, | |||
7987 | { | 7999 | { |
7988 | struct bpf_perf_event_data_kern ctx = { | 8000 | struct bpf_perf_event_data_kern ctx = { |
7989 | .data = data, | 8001 | .data = data, |
7990 | .regs = regs, | ||
7991 | .event = event, | 8002 | .event = event, |
7992 | }; | 8003 | }; |
7993 | int ret = 0; | 8004 | int ret = 0; |
7994 | 8005 | ||
8006 | ctx.regs = perf_arch_bpf_user_pt_regs(regs); | ||
7995 | preempt_disable(); | 8007 | preempt_disable(); |
7996 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) | 8008 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) |
7997 | goto out; | 8009 | goto out; |
@@ -8513,6 +8525,29 @@ fail_clear_files: | |||
8513 | return ret; | 8525 | return ret; |
8514 | } | 8526 | } |
8515 | 8527 | ||
8528 | static int | ||
8529 | perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) | ||
8530 | { | ||
8531 | struct perf_event_context *ctx = event->ctx; | ||
8532 | int ret; | ||
8533 | |||
8534 | /* | ||
8535 | * Beware, here be dragons!! | ||
8536 | * | ||
8537 | * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint | ||
8538 | * stuff does not actually need it. So temporarily drop ctx->mutex. As per | ||
8539 | * perf_event_ctx_lock() we already have a reference on ctx. | ||
8540 | * | ||
8541 | * This can result in event getting moved to a different ctx, but that | ||
8542 | * does not affect the tracepoint state. | ||
8543 | */ | ||
8544 | mutex_unlock(&ctx->mutex); | ||
8545 | ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); | ||
8546 | mutex_lock(&ctx->mutex); | ||
8547 | |||
8548 | return ret; | ||
8549 | } | ||
8550 | |||
8516 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) | 8551 | static int perf_event_set_filter(struct perf_event *event, void __user *arg) |
8517 | { | 8552 | { |
8518 | char *filter_str; | 8553 | char *filter_str; |
@@ -8529,8 +8564,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg) | |||
8529 | 8564 | ||
8530 | if (IS_ENABLED(CONFIG_EVENT_TRACING) && | 8565 | if (IS_ENABLED(CONFIG_EVENT_TRACING) && |
8531 | event->attr.type == PERF_TYPE_TRACEPOINT) | 8566 | event->attr.type == PERF_TYPE_TRACEPOINT) |
8532 | ret = ftrace_profile_set_filter(event, event->attr.config, | 8567 | ret = perf_tracepoint_set_filter(event, filter_str); |
8533 | filter_str); | ||
8534 | else if (has_addr_filter(event)) | 8568 | else if (has_addr_filter(event)) |
8535 | ret = perf_event_set_addr_filter(event, filter_str); | 8569 | ret = perf_event_set_addr_filter(event, filter_str); |
8536 | 8570 | ||
@@ -9165,7 +9199,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | |||
9165 | if (!try_module_get(pmu->module)) | 9199 | if (!try_module_get(pmu->module)) |
9166 | return -ENODEV; | 9200 | return -ENODEV; |
9167 | 9201 | ||
9168 | if (event->group_leader != event) { | 9202 | /* |
9203 | * A number of pmu->event_init() methods iterate the sibling_list to, | ||
9204 | * for example, validate if the group fits on the PMU. Therefore, | ||
9205 | * if this is a sibling event, acquire the ctx->mutex to protect | ||
9206 | * the sibling_list. | ||
9207 | */ | ||
9208 | if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { | ||
9169 | /* | 9209 | /* |
9170 | * This ctx->mutex can nest when we're called through | 9210 | * This ctx->mutex can nest when we're called through |
9171 | * inheritance. See the perf_event_ctx_lock_nested() comment. | 9211 | * inheritance. See the perf_event_ctx_lock_nested() comment. |
diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a41167..995453d9fb55 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -1755,3 +1755,12 @@ Efault: | |||
1755 | return -EFAULT; | 1755 | return -EFAULT; |
1756 | } | 1756 | } |
1757 | #endif | 1757 | #endif |
1758 | |||
1759 | __weak void abort(void) | ||
1760 | { | ||
1761 | BUG(); | ||
1762 | |||
1763 | /* if that doesn't kill us, halt */ | ||
1764 | panic("Oops failed to kill thread"); | ||
1765 | } | ||
1766 | EXPORT_SYMBOL(abort); | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 432eadf6b58c..2295fc69717f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -721,8 +721,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, | |||
721 | goto out; | 721 | goto out; |
722 | } | 722 | } |
723 | /* a new mm has just been created */ | 723 | /* a new mm has just been created */ |
724 | arch_dup_mmap(oldmm, mm); | 724 | retval = arch_dup_mmap(oldmm, mm); |
725 | retval = 0; | ||
726 | out: | 725 | out: |
727 | up_write(&mm->mmap_sem); | 726 | up_write(&mm->mmap_sem); |
728 | flush_tlb_mm(oldmm); | 727 | flush_tlb_mm(oldmm); |
diff --git a/kernel/futex.c b/kernel/futex.c index 76ed5921117a..7f719d110908 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -1582,8 +1582,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) | |||
1582 | { | 1582 | { |
1583 | unsigned int op = (encoded_op & 0x70000000) >> 28; | 1583 | unsigned int op = (encoded_op & 0x70000000) >> 28; |
1584 | unsigned int cmp = (encoded_op & 0x0f000000) >> 24; | 1584 | unsigned int cmp = (encoded_op & 0x0f000000) >> 24; |
1585 | int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); | 1585 | int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); |
1586 | int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); | 1586 | int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); |
1587 | int oldval, ret; | 1587 | int oldval, ret; |
1588 | 1588 | ||
1589 | if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { | 1589 | if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { |
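sign_extend32(value, index) treats bit index as the sign bit, so a 12-bit field must be extended from bit 11; passing 12 reads one bit past the field and breaks negative operands. A standalone check built with the same shift construction the kernel helper uses:

#include <stdint.h>
#include <stdio.h>

static int32_t sign_extend32(uint32_t value, int index) /* same idea as the kernel helper */
{
        uint8_t shift = 31 - index;

        return (int32_t)(value << shift) >> shift;
}

int main(void)
{
        uint32_t field = 0xfff;            /* 12-bit field holding -1 */

        printf("%d\n", sign_extend32(field, 11)); /* -1: correct for a 12-bit field */
        printf("%d\n", sign_extend32(field, 12)); /* 4095: bit 12 is clear, so no extension */
        return 0;
}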
@@ -1878,6 +1878,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
1878 | struct futex_q *this, *next; | 1878 | struct futex_q *this, *next; |
1879 | DEFINE_WAKE_Q(wake_q); | 1879 | DEFINE_WAKE_Q(wake_q); |
1880 | 1880 | ||
1881 | if (nr_wake < 0 || nr_requeue < 0) | ||
1882 | return -EINVAL; | ||
1883 | |||
1881 | /* | 1884 | /* |
1882 | * When PI not supported: return -ENOSYS if requeue_pi is true, | 1885 | * When PI not supported: return -ENOSYS if requeue_pi is true, |
1883 | * consequently the compiler knows requeue_pi is always false past | 1886 | * consequently the compiler knows requeue_pi is always false past |
@@ -2294,34 +2297,33 @@ static void unqueue_me_pi(struct futex_q *q) | |||
2294 | spin_unlock(q->lock_ptr); | 2297 | spin_unlock(q->lock_ptr); |
2295 | } | 2298 | } |
2296 | 2299 | ||
2297 | /* | ||
2298 | * Fixup the pi_state owner with the new owner. | ||
2299 | * | ||
2300 | * Must be called with hash bucket lock held and mm->sem held for non | ||
2301 | * private futexes. | ||
2302 | */ | ||
2303 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | 2300 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
2304 | struct task_struct *newowner) | 2301 | struct task_struct *argowner) |
2305 | { | 2302 | { |
2306 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | ||
2307 | struct futex_pi_state *pi_state = q->pi_state; | 2303 | struct futex_pi_state *pi_state = q->pi_state; |
2308 | u32 uval, uninitialized_var(curval), newval; | 2304 | u32 uval, uninitialized_var(curval), newval; |
2309 | struct task_struct *oldowner; | 2305 | struct task_struct *oldowner, *newowner; |
2306 | u32 newtid; | ||
2310 | int ret; | 2307 | int ret; |
2311 | 2308 | ||
2309 | lockdep_assert_held(q->lock_ptr); | ||
2310 | |||
2312 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | 2311 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); |
2313 | 2312 | ||
2314 | oldowner = pi_state->owner; | 2313 | oldowner = pi_state->owner; |
2315 | /* Owner died? */ | ||
2316 | if (!pi_state->owner) | ||
2317 | newtid |= FUTEX_OWNER_DIED; | ||
2318 | 2314 | ||
2319 | /* | 2315 | /* |
2320 | * We are here either because we stole the rtmutex from the | 2316 | * We are here because either: |
2321 | * previous highest priority waiter or we are the highest priority | 2317 | * |
2322 | * waiter but have failed to get the rtmutex the first time. | 2318 | * - we stole the lock and pi_state->owner needs updating to reflect |
2319 | * that (@argowner == current), | ||
2320 | * | ||
2321 | * or: | ||
2323 | * | 2322 | * |
2324 | * We have to replace the newowner TID in the user space variable. | 2323 | * - someone stole our lock and we need to fix things to point to the |
2324 | * new owner (@argowner == NULL). | ||
2325 | * | ||
2326 | * Either way, we have to replace the TID in the user space variable. | ||
2325 | * This must be atomic as we have to preserve the owner died bit here. | 2327 | * This must be atomic as we have to preserve the owner died bit here. |
2326 | * | 2328 | * |
2327 | * Note: We write the user space value _before_ changing the pi_state | 2329 | * Note: We write the user space value _before_ changing the pi_state |
@@ -2334,6 +2336,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
2334 | * in the PID check in lookup_pi_state. | 2336 | * in the PID check in lookup_pi_state. |
2335 | */ | 2337 | */ |
2336 | retry: | 2338 | retry: |
2339 | if (!argowner) { | ||
2340 | if (oldowner != current) { | ||
2341 | /* | ||
2342 | * We raced against a concurrent self; things are | ||
2343 | * already fixed up. Nothing to do. | ||
2344 | */ | ||
2345 | ret = 0; | ||
2346 | goto out_unlock; | ||
2347 | } | ||
2348 | |||
2349 | if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { | ||
2350 | /* We got the lock after all, nothing to fix. */ | ||
2351 | ret = 0; | ||
2352 | goto out_unlock; | ||
2353 | } | ||
2354 | |||
2355 | /* | ||
2356 | * Since we just failed the trylock; there must be an owner. | ||
2357 | */ | ||
2358 | newowner = rt_mutex_owner(&pi_state->pi_mutex); | ||
2359 | BUG_ON(!newowner); | ||
2360 | } else { | ||
2361 | WARN_ON_ONCE(argowner != current); | ||
2362 | if (oldowner == current) { | ||
2363 | /* | ||
2364 | * We raced against a concurrent self; things are | ||
2365 | * already fixed up. Nothing to do. | ||
2366 | */ | ||
2367 | ret = 0; | ||
2368 | goto out_unlock; | ||
2369 | } | ||
2370 | newowner = argowner; | ||
2371 | } | ||
2372 | |||
2373 | newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | ||
2374 | /* Owner died? */ | ||
2375 | if (!pi_state->owner) | ||
2376 | newtid |= FUTEX_OWNER_DIED; | ||
2377 | |||
2337 | if (get_futex_value_locked(&uval, uaddr)) | 2378 | if (get_futex_value_locked(&uval, uaddr)) |
2338 | goto handle_fault; | 2379 | goto handle_fault; |
2339 | 2380 | ||
@@ -2434,9 +2475,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | |||
2434 | * Got the lock. We might not be the anticipated owner if we | 2475 | * Got the lock. We might not be the anticipated owner if we |
2435 | * did a lock-steal - fix up the PI-state in that case: | 2476 | * did a lock-steal - fix up the PI-state in that case: |
2436 | * | 2477 | * |
2437 | * We can safely read pi_state->owner without holding wait_lock | 2478 | * Speculative pi_state->owner read (we don't hold wait_lock); |
2438 | * because we now own the rt_mutex, only the owner will attempt | 2479 | * since we own the lock pi_state->owner == current is the |
2439 | * to change it. | 2480 | * stable state, anything else needs more attention. |
2440 | */ | 2481 | */ |
2441 | if (q->pi_state->owner != current) | 2482 | if (q->pi_state->owner != current) |
2442 | ret = fixup_pi_state_owner(uaddr, q, current); | 2483 | ret = fixup_pi_state_owner(uaddr, q, current); |
@@ -2444,6 +2485,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) | |||
2444 | } | 2485 | } |
2445 | 2486 | ||
2446 | /* | 2487 | /* |
2488 | * If we didn't get the lock; check if anybody stole it from us. In | ||
2489 | * that case, we need to fix up the uval to point to them instead of | ||
2490 | * us, otherwise bad things happen. [10] | ||
2491 | * | ||
2492 | * Another speculative read; pi_state->owner == current is unstable | ||
2493 | * but needs our attention. | ||
2494 | */ | ||
2495 | if (q->pi_state->owner == current) { | ||
2496 | ret = fixup_pi_state_owner(uaddr, q, NULL); | ||
2497 | goto out; | ||
2498 | } | ||
2499 | |||
2500 | /* | ||
2447 | * Paranoia check. If we did not take the lock, then we should not be | 2501 | * Paranoia check. If we did not take the lock, then we should not be |
2448 | * the owner of the rt_mutex. | 2502 | * the owner of the rt_mutex. |
2449 | */ | 2503 | */ |
diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) | |||
86 | return gid_gt(a, b) - gid_lt(a, b); | 86 | return gid_gt(a, b) - gid_lt(a, b); |
87 | } | 87 | } |
88 | 88 | ||
89 | static void groups_sort(struct group_info *group_info) | 89 | void groups_sort(struct group_info *group_info) |
90 | { | 90 | { |
91 | sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), | 91 | sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), |
92 | gid_cmp, NULL); | 92 | gid_cmp, NULL); |
93 | } | 93 | } |
94 | EXPORT_SYMBOL(groups_sort); | ||
94 | 95 | ||
95 | /* a simple bsearch */ | 96 | /* a simple bsearch */ |
96 | int groups_search(const struct group_info *group_info, kgid_t grp) | 97 | int groups_search(const struct group_info *group_info, kgid_t grp) |
@@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) | |||
122 | void set_groups(struct cred *new, struct group_info *group_info) | 123 | void set_groups(struct cred *new, struct group_info *group_info) |
123 | { | 124 | { |
124 | put_group_info(new->group_info); | 125 | put_group_info(new->group_info); |
125 | groups_sort(group_info); | ||
126 | get_group_info(group_info); | 126 | get_group_info(group_info); |
127 | new->group_info = group_info; | 127 | new->group_info = group_info; |
128 | } | 128 | } |
@@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) | |||
206 | return retval; | 206 | return retval; |
207 | } | 207 | } |
208 | 208 | ||
209 | groups_sort(group_info); | ||
209 | retval = set_current_groups(group_info); | 210 | retval = set_current_groups(group_info); |
210 | put_group_info(group_info); | 211 | put_group_info(group_info); |
211 | 212 | ||
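With the sort moved out of set_groups(), callers such as setgroups() must run groups_sort() themselves before installing the list, so that groups_search()'s binary search still sees sorted data. A compact userspace analogue using qsort() and bsearch(); the gid values are made up:

#include <stdio.h>
#include <stdlib.h>

static int gid_cmp(const void *a, const void *b)
{
        unsigned int x = *(const unsigned int *)a;
        unsigned int y = *(const unsigned int *)b;

        return (x > y) - (x < y);
}

int main(void)
{
        unsigned int gids[] = { 1000, 4, 27, 100, 24 };
        size_t n = sizeof(gids) / sizeof(gids[0]);
        unsigned int key = 27;
        unsigned int *hit;

        qsort(gids, n, sizeof(gids[0]), gid_cmp);               /* groups_sort() analogue */
        hit = bsearch(&key, gids, n, sizeof(gids[0]), gid_cmp); /* groups_search() analogue */

        printf("gid %u %s\n", key, hit ? "found" : "missing");
        return 0;
}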
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 17f05ef8f575..e4d3819a91cc 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h | |||
@@ -12,6 +12,11 @@ | |||
12 | 12 | ||
13 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | 13 | static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) |
14 | { | 14 | { |
15 | static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); | ||
16 | |||
17 | if (!__ratelimit(&ratelimit)) | ||
18 | return; | ||
19 | |||
15 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", | 20 | printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", |
16 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); | 21 | irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); |
17 | printk("->handle_irq(): %p, ", desc->handle_irq); | 22 | printk("->handle_irq(): %p, ", desc->handle_irq); |
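DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5) allows at most five prints per five-second window, so a storm of print_irq_desc() calls cannot flood the log. A crude userspace approximation of that window-plus-burst logic, not the kernel's jiffies-based implementation:

#include <stdio.h>
#include <time.h>

struct ratelimit {
        time_t window_start;
        int interval;                      /* seconds per window */
        int burst;                         /* messages allowed per window */
        int printed;
};

static int ratelimit_ok(struct ratelimit *rs)
{
        time_t now = time(NULL);

        if (now - rs->window_start >= rs->interval) {
                rs->window_start = now;    /* new window, reset the budget */
                rs->printed = 0;
        }
        if (rs->printed >= rs->burst)
                return 0;                  /* suppressed */
        rs->printed++;
        return 1;
}

int main(void)
{
        struct ratelimit rs = { .interval = 5, .burst = 5 };
        int i, shown = 0;

        for (i = 0; i < 100; i++)
                if (ratelimit_ok(&rs))
                        shown++;

        printf("printed %d of 100 messages\n", shown); /* 5 within a single window */
        return 0;
}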
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 7f608ac39653..acfaaef8672a 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c | |||
@@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = { | |||
113 | BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), | 113 | BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), |
114 | BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), | 114 | BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), |
115 | BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), | 115 | BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), |
116 | BIT_MASK_DESCR(IRQD_CAN_RESERVE), | ||
116 | 117 | ||
117 | BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), | 118 | BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), |
118 | 119 | ||
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c26c5bb6b491..508c03dfef25 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) | |||
364 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); | 364 | EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); |
365 | 365 | ||
366 | /* | 366 | /* |
367 | * Separate lockdep class for interrupt chip which can nest irq_desc | 367 | * Separate lockdep classes for interrupt chip which can nest irq_desc |
368 | * lock. | 368 | * lock and request mutex. |
369 | */ | 369 | */ |
370 | static struct lock_class_key irq_nested_lock_class; | 370 | static struct lock_class_key irq_nested_lock_class; |
371 | static struct lock_class_key irq_nested_request_class; | ||
371 | 372 | ||
372 | /* | 373 | /* |
373 | * irq_map_generic_chip - Map a generic chip for an irq domain | 374 | * irq_map_generic_chip - Map a generic chip for an irq domain |
@@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, | |||
409 | set_bit(idx, &gc->installed); | 410 | set_bit(idx, &gc->installed); |
410 | 411 | ||
411 | if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) | 412 | if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) |
412 | irq_set_lockdep_class(virq, &irq_nested_lock_class); | 413 | irq_set_lockdep_class(virq, &irq_nested_lock_class, |
414 | &irq_nested_request_class); | ||
413 | 415 | ||
414 | if (chip->irq_calc_mask) | 416 | if (chip->irq_calc_mask) |
415 | chip->irq_calc_mask(data); | 417 | chip->irq_calc_mask(data); |
@@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
479 | continue; | 481 | continue; |
480 | 482 | ||
481 | if (flags & IRQ_GC_INIT_NESTED_LOCK) | 483 | if (flags & IRQ_GC_INIT_NESTED_LOCK) |
482 | irq_set_lockdep_class(i, &irq_nested_lock_class); | 484 | irq_set_lockdep_class(i, &irq_nested_lock_class, |
485 | &irq_nested_request_class); | ||
483 | 486 | ||
484 | if (!(flags & IRQ_GC_NO_MASK)) { | 487 | if (!(flags & IRQ_GC_NO_MASK)) { |
485 | struct irq_data *d = irq_get_irq_data(i); | 488 | struct irq_data *d = irq_get_irq_data(i); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 07d08ca701ec..ab19371eab9b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) | |||
440 | #endif /* !CONFIG_GENERIC_PENDING_IRQ */ | 440 | #endif /* !CONFIG_GENERIC_PENDING_IRQ */ |
441 | 441 | ||
442 | #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) | 442 | #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) |
443 | static inline int irq_domain_activate_irq(struct irq_data *data, bool early) | 443 | static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) |
444 | { | 444 | { |
445 | irqd_set_activated(data); | 445 | irqd_set_activated(data); |
446 | return 0; | 446 | return 0; |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4f4f60015e8a..62068ad46930 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -1693,7 +1693,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) | |||
1693 | } | 1693 | } |
1694 | } | 1694 | } |
1695 | 1695 | ||
1696 | static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) | 1696 | static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) |
1697 | { | 1697 | { |
1698 | int ret = 0; | 1698 | int ret = 0; |
1699 | 1699 | ||
@@ -1702,9 +1702,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) | |||
1702 | 1702 | ||
1703 | if (irqd->parent_data) | 1703 | if (irqd->parent_data) |
1704 | ret = __irq_domain_activate_irq(irqd->parent_data, | 1704 | ret = __irq_domain_activate_irq(irqd->parent_data, |
1705 | early); | 1705 | reserve); |
1706 | if (!ret && domain->ops->activate) { | 1706 | if (!ret && domain->ops->activate) { |
1707 | ret = domain->ops->activate(domain, irqd, early); | 1707 | ret = domain->ops->activate(domain, irqd, reserve); |
1708 | /* Rollback in case of error */ | 1708 | /* Rollback in case of error */ |
1709 | if (ret && irqd->parent_data) | 1709 | if (ret && irqd->parent_data) |
1710 | __irq_domain_deactivate_irq(irqd->parent_data); | 1710 | __irq_domain_deactivate_irq(irqd->parent_data); |
@@ -1716,17 +1716,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) | |||
1716 | /** | 1716 | /** |
1717 | * irq_domain_activate_irq - Call domain_ops->activate recursively to activate | 1717 | * irq_domain_activate_irq - Call domain_ops->activate recursively to activate |
1718 | * interrupt | 1718 | * interrupt |
1719 | * @irq_data: outermost irq_data associated with interrupt | 1719 | * @irq_data: Outermost irq_data associated with interrupt |
1720 | * @reserve: If set only reserve an interrupt vector instead of assigning one | ||
1720 | * | 1721 | * |
1721 | * This is the second step to call domain_ops->activate to program interrupt | 1722 | * This is the second step to call domain_ops->activate to program interrupt |
1722 | * controllers, so the interrupt could actually get delivered. | 1723 | * controllers, so the interrupt could actually get delivered. |
1723 | */ | 1724 | */ |
1724 | int irq_domain_activate_irq(struct irq_data *irq_data, bool early) | 1725 | int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) |
1725 | { | 1726 | { |
1726 | int ret = 0; | 1727 | int ret = 0; |
1727 | 1728 | ||
1728 | if (!irqd_is_activated(irq_data)) | 1729 | if (!irqd_is_activated(irq_data)) |
1729 | ret = __irq_domain_activate_irq(irq_data, early); | 1730 | ret = __irq_domain_activate_irq(irq_data, reserve); |
1730 | if (!ret) | 1731 | if (!ret) |
1731 | irqd_set_activated(irq_data); | 1732 | irqd_set_activated(irq_data); |
1732 | return ret; | 1733 | return ret; |
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 7df2480005f8..5187dfe809ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c | |||
@@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m) | |||
321 | int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, | 321 | int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, |
322 | bool reserved, unsigned int *mapped_cpu) | 322 | bool reserved, unsigned int *mapped_cpu) |
323 | { | 323 | { |
324 | unsigned int cpu; | 324 | unsigned int cpu, best_cpu, maxavl = 0; |
325 | struct cpumap *cm; | ||
326 | unsigned int bit; | ||
325 | 327 | ||
328 | best_cpu = UINT_MAX; | ||
326 | for_each_cpu(cpu, msk) { | 329 | for_each_cpu(cpu, msk) { |
327 | struct cpumap *cm = per_cpu_ptr(m->maps, cpu); | 330 | cm = per_cpu_ptr(m->maps, cpu); |
328 | unsigned int bit; | ||
329 | 331 | ||
330 | if (!cm->online) | 332 | if (!cm->online || cm->available <= maxavl) |
331 | continue; | 333 | continue; |
332 | 334 | ||
335 | best_cpu = cpu; | ||
336 | maxavl = cm->available; | ||
337 | } | ||
338 | |||
339 | if (maxavl) { | ||
340 | cm = per_cpu_ptr(m->maps, best_cpu); | ||
333 | bit = matrix_alloc_area(m, cm, 1, false); | 341 | bit = matrix_alloc_area(m, cm, 1, false); |
334 | if (bit < m->alloc_end) { | 342 | if (bit < m->alloc_end) { |
335 | cm->allocated++; | 343 | cm->allocated++; |
@@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, | |||
338 | m->global_available--; | 346 | m->global_available--; |
339 | if (reserved) | 347 | if (reserved) |
340 | m->global_reserved--; | 348 | m->global_reserved--; |
341 | *mapped_cpu = cpu; | 349 | *mapped_cpu = best_cpu; |
342 | trace_irq_matrix_alloc(bit, cpu, m, cm); | 350 | trace_irq_matrix_alloc(bit, best_cpu, m, cm); |
343 | return bit; | 351 | return bit; |
344 | } | 352 | } |
345 | } | 353 | } |
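Rather than allocating from the first online CPU in the mask, the rewritten loop first scans for the online CPU with the most available vectors (maxavl) and only then allocates from that CPU's map. A tiny userspace version of the selection step; the cpumap array is invented for illustration:

#include <limits.h>
#include <stdio.h>

struct cpumap { int online; unsigned int available; };

int main(void)
{
        struct cpumap maps[] = {
                { 1, 12 }, { 0, 200 }, { 1, 57 }, { 1, 57 }, { 1, 3 },
        };
        unsigned int cpu, best_cpu = UINT_MAX, maxavl = 0;

        for (cpu = 0; cpu < sizeof(maps) / sizeof(maps[0]); cpu++) {
                if (!maps[cpu].online || maps[cpu].available <= maxavl)
                        continue;          /* offline, or no better than the current best */
                best_cpu = cpu;
                maxavl = maps[cpu].available;
        }

        if (maxavl)
                printf("allocate on cpu %u (%u vectors free)\n", best_cpu, maxavl);
        else
                printf("no capacity\n");
        return 0;
}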
@@ -384,7 +392,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) | |||
384 | { | 392 | { |
385 | struct cpumap *cm = this_cpu_ptr(m->maps); | 393 | struct cpumap *cm = this_cpu_ptr(m->maps); |
386 | 394 | ||
387 | return (m->global_available - cpudown) ? cm->available : 0; | 395 | if (!cpudown) |
396 | return m->global_available; | ||
397 | return m->global_available - cm->available; | ||
388 | } | 398 | } |
389 | 399 | ||
390 | /** | 400 | /** |
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index edb987b2c58d..2f3c4f5382cc 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
@@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, | |||
339 | return ret; | 339 | return ret; |
340 | } | 340 | } |
341 | 341 | ||
342 | /* | ||
343 | * Carefully check whether the device can use reservation mode. If | ||
344 | * reservation mode is enabled then the early activation will assign a | ||
345 | * dummy vector to the device. If the PCI/MSI device does not support | ||
346 | * masking of the entry then this can result in spurious interrupts when | ||
347 | * the device driver is not absolutely careful. But even then a malfunction | ||
348 | * of the hardware could result in a spurious interrupt on the dummy vector | ||
349 | * and render the device unusable. If the entry can be masked then the core | ||
350 | * logic will prevent the spurious interrupt and reservation mode can be | ||
351 | * used. For now reservation mode is restricted to PCI/MSI. | ||
352 | */ | ||
353 | static bool msi_check_reservation_mode(struct irq_domain *domain, | ||
354 | struct msi_domain_info *info, | ||
355 | struct device *dev) | ||
356 | { | ||
357 | struct msi_desc *desc; | ||
358 | |||
359 | if (domain->bus_token != DOMAIN_BUS_PCI_MSI) | ||
360 | return false; | ||
361 | |||
362 | if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) | ||
363 | return false; | ||
364 | |||
365 | if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) | ||
366 | return false; | ||
367 | |||
368 | /* | ||
369 | * Checking the first MSI descriptor is sufficient. MSIX supports | ||
370 | * masking and MSI does so when the maskbit is set. | ||
371 | */ | ||
372 | desc = first_msi_entry(dev); | ||
373 | return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; | ||
374 | } | ||
375 | |||
342 | /** | 376 | /** |
343 | * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain | 377 | * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain |
344 | * @domain: The domain to allocate from | 378 | * @domain: The domain to allocate from |
@@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
353 | { | 387 | { |
354 | struct msi_domain_info *info = domain->host_data; | 388 | struct msi_domain_info *info = domain->host_data; |
355 | struct msi_domain_ops *ops = info->ops; | 389 | struct msi_domain_ops *ops = info->ops; |
356 | msi_alloc_info_t arg; | 390 | struct irq_data *irq_data; |
357 | struct msi_desc *desc; | 391 | struct msi_desc *desc; |
392 | msi_alloc_info_t arg; | ||
358 | int i, ret, virq; | 393 | int i, ret, virq; |
394 | bool can_reserve; | ||
359 | 395 | ||
360 | ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); | 396 | ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); |
361 | if (ret) | 397 | if (ret) |
@@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
385 | if (ops->msi_finish) | 421 | if (ops->msi_finish) |
386 | ops->msi_finish(&arg, 0); | 422 | ops->msi_finish(&arg, 0); |
387 | 423 | ||
424 | can_reserve = msi_check_reservation_mode(domain, info, dev); | ||
425 | |||
388 | for_each_msi_entry(desc, dev) { | 426 | for_each_msi_entry(desc, dev) { |
389 | virq = desc->irq; | 427 | virq = desc->irq; |
390 | if (desc->nvec_used == 1) | 428 | if (desc->nvec_used == 1) |
@@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, | |||
397 | * the MSI entries before the PCI layer enables MSI in the | 435 | * the MSI entries before the PCI layer enables MSI in the |
398 | * card. Otherwise the card latches a random msi message. | 436 | * card. Otherwise the card latches a random msi message. |
399 | */ | 437 | */ |
400 | if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { | 438 | if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) |
401 | struct irq_data *irq_data; | 439 | continue; |
402 | 440 | ||
441 | irq_data = irq_domain_get_irq_data(domain, desc->irq); | ||
442 | if (!can_reserve) | ||
443 | irqd_clr_can_reserve(irq_data); | ||
444 | ret = irq_domain_activate_irq(irq_data, can_reserve); | ||
445 | if (ret) | ||
446 | goto cleanup; | ||
447 | } | ||
448 | |||
449 | /* | ||
450 | * If these interrupts use reservation mode, clear the activated bit | ||
451 | * so request_irq() will assign the final vector. | ||
452 | */ | ||
453 | if (can_reserve) { | ||
454 | for_each_msi_entry(desc, dev) { | ||
403 | irq_data = irq_domain_get_irq_data(domain, desc->irq); | 455 | irq_data = irq_domain_get_irq_data(domain, desc->irq); |
404 | ret = irq_domain_activate_irq(irq_data, true); | 456 | irqd_clr_activated(irq_data); |
405 | if (ret) | ||
406 | goto cleanup; | ||
407 | if (info->flags & MSI_FLAG_MUST_REACTIVATE) | ||
408 | irqd_clr_activated(irq_data); | ||
409 | } | 457 | } |
410 | } | 458 | } |
411 | return 0; | 459 | return 0; |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24e4adc..b4517095db6a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) | |||
79 | } | 79 | } |
80 | EXPORT_SYMBOL_GPL(static_key_count); | 80 | EXPORT_SYMBOL_GPL(static_key_count); |
81 | 81 | ||
82 | static void static_key_slow_inc_cpuslocked(struct static_key *key) | 82 | void static_key_slow_inc_cpuslocked(struct static_key *key) |
83 | { | 83 | { |
84 | int v, v1; | 84 | int v, v1; |
85 | 85 | ||
@@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) | |||
180 | } | 180 | } |
181 | EXPORT_SYMBOL_GPL(static_key_disable); | 181 | EXPORT_SYMBOL_GPL(static_key_disable); |
182 | 182 | ||
183 | static void static_key_slow_dec_cpuslocked(struct static_key *key, | 183 | static void __static_key_slow_dec_cpuslocked(struct static_key *key, |
184 | unsigned long rate_limit, | 184 | unsigned long rate_limit, |
185 | struct delayed_work *work) | 185 | struct delayed_work *work) |
186 | { | 186 | { |
@@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, | |||
211 | struct delayed_work *work) | 211 | struct delayed_work *work) |
212 | { | 212 | { |
213 | cpus_read_lock(); | 213 | cpus_read_lock(); |
214 | static_key_slow_dec_cpuslocked(key, rate_limit, work); | 214 | __static_key_slow_dec_cpuslocked(key, rate_limit, work); |
215 | cpus_read_unlock(); | 215 | cpus_read_unlock(); |
216 | } | 216 | } |
217 | 217 | ||
@@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) | |||
229 | } | 229 | } |
230 | EXPORT_SYMBOL_GPL(static_key_slow_dec); | 230 | EXPORT_SYMBOL_GPL(static_key_slow_dec); |
231 | 231 | ||
232 | void static_key_slow_dec_cpuslocked(struct static_key *key) | ||
233 | { | ||
234 | STATIC_KEY_CHECK_USE(key); | ||
235 | __static_key_slow_dec_cpuslocked(key, 0, NULL); | ||
236 | } | ||
237 | |||
232 | void static_key_slow_dec_deferred(struct static_key_deferred *key) | 238 | void static_key_slow_dec_deferred(struct static_key_deferred *key) |
233 | { | 239 | { |
234 | STATIC_KEY_CHECK_USE(key); | 240 | STATIC_KEY_CHECK_USE(key); |
diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33faf4013..7594c033d98a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c | |||
@@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) | |||
157 | } | 157 | } |
158 | EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); | 158 | EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); |
159 | 159 | ||
160 | void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) | 160 | void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) |
161 | { | 161 | { |
162 | write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); | 162 | write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); |
163 | } | 163 | } |
@@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) | |||
183 | } | 183 | } |
184 | EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); | 184 | EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); |
185 | 185 | ||
186 | void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) | 186 | void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) |
187 | { | 187 | { |
188 | write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, | 188 | write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, |
189 | _RET_IP_); | 189 | _RET_IP_); |
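With u16 parameters, the upper halves of 32-bit comparison operands were silently dropped before reaching write_comp_data(), so two different values could be recorded as equal; widening the prototypes to u32 preserves them. A quick demonstration of the truncation effect; the record_* callbacks are only stand-ins for write_comp_data():

#include <stdint.h>
#include <stdio.h>

static void record_u16(uint16_t a, uint16_t b)   /* old, narrow signature */
{
        printf("u16 sees: 0x%04x vs 0x%04x (%s)\n", a, b, a == b ? "equal" : "different");
}

static void record_u32(uint32_t a, uint32_t b)   /* fixed, full-width signature */
{
        printf("u32 sees: 0x%08x vs 0x%08x (%s)\n", a, b, a == b ? "equal" : "different");
}

int main(void)
{
        uint32_t a = 0x00011234, b = 0x00021234; /* differ only above bit 15 */

        record_u16(a, b);                        /* truncated: reported as equal */
        record_u32(a, b);                        /* correct: reported as different */
        return 0;
}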
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9776da8db180..521659044719 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/gfp.h> | 49 | #include <linux/gfp.h> |
50 | #include <linux/random.h> | 50 | #include <linux/random.h> |
51 | #include <linux/jhash.h> | 51 | #include <linux/jhash.h> |
52 | #include <linux/nmi.h> | ||
52 | 53 | ||
53 | #include <asm/sections.h> | 54 | #include <asm/sections.h> |
54 | 55 | ||
@@ -57,10 +58,6 @@ | |||
57 | #define CREATE_TRACE_POINTS | 58 | #define CREATE_TRACE_POINTS |
58 | #include <trace/events/lock.h> | 59 | #include <trace/events/lock.h> |
59 | 60 | ||
60 | #ifdef CONFIG_LOCKDEP_CROSSRELEASE | ||
61 | #include <linux/slab.h> | ||
62 | #endif | ||
63 | |||
64 | #ifdef CONFIG_PROVE_LOCKING | 61 | #ifdef CONFIG_PROVE_LOCKING |
65 | int prove_locking = 1; | 62 | int prove_locking = 1; |
66 | module_param(prove_locking, int, 0644); | 63 | module_param(prove_locking, int, 0644); |
@@ -75,19 +72,6 @@ module_param(lock_stat, int, 0644); | |||
75 | #define lock_stat 0 | 72 | #define lock_stat 0 |
76 | #endif | 73 | #endif |
77 | 74 | ||
78 | #ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK | ||
79 | static int crossrelease_fullstack = 1; | ||
80 | #else | ||
81 | static int crossrelease_fullstack; | ||
82 | #endif | ||
83 | static int __init allow_crossrelease_fullstack(char *str) | ||
84 | { | ||
85 | crossrelease_fullstack = 1; | ||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | early_param("crossrelease_fullstack", allow_crossrelease_fullstack); | ||
90 | |||
91 | /* | 75 | /* |
92 | * lockdep_lock: protects the lockdep graph, the hashes and the | 76 | * lockdep_lock: protects the lockdep graph, the hashes and the |
93 | * class/list/hash allocators. | 77 | * class/list/hash allocators. |
@@ -740,18 +724,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
740 | return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); | 724 | return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); |
741 | } | 725 | } |
742 | 726 | ||
743 | #ifdef CONFIG_LOCKDEP_CROSSRELEASE | ||
744 | static void cross_init(struct lockdep_map *lock, int cross); | ||
745 | static int cross_lock(struct lockdep_map *lock); | ||
746 | static int lock_acquire_crosslock(struct held_lock *hlock); | ||
747 | static int lock_release_crosslock(struct lockdep_map *lock); | ||
748 | #else | ||
749 | static inline void cross_init(struct lockdep_map *lock, int cross) {} | ||
750 | static inline int cross_lock(struct lockdep_map *lock) { return 0; } | ||
751 | static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } | ||
752 | static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } | ||
753 | #endif | ||
754 | |||
755 | /* | 727 | /* |
756 | * Register a lock's class in the hash-table, if the class is not present | 728 | * Register a lock's class in the hash-table, if the class is not present |
757 | * yet. Otherwise we look it up. We cache the result in the lock object | 729 | * yet. Otherwise we look it up. We cache the result in the lock object |
@@ -1151,41 +1123,22 @@ print_circular_lock_scenario(struct held_lock *src, | |||
1151 | printk(KERN_CONT "\n\n"); | 1123 | printk(KERN_CONT "\n\n"); |
1152 | } | 1124 | } |
1153 | 1125 | ||
1154 | if (cross_lock(tgt->instance)) { | 1126 | printk(" Possible unsafe locking scenario:\n\n"); |
1155 | printk(" Possible unsafe locking scenario by crosslock:\n\n"); | 1127 | printk(" CPU0 CPU1\n"); |
1156 | printk(" CPU0 CPU1\n"); | 1128 | printk(" ---- ----\n"); |
1157 | printk(" ---- ----\n"); | 1129 | printk(" lock("); |
1158 | printk(" lock("); | 1130 | __print_lock_name(target); |
1159 | __print_lock_name(parent); | 1131 | printk(KERN_CONT ");\n"); |
1160 | printk(KERN_CONT ");\n"); | 1132 | printk(" lock("); |
1161 | printk(" lock("); | 1133 | __print_lock_name(parent); |
1162 | __print_lock_name(target); | 1134 | printk(KERN_CONT ");\n"); |
1163 | printk(KERN_CONT ");\n"); | 1135 | printk(" lock("); |
1164 | printk(" lock("); | 1136 | __print_lock_name(target); |
1165 | __print_lock_name(source); | 1137 | printk(KERN_CONT ");\n"); |
1166 | printk(KERN_CONT ");\n"); | 1138 | printk(" lock("); |
1167 | printk(" unlock("); | 1139 | __print_lock_name(source); |
1168 | __print_lock_name(target); | 1140 | printk(KERN_CONT ");\n"); |
1169 | printk(KERN_CONT ");\n"); | 1141 | printk("\n *** DEADLOCK ***\n\n"); |
1170 | printk("\n *** DEADLOCK ***\n\n"); | ||
1171 | } else { | ||
1172 | printk(" Possible unsafe locking scenario:\n\n"); | ||
1173 | printk(" CPU0 CPU1\n"); | ||
1174 | printk(" ---- ----\n"); | ||
1175 | printk(" lock("); | ||
1176 | __print_lock_name(target); | ||
1177 | printk(KERN_CONT ");\n"); | ||
1178 | printk(" lock("); | ||
1179 | __print_lock_name(parent); | ||
1180 | printk(KERN_CONT ");\n"); | ||
1181 | printk(" lock("); | ||
1182 | __print_lock_name(target); | ||
1183 | printk(KERN_CONT ");\n"); | ||
1184 | printk(" lock("); | ||
1185 | __print_lock_name(source); | ||
1186 | printk(KERN_CONT ");\n"); | ||
1187 | printk("\n *** DEADLOCK ***\n\n"); | ||
1188 | } | ||
1189 | } | 1142 | } |
1190 | 1143 | ||
1191 | /* | 1144 | /* |
@@ -1211,10 +1164,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1211 | curr->comm, task_pid_nr(curr)); | 1164 | curr->comm, task_pid_nr(curr)); |
1212 | print_lock(check_src); | 1165 | print_lock(check_src); |
1213 | 1166 | ||
1214 | if (cross_lock(check_tgt->instance)) | 1167 | pr_warn("\nbut task is already holding lock:\n"); |
1215 | pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); | ||
1216 | else | ||
1217 | pr_warn("\nbut task is already holding lock:\n"); | ||
1218 | 1168 | ||
1219 | print_lock(check_tgt); | 1169 | print_lock(check_tgt); |
1220 | pr_warn("\nwhich lock already depends on the new lock.\n\n"); | 1170 | pr_warn("\nwhich lock already depends on the new lock.\n\n"); |
@@ -1244,9 +1194,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1244 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1194 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
1245 | return 0; | 1195 | return 0; |
1246 | 1196 | ||
1247 | if (cross_lock(check_tgt->instance)) | 1197 | if (!save_trace(&this->trace)) |
1248 | this->trace = *trace; | ||
1249 | else if (!save_trace(&this->trace)) | ||
1250 | return 0; | 1198 | return 0; |
1251 | 1199 | ||
1252 | depth = get_lock_depth(target); | 1200 | depth = get_lock_depth(target); |
@@ -1850,9 +1798,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, | |||
1850 | if (nest) | 1798 | if (nest) |
1851 | return 2; | 1799 | return 2; |
1852 | 1800 | ||
1853 | if (cross_lock(prev->instance)) | ||
1854 | continue; | ||
1855 | |||
1856 | return print_deadlock_bug(curr, prev, next); | 1801 | return print_deadlock_bug(curr, prev, next); |
1857 | } | 1802 | } |
1858 | return 1; | 1803 | return 1; |
@@ -2018,31 +1963,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
2018 | for (;;) { | 1963 | for (;;) { |
2019 | int distance = curr->lockdep_depth - depth + 1; | 1964 | int distance = curr->lockdep_depth - depth + 1; |
2020 | hlock = curr->held_locks + depth - 1; | 1965 | hlock = curr->held_locks + depth - 1; |
1966 | |||
2021 | /* | 1967 | /* |
2022 | * Only non-crosslock entries get new dependencies added. | 1968 | * Only non-recursive-read entries get new dependencies |
2023 | * Crosslock entries will be added by commit later: | 1969 | * added: |
2024 | */ | 1970 | */ |
2025 | if (!cross_lock(hlock->instance)) { | 1971 | if (hlock->read != 2 && hlock->check) { |
1972 | int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); | ||
1973 | if (!ret) | ||
1974 | return 0; | ||
1975 | |||
2026 | /* | 1976 | /* |
2027 | * Only non-recursive-read entries get new dependencies | 1977 | * Stop after the first non-trylock entry, |
2028 | * added: | 1978 | * as non-trylock entries have added their |
1979 | * own direct dependencies already, so this | ||
1980 | * lock is connected to them indirectly: | ||
2029 | */ | 1981 | */ |
2030 | if (hlock->read != 2 && hlock->check) { | 1982 | if (!hlock->trylock) |
2031 | int ret = check_prev_add(curr, hlock, next, | 1983 | break; |
2032 | distance, &trace, save_trace); | ||
2033 | if (!ret) | ||
2034 | return 0; | ||
2035 | |||
2036 | /* | ||
2037 | * Stop after the first non-trylock entry, | ||
2038 | * as non-trylock entries have added their | ||
2039 | * own direct dependencies already, so this | ||
2040 | * lock is connected to them indirectly: | ||
2041 | */ | ||
2042 | if (!hlock->trylock) | ||
2043 | break; | ||
2044 | } | ||
2045 | } | 1984 | } |
1985 | |||
2046 | depth--; | 1986 | depth--; |
2047 | /* | 1987 | /* |
2048 | * End of lock-stack? | 1988 | * End of lock-stack? |
@@ -3292,21 +3232,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
3292 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 3232 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
3293 | struct lock_class_key *key, int subclass) | 3233 | struct lock_class_key *key, int subclass) |
3294 | { | 3234 | { |
3295 | cross_init(lock, 0); | ||
3296 | __lockdep_init_map(lock, name, key, subclass); | 3235 | __lockdep_init_map(lock, name, key, subclass); |
3297 | } | 3236 | } |
3298 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 3237 | EXPORT_SYMBOL_GPL(lockdep_init_map); |
3299 | 3238 | ||
3300 | #ifdef CONFIG_LOCKDEP_CROSSRELEASE | ||
3301 | void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, | ||
3302 | struct lock_class_key *key, int subclass) | ||
3303 | { | ||
3304 | cross_init(lock, 1); | ||
3305 | __lockdep_init_map(lock, name, key, subclass); | ||
3306 | } | ||
3307 | EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); | ||
3308 | #endif | ||
3309 | |||
3310 | struct lock_class_key __lockdep_no_validate__; | 3239 | struct lock_class_key __lockdep_no_validate__; |
3311 | EXPORT_SYMBOL_GPL(__lockdep_no_validate__); | 3240 | EXPORT_SYMBOL_GPL(__lockdep_no_validate__); |
3312 | 3241 | ||
@@ -3362,7 +3291,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3362 | int chain_head = 0; | 3291 | int chain_head = 0; |
3363 | int class_idx; | 3292 | int class_idx; |
3364 | u64 chain_key; | 3293 | u64 chain_key; |
3365 | int ret; | ||
3366 | 3294 | ||
3367 | if (unlikely(!debug_locks)) | 3295 | if (unlikely(!debug_locks)) |
3368 | return 0; | 3296 | return 0; |
@@ -3411,8 +3339,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3411 | 3339 | ||
3412 | class_idx = class - lock_classes + 1; | 3340 | class_idx = class - lock_classes + 1; |
3413 | 3341 | ||
3414 | /* TODO: nest_lock is not implemented for crosslock yet. */ | 3342 | if (depth) { |
3415 | if (depth && !cross_lock(lock)) { | ||
3416 | hlock = curr->held_locks + depth - 1; | 3343 | hlock = curr->held_locks + depth - 1; |
3417 | if (hlock->class_idx == class_idx && nest_lock) { | 3344 | if (hlock->class_idx == class_idx && nest_lock) { |
3418 | if (hlock->references) { | 3345 | if (hlock->references) { |
@@ -3500,14 +3427,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3500 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) | 3427 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) |
3501 | return 0; | 3428 | return 0; |
3502 | 3429 | ||
3503 | ret = lock_acquire_crosslock(hlock); | ||
3504 | /* | ||
3505 | * 2 means normal acquire operations are needed. Otherwise, it's | ||
3506 | * ok just to return with '0:fail, 1:success'. | ||
3507 | */ | ||
3508 | if (ret != 2) | ||
3509 | return ret; | ||
3510 | |||
3511 | curr->curr_chain_key = chain_key; | 3430 | curr->curr_chain_key = chain_key; |
3512 | curr->lockdep_depth++; | 3431 | curr->lockdep_depth++; |
3513 | check_chain_key(curr); | 3432 | check_chain_key(curr); |
@@ -3745,19 +3664,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
3745 | struct task_struct *curr = current; | 3664 | struct task_struct *curr = current; |
3746 | struct held_lock *hlock; | 3665 | struct held_lock *hlock; |
3747 | unsigned int depth; | 3666 | unsigned int depth; |
3748 | int ret, i; | 3667 | int i; |
3749 | 3668 | ||
3750 | if (unlikely(!debug_locks)) | 3669 | if (unlikely(!debug_locks)) |
3751 | return 0; | 3670 | return 0; |
3752 | 3671 | ||
3753 | ret = lock_release_crosslock(lock); | ||
3754 | /* | ||
3755 | * 2 means normal release operations are needed. Otherwise, it's | ||
3756 | * ok just to return with '0:fail, 1:success'. | ||
3757 | */ | ||
3758 | if (ret != 2) | ||
3759 | return ret; | ||
3760 | |||
3761 | depth = curr->lockdep_depth; | 3672 | depth = curr->lockdep_depth; |
3762 | /* | 3673 | /* |
3763 | * So we're all set to release this lock.. wait what lock? We don't | 3674 | * So we're all set to release this lock.. wait what lock? We don't |
@@ -4580,6 +4491,7 @@ retry: | |||
4580 | if (!unlock) | 4491 | if (!unlock) |
4581 | if (read_trylock(&tasklist_lock)) | 4492 | if (read_trylock(&tasklist_lock)) |
4582 | unlock = 1; | 4493 | unlock = 1; |
4494 | touch_nmi_watchdog(); | ||
4583 | } while_each_thread(g, p); | 4495 | } while_each_thread(g, p); |
4584 | 4496 | ||
4585 | pr_warn("\n"); | 4497 | pr_warn("\n"); |
@@ -4675,494 +4587,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) | |||
4675 | dump_stack(); | 4587 | dump_stack(); |
4676 | } | 4588 | } |
4677 | EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); | 4589 | EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); |
4678 | |||
4679 | #ifdef CONFIG_LOCKDEP_CROSSRELEASE | ||
4680 | |||
4681 | /* | ||
4682 | * Crossrelease works by recording a lock history for each thread and | ||
4683 | * connecting those historic locks that were taken after the | ||
4684 | * wait_for_completion() in the complete() context. | ||
4685 | * | ||
4686 | * Task-A Task-B | ||
4687 | * | ||
4688 | * mutex_lock(&A); | ||
4689 | * mutex_unlock(&A); | ||
4690 | * | ||
4691 | * wait_for_completion(&C); | ||
4692 | * lock_acquire_crosslock(); | ||
4693 | * atomic_inc_return(&cross_gen_id); | ||
4694 | * | | ||
4695 | * | mutex_lock(&B); | ||
4696 | * | mutex_unlock(&B); | ||
4697 | * | | ||
4698 | * | complete(&C); | ||
4699 | * `-- lock_commit_crosslock(); | ||
4700 | * | ||
4701 | * Which will then add a dependency between B and C. | ||
4702 | */ | ||
4703 | |||
4704 | #define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) | ||
4705 | |||
4706 | /* | ||
4707 | * Whenever a crosslock is held, cross_gen_id will be increased. | ||
4708 | */ | ||
4709 | static atomic_t cross_gen_id; /* Can be wrapped */ | ||
4710 | |||
4711 | /* | ||
4712 | * Make an entry of the ring buffer invalid. | ||
4713 | */ | ||
4714 | static inline void invalidate_xhlock(struct hist_lock *xhlock) | ||
4715 | { | ||
4716 | /* | ||
4717 | * Normally, xhlock->hlock.instance must be !NULL. | ||
4718 | */ | ||
4719 | xhlock->hlock.instance = NULL; | ||
4720 | } | ||
4721 | |||
4722 | /* | ||
4723 | * Lock history stacks; we have 2 nested lock history stacks: | ||
4724 | * | ||
4725 | * HARD(IRQ) | ||
4726 | * SOFT(IRQ) | ||
4727 | * | ||
4728 | * The thing is that once we complete a HARD/SOFT IRQ the future task locks | ||
4729 | * should not depend on any of the locks observed while running the IRQ. So | ||
4730 | * what we do is rewind the history buffer and erase all our knowledge of that | ||
4731 | * temporal event. | ||
4732 | */ | ||
4733 | |||
4734 | void crossrelease_hist_start(enum xhlock_context_t c) | ||
4735 | { | ||
4736 | struct task_struct *cur = current; | ||
4737 | |||
4738 | if (!cur->xhlocks) | ||
4739 | return; | ||
4740 | |||
4741 | cur->xhlock_idx_hist[c] = cur->xhlock_idx; | ||
4742 | cur->hist_id_save[c] = cur->hist_id; | ||
4743 | } | ||
4744 | |||
4745 | void crossrelease_hist_end(enum xhlock_context_t c) | ||
4746 | { | ||
4747 | struct task_struct *cur = current; | ||
4748 | |||
4749 | if (cur->xhlocks) { | ||
4750 | unsigned int idx = cur->xhlock_idx_hist[c]; | ||
4751 | struct hist_lock *h = &xhlock(idx); | ||
4752 | |||
4753 | cur->xhlock_idx = idx; | ||
4754 | |||
4755 | /* Check if the ring was overwritten. */ | ||
4756 | if (h->hist_id != cur->hist_id_save[c]) | ||
4757 | invalidate_xhlock(h); | ||
4758 | } | ||
4759 | } | ||
4760 | |||
4761 | /* | ||
4762 | * lockdep_invariant_state() is used to annotate independence inside a task, to | ||
4763 | * make one task look like multiple independent 'tasks'. | ||
4764 | * | ||
4765 | * Take for instance workqueues; each work is independent of the last. The | ||
4766 | * completion of a future work does not depend on the completion of a past work | ||
4767 | * (in general). Therefore we must not carry that (lock) dependency across | ||
4768 | * works. | ||
4769 | * | ||
4770 | * This is true for many things; pretty much all kthreads fall into this | ||
4771 | * pattern, where they have an invariant state and future completions do not | ||
4772 | * depend on past completions. It's just that since they all have the 'same' | ||
4773 | * form -- the kthread does the same over and over -- it doesn't typically | ||
4774 | * matter. | ||
4775 | * | ||
4776 | * The same is true for system-calls, once a system call is completed (we've | ||
4777 | * returned to userspace) the next system call does not depend on the lock | ||
4778 | * history of the previous system call. | ||
4779 | * | ||
4780 | * The key property for independence, this invariant state, is that it must be | ||
4781 | * a point where we hold no locks and have no history. Because if we were to | ||
4782 | * hold locks, the restore at _end() would not necessarily recover its history | ||
4783 | * entry. Similarly, independence per-definition means it does not depend on | ||
4784 | * prior state. | ||
4785 | */ | ||
4786 | void lockdep_invariant_state(bool force) | ||
4787 | { | ||
4788 | /* | ||
4789 | * We call this at an invariant point, no current state, no history. | ||
4790 | * Verify the former, enforce the latter. | ||
4791 | */ | ||
4792 | WARN_ON_ONCE(!force && current->lockdep_depth); | ||
4793 | invalidate_xhlock(&xhlock(current->xhlock_idx)); | ||
4794 | } | ||
4795 | |||
4796 | static int cross_lock(struct lockdep_map *lock) | ||
4797 | { | ||
4798 | return lock ? lock->cross : 0; | ||
4799 | } | ||
4800 | |||
4801 | /* | ||
4802 | * This is needed to decide the relationship between wrapable variables. | ||
4803 | */ | ||
4804 | static inline int before(unsigned int a, unsigned int b) | ||
4805 | { | ||
4806 | return (int)(a - b) < 0; | ||
4807 | } | ||
4808 | |||
4809 | static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) | ||
4810 | { | ||
4811 | return hlock_class(&xhlock->hlock); | ||
4812 | } | ||
4813 | |||
4814 | static inline struct lock_class *xlock_class(struct cross_lock *xlock) | ||
4815 | { | ||
4816 | return hlock_class(&xlock->hlock); | ||
4817 | } | ||
4818 | |||
4819 | /* | ||
4820 | * Should we check a dependency with previous one? | ||
4821 | */ | ||
4822 | static inline int depend_before(struct held_lock *hlock) | ||
4823 | { | ||
4824 | return hlock->read != 2 && hlock->check && !hlock->trylock; | ||
4825 | } | ||
4826 | |||
4827 | /* | ||
4828 | * Should we check a dependency with next one? | ||
4829 | */ | ||
4830 | static inline int depend_after(struct held_lock *hlock) | ||
4831 | { | ||
4832 | return hlock->read != 2 && hlock->check; | ||
4833 | } | ||
4834 | |||
4835 | /* | ||
4836 | * Check if the xhlock is valid, which would be false if, | ||
4837 | * | ||
4838 | * 1. Has not been used after initialization yet. | ||
4839 | * 2. Got invalidated. | ||
4840 | * | ||
4841 | * Remember that hist_lock is implemented as a ring buffer. | ||
4842 | */ | ||
4843 | static inline int xhlock_valid(struct hist_lock *xhlock) | ||
4844 | { | ||
4845 | /* | ||
4846 | * xhlock->hlock.instance must be !NULL. | ||
4847 | */ | ||
4848 | return !!xhlock->hlock.instance; | ||
4849 | } | ||
4850 | |||
4851 | /* | ||
4852 | * Record a hist_lock entry. | ||
4853 | * | ||
4854 | * Only disabling IRQs is required. | ||
4855 | */ | ||
4856 | static void add_xhlock(struct held_lock *hlock) | ||
4857 | { | ||
4858 | unsigned int idx = ++current->xhlock_idx; | ||
4859 | struct hist_lock *xhlock = &xhlock(idx); | ||
4860 | |||
4861 | #ifdef CONFIG_DEBUG_LOCKDEP | ||
4862 | /* | ||
4863 | * This can be done locklessly because they are all task-local | ||
4864 | * state, we must however ensure IRQs are disabled. | ||
4865 | */ | ||
4866 | WARN_ON_ONCE(!irqs_disabled()); | ||
4867 | #endif | ||
4868 | |||
4869 | /* Initialize hist_lock's members */ | ||
4870 | xhlock->hlock = *hlock; | ||
4871 | xhlock->hist_id = ++current->hist_id; | ||
4872 | |||
4873 | xhlock->trace.nr_entries = 0; | ||
4874 | xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; | ||
4875 | xhlock->trace.entries = xhlock->trace_entries; | ||
4876 | |||
4877 | if (crossrelease_fullstack) { | ||
4878 | xhlock->trace.skip = 3; | ||
4879 | save_stack_trace(&xhlock->trace); | ||
4880 | } else { | ||
4881 | xhlock->trace.nr_entries = 1; | ||
4882 | xhlock->trace.entries[0] = hlock->acquire_ip; | ||
4883 | } | ||
4884 | } | ||
4885 | |||
4886 | static inline int same_context_xhlock(struct hist_lock *xhlock) | ||
4887 | { | ||
4888 | return xhlock->hlock.irq_context == task_irq_context(current); | ||
4889 | } | ||
4890 | |||
4891 | /* | ||
4892 | * This should be lockless as far as possible because this would be | ||
4893 | * called very frequently. | ||
4894 | */ | ||
4895 | static void check_add_xhlock(struct held_lock *hlock) | ||
4896 | { | ||
4897 | /* | ||
4898 | * Record a hist_lock, only in case that acquisitions ahead | ||
4899 | * could depend on the held_lock. For example, if the held_lock | ||
4900 | * is trylock then acquisitions ahead never depends on that. | ||
4901 | * In that case, we don't need to record it. Just return. | ||
4902 | */ | ||
4903 | if (!current->xhlocks || !depend_before(hlock)) | ||
4904 | return; | ||
4905 | |||
4906 | add_xhlock(hlock); | ||
4907 | } | ||
4908 | |||
4909 | /* | ||
4910 | * For crosslock. | ||
4911 | */ | ||
4912 | static int add_xlock(struct held_lock *hlock) | ||
4913 | { | ||
4914 | struct cross_lock *xlock; | ||
4915 | unsigned int gen_id; | ||
4916 | |||
4917 | if (!graph_lock()) | ||
4918 | return 0; | ||
4919 | |||
4920 | xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; | ||
4921 | |||
4922 | /* | ||
4923 | * When acquisitions for a crosslock are overlapped, we use | ||
4924 | * nr_acquire to perform commit for them, based on cross_gen_id | ||
4925 | * of the first acquisition, which allows to add additional | ||
4926 | * dependencies. | ||
4927 | * | ||
4928 | * Moreover, when no acquisition of a crosslock is in progress, | ||
4929 | * we should not perform commit because the lock might not exist | ||
4930 | * any more, which might cause incorrect memory access. So we | ||
4931 | * have to track the number of acquisitions of a crosslock. | ||
4932 | * | ||
4933 | * depend_after() is necessary to initialize only the first | ||
4934 | * valid xlock so that the xlock can be used on its commit. | ||
4935 | */ | ||
4936 | if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) | ||
4937 | goto unlock; | ||
4938 | |||
4939 | gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); | ||
4940 | xlock->hlock = *hlock; | ||
4941 | xlock->hlock.gen_id = gen_id; | ||
4942 | unlock: | ||
4943 | graph_unlock(); | ||
4944 | return 1; | ||
4945 | } | ||
4946 | |||
4947 | /* | ||
4948 | * Called for both normal and crosslock acquires. Normal locks will be | ||
4949 | * pushed on the hist_lock queue. Cross locks will record state and | ||
4950 | * stop regular lock_acquire() to avoid being placed on the held_lock | ||
4951 | * stack. | ||
4952 | * | ||
4953 | * Return: 0 - failure; | ||
4954 | * 1 - crosslock, done; | ||
4955 | * 2 - normal lock, continue to held_lock[] ops. | ||
4956 | */ | ||
4957 | static int lock_acquire_crosslock(struct held_lock *hlock) | ||
4958 | { | ||
4959 | /* | ||
4960 | * CONTEXT 1 CONTEXT 2 | ||
4961 | * --------- --------- | ||
4962 | * lock A (cross) | ||
4963 | * X = atomic_inc_return(&cross_gen_id) | ||
4964 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
4965 | * Y = atomic_read_acquire(&cross_gen_id) | ||
4966 | * lock B | ||
4967 | * | ||
4968 | * atomic_read_acquire() is for ordering between A and B, | ||
4969 | * IOW, A happens before B, when CONTEXT 2 see Y >= X. | ||
4970 | * | ||
4971 | * Pairs with atomic_inc_return() in add_xlock(). | ||
4972 | */ | ||
4973 | hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); | ||
4974 | |||
4975 | if (cross_lock(hlock->instance)) | ||
4976 | return add_xlock(hlock); | ||
4977 | |||
4978 | check_add_xhlock(hlock); | ||
4979 | return 2; | ||
4980 | } | ||
4981 | |||
4982 | static int copy_trace(struct stack_trace *trace) | ||
4983 | { | ||
4984 | unsigned long *buf = stack_trace + nr_stack_trace_entries; | ||
4985 | unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; | ||
4986 | unsigned int nr = min(max_nr, trace->nr_entries); | ||
4987 | |||
4988 | trace->nr_entries = nr; | ||
4989 | memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); | ||
4990 | trace->entries = buf; | ||
4991 | nr_stack_trace_entries += nr; | ||
4992 | |||
4993 | if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { | ||
4994 | if (!debug_locks_off_graph_unlock()) | ||
4995 | return 0; | ||
4996 | |||
4997 | print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); | ||
4998 | dump_stack(); | ||
4999 | |||
5000 | return 0; | ||
5001 | } | ||
5002 | |||
5003 | return 1; | ||
5004 | } | ||
5005 | |||
5006 | static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) | ||
5007 | { | ||
5008 | unsigned int xid, pid; | ||
5009 | u64 chain_key; | ||
5010 | |||
5011 | xid = xlock_class(xlock) - lock_classes; | ||
5012 | chain_key = iterate_chain_key((u64)0, xid); | ||
5013 | pid = xhlock_class(xhlock) - lock_classes; | ||
5014 | chain_key = iterate_chain_key(chain_key, pid); | ||
5015 | |||
5016 | if (lookup_chain_cache(chain_key)) | ||
5017 | return 1; | ||
5018 | |||
5019 | if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, | ||
5020 | chain_key)) | ||
5021 | return 0; | ||
5022 | |||
5023 | if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, | ||
5024 | &xhlock->trace, copy_trace)) | ||
5025 | return 0; | ||
5026 | |||
5027 | return 1; | ||
5028 | } | ||
5029 | |||
5030 | static void commit_xhlocks(struct cross_lock *xlock) | ||
5031 | { | ||
5032 | unsigned int cur = current->xhlock_idx; | ||
5033 | unsigned int prev_hist_id = xhlock(cur).hist_id; | ||
5034 | unsigned int i; | ||
5035 | |||
5036 | if (!graph_lock()) | ||
5037 | return; | ||
5038 | |||
5039 | if (xlock->nr_acquire) { | ||
5040 | for (i = 0; i < MAX_XHLOCKS_NR; i++) { | ||
5041 | struct hist_lock *xhlock = &xhlock(cur - i); | ||
5042 | |||
5043 | if (!xhlock_valid(xhlock)) | ||
5044 | break; | ||
5045 | |||
5046 | if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) | ||
5047 | break; | ||
5048 | |||
5049 | if (!same_context_xhlock(xhlock)) | ||
5050 | break; | ||
5051 | |||
5052 | /* | ||
5053 | * Filter out the cases where the ring buffer was | ||
5054 | * overwritten and the current entry has a bigger | ||
5055 | * hist_id than the previous one, which is impossible | ||
5056 | * otherwise: | ||
5057 | */ | ||
5058 | if (unlikely(before(prev_hist_id, xhlock->hist_id))) | ||
5059 | break; | ||
5060 | |||
5061 | prev_hist_id = xhlock->hist_id; | ||
5062 | |||
5063 | /* | ||
5064 | * commit_xhlock() returns 0 with graph_lock already | ||
5065 | * released if fail. | ||
5066 | */ | ||
5067 | if (!commit_xhlock(xlock, xhlock)) | ||
5068 | return; | ||
5069 | } | ||
5070 | } | ||
5071 | |||
5072 | graph_unlock(); | ||
5073 | } | ||
5074 | |||
5075 | void lock_commit_crosslock(struct lockdep_map *lock) | ||
5076 | { | ||
5077 | struct cross_lock *xlock; | ||
5078 | unsigned long flags; | ||
5079 | |||
5080 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
5081 | return; | ||
5082 | |||
5083 | if (!current->xhlocks) | ||
5084 | return; | ||
5085 | |||
5086 | /* | ||
5087 | * Do commit hist_locks with the cross_lock, only in case that | ||
5088 | * the cross_lock could depend on acquisitions after that. | ||
5089 | * | ||
5090 | * For example, if the cross_lock does not have the 'check' flag | ||
5091 | * then we don't need to check dependencies and commit for that. | ||
5092 | * Just skip it. In that case, of course, the cross_lock does | ||
5093 | * not depend on acquisitions ahead, either. | ||
5094 | * | ||
5095 | * WARNING: Don't do that in add_xlock() in advance. When an | ||
5096 | * acquisition context is different from the commit context, | ||
5097 | * invalid(skipped) cross_lock might be accessed. | ||
5098 | */ | ||
5099 | if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) | ||
5100 | return; | ||
5101 | |||
5102 | raw_local_irq_save(flags); | ||
5103 | check_flags(flags); | ||
5104 | current->lockdep_recursion = 1; | ||
5105 | xlock = &((struct lockdep_map_cross *)lock)->xlock; | ||
5106 | commit_xhlocks(xlock); | ||
5107 | current->lockdep_recursion = 0; | ||
5108 | raw_local_irq_restore(flags); | ||
5109 | } | ||
5110 | EXPORT_SYMBOL_GPL(lock_commit_crosslock); | ||
5111 | |||
5112 | /* | ||
5113 | * Return: 0 - failure; | ||
5114 | * 1 - crosslock, done; | ||
5115 | * 2 - normal lock, continue to held_lock[] ops. | ||
5116 | */ | ||
5117 | static int lock_release_crosslock(struct lockdep_map *lock) | ||
5118 | { | ||
5119 | if (cross_lock(lock)) { | ||
5120 | if (!graph_lock()) | ||
5121 | return 0; | ||
5122 | ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; | ||
5123 | graph_unlock(); | ||
5124 | return 1; | ||
5125 | } | ||
5126 | return 2; | ||
5127 | } | ||
5128 | |||
5129 | static void cross_init(struct lockdep_map *lock, int cross) | ||
5130 | { | ||
5131 | if (cross) | ||
5132 | ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; | ||
5133 | |||
5134 | lock->cross = cross; | ||
5135 | |||
5136 | /* | ||
5137 | * Crossrelease assumes that the ring buffer size of xhlocks | ||
5138 | * is aligned with power of 2. So force it on build. | ||
5139 | */ | ||
5140 | BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); | ||
5141 | } | ||
5142 | |||
5143 | void lockdep_init_task(struct task_struct *task) | ||
5144 | { | ||
5145 | int i; | ||
5146 | |||
5147 | task->xhlock_idx = UINT_MAX; | ||
5148 | task->hist_id = 0; | ||
5149 | |||
5150 | for (i = 0; i < XHLOCK_CTX_NR; i++) { | ||
5151 | task->xhlock_idx_hist[i] = UINT_MAX; | ||
5152 | task->hist_id_save[i] = 0; | ||
5153 | } | ||
5154 | |||
5155 | task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, | ||
5156 | GFP_KERNEL); | ||
5157 | } | ||
5158 | |||
5159 | void lockdep_free_task(struct task_struct *task) | ||
5160 | { | ||
5161 | if (task->xhlocks) { | ||
5162 | void *tmp = task->xhlocks; | ||
5163 | /* Disable crossrelease for current */ | ||
5164 | task->xhlocks = NULL; | ||
5165 | kfree(tmp); | ||
5166 | } | ||
5167 | } | ||
5168 | #endif | ||
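
Note: the crossrelease implementation deleted above orders events with a wrapping generation counter (cross_gen_id) and compares entries in the per-task history ring with the wrap-safe before() helper; commit_xhlocks() stops walking the ring as soon as before() says an entry predates the crosslock's generation. The snippet below is a standalone userspace sketch (not kernel code) of why the signed-subtraction comparison stays correct across the u32 wrap, provided the two counters are less than 2^31 apart:

    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    /* Wrap-safe "a happened before b" on unsigned counters, as in the removed code. */
    static int before(unsigned int a, unsigned int b)
    {
        return (int)(a - b) < 0;
    }

    int main(void)
    {
        assert(before(1, 2));                    /* ordinary case */
        assert(!before(2, 1));
        assert(before(UINT_MAX - 5, UINT_MAX));  /* still ordered near the wrap point */
        assert(before(UINT_MAX, 0));             /* 0 counts as "after" UINT_MAX */
        assert(before(UINT_MAX, 10));            /* ordering survives the wrap */

        printf("wrap-safe ordering holds\n");
        return 0;
    }

The BUILD_BUG_ON() in cross_init() serves a related purpose: the '% MAX_XHLOCKS_NR' ring index only stays continuous across the wrap of xhlock_idx when the ring size divides 2^32, hence the power-of-two requirement mentioned in its comment.
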
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6f3dba6e4e9e..65cc0cb984e6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, | |||
1290 | return ret; | 1290 | return ret; |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) | ||
1294 | { | ||
1295 | int ret = try_to_take_rt_mutex(lock, current, NULL); | ||
1296 | |||
1297 | /* | ||
1298 | * try_to_take_rt_mutex() sets the lock waiters bit | ||
1299 | * unconditionally. Clean this up. | ||
1300 | */ | ||
1301 | fixup_rt_mutex_waiters(lock); | ||
1302 | |||
1303 | return ret; | ||
1304 | } | ||
1305 | |||
1293 | /* | 1306 | /* |
1294 | * Slow path try-lock function: | 1307 | * Slow path try-lock function: |
1295 | */ | 1308 | */ |
@@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) | |||
1312 | */ | 1325 | */ |
1313 | raw_spin_lock_irqsave(&lock->wait_lock, flags); | 1326 | raw_spin_lock_irqsave(&lock->wait_lock, flags); |
1314 | 1327 | ||
1315 | ret = try_to_take_rt_mutex(lock, current, NULL); | 1328 | ret = __rt_mutex_slowtrylock(lock); |
1316 | |||
1317 | /* | ||
1318 | * try_to_take_rt_mutex() sets the lock waiters bit | ||
1319 | * unconditionally. Clean this up. | ||
1320 | */ | ||
1321 | fixup_rt_mutex_waiters(lock); | ||
1322 | 1329 | ||
1323 | raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | 1330 | raw_spin_unlock_irqrestore(&lock->wait_lock, flags); |
1324 | 1331 | ||
@@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) | |||
1505 | return rt_mutex_slowtrylock(lock); | 1512 | return rt_mutex_slowtrylock(lock); |
1506 | } | 1513 | } |
1507 | 1514 | ||
1515 | int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) | ||
1516 | { | ||
1517 | return __rt_mutex_slowtrylock(lock); | ||
1518 | } | ||
1519 | |||
1508 | /** | 1520 | /** |
1509 | * rt_mutex_timed_lock - lock a rt_mutex interruptible | 1521 | * rt_mutex_timed_lock - lock a rt_mutex interruptible |
1510 | * the timeout structure is provided | 1522 | * the timeout structure is provided |
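
Note: the rtmutex hunk above is a refactor: the body of rt_mutex_slowtrylock() moves into __rt_mutex_slowtrylock(), which expects wait_lock to be held by the caller, and the new __rt_mutex_futex_trylock() exposes that variant so a path that already owns wait_lock can attempt the trylock without taking the lock again. Below is a generic sketch of this locked-helper/locking-wrapper split, using a pthread mutex and made-up names in place of the kernel API (build with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
    static int lock_free = 1;

    /* __slowtrylock(): does the real work; the caller must already hold wait_lock. */
    static int __slowtrylock(void)
    {
        int ret = 0;

        if (lock_free) {
            lock_free = 0;
            ret = 1;
        }
        /* the kernel's fixup step (clearing the waiters bit) would sit here */
        return ret;
    }

    /* slowtrylock(): the ordinary entry point; takes and drops wait_lock itself. */
    static int slowtrylock(void)
    {
        int ret;

        pthread_mutex_lock(&wait_lock);
        ret = __slowtrylock();
        pthread_mutex_unlock(&wait_lock);
        return ret;
    }

    int main(void)
    {
        printf("first trylock:  %d\n", slowtrylock());  /* 1: acquired */
        printf("second trylock: %d\n", slowtrylock());  /* 0: already taken */
        return 0;
    }

Keeping the fixup step inside the double-underscore helper means both entry points clean up the waiters bit in the same place.
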
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 124e98ca0b17..68686b3ec3c1 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h | |||
@@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, | |||
148 | struct rt_mutex_waiter *waiter); | 148 | struct rt_mutex_waiter *waiter); |
149 | 149 | ||
150 | extern int rt_mutex_futex_trylock(struct rt_mutex *l); | 150 | extern int rt_mutex_futex_trylock(struct rt_mutex *l); |
151 | extern int __rt_mutex_futex_trylock(struct rt_mutex *l); | ||
151 | 152 | ||
152 | extern void rt_mutex_futex_unlock(struct rt_mutex *lock); | 153 | extern void rt_mutex_futex_unlock(struct rt_mutex *lock); |
153 | extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, | 154 | extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, |
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 1fd1a7543cdd..936f3d14dd6b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c | |||
@@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ | |||
66 | break; \ | 66 | break; \ |
67 | preempt_enable(); \ | 67 | preempt_enable(); \ |
68 | \ | 68 | \ |
69 | if (!(lock)->break_lock) \ | 69 | arch_##op##_relax(&lock->raw_lock); \ |
70 | (lock)->break_lock = 1; \ | ||
71 | while ((lock)->break_lock) \ | ||
72 | arch_##op##_relax(&lock->raw_lock); \ | ||
73 | } \ | 70 | } \ |
74 | (lock)->break_lock = 0; \ | ||
75 | } \ | 71 | } \ |
76 | \ | 72 | \ |
77 | unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ | 73 | unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ |
@@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ | |||
86 | local_irq_restore(flags); \ | 82 | local_irq_restore(flags); \ |
87 | preempt_enable(); \ | 83 | preempt_enable(); \ |
88 | \ | 84 | \ |
89 | if (!(lock)->break_lock) \ | 85 | arch_##op##_relax(&lock->raw_lock); \ |
90 | (lock)->break_lock = 1; \ | ||
91 | while ((lock)->break_lock) \ | ||
92 | arch_##op##_relax(&lock->raw_lock); \ | ||
93 | } \ | 86 | } \ |
94 | (lock)->break_lock = 0; \ | 87 | \ |
95 | return flags; \ | 88 | return flags; \ |
96 | } \ | 89 | } \ |
97 | \ | 90 | \ |
diff --git a/kernel/pid.c b/kernel/pid.c index b13b624e2c49..1e8bb6550ec4 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -193,10 +193,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
193 | } | 193 | } |
194 | 194 | ||
195 | if (unlikely(is_child_reaper(pid))) { | 195 | if (unlikely(is_child_reaper(pid))) { |
196 | if (pid_ns_prepare_proc(ns)) { | 196 | if (pid_ns_prepare_proc(ns)) |
197 | disable_pid_allocation(ns); | ||
198 | goto out_free; | 197 | goto out_free; |
199 | } | ||
200 | } | 198 | } |
201 | 199 | ||
202 | get_pid_ns(ns); | 200 | get_pid_ns(ns); |
@@ -226,6 +224,10 @@ out_free: | |||
226 | while (++i <= ns->level) | 224 | while (++i <= ns->level) |
227 | idr_remove(&ns->idr, (pid->numbers + i)->nr); | 225 | idr_remove(&ns->idr, (pid->numbers + i)->nr); |
228 | 226 | ||
227 | /* On failure to allocate the first pid, reset the state */ | ||
228 | if (ns->pid_allocated == PIDNS_ADDING) | ||
229 | idr_set_cursor(&ns->idr, 0); | ||
230 | |||
229 | spin_unlock_irq(&pidmap_lock); | 231 | spin_unlock_irq(&pidmap_lock); |
230 | 232 | ||
231 | kmem_cache_free(ns->pid_cachep, pid); | 233 | kmem_cache_free(ns->pid_cachep, pid); |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..b9006617710f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl) | |||
3141 | void show_regs_print_info(const char *log_lvl) | 3141 | void show_regs_print_info(const char *log_lvl) |
3142 | { | 3142 | { |
3143 | dump_stack_print_info(log_lvl); | 3143 | dump_stack_print_info(log_lvl); |
3144 | |||
3145 | printk("%stask: %p task.stack: %p\n", | ||
3146 | log_lvl, current, task_stack_page(current)); | ||
3147 | } | 3144 | } |
3148 | 3145 | ||
3149 | #endif | 3146 | #endif |
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
@@ -34,11 +34,6 @@ void complete(struct completion *x) | |||
34 | 34 | ||
35 | spin_lock_irqsave(&x->wait.lock, flags); | 35 | spin_lock_irqsave(&x->wait.lock, flags); |
36 | 36 | ||
37 | /* | ||
38 | * Perform commit of crossrelease here. | ||
39 | */ | ||
40 | complete_release_commit(x); | ||
41 | |||
42 | if (x->done != UINT_MAX) | 37 | if (x->done != UINT_MAX) |
43 | x->done++; | 38 | x->done++; |
44 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); | 39 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..a7bf32aabfda 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2056,7 +2056,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2056 | p->state = TASK_WAKING; | 2056 | p->state = TASK_WAKING; |
2057 | 2057 | ||
2058 | if (p->in_iowait) { | 2058 | if (p->in_iowait) { |
2059 | delayacct_blkio_end(); | 2059 | delayacct_blkio_end(p); |
2060 | atomic_dec(&task_rq(p)->nr_iowait); | 2060 | atomic_dec(&task_rq(p)->nr_iowait); |
2061 | } | 2061 | } |
2062 | 2062 | ||
@@ -2069,7 +2069,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2069 | #else /* CONFIG_SMP */ | 2069 | #else /* CONFIG_SMP */ |
2070 | 2070 | ||
2071 | if (p->in_iowait) { | 2071 | if (p->in_iowait) { |
2072 | delayacct_blkio_end(); | 2072 | delayacct_blkio_end(p); |
2073 | atomic_dec(&task_rq(p)->nr_iowait); | 2073 | atomic_dec(&task_rq(p)->nr_iowait); |
2074 | } | 2074 | } |
2075 | 2075 | ||
@@ -2122,7 +2122,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) | |||
2122 | 2122 | ||
2123 | if (!task_on_rq_queued(p)) { | 2123 | if (!task_on_rq_queued(p)) { |
2124 | if (p->in_iowait) { | 2124 | if (p->in_iowait) { |
2125 | delayacct_blkio_end(); | 2125 | delayacct_blkio_end(p); |
2126 | atomic_dec(&rq->nr_iowait); | 2126 | atomic_dec(&rq->nr_iowait); |
2127 | } | 2127 | } |
2128 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); | 2128 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); |
@@ -5097,17 +5097,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) | |||
5097 | return ret; | 5097 | return ret; |
5098 | } | 5098 | } |
5099 | 5099 | ||
5100 | /** | ||
5101 | * sys_sched_rr_get_interval - return the default timeslice of a process. | ||
5102 | * @pid: pid of the process. | ||
5103 | * @interval: userspace pointer to the timeslice value. | ||
5104 | * | ||
5105 | * this syscall writes the default timeslice value of a given process | ||
5106 | * into the user-space timespec buffer. A value of '0' means infinity. | ||
5107 | * | ||
5108 | * Return: On success, 0 and the timeslice is in @interval. Otherwise, | ||
5109 | * an error code. | ||
5110 | */ | ||
5111 | static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) | 5100 | static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) |
5112 | { | 5101 | { |
5113 | struct task_struct *p; | 5102 | struct task_struct *p; |
@@ -5144,6 +5133,17 @@ out_unlock: | |||
5144 | return retval; | 5133 | return retval; |
5145 | } | 5134 | } |
5146 | 5135 | ||
5136 | /** | ||
5137 | * sys_sched_rr_get_interval - return the default timeslice of a process. | ||
5138 | * @pid: pid of the process. | ||
5139 | * @interval: userspace pointer to the timeslice value. | ||
5140 | * | ||
5141 | * this syscall writes the default timeslice value of a given process | ||
5142 | * into the user-space timespec buffer. A value of '0' means infinity. | ||
5143 | * | ||
5144 | * Return: On success, 0 and the timeslice is in @interval. Otherwise, | ||
5145 | * an error code. | ||
5146 | */ | ||
5147 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | 5147 | SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, |
5148 | struct timespec __user *, interval) | 5148 | struct timespec __user *, interval) |
5149 | { | 5149 | { |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2f52ec0f1539..d6717a3331a1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, | |||
244 | #ifdef CONFIG_NO_HZ_COMMON | 244 | #ifdef CONFIG_NO_HZ_COMMON |
245 | static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) | 245 | static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) |
246 | { | 246 | { |
247 | unsigned long idle_calls = tick_nohz_get_idle_calls(); | 247 | unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); |
248 | bool ret = idle_calls == sg_cpu->saved_idle_calls; | 248 | bool ret = idle_calls == sg_cpu->saved_idle_calls; |
249 | 249 | ||
250 | sg_cpu->saved_idle_calls = idle_calls; | 250 | sg_cpu->saved_idle_calls = idle_calls; |
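
Note: sugov_cpu_is_busy() treats a CPU as busy when its idle-calls counter has not moved since the previous evaluation; the change above only switches it to the counter of the CPU being updated (tick_nohz_get_idle_calls_cpu(sg_cpu->cpu)) instead of the local CPU's. A minimal standalone sketch of that saved-counter heuristic, with illustrative names:

    #include <stdbool.h>
    #include <stdio.h>

    struct cpu_sketch {
        unsigned long idle_calls;        /* bumped every time the CPU enters idle */
        unsigned long saved_idle_calls;  /* snapshot taken at the previous check */
    };

    /* Busy means: not a single idle period since we last looked. */
    static bool cpu_is_busy(struct cpu_sketch *c)
    {
        unsigned long idle_calls = c->idle_calls;  /* per-CPU read in the kernel */
        bool busy = idle_calls == c->saved_idle_calls;

        c->saved_idle_calls = idle_calls;
        return busy;
    }

    int main(void)
    {
        struct cpu_sketch c = { .idle_calls = 10, .saved_idle_calls = 10 };

        printf("busy: %d\n", cpu_is_busy(&c));  /* 1: counter did not move */

        c.idle_calls++;                         /* the CPU went idle once */
        printf("busy: %d\n", cpu_is_busy(&c));  /* 0: counter advanced */

        return 0;
    }
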
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4037e19bbca2..26a71ebcd3c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -3413,9 +3413,9 @@ void set_task_rq_fair(struct sched_entity *se, | |||
3413 | * _IFF_ we look at the pure running and runnable sums. Because they | 3413 | * _IFF_ we look at the pure running and runnable sums. Because they |
3414 | * represent the very same entity, just at different points in the hierarchy. | 3414 | * represent the very same entity, just at different points in the hierarchy. |
3415 | * | 3415 | * |
3416 | * | 3416 | * Per the above update_tg_cfs_util() is trivial and simply copies the running |
3417 | * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and | 3417 | * sum over (but still wrong, because the group entity and group rq do not have |
3418 | * simply copies the running sum over. | 3418 | * their PELT windows aligned). |
3419 | * | 3419 | * |
3420 | * However, update_tg_cfs_runnable() is more complex. So we have: | 3420 | * However, update_tg_cfs_runnable() is more complex. So we have: |
3421 | * | 3421 | * |
@@ -3424,11 +3424,11 @@ void set_task_rq_fair(struct sched_entity *se, | |||
3424 | * And since, like util, the runnable part should be directly transferable, | 3424 | * And since, like util, the runnable part should be directly transferable, |
3425 | * the following would _appear_ to be the straight forward approach: | 3425 | * the following would _appear_ to be the straight forward approach: |
3426 | * | 3426 | * |
3427 | * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) | 3427 | * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) |
3428 | * | 3428 | * |
3429 | * And per (1) we have: | 3429 | * And per (1) we have: |
3430 | * | 3430 | * |
3431 | * ge->avg.running_avg == grq->avg.running_avg | 3431 | * ge->avg.runnable_avg == grq->avg.runnable_avg |
3432 | * | 3432 | * |
3433 | * Which gives: | 3433 | * Which gives: |
3434 | * | 3434 | * |
@@ -3447,27 +3447,28 @@ void set_task_rq_fair(struct sched_entity *se, | |||
3447 | * to (shortly) return to us. This only works by keeping the weights as | 3447 | * to (shortly) return to us. This only works by keeping the weights as |
3448 | * integral part of the sum. We therefore cannot decompose as per (3). | 3448 | * integral part of the sum. We therefore cannot decompose as per (3). |
3449 | * | 3449 | * |
3450 | * OK, so what then? | 3450 | * Another reason this doesn't work is that runnable isn't a 0-sum entity. |
3451 | * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the | ||
3452 | * rq itself is runnable anywhere between 2/3 and 1 depending on how the | ||
3453 | * runnable section of these tasks overlap (or not). If they were to perfectly | ||
3454 | * align, the rq as a whole would be runnable 2/3 of the time. If however we | ||
3455 | * always have at least 1 runnable task, the rq as a whole is always runnable. | ||
3451 | * | 3456 | * |
3457 | * So we'll have to approximate.. :/ | ||
3452 | * | 3458 | * |
3453 | * Another way to look at things is: | 3459 | * Given the constraint: |
3454 | * | 3460 | * |
3455 | * grq->avg.load_avg = \Sum se->avg.load_avg | 3461 | * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX |
3456 | * | 3462 | * |
3457 | * Therefore, per (2): | 3463 | * We can construct a rule that adds runnable to a rq by assuming minimal |
3464 | * overlap. | ||
3458 | * | 3465 | * |
3459 | * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg | 3466 | * On removal, we'll assume each task is equally runnable; which yields: |
3460 | * | 3467 | * |
3461 | * And the very thing we're propagating is a change in that sum (someone | 3468 | * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight |
3462 | * joined/left). So we can easily know the runnable change, which would be, per | ||
3463 | * (2) the already tracked se->load_avg divided by the corresponding | ||
3464 | * se->weight. | ||
3465 | * | 3469 | * |
3466 | * Basically (4) but in differential form: | 3470 | * XXX: only do this for the part of runnable > running ? |
3467 | * | 3471 | * |
3468 | * d(runnable_avg) += se->avg.load_avg / se->load.weight | ||
3469 | * (5) | ||
3470 | * ge->avg.load_avg += ge->load.weight * d(runnable_avg) | ||
3471 | */ | 3472 | */ |
3472 | 3473 | ||
3473 | static inline void | 3474 | static inline void |
@@ -3479,6 +3480,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq | |||
3479 | if (!delta) | 3480 | if (!delta) |
3480 | return; | 3481 | return; |
3481 | 3482 | ||
3483 | /* | ||
3484 | * The relation between sum and avg is: | ||
3485 | * | ||
3486 | * LOAD_AVG_MAX - 1024 + sa->period_contrib | ||
3487 | * | ||
3488 | * however, the PELT windows are not aligned between grq and gse. | ||
3489 | */ | ||
3490 | |||
3482 | /* Set new sched_entity's utilization */ | 3491 | /* Set new sched_entity's utilization */ |
3483 | se->avg.util_avg = gcfs_rq->avg.util_avg; | 3492 | se->avg.util_avg = gcfs_rq->avg.util_avg; |
3484 | se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; | 3493 | se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; |
@@ -3491,33 +3500,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq | |||
3491 | static inline void | 3500 | static inline void |
3492 | update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) | 3501 | update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) |
3493 | { | 3502 | { |
3494 | long runnable_sum = gcfs_rq->prop_runnable_sum; | 3503 | long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; |
3495 | long runnable_load_avg, load_avg; | 3504 | unsigned long runnable_load_avg, load_avg; |
3496 | s64 runnable_load_sum, load_sum; | 3505 | u64 runnable_load_sum, load_sum = 0; |
3506 | s64 delta_sum; | ||
3497 | 3507 | ||
3498 | if (!runnable_sum) | 3508 | if (!runnable_sum) |
3499 | return; | 3509 | return; |
3500 | 3510 | ||
3501 | gcfs_rq->prop_runnable_sum = 0; | 3511 | gcfs_rq->prop_runnable_sum = 0; |
3502 | 3512 | ||
3513 | if (runnable_sum >= 0) { | ||
3514 | /* | ||
3515 | * Add runnable; clip at LOAD_AVG_MAX. Reflects that until | ||
3516 | * the CPU is saturated running == runnable. | ||
3517 | */ | ||
3518 | runnable_sum += se->avg.load_sum; | ||
3519 | runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); | ||
3520 | } else { | ||
3521 | /* | ||
3522 | * Estimate the new unweighted runnable_sum of the gcfs_rq by | ||
3523 | * assuming all tasks are equally runnable. | ||
3524 | */ | ||
3525 | if (scale_load_down(gcfs_rq->load.weight)) { | ||
3526 | load_sum = div_s64(gcfs_rq->avg.load_sum, | ||
3527 | scale_load_down(gcfs_rq->load.weight)); | ||
3528 | } | ||
3529 | |||
3530 | /* But make sure to not inflate se's runnable */ | ||
3531 | runnable_sum = min(se->avg.load_sum, load_sum); | ||
3532 | } | ||
3533 | |||
3534 | /* | ||
3535 | * runnable_sum can't be lower than running_sum | ||
3536 | * As the running sum is scaled with CPU capacity whereas the runnable sum | ||
3537 | * is not, we rescale running_sum first | ||
3538 | */ | ||
3539 | running_sum = se->avg.util_sum / | ||
3540 | arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); | ||
3541 | runnable_sum = max(runnable_sum, running_sum); | ||
3542 | |||
3503 | load_sum = (s64)se_weight(se) * runnable_sum; | 3543 | load_sum = (s64)se_weight(se) * runnable_sum; |
3504 | load_avg = div_s64(load_sum, LOAD_AVG_MAX); | 3544 | load_avg = div_s64(load_sum, LOAD_AVG_MAX); |
3505 | 3545 | ||
3506 | add_positive(&se->avg.load_sum, runnable_sum); | 3546 | delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; |
3507 | add_positive(&se->avg.load_avg, load_avg); | 3547 | delta_avg = load_avg - se->avg.load_avg; |
3508 | 3548 | ||
3509 | add_positive(&cfs_rq->avg.load_avg, load_avg); | 3549 | se->avg.load_sum = runnable_sum; |
3510 | add_positive(&cfs_rq->avg.load_sum, load_sum); | 3550 | se->avg.load_avg = load_avg; |
3551 | add_positive(&cfs_rq->avg.load_avg, delta_avg); | ||
3552 | add_positive(&cfs_rq->avg.load_sum, delta_sum); | ||
3511 | 3553 | ||
3512 | runnable_load_sum = (s64)se_runnable(se) * runnable_sum; | 3554 | runnable_load_sum = (s64)se_runnable(se) * runnable_sum; |
3513 | runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); | 3555 | runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); |
3556 | delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; | ||
3557 | delta_avg = runnable_load_avg - se->avg.runnable_load_avg; | ||
3514 | 3558 | ||
3515 | add_positive(&se->avg.runnable_load_sum, runnable_sum); | 3559 | se->avg.runnable_load_sum = runnable_sum; |
3516 | add_positive(&se->avg.runnable_load_avg, runnable_load_avg); | 3560 | se->avg.runnable_load_avg = runnable_load_avg; |
3517 | 3561 | ||
3518 | if (se->on_rq) { | 3562 | if (se->on_rq) { |
3519 | add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); | 3563 | add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); |
3520 | add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); | 3564 | add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); |
3521 | } | 3565 | } |
3522 | } | 3566 | } |
3523 | 3567 | ||
@@ -4321,12 +4365,12 @@ static inline bool cfs_bandwidth_used(void) | |||
4321 | 4365 | ||
4322 | void cfs_bandwidth_usage_inc(void) | 4366 | void cfs_bandwidth_usage_inc(void) |
4323 | { | 4367 | { |
4324 | static_key_slow_inc(&__cfs_bandwidth_used); | 4368 | static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); |
4325 | } | 4369 | } |
4326 | 4370 | ||
4327 | void cfs_bandwidth_usage_dec(void) | 4371 | void cfs_bandwidth_usage_dec(void) |
4328 | { | 4372 | { |
4329 | static_key_slow_dec(&__cfs_bandwidth_used); | 4373 | static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); |
4330 | } | 4374 | } |
4331 | #else /* HAVE_JUMP_LABEL */ | 4375 | #else /* HAVE_JUMP_LABEL */ |
4332 | static bool cfs_bandwidth_used(void) | 4376 | static bool cfs_bandwidth_used(void) |
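
Note: the rewritten update_tg_cfs_runnable() above replaces the old delta accumulation with an explicit recomputation that follows the rules in the new comments: when runnable is added, running == runnable until the CPU saturates, so the sum is simply added and clipped at LOAD_AVG_MAX; when runnable is removed, the remaining unweighted sum is estimated as load_sum / load.weight under the equal-runnability assumption; and the result is never allowed to fall below the capacity-rescaled running sum. A simplified standalone sketch of just that clamping arithmetic, with stand-in values and a flattened signature:

    #include <stdio.h>

    #define LOAD_AVG_MAX 47742L  /* PELT maximum sum, as used by the kernel */

    static long min_l(long a, long b) { return a < b ? a : b; }
    static long max_l(long a, long b) { return a > b ? a : b; }

    /*
     * delta        : propagated runnable change from the group rq (may be negative)
     * se_load_sum  : the group entity's current unweighted runnable sum
     * grq_load_sum : the group rq's weighted load sum
     * grq_weight   : the group rq's load weight
     * running_sum  : the group entity's running sum, already rescaled by CPU capacity
     */
    static long new_runnable_sum(long delta, long se_load_sum,
                                 long grq_load_sum, long grq_weight,
                                 long running_sum)
    {
        long runnable_sum;

        if (delta >= 0) {
            /* Adding runnable: until the CPU saturates running == runnable,
             * so add the contribution and clip at the PELT maximum. */
            runnable_sum = min_l(se_load_sum + delta, LOAD_AVG_MAX);
        } else {
            /* Removing runnable: assume every task was equally runnable and
             * estimate the remaining unweighted sum from the weighted one,
             * without inflating what the entity already had. */
            long estimate = grq_weight ? grq_load_sum / grq_weight : 0;

            runnable_sum = min_l(se_load_sum, estimate);
        }

        /* runnable_sum can never be lower than running_sum. */
        return max_l(runnable_sum, running_sum);
    }

    int main(void)
    {
        /* A task worth 10000 of runnable sum joins an entity sitting at 40000. */
        printf("%ld\n", new_runnable_sum(10000, 40000, 0, 0, 30000));            /* 47742, clipped */

        /* Runnable leaves: the rq estimate is 20480000 / 1024 = 20000. */
        printf("%ld\n", new_runnable_sum(-10000, 30000, 20480000, 1024, 15000)); /* 20000 */

        return 0;
    }

The deltas then propagated to the cfs_rq (delta_sum/delta_avg in the patch) are computed against the entity's previous sums, so the parent only sees the net change.
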
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..9bcbacba82a8 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
@@ -89,7 +89,9 @@ static int membarrier_private_expedited(void) | |||
89 | rcu_read_unlock(); | 89 | rcu_read_unlock(); |
90 | } | 90 | } |
91 | if (!fallback) { | 91 | if (!fallback) { |
92 | preempt_disable(); | ||
92 | smp_call_function_many(tmpmask, ipi_mb, NULL, 1); | 93 | smp_call_function_many(tmpmask, ipi_mb, NULL, 1); |
94 | preempt_enable(); | ||
93 | free_cpumask_var(tmpmask); | 95 | free_cpumask_var(tmpmask); |
94 | } | 96 | } |
95 | cpus_read_unlock(); | 97 | cpus_read_unlock(); |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..665ace2fc558 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -2034,8 +2034,9 @@ static void pull_rt_task(struct rq *this_rq) | |||
2034 | bool resched = false; | 2034 | bool resched = false; |
2035 | struct task_struct *p; | 2035 | struct task_struct *p; |
2036 | struct rq *src_rq; | 2036 | struct rq *src_rq; |
2037 | int rt_overload_count = rt_overloaded(this_rq); | ||
2037 | 2038 | ||
2038 | if (likely(!rt_overloaded(this_rq))) | 2039 | if (likely(!rt_overload_count)) |
2039 | return; | 2040 | return; |
2040 | 2041 | ||
2041 | /* | 2042 | /* |
@@ -2044,6 +2045,11 @@ static void pull_rt_task(struct rq *this_rq) | |||
2044 | */ | 2045 | */ |
2045 | smp_rmb(); | 2046 | smp_rmb(); |
2046 | 2047 | ||
2048 | /* If we are the only overloaded CPU do nothing */ | ||
2049 | if (rt_overload_count == 1 && | ||
2050 | cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) | ||
2051 | return; | ||
2052 | |||
2047 | #ifdef HAVE_RT_PUSH_IPI | 2053 | #ifdef HAVE_RT_PUSH_IPI |
2048 | if (sched_feat(RT_PUSH_IPI)) { | 2054 | if (sched_feat(RT_PUSH_IPI)) { |
2049 | tell_cpu_to_push(this_rq); | 2055 | tell_cpu_to_push(this_rq); |
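
Note: pull_rt_task() now samples the overload count once and returns early when that count is 1 and the only overloaded CPU is this runqueue's own, since there is nothing to pull from ourselves. A standalone sketch of the early-exit logic, with a plain bitmask standing in for rd->rto_mask and illustrative field names:

    #include <stdbool.h>
    #include <stdio.h>

    struct rq_sketch {
        int cpu;                 /* this runqueue's CPU number */
        unsigned long rto_mask;  /* one bit per RT-overloaded CPU (stand-in for rd->rto_mask) */
        int rt_overload_count;   /* number of bits set in rto_mask */
    };

    static bool worth_pulling(const struct rq_sketch *rq)
    {
        int rt_overload_count = rq->rt_overload_count;  /* read once, as in the patch */

        if (!rt_overload_count)
            return false;                               /* nobody is overloaded */

        /* If we are the only overloaded CPU there is nothing to pull. */
        if (rt_overload_count == 1 && (rq->rto_mask & (1UL << rq->cpu)))
            return false;

        return true;
    }

    int main(void)
    {
        struct rq_sketch rq = { .cpu = 2, .rto_mask = 1UL << 2, .rt_overload_count = 1 };

        printf("pull? %d\n", worth_pulling(&rq));  /* 0: only we are overloaded */

        rq.rto_mask |= 1UL << 5;                   /* another CPU becomes overloaded */
        rq.rt_overload_count = 2;
        printf("pull? %d\n", worth_pulling(&rq));  /* 1: there may be work to pull */

        return 0;
    }
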
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 98feab7933c7..929ecb7d6b78 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq | |||
27 | 27 | ||
28 | wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; | 28 | wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; |
29 | spin_lock_irqsave(&wq_head->lock, flags); | 29 | spin_lock_irqsave(&wq_head->lock, flags); |
30 | __add_wait_queue_entry_tail(wq_head, wq_entry); | 30 | __add_wait_queue(wq_head, wq_entry); |
31 | spin_unlock_irqrestore(&wq_head->lock, flags); | 31 | spin_unlock_irqrestore(&wq_head->lock, flags); |
32 | } | 32 | } |
33 | EXPORT_SYMBOL(add_wait_queue); | 33 | EXPORT_SYMBOL(add_wait_queue); |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e776fc8cc1df..f6b5f19223d6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -95,6 +95,7 @@ config NO_HZ_FULL | |||
95 | select RCU_NOCB_CPU | 95 | select RCU_NOCB_CPU |
96 | select VIRT_CPU_ACCOUNTING_GEN | 96 | select VIRT_CPU_ACCOUNTING_GEN |
97 | select IRQ_WORK | 97 | select IRQ_WORK |
98 | select CPU_ISOLATION | ||
98 | help | 99 | help |
99 | Adaptively try to shutdown the tick whenever possible, even when | 100 | Adaptively try to shutdown the tick whenever possible, even when |
100 | the CPU is running tasks. Typically this requires running a single | 101 | the CPU is running tasks. Typically this requires running a single |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d32520840fde..aa9d2a2b1210 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -655,7 +655,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, | |||
655 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | 655 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) |
656 | { | 656 | { |
657 | base->expires_next = KTIME_MAX; | 657 | base->expires_next = KTIME_MAX; |
658 | base->hang_detected = 0; | ||
658 | base->hres_active = 0; | 659 | base->hres_active = 0; |
660 | base->next_timer = NULL; | ||
659 | } | 661 | } |
660 | 662 | ||
661 | /* | 663 | /* |
@@ -1589,6 +1591,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) | |||
1589 | timerqueue_init_head(&cpu_base->clock_base[i].active); | 1591 | timerqueue_init_head(&cpu_base->clock_base[i].active); |
1590 | } | 1592 | } |
1591 | 1593 | ||
1594 | cpu_base->active_bases = 0; | ||
1592 | cpu_base->cpu = cpu; | 1595 | cpu_base->cpu = cpu; |
1593 | hrtimer_init_hres(cpu_base); | 1596 | hrtimer_init_hres(cpu_base); |
1594 | return 0; | 1597 | return 0; |
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881f908b..ec999f32c840 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
@@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) | |||
434 | { | 434 | { |
435 | struct task_struct *rtn = current->group_leader; | 435 | struct task_struct *rtn = current->group_leader; |
436 | 436 | ||
437 | if ((event->sigev_notify & SIGEV_THREAD_ID ) && | 437 | switch (event->sigev_notify) { |
438 | (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || | 438 | case SIGEV_SIGNAL | SIGEV_THREAD_ID: |
439 | !same_thread_group(rtn, current) || | 439 | rtn = find_task_by_vpid(event->sigev_notify_thread_id); |
440 | (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) | 440 | if (!rtn || !same_thread_group(rtn, current)) |
441 | return NULL; | ||
442 | /* FALLTHRU */ | ||
443 | case SIGEV_SIGNAL: | ||
444 | case SIGEV_THREAD: | ||
445 | if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) | ||
446 | return NULL; | ||
447 | /* FALLTHRU */ | ||
448 | case SIGEV_NONE: | ||
449 | return task_pid(rtn); | ||
450 | default: | ||
441 | return NULL; | 451 | return NULL; |
442 | 452 | } | |
443 | if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && | ||
444 | ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) | ||
445 | return NULL; | ||
446 | |||
447 | return task_pid(rtn); | ||
448 | } | 453 | } |
449 | 454 | ||
450 | static struct k_itimer * alloc_posix_timer(void) | 455 | static struct k_itimer * alloc_posix_timer(void) |
@@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) | |||
669 | struct timespec64 ts64; | 674 | struct timespec64 ts64; |
670 | bool sig_none; | 675 | bool sig_none; |
671 | 676 | ||
672 | sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; | 677 | sig_none = timr->it_sigev_notify == SIGEV_NONE; |
673 | iv = timr->it_interval; | 678 | iv = timr->it_interval; |
674 | 679 | ||
675 | /* interval timer ? */ | 680 | /* interval timer ? */ |
@@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, | |||
856 | 861 | ||
857 | timr->it_interval = timespec64_to_ktime(new_setting->it_interval); | 862 | timr->it_interval = timespec64_to_ktime(new_setting->it_interval); |
858 | expires = timespec64_to_ktime(new_setting->it_value); | 863 | expires = timespec64_to_ktime(new_setting->it_value); |
859 | sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; | 864 | sigev_none = timr->it_sigev_notify == SIGEV_NONE; |
860 | 865 | ||
861 | kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); | 866 | kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); |
862 | timr->it_active = !sigev_none; | 867 | timr->it_active = !sigev_none; |
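
Note: the good_sigevent() rewrite makes the accepted notification modes explicit: SIGEV_SIGNAL | SIGEV_THREAD_ID must name a thread in the caller's thread group, SIGEV_SIGNAL and SIGEV_THREAD additionally need a valid signal number, SIGEV_NONE needs neither, and anything else is rejected. For reference, a small userspace example of the simplest accepted mode, a SIGEV_NONE POSIX timer that is only read back with timer_gettime() (build with -lrt; the timeout value is arbitrary):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    int main(void)
    {
        struct sigevent sev;
        struct itimerspec its;
        timer_t timerid;

        memset(&sev, 0, sizeof(sev));
        memset(&its, 0, sizeof(its));

        sev.sigev_notify = SIGEV_NONE;  /* also valid: SIGEV_SIGNAL, SIGEV_THREAD,
                                           SIGEV_SIGNAL | SIGEV_THREAD_ID */

        if (timer_create(CLOCK_MONOTONIC, &sev, &timerid) == -1) {
            perror("timer_create");
            return 1;
        }

        its.it_value.tv_sec = 5;        /* one-shot timer, 5 seconds from now */
        if (timer_settime(timerid, 0, &its, NULL) == -1) {
            perror("timer_settime");
            return 1;
        }

        if (timer_gettime(timerid, &its) == 0)
            printf("time remaining: %ld.%09ld s\n",
                   (long)its.it_value.tv_sec, its.it_value.tv_nsec);

        timer_delete(timerid);
        return 0;
    }
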
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99578f06c8d4..f7cc7abfcf25 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
650 | ts->next_tick = 0; | 650 | ts->next_tick = 0; |
651 | } | 651 | } |
652 | 652 | ||
653 | static inline bool local_timer_softirq_pending(void) | ||
654 | { | ||
655 | return local_softirq_pending() & TIMER_SOFTIRQ; | ||
656 | } | ||
657 | |||
653 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | 658 | static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, |
654 | ktime_t now, int cpu) | 659 | ktime_t now, int cpu) |
655 | { | 660 | { |
@@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
666 | } while (read_seqretry(&jiffies_lock, seq)); | 671 | } while (read_seqretry(&jiffies_lock, seq)); |
667 | ts->last_jiffies = basejiff; | 672 | ts->last_jiffies = basejiff; |
668 | 673 | ||
669 | if (rcu_needs_cpu(basemono, &next_rcu) || | 674 | /* |
670 | arch_needs_cpu() || irq_work_needs_cpu()) { | 675 | * Keep the periodic tick, when RCU, architecture or irq_work |
676 | * requests it. | ||
677 | * Aside of that check whether the local timer softirq is | ||
678 | * pending. If so it's a bad idea to call get_next_timer_interrupt() | ||
679 | * because there is an already expired timer, so it will request | ||
680 | * immediate expiry, which rearms the hardware timer with a | ||
681 | * minimal delta which brings us back to this place | ||
682 | * immediately. Lather, rinse and repeat... | ||
683 | */ | ||
684 | if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || | ||
685 | irq_work_needs_cpu() || local_timer_softirq_pending()) { | ||
671 | next_tick = basemono + TICK_NSEC; | 686 | next_tick = basemono + TICK_NSEC; |
672 | } else { | 687 | } else { |
673 | /* | 688 | /* |
@@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void) | |||
986 | } | 1001 | } |
987 | 1002 | ||
988 | /** | 1003 | /** |
1004 | * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value | ||
1005 | * for a particular CPU. | ||
1006 | * | ||
1007 | * Called from the schedutil frequency scaling governor in scheduler context. | ||
1008 | */ | ||
1009 | unsigned long tick_nohz_get_idle_calls_cpu(int cpu) | ||
1010 | { | ||
1011 | struct tick_sched *ts = tick_get_tick_sched(cpu); | ||
1012 | |||
1013 | return ts->idle_calls; | ||
1014 | } | ||
1015 | |||
1016 | /** | ||
989 | * tick_nohz_get_idle_calls - return the current idle calls counter value | 1017 | * tick_nohz_get_idle_calls - return the current idle calls counter value |
990 | * | 1018 | * |
991 | * Called from the schedutil frequency scaling governor in scheduler context. | 1019 | * Called from the schedutil frequency scaling governor in scheduler context. |
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ffebcf878fba..0bcf00e3ce48 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -823,11 +823,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) | |||
823 | struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); | 823 | struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); |
824 | 824 | ||
825 | /* | 825 | /* |
826 | * If the timer is deferrable and nohz is active then we need to use | 826 | * If the timer is deferrable and NO_HZ_COMMON is set then we need |
827 | * the deferrable base. | 827 | * to use the deferrable base. |
828 | */ | 828 | */ |
829 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && | 829 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) |
830 | (tflags & TIMER_DEFERRABLE)) | ||
831 | base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); | 830 | base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); |
832 | return base; | 831 | return base; |
833 | } | 832 | } |
@@ -837,11 +836,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) | |||
837 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); | 836 | struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); |
838 | 837 | ||
839 | /* | 838 | /* |
840 | * If the timer is deferrable and nohz is active then we need to use | 839 | * If the timer is deferrable and NO_HZ_COMMON is set then we need |
841 | * the deferrable base. | 840 | * to use the deferrable base. |
842 | */ | 841 | */ |
843 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && | 842 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) |
844 | (tflags & TIMER_DEFERRABLE)) | ||
845 | base = this_cpu_ptr(&timer_bases[BASE_DEF]); | 843 | base = this_cpu_ptr(&timer_bases[BASE_DEF]); |
846 | return base; | 844 | return base; |
847 | } | 845 | } |
@@ -1009,8 +1007,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option | |||
1009 | if (!ret && (options & MOD_TIMER_PENDING_ONLY)) | 1007 | if (!ret && (options & MOD_TIMER_PENDING_ONLY)) |
1010 | goto out_unlock; | 1008 | goto out_unlock; |
1011 | 1009 | ||
1012 | debug_activate(timer, expires); | ||
1013 | |||
1014 | new_base = get_target_base(base, timer->flags); | 1010 | new_base = get_target_base(base, timer->flags); |
1015 | 1011 | ||
1016 | if (base != new_base) { | 1012 | if (base != new_base) { |
@@ -1034,6 +1030,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option | |||
1034 | } | 1030 | } |
1035 | } | 1031 | } |
1036 | 1032 | ||
1033 | debug_activate(timer, expires); | ||
1034 | |||
1037 | timer->expires = expires; | 1035 | timer->expires = expires; |
1038 | /* | 1036 | /* |
1039 | * If 'idx' was calculated above and the base time did not advance | 1037 | * If 'idx' was calculated above and the base time did not advance |
@@ -1684,7 +1682,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) | |||
1684 | base->must_forward_clk = false; | 1682 | base->must_forward_clk = false; |
1685 | 1683 | ||
1686 | __run_timers(base); | 1684 | __run_timers(base); |
1687 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) | 1685 | if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) |
1688 | __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); | 1686 | __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); |
1689 | } | 1687 | } |
1690 | 1688 | ||
@@ -1698,7 +1696,7 @@ void run_local_timers(void) | |||
1698 | hrtimer_run_queues(); | 1696 | hrtimer_run_queues(); |
1699 | /* Raise the softirq only if required. */ | 1697 | /* Raise the softirq only if required. */ |
1700 | if (time_before(jiffies, base->clk)) { | 1698 | if (time_before(jiffies, base->clk)) { |
1701 | if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) | 1699 | if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) |
1702 | return; | 1700 | return; |
1703 | /* CPU is awake, so check the deferrable base. */ | 1701 | /* CPU is awake, so check the deferrable base. */ |
1704 | base++; | 1702 | base++; |
@@ -1855,6 +1853,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h | |||
1855 | } | 1853 | } |
1856 | } | 1854 | } |
1857 | 1855 | ||
1856 | int timers_prepare_cpu(unsigned int cpu) | ||
1857 | { | ||
1858 | struct timer_base *base; | ||
1859 | int b; | ||
1860 | |||
1861 | for (b = 0; b < NR_BASES; b++) { | ||
1862 | base = per_cpu_ptr(&timer_bases[b], cpu); | ||
1863 | base->clk = jiffies; | ||
1864 | base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; | ||
1865 | base->is_idle = false; | ||
1866 | base->must_forward_clk = true; | ||
1867 | } | ||
1868 | return 0; | ||
1869 | } | ||
1870 | |||
1858 | int timers_dead_cpu(unsigned int cpu) | 1871 | int timers_dead_cpu(unsigned int cpu) |
1859 | { | 1872 | { |
1860 | struct timer_base *old_base; | 1873 | struct timer_base *old_base; |
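
timers_prepare_cpu() resets both timer bases before a CPU is brought up, pairing with timers_dead_cpu() on the way down. How it is hooked into the CPU hotplug machinery is not visible in this hunk; a heavily hedged sketch using the generic API (the dynamic state slot and function name are chosen only to keep the example self-contained, the timer core really uses a fixed slot in the static hotplug table):

    #include <linux/cpuhotplug.h>

    static int __init example_timer_hotplug_init(void)
    {
            int ret;

            /* startup runs before the CPU comes up, teardown after it is dead */
            ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "example/timers:prepare",
                                    timers_prepare_cpu, timers_dead_cpu);
            return ret < 0 ? ret : 0;
    }
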
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..f54dc62b599c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS | |||
164 | bool "Enable trace events for preempt and irq disable/enable" | 164 | bool "Enable trace events for preempt and irq disable/enable" |
165 | select TRACE_IRQFLAGS | 165 | select TRACE_IRQFLAGS |
166 | depends on DEBUG_PREEMPT || !PROVE_LOCKING | 166 | depends on DEBUG_PREEMPT || !PROVE_LOCKING |
167 | depends on TRACING | ||
167 | default n | 168 | default n |
168 | help | 169 | help |
169 | Enable tracing of disable and enable events for preemption and irqs. | 170 | Enable tracing of disable and enable events for preemption and irqs. |
@@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES | |||
354 | on if you need to profile the system's use of these macros. | 355 | on if you need to profile the system's use of these macros. |
355 | 356 | ||
356 | config PROFILE_ALL_BRANCHES | 357 | config PROFILE_ALL_BRANCHES |
357 | bool "Profile all if conditionals" | 358 | bool "Profile all if conditionals" if !FORTIFY_SOURCE |
358 | select TRACE_BRANCH_PROFILING | 359 | select TRACE_BRANCH_PROFILING |
359 | help | 360 | help |
360 | This tracer profiles all branch conditions. Every if () | 361 | This tracer profiles all branch conditions. Every if () |
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 27d1f4ffa3de..40207c2a4113 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { | |||
343 | .arg4_type = ARG_CONST_SIZE, | 343 | .arg4_type = ARG_CONST_SIZE, |
344 | }; | 344 | }; |
345 | 345 | ||
346 | static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); | 346 | static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); |
347 | 347 | ||
348 | static __always_inline u64 | 348 | static __always_inline u64 |
349 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | 349 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, |
350 | u64 flags, struct perf_raw_record *raw) | 350 | u64 flags, struct perf_sample_data *sd) |
351 | { | 351 | { |
352 | struct bpf_array *array = container_of(map, struct bpf_array, map); | 352 | struct bpf_array *array = container_of(map, struct bpf_array, map); |
353 | struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); | ||
354 | unsigned int cpu = smp_processor_id(); | 353 | unsigned int cpu = smp_processor_id(); |
355 | u64 index = flags & BPF_F_INDEX_MASK; | 354 | u64 index = flags & BPF_F_INDEX_MASK; |
356 | struct bpf_event_entry *ee; | 355 | struct bpf_event_entry *ee; |
@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
373 | if (unlikely(event->oncpu != cpu)) | 372 | if (unlikely(event->oncpu != cpu)) |
374 | return -EOPNOTSUPP; | 373 | return -EOPNOTSUPP; |
375 | 374 | ||
376 | perf_sample_data_init(sd, 0, 0); | ||
377 | sd->raw = raw; | ||
378 | perf_event_output(event, sd, regs); | 375 | perf_event_output(event, sd, regs); |
379 | return 0; | 376 | return 0; |
380 | } | 377 | } |
@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
382 | BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, | 379 | BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, |
383 | u64, flags, void *, data, u64, size) | 380 | u64, flags, void *, data, u64, size) |
384 | { | 381 | { |
382 | struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); | ||
385 | struct perf_raw_record raw = { | 383 | struct perf_raw_record raw = { |
386 | .frag = { | 384 | .frag = { |
387 | .size = size, | 385 | .size = size, |
@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, | |||
392 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | 390 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) |
393 | return -EINVAL; | 391 | return -EINVAL; |
394 | 392 | ||
395 | return __bpf_perf_event_output(regs, map, flags, &raw); | 393 | perf_sample_data_init(sd, 0, 0); |
394 | sd->raw = &raw; | ||
395 | |||
396 | return __bpf_perf_event_output(regs, map, flags, sd); | ||
396 | } | 397 | } |
397 | 398 | ||
398 | static const struct bpf_func_proto bpf_perf_event_output_proto = { | 399 | static const struct bpf_func_proto bpf_perf_event_output_proto = { |
@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { | |||
407 | }; | 408 | }; |
408 | 409 | ||
409 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); | 410 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); |
411 | static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); | ||
410 | 412 | ||
411 | u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, | 413 | u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, |
412 | void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) | 414 | void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) |
413 | { | 415 | { |
416 | struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); | ||
414 | struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); | 417 | struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); |
415 | struct perf_raw_frag frag = { | 418 | struct perf_raw_frag frag = { |
416 | .copy = ctx_copy, | 419 | .copy = ctx_copy, |
@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, | |||
428 | }; | 431 | }; |
429 | 432 | ||
430 | perf_fetch_caller_regs(regs); | 433 | perf_fetch_caller_regs(regs); |
434 | perf_sample_data_init(sd, 0, 0); | ||
435 | sd->raw = &raw; | ||
431 | 436 | ||
432 | return __bpf_perf_event_output(regs, map, flags, &raw); | 437 | return __bpf_perf_event_output(regs, map, flags, sd); |
433 | } | 438 | } |
434 | 439 | ||
435 | BPF_CALL_0(bpf_get_current_task) | 440 | BPF_CALL_0(bpf_get_current_task) |
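
The change above gives each entry point its own per-CPU perf_sample_data (bpf_trace_sd for the tracepoint helper, bpf_misc_sd for bpf_event_output()) and initializes it in the caller, so a call on one path cannot stomp on sample data still in use by the other. A generic, hedged sketch of that per-CPU scratch pattern (struct and function names invented):

    #include <linux/percpu.h>
    #include <linux/types.h>

    struct scratch { u64 words[4]; };

    static DEFINE_PER_CPU(struct scratch, site_a_scratch);
    static DEFINE_PER_CPU(struct scratch, site_b_scratch);

    static void site_a(void)
    {
            /* valid only while preemption is disabled on this CPU */
            struct scratch *s = this_cpu_ptr(&site_a_scratch);

            s->words[0] = 0;        /* fill and consume before returning */
    }
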
@@ -759,6 +764,8 @@ const struct bpf_prog_ops perf_event_prog_ops = { | |||
759 | 764 | ||
760 | static DEFINE_MUTEX(bpf_event_mutex); | 765 | static DEFINE_MUTEX(bpf_event_mutex); |
761 | 766 | ||
767 | #define BPF_TRACE_MAX_PROGS 64 | ||
768 | |||
762 | int perf_event_attach_bpf_prog(struct perf_event *event, | 769 | int perf_event_attach_bpf_prog(struct perf_event *event, |
763 | struct bpf_prog *prog) | 770 | struct bpf_prog *prog) |
764 | { | 771 | { |
@@ -772,6 +779,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event, | |||
772 | goto unlock; | 779 | goto unlock; |
773 | 780 | ||
774 | old_array = event->tp_event->prog_array; | 781 | old_array = event->tp_event->prog_array; |
782 | if (old_array && | ||
783 | bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { | ||
784 | ret = -E2BIG; | ||
785 | goto unlock; | ||
786 | } | ||
787 | |||
775 | ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); | 788 | ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); |
776 | if (ret < 0) | 789 | if (ret < 0) |
777 | goto unlock; | 790 | goto unlock; |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9..554b517c61a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { | |||
1119 | }; | 1119 | }; |
1120 | 1120 | ||
1121 | /* | 1121 | /* |
1122 | * This is used by __kernel_text_address() to return true if the | 1122 | * Used by the stack unwinder to know about dynamic ftrace trampolines. |
1123 | * address is on a dynamically allocated trampoline that would | ||
1124 | * not return true for either core_kernel_text() or | ||
1125 | * is_module_text_address(). | ||
1126 | */ | 1123 | */ |
1127 | bool is_ftrace_trampoline(unsigned long addr) | 1124 | struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) |
1128 | { | 1125 | { |
1129 | struct ftrace_ops *op; | 1126 | struct ftrace_ops *op = NULL; |
1130 | bool ret = false; | ||
1131 | 1127 | ||
1132 | /* | 1128 | /* |
1133 | * Some of the ops may be dynamically allocated, | 1129 | * Some of the ops may be dynamically allocated, |
@@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) | |||
1144 | if (op->trampoline && op->trampoline_size) | 1140 | if (op->trampoline && op->trampoline_size) |
1145 | if (addr >= op->trampoline && | 1141 | if (addr >= op->trampoline && |
1146 | addr < op->trampoline + op->trampoline_size) { | 1142 | addr < op->trampoline + op->trampoline_size) { |
1147 | ret = true; | 1143 | preempt_enable_notrace(); |
1148 | goto out; | 1144 | return op; |
1149 | } | 1145 | } |
1150 | } while_for_each_ftrace_op(op); | 1146 | } while_for_each_ftrace_op(op); |
1151 | |||
1152 | out: | ||
1153 | preempt_enable_notrace(); | 1147 | preempt_enable_notrace(); |
1154 | 1148 | ||
1155 | return ret; | 1149 | return NULL; |
1150 | } | ||
1151 | |||
1152 | /* | ||
1153 | * This is used by __kernel_text_address() to return true if the | ||
1154 | * address is on a dynamically allocated trampoline that would | ||
1155 | * not return true for either core_kernel_text() or | ||
1156 | * is_module_text_address(). | ||
1157 | */ | ||
1158 | bool is_ftrace_trampoline(unsigned long addr) | ||
1159 | { | ||
1160 | return ftrace_ops_trampoline(addr) != NULL; | ||
1156 | } | 1161 | } |
1157 | 1162 | ||
1158 | struct ftrace_page { | 1163 | struct ftrace_page { |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..5af2842dea96 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); | |||
280 | /* Missed count stored at end */ | 280 | /* Missed count stored at end */ |
281 | #define RB_MISSED_STORED (1 << 30) | 281 | #define RB_MISSED_STORED (1 << 30) |
282 | 282 | ||
283 | #define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) | ||
284 | |||
283 | struct buffer_data_page { | 285 | struct buffer_data_page { |
284 | u64 time_stamp; /* page time stamp */ | 286 | u64 time_stamp; /* page time stamp */ |
285 | local_t commit; /* write committed index */ | 287 | local_t commit; /* write committed index */ |
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage) | |||
331 | */ | 333 | */ |
332 | size_t ring_buffer_page_len(void *page) | 334 | size_t ring_buffer_page_len(void *page) |
333 | { | 335 | { |
334 | return local_read(&((struct buffer_data_page *)page)->commit) | 336 | struct buffer_data_page *bpage = page; |
337 | |||
338 | return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) | ||
335 | + BUF_PAGE_HDR_SIZE; | 339 | + BUF_PAGE_HDR_SIZE; |
336 | } | 340 | } |
337 | 341 | ||
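
The new masking matters because the commit word of a buffer_data_page also carries the RB_MISSED_* flag bits in its two uppermost bits, so reading it raw can overstate how much data is on the page. Stripping flag bits packed into a length word, as a tiny standalone illustration (userspace C, values invented):

    #include <assert.h>

    #define MISSED_EVENTS   (1u << 31)
    #define MISSED_STORED   (1u << 30)
    #define MISSED_FLAGS    (MISSED_EVENTS | MISSED_STORED)

    int main(void)
    {
            unsigned int commit = 4000u | MISSED_EVENTS;    /* 4000 bytes + a flag */

            assert((commit & ~MISSED_FLAGS) == 4000u);      /* flags stripped */
            return 0;
    }
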
@@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) | |||
1799 | } | 1803 | } |
1800 | EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); | 1804 | EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); |
1801 | 1805 | ||
1802 | static __always_inline void * | ||
1803 | __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) | ||
1804 | { | ||
1805 | return bpage->data + index; | ||
1806 | } | ||
1807 | |||
1808 | static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) | 1806 | static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) |
1809 | { | 1807 | { |
1810 | return bpage->page->data + index; | 1808 | return bpage->page->data + index; |
@@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) | |||
2536 | * The lock and unlock are done within a preempt disable section. | 2534 | * The lock and unlock are done within a preempt disable section. |
2537 | * The current_context per_cpu variable can only be modified | 2535 | * The current_context per_cpu variable can only be modified |
2538 | * by the current task between lock and unlock. But it can | 2536 | * by the current task between lock and unlock. But it can |
2539 | * be modified more than once via an interrupt. There are four | 2537 | * be modified more than once via an interrupt. To pass this |
2540 | * different contexts that we need to consider. | 2538 | * information from the lock to the unlock without having to |
2539 | * access the 'in_interrupt()' functions again (which do show | ||
2540 | * a bit of overhead in something as critical as function tracing), | ||
2541 | * we use a bitmask trick. | ||
2542 | * | ||
2543 | * bit 0 = NMI context | ||
2544 | * bit 1 = IRQ context | ||
2545 | * bit 2 = SoftIRQ context | ||
2546 | * bit 3 = normal context. | ||
2547 | * | ||
2548 | * This works because this is the order of contexts that can | ||
2549 | * preempt other contexts. A SoftIRQ never preempts an IRQ | ||
2550 | * context. | ||
2551 | * | ||
2552 | * When the context is determined, the corresponding bit is | ||
2553 | * checked and set (if it was set, then a recursion of that context | ||
2554 | * happened). | ||
2555 | * | ||
2556 | * On unlock, we need to clear this bit. To do so, just subtract | ||
2557 | * 1 from the current_context and AND it to itself. | ||
2541 | * | 2558 | * |
2542 | * Normal context. | 2559 | * (binary) |
2543 | * SoftIRQ context | 2560 | * 101 - 1 = 100 |
2544 | * IRQ context | 2561 | * 101 & 100 = 100 (clearing bit zero) |
2545 | * NMI context | ||
2546 | * | 2562 | * |
2547 | * If for some reason the ring buffer starts to recurse, we | 2563 | * 1010 - 1 = 1001 |
2548 | * only allow that to happen at most 4 times (one for each | 2564 | * 1010 & 1001 = 1000 (clearing bit 1) |
2549 | * context). If it happens 5 times, then we consider this a | 2565 | * |
2550 | * recusive loop and do not let it go further. | 2566 | * The least significant bit can be cleared this way, and it |
2567 | * just so happens that it is the same bit corresponding to | ||
2568 | * the current context. | ||
2551 | */ | 2569 | */ |
2552 | 2570 | ||
2553 | static __always_inline int | 2571 | static __always_inline int |
2554 | trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) | 2572 | trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) |
2555 | { | 2573 | { |
2556 | if (cpu_buffer->current_context >= 4) | 2574 | unsigned int val = cpu_buffer->current_context; |
2575 | unsigned long pc = preempt_count(); | ||
2576 | int bit; | ||
2577 | |||
2578 | if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) | ||
2579 | bit = RB_CTX_NORMAL; | ||
2580 | else | ||
2581 | bit = pc & NMI_MASK ? RB_CTX_NMI : | ||
2582 | pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; | ||
2583 | |||
2584 | if (unlikely(val & (1 << bit))) | ||
2557 | return 1; | 2585 | return 1; |
2558 | 2586 | ||
2559 | cpu_buffer->current_context++; | 2587 | val |= (1 << bit); |
2560 | /* Interrupts must see this update */ | 2588 | cpu_buffer->current_context = val; |
2561 | barrier(); | ||
2562 | 2589 | ||
2563 | return 0; | 2590 | return 0; |
2564 | } | 2591 | } |
@@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) | |||
2566 | static __always_inline void | 2593 | static __always_inline void |
2567 | trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) | 2594 | trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) |
2568 | { | 2595 | { |
2569 | /* Don't let the dec leak out */ | 2596 | cpu_buffer->current_context &= cpu_buffer->current_context - 1; |
2570 | barrier(); | ||
2571 | cpu_buffer->current_context--; | ||
2572 | } | 2597 | } |
2573 | 2598 | ||
2574 | /** | 2599 | /** |
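
The rewritten unlock relies on the classic v & (v - 1) identity, which clears the lowest set bit of v. Because trace_recursive_lock() only ever sets the bit of the innermost (current) context, that lowest set bit is exactly the one that has to go away on unlock. A standalone illustration (userspace C):

    #include <assert.h>

    /* Clear the lowest set bit, as trace_recursive_unlock() does. */
    static unsigned int clear_lowest_bit(unsigned int v)
    {
            return v & (v - 1);
    }

    int main(void)
    {
            assert(clear_lowest_bit(0x5) == 0x4);   /*  101b ->  100b */
            assert(clear_lowest_bit(0xa) == 0x8);   /* 1010b -> 1000b */
            return 0;
    }
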
@@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) | |||
4406 | { | 4431 | { |
4407 | struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; | 4432 | struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; |
4408 | struct buffer_data_page *bpage = data; | 4433 | struct buffer_data_page *bpage = data; |
4434 | struct page *page = virt_to_page(bpage); | ||
4409 | unsigned long flags; | 4435 | unsigned long flags; |
4410 | 4436 | ||
4437 | /* If the page is still in use someplace else, we can't reuse it */ | ||
4438 | if (page_ref_count(page) > 1) | ||
4439 | goto out; | ||
4440 | |||
4411 | local_irq_save(flags); | 4441 | local_irq_save(flags); |
4412 | arch_spin_lock(&cpu_buffer->lock); | 4442 | arch_spin_lock(&cpu_buffer->lock); |
4413 | 4443 | ||
@@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) | |||
4419 | arch_spin_unlock(&cpu_buffer->lock); | 4449 | arch_spin_unlock(&cpu_buffer->lock); |
4420 | local_irq_restore(flags); | 4450 | local_irq_restore(flags); |
4421 | 4451 | ||
4452 | out: | ||
4422 | free_page((unsigned long)bpage); | 4453 | free_page((unsigned long)bpage); |
4423 | } | 4454 | } |
4424 | EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); | 4455 | EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..8e3f20a18a06 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct | |||
362 | } | 362 | } |
363 | 363 | ||
364 | /** | 364 | /** |
365 | * trace_pid_filter_add_remove - Add or remove a task from a pid_list | 365 | * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list |
366 | * @pid_list: The list to modify | 366 | * @pid_list: The list to modify |
367 | * @self: The current task for fork or NULL for exit | 367 | * @self: The current task for fork or NULL for exit |
368 | * @task: The task to add or remove | 368 | * @task: The task to add or remove |
@@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr) | |||
925 | } | 925 | } |
926 | 926 | ||
927 | /** | 927 | /** |
928 | * trace_snapshot - take a snapshot of the current buffer. | 928 | * tracing_snapshot - take a snapshot of the current buffer. |
929 | * | 929 | * |
930 | * This causes a swap between the snapshot buffer and the current live | 930 | * This causes a swap between the snapshot buffer and the current live |
931 | * tracing buffer. You can use this to take snapshots of the live | 931 | * tracing buffer. You can use this to take snapshots of the live |
@@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void) | |||
1004 | EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); | 1004 | EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); |
1005 | 1005 | ||
1006 | /** | 1006 | /** |
1007 | * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. | 1007 | * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. |
1008 | * | 1008 | * |
1009 | * This is similar to trace_snapshot(), but it will allocate the | 1009 | * This is similar to tracing_snapshot(), but it will allocate the |
1010 | * snapshot buffer if it isn't already allocated. Use this only | 1010 | * snapshot buffer if it isn't already allocated. Use this only |
1011 | * where it is safe to sleep, as the allocation may sleep. | 1011 | * where it is safe to sleep, as the allocation may sleep. |
1012 | * | 1012 | * |
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh; | |||
1303 | /* | 1303 | /* |
1304 | * Copy the new maximum trace into the separate maximum-trace | 1304 | * Copy the new maximum trace into the separate maximum-trace |
1305 | * structure. (this way the maximum trace is permanently saved, | 1305 | * structure. (this way the maximum trace is permanently saved, |
1306 | * for later retrieval via /sys/kernel/debug/tracing/latency_trace) | 1306 | * for later retrieval via /sys/kernel/tracing/tracing_max_latency) |
1307 | */ | 1307 | */ |
1308 | static void | 1308 | static void |
1309 | __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) | 1309 | __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) |
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) | |||
2374 | } | 2374 | } |
2375 | EXPORT_SYMBOL_GPL(trace_event_buffer_commit); | 2375 | EXPORT_SYMBOL_GPL(trace_event_buffer_commit); |
2376 | 2376 | ||
2377 | /* | ||
2378 | * Skip 3: | ||
2379 | * | ||
2380 | * trace_buffer_unlock_commit_regs() | ||
2381 | * trace_event_buffer_commit() | ||
2382 | * trace_event_raw_event_xxx() | ||
2383 | */ | ||
2384 | # define STACK_SKIP 3 | ||
2385 | |||
2377 | void trace_buffer_unlock_commit_regs(struct trace_array *tr, | 2386 | void trace_buffer_unlock_commit_regs(struct trace_array *tr, |
2378 | struct ring_buffer *buffer, | 2387 | struct ring_buffer *buffer, |
2379 | struct ring_buffer_event *event, | 2388 | struct ring_buffer_event *event, |
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, | |||
2383 | __buffer_unlock_commit(buffer, event); | 2392 | __buffer_unlock_commit(buffer, event); |
2384 | 2393 | ||
2385 | /* | 2394 | /* |
2386 | * If regs is not set, then skip the following callers: | 2395 | * If regs is not set, then skip the necessary functions. |
2387 | * trace_buffer_unlock_commit_regs | ||
2388 | * event_trigger_unlock_commit | ||
2389 | * trace_event_buffer_commit | ||
2390 | * trace_event_raw_event_sched_switch | ||
2391 | * Note, we can still get here via blktrace, wakeup tracer | 2396 | * Note, we can still get here via blktrace, wakeup tracer |
2392 | * and mmiotrace, but that's ok if they lose a function or | 2397 | * and mmiotrace, but that's ok if they lose a function or |
2393 | * two. They are that meaningful. | 2398 | * two. They are not that meaningful. |
2394 | */ | 2399 | */ |
2395 | ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); | 2400 | ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); |
2396 | ftrace_trace_userstack(buffer, flags, pc); | 2401 | ftrace_trace_userstack(buffer, flags, pc); |
2397 | } | 2402 | } |
2398 | 2403 | ||
@@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export, | |||
2415 | 2420 | ||
2416 | entry = ring_buffer_event_data(event); | 2421 | entry = ring_buffer_event_data(event); |
2417 | size = ring_buffer_event_length(event); | 2422 | size = ring_buffer_event_length(event); |
2418 | export->write(entry, size); | 2423 | export->write(export, entry, size); |
2419 | } | 2424 | } |
2420 | 2425 | ||
2421 | static DEFINE_MUTEX(ftrace_export_lock); | 2426 | static DEFINE_MUTEX(ftrace_export_lock); |
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, | |||
2579 | trace.skip = skip; | 2584 | trace.skip = skip; |
2580 | 2585 | ||
2581 | /* | 2586 | /* |
2582 | * Add two, for this function and the call to save_stack_trace() | 2587 | * Add one, for this function and the call to save_stack_trace() |
2583 | * If regs is set, then these functions will not be in the way. | 2588 | * If regs is set, then these functions will not be in the way. |
2584 | */ | 2589 | */ |
2590 | #ifndef CONFIG_UNWINDER_ORC | ||
2585 | if (!regs) | 2591 | if (!regs) |
2586 | trace.skip += 2; | 2592 | trace.skip++; |
2593 | #endif | ||
2587 | 2594 | ||
2588 | /* | 2595 | /* |
2589 | * Since events can happen in NMIs there's no safe way to | 2596 | * Since events can happen in NMIs there's no safe way to |
@@ -2711,11 +2718,10 @@ void trace_dump_stack(int skip) | |||
2711 | 2718 | ||
2712 | local_save_flags(flags); | 2719 | local_save_flags(flags); |
2713 | 2720 | ||
2714 | /* | 2721 | #ifndef CONFIG_UNWINDER_ORC |
2715 | * Skip 3 more, seems to get us at the caller of | 2722 | /* Skip 1 to skip this function. */ |
2716 | * this function. | 2723 | skip++; |
2717 | */ | 2724 | #endif |
2718 | skip += 3; | ||
2719 | __ftrace_trace_stack(global_trace.trace_buffer.buffer, | 2725 | __ftrace_trace_stack(global_trace.trace_buffer.buffer, |
2720 | flags, skip, preempt_count(), NULL); | 2726 | flags, skip, preempt_count(), NULL); |
2721 | } | 2727 | } |
@@ -4178,37 +4184,30 @@ static const struct file_operations show_traces_fops = { | |||
4178 | .llseek = seq_lseek, | 4184 | .llseek = seq_lseek, |
4179 | }; | 4185 | }; |
4180 | 4186 | ||
4181 | /* | ||
4182 | * The tracer itself will not take this lock, but still we want | ||
4183 | * to provide a consistent cpumask to user-space: | ||
4184 | */ | ||
4185 | static DEFINE_MUTEX(tracing_cpumask_update_lock); | ||
4186 | |||
4187 | /* | ||
4188 | * Temporary storage for the character representation of the | ||
4189 | * CPU bitmask (and one more byte for the newline): | ||
4190 | */ | ||
4191 | static char mask_str[NR_CPUS + 1]; | ||
4192 | |||
4193 | static ssize_t | 4187 | static ssize_t |
4194 | tracing_cpumask_read(struct file *filp, char __user *ubuf, | 4188 | tracing_cpumask_read(struct file *filp, char __user *ubuf, |
4195 | size_t count, loff_t *ppos) | 4189 | size_t count, loff_t *ppos) |
4196 | { | 4190 | { |
4197 | struct trace_array *tr = file_inode(filp)->i_private; | 4191 | struct trace_array *tr = file_inode(filp)->i_private; |
4192 | char *mask_str; | ||
4198 | int len; | 4193 | int len; |
4199 | 4194 | ||
4200 | mutex_lock(&tracing_cpumask_update_lock); | 4195 | len = snprintf(NULL, 0, "%*pb\n", |
4196 | cpumask_pr_args(tr->tracing_cpumask)) + 1; | ||
4197 | mask_str = kmalloc(len, GFP_KERNEL); | ||
4198 | if (!mask_str) | ||
4199 | return -ENOMEM; | ||
4201 | 4200 | ||
4202 | len = snprintf(mask_str, count, "%*pb\n", | 4201 | len = snprintf(mask_str, len, "%*pb\n", |
4203 | cpumask_pr_args(tr->tracing_cpumask)); | 4202 | cpumask_pr_args(tr->tracing_cpumask)); |
4204 | if (len >= count) { | 4203 | if (len >= count) { |
4205 | count = -EINVAL; | 4204 | count = -EINVAL; |
4206 | goto out_err; | 4205 | goto out_err; |
4207 | } | 4206 | } |
4208 | count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); | 4207 | count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); |
4209 | 4208 | ||
4210 | out_err: | 4209 | out_err: |
4211 | mutex_unlock(&tracing_cpumask_update_lock); | 4210 | kfree(mask_str); |
4212 | 4211 | ||
4213 | return count; | 4212 | return count; |
4214 | } | 4213 | } |
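
The rewrite sizes the buffer with a counting-only snprintf(NULL, 0, ...) pass, which returns the number of characters the formatted string would need; allocating that many bytes plus one for the terminating NUL replaces the old NR_CPUS-sized static buffer and the mutex that protected it. The same two-pass pattern in standalone form (userspace C):

    #include <stdio.h>
    #include <stdlib.h>

    /* Format into a heap buffer sized by a first, counting-only pass. */
    static char *format_dup(const char *name, int value)
    {
            int len = snprintf(NULL, 0, "%s=%d\n", name, value) + 1;
            char *buf = malloc(len);

            if (!buf)
                    return NULL;
            snprintf(buf, len, "%s=%d\n", name, value);
            return buf;
    }

    int main(void)
    {
            char *s = format_dup("cpus", 8);

            if (s) {
                    fputs(s, stdout);
                    free(s);
            }
            return 0;
    }
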
@@ -4228,8 +4227,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
4228 | if (err) | 4227 | if (err) |
4229 | goto err_unlock; | 4228 | goto err_unlock; |
4230 | 4229 | ||
4231 | mutex_lock(&tracing_cpumask_update_lock); | ||
4232 | |||
4233 | local_irq_disable(); | 4230 | local_irq_disable(); |
4234 | arch_spin_lock(&tr->max_lock); | 4231 | arch_spin_lock(&tr->max_lock); |
4235 | for_each_tracing_cpu(cpu) { | 4232 | for_each_tracing_cpu(cpu) { |
@@ -4252,8 +4249,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, | |||
4252 | local_irq_enable(); | 4249 | local_irq_enable(); |
4253 | 4250 | ||
4254 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); | 4251 | cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); |
4255 | |||
4256 | mutex_unlock(&tracing_cpumask_update_lock); | ||
4257 | free_cpumask_var(tracing_cpumask_new); | 4252 | free_cpumask_var(tracing_cpumask_new); |
4258 | 4253 | ||
4259 | return count; | 4254 | return count; |
@@ -6780,7 +6775,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
6780 | .spd_release = buffer_spd_release, | 6775 | .spd_release = buffer_spd_release, |
6781 | }; | 6776 | }; |
6782 | struct buffer_ref *ref; | 6777 | struct buffer_ref *ref; |
6783 | int entries, size, i; | 6778 | int entries, i; |
6784 | ssize_t ret = 0; | 6779 | ssize_t ret = 0; |
6785 | 6780 | ||
6786 | #ifdef CONFIG_TRACER_MAX_TRACE | 6781 | #ifdef CONFIG_TRACER_MAX_TRACE |
@@ -6834,14 +6829,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
6834 | break; | 6829 | break; |
6835 | } | 6830 | } |
6836 | 6831 | ||
6837 | /* | ||
6838 | * zero out any left over data, this is going to | ||
6839 | * user land. | ||
6840 | */ | ||
6841 | size = ring_buffer_page_len(ref->page); | ||
6842 | if (size < PAGE_SIZE) | ||
6843 | memset(ref->page + size, 0, PAGE_SIZE - size); | ||
6844 | |||
6845 | page = virt_to_page(ref->page); | 6832 | page = virt_to_page(ref->page); |
6846 | 6833 | ||
6847 | spd.pages[i] = page; | 6834 | spd.pages[i] = page; |
@@ -7599,6 +7586,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size | |||
7599 | buf->data = alloc_percpu(struct trace_array_cpu); | 7586 | buf->data = alloc_percpu(struct trace_array_cpu); |
7600 | if (!buf->data) { | 7587 | if (!buf->data) { |
7601 | ring_buffer_free(buf->buffer); | 7588 | ring_buffer_free(buf->buffer); |
7589 | buf->buffer = NULL; | ||
7602 | return -ENOMEM; | 7590 | return -ENOMEM; |
7603 | } | 7591 | } |
7604 | 7592 | ||
@@ -7622,7 +7610,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) | |||
7622 | allocate_snapshot ? size : 1); | 7610 | allocate_snapshot ? size : 1); |
7623 | if (WARN_ON(ret)) { | 7611 | if (WARN_ON(ret)) { |
7624 | ring_buffer_free(tr->trace_buffer.buffer); | 7612 | ring_buffer_free(tr->trace_buffer.buffer); |
7613 | tr->trace_buffer.buffer = NULL; | ||
7625 | free_percpu(tr->trace_buffer.data); | 7614 | free_percpu(tr->trace_buffer.data); |
7615 | tr->trace_buffer.data = NULL; | ||
7626 | return -ENOMEM; | 7616 | return -ENOMEM; |
7627 | } | 7617 | } |
7628 | tr->allocated_snapshot = allocate_snapshot; | 7618 | tr->allocated_snapshot = allocate_snapshot; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa4e151..1b87157edbff 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) | |||
2213 | { | 2213 | { |
2214 | struct trace_event_call *call, *p; | 2214 | struct trace_event_call *call, *p; |
2215 | const char *last_system = NULL; | 2215 | const char *last_system = NULL; |
2216 | bool first = false; | ||
2216 | int last_i; | 2217 | int last_i; |
2217 | int i; | 2218 | int i; |
2218 | 2219 | ||
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) | |||
2220 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | 2221 | list_for_each_entry_safe(call, p, &ftrace_events, list) { |
2221 | /* events are usually grouped together with systems */ | 2222 | /* events are usually grouped together with systems */ |
2222 | if (!last_system || call->class->system != last_system) { | 2223 | if (!last_system || call->class->system != last_system) { |
2224 | first = true; | ||
2223 | last_i = 0; | 2225 | last_i = 0; |
2224 | last_system = call->class->system; | 2226 | last_system = call->class->system; |
2225 | } | 2227 | } |
2226 | 2228 | ||
2229 | /* | ||
2230 | * Since calls are grouped by systems, the likelihood that the | ||
2231 | * next call in the iteration belongs to the same system as the | ||
2232 | * previous call is high. As an optimization, we skip searching | ||
2233 | * for a map[] that matches the call's system if the last call | ||
2234 | * was from the same system. That's what last_i is for. If the | ||
2235 | * call has the same system as the previous call, then last_i | ||
2236 | * will be the index of the first map[] that has a matching | ||
2237 | * system. | ||
2238 | */ | ||
2227 | for (i = last_i; i < len; i++) { | 2239 | for (i = last_i; i < len; i++) { |
2228 | if (call->class->system == map[i]->system) { | 2240 | if (call->class->system == map[i]->system) { |
2229 | /* Save the first system if need be */ | 2241 | /* Save the first system if need be */ |
2230 | if (!last_i) | 2242 | if (first) { |
2231 | last_i = i; | 2243 | last_i = i; |
2244 | first = false; | ||
2245 | } | ||
2232 | update_event_printk(call, map[i]); | 2246 | update_event_printk(call, map[i]); |
2233 | } | 2247 | } |
2234 | } | 2248 | } |
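
The comment added above describes a grouped-search optimization: both the event list and map[] are grouped by system, so the scan for each call can restart at the first map[] entry that matched the previous call's system instead of at index 0, and the new first flag makes sure that block-start index is recorded once per system. The idea in isolation, with invented data and plain string compares standing in for the kernel's pointer checks (userspace C):

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static const char *maps[]  = { "sched", "sched", "irq", "irq", "timer" };
    static const char *calls[] = { "sched", "sched", "irq", "timer" };

    int main(void)
    {
            const char *last_system = NULL;
            size_t last_i = 0;
            bool first = false;

            for (size_t c = 0; c < sizeof(calls) / sizeof(calls[0]); c++) {
                    if (!last_system || strcmp(calls[c], last_system)) {
                            first = true;           /* new system block starts */
                            last_i = 0;
                            last_system = calls[c];
                    }
                    for (size_t i = last_i; i < sizeof(maps) / sizeof(maps[0]); i++) {
                            if (!strcmp(calls[c], maps[i])) {
                                    if (first) {    /* remember where the block begins */
                                            last_i = i;
                                            first = false;
                                    }
                                    printf("call %zu (%s) -> map %zu\n", c, calls[c], i);
                            }
                    }
            }
            return 0;
    }
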
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f2ac9d44f6c4..87411482a46f 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } | |||
1123 | #endif /* CONFIG_TRACER_SNAPSHOT */ | 1123 | #endif /* CONFIG_TRACER_SNAPSHOT */ |
1124 | 1124 | ||
1125 | #ifdef CONFIG_STACKTRACE | 1125 | #ifdef CONFIG_STACKTRACE |
1126 | #ifdef CONFIG_UNWINDER_ORC | ||
1127 | /* Skip 2: | ||
1128 | * event_triggers_post_call() | ||
1129 | * trace_event_raw_event_xxx() | ||
1130 | */ | ||
1131 | # define STACK_SKIP 2 | ||
1132 | #else | ||
1126 | /* | 1133 | /* |
1127 | * Skip 3: | 1134 | * Skip 4: |
1128 | * stacktrace_trigger() | 1135 | * stacktrace_trigger() |
1129 | * event_triggers_post_call() | 1136 | * event_triggers_post_call() |
1137 | * trace_event_buffer_commit() | ||
1130 | * trace_event_raw_event_xxx() | 1138 | * trace_event_raw_event_xxx() |
1131 | */ | 1139 | */ |
1132 | #define STACK_SKIP 3 | 1140 | #define STACK_SKIP 4 |
1141 | #endif | ||
1133 | 1142 | ||
1134 | static void | 1143 | static void |
1135 | stacktrace_trigger(struct event_trigger_data *data, void *rec) | 1144 | stacktrace_trigger(struct event_trigger_data *data, void *rec) |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 27f7ad12c4b1..b611cd36e22d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, | |||
154 | preempt_enable_notrace(); | 154 | preempt_enable_notrace(); |
155 | } | 155 | } |
156 | 156 | ||
157 | #ifdef CONFIG_UNWINDER_ORC | ||
158 | /* | ||
159 | * Skip 2: | ||
160 | * | ||
161 | * function_stack_trace_call() | ||
162 | * ftrace_call() | ||
163 | */ | ||
164 | #define STACK_SKIP 2 | ||
165 | #else | ||
166 | /* | ||
167 | * Skip 3: | ||
168 | * __trace_stack() | ||
169 | * function_stack_trace_call() | ||
170 | * ftrace_call() | ||
171 | */ | ||
172 | #define STACK_SKIP 3 | ||
173 | #endif | ||
174 | |||
157 | static void | 175 | static void |
158 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, | 176 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, |
159 | struct ftrace_ops *op, struct pt_regs *pt_regs) | 177 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
180 | if (likely(disabled == 1)) { | 198 | if (likely(disabled == 1)) { |
181 | pc = preempt_count(); | 199 | pc = preempt_count(); |
182 | trace_function(tr, ip, parent_ip, flags, pc); | 200 | trace_function(tr, ip, parent_ip, flags, pc); |
183 | /* | 201 | __trace_stack(tr, flags, STACK_SKIP, pc); |
184 | * skip over 5 funcs: | ||
185 | * __ftrace_trace_stack, | ||
186 | * __trace_stack, | ||
187 | * function_stack_trace_call | ||
188 | * ftrace_list_func | ||
189 | * ftrace_call | ||
190 | */ | ||
191 | __trace_stack(tr, flags, 5, pc); | ||
192 | } | 202 | } |
193 | 203 | ||
194 | atomic_dec(&data->disabled); | 204 | atomic_dec(&data->disabled); |
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, | |||
367 | tracer_tracing_off(tr); | 377 | tracer_tracing_off(tr); |
368 | } | 378 | } |
369 | 379 | ||
380 | #ifdef CONFIG_UNWINDER_ORC | ||
370 | /* | 381 | /* |
371 | * Skip 4: | 382 | * Skip 3: |
383 | * | ||
384 | * function_trace_probe_call() | ||
385 | * ftrace_ops_assist_func() | ||
386 | * ftrace_call() | ||
387 | */ | ||
388 | #define FTRACE_STACK_SKIP 3 | ||
389 | #else | ||
390 | /* | ||
391 | * Skip 5: | ||
392 | * | ||
393 | * __trace_stack() | ||
372 | * ftrace_stacktrace() | 394 | * ftrace_stacktrace() |
373 | * function_trace_probe_call() | 395 | * function_trace_probe_call() |
374 | * ftrace_ops_list_func() | 396 | * ftrace_ops_assist_func() |
375 | * ftrace_call() | 397 | * ftrace_call() |
376 | */ | 398 | */ |
377 | #define STACK_SKIP 4 | 399 | #define FTRACE_STACK_SKIP 5 |
400 | #endif | ||
378 | 401 | ||
379 | static __always_inline void trace_stack(struct trace_array *tr) | 402 | static __always_inline void trace_stack(struct trace_array *tr) |
380 | { | 403 | { |
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr) | |||
384 | local_save_flags(flags); | 407 | local_save_flags(flags); |
385 | pc = preempt_count(); | 408 | pc = preempt_count(); |
386 | 409 | ||
387 | __trace_stack(tr, flags, STACK_SKIP, pc); | 410 | __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); |
388 | } | 411 | } |
389 | 412 | ||
390 | static void | 413 | static void |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc02418..3c7bfc4bf5e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, | |||
209 | if (__this_cpu_read(disable_stack_tracer) != 1) | 209 | if (__this_cpu_read(disable_stack_tracer) != 1) |
210 | goto out; | 210 | goto out; |
211 | 211 | ||
212 | /* If rcu is not watching, then save stack trace can fail */ | ||
213 | if (!rcu_is_watching()) | ||
214 | goto out; | ||
215 | |||
212 | ip += MCOUNT_INSN_SIZE; | 216 | ip += MCOUNT_INSN_SIZE; |
213 | 217 | ||
214 | check_stack(ip, &stack); | 218 | check_stack(ip, &stack); |
diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c | |||
@@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) | |||
192 | return retval; | 192 | return retval; |
193 | } | 193 | } |
194 | 194 | ||
195 | groups_sort(group_info); | ||
195 | retval = set_current_groups(group_info); | 196 | retval = set_current_groups(group_info); |
196 | put_group_info(group_info); | 197 | put_group_info(group_info); |
197 | 198 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..f699122dab32 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -38,7 +38,6 @@ | |||
38 | #include <linux/hardirq.h> | 38 | #include <linux/hardirq.h> |
39 | #include <linux/mempolicy.h> | 39 | #include <linux/mempolicy.h> |
40 | #include <linux/freezer.h> | 40 | #include <linux/freezer.h> |
41 | #include <linux/kallsyms.h> | ||
42 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
43 | #include <linux/lockdep.h> | 42 | #include <linux/lockdep.h> |
44 | #include <linux/idr.h> | 43 | #include <linux/idr.h> |
@@ -48,6 +47,8 @@ | |||
48 | #include <linux/nodemask.h> | 47 | #include <linux/nodemask.h> |
49 | #include <linux/moduleparam.h> | 48 | #include <linux/moduleparam.h> |
50 | #include <linux/uaccess.h> | 49 | #include <linux/uaccess.h> |
50 | #include <linux/sched/isolation.h> | ||
51 | #include <linux/nmi.h> | ||
51 | 52 | ||
52 | #include "workqueue_internal.h" | 53 | #include "workqueue_internal.h" |
53 | 54 | ||
@@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker) | |||
1634 | mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); | 1635 | mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); |
1635 | 1636 | ||
1636 | /* | 1637 | /* |
1637 | * Sanity check nr_running. Because wq_unbind_fn() releases | 1638 | * Sanity check nr_running. Because unbind_workers() releases |
1638 | * pool->lock between setting %WORKER_UNBOUND and zapping | 1639 | * pool->lock between setting %WORKER_UNBOUND and zapping |
1639 | * nr_running, the warning may trigger spuriously. Check iff | 1640 | * nr_running, the warning may trigger spuriously. Check iff |
1640 | * unbind is not in progress. | 1641 | * unbind is not in progress. |
@@ -4463,6 +4464,12 @@ void show_workqueue_state(void) | |||
4463 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) | 4464 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) |
4464 | show_pwq(pwq); | 4465 | show_pwq(pwq); |
4465 | spin_unlock_irqrestore(&pwq->pool->lock, flags); | 4466 | spin_unlock_irqrestore(&pwq->pool->lock, flags); |
4467 | /* | ||
4468 | * We could be printing a lot from atomic context, e.g. | ||
4469 | * sysrq-t -> show_workqueue_state(). Avoid triggering | ||
4470 | * hard lockup. | ||
4471 | */ | ||
4472 | touch_nmi_watchdog(); | ||
4466 | } | 4473 | } |
4467 | } | 4474 | } |
4468 | 4475 | ||
@@ -4490,6 +4497,12 @@ void show_workqueue_state(void) | |||
4490 | pr_cont("\n"); | 4497 | pr_cont("\n"); |
4491 | next_pool: | 4498 | next_pool: |
4492 | spin_unlock_irqrestore(&pool->lock, flags); | 4499 | spin_unlock_irqrestore(&pool->lock, flags); |
4500 | /* | ||
4501 | * We could be printing a lot from atomic context, e.g. | ||
4502 | * sysrq-t -> show_workqueue_state(). Avoid triggering | ||
4503 | * hard lockup. | ||
4504 | */ | ||
4505 | touch_nmi_watchdog(); | ||
4493 | } | 4506 | } |
4494 | 4507 | ||
4495 | rcu_read_unlock_sched(); | 4508 | rcu_read_unlock_sched(); |
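
touch_nmi_watchdog() simply tells the hard-lockup detector that this CPU is still making progress, so the same pattern applies to any long printing loop that runs with interrupts disabled. A hedged sketch (the record type and function are invented for illustration):

    #include <linux/nmi.h>
    #include <linux/printk.h>

    struct record { unsigned long value; };     /* invented example type */

    static void dump_many_records(const struct record *recs, int nr)
    {
            int i;

            for (i = 0; i < nr; i++) {
                    pr_info("record %d: %lu\n", i, recs[i].value);
                    /* keep the NMI watchdog from declaring a hard lockup */
                    touch_nmi_watchdog();
            }
    }
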
@@ -4510,9 +4523,8 @@ void show_workqueue_state(void) | |||
4510 | * cpu comes back online. | 4523 | * cpu comes back online. |
4511 | */ | 4524 | */ |
4512 | 4525 | ||
4513 | static void wq_unbind_fn(struct work_struct *work) | 4526 | static void unbind_workers(int cpu) |
4514 | { | 4527 | { |
4515 | int cpu = smp_processor_id(); | ||
4516 | struct worker_pool *pool; | 4528 | struct worker_pool *pool; |
4517 | struct worker *worker; | 4529 | struct worker *worker; |
4518 | 4530 | ||
@@ -4589,16 +4601,6 @@ static void rebind_workers(struct worker_pool *pool) | |||
4589 | 4601 | ||
4590 | spin_lock_irq(&pool->lock); | 4602 | spin_lock_irq(&pool->lock); |
4591 | 4603 | ||
4592 | /* | ||
4593 | * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED | ||
4594 | * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is | ||
4595 | * being reworked and this can go away in time. | ||
4596 | */ | ||
4597 | if (!(pool->flags & POOL_DISASSOCIATED)) { | ||
4598 | spin_unlock_irq(&pool->lock); | ||
4599 | return; | ||
4600 | } | ||
4601 | |||
4602 | pool->flags &= ~POOL_DISASSOCIATED; | 4604 | pool->flags &= ~POOL_DISASSOCIATED; |
4603 | 4605 | ||
4604 | for_each_pool_worker(worker, pool) { | 4606 | for_each_pool_worker(worker, pool) { |
@@ -4709,12 +4711,13 @@ int workqueue_online_cpu(unsigned int cpu) | |||
4709 | 4711 | ||
4710 | int workqueue_offline_cpu(unsigned int cpu) | 4712 | int workqueue_offline_cpu(unsigned int cpu) |
4711 | { | 4713 | { |
4712 | struct work_struct unbind_work; | ||
4713 | struct workqueue_struct *wq; | 4714 | struct workqueue_struct *wq; |
4714 | 4715 | ||
4715 | /* unbinding per-cpu workers should happen on the local CPU */ | 4716 | /* unbinding per-cpu workers should happen on the local CPU */ |
4716 | INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); | 4717 | if (WARN_ON(cpu != smp_processor_id())) |
4717 | queue_work_on(cpu, system_highpri_wq, &unbind_work); | 4718 | return -1; |
4719 | |||
4720 | unbind_workers(cpu); | ||
4718 | 4721 | ||
4719 | /* update NUMA affinity of unbound workqueues */ | 4722 | /* update NUMA affinity of unbound workqueues */ |
4720 | mutex_lock(&wq_pool_mutex); | 4723 | mutex_lock(&wq_pool_mutex); |
@@ -4722,9 +4725,6 @@ int workqueue_offline_cpu(unsigned int cpu) | |||
4722 | wq_update_unbound_numa(wq, cpu, false); | 4725 | wq_update_unbound_numa(wq, cpu, false); |
4723 | mutex_unlock(&wq_pool_mutex); | 4726 | mutex_unlock(&wq_pool_mutex); |
4724 | 4727 | ||
4725 | /* wait for per-cpu unbinding to finish */ | ||
4726 | flush_work(&unbind_work); | ||
4727 | destroy_work_on_stack(&unbind_work); | ||
4728 | return 0; | 4728 | return 0; |
4729 | } | 4729 | } |
4730 | 4730 | ||
@@ -4957,6 +4957,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) | |||
4957 | if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) | 4957 | if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) |
4958 | return -ENOMEM; | 4958 | return -ENOMEM; |
4959 | 4959 | ||
4960 | /* | ||
4961 | * Not excluding isolated cpus on purpose. | ||
4962 | * If the user wishes to include them, we allow that. | ||
4963 | */ | ||
4960 | cpumask_and(cpumask, cpumask, cpu_possible_mask); | 4964 | cpumask_and(cpumask, cpumask, cpu_possible_mask); |
4961 | if (!cpumask_empty(cpumask)) { | 4965 | if (!cpumask_empty(cpumask)) { |
4962 | apply_wqattrs_lock(); | 4966 | apply_wqattrs_lock(); |
@@ -5555,7 +5559,7 @@ int __init workqueue_init_early(void) | |||
5555 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | 5559 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); |
5556 | 5560 | ||
5557 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); | 5561 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); |
5558 | cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); | 5562 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); |
5559 | 5563 | ||
5560 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5564 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
5561 | 5565 | ||