Diffstat (limited to 'kernel'): 109 files changed, 4399 insertions, 2035 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 790d83c7d160..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,4 +5,3 @@ config_data.h | |||
5 | config_data.gz | 5 | config_data.gz |
6 | timeconst.h | 6 | timeconst.h |
7 | hz.bc | 7 | hz.bc |
8 | x509_certificate_list | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index 662c007635fb..5ffcbd354a52 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb) | |||
407 | static void kauditd_send_skb(struct sk_buff *skb) | 407 | static void kauditd_send_skb(struct sk_buff *skb) |
408 | { | 408 | { |
409 | int err; | 409 | int err; |
410 | int attempts = 0; | ||
411 | #define AUDITD_RETRIES 5 | ||
412 | |||
413 | restart: | ||
410 | /* take a reference in case we can't send it and we want to hold it */ | 414 | /* take a reference in case we can't send it and we want to hold it */ |
411 | skb_get(skb); | 415 | skb_get(skb); |
412 | err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); | 416 | err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); |
413 | if (err < 0) { | 417 | if (err < 0) { |
414 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ | 418 | pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n", |
419 | audit_pid, err); | ||
415 | if (audit_pid) { | 420 | if (audit_pid) { |
416 | pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); | 421 | if (err == -ECONNREFUSED || err == -EPERM |
417 | audit_log_lost("auditd disappeared"); | 422 | || ++attempts >= AUDITD_RETRIES) { |
418 | audit_pid = 0; | 423 | char s[32]; |
419 | audit_sock = NULL; | 424 | |
425 | snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid); | ||
426 | audit_log_lost(s); | ||
427 | audit_pid = 0; | ||
428 | audit_sock = NULL; | ||
429 | } else { | ||
430 | pr_warn("re-scheduling(#%d) write to audit_pid=%d\n", | ||
431 | attempts, audit_pid); | ||
432 | set_current_state(TASK_INTERRUPTIBLE); | ||
433 | schedule(); | ||
434 | __set_current_state(TASK_RUNNING); | ||
435 | goto restart; | ||
436 | } | ||
420 | } | 437 | } |
421 | /* we might get lucky and get this in the next auditd */ | 438 | /* we might get lucky and get this in the next auditd */ |
422 | audit_hold_skb(skb); | 439 | audit_hold_skb(skb); |
@@ -684,25 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
684 | return err; | 701 | return err; |
685 | } | 702 | } |
686 | 703 | ||
687 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) | 704 | static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) |
688 | { | 705 | { |
689 | int rc = 0; | ||
690 | uid_t uid = from_kuid(&init_user_ns, current_uid()); | 706 | uid_t uid = from_kuid(&init_user_ns, current_uid()); |
691 | pid_t pid = task_tgid_nr(current); | 707 | pid_t pid = task_tgid_nr(current); |
692 | 708 | ||
693 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) { | 709 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) { |
694 | *ab = NULL; | 710 | *ab = NULL; |
695 | return rc; | 711 | return; |
696 | } | 712 | } |
697 | 713 | ||
698 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 714 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
699 | if (unlikely(!*ab)) | 715 | if (unlikely(!*ab)) |
700 | return rc; | 716 | return; |
701 | audit_log_format(*ab, "pid=%d uid=%u", pid, uid); | 717 | audit_log_format(*ab, "pid=%d uid=%u", pid, uid); |
702 | audit_log_session_info(*ab); | 718 | audit_log_session_info(*ab); |
703 | audit_log_task_context(*ab); | 719 | audit_log_task_context(*ab); |
704 | |||
705 | return rc; | ||
706 | } | 720 | } |
707 | 721 | ||
708 | int is_audit_feature_set(int i) | 722 | int is_audit_feature_set(int i) |
@@ -1357,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, | |||
1357 | if (unlikely(audit_filter_type(type))) | 1371 | if (unlikely(audit_filter_type(type))) |
1358 | return NULL; | 1372 | return NULL; |
1359 | 1373 | ||
1360 | if (gfp_mask & __GFP_WAIT) { | 1374 | if (gfp_mask & __GFP_DIRECT_RECLAIM) { |
1361 | if (audit_pid && audit_pid == current->pid) | 1375 | if (audit_pid && audit_pid == current->pid) |
1362 | gfp_mask &= ~__GFP_WAIT; | 1376 | gfp_mask &= ~__GFP_DIRECT_RECLAIM; |
1363 | else | 1377 | else |
1364 | reserve = 0; | 1378 | reserve = 0; |
1365 | } | 1379 | } |
1366 | 1380 | ||
1367 | while (audit_backlog_limit | 1381 | while (audit_backlog_limit |
1368 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { | 1382 | && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { |
1369 | if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { | 1383 | if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { |
1370 | long sleep_time; | 1384 | long sleep_time; |
1371 | 1385 | ||
1372 | sleep_time = timeout_start + audit_backlog_wait_time - jiffies; | 1386 | sleep_time = timeout_start + audit_backlog_wait_time - jiffies; |
@@ -1566,14 +1580,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string, | |||
1566 | * @string: string to be checked | 1580 | * @string: string to be checked |
1567 | * @len: max length of the string to check | 1581 | * @len: max length of the string to check |
1568 | */ | 1582 | */ |
1569 | int audit_string_contains_control(const char *string, size_t len) | 1583 | bool audit_string_contains_control(const char *string, size_t len) |
1570 | { | 1584 | { |
1571 | const unsigned char *p; | 1585 | const unsigned char *p; |
1572 | for (p = string; p < (const unsigned char *)string + len; p++) { | 1586 | for (p = string; p < (const unsigned char *)string + len; p++) { |
1573 | if (*p == '"' || *p < 0x21 || *p > 0x7e) | 1587 | if (*p == '"' || *p < 0x21 || *p > 0x7e) |
1574 | return 1; | 1588 | return true; |
1575 | } | 1589 | } |
1576 | return 0; | 1590 | return false; |
1577 | } | 1591 | } |
1578 | 1592 | ||
1579 | /** | 1593 | /** |
diff --git a/kernel/audit.h b/kernel/audit.h
index dadf86a0e59e..de6cbb7cf547 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -301,7 +301,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark | |||
301 | #ifdef CONFIG_AUDIT_TREE | 301 | #ifdef CONFIG_AUDIT_TREE |
302 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); | 302 | extern struct audit_chunk *audit_tree_lookup(const struct inode *); |
303 | extern void audit_put_chunk(struct audit_chunk *); | 303 | extern void audit_put_chunk(struct audit_chunk *); |
304 | extern int audit_tree_match(struct audit_chunk *, struct audit_tree *); | 304 | extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *); |
305 | extern int audit_make_tree(struct audit_krule *, char *, u32); | 305 | extern int audit_make_tree(struct audit_krule *, char *, u32); |
306 | extern int audit_add_tree_rule(struct audit_krule *); | 306 | extern int audit_add_tree_rule(struct audit_krule *); |
307 | extern int audit_remove_tree_rule(struct audit_krule *); | 307 | extern int audit_remove_tree_rule(struct audit_krule *); |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 94ecdabda8e6..5efe9b299a12 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) | |||
197 | return NULL; | 197 | return NULL; |
198 | } | 198 | } |
199 | 199 | ||
200 | int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) | 200 | bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) |
201 | { | 201 | { |
202 | int n; | 202 | int n; |
203 | for (n = 0; n < chunk->count; n++) | 203 | for (n = 0; n < chunk->count; n++) |
204 | if (chunk->owners[n].owner == tree) | 204 | if (chunk->owners[n].owner == tree) |
205 | return 1; | 205 | return true; |
206 | return 0; | 206 | return false; |
207 | } | 207 | } |
208 | 208 | ||
209 | /* tagging and untagging inodes with trees */ | 209 | /* tagging and untagging inodes with trees */ |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7714d93edb85..b8ff9e193753 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -39,13 +39,13 @@ | |||
39 | * Locking model: | 39 | * Locking model: |
40 | * | 40 | * |
41 | * audit_filter_mutex: | 41 | * audit_filter_mutex: |
42 | * Synchronizes writes and blocking reads of audit's filterlist | 42 | * Synchronizes writes and blocking reads of audit's filterlist |
43 | * data. Rcu is used to traverse the filterlist and access | 43 | * data. Rcu is used to traverse the filterlist and access |
44 | * contents of structs audit_entry, audit_watch and opaque | 44 | * contents of structs audit_entry, audit_watch and opaque |
45 | * LSM rules during filtering. If modified, these structures | 45 | * LSM rules during filtering. If modified, these structures |
46 | * must be copied and replace their counterparts in the filterlist. | 46 | * must be copied and replace their counterparts in the filterlist. |
47 | * An audit_parent struct is not accessed during filtering, so may | 47 | * An audit_parent struct is not accessed during filtering, so may |
48 | * be written directly provided audit_filter_mutex is held. | 48 | * be written directly provided audit_filter_mutex is held. |
49 | */ | 49 | */ |
50 | 50 | ||
51 | /* Audit filter lists, defined in <linux/audit.h> */ | 51 | /* Audit filter lists, defined in <linux/audit.h> */ |
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6983be12bd3..13272582eee0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,2 +1,4 @@ | |||
1 | obj-y := core.o | 1 | obj-y := core.o |
2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o | 2 | |
3 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o | ||
4 | obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace107f236..3f4c99e06c6b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/mm.h> | 16 | #include <linux/mm.h> |
17 | #include <linux/filter.h> | 17 | #include <linux/filter.h> |
18 | #include <linux/perf_event.h> | ||
18 | 19 | ||
19 | /* Called from syscall */ | 20 | /* Called from syscall */ |
20 | static struct bpf_map *array_map_alloc(union bpf_attr *attr) | 21 | static struct bpf_map *array_map_alloc(union bpf_attr *attr) |
@@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
48 | array->map.key_size = attr->key_size; | 49 | array->map.key_size = attr->key_size; |
49 | array->map.value_size = attr->value_size; | 50 | array->map.value_size = attr->value_size; |
50 | array->map.max_entries = attr->max_entries; | 51 | array->map.max_entries = attr->max_entries; |
51 | 52 | array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; | |
52 | array->elem_size = elem_size; | 53 | array->elem_size = elem_size; |
53 | 54 | ||
54 | return &array->map; | 55 | return &array->map; |
@@ -291,14 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) | |||
291 | 292 | ||
292 | attr = perf_event_attrs(event); | 293 | attr = perf_event_attrs(event); |
293 | if (IS_ERR(attr)) | 294 | if (IS_ERR(attr)) |
294 | return (void *)attr; | 295 | goto err; |
295 | 296 | ||
296 | if (attr->type != PERF_TYPE_RAW && | 297 | if (attr->inherit) |
297 | attr->type != PERF_TYPE_HARDWARE) { | 298 | goto err; |
298 | perf_event_release_kernel(event); | 299 | |
299 | return ERR_PTR(-EINVAL); | 300 | if (attr->type == PERF_TYPE_RAW) |
300 | } | 301 | return event; |
301 | return event; | 302 | |
303 | if (attr->type == PERF_TYPE_HARDWARE) | ||
304 | return event; | ||
305 | |||
306 | if (attr->type == PERF_TYPE_SOFTWARE && | ||
307 | attr->config == PERF_COUNT_SW_BPF_OUTPUT) | ||
308 | return event; | ||
309 | err: | ||
310 | perf_event_release_kernel(event); | ||
311 | return ERR_PTR(-EINVAL); | ||
302 | } | 312 | } |
303 | 313 | ||
304 | static void perf_event_fd_array_put_ptr(void *ptr) | 314 | static void perf_event_fd_array_put_ptr(void *ptr) |
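The arraymap.c hunk above widens what a BPF_MAP_TYPE_PERF_EVENT_ARRAY will accept: besides raw and hardware events, software events with config PERF_COUNT_SW_BPF_OUTPUT now pass, while inherited events are rejected. A minimal user-space sketch of opening such an event follows; it assumes the PERF_COUNT_SW_BPF_OUTPUT constant from this series' uapi headers, open_bpf_output_event() and the raw-syscall shim are illustrative names, and storing the returned fd into the map via BPF_MAP_UPDATE_ELEM is left out.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>

/* perf_event_open(2) has no glibc wrapper; the usual raw-syscall shim. */
static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int open_bpf_output_event(int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;		/* now accepted by the check above */
	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
	attr.sample_type = PERF_SAMPLE_RAW;
	/* attr.inherit stays 0: inherited events are rejected by the new code */

	return perf_event_open(&attr, -1 /* any pid */, cpu, -1 /* no group */, 0);
}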
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 67c380cfa9ca..334b1bdd572c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) | |||
82 | if (fp == NULL) | 82 | if (fp == NULL) |
83 | return NULL; | 83 | return NULL; |
84 | 84 | ||
85 | kmemcheck_annotate_bitfield(fp, meta); | ||
86 | |||
85 | aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); | 87 | aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); |
86 | if (aux == NULL) { | 88 | if (aux == NULL) { |
87 | vfree(fp); | 89 | vfree(fp); |
@@ -90,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) | |||
90 | 92 | ||
91 | fp->pages = size / PAGE_SIZE; | 93 | fp->pages = size / PAGE_SIZE; |
92 | fp->aux = aux; | 94 | fp->aux = aux; |
95 | fp->aux->prog = fp; | ||
93 | 96 | ||
94 | return fp; | 97 | return fp; |
95 | } | 98 | } |
@@ -110,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, | |||
110 | 113 | ||
111 | fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); | 114 | fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); |
112 | if (fp != NULL) { | 115 | if (fp != NULL) { |
116 | kmemcheck_annotate_bitfield(fp, meta); | ||
117 | |||
113 | memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); | 118 | memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); |
114 | fp->pages = size / PAGE_SIZE; | 119 | fp->pages = size / PAGE_SIZE; |
120 | fp->aux->prog = fp; | ||
115 | 121 | ||
116 | /* We keep fp->aux from fp_old around in the new | 122 | /* We keep fp->aux from fp_old around in the new |
117 | * reallocated structure. | 123 | * reallocated structure. |
@@ -722,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp) | |||
722 | struct bpf_prog_aux *aux = fp->aux; | 728 | struct bpf_prog_aux *aux = fp->aux; |
723 | 729 | ||
724 | INIT_WORK(&aux->work, bpf_prog_free_deferred); | 730 | INIT_WORK(&aux->work, bpf_prog_free_deferred); |
725 | aux->prog = fp; | ||
726 | schedule_work(&aux->work); | 731 | schedule_work(&aux->work); |
727 | } | 732 | } |
728 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 733 | EXPORT_SYMBOL_GPL(bpf_prog_free); |
729 | 734 | ||
735 | /* RNG for unpriviledged user space with separated state from prandom_u32(). */ | ||
736 | static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); | ||
737 | |||
738 | void bpf_user_rnd_init_once(void) | ||
739 | { | ||
740 | prandom_init_once(&bpf_user_rnd_state); | ||
741 | } | ||
742 | |||
743 | u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
744 | { | ||
745 | /* Should someone ever have the rather unwise idea to use some | ||
746 | * of the registers passed into this function, then note that | ||
747 | * this function is called from native eBPF and classic-to-eBPF | ||
748 | * transformations. Register assignments from both sides are | ||
749 | * different, f.e. classic always sets fn(ctx, A, X) here. | ||
750 | */ | ||
751 | struct rnd_state *state; | ||
752 | u32 res; | ||
753 | |||
754 | state = &get_cpu_var(bpf_user_rnd_state); | ||
755 | res = prandom_u32_state(state); | ||
756 | put_cpu_var(state); | ||
757 | |||
758 | return res; | ||
759 | } | ||
760 | |||
730 | /* Weak definitions of helper functions in case we don't have bpf syscall. */ | 761 | /* Weak definitions of helper functions in case we don't have bpf syscall. */ |
731 | const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; | 762 | const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; |
732 | const struct bpf_func_proto bpf_map_update_elem_proto __weak; | 763 | const struct bpf_func_proto bpf_map_update_elem_proto __weak; |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..19909b22b4f8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@ | |||
17 | struct bpf_htab { | 17 | struct bpf_htab { |
18 | struct bpf_map map; | 18 | struct bpf_map map; |
19 | struct hlist_head *buckets; | 19 | struct hlist_head *buckets; |
20 | spinlock_t lock; | 20 | raw_spinlock_t lock; |
21 | u32 count; /* number of elements in this hashtable */ | 21 | u32 count; /* number of elements in this hashtable */ |
22 | u32 n_buckets; /* number of hash buckets */ | 22 | u32 n_buckets; /* number of hash buckets */ |
23 | u32 elem_size; /* size of each element in bytes */ | 23 | u32 elem_size; /* size of each element in bytes */ |
@@ -82,12 +82,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
82 | for (i = 0; i < htab->n_buckets; i++) | 82 | for (i = 0; i < htab->n_buckets; i++) |
83 | INIT_HLIST_HEAD(&htab->buckets[i]); | 83 | INIT_HLIST_HEAD(&htab->buckets[i]); |
84 | 84 | ||
85 | spin_lock_init(&htab->lock); | 85 | raw_spin_lock_init(&htab->lock); |
86 | htab->count = 0; | 86 | htab->count = 0; |
87 | 87 | ||
88 | htab->elem_size = sizeof(struct htab_elem) + | 88 | htab->elem_size = sizeof(struct htab_elem) + |
89 | round_up(htab->map.key_size, 8) + | 89 | round_up(htab->map.key_size, 8) + |
90 | htab->map.value_size; | 90 | htab->map.value_size; |
91 | |||
92 | htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + | ||
93 | htab->elem_size * htab->map.max_entries, | ||
94 | PAGE_SIZE) >> PAGE_SHIFT; | ||
91 | return &htab->map; | 95 | return &htab->map; |
92 | 96 | ||
93 | free_htab: | 97 | free_htab: |
@@ -230,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
230 | l_new->hash = htab_map_hash(l_new->key, key_size); | 234 | l_new->hash = htab_map_hash(l_new->key, key_size); |
231 | 235 | ||
232 | /* bpf_map_update_elem() can be called in_irq() */ | 236 | /* bpf_map_update_elem() can be called in_irq() */ |
233 | spin_lock_irqsave(&htab->lock, flags); | 237 | raw_spin_lock_irqsave(&htab->lock, flags); |
234 | 238 | ||
235 | head = select_bucket(htab, l_new->hash); | 239 | head = select_bucket(htab, l_new->hash); |
236 | 240 | ||
@@ -266,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
266 | } else { | 270 | } else { |
267 | htab->count++; | 271 | htab->count++; |
268 | } | 272 | } |
269 | spin_unlock_irqrestore(&htab->lock, flags); | 273 | raw_spin_unlock_irqrestore(&htab->lock, flags); |
270 | 274 | ||
271 | return 0; | 275 | return 0; |
272 | err: | 276 | err: |
273 | spin_unlock_irqrestore(&htab->lock, flags); | 277 | raw_spin_unlock_irqrestore(&htab->lock, flags); |
274 | kfree(l_new); | 278 | kfree(l_new); |
275 | return ret; | 279 | return ret; |
276 | } | 280 | } |
@@ -291,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) | |||
291 | 295 | ||
292 | hash = htab_map_hash(key, key_size); | 296 | hash = htab_map_hash(key, key_size); |
293 | 297 | ||
294 | spin_lock_irqsave(&htab->lock, flags); | 298 | raw_spin_lock_irqsave(&htab->lock, flags); |
295 | 299 | ||
296 | head = select_bucket(htab, hash); | 300 | head = select_bucket(htab, hash); |
297 | 301 | ||
@@ -304,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) | |||
304 | ret = 0; | 308 | ret = 0; |
305 | } | 309 | } |
306 | 310 | ||
307 | spin_unlock_irqrestore(&htab->lock, flags); | 311 | raw_spin_unlock_irqrestore(&htab->lock, flags); |
308 | return ret; | 312 | return ret; |
309 | } | 313 | } |
310 | 314 | ||
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec09421e..4504ca66118d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { | |||
93 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 93 | .arg2_type = ARG_PTR_TO_MAP_KEY, |
94 | }; | 94 | }; |
95 | 95 | ||
96 | static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
97 | { | ||
98 | return prandom_u32(); | ||
99 | } | ||
100 | |||
101 | const struct bpf_func_proto bpf_get_prandom_u32_proto = { | 96 | const struct bpf_func_proto bpf_get_prandom_u32_proto = { |
102 | .func = bpf_get_prandom_u32, | 97 | .func = bpf_user_rnd_u32, |
103 | .gpl_only = false, | 98 | .gpl_only = false, |
104 | .ret_type = RET_INTEGER, | 99 | .ret_type = RET_INTEGER, |
105 | }; | 100 | }; |
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 000000000000..be6d726e31c9
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,387 @@ | |||
1 | /* | ||
2 | * Minimal file system backend for holding eBPF maps and programs, | ||
3 | * used by bpf(2) object pinning. | ||
4 | * | ||
5 | * Authors: | ||
6 | * | ||
7 | * Daniel Borkmann <daniel@iogearbox.net> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * version 2 as published by the Free Software Foundation. | ||
12 | */ | ||
13 | |||
14 | #include <linux/module.h> | ||
15 | #include <linux/magic.h> | ||
16 | #include <linux/major.h> | ||
17 | #include <linux/mount.h> | ||
18 | #include <linux/namei.h> | ||
19 | #include <linux/fs.h> | ||
20 | #include <linux/kdev_t.h> | ||
21 | #include <linux/filter.h> | ||
22 | #include <linux/bpf.h> | ||
23 | |||
24 | enum bpf_type { | ||
25 | BPF_TYPE_UNSPEC = 0, | ||
26 | BPF_TYPE_PROG, | ||
27 | BPF_TYPE_MAP, | ||
28 | }; | ||
29 | |||
30 | static void *bpf_any_get(void *raw, enum bpf_type type) | ||
31 | { | ||
32 | switch (type) { | ||
33 | case BPF_TYPE_PROG: | ||
34 | atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); | ||
35 | break; | ||
36 | case BPF_TYPE_MAP: | ||
37 | atomic_inc(&((struct bpf_map *)raw)->refcnt); | ||
38 | break; | ||
39 | default: | ||
40 | WARN_ON_ONCE(1); | ||
41 | break; | ||
42 | } | ||
43 | |||
44 | return raw; | ||
45 | } | ||
46 | |||
47 | static void bpf_any_put(void *raw, enum bpf_type type) | ||
48 | { | ||
49 | switch (type) { | ||
50 | case BPF_TYPE_PROG: | ||
51 | bpf_prog_put(raw); | ||
52 | break; | ||
53 | case BPF_TYPE_MAP: | ||
54 | bpf_map_put(raw); | ||
55 | break; | ||
56 | default: | ||
57 | WARN_ON_ONCE(1); | ||
58 | break; | ||
59 | } | ||
60 | } | ||
61 | |||
62 | static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) | ||
63 | { | ||
64 | void *raw; | ||
65 | |||
66 | *type = BPF_TYPE_MAP; | ||
67 | raw = bpf_map_get(ufd); | ||
68 | if (IS_ERR(raw)) { | ||
69 | *type = BPF_TYPE_PROG; | ||
70 | raw = bpf_prog_get(ufd); | ||
71 | } | ||
72 | |||
73 | return raw; | ||
74 | } | ||
75 | |||
76 | static const struct inode_operations bpf_dir_iops; | ||
77 | |||
78 | static const struct inode_operations bpf_prog_iops = { }; | ||
79 | static const struct inode_operations bpf_map_iops = { }; | ||
80 | |||
81 | static struct inode *bpf_get_inode(struct super_block *sb, | ||
82 | const struct inode *dir, | ||
83 | umode_t mode) | ||
84 | { | ||
85 | struct inode *inode; | ||
86 | |||
87 | switch (mode & S_IFMT) { | ||
88 | case S_IFDIR: | ||
89 | case S_IFREG: | ||
90 | break; | ||
91 | default: | ||
92 | return ERR_PTR(-EINVAL); | ||
93 | } | ||
94 | |||
95 | inode = new_inode(sb); | ||
96 | if (!inode) | ||
97 | return ERR_PTR(-ENOSPC); | ||
98 | |||
99 | inode->i_ino = get_next_ino(); | ||
100 | inode->i_atime = CURRENT_TIME; | ||
101 | inode->i_mtime = inode->i_atime; | ||
102 | inode->i_ctime = inode->i_atime; | ||
103 | |||
104 | inode_init_owner(inode, dir, mode); | ||
105 | |||
106 | return inode; | ||
107 | } | ||
108 | |||
109 | static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) | ||
110 | { | ||
111 | *type = BPF_TYPE_UNSPEC; | ||
112 | if (inode->i_op == &bpf_prog_iops) | ||
113 | *type = BPF_TYPE_PROG; | ||
114 | else if (inode->i_op == &bpf_map_iops) | ||
115 | *type = BPF_TYPE_MAP; | ||
116 | else | ||
117 | return -EACCES; | ||
118 | |||
119 | return 0; | ||
120 | } | ||
121 | |||
122 | static bool bpf_dname_reserved(const struct dentry *dentry) | ||
123 | { | ||
124 | return strchr(dentry->d_name.name, '.'); | ||
125 | } | ||
126 | |||
127 | static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | ||
128 | { | ||
129 | struct inode *inode; | ||
130 | |||
131 | if (bpf_dname_reserved(dentry)) | ||
132 | return -EPERM; | ||
133 | |||
134 | inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); | ||
135 | if (IS_ERR(inode)) | ||
136 | return PTR_ERR(inode); | ||
137 | |||
138 | inode->i_op = &bpf_dir_iops; | ||
139 | inode->i_fop = &simple_dir_operations; | ||
140 | |||
141 | inc_nlink(inode); | ||
142 | inc_nlink(dir); | ||
143 | |||
144 | d_instantiate(dentry, inode); | ||
145 | dget(dentry); | ||
146 | |||
147 | return 0; | ||
148 | } | ||
149 | |||
150 | static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, | ||
151 | umode_t mode, const struct inode_operations *iops) | ||
152 | { | ||
153 | struct inode *inode; | ||
154 | |||
155 | if (bpf_dname_reserved(dentry)) | ||
156 | return -EPERM; | ||
157 | |||
158 | inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); | ||
159 | if (IS_ERR(inode)) | ||
160 | return PTR_ERR(inode); | ||
161 | |||
162 | inode->i_op = iops; | ||
163 | inode->i_private = dentry->d_fsdata; | ||
164 | |||
165 | d_instantiate(dentry, inode); | ||
166 | dget(dentry); | ||
167 | |||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, | ||
172 | dev_t devt) | ||
173 | { | ||
174 | enum bpf_type type = MINOR(devt); | ||
175 | |||
176 | if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || | ||
177 | dentry->d_fsdata == NULL) | ||
178 | return -EPERM; | ||
179 | |||
180 | switch (type) { | ||
181 | case BPF_TYPE_PROG: | ||
182 | return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); | ||
183 | case BPF_TYPE_MAP: | ||
184 | return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); | ||
185 | default: | ||
186 | return -EPERM; | ||
187 | } | ||
188 | } | ||
189 | |||
190 | static const struct inode_operations bpf_dir_iops = { | ||
191 | .lookup = simple_lookup, | ||
192 | .mknod = bpf_mkobj, | ||
193 | .mkdir = bpf_mkdir, | ||
194 | .rmdir = simple_rmdir, | ||
195 | .unlink = simple_unlink, | ||
196 | }; | ||
197 | |||
198 | static int bpf_obj_do_pin(const struct filename *pathname, void *raw, | ||
199 | enum bpf_type type) | ||
200 | { | ||
201 | struct dentry *dentry; | ||
202 | struct inode *dir; | ||
203 | struct path path; | ||
204 | umode_t mode; | ||
205 | dev_t devt; | ||
206 | int ret; | ||
207 | |||
208 | dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); | ||
209 | if (IS_ERR(dentry)) | ||
210 | return PTR_ERR(dentry); | ||
211 | |||
212 | mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); | ||
213 | devt = MKDEV(UNNAMED_MAJOR, type); | ||
214 | |||
215 | ret = security_path_mknod(&path, dentry, mode, devt); | ||
216 | if (ret) | ||
217 | goto out; | ||
218 | |||
219 | dir = d_inode(path.dentry); | ||
220 | if (dir->i_op != &bpf_dir_iops) { | ||
221 | ret = -EPERM; | ||
222 | goto out; | ||
223 | } | ||
224 | |||
225 | dentry->d_fsdata = raw; | ||
226 | ret = vfs_mknod(dir, dentry, mode, devt); | ||
227 | dentry->d_fsdata = NULL; | ||
228 | out: | ||
229 | done_path_create(&path, dentry); | ||
230 | return ret; | ||
231 | } | ||
232 | |||
233 | int bpf_obj_pin_user(u32 ufd, const char __user *pathname) | ||
234 | { | ||
235 | struct filename *pname; | ||
236 | enum bpf_type type; | ||
237 | void *raw; | ||
238 | int ret; | ||
239 | |||
240 | pname = getname(pathname); | ||
241 | if (IS_ERR(pname)) | ||
242 | return PTR_ERR(pname); | ||
243 | |||
244 | raw = bpf_fd_probe_obj(ufd, &type); | ||
245 | if (IS_ERR(raw)) { | ||
246 | ret = PTR_ERR(raw); | ||
247 | goto out; | ||
248 | } | ||
249 | |||
250 | ret = bpf_obj_do_pin(pname, raw, type); | ||
251 | if (ret != 0) | ||
252 | bpf_any_put(raw, type); | ||
253 | out: | ||
254 | putname(pname); | ||
255 | return ret; | ||
256 | } | ||
257 | |||
258 | static void *bpf_obj_do_get(const struct filename *pathname, | ||
259 | enum bpf_type *type) | ||
260 | { | ||
261 | struct inode *inode; | ||
262 | struct path path; | ||
263 | void *raw; | ||
264 | int ret; | ||
265 | |||
266 | ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path); | ||
267 | if (ret) | ||
268 | return ERR_PTR(ret); | ||
269 | |||
270 | inode = d_backing_inode(path.dentry); | ||
271 | ret = inode_permission(inode, MAY_WRITE); | ||
272 | if (ret) | ||
273 | goto out; | ||
274 | |||
275 | ret = bpf_inode_type(inode, type); | ||
276 | if (ret) | ||
277 | goto out; | ||
278 | |||
279 | raw = bpf_any_get(inode->i_private, *type); | ||
280 | touch_atime(&path); | ||
281 | |||
282 | path_put(&path); | ||
283 | return raw; | ||
284 | out: | ||
285 | path_put(&path); | ||
286 | return ERR_PTR(ret); | ||
287 | } | ||
288 | |||
289 | int bpf_obj_get_user(const char __user *pathname) | ||
290 | { | ||
291 | enum bpf_type type = BPF_TYPE_UNSPEC; | ||
292 | struct filename *pname; | ||
293 | int ret = -ENOENT; | ||
294 | void *raw; | ||
295 | |||
296 | pname = getname(pathname); | ||
297 | if (IS_ERR(pname)) | ||
298 | return PTR_ERR(pname); | ||
299 | |||
300 | raw = bpf_obj_do_get(pname, &type); | ||
301 | if (IS_ERR(raw)) { | ||
302 | ret = PTR_ERR(raw); | ||
303 | goto out; | ||
304 | } | ||
305 | |||
306 | if (type == BPF_TYPE_PROG) | ||
307 | ret = bpf_prog_new_fd(raw); | ||
308 | else if (type == BPF_TYPE_MAP) | ||
309 | ret = bpf_map_new_fd(raw); | ||
310 | else | ||
311 | goto out; | ||
312 | |||
313 | if (ret < 0) | ||
314 | bpf_any_put(raw, type); | ||
315 | out: | ||
316 | putname(pname); | ||
317 | return ret; | ||
318 | } | ||
319 | |||
320 | static void bpf_evict_inode(struct inode *inode) | ||
321 | { | ||
322 | enum bpf_type type; | ||
323 | |||
324 | truncate_inode_pages_final(&inode->i_data); | ||
325 | clear_inode(inode); | ||
326 | |||
327 | if (!bpf_inode_type(inode, &type)) | ||
328 | bpf_any_put(inode->i_private, type); | ||
329 | } | ||
330 | |||
331 | static const struct super_operations bpf_super_ops = { | ||
332 | .statfs = simple_statfs, | ||
333 | .drop_inode = generic_delete_inode, | ||
334 | .evict_inode = bpf_evict_inode, | ||
335 | }; | ||
336 | |||
337 | static int bpf_fill_super(struct super_block *sb, void *data, int silent) | ||
338 | { | ||
339 | static struct tree_descr bpf_rfiles[] = { { "" } }; | ||
340 | struct inode *inode; | ||
341 | int ret; | ||
342 | |||
343 | ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); | ||
344 | if (ret) | ||
345 | return ret; | ||
346 | |||
347 | sb->s_op = &bpf_super_ops; | ||
348 | |||
349 | inode = sb->s_root->d_inode; | ||
350 | inode->i_op = &bpf_dir_iops; | ||
351 | inode->i_mode &= ~S_IALLUGO; | ||
352 | inode->i_mode |= S_ISVTX | S_IRWXUGO; | ||
353 | |||
354 | return 0; | ||
355 | } | ||
356 | |||
357 | static struct dentry *bpf_mount(struct file_system_type *type, int flags, | ||
358 | const char *dev_name, void *data) | ||
359 | { | ||
360 | return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super); | ||
361 | } | ||
362 | |||
363 | static struct file_system_type bpf_fs_type = { | ||
364 | .owner = THIS_MODULE, | ||
365 | .name = "bpf", | ||
366 | .mount = bpf_mount, | ||
367 | .kill_sb = kill_litter_super, | ||
368 | .fs_flags = FS_USERNS_MOUNT, | ||
369 | }; | ||
370 | |||
371 | MODULE_ALIAS_FS("bpf"); | ||
372 | |||
373 | static int __init bpf_init(void) | ||
374 | { | ||
375 | int ret; | ||
376 | |||
377 | ret = sysfs_create_mount_point(fs_kobj, "bpf"); | ||
378 | if (ret) | ||
379 | return ret; | ||
380 | |||
381 | ret = register_filesystem(&bpf_fs_type); | ||
382 | if (ret) | ||
383 | sysfs_remove_mount_point(fs_kobj, "bpf"); | ||
384 | |||
385 | return ret; | ||
386 | } | ||
387 | fs_initcall(bpf_init); | ||
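kernel/bpf/inode.c above is the kernel half of bpf(2) object pinning: a small "bpf" filesystem whose inodes hold references on maps and programs, populated through BPF_OBJ_PIN and reopened through BPF_OBJ_GET (both wired into the syscall in the kernel/bpf/syscall.c hunks below). A rough user-space sketch, assuming the filesystem is mounted at /sys/fs/bpf (the sysfs mount point created in bpf_init()) and that union bpf_attr carries the pathname/bpf_fd fields referenced by bpf_obj_pin_user()/bpf_obj_get_user(); sys_bpf() is an illustrative wrapper, not a libc call.

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* Pin an existing map or prog fd, e.g. under /sys/fs/bpf/my_map.
 * Unused attr fields must stay zero (CHECK_ATTR), and the final path
 * component may not contain '.' (bpf_dname_reserved()).
 */
static int bpf_obj_pin(int fd, const char *pathname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.pathname = (uint64_t)(unsigned long)pathname;
	attr.bpf_fd = fd;

	return sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr));
}

/* Reopen a pinned object later, possibly from another process; returns a new fd. */
static int bpf_obj_get(const char *pathname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.pathname = (uint64_t)(unsigned long)pathname;

	return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
}

The filesystem itself is mounted with something like "mount -t bpf bpf /sys/fs/bpf"; a pinned object outlives the process that created it because the reference is only dropped when the inode goes away in bpf_evict_inode().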
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35bac8e8b071..0d3313d02a7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@ | |||
18 | #include <linux/filter.h> | 18 | #include <linux/filter.h> |
19 | #include <linux/version.h> | 19 | #include <linux/version.h> |
20 | 20 | ||
21 | int sysctl_unprivileged_bpf_disabled __read_mostly; | ||
22 | |||
21 | static LIST_HEAD(bpf_map_types); | 23 | static LIST_HEAD(bpf_map_types); |
22 | 24 | ||
23 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) | 25 | static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) |
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) | |||
44 | list_add(&tl->list_node, &bpf_map_types); | 46 | list_add(&tl->list_node, &bpf_map_types); |
45 | } | 47 | } |
46 | 48 | ||
49 | static int bpf_map_charge_memlock(struct bpf_map *map) | ||
50 | { | ||
51 | struct user_struct *user = get_current_user(); | ||
52 | unsigned long memlock_limit; | ||
53 | |||
54 | memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | ||
55 | |||
56 | atomic_long_add(map->pages, &user->locked_vm); | ||
57 | |||
58 | if (atomic_long_read(&user->locked_vm) > memlock_limit) { | ||
59 | atomic_long_sub(map->pages, &user->locked_vm); | ||
60 | free_uid(user); | ||
61 | return -EPERM; | ||
62 | } | ||
63 | map->user = user; | ||
64 | return 0; | ||
65 | } | ||
66 | |||
67 | static void bpf_map_uncharge_memlock(struct bpf_map *map) | ||
68 | { | ||
69 | struct user_struct *user = map->user; | ||
70 | |||
71 | atomic_long_sub(map->pages, &user->locked_vm); | ||
72 | free_uid(user); | ||
73 | } | ||
74 | |||
47 | /* called from workqueue */ | 75 | /* called from workqueue */ |
48 | static void bpf_map_free_deferred(struct work_struct *work) | 76 | static void bpf_map_free_deferred(struct work_struct *work) |
49 | { | 77 | { |
50 | struct bpf_map *map = container_of(work, struct bpf_map, work); | 78 | struct bpf_map *map = container_of(work, struct bpf_map, work); |
51 | 79 | ||
80 | bpf_map_uncharge_memlock(map); | ||
52 | /* implementation dependent freeing */ | 81 | /* implementation dependent freeing */ |
53 | map->ops->map_free(map); | 82 | map->ops->map_free(map); |
54 | } | 83 | } |
@@ -82,6 +111,12 @@ static const struct file_operations bpf_map_fops = { | |||
82 | .release = bpf_map_release, | 111 | .release = bpf_map_release, |
83 | }; | 112 | }; |
84 | 113 | ||
114 | int bpf_map_new_fd(struct bpf_map *map) | ||
115 | { | ||
116 | return anon_inode_getfd("bpf-map", &bpf_map_fops, map, | ||
117 | O_RDWR | O_CLOEXEC); | ||
118 | } | ||
119 | |||
85 | /* helper macro to check that unused fields 'union bpf_attr' are zero */ | 120 | /* helper macro to check that unused fields 'union bpf_attr' are zero */ |
86 | #define CHECK_ATTR(CMD) \ | 121 | #define CHECK_ATTR(CMD) \ |
87 | memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ | 122 | memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ |
@@ -108,8 +143,11 @@ static int map_create(union bpf_attr *attr) | |||
108 | 143 | ||
109 | atomic_set(&map->refcnt, 1); | 144 | atomic_set(&map->refcnt, 1); |
110 | 145 | ||
111 | err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); | 146 | err = bpf_map_charge_memlock(map); |
147 | if (err) | ||
148 | goto free_map; | ||
112 | 149 | ||
150 | err = bpf_map_new_fd(map); | ||
113 | if (err < 0) | 151 | if (err < 0) |
114 | /* failed to allocate fd */ | 152 | /* failed to allocate fd */ |
115 | goto free_map; | 153 | goto free_map; |
@@ -124,19 +162,29 @@ free_map: | |||
124 | /* if error is returned, fd is released. | 162 | /* if error is returned, fd is released. |
125 | * On success caller should complete fd access with matching fdput() | 163 | * On success caller should complete fd access with matching fdput() |
126 | */ | 164 | */ |
127 | struct bpf_map *bpf_map_get(struct fd f) | 165 | struct bpf_map *__bpf_map_get(struct fd f) |
128 | { | 166 | { |
129 | struct bpf_map *map; | ||
130 | |||
131 | if (!f.file) | 167 | if (!f.file) |
132 | return ERR_PTR(-EBADF); | 168 | return ERR_PTR(-EBADF); |
133 | |||
134 | if (f.file->f_op != &bpf_map_fops) { | 169 | if (f.file->f_op != &bpf_map_fops) { |
135 | fdput(f); | 170 | fdput(f); |
136 | return ERR_PTR(-EINVAL); | 171 | return ERR_PTR(-EINVAL); |
137 | } | 172 | } |
138 | 173 | ||
139 | map = f.file->private_data; | 174 | return f.file->private_data; |
175 | } | ||
176 | |||
177 | struct bpf_map *bpf_map_get(u32 ufd) | ||
178 | { | ||
179 | struct fd f = fdget(ufd); | ||
180 | struct bpf_map *map; | ||
181 | |||
182 | map = __bpf_map_get(f); | ||
183 | if (IS_ERR(map)) | ||
184 | return map; | ||
185 | |||
186 | atomic_inc(&map->refcnt); | ||
187 | fdput(f); | ||
140 | 188 | ||
141 | return map; | 189 | return map; |
142 | } | 190 | } |
@@ -164,7 +212,7 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
164 | return -EINVAL; | 212 | return -EINVAL; |
165 | 213 | ||
166 | f = fdget(ufd); | 214 | f = fdget(ufd); |
167 | map = bpf_map_get(f); | 215 | map = __bpf_map_get(f); |
168 | if (IS_ERR(map)) | 216 | if (IS_ERR(map)) |
169 | return PTR_ERR(map); | 217 | return PTR_ERR(map); |
170 | 218 | ||
@@ -223,7 +271,7 @@ static int map_update_elem(union bpf_attr *attr) | |||
223 | return -EINVAL; | 271 | return -EINVAL; |
224 | 272 | ||
225 | f = fdget(ufd); | 273 | f = fdget(ufd); |
226 | map = bpf_map_get(f); | 274 | map = __bpf_map_get(f); |
227 | if (IS_ERR(map)) | 275 | if (IS_ERR(map)) |
228 | return PTR_ERR(map); | 276 | return PTR_ERR(map); |
229 | 277 | ||
@@ -276,7 +324,7 @@ static int map_delete_elem(union bpf_attr *attr) | |||
276 | return -EINVAL; | 324 | return -EINVAL; |
277 | 325 | ||
278 | f = fdget(ufd); | 326 | f = fdget(ufd); |
279 | map = bpf_map_get(f); | 327 | map = __bpf_map_get(f); |
280 | if (IS_ERR(map)) | 328 | if (IS_ERR(map)) |
281 | return PTR_ERR(map); | 329 | return PTR_ERR(map); |
282 | 330 | ||
@@ -317,7 +365,7 @@ static int map_get_next_key(union bpf_attr *attr) | |||
317 | return -EINVAL; | 365 | return -EINVAL; |
318 | 366 | ||
319 | f = fdget(ufd); | 367 | f = fdget(ufd); |
320 | map = bpf_map_get(f); | 368 | map = __bpf_map_get(f); |
321 | if (IS_ERR(map)) | 369 | if (IS_ERR(map)) |
322 | return PTR_ERR(map); | 370 | return PTR_ERR(map); |
323 | 371 | ||
@@ -402,6 +450,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog) | |||
402 | */ | 450 | */ |
403 | BUG_ON(!prog->aux->ops->get_func_proto); | 451 | BUG_ON(!prog->aux->ops->get_func_proto); |
404 | 452 | ||
453 | if (insn->imm == BPF_FUNC_get_route_realm) | ||
454 | prog->dst_needed = 1; | ||
455 | if (insn->imm == BPF_FUNC_get_prandom_u32) | ||
456 | bpf_user_rnd_init_once(); | ||
405 | if (insn->imm == BPF_FUNC_tail_call) { | 457 | if (insn->imm == BPF_FUNC_tail_call) { |
406 | /* mark bpf_tail_call as different opcode | 458 | /* mark bpf_tail_call as different opcode |
407 | * to avoid conditional branch in | 459 | * to avoid conditional branch in |
@@ -436,29 +488,51 @@ static void free_used_maps(struct bpf_prog_aux *aux) | |||
436 | kfree(aux->used_maps); | 488 | kfree(aux->used_maps); |
437 | } | 489 | } |
438 | 490 | ||
439 | static void __prog_put_rcu(struct rcu_head *rcu) | 491 | static int bpf_prog_charge_memlock(struct bpf_prog *prog) |
492 | { | ||
493 | struct user_struct *user = get_current_user(); | ||
494 | unsigned long memlock_limit; | ||
495 | |||
496 | memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | ||
497 | |||
498 | atomic_long_add(prog->pages, &user->locked_vm); | ||
499 | if (atomic_long_read(&user->locked_vm) > memlock_limit) { | ||
500 | atomic_long_sub(prog->pages, &user->locked_vm); | ||
501 | free_uid(user); | ||
502 | return -EPERM; | ||
503 | } | ||
504 | prog->aux->user = user; | ||
505 | return 0; | ||
506 | } | ||
507 | |||
508 | static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) | ||
509 | { | ||
510 | struct user_struct *user = prog->aux->user; | ||
511 | |||
512 | atomic_long_sub(prog->pages, &user->locked_vm); | ||
513 | free_uid(user); | ||
514 | } | ||
515 | |||
516 | static void __prog_put_common(struct rcu_head *rcu) | ||
440 | { | 517 | { |
441 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); | 518 | struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); |
442 | 519 | ||
443 | free_used_maps(aux); | 520 | free_used_maps(aux); |
521 | bpf_prog_uncharge_memlock(aux->prog); | ||
444 | bpf_prog_free(aux->prog); | 522 | bpf_prog_free(aux->prog); |
445 | } | 523 | } |
446 | 524 | ||
447 | /* version of bpf_prog_put() that is called after a grace period */ | 525 | /* version of bpf_prog_put() that is called after a grace period */ |
448 | void bpf_prog_put_rcu(struct bpf_prog *prog) | 526 | void bpf_prog_put_rcu(struct bpf_prog *prog) |
449 | { | 527 | { |
450 | if (atomic_dec_and_test(&prog->aux->refcnt)) { | 528 | if (atomic_dec_and_test(&prog->aux->refcnt)) |
451 | prog->aux->prog = prog; | 529 | call_rcu(&prog->aux->rcu, __prog_put_common); |
452 | call_rcu(&prog->aux->rcu, __prog_put_rcu); | ||
453 | } | ||
454 | } | 530 | } |
455 | 531 | ||
456 | void bpf_prog_put(struct bpf_prog *prog) | 532 | void bpf_prog_put(struct bpf_prog *prog) |
457 | { | 533 | { |
458 | if (atomic_dec_and_test(&prog->aux->refcnt)) { | 534 | if (atomic_dec_and_test(&prog->aux->refcnt)) |
459 | free_used_maps(prog->aux); | 535 | __prog_put_common(&prog->aux->rcu); |
460 | bpf_prog_free(prog); | ||
461 | } | ||
462 | } | 536 | } |
463 | EXPORT_SYMBOL_GPL(bpf_prog_put); | 537 | EXPORT_SYMBOL_GPL(bpf_prog_put); |
464 | 538 | ||
@@ -474,21 +548,22 @@ static const struct file_operations bpf_prog_fops = { | |||
474 | .release = bpf_prog_release, | 548 | .release = bpf_prog_release, |
475 | }; | 549 | }; |
476 | 550 | ||
477 | static struct bpf_prog *get_prog(struct fd f) | 551 | int bpf_prog_new_fd(struct bpf_prog *prog) |
478 | { | 552 | { |
479 | struct bpf_prog *prog; | 553 | return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, |
554 | O_RDWR | O_CLOEXEC); | ||
555 | } | ||
480 | 556 | ||
557 | static struct bpf_prog *__bpf_prog_get(struct fd f) | ||
558 | { | ||
481 | if (!f.file) | 559 | if (!f.file) |
482 | return ERR_PTR(-EBADF); | 560 | return ERR_PTR(-EBADF); |
483 | |||
484 | if (f.file->f_op != &bpf_prog_fops) { | 561 | if (f.file->f_op != &bpf_prog_fops) { |
485 | fdput(f); | 562 | fdput(f); |
486 | return ERR_PTR(-EINVAL); | 563 | return ERR_PTR(-EINVAL); |
487 | } | 564 | } |
488 | 565 | ||
489 | prog = f.file->private_data; | 566 | return f.file->private_data; |
490 | |||
491 | return prog; | ||
492 | } | 567 | } |
493 | 568 | ||
494 | /* called by sockets/tracing/seccomp before attaching program to an event | 569 | /* called by sockets/tracing/seccomp before attaching program to an event |
@@ -499,13 +574,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd) | |||
499 | struct fd f = fdget(ufd); | 574 | struct fd f = fdget(ufd); |
500 | struct bpf_prog *prog; | 575 | struct bpf_prog *prog; |
501 | 576 | ||
502 | prog = get_prog(f); | 577 | prog = __bpf_prog_get(f); |
503 | |||
504 | if (IS_ERR(prog)) | 578 | if (IS_ERR(prog)) |
505 | return prog; | 579 | return prog; |
506 | 580 | ||
507 | atomic_inc(&prog->aux->refcnt); | 581 | atomic_inc(&prog->aux->refcnt); |
508 | fdput(f); | 582 | fdput(f); |
583 | |||
509 | return prog; | 584 | return prog; |
510 | } | 585 | } |
511 | EXPORT_SYMBOL_GPL(bpf_prog_get); | 586 | EXPORT_SYMBOL_GPL(bpf_prog_get); |
@@ -540,11 +615,18 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
540 | attr->kern_version != LINUX_VERSION_CODE) | 615 | attr->kern_version != LINUX_VERSION_CODE) |
541 | return -EINVAL; | 616 | return -EINVAL; |
542 | 617 | ||
618 | if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) | ||
619 | return -EPERM; | ||
620 | |||
543 | /* plain bpf_prog allocation */ | 621 | /* plain bpf_prog allocation */ |
544 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | 622 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); |
545 | if (!prog) | 623 | if (!prog) |
546 | return -ENOMEM; | 624 | return -ENOMEM; |
547 | 625 | ||
626 | err = bpf_prog_charge_memlock(prog); | ||
627 | if (err) | ||
628 | goto free_prog_nouncharge; | ||
629 | |||
548 | prog->len = attr->insn_cnt; | 630 | prog->len = attr->insn_cnt; |
549 | 631 | ||
550 | err = -EFAULT; | 632 | err = -EFAULT; |
@@ -553,10 +635,10 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
553 | goto free_prog; | 635 | goto free_prog; |
554 | 636 | ||
555 | prog->orig_prog = NULL; | 637 | prog->orig_prog = NULL; |
556 | prog->jited = false; | 638 | prog->jited = 0; |
557 | 639 | ||
558 | atomic_set(&prog->aux->refcnt, 1); | 640 | atomic_set(&prog->aux->refcnt, 1); |
559 | prog->gpl_compatible = is_gpl; | 641 | prog->gpl_compatible = is_gpl ? 1 : 0; |
560 | 642 | ||
561 | /* find program type: socket_filter vs tracing_filter */ | 643 | /* find program type: socket_filter vs tracing_filter */ |
562 | err = find_prog_type(type, prog); | 644 | err = find_prog_type(type, prog); |
@@ -576,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
576 | if (err < 0) | 658 | if (err < 0) |
577 | goto free_used_maps; | 659 | goto free_used_maps; |
578 | 660 | ||
579 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | 661 | err = bpf_prog_new_fd(prog); |
580 | if (err < 0) | 662 | if (err < 0) |
581 | /* failed to allocate fd */ | 663 | /* failed to allocate fd */ |
582 | goto free_used_maps; | 664 | goto free_used_maps; |
@@ -586,20 +668,36 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
586 | free_used_maps: | 668 | free_used_maps: |
587 | free_used_maps(prog->aux); | 669 | free_used_maps(prog->aux); |
588 | free_prog: | 670 | free_prog: |
671 | bpf_prog_uncharge_memlock(prog); | ||
672 | free_prog_nouncharge: | ||
589 | bpf_prog_free(prog); | 673 | bpf_prog_free(prog); |
590 | return err; | 674 | return err; |
591 | } | 675 | } |
592 | 676 | ||
677 | #define BPF_OBJ_LAST_FIELD bpf_fd | ||
678 | |||
679 | static int bpf_obj_pin(const union bpf_attr *attr) | ||
680 | { | ||
681 | if (CHECK_ATTR(BPF_OBJ)) | ||
682 | return -EINVAL; | ||
683 | |||
684 | return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); | ||
685 | } | ||
686 | |||
687 | static int bpf_obj_get(const union bpf_attr *attr) | ||
688 | { | ||
689 | if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) | ||
690 | return -EINVAL; | ||
691 | |||
692 | return bpf_obj_get_user(u64_to_ptr(attr->pathname)); | ||
693 | } | ||
694 | |||
593 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) | 695 | SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) |
594 | { | 696 | { |
595 | union bpf_attr attr = {}; | 697 | union bpf_attr attr = {}; |
596 | int err; | 698 | int err; |
597 | 699 | ||
598 | /* the syscall is limited to root temporarily. This restriction will be | 700 | if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) |
599 | * lifted when security audit is clean. Note that eBPF+tracing must have | ||
600 | * this restriction, since it may pass kernel data to user space | ||
601 | */ | ||
602 | if (!capable(CAP_SYS_ADMIN)) | ||
603 | return -EPERM; | 701 | return -EPERM; |
604 | 702 | ||
605 | if (!access_ok(VERIFY_READ, uattr, 1)) | 703 | if (!access_ok(VERIFY_READ, uattr, 1)) |
@@ -654,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz | |||
654 | case BPF_PROG_LOAD: | 752 | case BPF_PROG_LOAD: |
655 | err = bpf_prog_load(&attr); | 753 | err = bpf_prog_load(&attr); |
656 | break; | 754 | break; |
755 | case BPF_OBJ_PIN: | ||
756 | err = bpf_obj_pin(&attr); | ||
757 | break; | ||
758 | case BPF_OBJ_GET: | ||
759 | err = bpf_obj_get(&attr); | ||
760 | break; | ||
657 | default: | 761 | default: |
658 | err = -EINVAL; | 762 | err = -EINVAL; |
659 | break; | 763 | break; |
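Two user-visible effects of the syscall.c changes above: map and program pages are now charged against the creating user's RLIMIT_MEMLOCK (bpf_map_charge_memlock()/bpf_prog_charge_memlock() fail with -EPERM once user->locked_vm exceeds the limit), and the blanket CAP_SYS_ADMIN requirement is relaxed to the new sysctl_unprivileged_bpf_disabled knob plus a capability check for non-socket-filter program types. A hedged sketch of the usual user-space response, raising the memlock limit before creating maps or loading programs (whether to use RLIM_INFINITY or a bounded value is policy, and raising the hard limit itself needs privilege):

#include <sys/resource.h>
#include <stdio.h>

/* Raise RLIMIT_MEMLOCK so the pages charged by the kernel
 * (map->pages / prog->pages in the hunks above) fit under the limit.
 */
static int bump_memlock_rlimit(void)
{
	struct rlimit r = {
		.rlim_cur = RLIM_INFINITY,
		.rlim_max = RLIM_INFINITY,
	};

	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
		perror("setrlimit(RLIMIT_MEMLOCK)");
		return -1;
	}
	return 0;
}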
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23000d6..c6073056badf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env { | |||
199 | struct verifier_state_list **explored_states; /* search pruning optimization */ | 199 | struct verifier_state_list **explored_states; /* search pruning optimization */ |
200 | struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ | 200 | struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ |
201 | u32 used_map_cnt; /* number of used maps */ | 201 | u32 used_map_cnt; /* number of used maps */ |
202 | bool allow_ptr_leaks; | ||
202 | }; | 203 | }; |
203 | 204 | ||
204 | /* verbose verifier prints what it's seeing | 205 | /* verbose verifier prints what it's seeing |
@@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock); | |||
213 | * verbose() is used to dump the verification trace to the log, so the user | 214 | * verbose() is used to dump the verification trace to the log, so the user |
214 | * can figure out what's wrong with the program | 215 | * can figure out what's wrong with the program |
215 | */ | 216 | */ |
216 | static void verbose(const char *fmt, ...) | 217 | static __printf(1, 2) void verbose(const char *fmt, ...) |
217 | { | 218 | { |
218 | va_list args; | 219 | va_list args; |
219 | 220 | ||
@@ -244,6 +245,7 @@ static const struct { | |||
244 | } func_limit[] = { | 245 | } func_limit[] = { |
245 | {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, | 246 | {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, |
246 | {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, | 247 | {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, |
248 | {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, | ||
247 | }; | 249 | }; |
248 | 250 | ||
249 | static void print_verifier_state(struct verifier_env *env) | 251 | static void print_verifier_state(struct verifier_env *env) |
@@ -538,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size) | |||
538 | return -EINVAL; | 540 | return -EINVAL; |
539 | } | 541 | } |
540 | 542 | ||
543 | static bool is_spillable_regtype(enum bpf_reg_type type) | ||
544 | { | ||
545 | switch (type) { | ||
546 | case PTR_TO_MAP_VALUE: | ||
547 | case PTR_TO_MAP_VALUE_OR_NULL: | ||
548 | case PTR_TO_STACK: | ||
549 | case PTR_TO_CTX: | ||
550 | case FRAME_PTR: | ||
551 | case CONST_PTR_TO_MAP: | ||
552 | return true; | ||
553 | default: | ||
554 | return false; | ||
555 | } | ||
556 | } | ||
557 | |||
541 | /* check_stack_read/write functions track spill/fill of registers, | 558 | /* check_stack_read/write functions track spill/fill of registers, |
542 | * stack boundary and alignment are checked in check_mem_access() | 559 | * stack boundary and alignment are checked in check_mem_access() |
543 | */ | 560 | */ |
@@ -550,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size, | |||
550 | */ | 567 | */ |
551 | 568 | ||
552 | if (value_regno >= 0 && | 569 | if (value_regno >= 0 && |
553 | (state->regs[value_regno].type == PTR_TO_MAP_VALUE || | 570 | is_spillable_regtype(state->regs[value_regno].type)) { |
554 | state->regs[value_regno].type == PTR_TO_STACK || | ||
555 | state->regs[value_regno].type == PTR_TO_CTX)) { | ||
556 | 571 | ||
557 | /* register containing pointer is being spilled into stack */ | 572 | /* register containing pointer is being spilled into stack */ |
558 | if (size != BPF_REG_SIZE) { | 573 | if (size != BPF_REG_SIZE) { |
@@ -643,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size, | |||
643 | return -EACCES; | 658 | return -EACCES; |
644 | } | 659 | } |
645 | 660 | ||
661 | static bool is_pointer_value(struct verifier_env *env, int regno) | ||
662 | { | ||
663 | if (env->allow_ptr_leaks) | ||
664 | return false; | ||
665 | |||
666 | switch (env->cur_state.regs[regno].type) { | ||
667 | case UNKNOWN_VALUE: | ||
668 | case CONST_IMM: | ||
669 | return false; | ||
670 | default: | ||
671 | return true; | ||
672 | } | ||
673 | } | ||
674 | |||
646 | /* check whether memory at (regno + off) is accessible for t = (read | write) | 675 | /* check whether memory at (regno + off) is accessible for t = (read | write) |
647 | * if t==write, value_regno is a register which value is stored into memory | 676 | * if t==write, value_regno is a register which value is stored into memory |
648 | * if t==read, value_regno is a register which will receive the value from memory | 677 | * if t==read, value_regno is a register which will receive the value from memory |
@@ -669,11 +698,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
669 | } | 698 | } |
670 | 699 | ||
671 | if (state->regs[regno].type == PTR_TO_MAP_VALUE) { | 700 | if (state->regs[regno].type == PTR_TO_MAP_VALUE) { |
701 | if (t == BPF_WRITE && value_regno >= 0 && | ||
702 | is_pointer_value(env, value_regno)) { | ||
703 | verbose("R%d leaks addr into map\n", value_regno); | ||
704 | return -EACCES; | ||
705 | } | ||
672 | err = check_map_access(env, regno, off, size); | 706 | err = check_map_access(env, regno, off, size); |
673 | if (!err && t == BPF_READ && value_regno >= 0) | 707 | if (!err && t == BPF_READ && value_regno >= 0) |
674 | mark_reg_unknown_value(state->regs, value_regno); | 708 | mark_reg_unknown_value(state->regs, value_regno); |
675 | 709 | ||
676 | } else if (state->regs[regno].type == PTR_TO_CTX) { | 710 | } else if (state->regs[regno].type == PTR_TO_CTX) { |
711 | if (t == BPF_WRITE && value_regno >= 0 && | ||
712 | is_pointer_value(env, value_regno)) { | ||
713 | verbose("R%d leaks addr into ctx\n", value_regno); | ||
714 | return -EACCES; | ||
715 | } | ||
677 | err = check_ctx_access(env, off, size, t); | 716 | err = check_ctx_access(env, off, size, t); |
678 | if (!err && t == BPF_READ && value_regno >= 0) | 717 | if (!err && t == BPF_READ && value_regno >= 0) |
679 | mark_reg_unknown_value(state->regs, value_regno); | 718 | mark_reg_unknown_value(state->regs, value_regno); |
@@ -684,10 +723,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, | |||
684 | verbose("invalid stack off=%d size=%d\n", off, size); | 723 | verbose("invalid stack off=%d size=%d\n", off, size); |
685 | return -EACCES; | 724 | return -EACCES; |
686 | } | 725 | } |
687 | if (t == BPF_WRITE) | 726 | if (t == BPF_WRITE) { |
727 | if (!env->allow_ptr_leaks && | ||
728 | state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && | ||
729 | size != BPF_REG_SIZE) { | ||
730 | verbose("attempt to corrupt spilled pointer on stack\n"); | ||
731 | return -EACCES; | ||
732 | } | ||
688 | err = check_stack_write(state, off, size, value_regno); | 733 | err = check_stack_write(state, off, size, value_regno); |
689 | else | 734 | } else { |
690 | err = check_stack_read(state, off, size, value_regno); | 735 | err = check_stack_read(state, off, size, value_regno); |
736 | } | ||
691 | } else { | 737 | } else { |
692 | verbose("R%d invalid mem access '%s'\n", | 738 | verbose("R%d invalid mem access '%s'\n", |
693 | regno, reg_type_str[state->regs[regno].type]); | 739 | regno, reg_type_str[state->regs[regno].type]); |
@@ -775,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
775 | return -EACCES; | 821 | return -EACCES; |
776 | } | 822 | } |
777 | 823 | ||
778 | if (arg_type == ARG_ANYTHING) | 824 | if (arg_type == ARG_ANYTHING) { |
825 | if (is_pointer_value(env, regno)) { | ||
826 | verbose("R%d leaks addr into helper function\n", regno); | ||
827 | return -EACCES; | ||
828 | } | ||
779 | return 0; | 829 | return 0; |
830 | } | ||
780 | 831 | ||
781 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | 832 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || |
782 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 833 | arg_type == ARG_PTR_TO_MAP_VALUE) { |
@@ -860,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) | |||
860 | * don't allow any other map type to be passed into | 911 | * don't allow any other map type to be passed into |
861 | * the special func; | 912 | * the special func; |
862 | */ | 913 | */ |
863 | if (bool_map != bool_func) | 914 | if (bool_func && bool_map != bool_func) |
864 | return -EINVAL; | 915 | return -EINVAL; |
865 | } | 916 | } |
866 | 917 | ||
@@ -950,8 +1001,9 @@ static int check_call(struct verifier_env *env, int func_id) | |||
950 | } | 1001 | } |
951 | 1002 | ||
952 | /* check validity of 32-bit and 64-bit arithmetic operations */ | 1003 | /* check validity of 32-bit and 64-bit arithmetic operations */ |
953 | static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) | 1004 | static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) |
954 | { | 1005 | { |
1006 | struct reg_state *regs = env->cur_state.regs; | ||
955 | u8 opcode = BPF_OP(insn->code); | 1007 | u8 opcode = BPF_OP(insn->code); |
956 | int err; | 1008 | int err; |
957 | 1009 | ||
@@ -976,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) | |||
976 | if (err) | 1028 | if (err) |
977 | return err; | 1029 | return err; |
978 | 1030 | ||
1031 | if (is_pointer_value(env, insn->dst_reg)) { | ||
1032 | verbose("R%d pointer arithmetic prohibited\n", | ||
1033 | insn->dst_reg); | ||
1034 | return -EACCES; | ||
1035 | } | ||
1036 | |||
979 | /* check dest operand */ | 1037 | /* check dest operand */ |
980 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | 1038 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); |
981 | if (err) | 1039 | if (err) |
@@ -1012,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) | |||
1012 | */ | 1070 | */ |
1013 | regs[insn->dst_reg] = regs[insn->src_reg]; | 1071 | regs[insn->dst_reg] = regs[insn->src_reg]; |
1014 | } else { | 1072 | } else { |
1073 | if (is_pointer_value(env, insn->src_reg)) { | ||
1074 | verbose("R%d partial copy of pointer\n", | ||
1075 | insn->src_reg); | ||
1076 | return -EACCES; | ||
1077 | } | ||
1015 | regs[insn->dst_reg].type = UNKNOWN_VALUE; | 1078 | regs[insn->dst_reg].type = UNKNOWN_VALUE; |
1016 | regs[insn->dst_reg].map_ptr = NULL; | 1079 | regs[insn->dst_reg].map_ptr = NULL; |
1017 | } | 1080 | } |
@@ -1061,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) | |||
1061 | /* pattern match 'bpf_add Rx, imm' instruction */ | 1124 | /* pattern match 'bpf_add Rx, imm' instruction */ |
1062 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && | 1125 | if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && |
1063 | regs[insn->dst_reg].type == FRAME_PTR && | 1126 | regs[insn->dst_reg].type == FRAME_PTR && |
1064 | BPF_SRC(insn->code) == BPF_K) | 1127 | BPF_SRC(insn->code) == BPF_K) { |
1065 | stack_relative = true; | 1128 | stack_relative = true; |
1129 | } else if (is_pointer_value(env, insn->dst_reg)) { | ||
1130 | verbose("R%d pointer arithmetic prohibited\n", | ||
1131 | insn->dst_reg); | ||
1132 | return -EACCES; | ||
1133 | } else if (BPF_SRC(insn->code) == BPF_X && | ||
1134 | is_pointer_value(env, insn->src_reg)) { | ||
1135 | verbose("R%d pointer arithmetic prohibited\n", | ||
1136 | insn->src_reg); | ||
1137 | return -EACCES; | ||
1138 | } | ||
1066 | 1139 | ||
1067 | /* check dest operand */ | 1140 | /* check dest operand */ |
1068 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); | 1141 | err = check_reg_arg(regs, insn->dst_reg, DST_OP); |
@@ -1101,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
1101 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | 1174 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); |
1102 | if (err) | 1175 | if (err) |
1103 | return err; | 1176 | return err; |
1177 | |||
1178 | if (is_pointer_value(env, insn->src_reg)) { | ||
1179 | verbose("R%d pointer comparison prohibited\n", | ||
1180 | insn->src_reg); | ||
1181 | return -EACCES; | ||
1182 | } | ||
1104 | } else { | 1183 | } else { |
1105 | if (insn->src_reg != BPF_REG_0) { | 1184 | if (insn->src_reg != BPF_REG_0) { |
1106 | verbose("BPF_JMP uses reserved fields\n"); | 1185 | verbose("BPF_JMP uses reserved fields\n"); |
@@ -1155,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env, | |||
1155 | regs[insn->dst_reg].type = CONST_IMM; | 1234 | regs[insn->dst_reg].type = CONST_IMM; |
1156 | regs[insn->dst_reg].imm = 0; | 1235 | regs[insn->dst_reg].imm = 0; |
1157 | } | 1236 | } |
1237 | } else if (is_pointer_value(env, insn->dst_reg)) { | ||
1238 | verbose("R%d pointer comparison prohibited\n", insn->dst_reg); | ||
1239 | return -EACCES; | ||
1158 | } else if (BPF_SRC(insn->code) == BPF_K && | 1240 | } else if (BPF_SRC(insn->code) == BPF_K && |
1159 | (opcode == BPF_JEQ || opcode == BPF_JNE)) { | 1241 | (opcode == BPF_JEQ || opcode == BPF_JNE)) { |
1160 | 1242 | ||
@@ -1658,7 +1740,7 @@ static int do_check(struct verifier_env *env) | |||
1658 | } | 1740 | } |
1659 | 1741 | ||
1660 | if (class == BPF_ALU || class == BPF_ALU64) { | 1742 | if (class == BPF_ALU || class == BPF_ALU64) { |
1661 | err = check_alu_op(regs, insn); | 1743 | err = check_alu_op(env, insn); |
1662 | if (err) | 1744 | if (err) |
1663 | return err; | 1745 | return err; |
1664 | 1746 | ||
@@ -1816,6 +1898,11 @@ static int do_check(struct verifier_env *env) | |||
1816 | if (err) | 1898 | if (err) |
1817 | return err; | 1899 | return err; |
1818 | 1900 | ||
1901 | if (is_pointer_value(env, BPF_REG_0)) { | ||
1902 | verbose("R0 leaks addr as return value\n"); | ||
1903 | return -EACCES; | ||
1904 | } | ||
1905 | |||
1819 | process_bpf_exit: | 1906 | process_bpf_exit: |
1820 | insn_idx = pop_stack(env, &prev_insn_idx); | 1907 | insn_idx = pop_stack(env, &prev_insn_idx); |
1821 | if (insn_idx < 0) { | 1908 | if (insn_idx < 0) { |
@@ -1902,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) | |||
1902 | } | 1989 | } |
1903 | 1990 | ||
1904 | f = fdget(insn->imm); | 1991 | f = fdget(insn->imm); |
1905 | 1992 | map = __bpf_map_get(f); | |
1906 | map = bpf_map_get(f); | ||
1907 | if (IS_ERR(map)) { | 1993 | if (IS_ERR(map)) { |
1908 | verbose("fd %d is not pointing to valid bpf_map\n", | 1994 | verbose("fd %d is not pointing to valid bpf_map\n", |
1909 | insn->imm); | 1995 | insn->imm); |
@@ -2024,7 +2110,7 @@ static int convert_ctx_accesses(struct verifier_env *env) | |||
2024 | 2110 | ||
2025 | cnt = env->prog->aux->ops-> | 2111 | cnt = env->prog->aux->ops-> |
2026 | convert_ctx_access(type, insn->dst_reg, insn->src_reg, | 2112 | convert_ctx_access(type, insn->dst_reg, insn->src_reg, |
2027 | insn->off, insn_buf); | 2113 | insn->off, insn_buf, env->prog); |
2028 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | 2114 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { |
2029 | verbose("bpf verifier is misconfigured\n"); | 2115 | verbose("bpf verifier is misconfigured\n"); |
2030 | return -EINVAL; | 2116 | return -EINVAL; |
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | |||
2144 | if (ret < 0) | 2230 | if (ret < 0) |
2145 | goto skip_full_check; | 2231 | goto skip_full_check; |
2146 | 2232 | ||
2233 | env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); | ||
2234 | |||
2147 | ret = do_check(env); | 2235 | ret = do_check(env); |
2148 | 2236 | ||
2149 | skip_full_check: | 2237 | skip_full_check: |
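
Editor's note: taken together, the verifier changes gate pointer exposure on the loader's capabilities, since env->allow_ptr_leaks is set only for CAP_SYS_ADMIN. As a hypothetical illustration of the new exit check, an unprivileged program that copies the frame pointer into R0 and returns it should now fail to load with "R0 leaks addr as return value":

        struct bpf_insn leak_fp[] = {
                BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),   /* r0 = frame pointer */
                BPF_EXIT_INSN(),                        /* returning a pointer -> -EACCES */
        };
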
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cf0f79f1fc9..f1603c153890 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -45,7 +45,6 @@ | |||
45 | #include <linux/sched.h> | 45 | #include <linux/sched.h> |
46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
47 | #include <linux/spinlock.h> | 47 | #include <linux/spinlock.h> |
48 | #include <linux/rwsem.h> | ||
49 | #include <linux/percpu-rwsem.h> | 48 | #include <linux/percpu-rwsem.h> |
50 | #include <linux/string.h> | 49 | #include <linux/string.h> |
51 | #include <linux/sort.h> | 50 | #include <linux/sort.h> |
@@ -76,7 +75,7 @@ | |||
76 | * cgroup_mutex is the master lock. Any modification to cgroup or its | 75 | * cgroup_mutex is the master lock. Any modification to cgroup or its |
77 | * hierarchy must be performed while holding it. | 76 | * hierarchy must be performed while holding it. |
78 | * | 77 | * |
79 | * css_set_rwsem protects task->cgroups pointer, the list of css_set | 78 | * css_set_lock protects task->cgroups pointer, the list of css_set |
80 | * objects, and the chain of tasks off each css_set. | 79 | * objects, and the chain of tasks off each css_set. |
81 | * | 80 | * |
82 | * These locks are exported if CONFIG_PROVE_RCU so that accessors in | 81 | * These locks are exported if CONFIG_PROVE_RCU so that accessors in |
@@ -84,12 +83,12 @@ | |||
84 | */ | 83 | */ |
85 | #ifdef CONFIG_PROVE_RCU | 84 | #ifdef CONFIG_PROVE_RCU |
86 | DEFINE_MUTEX(cgroup_mutex); | 85 | DEFINE_MUTEX(cgroup_mutex); |
87 | DECLARE_RWSEM(css_set_rwsem); | 86 | DEFINE_SPINLOCK(css_set_lock); |
88 | EXPORT_SYMBOL_GPL(cgroup_mutex); | 87 | EXPORT_SYMBOL_GPL(cgroup_mutex); |
89 | EXPORT_SYMBOL_GPL(css_set_rwsem); | 88 | EXPORT_SYMBOL_GPL(css_set_lock); |
90 | #else | 89 | #else |
91 | static DEFINE_MUTEX(cgroup_mutex); | 90 | static DEFINE_MUTEX(cgroup_mutex); |
92 | static DECLARE_RWSEM(css_set_rwsem); | 91 | static DEFINE_SPINLOCK(css_set_lock); |
93 | #endif | 92 | #endif |
94 | 93 | ||
95 | /* | 94 | /* |
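
Editor's note: the rest of this file repeats one conversion pattern, where every reader and writer of the css_set lists drops the old rwsem calls in favour of the single BH-disabling spinlock. A hedged sketch of the pattern (the function is illustrative, not part of the patch):

        static int example_count_cset_tasks(struct css_set *cset)
        {
                struct task_struct *task;
                int count = 0;

                spin_lock_bh(&css_set_lock);            /* was down_read(&css_set_rwsem) */
                list_for_each_entry(task, &cset->tasks, cg_list)
                        count++;
                spin_unlock_bh(&css_set_lock);          /* was up_read(&css_set_rwsem) */

                return count;
        }
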
@@ -139,6 +138,27 @@ static const char *cgroup_subsys_name[] = { | |||
139 | }; | 138 | }; |
140 | #undef SUBSYS | 139 | #undef SUBSYS |
141 | 140 | ||
141 | /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */ | ||
142 | #define SUBSYS(_x) \ | ||
143 | DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \ | ||
144 | DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \ | ||
145 | EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \ | ||
146 | EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key); | ||
147 | #include <linux/cgroup_subsys.h> | ||
148 | #undef SUBSYS | ||
149 | |||
150 | #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key, | ||
151 | static struct static_key_true *cgroup_subsys_enabled_key[] = { | ||
152 | #include <linux/cgroup_subsys.h> | ||
153 | }; | ||
154 | #undef SUBSYS | ||
155 | |||
156 | #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key, | ||
157 | static struct static_key_true *cgroup_subsys_on_dfl_key[] = { | ||
158 | #include <linux/cgroup_subsys.h> | ||
159 | }; | ||
160 | #undef SUBSYS | ||
161 | |||
142 | /* | 162 | /* |
143 | * The default hierarchy, reserved for the subsystems that are otherwise | 163 | * The default hierarchy, reserved for the subsystems that are otherwise |
144 | * unattached - it never has more than a single cgroup, and all tasks are | 164 | * unattached - it never has more than a single cgroup, and all tasks are |
@@ -153,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); | |||
153 | */ | 173 | */ |
154 | static bool cgrp_dfl_root_visible; | 174 | static bool cgrp_dfl_root_visible; |
155 | 175 | ||
156 | /* | ||
157 | * Set by the boot param of the same name and makes subsystems with NULL | ||
158 | * ->dfl_files to use ->legacy_files on the default hierarchy. | ||
159 | */ | ||
160 | static bool cgroup_legacy_files_on_dfl; | ||
161 | |||
162 | /* some controllers are not supported in the default hierarchy */ | 176 | /* some controllers are not supported in the default hierarchy */ |
163 | static unsigned long cgrp_dfl_root_inhibit_ss_mask; | 177 | static unsigned long cgrp_dfl_root_inhibit_ss_mask; |
164 | 178 | ||
@@ -186,6 +200,7 @@ static u64 css_serial_nr_next = 1; | |||
186 | */ | 200 | */ |
187 | static unsigned long have_fork_callback __read_mostly; | 201 | static unsigned long have_fork_callback __read_mostly; |
188 | static unsigned long have_exit_callback __read_mostly; | 202 | static unsigned long have_exit_callback __read_mostly; |
203 | static unsigned long have_free_callback __read_mostly; | ||
189 | 204 | ||
190 | /* Ditto for the can_fork callback. */ | 205 | /* Ditto for the can_fork callback. */ |
191 | static unsigned long have_canfork_callback __read_mostly; | 206 | static unsigned long have_canfork_callback __read_mostly; |
@@ -195,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[]; | |||
195 | 210 | ||
196 | static int rebind_subsystems(struct cgroup_root *dst_root, | 211 | static int rebind_subsystems(struct cgroup_root *dst_root, |
197 | unsigned long ss_mask); | 212 | unsigned long ss_mask); |
213 | static void css_task_iter_advance(struct css_task_iter *it); | ||
198 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 214 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
199 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | 215 | static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, |
200 | bool visible); | 216 | bool visible); |
201 | static void css_release(struct percpu_ref *ref); | 217 | static void css_release(struct percpu_ref *ref); |
202 | static void kill_css(struct cgroup_subsys_state *css); | 218 | static void kill_css(struct cgroup_subsys_state *css); |
203 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 219 | static int cgroup_addrm_files(struct cgroup_subsys_state *css, |
220 | struct cgroup *cgrp, struct cftype cfts[], | ||
204 | bool is_add); | 221 | bool is_add); |
205 | 222 | ||
223 | /** | ||
224 | * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID | ||
225 | * @ssid: subsys ID of interest | ||
226 | * | ||
227 | * cgroup_subsys_enabled() can only be used with literal subsys names which | ||
228 | * is fine for individual subsystems but unsuitable for cgroup core. This | ||
229 | * is slower static_key_enabled() based test indexed by @ssid. | ||
230 | */ | ||
231 | static bool cgroup_ssid_enabled(int ssid) | ||
232 | { | ||
233 | return static_key_enabled(cgroup_subsys_enabled_key[ssid]); | ||
234 | } | ||
235 | |||
236 | /** | ||
237 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy | ||
238 | * @cgrp: the cgroup of interest | ||
239 | * | ||
240 | * The default hierarchy is the v2 interface of cgroup and this function | ||
241 | * can be used to test whether a cgroup is on the default hierarchy for | ||
242 | * cases where a subsystem should behave differently depending on the | ||
243 | * interface version. | ||
244 | * | ||
245 | * The set of behaviors which change on the default hierarchy are still | ||
246 | * being determined and the mount option is prefixed with __DEVEL__. | ||
247 | * | ||
248 | * List of changed behaviors: | ||
249 | * | ||
250 | * - Mount options "noprefix", "xattr", "clone_children", "release_agent" | ||
251 | * and "name" are disallowed. | ||
252 | * | ||
253 | * - When mounting an existing superblock, mount options should match. | ||
254 | * | ||
255 | * - Remount is disallowed. | ||
256 | * | ||
257 | * - rename(2) is disallowed. | ||
258 | * | ||
259 | * - "tasks" is removed. Everything should be at process granularity. Use | ||
260 | * "cgroup.procs" instead. | ||
261 | * | ||
262 | * - "cgroup.procs" is not sorted. pids will be unique unless they got | ||
263 | * recycled in between reads. | ||
264 | * | ||
265 | * - "release_agent" and "notify_on_release" are removed. Replacement | ||
266 | * notification mechanism will be implemented. | ||
267 | * | ||
268 | * - "cgroup.clone_children" is removed. | ||
269 | * | ||
270 | * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup | ||
271 | * and its descendants contain no task; otherwise, 1. The file also | ||
272 | * generates kernfs notification which can be monitored through poll and | ||
273 | * [di]notify when the value of the file changes. | ||
274 | * | ||
275 | * - cpuset: tasks will be kept in empty cpusets when hotplug happens and | ||
276 | * take masks of ancestors with non-empty cpus/mems, instead of being | ||
277 | * moved to an ancestor. | ||
278 | * | ||
279 | * - cpuset: a task can be moved into an empty cpuset, and again it takes | ||
280 | * masks of ancestors. | ||
281 | * | ||
282 | * - memcg: use_hierarchy is on by default and the cgroup file for the flag | ||
283 | * is not created. | ||
284 | * | ||
285 | * - blkcg: blk-throttle becomes properly hierarchical. | ||
286 | * | ||
287 | * - debug: disallowed on the default hierarchy. | ||
288 | */ | ||
289 | static bool cgroup_on_dfl(const struct cgroup *cgrp) | ||
290 | { | ||
291 | return cgrp->root == &cgrp_dfl_root; | ||
292 | } | ||
293 | |||
206 | /* IDR wrappers which synchronize using cgroup_idr_lock */ | 294 | /* IDR wrappers which synchronize using cgroup_idr_lock */ |
207 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | 295 | static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, |
208 | gfp_t gfp_mask) | 296 | gfp_t gfp_mask) |
@@ -211,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, | |||
211 | 299 | ||
212 | idr_preload(gfp_mask); | 300 | idr_preload(gfp_mask); |
213 | spin_lock_bh(&cgroup_idr_lock); | 301 | spin_lock_bh(&cgroup_idr_lock); |
214 | ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); | 302 | ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); |
215 | spin_unlock_bh(&cgroup_idr_lock); | 303 | spin_unlock_bh(&cgroup_idr_lock); |
216 | idr_preload_end(); | 304 | idr_preload_end(); |
217 | return ret; | 305 | return ret; |
@@ -335,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) | |||
335 | return !(cgrp->self.flags & CSS_ONLINE); | 423 | return !(cgrp->self.flags & CSS_ONLINE); |
336 | } | 424 | } |
337 | 425 | ||
426 | static void cgroup_get(struct cgroup *cgrp) | ||
427 | { | ||
428 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | ||
429 | css_get(&cgrp->self); | ||
430 | } | ||
431 | |||
432 | static bool cgroup_tryget(struct cgroup *cgrp) | ||
433 | { | ||
434 | return css_tryget(&cgrp->self); | ||
435 | } | ||
436 | |||
437 | static void cgroup_put(struct cgroup *cgrp) | ||
438 | { | ||
439 | css_put(&cgrp->self); | ||
440 | } | ||
441 | |||
338 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) | 442 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) |
339 | { | 443 | { |
340 | struct cgroup *cgrp = of->kn->parent->priv; | 444 | struct cgroup *cgrp = of->kn->parent->priv; |
@@ -484,19 +588,31 @@ struct css_set init_css_set = { | |||
484 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), | 588 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
485 | .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), | 589 | .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), |
486 | .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), | 590 | .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), |
591 | .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), | ||
487 | }; | 592 | }; |
488 | 593 | ||
489 | static int css_set_count = 1; /* 1 for init_css_set */ | 594 | static int css_set_count = 1; /* 1 for init_css_set */ |
490 | 595 | ||
491 | /** | 596 | /** |
597 | * css_set_populated - does a css_set contain any tasks? | ||
598 | * @cset: target css_set | ||
599 | */ | ||
600 | static bool css_set_populated(struct css_set *cset) | ||
601 | { | ||
602 | lockdep_assert_held(&css_set_lock); | ||
603 | |||
604 | return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks); | ||
605 | } | ||
606 | |||
607 | /** | ||
492 | * cgroup_update_populated - updated populated count of a cgroup | 608 | * cgroup_update_populated - updated populated count of a cgroup |
493 | * @cgrp: the target cgroup | 609 | * @cgrp: the target cgroup |
494 | * @populated: inc or dec populated count | 610 | * @populated: inc or dec populated count |
495 | * | 611 | * |
496 | * @cgrp is either getting the first task (css_set) or losing the last. | 612 | * One of the css_sets associated with @cgrp is either getting its first |
497 | * Update @cgrp->populated_cnt accordingly. The count is propagated | 613 | * task or losing the last. Update @cgrp->populated_cnt accordingly. The |
498 | * towards root so that a given cgroup's populated_cnt is zero iff the | 614 | * count is propagated towards root so that a given cgroup's populated_cnt |
499 | * cgroup and all its descendants are empty. | 615 | * is zero iff the cgroup and all its descendants don't contain any tasks. |
500 | * | 616 | * |
501 | * @cgrp's interface file "cgroup.populated" is zero if | 617 | * @cgrp's interface file "cgroup.populated" is zero if |
502 | * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt | 618 | * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt |
@@ -506,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */ | |||
506 | */ | 622 | */ |
507 | static void cgroup_update_populated(struct cgroup *cgrp, bool populated) | 623 | static void cgroup_update_populated(struct cgroup *cgrp, bool populated) |
508 | { | 624 | { |
509 | lockdep_assert_held(&css_set_rwsem); | 625 | lockdep_assert_held(&css_set_lock); |
510 | 626 | ||
511 | do { | 627 | do { |
512 | bool trigger; | 628 | bool trigger; |
@@ -519,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) | |||
519 | if (!trigger) | 635 | if (!trigger) |
520 | break; | 636 | break; |
521 | 637 | ||
522 | if (cgrp->populated_kn) | 638 | check_for_release(cgrp); |
523 | kernfs_notify(cgrp->populated_kn); | 639 | cgroup_file_notify(&cgrp->events_file); |
640 | |||
524 | cgrp = cgroup_parent(cgrp); | 641 | cgrp = cgroup_parent(cgrp); |
525 | } while (cgrp); | 642 | } while (cgrp); |
526 | } | 643 | } |
527 | 644 | ||
645 | /** | ||
646 | * css_set_update_populated - update populated state of a css_set | ||
647 | * @cset: target css_set | ||
648 | * @populated: whether @cset is populated or depopulated | ||
649 | * | ||
650 | * @cset is either getting the first task or losing the last. Update the | ||
651 | * ->populated_cnt of all associated cgroups accordingly. | ||
652 | */ | ||
653 | static void css_set_update_populated(struct css_set *cset, bool populated) | ||
654 | { | ||
655 | struct cgrp_cset_link *link; | ||
656 | |||
657 | lockdep_assert_held(&css_set_lock); | ||
658 | |||
659 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) | ||
660 | cgroup_update_populated(link->cgrp, populated); | ||
661 | } | ||
662 | |||
663 | /** | ||
664 | * css_set_move_task - move a task from one css_set to another | ||
665 | * @task: task being moved | ||
666 | * @from_cset: css_set @task currently belongs to (may be NULL) | ||
667 | * @to_cset: new css_set @task is being moved to (may be NULL) | ||
668 | * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks | ||
669 | * | ||
670 | * Move @task from @from_cset to @to_cset. If @task didn't belong to any | ||
671 | * css_set, @from_cset can be NULL. If @task is being disassociated | ||
672 | * instead of moved, @to_cset can be NULL. | ||
673 | * | ||
674 | * This function automatically handles populated_cnt updates and | ||
675 | * css_task_iter adjustments but the caller is responsible for managing | ||
676 | * @from_cset and @to_cset's reference counts. | ||
677 | */ | ||
678 | static void css_set_move_task(struct task_struct *task, | ||
679 | struct css_set *from_cset, struct css_set *to_cset, | ||
680 | bool use_mg_tasks) | ||
681 | { | ||
682 | lockdep_assert_held(&css_set_lock); | ||
683 | |||
684 | if (from_cset) { | ||
685 | struct css_task_iter *it, *pos; | ||
686 | |||
687 | WARN_ON_ONCE(list_empty(&task->cg_list)); | ||
688 | |||
689 | /* | ||
690 | * @task is leaving, advance task iterators which are | ||
691 | * pointing to it so that they can resume at the next | ||
692 | * position. Advancing an iterator might remove it from | ||
693 | * the list, use safe walk. See css_task_iter_advance*() | ||
694 | * for details. | ||
695 | */ | ||
696 | list_for_each_entry_safe(it, pos, &from_cset->task_iters, | ||
697 | iters_node) | ||
698 | if (it->task_pos == &task->cg_list) | ||
699 | css_task_iter_advance(it); | ||
700 | |||
701 | list_del_init(&task->cg_list); | ||
702 | if (!css_set_populated(from_cset)) | ||
703 | css_set_update_populated(from_cset, false); | ||
704 | } else { | ||
705 | WARN_ON_ONCE(!list_empty(&task->cg_list)); | ||
706 | } | ||
707 | |||
708 | if (to_cset) { | ||
709 | /* | ||
710 | * We are synchronized through cgroup_threadgroup_rwsem | ||
711 | * against PF_EXITING setting such that we can't race | ||
712 | * against cgroup_exit() changing the css_set to | ||
713 | * init_css_set and dropping the old one. | ||
714 | */ | ||
715 | WARN_ON_ONCE(task->flags & PF_EXITING); | ||
716 | |||
717 | if (!css_set_populated(to_cset)) | ||
718 | css_set_update_populated(to_cset, true); | ||
719 | rcu_assign_pointer(task->cgroups, to_cset); | ||
720 | list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : | ||
721 | &to_cset->tasks); | ||
722 | } | ||
723 | } | ||
724 | |||
528 | /* | 725 | /* |
529 | * hash table for cgroup groups. This improves the performance to find | 726 | * hash table for cgroup groups. This improves the performance to find |
530 | * an existing css_set. This hash doesn't (currently) take into | 727 | * an existing css_set. This hash doesn't (currently) take into |
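
Editor's note: a hedged usage sketch of the new helper, showing roughly how an exit path is expected to disassociate a task (to_cset == NULL) while the caller keeps responsibility for the css_set reference; the function name is illustrative:

        static void example_detach_task(struct task_struct *task)
        {
                struct css_set *cset;

                spin_lock_bh(&css_set_lock);
                cset = task_css_set(task);              /* stable under css_set_lock */
                css_set_move_task(task, cset, NULL, false);
                spin_unlock_bh(&css_set_lock);

                put_css_set(cset);                      /* drop the task's reference */
        }
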
@@ -552,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset) | |||
552 | struct cgroup_subsys *ss; | 749 | struct cgroup_subsys *ss; |
553 | int ssid; | 750 | int ssid; |
554 | 751 | ||
555 | lockdep_assert_held(&css_set_rwsem); | 752 | lockdep_assert_held(&css_set_lock); |
556 | 753 | ||
557 | if (!atomic_dec_and_test(&cset->refcount)) | 754 | if (!atomic_dec_and_test(&cset->refcount)) |
558 | return; | 755 | return; |
@@ -564,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset) | |||
564 | css_set_count--; | 761 | css_set_count--; |
565 | 762 | ||
566 | list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { | 763 | list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { |
567 | struct cgroup *cgrp = link->cgrp; | ||
568 | |||
569 | list_del(&link->cset_link); | 764 | list_del(&link->cset_link); |
570 | list_del(&link->cgrp_link); | 765 | list_del(&link->cgrp_link); |
571 | 766 | if (cgroup_parent(link->cgrp)) | |
572 | /* @cgrp can't go away while we're holding css_set_rwsem */ | 767 | cgroup_put(link->cgrp); |
573 | if (list_empty(&cgrp->cset_links)) { | ||
574 | cgroup_update_populated(cgrp, false); | ||
575 | check_for_release(cgrp); | ||
576 | } | ||
577 | |||
578 | kfree(link); | 768 | kfree(link); |
579 | } | 769 | } |
580 | 770 | ||
@@ -591,9 +781,9 @@ static void put_css_set(struct css_set *cset) | |||
591 | if (atomic_add_unless(&cset->refcount, -1, 1)) | 781 | if (atomic_add_unless(&cset->refcount, -1, 1)) |
592 | return; | 782 | return; |
593 | 783 | ||
594 | down_write(&css_set_rwsem); | 784 | spin_lock_bh(&css_set_lock); |
595 | put_css_set_locked(cset); | 785 | put_css_set_locked(cset); |
596 | up_write(&css_set_rwsem); | 786 | spin_unlock_bh(&css_set_lock); |
597 | } | 787 | } |
598 | 788 | ||
599 | /* | 789 | /* |
@@ -782,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, | |||
782 | link->cset = cset; | 972 | link->cset = cset; |
783 | link->cgrp = cgrp; | 973 | link->cgrp = cgrp; |
784 | 974 | ||
785 | if (list_empty(&cgrp->cset_links)) | ||
786 | cgroup_update_populated(cgrp, true); | ||
787 | list_move(&link->cset_link, &cgrp->cset_links); | ||
788 | |||
789 | /* | 975 | /* |
790 | * Always add links to the tail of the list so that the list | 976 | * Always add links to the tail of the lists so that the lists are |
791 | * is sorted by order of hierarchy creation | 977 | * in chronological order. |
792 | */ | 978 | */ |
979 | list_move_tail(&link->cset_link, &cgrp->cset_links); | ||
793 | list_add_tail(&link->cgrp_link, &cset->cgrp_links); | 980 | list_add_tail(&link->cgrp_link, &cset->cgrp_links); |
981 | |||
982 | if (cgroup_parent(cgrp)) | ||
983 | cgroup_get(cgrp); | ||
794 | } | 984 | } |
795 | 985 | ||
796 | /** | 986 | /** |
@@ -816,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
816 | 1006 | ||
817 | /* First see if we already have a cgroup group that matches | 1007 | /* First see if we already have a cgroup group that matches |
818 | * the desired set */ | 1008 | * the desired set */ |
819 | down_read(&css_set_rwsem); | 1009 | spin_lock_bh(&css_set_lock); |
820 | cset = find_existing_css_set(old_cset, cgrp, template); | 1010 | cset = find_existing_css_set(old_cset, cgrp, template); |
821 | if (cset) | 1011 | if (cset) |
822 | get_css_set(cset); | 1012 | get_css_set(cset); |
823 | up_read(&css_set_rwsem); | 1013 | spin_unlock_bh(&css_set_lock); |
824 | 1014 | ||
825 | if (cset) | 1015 | if (cset) |
826 | return cset; | 1016 | return cset; |
@@ -841,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
841 | INIT_LIST_HEAD(&cset->mg_tasks); | 1031 | INIT_LIST_HEAD(&cset->mg_tasks); |
842 | INIT_LIST_HEAD(&cset->mg_preload_node); | 1032 | INIT_LIST_HEAD(&cset->mg_preload_node); |
843 | INIT_LIST_HEAD(&cset->mg_node); | 1033 | INIT_LIST_HEAD(&cset->mg_node); |
1034 | INIT_LIST_HEAD(&cset->task_iters); | ||
844 | INIT_HLIST_NODE(&cset->hlist); | 1035 | INIT_HLIST_NODE(&cset->hlist); |
845 | 1036 | ||
846 | /* Copy the set of subsystem state objects generated in | 1037 | /* Copy the set of subsystem state objects generated in |
847 | * find_existing_css_set() */ | 1038 | * find_existing_css_set() */ |
848 | memcpy(cset->subsys, template, sizeof(cset->subsys)); | 1039 | memcpy(cset->subsys, template, sizeof(cset->subsys)); |
849 | 1040 | ||
850 | down_write(&css_set_rwsem); | 1041 | spin_lock_bh(&css_set_lock); |
851 | /* Add reference counts and links from the new css_set. */ | 1042 | /* Add reference counts and links from the new css_set. */ |
852 | list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { | 1043 | list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { |
853 | struct cgroup *c = link->cgrp; | 1044 | struct cgroup *c = link->cgrp; |
@@ -869,7 +1060,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
869 | list_add_tail(&cset->e_cset_node[ssid], | 1060 | list_add_tail(&cset->e_cset_node[ssid], |
870 | &cset->subsys[ssid]->cgroup->e_csets[ssid]); | 1061 | &cset->subsys[ssid]->cgroup->e_csets[ssid]); |
871 | 1062 | ||
872 | up_write(&css_set_rwsem); | 1063 | spin_unlock_bh(&css_set_lock); |
873 | 1064 | ||
874 | return cset; | 1065 | return cset; |
875 | } | 1066 | } |
@@ -933,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root) | |||
933 | * Release all the links from cset_links to this hierarchy's | 1124 | * Release all the links from cset_links to this hierarchy's |
934 | * root cgroup | 1125 | * root cgroup |
935 | */ | 1126 | */ |
936 | down_write(&css_set_rwsem); | 1127 | spin_lock_bh(&css_set_lock); |
937 | 1128 | ||
938 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { | 1129 | list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { |
939 | list_del(&link->cset_link); | 1130 | list_del(&link->cset_link); |
940 | list_del(&link->cgrp_link); | 1131 | list_del(&link->cgrp_link); |
941 | kfree(link); | 1132 | kfree(link); |
942 | } | 1133 | } |
943 | up_write(&css_set_rwsem); | 1134 | |
1135 | spin_unlock_bh(&css_set_lock); | ||
944 | 1136 | ||
945 | if (!list_empty(&root->root_list)) { | 1137 | if (!list_empty(&root->root_list)) { |
946 | list_del(&root->root_list); | 1138 | list_del(&root->root_list); |
@@ -962,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, | |||
962 | struct cgroup *res = NULL; | 1154 | struct cgroup *res = NULL; |
963 | 1155 | ||
964 | lockdep_assert_held(&cgroup_mutex); | 1156 | lockdep_assert_held(&cgroup_mutex); |
965 | lockdep_assert_held(&css_set_rwsem); | 1157 | lockdep_assert_held(&css_set_lock); |
966 | 1158 | ||
967 | if (cset == &init_css_set) { | 1159 | if (cset == &init_css_set) { |
968 | res = &root->cgrp; | 1160 | res = &root->cgrp; |
@@ -985,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, | |||
985 | 1177 | ||
986 | /* | 1178 | /* |
987 | * Return the cgroup for "task" from the given hierarchy. Must be | 1179 | * Return the cgroup for "task" from the given hierarchy. Must be |
988 | * called with cgroup_mutex and css_set_rwsem held. | 1180 | * called with cgroup_mutex and css_set_lock held. |
989 | */ | 1181 | */ |
990 | static struct cgroup *task_cgroup_from_root(struct task_struct *task, | 1182 | static struct cgroup *task_cgroup_from_root(struct task_struct *task, |
991 | struct cgroup_root *root) | 1183 | struct cgroup_root *root) |
@@ -1024,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, | |||
1024 | * update of a tasks cgroup pointer by cgroup_attach_task() | 1216 | * update of a tasks cgroup pointer by cgroup_attach_task() |
1025 | */ | 1217 | */ |
1026 | 1218 | ||
1027 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); | ||
1028 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; | 1219 | static struct kernfs_syscall_ops cgroup_kf_syscall_ops; |
1029 | static const struct file_operations proc_cgroupstats_operations; | 1220 | static const struct file_operations proc_cgroupstats_operations; |
1030 | 1221 | ||
@@ -1047,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, | |||
1047 | * cgroup_file_mode - deduce file mode of a control file | 1238 | * cgroup_file_mode - deduce file mode of a control file |
1048 | * @cft: the control file in question | 1239 | * @cft: the control file in question |
1049 | * | 1240 | * |
1050 | * returns cft->mode if ->mode is not 0 | 1241 | * S_IRUGO for read, S_IWUSR for write. |
1051 | * returns S_IRUGO|S_IWUSR if it has both a read and a write handler | ||
1052 | * returns S_IRUGO if it has only a read handler | ||
1053 | * returns S_IWUSR if it has only a write hander | ||
1054 | */ | 1242 | */ |
1055 | static umode_t cgroup_file_mode(const struct cftype *cft) | 1243 | static umode_t cgroup_file_mode(const struct cftype *cft) |
1056 | { | 1244 | { |
1057 | umode_t mode = 0; | 1245 | umode_t mode = 0; |
1058 | 1246 | ||
1059 | if (cft->mode) | ||
1060 | return cft->mode; | ||
1061 | |||
1062 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) | 1247 | if (cft->read_u64 || cft->read_s64 || cft->seq_show) |
1063 | mode |= S_IRUGO; | 1248 | mode |= S_IRUGO; |
1064 | 1249 | ||
1065 | if (cft->write_u64 || cft->write_s64 || cft->write) | 1250 | if (cft->write_u64 || cft->write_s64 || cft->write) { |
1066 | mode |= S_IWUSR; | 1251 | if (cft->flags & CFTYPE_WORLD_WRITABLE) |
1252 | mode |= S_IWUGO; | ||
1253 | else | ||
1254 | mode |= S_IWUSR; | ||
1255 | } | ||
1067 | 1256 | ||
1068 | return mode; | 1257 | return mode; |
1069 | } | 1258 | } |
1070 | 1259 | ||
1071 | static void cgroup_get(struct cgroup *cgrp) | ||
1072 | { | ||
1073 | WARN_ON_ONCE(cgroup_is_dead(cgrp)); | ||
1074 | css_get(&cgrp->self); | ||
1075 | } | ||
1076 | |||
1077 | static bool cgroup_tryget(struct cgroup *cgrp) | ||
1078 | { | ||
1079 | return css_tryget(&cgrp->self); | ||
1080 | } | ||
1081 | |||
1082 | static void cgroup_put(struct cgroup *cgrp) | ||
1083 | { | ||
1084 | css_put(&cgrp->self); | ||
1085 | } | ||
1086 | |||
1087 | /** | 1260 | /** |
1088 | * cgroup_calc_child_subsys_mask - calculate child_subsys_mask | 1261 | * cgroup_calc_child_subsys_mask - calculate child_subsys_mask |
1089 | * @cgrp: the target cgroup | 1262 | * @cgrp: the target cgroup |
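
Editor's note: with the ->mode override removed above, a file's permission bits are derived purely from its handlers, and the new CFTYPE_WORLD_WRITABLE flag becomes the only way to widen write access. A hypothetical cftype using it (names and handlers are illustrative):

        static int example_seq_show(struct seq_file *sf, void *v);
        static ssize_t example_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, loff_t off);

        static struct cftype example_files[] = {
                {
                        .name = "example.value",
                        .flags = CFTYPE_WORLD_WRITABLE, /* 0666 instead of 0644 */
                        .seq_show = example_seq_show,
                        .write = example_write,
                },
                { }     /* terminating entry */
        };
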
@@ -1224,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
1224 | } | 1397 | } |
1225 | 1398 | ||
1226 | /** | 1399 | /** |
1227 | * cgroup_clear_dir - remove subsys files in a cgroup directory | 1400 | * css_clear_dir - remove subsys files in a cgroup directory |
1228 | * @cgrp: target cgroup | 1401 | * @css: target css |
1229 | * @subsys_mask: mask of the subsystem ids whose files should be removed | 1402 | * @cgrp_override: specify if target cgroup is different from css->cgroup |
1230 | */ | 1403 | */ |
1231 | static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) | 1404 | static void css_clear_dir(struct cgroup_subsys_state *css, |
1405 | struct cgroup *cgrp_override) | ||
1232 | { | 1406 | { |
1233 | struct cgroup_subsys *ss; | 1407 | struct cgroup *cgrp = cgrp_override ?: css->cgroup; |
1234 | int i; | 1408 | struct cftype *cfts; |
1235 | 1409 | ||
1236 | for_each_subsys(ss, i) { | 1410 | list_for_each_entry(cfts, &css->ss->cfts, node) |
1237 | struct cftype *cfts; | 1411 | cgroup_addrm_files(css, cgrp, cfts, false); |
1412 | } | ||
1238 | 1413 | ||
1239 | if (!(subsys_mask & (1 << i))) | 1414 | /** |
1240 | continue; | 1415 | * css_populate_dir - create subsys files in a cgroup directory |
1241 | list_for_each_entry(cfts, &ss->cfts, node) | 1416 | * @css: target css |
1242 | cgroup_addrm_files(cgrp, cfts, false); | 1417 | * @cgrp_override: specify if target cgroup is different from css->cgroup |
1418 | * | ||
1419 | * On failure, no file is added. | ||
1420 | */ | ||
1421 | static int css_populate_dir(struct cgroup_subsys_state *css, | ||
1422 | struct cgroup *cgrp_override) | ||
1423 | { | ||
1424 | struct cgroup *cgrp = cgrp_override ?: css->cgroup; | ||
1425 | struct cftype *cfts, *failed_cfts; | ||
1426 | int ret; | ||
1427 | |||
1428 | if (!css->ss) { | ||
1429 | if (cgroup_on_dfl(cgrp)) | ||
1430 | cfts = cgroup_dfl_base_files; | ||
1431 | else | ||
1432 | cfts = cgroup_legacy_base_files; | ||
1433 | |||
1434 | return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); | ||
1243 | } | 1435 | } |
1436 | |||
1437 | list_for_each_entry(cfts, &css->ss->cfts, node) { | ||
1438 | ret = cgroup_addrm_files(css, cgrp, cfts, true); | ||
1439 | if (ret < 0) { | ||
1440 | failed_cfts = cfts; | ||
1441 | goto err; | ||
1442 | } | ||
1443 | } | ||
1444 | return 0; | ||
1445 | err: | ||
1446 | list_for_each_entry(cfts, &css->ss->cfts, node) { | ||
1447 | if (cfts == failed_cfts) | ||
1448 | break; | ||
1449 | cgroup_addrm_files(css, cgrp, cfts, false); | ||
1450 | } | ||
1451 | return ret; | ||
1244 | } | 1452 | } |
1245 | 1453 | ||
1246 | static int rebind_subsystems(struct cgroup_root *dst_root, | 1454 | static int rebind_subsystems(struct cgroup_root *dst_root, |
1247 | unsigned long ss_mask) | 1455 | unsigned long ss_mask) |
1248 | { | 1456 | { |
1457 | struct cgroup *dcgrp = &dst_root->cgrp; | ||
1249 | struct cgroup_subsys *ss; | 1458 | struct cgroup_subsys *ss; |
1250 | unsigned long tmp_ss_mask; | 1459 | unsigned long tmp_ss_mask; |
1251 | int ssid, i, ret; | 1460 | int ssid, i, ret; |
@@ -1267,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
1267 | if (dst_root == &cgrp_dfl_root) | 1476 | if (dst_root == &cgrp_dfl_root) |
1268 | tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; | 1477 | tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; |
1269 | 1478 | ||
1270 | ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); | 1479 | for_each_subsys_which(ss, ssid, &tmp_ss_mask) { |
1271 | if (ret) { | 1480 | struct cgroup *scgrp = &ss->root->cgrp; |
1272 | if (dst_root != &cgrp_dfl_root) | 1481 | int tssid; |
1273 | return ret; | 1482 | |
1483 | ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); | ||
1484 | if (!ret) | ||
1485 | continue; | ||
1274 | 1486 | ||
1275 | /* | 1487 | /* |
1276 | * Rebinding back to the default root is not allowed to | 1488 | * Rebinding back to the default root is not allowed to |
@@ -1278,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root, | |||
1278 | * be rare. Moving subsystems back and forth even more so. | 1490 | * be rare. Moving subsystems back and forth even more so. |
1279 | * Just warn about it and continue. | 1491 | * Just warn about it and continue. |
1280 | */ | 1492 | */ |
1281 | if (cgrp_dfl_root_visible) { | 1493 | if (dst_root == &cgrp_dfl_root) { |
1282 | pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", | 1494 | if (cgrp_dfl_root_visible) { |
1283 | ret, ss_mask); | 1495 | pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", |
1284 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); | 1496 | ret, ss_mask); |
1497 | pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); | ||
1498 | } | ||
1499 | continue; | ||
1500 | } | ||
1501 | |||
1502 | for_each_subsys_which(ss, tssid, &tmp_ss_mask) { | ||
1503 | if (tssid == ssid) | ||
1504 | break; | ||
1505 | css_clear_dir(cgroup_css(scgrp, ss), dcgrp); | ||
1285 | } | 1506 | } |
1507 | return ret; | ||
1286 | } | 1508 | } |
1287 | 1509 | ||
1288 | /* | 1510 | /* |
1289 | * Nothing can fail from this point on. Remove files for the | 1511 | * Nothing can fail from this point on. Remove files for the |
1290 | * removed subsystems and rebind each subsystem. | 1512 | * removed subsystems and rebind each subsystem. |
1291 | */ | 1513 | */ |
1292 | for_each_subsys_which(ss, ssid, &ss_mask) | ||
1293 | cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); | ||
1294 | |||
1295 | for_each_subsys_which(ss, ssid, &ss_mask) { | 1514 | for_each_subsys_which(ss, ssid, &ss_mask) { |
1296 | struct cgroup_root *src_root; | 1515 | struct cgroup_root *src_root = ss->root; |
1297 | struct cgroup_subsys_state *css; | 1516 | struct cgroup *scgrp = &src_root->cgrp; |
1517 | struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); | ||
1298 | struct css_set *cset; | 1518 | struct css_set *cset; |
1299 | 1519 | ||
1300 | src_root = ss->root; | 1520 | WARN_ON(!css || cgroup_css(dcgrp, ss)); |
1301 | css = cgroup_css(&src_root->cgrp, ss); | ||
1302 | 1521 | ||
1303 | WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); | 1522 | css_clear_dir(css, NULL); |
1304 | 1523 | ||
1305 | RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); | 1524 | RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); |
1306 | rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); | 1525 | rcu_assign_pointer(dcgrp->subsys[ssid], css); |
1307 | ss->root = dst_root; | 1526 | ss->root = dst_root; |
1308 | css->cgroup = &dst_root->cgrp; | 1527 | css->cgroup = dcgrp; |
1309 | 1528 | ||
1310 | down_write(&css_set_rwsem); | 1529 | spin_lock_bh(&css_set_lock); |
1311 | hash_for_each(css_set_table, i, cset, hlist) | 1530 | hash_for_each(css_set_table, i, cset, hlist) |
1312 | list_move_tail(&cset->e_cset_node[ss->id], | 1531 | list_move_tail(&cset->e_cset_node[ss->id], |
1313 | &dst_root->cgrp.e_csets[ss->id]); | 1532 | &dcgrp->e_csets[ss->id]); |
1314 | up_write(&css_set_rwsem); | 1533 | spin_unlock_bh(&css_set_lock); |
1315 | 1534 | ||
1316 | src_root->subsys_mask &= ~(1 << ssid); | 1535 | src_root->subsys_mask &= ~(1 << ssid); |
1317 | src_root->cgrp.subtree_control &= ~(1 << ssid); | 1536 | scgrp->subtree_control &= ~(1 << ssid); |
1318 | cgroup_refresh_child_subsys_mask(&src_root->cgrp); | 1537 | cgroup_refresh_child_subsys_mask(scgrp); |
1319 | 1538 | ||
1320 | /* default hierarchy doesn't enable controllers by default */ | 1539 | /* default hierarchy doesn't enable controllers by default */ |
1321 | dst_root->subsys_mask |= 1 << ssid; | 1540 | dst_root->subsys_mask |= 1 << ssid; |
1322 | if (dst_root != &cgrp_dfl_root) { | 1541 | if (dst_root == &cgrp_dfl_root) { |
1323 | dst_root->cgrp.subtree_control |= 1 << ssid; | 1542 | static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); |
1324 | cgroup_refresh_child_subsys_mask(&dst_root->cgrp); | 1543 | } else { |
1544 | dcgrp->subtree_control |= 1 << ssid; | ||
1545 | cgroup_refresh_child_subsys_mask(dcgrp); | ||
1546 | static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); | ||
1325 | } | 1547 | } |
1326 | 1548 | ||
1327 | if (ss->bind) | 1549 | if (ss->bind) |
1328 | ss->bind(css); | 1550 | ss->bind(css); |
1329 | } | 1551 | } |
1330 | 1552 | ||
1331 | kernfs_activate(dst_root->cgrp.kn); | 1553 | kernfs_activate(dcgrp->kn); |
1332 | return 0; | 1554 | return 0; |
1333 | } | 1555 | } |
1334 | 1556 | ||
@@ -1458,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1458 | for_each_subsys(ss, i) { | 1680 | for_each_subsys(ss, i) { |
1459 | if (strcmp(token, ss->legacy_name)) | 1681 | if (strcmp(token, ss->legacy_name)) |
1460 | continue; | 1682 | continue; |
1461 | if (ss->disabled) | 1683 | if (!cgroup_ssid_enabled(i)) |
1462 | continue; | 1684 | continue; |
1463 | 1685 | ||
1464 | /* Mutually exclusive option 'all' + subsystem name */ | 1686 | /* Mutually exclusive option 'all' + subsystem name */ |
@@ -1489,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1489 | */ | 1711 | */ |
1490 | if (all_ss || (!one_ss && !opts->none && !opts->name)) | 1712 | if (all_ss || (!one_ss && !opts->none && !opts->name)) |
1491 | for_each_subsys(ss, i) | 1713 | for_each_subsys(ss, i) |
1492 | if (!ss->disabled) | 1714 | if (cgroup_ssid_enabled(i)) |
1493 | opts->subsys_mask |= (1 << i); | 1715 | opts->subsys_mask |= (1 << i); |
1494 | 1716 | ||
1495 | /* | 1717 | /* |
@@ -1585,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
1585 | { | 1807 | { |
1586 | struct task_struct *p, *g; | 1808 | struct task_struct *p, *g; |
1587 | 1809 | ||
1588 | down_write(&css_set_rwsem); | 1810 | spin_lock_bh(&css_set_lock); |
1589 | 1811 | ||
1590 | if (use_task_css_set_links) | 1812 | if (use_task_css_set_links) |
1591 | goto out_unlock; | 1813 | goto out_unlock; |
@@ -1615,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void) | |||
1615 | if (!(p->flags & PF_EXITING)) { | 1837 | if (!(p->flags & PF_EXITING)) { |
1616 | struct css_set *cset = task_css_set(p); | 1838 | struct css_set *cset = task_css_set(p); |
1617 | 1839 | ||
1618 | list_add(&p->cg_list, &cset->tasks); | 1840 | if (!css_set_populated(cset)) |
1841 | css_set_update_populated(cset, true); | ||
1842 | list_add_tail(&p->cg_list, &cset->tasks); | ||
1619 | get_css_set(cset); | 1843 | get_css_set(cset); |
1620 | } | 1844 | } |
1621 | spin_unlock_irq(&p->sighand->siglock); | 1845 | spin_unlock_irq(&p->sighand->siglock); |
1622 | } while_each_thread(g, p); | 1846 | } while_each_thread(g, p); |
1623 | read_unlock(&tasklist_lock); | 1847 | read_unlock(&tasklist_lock); |
1624 | out_unlock: | 1848 | out_unlock: |
1625 | up_write(&css_set_rwsem); | 1849 | spin_unlock_bh(&css_set_lock); |
1626 | } | 1850 | } |
1627 | 1851 | ||
1628 | static void init_cgroup_housekeeping(struct cgroup *cgrp) | 1852 | static void init_cgroup_housekeeping(struct cgroup *cgrp) |
@@ -1632,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
1632 | 1856 | ||
1633 | INIT_LIST_HEAD(&cgrp->self.sibling); | 1857 | INIT_LIST_HEAD(&cgrp->self.sibling); |
1634 | INIT_LIST_HEAD(&cgrp->self.children); | 1858 | INIT_LIST_HEAD(&cgrp->self.children); |
1859 | INIT_LIST_HEAD(&cgrp->self.files); | ||
1635 | INIT_LIST_HEAD(&cgrp->cset_links); | 1860 | INIT_LIST_HEAD(&cgrp->cset_links); |
1636 | INIT_LIST_HEAD(&cgrp->pidlists); | 1861 | INIT_LIST_HEAD(&cgrp->pidlists); |
1637 | mutex_init(&cgrp->pidlist_mutex); | 1862 | mutex_init(&cgrp->pidlist_mutex); |
@@ -1669,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1669 | { | 1894 | { |
1670 | LIST_HEAD(tmp_links); | 1895 | LIST_HEAD(tmp_links); |
1671 | struct cgroup *root_cgrp = &root->cgrp; | 1896 | struct cgroup *root_cgrp = &root->cgrp; |
1672 | struct cftype *base_files; | ||
1673 | struct css_set *cset; | 1897 | struct css_set *cset; |
1674 | int i, ret; | 1898 | int i, ret; |
1675 | 1899 | ||
@@ -1686,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1686 | goto out; | 1910 | goto out; |
1687 | 1911 | ||
1688 | /* | 1912 | /* |
1689 | * We're accessing css_set_count without locking css_set_rwsem here, | 1913 | * We're accessing css_set_count without locking css_set_lock here, |
1690 | * but that's OK - it can only be increased by someone holding | 1914 | * but that's OK - it can only be increased by someone holding |
1691 | * cgroup_lock, and that's us. The worst that can happen is that we | 1915 | * cgroup_lock, and that's us. The worst that can happen is that we |
1692 | * have some link structures left over | 1916 | * have some link structures left over |
@@ -1708,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1708 | } | 1932 | } |
1709 | root_cgrp->kn = root->kf_root->kn; | 1933 | root_cgrp->kn = root->kf_root->kn; |
1710 | 1934 | ||
1711 | if (root == &cgrp_dfl_root) | 1935 | ret = css_populate_dir(&root_cgrp->self, NULL); |
1712 | base_files = cgroup_dfl_base_files; | ||
1713 | else | ||
1714 | base_files = cgroup_legacy_base_files; | ||
1715 | |||
1716 | ret = cgroup_addrm_files(root_cgrp, base_files, true); | ||
1717 | if (ret) | 1936 | if (ret) |
1718 | goto destroy_root; | 1937 | goto destroy_root; |
1719 | 1938 | ||
@@ -1733,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) | |||
1733 | * Link the root cgroup in this hierarchy into all the css_set | 1952 | * Link the root cgroup in this hierarchy into all the css_set |
1734 | * objects. | 1953 | * objects. |
1735 | */ | 1954 | */ |
1736 | down_write(&css_set_rwsem); | 1955 | spin_lock_bh(&css_set_lock); |
1737 | hash_for_each(css_set_table, i, cset, hlist) | 1956 | hash_for_each(css_set_table, i, cset, hlist) { |
1738 | link_css_set(&tmp_links, cset, root_cgrp); | 1957 | link_css_set(&tmp_links, cset, root_cgrp); |
1739 | up_write(&css_set_rwsem); | 1958 | if (css_set_populated(cset)) |
1959 | cgroup_update_populated(root_cgrp, true); | ||
1960 | } | ||
1961 | spin_unlock_bh(&css_set_lock); | ||
1740 | 1962 | ||
1741 | BUG_ON(!list_empty(&root_cgrp->self.children)); | 1963 | BUG_ON(!list_empty(&root_cgrp->self.children)); |
1742 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); | 1964 | BUG_ON(atomic_read(&root->nr_cgrps) != 1); |
@@ -1969,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
1969 | char *path = NULL; | 2191 | char *path = NULL; |
1970 | 2192 | ||
1971 | mutex_lock(&cgroup_mutex); | 2193 | mutex_lock(&cgroup_mutex); |
1972 | down_read(&css_set_rwsem); | 2194 | spin_lock_bh(&css_set_lock); |
1973 | 2195 | ||
1974 | root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); | 2196 | root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); |
1975 | 2197 | ||
@@ -1982,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) | |||
1982 | path = buf; | 2204 | path = buf; |
1983 | } | 2205 | } |
1984 | 2206 | ||
1985 | up_read(&css_set_rwsem); | 2207 | spin_unlock_bh(&css_set_lock); |
1986 | mutex_unlock(&cgroup_mutex); | 2208 | mutex_unlock(&cgroup_mutex); |
1987 | return path; | 2209 | return path; |
1988 | } | 2210 | } |
@@ -2010,6 +2232,49 @@ struct cgroup_taskset { | |||
2010 | struct task_struct *cur_task; | 2232 | struct task_struct *cur_task; |
2011 | }; | 2233 | }; |
2012 | 2234 | ||
2235 | #define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \ | ||
2236 | .src_csets = LIST_HEAD_INIT(tset.src_csets), \ | ||
2237 | .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ | ||
2238 | .csets = &tset.src_csets, \ | ||
2239 | } | ||
2240 | |||
2241 | /** | ||
2242 | * cgroup_taskset_add - try to add a migration target task to a taskset | ||
2243 | * @task: target task | ||
2244 | * @tset: target taskset | ||
2245 | * | ||
2246 | * Add @task, which is a migration target, to @tset. This function becomes | ||
2247 | * noop if @task doesn't need to be migrated. @task's css_set should have | ||
2248 | * been added as a migration source and @task->cg_list will be moved from | ||
2249 | * the css_set's tasks list to mg_tasks one. | ||
2250 | */ | ||
2251 | static void cgroup_taskset_add(struct task_struct *task, | ||
2252 | struct cgroup_taskset *tset) | ||
2253 | { | ||
2254 | struct css_set *cset; | ||
2255 | |||
2256 | lockdep_assert_held(&css_set_lock); | ||
2257 | |||
2258 | /* @task either already exited or can't exit until the end */ | ||
2259 | if (task->flags & PF_EXITING) | ||
2260 | return; | ||
2261 | |||
2262 | /* leave @task alone if post_fork() hasn't linked it yet */ | ||
2263 | if (list_empty(&task->cg_list)) | ||
2264 | return; | ||
2265 | |||
2266 | cset = task_css_set(task); | ||
2267 | if (!cset->mg_src_cgrp) | ||
2268 | return; | ||
2269 | |||
2270 | list_move_tail(&task->cg_list, &cset->mg_tasks); | ||
2271 | if (list_empty(&cset->mg_node)) | ||
2272 | list_add_tail(&cset->mg_node, &tset->src_csets); | ||
2273 | if (list_empty(&cset->mg_dst_cset->mg_node)) | ||
2274 | list_move_tail(&cset->mg_dst_cset->mg_node, | ||
2275 | &tset->dst_csets); | ||
2276 | } | ||
2277 | |||
2013 | /** | 2278 | /** |
2014 | * cgroup_taskset_first - reset taskset and return the first task | 2279 | * cgroup_taskset_first - reset taskset and return the first task |
2015 | * @tset: taskset of interest | 2280 | * @tset: taskset of interest |
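
Editor's note: a hedged sketch of how the new taskset helpers are meant to be combined by the migration path. It assumes cgroup_migrate_add_src() and cgroup_migrate_prepare_dst() have already filled in mg_src_cgrp/mg_dst_cset for the task's css_set, and omits error handling:

        static int example_migrate_one(struct task_struct *task, struct cgroup *dst_cgrp)
        {
                struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);

                spin_lock_bh(&css_set_lock);
                cgroup_taskset_add(task, &tset);        /* no-op unless a source was preloaded */
                spin_unlock_bh(&css_set_lock);

                /* runs ->can_attach/->attach for every effective css of dst_cgrp */
                return cgroup_taskset_migrate(&tset, dst_cgrp);
        }
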
@@ -2057,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | |||
2057 | } | 2322 | } |
2058 | 2323 | ||
2059 | /** | 2324 | /** |
2060 | * cgroup_task_migrate - move a task from one cgroup to another. | 2325 | * cgroup_taskset_migrate - migrate a taskset to a cgroup |
2061 | * @old_cgrp: the cgroup @tsk is being migrated from | 2326 | * @tset: target taskset |
2062 | * @tsk: the task being migrated | 2327 | * @dst_cgrp: destination cgroup |
2063 | * @new_cset: the new css_set @tsk is being attached to | ||
2064 | * | 2328 | * |
2065 | * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. | 2329 | * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the |
2330 | * ->can_attach callbacks fails and guarantees that either all or none of | ||
2331 | * the tasks in @tset are migrated. @tset is consumed regardless of | ||
2332 | * success. | ||
2066 | */ | 2333 | */ |
2067 | static void cgroup_task_migrate(struct cgroup *old_cgrp, | 2334 | static int cgroup_taskset_migrate(struct cgroup_taskset *tset, |
2068 | struct task_struct *tsk, | 2335 | struct cgroup *dst_cgrp) |
2069 | struct css_set *new_cset) | ||
2070 | { | 2336 | { |
2071 | struct css_set *old_cset; | 2337 | struct cgroup_subsys_state *css, *failed_css = NULL; |
2072 | 2338 | struct task_struct *task, *tmp_task; | |
2073 | lockdep_assert_held(&cgroup_mutex); | 2339 | struct css_set *cset, *tmp_cset; |
2074 | lockdep_assert_held(&css_set_rwsem); | 2340 | int i, ret; |
2075 | 2341 | ||
2076 | /* | 2342 | /* methods shouldn't be called if no task is actually migrating */ |
2077 | * We are synchronized through cgroup_threadgroup_rwsem against | 2343 | if (list_empty(&tset->src_csets)) |
2078 | * PF_EXITING setting such that we can't race against cgroup_exit() | 2344 | return 0; |
2079 | * changing the css_set to init_css_set and dropping the old one. | ||
2080 | */ | ||
2081 | WARN_ON_ONCE(tsk->flags & PF_EXITING); | ||
2082 | old_cset = task_css_set(tsk); | ||
2083 | 2345 | ||
2084 | get_css_set(new_cset); | 2346 | /* check that we can legitimately attach to the cgroup */ |
2085 | rcu_assign_pointer(tsk->cgroups, new_cset); | 2347 | for_each_e_css(css, i, dst_cgrp) { |
2348 | if (css->ss->can_attach) { | ||
2349 | ret = css->ss->can_attach(css, tset); | ||
2350 | if (ret) { | ||
2351 | failed_css = css; | ||
2352 | goto out_cancel_attach; | ||
2353 | } | ||
2354 | } | ||
2355 | } | ||
2086 | 2356 | ||
2087 | /* | 2357 | /* |
2088 | * Use move_tail so that cgroup_taskset_first() still returns the | 2358 | * Now that we're guaranteed success, proceed to move all tasks to |
2089 | * leader after migration. This works because cgroup_migrate() | 2359 | * the new cgroup. There are no failure cases after here, so this |
2090 | * ensures that the dst_cset of the leader is the first on the | 2360 | * is the commit point. |
2091 | * tset's dst_csets list. | ||
2092 | */ | 2361 | */ |
2093 | list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); | 2362 | spin_lock_bh(&css_set_lock); |
2363 | list_for_each_entry(cset, &tset->src_csets, mg_node) { | ||
2364 | list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { | ||
2365 | struct css_set *from_cset = task_css_set(task); | ||
2366 | struct css_set *to_cset = cset->mg_dst_cset; | ||
2367 | |||
2368 | get_css_set(to_cset); | ||
2369 | css_set_move_task(task, from_cset, to_cset, true); | ||
2370 | put_css_set_locked(from_cset); | ||
2371 | } | ||
2372 | } | ||
2373 | spin_unlock_bh(&css_set_lock); | ||
2094 | 2374 | ||
2095 | /* | 2375 | /* |
2096 | * We just gained a reference on old_cset by taking it from the | 2376 | * Migration is committed, all target tasks are now on dst_csets. |
2097 | * task. As trading it for new_cset is protected by cgroup_mutex, | 2377 | * Nothing is sensitive to fork() after this point. Notify |
2098 | * we're safe to drop it here; it will be freed under RCU. | 2378 | * controllers that migration is complete. |
2099 | */ | 2379 | */ |
2100 | put_css_set_locked(old_cset); | 2380 | tset->csets = &tset->dst_csets; |
2381 | |||
2382 | for_each_e_css(css, i, dst_cgrp) | ||
2383 | if (css->ss->attach) | ||
2384 | css->ss->attach(css, tset); | ||
2385 | |||
2386 | ret = 0; | ||
2387 | goto out_release_tset; | ||
2388 | |||
2389 | out_cancel_attach: | ||
2390 | for_each_e_css(css, i, dst_cgrp) { | ||
2391 | if (css == failed_css) | ||
2392 | break; | ||
2393 | if (css->ss->cancel_attach) | ||
2394 | css->ss->cancel_attach(css, tset); | ||
2395 | } | ||
2396 | out_release_tset: | ||
2397 | spin_lock_bh(&css_set_lock); | ||
2398 | list_splice_init(&tset->dst_csets, &tset->src_csets); | ||
2399 | list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { | ||
2400 | list_splice_tail_init(&cset->mg_tasks, &cset->tasks); | ||
2401 | list_del_init(&cset->mg_node); | ||
2402 | } | ||
2403 | spin_unlock_bh(&css_set_lock); | ||
2404 | return ret; | ||
2101 | } | 2405 | } |
2102 | 2406 | ||
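The new cgroup_taskset_migrate() above concentrates the check/commit/notify sequence in one place: every controller may veto via ->can_attach() before anything moves, the actual css_set moves happen in a single css_set_lock section that cannot fail, and on a veto only the controllers that already approved are unwound. A minimal userspace sketch of that pattern, using a hypothetical controller table rather than real cgroup structures:

#include <stdio.h>

struct toy_controller {
	const char *name;
	int (*can_attach)(void);	/* may veto, returns -errno */
	void (*attach)(void);		/* post-commit notification */
	void (*cancel_attach)(void);	/* undo a successful can_attach */
};

static int cpu_can(void)  { return 0; }
static int mem_can(void)  { return -1; }	/* simulate a veto */
static void note(void) { }

static struct toy_controller ctrls[] = {
	{ "cpu",    cpu_can, note, note },
	{ "memory", mem_can, note, note },
};

static int toy_migrate(void)
{
	int i, n = sizeof(ctrls) / sizeof(ctrls[0]);
	int failed = -1, ret = 0;

	/* phase 1: ask every controller; nothing has been moved yet */
	for (i = 0; i < n; i++) {
		ret = ctrls[i].can_attach();
		if (ret) {
			failed = i;
			goto cancel;
		}
	}

	/* phase 2: commit point - moving tasks here must not fail */
	/* ... move tasks from src to dst css_sets ... */

	/* phase 3: tell every controller the migration happened */
	for (i = 0; i < n; i++)
		ctrls[i].attach();
	return 0;

cancel:
	/* only controllers before the one that vetoed get cancelled */
	for (i = 0; i < failed; i++)
		ctrls[i].cancel_attach();
	return ret;
}

int main(void)
{
	printf("migrate: %d\n", toy_migrate());
	return 0;
}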
2103 | /** | 2407 | /** |
@@ -2113,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
2113 | 2417 | ||
2114 | lockdep_assert_held(&cgroup_mutex); | 2418 | lockdep_assert_held(&cgroup_mutex); |
2115 | 2419 | ||
2116 | down_write(&css_set_rwsem); | 2420 | spin_lock_bh(&css_set_lock); |
2117 | list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { | 2421 | list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { |
2118 | cset->mg_src_cgrp = NULL; | 2422 | cset->mg_src_cgrp = NULL; |
2119 | cset->mg_dst_cset = NULL; | 2423 | cset->mg_dst_cset = NULL; |
2120 | list_del_init(&cset->mg_preload_node); | 2424 | list_del_init(&cset->mg_preload_node); |
2121 | put_css_set_locked(cset); | 2425 | put_css_set_locked(cset); |
2122 | } | 2426 | } |
2123 | up_write(&css_set_rwsem); | 2427 | spin_unlock_bh(&css_set_lock); |
2124 | } | 2428 | } |
2125 | 2429 | ||
2126 | /** | 2430 | /** |
@@ -2146,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, | |||
2146 | struct cgroup *src_cgrp; | 2450 | struct cgroup *src_cgrp; |
2147 | 2451 | ||
2148 | lockdep_assert_held(&cgroup_mutex); | 2452 | lockdep_assert_held(&cgroup_mutex); |
2149 | lockdep_assert_held(&css_set_rwsem); | 2453 | lockdep_assert_held(&css_set_lock); |
2150 | 2454 | ||
2151 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); | 2455 | src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); |
2152 | 2456 | ||
@@ -2235,9 +2539,9 @@ err: | |||
2235 | 2539 | ||
2236 | /** | 2540 | /** |
2237 | * cgroup_migrate - migrate a process or task to a cgroup | 2541 | * cgroup_migrate - migrate a process or task to a cgroup |
2238 | * @cgrp: the destination cgroup | ||
2239 | * @leader: the leader of the process or the task to migrate | 2542 | * @leader: the leader of the process or the task to migrate |
2240 | * @threadgroup: whether @leader points to the whole process or a single task | 2543 | * @threadgroup: whether @leader points to the whole process or a single task |
2544 | * @cgrp: the destination cgroup | ||
2241 | * | 2545 | * |
2242 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a | 2546 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a |
2243 | * process, the caller must be holding cgroup_threadgroup_rwsem. The | 2547 | * process, the caller must be holding cgroup_threadgroup_rwsem. The |
@@ -2251,115 +2555,29 @@ err: | |||
2251 | * decided for all targets by invoking cgroup_migrate_prepare_dst() before | 2555 | * decided for all targets by invoking cgroup_migrate_prepare_dst() before |
2252 | * actually starting migrating. | 2556 | * actually starting migrating. |
2253 | */ | 2557 | */ |
2254 | static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, | 2558 | static int cgroup_migrate(struct task_struct *leader, bool threadgroup, |
2255 | bool threadgroup) | 2559 | struct cgroup *cgrp) |
2256 | { | 2560 | { |
2257 | struct cgroup_taskset tset = { | 2561 | struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); |
2258 | .src_csets = LIST_HEAD_INIT(tset.src_csets), | 2562 | struct task_struct *task; |
2259 | .dst_csets = LIST_HEAD_INIT(tset.dst_csets), | ||
2260 | .csets = &tset.src_csets, | ||
2261 | }; | ||
2262 | struct cgroup_subsys_state *css, *failed_css = NULL; | ||
2263 | struct css_set *cset, *tmp_cset; | ||
2264 | struct task_struct *task, *tmp_task; | ||
2265 | int i, ret; | ||
2266 | 2563 | ||
2267 | /* | 2564 | /* |
2268 | * Prevent freeing of tasks while we take a snapshot. Tasks that are | 2565 | * Prevent freeing of tasks while we take a snapshot. Tasks that are |
2269 | * already PF_EXITING could be freed from underneath us unless we | 2566 | * already PF_EXITING could be freed from underneath us unless we |
2270 | * take an rcu_read_lock. | 2567 | * take an rcu_read_lock. |
2271 | */ | 2568 | */ |
2272 | down_write(&css_set_rwsem); | 2569 | spin_lock_bh(&css_set_lock); |
2273 | rcu_read_lock(); | 2570 | rcu_read_lock(); |
2274 | task = leader; | 2571 | task = leader; |
2275 | do { | 2572 | do { |
2276 | /* @task either already exited or can't exit until the end */ | 2573 | cgroup_taskset_add(task, &tset); |
2277 | if (task->flags & PF_EXITING) | ||
2278 | goto next; | ||
2279 | |||
2280 | /* leave @task alone if post_fork() hasn't linked it yet */ | ||
2281 | if (list_empty(&task->cg_list)) | ||
2282 | goto next; | ||
2283 | |||
2284 | cset = task_css_set(task); | ||
2285 | if (!cset->mg_src_cgrp) | ||
2286 | goto next; | ||
2287 | |||
2288 | /* | ||
2289 | * cgroup_taskset_first() must always return the leader. | ||
2290 | * Take care to avoid disturbing the ordering. | ||
2291 | */ | ||
2292 | list_move_tail(&task->cg_list, &cset->mg_tasks); | ||
2293 | if (list_empty(&cset->mg_node)) | ||
2294 | list_add_tail(&cset->mg_node, &tset.src_csets); | ||
2295 | if (list_empty(&cset->mg_dst_cset->mg_node)) | ||
2296 | list_move_tail(&cset->mg_dst_cset->mg_node, | ||
2297 | &tset.dst_csets); | ||
2298 | next: | ||
2299 | if (!threadgroup) | 2574 | if (!threadgroup) |
2300 | break; | 2575 | break; |
2301 | } while_each_thread(leader, task); | 2576 | } while_each_thread(leader, task); |
2302 | rcu_read_unlock(); | 2577 | rcu_read_unlock(); |
2303 | up_write(&css_set_rwsem); | 2578 | spin_unlock_bh(&css_set_lock); |
2304 | |||
2305 | /* methods shouldn't be called if no task is actually migrating */ | ||
2306 | if (list_empty(&tset.src_csets)) | ||
2307 | return 0; | ||
2308 | |||
2309 | /* check that we can legitimately attach to the cgroup */ | ||
2310 | for_each_e_css(css, i, cgrp) { | ||
2311 | if (css->ss->can_attach) { | ||
2312 | ret = css->ss->can_attach(css, &tset); | ||
2313 | if (ret) { | ||
2314 | failed_css = css; | ||
2315 | goto out_cancel_attach; | ||
2316 | } | ||
2317 | } | ||
2318 | } | ||
2319 | |||
2320 | /* | ||
2321 | * Now that we're guaranteed success, proceed to move all tasks to | ||
2322 | * the new cgroup. There are no failure cases after here, so this | ||
2323 | * is the commit point. | ||
2324 | */ | ||
2325 | down_write(&css_set_rwsem); | ||
2326 | list_for_each_entry(cset, &tset.src_csets, mg_node) { | ||
2327 | list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) | ||
2328 | cgroup_task_migrate(cset->mg_src_cgrp, task, | ||
2329 | cset->mg_dst_cset); | ||
2330 | } | ||
2331 | up_write(&css_set_rwsem); | ||
2332 | |||
2333 | /* | ||
2334 | * Migration is committed, all target tasks are now on dst_csets. | ||
2335 | * Nothing is sensitive to fork() after this point. Notify | ||
2336 | * controllers that migration is complete. | ||
2337 | */ | ||
2338 | tset.csets = &tset.dst_csets; | ||
2339 | 2579 | ||
2340 | for_each_e_css(css, i, cgrp) | 2580 | return cgroup_taskset_migrate(&tset, cgrp); |
2341 | if (css->ss->attach) | ||
2342 | css->ss->attach(css, &tset); | ||
2343 | |||
2344 | ret = 0; | ||
2345 | goto out_release_tset; | ||
2346 | |||
2347 | out_cancel_attach: | ||
2348 | for_each_e_css(css, i, cgrp) { | ||
2349 | if (css == failed_css) | ||
2350 | break; | ||
2351 | if (css->ss->cancel_attach) | ||
2352 | css->ss->cancel_attach(css, &tset); | ||
2353 | } | ||
2354 | out_release_tset: | ||
2355 | down_write(&css_set_rwsem); | ||
2356 | list_splice_init(&tset.dst_csets, &tset.src_csets); | ||
2357 | list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { | ||
2358 | list_splice_tail_init(&cset->mg_tasks, &cset->tasks); | ||
2359 | list_del_init(&cset->mg_node); | ||
2360 | } | ||
2361 | up_write(&css_set_rwsem); | ||
2362 | return ret; | ||
2363 | } | 2581 | } |
2364 | 2582 | ||
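cgroup_migrate() itself is reduced to building the task set: the thread-group walk happens under css_set_lock (now a non-sleeping spinlock) plus RCU, each thread is queued with cgroup_taskset_add(), and all controller callbacks run afterwards in cgroup_taskset_migrate() outside that walk. A rough standalone sketch of the same split, with a toy thread list and a pthread mutex standing in for the kernel primitives:

#include <stdio.h>
#include <pthread.h>

struct toy_task {
	int pid;
	struct toy_task *next_thread;	/* circular group list in the kernel; linear here */
	struct toy_task *mg_next;	/* link on the migration set */
};

static pthread_mutex_t set_lock = PTHREAD_MUTEX_INITIALIZER;

static void taskset_add(struct toy_task *t, struct toy_task **set)
{
	/* append at the tail so the leader stays first */
	while (*set)
		set = &(*set)->mg_next;
	t->mg_next = NULL;
	*set = t;
}

static int taskset_migrate(struct toy_task *set)
{
	/* stands in for the can_attach/commit/attach sequence */
	for (; set; set = set->mg_next)
		printf("moving pid %d\n", set->pid);
	return 0;
}

static int toy_migrate(struct toy_task *leader, int threadgroup)
{
	struct toy_task *set = NULL, *t;

	pthread_mutex_lock(&set_lock);	/* short, non-sleeping section */
	for (t = leader; t; t = t->next_thread) {
		taskset_add(t, &set);
		if (!threadgroup)
			break;
	}
	pthread_mutex_unlock(&set_lock);

	return taskset_migrate(set);
}

int main(void)
{
	struct toy_task t2 = { 12, NULL, NULL };
	struct toy_task t1 = { 11, &t2, NULL };

	return toy_migrate(&t1, 1);
}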
2365 | /** | 2583 | /** |
@@ -2378,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2378 | int ret; | 2596 | int ret; |
2379 | 2597 | ||
2380 | /* look up all src csets */ | 2598 | /* look up all src csets */ |
2381 | down_read(&css_set_rwsem); | 2599 | spin_lock_bh(&css_set_lock); |
2382 | rcu_read_lock(); | 2600 | rcu_read_lock(); |
2383 | task = leader; | 2601 | task = leader; |
2384 | do { | 2602 | do { |
@@ -2388,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, | |||
2388 | break; | 2606 | break; |
2389 | } while_each_thread(leader, task); | 2607 | } while_each_thread(leader, task); |
2390 | rcu_read_unlock(); | 2608 | rcu_read_unlock(); |
2391 | up_read(&css_set_rwsem); | 2609 | spin_unlock_bh(&css_set_lock); |
2392 | 2610 | ||
2393 | /* prepare dst csets and commit */ | 2611 | /* prepare dst csets and commit */ |
2394 | ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); | 2612 | ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); |
2395 | if (!ret) | 2613 | if (!ret) |
2396 | ret = cgroup_migrate(dst_cgrp, leader, threadgroup); | 2614 | ret = cgroup_migrate(leader, threadgroup, dst_cgrp); |
2397 | 2615 | ||
2398 | cgroup_migrate_finish(&preloaded_csets); | 2616 | cgroup_migrate_finish(&preloaded_csets); |
2399 | return ret; | 2617 | return ret; |
@@ -2421,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task, | |||
2421 | struct cgroup *cgrp; | 2639 | struct cgroup *cgrp; |
2422 | struct inode *inode; | 2640 | struct inode *inode; |
2423 | 2641 | ||
2424 | down_read(&css_set_rwsem); | 2642 | spin_lock_bh(&css_set_lock); |
2425 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); | 2643 | cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); |
2426 | up_read(&css_set_rwsem); | 2644 | spin_unlock_bh(&css_set_lock); |
2427 | 2645 | ||
2428 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) | 2646 | while (!cgroup_is_descendant(dst_cgrp, cgrp)) |
2429 | cgrp = cgroup_parent(cgrp); | 2647 | cgrp = cgroup_parent(cgrp); |
2430 | 2648 | ||
2431 | ret = -ENOMEM; | 2649 | ret = -ENOMEM; |
2432 | inode = kernfs_get_inode(sb, cgrp->procs_kn); | 2650 | inode = kernfs_get_inode(sb, cgrp->procs_file.kn); |
2433 | if (inode) { | 2651 | if (inode) { |
2434 | ret = inode_permission(inode, MAY_WRITE); | 2652 | ret = inode_permission(inode, MAY_WRITE); |
2435 | iput(inode); | 2653 | iput(inode); |
@@ -2520,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
2520 | if (root == &cgrp_dfl_root) | 2738 | if (root == &cgrp_dfl_root) |
2521 | continue; | 2739 | continue; |
2522 | 2740 | ||
2523 | down_read(&css_set_rwsem); | 2741 | spin_lock_bh(&css_set_lock); |
2524 | from_cgrp = task_cgroup_from_root(from, root); | 2742 | from_cgrp = task_cgroup_from_root(from, root); |
2525 | up_read(&css_set_rwsem); | 2743 | spin_unlock_bh(&css_set_lock); |
2526 | 2744 | ||
2527 | retval = cgroup_attach_task(from_cgrp, tsk, false); | 2745 | retval = cgroup_attach_task(from_cgrp, tsk, false); |
2528 | if (retval) | 2746 | if (retval) |
@@ -2637,6 +2855,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) | |||
2637 | static int cgroup_update_dfl_csses(struct cgroup *cgrp) | 2855 | static int cgroup_update_dfl_csses(struct cgroup *cgrp) |
2638 | { | 2856 | { |
2639 | LIST_HEAD(preloaded_csets); | 2857 | LIST_HEAD(preloaded_csets); |
2858 | struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); | ||
2640 | struct cgroup_subsys_state *css; | 2859 | struct cgroup_subsys_state *css; |
2641 | struct css_set *src_cset; | 2860 | struct css_set *src_cset; |
2642 | int ret; | 2861 | int ret; |
@@ -2646,7 +2865,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
2646 | percpu_down_write(&cgroup_threadgroup_rwsem); | 2865 | percpu_down_write(&cgroup_threadgroup_rwsem); |
2647 | 2866 | ||
2648 | /* look up all csses currently attached to @cgrp's subtree */ | 2867 | /* look up all csses currently attached to @cgrp's subtree */ |
2649 | down_read(&css_set_rwsem); | 2868 | spin_lock_bh(&css_set_lock); |
2650 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { | 2869 | css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { |
2651 | struct cgrp_cset_link *link; | 2870 | struct cgrp_cset_link *link; |
2652 | 2871 | ||
@@ -2658,57 +2877,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
2658 | cgroup_migrate_add_src(link->cset, cgrp, | 2877 | cgroup_migrate_add_src(link->cset, cgrp, |
2659 | &preloaded_csets); | 2878 | &preloaded_csets); |
2660 | } | 2879 | } |
2661 | up_read(&css_set_rwsem); | 2880 | spin_unlock_bh(&css_set_lock); |
2662 | 2881 | ||
2663 | /* NULL dst indicates self on default hierarchy */ | 2882 | /* NULL dst indicates self on default hierarchy */ |
2664 | ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); | 2883 | ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); |
2665 | if (ret) | 2884 | if (ret) |
2666 | goto out_finish; | 2885 | goto out_finish; |
2667 | 2886 | ||
2887 | spin_lock_bh(&css_set_lock); | ||
2668 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { | 2888 | list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { |
2669 | struct task_struct *last_task = NULL, *task; | 2889 | struct task_struct *task, *ntask; |
2670 | 2890 | ||
2671 | /* src_csets precede dst_csets, break on the first dst_cset */ | 2891 | /* src_csets precede dst_csets, break on the first dst_cset */ |
2672 | if (!src_cset->mg_src_cgrp) | 2892 | if (!src_cset->mg_src_cgrp) |
2673 | break; | 2893 | break; |
2674 | 2894 | ||
2675 | /* | 2895 | /* all tasks in src_csets need to be migrated */ |
2676 | * All tasks in src_cset need to be migrated to the | 2896 | list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) |
2677 | * matching dst_cset. Empty it process by process. We | 2897 | cgroup_taskset_add(task, &tset); |
2678 | * walk tasks but migrate processes. The leader might even | ||
2679 | * belong to a different cset but such src_cset would also | ||
2680 | * be among the target src_csets because the default | ||
2681 | * hierarchy enforces per-process membership. | ||
2682 | */ | ||
2683 | while (true) { | ||
2684 | down_read(&css_set_rwsem); | ||
2685 | task = list_first_entry_or_null(&src_cset->tasks, | ||
2686 | struct task_struct, cg_list); | ||
2687 | if (task) { | ||
2688 | task = task->group_leader; | ||
2689 | WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); | ||
2690 | get_task_struct(task); | ||
2691 | } | ||
2692 | up_read(&css_set_rwsem); | ||
2693 | |||
2694 | if (!task) | ||
2695 | break; | ||
2696 | |||
2697 | /* guard against possible infinite loop */ | ||
2698 | if (WARN(last_task == task, | ||
2699 | "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) | ||
2700 | goto out_finish; | ||
2701 | last_task = task; | ||
2702 | |||
2703 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); | ||
2704 | |||
2705 | put_task_struct(task); | ||
2706 | |||
2707 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) | ||
2708 | goto out_finish; | ||
2709 | } | ||
2710 | } | 2898 | } |
2899 | spin_unlock_bh(&css_set_lock); | ||
2711 | 2900 | ||
2901 | ret = cgroup_taskset_migrate(&tset, cgrp); | ||
2712 | out_finish: | 2902 | out_finish: |
2713 | cgroup_migrate_finish(&preloaded_csets); | 2903 | cgroup_migrate_finish(&preloaded_csets); |
2714 | percpu_up_write(&cgroup_threadgroup_rwsem); | 2904 | percpu_up_write(&cgroup_threadgroup_rwsem); |
@@ -2738,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2738 | if (tok[0] == '\0') | 2928 | if (tok[0] == '\0') |
2739 | continue; | 2929 | continue; |
2740 | for_each_subsys_which(ss, ssid, &tmp_ss_mask) { | 2930 | for_each_subsys_which(ss, ssid, &tmp_ss_mask) { |
2741 | if (ss->disabled || strcmp(tok + 1, ss->name)) | 2931 | if (!cgroup_ssid_enabled(ssid) || |
2932 | strcmp(tok + 1, ss->name)) | ||
2742 | continue; | 2933 | continue; |
2743 | 2934 | ||
2744 | if (*tok == '+') { | 2935 | if (*tok == '+') { |
@@ -2862,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2862 | ret = create_css(child, ss, | 3053 | ret = create_css(child, ss, |
2863 | cgrp->subtree_control & (1 << ssid)); | 3054 | cgrp->subtree_control & (1 << ssid)); |
2864 | else | 3055 | else |
2865 | ret = cgroup_populate_dir(child, 1 << ssid); | 3056 | ret = css_populate_dir(cgroup_css(child, ss), |
3057 | NULL); | ||
2866 | if (ret) | 3058 | if (ret) |
2867 | goto err_undo_css; | 3059 | goto err_undo_css; |
2868 | } | 3060 | } |
@@ -2895,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, | |||
2895 | if (css_disable & (1 << ssid)) { | 3087 | if (css_disable & (1 << ssid)) { |
2896 | kill_css(css); | 3088 | kill_css(css); |
2897 | } else { | 3089 | } else { |
2898 | cgroup_clear_dir(child, 1 << ssid); | 3090 | css_clear_dir(css, NULL); |
2899 | if (ss->css_reset) | 3091 | if (ss->css_reset) |
2900 | ss->css_reset(css); | 3092 | ss->css_reset(css); |
2901 | } | 3093 | } |
@@ -2943,15 +3135,16 @@ err_undo_css: | |||
2943 | if (css_enable & (1 << ssid)) | 3135 | if (css_enable & (1 << ssid)) |
2944 | kill_css(css); | 3136 | kill_css(css); |
2945 | else | 3137 | else |
2946 | cgroup_clear_dir(child, 1 << ssid); | 3138 | css_clear_dir(css, NULL); |
2947 | } | 3139 | } |
2948 | } | 3140 | } |
2949 | goto out_unlock; | 3141 | goto out_unlock; |
2950 | } | 3142 | } |
2951 | 3143 | ||
2952 | static int cgroup_populated_show(struct seq_file *seq, void *v) | 3144 | static int cgroup_events_show(struct seq_file *seq, void *v) |
2953 | { | 3145 | { |
2954 | seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); | 3146 | seq_printf(seq, "populated %d\n", |
3147 | cgroup_is_populated(seq_css(seq)->cgroup)); | ||
2955 | return 0; | 3148 | return 0; |
2956 | } | 3149 | } |
2957 | 3150 | ||
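With cgroup_populated_show() replaced by cgroup_events_show(), the interface file changes from a bare flag ("cgroup.populated" containing 0 or 1) to a key/value event file. Going by the seq_printf() above, reading the new "cgroup.events" file on a cgroup whose subtree still contains live tasks should yield a line like:

	populated 1

The key/value layout leaves room for further event keys; only "populated" is defined by this diff.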
@@ -3094,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) | |||
3094 | return kernfs_setattr(kn, &iattr); | 3287 | return kernfs_setattr(kn, &iattr); |
3095 | } | 3288 | } |
3096 | 3289 | ||
3097 | static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | 3290 | static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, |
3291 | struct cftype *cft) | ||
3098 | { | 3292 | { |
3099 | char name[CGROUP_FILE_NAME_MAX]; | 3293 | char name[CGROUP_FILE_NAME_MAX]; |
3100 | struct kernfs_node *kn; | 3294 | struct kernfs_node *kn; |
@@ -3116,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) | |||
3116 | return ret; | 3310 | return ret; |
3117 | } | 3311 | } |
3118 | 3312 | ||
3119 | if (cft->write == cgroup_procs_write) | 3313 | if (cft->file_offset) { |
3120 | cgrp->procs_kn = kn; | 3314 | struct cgroup_file *cfile = (void *)css + cft->file_offset; |
3121 | else if (cft->seq_show == cgroup_populated_show) | 3315 | |
3122 | cgrp->populated_kn = kn; | 3316 | kernfs_get(kn); |
3317 | cfile->kn = kn; | ||
3318 | list_add(&cfile->node, &css->files); | ||
3319 | } | ||
3320 | |||
3123 | return 0; | 3321 | return 0; |
3124 | } | 3322 | } |
3125 | 3323 | ||
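cgroup_add_file() no longer hard-codes which kernfs node belongs to which well-known file; instead a cftype can carry file_offset, the offset of a struct cgroup_file handle inside the object owning the css, and the created node is stored there (see the cgroup.procs and cgroup.events entries later in this diff). A simplified sketch of the mechanism, indexing from a toy cgroup directly instead of from the css as the kernel does:

#include <stdio.h>
#include <stddef.h>

struct toy_file {			/* like struct cgroup_file */
	void *kn;			/* created kernfs node */
};

struct toy_cgroup {			/* like struct cgroup */
	int id;				/* keeps the handles at non-zero offsets */
	struct toy_file procs_file;
	struct toy_file events_file;
};

struct toy_cftype {			/* like struct cftype */
	const char *name;
	size_t file_offset;		/* 0 means "no handle kept" */
};

static struct toy_cftype files[] = {
	{ "cgroup.procs",  offsetof(struct toy_cgroup, procs_file) },
	{ "cgroup.events", offsetof(struct toy_cgroup, events_file) },
};

static void toy_add_file(struct toy_cgroup *cgrp, struct toy_cftype *cft,
			 void *created_node)
{
	if (cft->file_offset) {
		struct toy_file *f;

		f = (struct toy_file *)((char *)cgrp + cft->file_offset);
		f->kn = created_node;	/* the owner can now poke the file directly */
	}
}

int main(void)
{
	struct toy_cgroup cg = { 1, { NULL }, { NULL } };
	int dummy_procs, dummy_events;

	toy_add_file(&cg, &files[0], &dummy_procs);
	toy_add_file(&cg, &files[1], &dummy_events);
	printf("procs kn stored: %d\n", cg.procs_file.kn == &dummy_procs);
	return 0;
}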
3126 | /** | 3324 | /** |
3127 | * cgroup_addrm_files - add or remove files to a cgroup directory | 3325 | * cgroup_addrm_files - add or remove files to a cgroup directory |
3128 | * @cgrp: the target cgroup | 3326 | * @css: the target css |
3327 | * @cgrp: the target cgroup (usually css->cgroup) | ||
3129 | * @cfts: array of cftypes to be added | 3328 | * @cfts: array of cftypes to be added |
3130 | * @is_add: whether to add or remove | 3329 | * @is_add: whether to add or remove |
3131 | * | 3330 | * |
3132 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. | 3331 | * Depending on @is_add, add or remove files defined by @cfts on @cgrp. |
3133 | * For removals, this function never fails. If addition fails, this | 3332 | * For removals, this function never fails. |
3134 | * function doesn't remove files already added. The caller is responsible | ||
3135 | * for cleaning up. | ||
3136 | */ | 3333 | */ |
3137 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | 3334 | static int cgroup_addrm_files(struct cgroup_subsys_state *css, |
3335 | struct cgroup *cgrp, struct cftype cfts[], | ||
3138 | bool is_add) | 3336 | bool is_add) |
3139 | { | 3337 | { |
3140 | struct cftype *cft; | 3338 | struct cftype *cft, *cft_end = NULL; |
3141 | int ret; | 3339 | int ret; |
3142 | 3340 | ||
3143 | lockdep_assert_held(&cgroup_mutex); | 3341 | lockdep_assert_held(&cgroup_mutex); |
3144 | 3342 | ||
3145 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 3343 | restart: |
3344 | for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { | ||
3146 | /* does cft->flags tell us to skip this file on @cgrp? */ | 3345 | /* does cft->flags tell us to skip this file on @cgrp? */ |
3147 | if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) | 3346 | if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) |
3148 | continue; | 3347 | continue; |
@@ -3154,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], | |||
3154 | continue; | 3353 | continue; |
3155 | 3354 | ||
3156 | if (is_add) { | 3355 | if (is_add) { |
3157 | ret = cgroup_add_file(cgrp, cft); | 3356 | ret = cgroup_add_file(css, cgrp, cft); |
3158 | if (ret) { | 3357 | if (ret) { |
3159 | pr_warn("%s: failed to add %s, err=%d\n", | 3358 | pr_warn("%s: failed to add %s, err=%d\n", |
3160 | __func__, cft->name, ret); | 3359 | __func__, cft->name, ret); |
3161 | return ret; | 3360 | cft_end = cft; |
3361 | is_add = false; | ||
3362 | goto restart; | ||
3162 | } | 3363 | } |
3163 | } else { | 3364 | } else { |
3164 | cgroup_rm_file(cgrp, cft); | 3365 | cgroup_rm_file(cgrp, cft); |
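cgroup_addrm_files() now cleans up after itself: when adding a file fails, cft_end is set to the failing entry, is_add is flipped, and the same loop re-runs in removal mode over everything created so far, so callers no longer need their own error unwinding. The loop below reproduces that shape with toy add/remove helpers in place of kernfs file creation:

#include <stdio.h>

#define NFILES 4

static int added[NFILES];

static int toy_add(int i)
{
	if (i == 2)
		return -1;	/* simulate a creation failure */
	added[i] = 1;
	return 0;
}

static void toy_rm(int i)
{
	added[i] = 0;
}

static int toy_addrm(int is_add)
{
	int i, end = NFILES, ret = 0;

restart:
	for (i = 0; i < end; i++) {
		if (is_add) {
			ret = toy_add(i);
			if (ret) {
				/* re-run the loop removing [0, i) */
				end = i;
				is_add = 0;
				goto restart;
			}
		} else {
			toy_rm(i);
		}
	}
	return ret;
}

int main(void)
{
	int ret = toy_addrm(1);

	printf("ret=%d leftovers=%d%d%d%d\n", ret,
	       added[0], added[1], added[2], added[3]);
	return 0;
}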
@@ -3184,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) | |||
3184 | if (cgroup_is_dead(cgrp)) | 3385 | if (cgroup_is_dead(cgrp)) |
3185 | continue; | 3386 | continue; |
3186 | 3387 | ||
3187 | ret = cgroup_addrm_files(cgrp, cfts, is_add); | 3388 | ret = cgroup_addrm_files(css, cgrp, cfts, is_add); |
3188 | if (ret) | 3389 | if (ret) |
3189 | break; | 3390 | break; |
3190 | } | 3391 | } |
@@ -3296,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
3296 | { | 3497 | { |
3297 | int ret; | 3498 | int ret; |
3298 | 3499 | ||
3299 | if (ss->disabled) | 3500 | if (!cgroup_ssid_enabled(ss->id)) |
3300 | return 0; | 3501 | return 0; |
3301 | 3502 | ||
3302 | if (!cfts || cfts[0].name[0] == '\0') | 3503 | if (!cfts || cfts[0].name[0] == '\0') |
@@ -3346,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) | |||
3346 | { | 3547 | { |
3347 | struct cftype *cft; | 3548 | struct cftype *cft; |
3348 | 3549 | ||
3349 | /* | 3550 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) |
3350 | * If legacy_files_on_dfl, we want to show the legacy files on the | 3551 | cft->flags |= __CFTYPE_NOT_ON_DFL; |
3351 | * dfl hierarchy but iff the target subsystem hasn't been updated | ||
3352 | * for the dfl hierarchy yet. | ||
3353 | */ | ||
3354 | if (!cgroup_legacy_files_on_dfl || | ||
3355 | ss->dfl_cftypes != ss->legacy_cftypes) { | ||
3356 | for (cft = cfts; cft && cft->name[0] != '\0'; cft++) | ||
3357 | cft->flags |= __CFTYPE_NOT_ON_DFL; | ||
3358 | } | ||
3359 | |||
3360 | return cgroup_add_cftypes(ss, cfts); | 3552 | return cgroup_add_cftypes(ss, cfts); |
3361 | } | 3553 | } |
3362 | 3554 | ||
@@ -3371,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) | |||
3371 | int count = 0; | 3563 | int count = 0; |
3372 | struct cgrp_cset_link *link; | 3564 | struct cgrp_cset_link *link; |
3373 | 3565 | ||
3374 | down_read(&css_set_rwsem); | 3566 | spin_lock_bh(&css_set_lock); |
3375 | list_for_each_entry(link, &cgrp->cset_links, cset_link) | 3567 | list_for_each_entry(link, &cgrp->cset_links, cset_link) |
3376 | count += atomic_read(&link->cset->refcount); | 3568 | count += atomic_read(&link->cset->refcount); |
3377 | up_read(&css_set_rwsem); | 3569 | spin_unlock_bh(&css_set_lock); |
3378 | return count; | 3570 | return count; |
3379 | } | 3571 | } |
3380 | 3572 | ||
@@ -3606,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css) | |||
3606 | } | 3798 | } |
3607 | 3799 | ||
3608 | /** | 3800 | /** |
3609 | * css_advance_task_iter - advance a task iterator to the next css_set | 3801 | * css_task_iter_advance_css_set - advance a task iterator to the next css_set |
3610 | * @it: the iterator to advance | 3802 | * @it: the iterator to advance |
3611 | * | 3803 | * |
3612 | * Advance @it to the next css_set to walk. | 3804 | * Advance @it to the next css_set to walk. |
3613 | */ | 3805 | */ |
3614 | static void css_advance_task_iter(struct css_task_iter *it) | 3806 | static void css_task_iter_advance_css_set(struct css_task_iter *it) |
3615 | { | 3807 | { |
3616 | struct list_head *l = it->cset_pos; | 3808 | struct list_head *l = it->cset_pos; |
3617 | struct cgrp_cset_link *link; | 3809 | struct cgrp_cset_link *link; |
3618 | struct css_set *cset; | 3810 | struct css_set *cset; |
3619 | 3811 | ||
3812 | lockdep_assert_held(&css_set_lock); | ||
3813 | |||
3620 | /* Advance to the next non-empty css_set */ | 3814 | /* Advance to the next non-empty css_set */ |
3621 | do { | 3815 | do { |
3622 | l = l->next; | 3816 | l = l->next; |
3623 | if (l == it->cset_head) { | 3817 | if (l == it->cset_head) { |
3624 | it->cset_pos = NULL; | 3818 | it->cset_pos = NULL; |
3819 | it->task_pos = NULL; | ||
3625 | return; | 3820 | return; |
3626 | } | 3821 | } |
3627 | 3822 | ||
@@ -3632,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it) | |||
3632 | link = list_entry(l, struct cgrp_cset_link, cset_link); | 3827 | link = list_entry(l, struct cgrp_cset_link, cset_link); |
3633 | cset = link->cset; | 3828 | cset = link->cset; |
3634 | } | 3829 | } |
3635 | } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); | 3830 | } while (!css_set_populated(cset)); |
3636 | 3831 | ||
3637 | it->cset_pos = l; | 3832 | it->cset_pos = l; |
3638 | 3833 | ||
@@ -3643,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it) | |||
3643 | 3838 | ||
3644 | it->tasks_head = &cset->tasks; | 3839 | it->tasks_head = &cset->tasks; |
3645 | it->mg_tasks_head = &cset->mg_tasks; | 3840 | it->mg_tasks_head = &cset->mg_tasks; |
3841 | |||
3842 | /* | ||
3843 | * We don't keep css_sets locked across iteration steps and thus | ||
3844 | * need to take steps to ensure that iteration can be resumed after | ||
3845 | * the lock is re-acquired. Iteration is performed at two levels - | ||
3846 | * css_sets and tasks in them. | ||
3847 | * | ||
3848 | * Once created, a css_set never leaves its cgroup lists, so a | ||
3849 | * pinned css_set is guaranteed to stay put and we can resume | ||
3850 | * iteration afterwards. | ||
3851 | * | ||
3852 | * Tasks may leave @cset across iteration steps. This is resolved | ||
3853 | * by registering each iterator with the css_set currently being | ||
3854 | * walked and making css_set_move_task() advance iterators whose | ||
3855 | * next task is leaving. | ||
3856 | */ | ||
3857 | if (it->cur_cset) { | ||
3858 | list_del(&it->iters_node); | ||
3859 | put_css_set_locked(it->cur_cset); | ||
3860 | } | ||
3861 | get_css_set(cset); | ||
3862 | it->cur_cset = cset; | ||
3863 | list_add(&it->iters_node, &cset->task_iters); | ||
3864 | } | ||
3865 | |||
3866 | static void css_task_iter_advance(struct css_task_iter *it) | ||
3867 | { | ||
3868 | struct list_head *l = it->task_pos; | ||
3869 | |||
3870 | lockdep_assert_held(&css_set_lock); | ||
3871 | WARN_ON_ONCE(!l); | ||
3872 | |||
3873 | /* | ||
3874 | * Advance iterator to find next entry. cset->tasks is consumed | ||
3875 | * first and then ->mg_tasks. After ->mg_tasks, we move onto the | ||
3876 | * next cset. | ||
3877 | */ | ||
3878 | l = l->next; | ||
3879 | |||
3880 | if (l == it->tasks_head) | ||
3881 | l = it->mg_tasks_head->next; | ||
3882 | |||
3883 | if (l == it->mg_tasks_head) | ||
3884 | css_task_iter_advance_css_set(it); | ||
3885 | else | ||
3886 | it->task_pos = l; | ||
3646 | } | 3887 | } |
3647 | 3888 | ||
3648 | /** | 3889 | /** |
@@ -3654,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it) | |||
3654 | * css_task_iter_next() to walk through the tasks until the function | 3895 | * css_task_iter_next() to walk through the tasks until the function |
3655 | * returns NULL. On completion of iteration, css_task_iter_end() must be | 3896 | * returns NULL. On completion of iteration, css_task_iter_end() must be |
3656 | * called. | 3897 | * called. |
3657 | * | ||
3658 | * Note that this function acquires a lock which is released when the | ||
3659 | * iteration finishes. The caller can't sleep while iteration is in | ||
3660 | * progress. | ||
3661 | */ | 3898 | */ |
3662 | void css_task_iter_start(struct cgroup_subsys_state *css, | 3899 | void css_task_iter_start(struct cgroup_subsys_state *css, |
3663 | struct css_task_iter *it) | 3900 | struct css_task_iter *it) |
3664 | __acquires(css_set_rwsem) | ||
3665 | { | 3901 | { |
3666 | /* no one should try to iterate before mounting cgroups */ | 3902 | /* no one should try to iterate before mounting cgroups */ |
3667 | WARN_ON_ONCE(!use_task_css_set_links); | 3903 | WARN_ON_ONCE(!use_task_css_set_links); |
3668 | 3904 | ||
3669 | down_read(&css_set_rwsem); | 3905 | memset(it, 0, sizeof(*it)); |
3906 | |||
3907 | spin_lock_bh(&css_set_lock); | ||
3670 | 3908 | ||
3671 | it->ss = css->ss; | 3909 | it->ss = css->ss; |
3672 | 3910 | ||
@@ -3677,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
3677 | 3915 | ||
3678 | it->cset_head = it->cset_pos; | 3916 | it->cset_head = it->cset_pos; |
3679 | 3917 | ||
3680 | css_advance_task_iter(it); | 3918 | css_task_iter_advance_css_set(it); |
3919 | |||
3920 | spin_unlock_bh(&css_set_lock); | ||
3681 | } | 3921 | } |
3682 | 3922 | ||
3683 | /** | 3923 | /** |
@@ -3690,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css, | |||
3690 | */ | 3930 | */ |
3691 | struct task_struct *css_task_iter_next(struct css_task_iter *it) | 3931 | struct task_struct *css_task_iter_next(struct css_task_iter *it) |
3692 | { | 3932 | { |
3693 | struct task_struct *res; | 3933 | if (it->cur_task) { |
3694 | struct list_head *l = it->task_pos; | 3934 | put_task_struct(it->cur_task); |
3935 | it->cur_task = NULL; | ||
3936 | } | ||
3695 | 3937 | ||
3696 | /* If the iterator cg is NULL, we have no tasks */ | 3938 | spin_lock_bh(&css_set_lock); |
3697 | if (!it->cset_pos) | ||
3698 | return NULL; | ||
3699 | res = list_entry(l, struct task_struct, cg_list); | ||
3700 | 3939 | ||
3701 | /* | 3940 | if (it->task_pos) { |
3702 | * Advance iterator to find next entry. cset->tasks is consumed | 3941 | it->cur_task = list_entry(it->task_pos, struct task_struct, |
3703 | * first and then ->mg_tasks. After ->mg_tasks, we move onto the | 3942 | cg_list); |
3704 | * next cset. | 3943 | get_task_struct(it->cur_task); |
3705 | */ | 3944 | css_task_iter_advance(it); |
3706 | l = l->next; | 3945 | } |
3707 | 3946 | ||
3708 | if (l == it->tasks_head) | 3947 | spin_unlock_bh(&css_set_lock); |
3709 | l = it->mg_tasks_head->next; | ||
3710 | 3948 | ||
3711 | if (l == it->mg_tasks_head) | 3949 | return it->cur_task; |
3712 | css_advance_task_iter(it); | ||
3713 | else | ||
3714 | it->task_pos = l; | ||
3715 | |||
3716 | return res; | ||
3717 | } | 3950 | } |
3718 | 3951 | ||
3719 | /** | 3952 | /** |
@@ -3723,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
3723 | * Finish task iteration started by css_task_iter_start(). | 3956 | * Finish task iteration started by css_task_iter_start(). |
3724 | */ | 3957 | */ |
3725 | void css_task_iter_end(struct css_task_iter *it) | 3958 | void css_task_iter_end(struct css_task_iter *it) |
3726 | __releases(css_set_rwsem) | ||
3727 | { | 3959 | { |
3728 | up_read(&css_set_rwsem); | 3960 | if (it->cur_cset) { |
3961 | spin_lock_bh(&css_set_lock); | ||
3962 | list_del(&it->iters_node); | ||
3963 | put_css_set_locked(it->cur_cset); | ||
3964 | spin_unlock_bh(&css_set_lock); | ||
3965 | } | ||
3966 | |||
3967 | if (it->cur_task) | ||
3968 | put_task_struct(it->cur_task); | ||
3729 | } | 3969 | } |
3730 | 3970 | ||
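The reworked iterator above no longer holds css_set_lock (previously css_set_rwsem) for the whole walk: the lock is taken only inside css_task_iter_start/next/end, the returned task is pinned with get_task_struct() until the following call, and the iterator registers itself on its current css_set so css_set_move_task() can advance iterators whose next entry is leaving. The toy below captures the per-step locking and pinning; the iterator-registration part is only described in comments, not modelled (nothing removes entries concurrently here):

#include <stdio.h>
#include <pthread.h>

struct node {
	int val;
	int refcnt;
	struct node *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

struct toy_iter {
	struct node *pos;	/* next node to hand out */
	struct node *cur;	/* last node handed out, kept pinned */
};

static void toy_iter_start(struct toy_iter *it, struct node *head)
{
	pthread_mutex_lock(&list_lock);
	it->pos = head;
	it->cur = NULL;
	pthread_mutex_unlock(&list_lock);
}

static struct node *toy_iter_next(struct toy_iter *it)
{
	pthread_mutex_lock(&list_lock);
	if (it->cur)
		it->cur->refcnt--;	/* unpin the previous entry */
	it->cur = it->pos;
	if (it->cur) {
		it->cur->refcnt++;	/* stays valid while the caller sleeps */
		it->pos = it->cur->next;
	}
	pthread_mutex_unlock(&list_lock);
	return it->cur;
}

static void toy_iter_end(struct toy_iter *it)
{
	pthread_mutex_lock(&list_lock);
	if (it->cur)
		it->cur->refcnt--;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct node c = { 3, 1, NULL }, b = { 2, 1, &c }, a = { 1, 1, &b };
	struct toy_iter it;
	struct node *n;

	toy_iter_start(&it, &a);
	while ((n = toy_iter_next(&it)))
		printf("%d\n", n->val);	/* sleeping here is now allowed */
	toy_iter_end(&it);
	return 0;
}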
3731 | /** | 3971 | /** |
@@ -3750,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
3750 | mutex_lock(&cgroup_mutex); | 3990 | mutex_lock(&cgroup_mutex); |
3751 | 3991 | ||
3752 | /* all tasks in @from are being moved, all csets are source */ | 3992 | /* all tasks in @from are being moved, all csets are source */ |
3753 | down_read(&css_set_rwsem); | 3993 | spin_lock_bh(&css_set_lock); |
3754 | list_for_each_entry(link, &from->cset_links, cset_link) | 3994 | list_for_each_entry(link, &from->cset_links, cset_link) |
3755 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); | 3995 | cgroup_migrate_add_src(link->cset, to, &preloaded_csets); |
3756 | up_read(&css_set_rwsem); | 3996 | spin_unlock_bh(&css_set_lock); |
3757 | 3997 | ||
3758 | ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); | 3998 | ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); |
3759 | if (ret) | 3999 | if (ret) |
@@ -3771,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) | |||
3771 | css_task_iter_end(&it); | 4011 | css_task_iter_end(&it); |
3772 | 4012 | ||
3773 | if (task) { | 4013 | if (task) { |
3774 | ret = cgroup_migrate(to, task, false); | 4014 | ret = cgroup_migrate(task, false, to); |
3775 | put_task_struct(task); | 4015 | put_task_struct(task); |
3776 | } | 4016 | } |
3777 | } while (task && !ret); | 4017 | } while (task && !ret); |
@@ -4268,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, | |||
4268 | static struct cftype cgroup_dfl_base_files[] = { | 4508 | static struct cftype cgroup_dfl_base_files[] = { |
4269 | { | 4509 | { |
4270 | .name = "cgroup.procs", | 4510 | .name = "cgroup.procs", |
4511 | .file_offset = offsetof(struct cgroup, procs_file), | ||
4271 | .seq_start = cgroup_pidlist_start, | 4512 | .seq_start = cgroup_pidlist_start, |
4272 | .seq_next = cgroup_pidlist_next, | 4513 | .seq_next = cgroup_pidlist_next, |
4273 | .seq_stop = cgroup_pidlist_stop, | 4514 | .seq_stop = cgroup_pidlist_stop, |
4274 | .seq_show = cgroup_pidlist_show, | 4515 | .seq_show = cgroup_pidlist_show, |
4275 | .private = CGROUP_FILE_PROCS, | 4516 | .private = CGROUP_FILE_PROCS, |
4276 | .write = cgroup_procs_write, | 4517 | .write = cgroup_procs_write, |
4277 | .mode = S_IRUGO | S_IWUSR, | ||
4278 | }, | 4518 | }, |
4279 | { | 4519 | { |
4280 | .name = "cgroup.controllers", | 4520 | .name = "cgroup.controllers", |
@@ -4292,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = { | |||
4292 | .write = cgroup_subtree_control_write, | 4532 | .write = cgroup_subtree_control_write, |
4293 | }, | 4533 | }, |
4294 | { | 4534 | { |
4295 | .name = "cgroup.populated", | 4535 | .name = "cgroup.events", |
4296 | .flags = CFTYPE_NOT_ON_ROOT, | 4536 | .flags = CFTYPE_NOT_ON_ROOT, |
4297 | .seq_show = cgroup_populated_show, | 4537 | .file_offset = offsetof(struct cgroup, events_file), |
4538 | .seq_show = cgroup_events_show, | ||
4298 | }, | 4539 | }, |
4299 | { } /* terminate */ | 4540 | { } /* terminate */ |
4300 | }; | 4541 | }; |
@@ -4309,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = { | |||
4309 | .seq_show = cgroup_pidlist_show, | 4550 | .seq_show = cgroup_pidlist_show, |
4310 | .private = CGROUP_FILE_PROCS, | 4551 | .private = CGROUP_FILE_PROCS, |
4311 | .write = cgroup_procs_write, | 4552 | .write = cgroup_procs_write, |
4312 | .mode = S_IRUGO | S_IWUSR, | ||
4313 | }, | 4553 | }, |
4314 | { | 4554 | { |
4315 | .name = "cgroup.clone_children", | 4555 | .name = "cgroup.clone_children", |
@@ -4329,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = { | |||
4329 | .seq_show = cgroup_pidlist_show, | 4569 | .seq_show = cgroup_pidlist_show, |
4330 | .private = CGROUP_FILE_TASKS, | 4570 | .private = CGROUP_FILE_TASKS, |
4331 | .write = cgroup_tasks_write, | 4571 | .write = cgroup_tasks_write, |
4332 | .mode = S_IRUGO | S_IWUSR, | ||
4333 | }, | 4572 | }, |
4334 | { | 4573 | { |
4335 | .name = "notify_on_release", | 4574 | .name = "notify_on_release", |
@@ -4346,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = { | |||
4346 | { } /* terminate */ | 4585 | { } /* terminate */ |
4347 | }; | 4586 | }; |
4348 | 4587 | ||
4349 | /** | ||
4350 | * cgroup_populate_dir - create subsys files in a cgroup directory | ||
4351 | * @cgrp: target cgroup | ||
4352 | * @subsys_mask: mask of the subsystem ids whose files should be added | ||
4353 | * | ||
4354 | * On failure, no file is added. | ||
4355 | */ | ||
4356 | static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) | ||
4357 | { | ||
4358 | struct cgroup_subsys *ss; | ||
4359 | int i, ret = 0; | ||
4360 | |||
4361 | /* process cftsets of each subsystem */ | ||
4362 | for_each_subsys(ss, i) { | ||
4363 | struct cftype *cfts; | ||
4364 | |||
4365 | if (!(subsys_mask & (1 << i))) | ||
4366 | continue; | ||
4367 | |||
4368 | list_for_each_entry(cfts, &ss->cfts, node) { | ||
4369 | ret = cgroup_addrm_files(cgrp, cfts, true); | ||
4370 | if (ret < 0) | ||
4371 | goto err; | ||
4372 | } | ||
4373 | } | ||
4374 | return 0; | ||
4375 | err: | ||
4376 | cgroup_clear_dir(cgrp, subsys_mask); | ||
4377 | return ret; | ||
4378 | } | ||
4379 | |||
4380 | /* | 4588 | /* |
4381 | * css destruction is four-stage process. | 4589 | * css destruction is four-stage process. |
4382 | * | 4590 | * |
@@ -4405,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work) | |||
4405 | container_of(work, struct cgroup_subsys_state, destroy_work); | 4613 | container_of(work, struct cgroup_subsys_state, destroy_work); |
4406 | struct cgroup_subsys *ss = css->ss; | 4614 | struct cgroup_subsys *ss = css->ss; |
4407 | struct cgroup *cgrp = css->cgroup; | 4615 | struct cgroup *cgrp = css->cgroup; |
4616 | struct cgroup_file *cfile; | ||
4408 | 4617 | ||
4409 | percpu_ref_exit(&css->refcnt); | 4618 | percpu_ref_exit(&css->refcnt); |
4410 | 4619 | ||
4620 | list_for_each_entry(cfile, &css->files, node) | ||
4621 | kernfs_put(cfile->kn); | ||
4622 | |||
4411 | if (ss) { | 4623 | if (ss) { |
4412 | /* css free path */ | 4624 | /* css free path */ |
4413 | int id = css->id; | 4625 | int id = css->id; |
@@ -4512,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
4512 | css->ss = ss; | 4724 | css->ss = ss; |
4513 | INIT_LIST_HEAD(&css->sibling); | 4725 | INIT_LIST_HEAD(&css->sibling); |
4514 | INIT_LIST_HEAD(&css->children); | 4726 | INIT_LIST_HEAD(&css->children); |
4727 | INIT_LIST_HEAD(&css->files); | ||
4515 | css->serial_nr = css_serial_nr_next++; | 4728 | css->serial_nr = css_serial_nr_next++; |
4516 | 4729 | ||
4517 | if (cgroup_parent(cgrp)) { | 4730 | if (cgroup_parent(cgrp)) { |
@@ -4594,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
4594 | css->id = err; | 4807 | css->id = err; |
4595 | 4808 | ||
4596 | if (visible) { | 4809 | if (visible) { |
4597 | err = cgroup_populate_dir(cgrp, 1 << ss->id); | 4810 | err = css_populate_dir(css, NULL); |
4598 | if (err) | 4811 | if (err) |
4599 | goto err_free_id; | 4812 | goto err_free_id; |
4600 | } | 4813 | } |
@@ -4620,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
4620 | 4833 | ||
4621 | err_list_del: | 4834 | err_list_del: |
4622 | list_del_rcu(&css->sibling); | 4835 | list_del_rcu(&css->sibling); |
4623 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); | 4836 | css_clear_dir(css, NULL); |
4624 | err_free_id: | 4837 | err_free_id: |
4625 | cgroup_idr_remove(&ss->css_idr, css->id); | 4838 | cgroup_idr_remove(&ss->css_idr, css->id); |
4626 | err_free_percpu_ref: | 4839 | err_free_percpu_ref: |
@@ -4637,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4637 | struct cgroup_root *root; | 4850 | struct cgroup_root *root; |
4638 | struct cgroup_subsys *ss; | 4851 | struct cgroup_subsys *ss; |
4639 | struct kernfs_node *kn; | 4852 | struct kernfs_node *kn; |
4640 | struct cftype *base_files; | ||
4641 | int ssid, ret; | 4853 | int ssid, ret; |
4642 | 4854 | ||
4643 | /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. | 4855 | /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. |
@@ -4713,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4713 | if (ret) | 4925 | if (ret) |
4714 | goto out_destroy; | 4926 | goto out_destroy; |
4715 | 4927 | ||
4716 | if (cgroup_on_dfl(cgrp)) | 4928 | ret = css_populate_dir(&cgrp->self, NULL); |
4717 | base_files = cgroup_dfl_base_files; | ||
4718 | else | ||
4719 | base_files = cgroup_legacy_base_files; | ||
4720 | |||
4721 | ret = cgroup_addrm_files(cgrp, base_files, true); | ||
4722 | if (ret) | 4929 | if (ret) |
4723 | goto out_destroy; | 4930 | goto out_destroy; |
4724 | 4931 | ||
@@ -4805,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css) | |||
4805 | * This must happen before css is disassociated with its cgroup. | 5012 | * This must happen before css is disassociated with its cgroup. |
4806 | * See seq_css() for details. | 5013 | * See seq_css() for details. |
4807 | */ | 5014 | */ |
4808 | cgroup_clear_dir(css->cgroup, 1 << css->ss->id); | 5015 | css_clear_dir(css, NULL); |
4809 | 5016 | ||
4810 | /* | 5017 | /* |
4811 | * Killing would put the base ref, but we need to keep it alive | 5018 | * Killing would put the base ref, but we need to keep it alive |
@@ -4854,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
4854 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) | 5061 | __releases(&cgroup_mutex) __acquires(&cgroup_mutex) |
4855 | { | 5062 | { |
4856 | struct cgroup_subsys_state *css; | 5063 | struct cgroup_subsys_state *css; |
4857 | bool empty; | ||
4858 | int ssid; | 5064 | int ssid; |
4859 | 5065 | ||
4860 | lockdep_assert_held(&cgroup_mutex); | 5066 | lockdep_assert_held(&cgroup_mutex); |
4861 | 5067 | ||
4862 | /* | 5068 | /* |
4863 | * css_set_rwsem synchronizes access to ->cset_links and prevents | 5069 | * Only migration can raise populated from zero and we're already |
4864 | * @cgrp from being removed while put_css_set() is in progress. | 5070 | * holding cgroup_mutex. |
4865 | */ | 5071 | */ |
4866 | down_read(&css_set_rwsem); | 5072 | if (cgroup_is_populated(cgrp)) |
4867 | empty = list_empty(&cgrp->cset_links); | ||
4868 | up_read(&css_set_rwsem); | ||
4869 | if (!empty) | ||
4870 | return -EBUSY; | 5073 | return -EBUSY; |
4871 | 5074 | ||
4872 | /* | 5075 | /* |
@@ -4964,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) | |||
4964 | 5167 | ||
4965 | have_fork_callback |= (bool)ss->fork << ss->id; | 5168 | have_fork_callback |= (bool)ss->fork << ss->id; |
4966 | have_exit_callback |= (bool)ss->exit << ss->id; | 5169 | have_exit_callback |= (bool)ss->exit << ss->id; |
5170 | have_free_callback |= (bool)ss->free << ss->id; | ||
4967 | have_canfork_callback |= (bool)ss->can_fork << ss->id; | 5171 | have_canfork_callback |= (bool)ss->can_fork << ss->id; |
4968 | 5172 | ||
4969 | /* At system boot, before all subsystems have been | 5173 | /* At system boot, before all subsystems have been |
@@ -5012,6 +5216,8 @@ int __init cgroup_init_early(void) | |||
5012 | return 0; | 5216 | return 0; |
5013 | } | 5217 | } |
5014 | 5218 | ||
5219 | static unsigned long cgroup_disable_mask __initdata; | ||
5220 | |||
5015 | /** | 5221 | /** |
5016 | * cgroup_init - cgroup initialization | 5222 | * cgroup_init - cgroup initialization |
5017 | * | 5223 | * |
@@ -5022,7 +5228,7 @@ int __init cgroup_init(void) | |||
5022 | { | 5228 | { |
5023 | struct cgroup_subsys *ss; | 5229 | struct cgroup_subsys *ss; |
5024 | unsigned long key; | 5230 | unsigned long key; |
5025 | int ssid, err; | 5231 | int ssid; |
5026 | 5232 | ||
5027 | BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); | 5233 | BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); |
5028 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); | 5234 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
@@ -5058,14 +5264,15 @@ int __init cgroup_init(void) | |||
5058 | * disabled flag and cftype registration needs kmalloc, | 5264 | * disabled flag and cftype registration needs kmalloc, |
5059 | * both of which aren't available during early_init. | 5265 | * both of which aren't available during early_init. |
5060 | */ | 5266 | */ |
5061 | if (ss->disabled) | 5267 | if (cgroup_disable_mask & (1 << ssid)) { |
5268 | static_branch_disable(cgroup_subsys_enabled_key[ssid]); | ||
5269 | printk(KERN_INFO "Disabling %s control group subsystem\n", | ||
5270 | ss->name); | ||
5062 | continue; | 5271 | continue; |
5272 | } | ||
5063 | 5273 | ||
5064 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; | 5274 | cgrp_dfl_root.subsys_mask |= 1 << ss->id; |
5065 | 5275 | ||
5066 | if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) | ||
5067 | ss->dfl_cftypes = ss->legacy_cftypes; | ||
5068 | |||
5069 | if (!ss->dfl_cftypes) | 5276 | if (!ss->dfl_cftypes) |
5070 | cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; | 5277 | cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; |
5071 | 5278 | ||
@@ -5080,17 +5287,10 @@ int __init cgroup_init(void) | |||
5080 | ss->bind(init_css_set.subsys[ssid]); | 5287 | ss->bind(init_css_set.subsys[ssid]); |
5081 | } | 5288 | } |
5082 | 5289 | ||
5083 | err = sysfs_create_mount_point(fs_kobj, "cgroup"); | 5290 | WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); |
5084 | if (err) | 5291 | WARN_ON(register_filesystem(&cgroup_fs_type)); |
5085 | return err; | 5292 | WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); |
5086 | |||
5087 | err = register_filesystem(&cgroup_fs_type); | ||
5088 | if (err < 0) { | ||
5089 | sysfs_remove_mount_point(fs_kobj, "cgroup"); | ||
5090 | return err; | ||
5091 | } | ||
5092 | 5293 | ||
5093 | proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); | ||
5094 | return 0; | 5294 | return 0; |
5095 | } | 5295 | } |
5096 | 5296 | ||
@@ -5137,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5137 | goto out; | 5337 | goto out; |
5138 | 5338 | ||
5139 | mutex_lock(&cgroup_mutex); | 5339 | mutex_lock(&cgroup_mutex); |
5140 | down_read(&css_set_rwsem); | 5340 | spin_lock_bh(&css_set_lock); |
5141 | 5341 | ||
5142 | for_each_root(root) { | 5342 | for_each_root(root) { |
5143 | struct cgroup_subsys *ss; | 5343 | struct cgroup_subsys *ss; |
@@ -5157,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | |||
5157 | seq_printf(m, "%sname=%s", count ? "," : "", | 5357 | seq_printf(m, "%sname=%s", count ? "," : "", |
5158 | root->name); | 5358 | root->name); |
5159 | seq_putc(m, ':'); | 5359 | seq_putc(m, ':'); |
5360 | |||
5160 | cgrp = task_cgroup_from_root(tsk, root); | 5361 | cgrp = task_cgroup_from_root(tsk, root); |
5161 | path = cgroup_path(cgrp, buf, PATH_MAX); | 5362 | |
5162 | if (!path) { | 5363 | /* |
5163 | retval = -ENAMETOOLONG; | 5364 | * On traditional hierarchies, all zombie tasks show up as |
5164 | goto out_unlock; | 5365 | * belonging to the root cgroup. On the default hierarchy, |
5366 | * while a zombie doesn't show up in "cgroup.procs" and | ||
5367 | * thus can't be migrated, its /proc/PID/cgroup keeps | ||
5368 | * reporting the cgroup it belonged to before exiting. If | ||
5369 | * the cgroup is removed before the zombie is reaped, | ||
5370 | * " (deleted)" is appended to the cgroup path. | ||
5371 | */ | ||
5372 | if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { | ||
5373 | path = cgroup_path(cgrp, buf, PATH_MAX); | ||
5374 | if (!path) { | ||
5375 | retval = -ENAMETOOLONG; | ||
5376 | goto out_unlock; | ||
5377 | } | ||
5378 | } else { | ||
5379 | path = "/"; | ||
5165 | } | 5380 | } |
5381 | |||
5166 | seq_puts(m, path); | 5382 | seq_puts(m, path); |
5167 | seq_putc(m, '\n'); | 5383 | |
5384 | if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp)) | ||
5385 | seq_puts(m, " (deleted)\n"); | ||
5386 | else | ||
5387 | seq_putc(m, '\n'); | ||
5168 | } | 5388 | } |
5169 | 5389 | ||
5170 | retval = 0; | 5390 | retval = 0; |
5171 | out_unlock: | 5391 | out_unlock: |
5172 | up_read(&css_set_rwsem); | 5392 | spin_unlock_bh(&css_set_lock); |
5173 | mutex_unlock(&cgroup_mutex); | 5393 | mutex_unlock(&cgroup_mutex); |
5174 | kfree(buf); | 5394 | kfree(buf); |
5175 | out: | 5395 | out: |
@@ -5193,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) | |||
5193 | for_each_subsys(ss, i) | 5413 | for_each_subsys(ss, i) |
5194 | seq_printf(m, "%s\t%d\t%d\t%d\n", | 5414 | seq_printf(m, "%s\t%d\t%d\t%d\n", |
5195 | ss->legacy_name, ss->root->hierarchy_id, | 5415 | ss->legacy_name, ss->root->hierarchy_id, |
5196 | atomic_read(&ss->root->nr_cgrps), !ss->disabled); | 5416 | atomic_read(&ss->root->nr_cgrps), |
5417 | cgroup_ssid_enabled(i)); | ||
5197 | 5418 | ||
5198 | mutex_unlock(&cgroup_mutex); | 5419 | mutex_unlock(&cgroup_mutex); |
5199 | return 0; | 5420 | return 0; |
@@ -5314,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child, | |||
5314 | * @child during its iteration. | 5535 | * @child during its iteration. |
5315 | * | 5536 | * |
5316 | * If we won the race, @child is associated with %current's | 5537 | * If we won the race, @child is associated with %current's |
5317 | * css_set. Grabbing css_set_rwsem guarantees both that the | 5538 | * css_set. Grabbing css_set_lock guarantees both that the |
5318 | * association is stable, and, on completion of the parent's | 5539 | * association is stable, and, on completion of the parent's |
5319 | * migration, @child is visible in the source of migration or | 5540 | * migration, @child is visible in the source of migration or |
5320 | * already in the destination cgroup. This guarantee is necessary | 5541 | * already in the destination cgroup. This guarantee is necessary |
@@ -5329,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child, | |||
5329 | if (use_task_css_set_links) { | 5550 | if (use_task_css_set_links) { |
5330 | struct css_set *cset; | 5551 | struct css_set *cset; |
5331 | 5552 | ||
5332 | down_write(&css_set_rwsem); | 5553 | spin_lock_bh(&css_set_lock); |
5333 | cset = task_css_set(current); | 5554 | cset = task_css_set(current); |
5334 | if (list_empty(&child->cg_list)) { | 5555 | if (list_empty(&child->cg_list)) { |
5335 | rcu_assign_pointer(child->cgroups, cset); | ||
5336 | list_add(&child->cg_list, &cset->tasks); | ||
5337 | get_css_set(cset); | 5556 | get_css_set(cset); |
5557 | css_set_move_task(child, NULL, cset, false); | ||
5338 | } | 5558 | } |
5339 | up_write(&css_set_rwsem); | 5559 | spin_unlock_bh(&css_set_lock); |
5340 | } | 5560 | } |
5341 | 5561 | ||
5342 | /* | 5562 | /* |
@@ -5371,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk) | |||
5371 | { | 5591 | { |
5372 | struct cgroup_subsys *ss; | 5592 | struct cgroup_subsys *ss; |
5373 | struct css_set *cset; | 5593 | struct css_set *cset; |
5374 | bool put_cset = false; | ||
5375 | int i; | 5594 | int i; |
5376 | 5595 | ||
5377 | /* | 5596 | /* |
5378 | * Unlink @tsk from its css_set. As migration path can't race | 5597 | * Unlink @tsk from its css_set. As migration path can't race |
5379 | * with us, we can check cg_list without grabbing css_set_rwsem. | 5598 | * with us, we can check css_set and cg_list without synchronization. |
5380 | */ | 5599 | */ |
5600 | cset = task_css_set(tsk); | ||
5601 | |||
5381 | if (!list_empty(&tsk->cg_list)) { | 5602 | if (!list_empty(&tsk->cg_list)) { |
5382 | down_write(&css_set_rwsem); | 5603 | spin_lock_bh(&css_set_lock); |
5383 | list_del_init(&tsk->cg_list); | 5604 | css_set_move_task(tsk, cset, NULL, false); |
5384 | up_write(&css_set_rwsem); | 5605 | spin_unlock_bh(&css_set_lock); |
5385 | put_cset = true; | 5606 | } else { |
5607 | get_css_set(cset); | ||
5386 | } | 5608 | } |
5387 | 5609 | ||
5388 | /* Reassign the task to the init_css_set. */ | ||
5389 | cset = task_css_set(tsk); | ||
5390 | RCU_INIT_POINTER(tsk->cgroups, &init_css_set); | ||
5391 | |||
5392 | /* see cgroup_post_fork() for details */ | 5610 | /* see cgroup_post_fork() for details */ |
5393 | for_each_subsys_which(ss, i, &have_exit_callback) { | 5611 | for_each_subsys_which(ss, i, &have_exit_callback) |
5394 | struct cgroup_subsys_state *old_css = cset->subsys[i]; | 5612 | ss->exit(tsk); |
5395 | struct cgroup_subsys_state *css = task_css(tsk, i); | 5613 | } |
5396 | 5614 | ||
5397 | ss->exit(css, old_css, tsk); | 5615 | void cgroup_free(struct task_struct *task) |
5398 | } | 5616 | { |
5617 | struct css_set *cset = task_css_set(task); | ||
5618 | struct cgroup_subsys *ss; | ||
5619 | int ssid; | ||
5399 | 5620 | ||
5400 | if (put_cset) | 5621 | for_each_subsys_which(ss, ssid, &have_free_callback) |
5401 | put_css_set(cset); | 5622 | ss->free(task); |
5623 | |||
5624 | put_css_set(cset); | ||
5402 | } | 5625 | } |
5403 | 5626 | ||
5404 | static void check_for_release(struct cgroup *cgrp) | 5627 | static void check_for_release(struct cgroup *cgrp) |
5405 | { | 5628 | { |
5406 | if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && | 5629 | if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && |
5407 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) | 5630 | !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) |
5408 | schedule_work(&cgrp->release_agent_work); | 5631 | schedule_work(&cgrp->release_agent_work); |
5409 | } | 5632 | } |
@@ -5482,25 +5705,13 @@ static int __init cgroup_disable(char *str) | |||
5482 | if (strcmp(token, ss->name) && | 5705 | if (strcmp(token, ss->name) && |
5483 | strcmp(token, ss->legacy_name)) | 5706 | strcmp(token, ss->legacy_name)) |
5484 | continue; | 5707 | continue; |
5485 | 5708 | cgroup_disable_mask |= 1 << i; | |
5486 | ss->disabled = 1; | ||
5487 | printk(KERN_INFO "Disabling %s control group subsystem\n", | ||
5488 | ss->name); | ||
5489 | break; | ||
5490 | } | 5709 | } |
5491 | } | 5710 | } |
5492 | return 1; | 5711 | return 1; |
5493 | } | 5712 | } |
5494 | __setup("cgroup_disable=", cgroup_disable); | 5713 | __setup("cgroup_disable=", cgroup_disable); |
5495 | 5714 | ||
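cgroup_disable= handling is now split in two: the early __setup parser only records the named subsystems in cgroup_disable_mask, and cgroup_init() later flips the per-subsystem static branch and prints the message once the infrastructure is up. A standalone sketch of that record-then-apply split, with an invented subsystem table and strtok in place of the kernel's token loop:

#include <stdio.h>
#include <string.h>

static const char *const subsys_name[] = { "cpu", "memory", "pids" };
#define NR_SUBSYS (sizeof(subsys_name) / sizeof(subsys_name[0]))

static unsigned long disable_mask;	/* like cgroup_disable_mask */

/* boot-time step: just remember what was asked for */
static void parse_cgroup_disable(char *str)
{
	char *tok;

	for (tok = strtok(str, ","); tok; tok = strtok(NULL, ",")) {
		unsigned int i;

		for (i = 0; i < NR_SUBSYS; i++)
			if (!strcmp(tok, subsys_name[i]))
				disable_mask |= 1UL << i;
	}
}

/* later step: act on the mask (the kernel flips static branches here) */
static void apply_disable_mask(void)
{
	unsigned int i;

	for (i = 0; i < NR_SUBSYS; i++)
		if (disable_mask & (1UL << i))
			printf("Disabling %s control group subsystem\n",
			       subsys_name[i]);
}

int main(void)
{
	char arg[] = "memory,pids";

	parse_cgroup_disable(arg);
	apply_disable_mask();
	return 0;
}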
5496 | static int __init cgroup_set_legacy_files_on_dfl(char *str) | ||
5497 | { | ||
5498 | printk("cgroup: using legacy files on the default hierarchy\n"); | ||
5499 | cgroup_legacy_files_on_dfl = true; | ||
5500 | return 0; | ||
5501 | } | ||
5502 | __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); | ||
5503 | |||
5504 | /** | 5715 | /** |
5505 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry | 5716 | * css_tryget_online_from_dir - get corresponding css from a cgroup dentry |
5506 | * @dentry: directory dentry of interest | 5717 | * @dentry: directory dentry of interest |
@@ -5604,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | |||
5604 | if (!name_buf) | 5815 | if (!name_buf) |
5605 | return -ENOMEM; | 5816 | return -ENOMEM; |
5606 | 5817 | ||
5607 | down_read(&css_set_rwsem); | 5818 | spin_lock_bh(&css_set_lock); |
5608 | rcu_read_lock(); | 5819 | rcu_read_lock(); |
5609 | cset = rcu_dereference(current->cgroups); | 5820 | cset = rcu_dereference(current->cgroups); |
5610 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { | 5821 | list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { |
@@ -5615,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) | |||
5615 | c->root->hierarchy_id, name_buf); | 5826 | c->root->hierarchy_id, name_buf); |
5616 | } | 5827 | } |
5617 | rcu_read_unlock(); | 5828 | rcu_read_unlock(); |
5618 | up_read(&css_set_rwsem); | 5829 | spin_unlock_bh(&css_set_lock); |
5619 | kfree(name_buf); | 5830 | kfree(name_buf); |
5620 | return 0; | 5831 | return 0; |
5621 | } | 5832 | } |
@@ -5626,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
5626 | struct cgroup_subsys_state *css = seq_css(seq); | 5837 | struct cgroup_subsys_state *css = seq_css(seq); |
5627 | struct cgrp_cset_link *link; | 5838 | struct cgrp_cset_link *link; |
5628 | 5839 | ||
5629 | down_read(&css_set_rwsem); | 5840 | spin_lock_bh(&css_set_lock); |
5630 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { | 5841 | list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { |
5631 | struct css_set *cset = link->cset; | 5842 | struct css_set *cset = link->cset; |
5632 | struct task_struct *task; | 5843 | struct task_struct *task; |
@@ -5649,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) | |||
5649 | overflow: | 5860 | overflow: |
5650 | seq_puts(seq, " ...\n"); | 5861 | seq_puts(seq, " ...\n"); |
5651 | } | 5862 | } |
5652 | up_read(&css_set_rwsem); | 5863 | spin_unlock_bh(&css_set_lock); |
5653 | return 0; | 5864 | return 0; |
5654 | } | 5865 | } |
5655 | 5866 | ||
5656 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) | 5867 | static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) |
5657 | { | 5868 | { |
5658 | return (!cgroup_has_tasks(css->cgroup) && | 5869 | return (!cgroup_is_populated(css->cgroup) && |
5659 | !css_has_online_children(&css->cgroup->self)); | 5870 | !css_has_online_children(&css->cgroup->self)); |
5660 | } | 5871 | } |
5661 | 5872 | ||
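Illustration (not from the patch): the cgroup_disable= hunk above stops flipping a subsystem flag on the spot and instead records a bit in cgroup_disable_mask for later use. A minimal standalone C sketch of that deferred-mask parsing style, with an invented subsystem table:

#include <stdio.h>
#include <string.h>

static const char *subsys_names[] = { "cpu", "cpuset", "memory", "pids" };
#define NSUBSYS (sizeof(subsys_names) / sizeof(subsys_names[0]))

static unsigned long disable_mask;	/* consumed later, e.g. at subsystem init */

static void parse_disable(char *str)
{
	for (char *token = strtok(str, ","); token; token = strtok(NULL, ",")) {
		for (unsigned long i = 0; i < NSUBSYS; i++) {
			if (!strcmp(token, subsys_names[i])) {
				disable_mask |= 1UL << i;	/* record only */
				break;
			}
		}
	}
}

int main(void)
{
	char arg[] = "memory,pids";

	parse_disable(arg);
	printf("disable_mask = 0x%lx\n", disable_mask);	/* prints 0xc */
	return 0;
}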
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index 806cd7693ac8..cdd8df4e991c 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c | |||
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv) | |||
266 | css_put(old_css); | 266 | css_put(old_css); |
267 | } | 267 | } |
268 | 268 | ||
269 | static void pids_exit(struct cgroup_subsys_state *css, | 269 | static void pids_free(struct task_struct *task) |
270 | struct cgroup_subsys_state *old_css, | ||
271 | struct task_struct *task) | ||
272 | { | 270 | { |
273 | struct pids_cgroup *pids = css_pids(old_css); | 271 | struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); |
274 | 272 | ||
275 | pids_uncharge(pids, 1); | 273 | pids_uncharge(pids, 1); |
276 | } | 274 | } |
@@ -349,7 +347,7 @@ struct cgroup_subsys pids_cgrp_subsys = { | |||
349 | .can_fork = pids_can_fork, | 347 | .can_fork = pids_can_fork, |
350 | .cancel_fork = pids_cancel_fork, | 348 | .cancel_fork = pids_cancel_fork, |
351 | .fork = pids_fork, | 349 | .fork = pids_fork, |
352 | .exit = pids_exit, | 350 | .free = pids_free, |
353 | .legacy_cftypes = pids_files, | 351 | .legacy_cftypes = pids_files, |
354 | .dfl_cftypes = pids_files, | 352 | .dfl_cftypes = pids_files, |
355 | }; | 353 | }; |
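Illustration (not from the patch): the cgroup_pids.c hunk replaces the three-argument .exit callback with a one-argument .free hook that derives the css from the task itself. A sketch of that callback-shape change with stand-in types:

#include <stdio.h>

struct task;			/* stand-ins only, not the kernel types */
struct css { int id; };

struct subsys_ops {
	void (*free)(struct task *task);	/* new one-argument hook */
};

static struct css the_css = { .id = 7 };

static struct css *task_css(struct task *task)
{
	(void)task;
	return &the_css;	/* illustrative lookup via the task */
}

static void pids_free_hook(struct task *task)
{
	struct css *css = task_css(task);	/* no old_css argument needed */

	printf("uncharge one pid from css %d\n", css->id);
}

int main(void)
{
	struct subsys_ops ops = { .free = pids_free_hook };

	ops.free(NULL);		/* the core would pass the exiting task here */
	return 0;
}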
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 0a495ab35bc7..d8560ee3bab7 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
@@ -58,36 +58,13 @@ static void context_tracking_recursion_exit(void) | |||
58 | * instructions to execute won't use any RCU read side critical section | 58 | * instructions to execute won't use any RCU read side critical section |
59 | * because this function sets RCU in extended quiescent state. | 59 | * because this function sets RCU in extended quiescent state. |
60 | */ | 60 | */ |
61 | void context_tracking_enter(enum ctx_state state) | 61 | void __context_tracking_enter(enum ctx_state state) |
62 | { | 62 | { |
63 | unsigned long flags; | ||
64 | |||
65 | /* | ||
66 | * Repeat the user_enter() check here because some archs may be calling | ||
67 | * this from asm and if no CPU needs context tracking, they shouldn't | ||
68 | * go further. Repeat the check here until they support the inline static | ||
69 | * key check. | ||
70 | */ | ||
71 | if (!context_tracking_is_enabled()) | ||
72 | return; | ||
73 | |||
74 | /* | ||
75 | * Some contexts may involve an exception occurring in an irq, | ||
76 | * leading to that nesting: | ||
77 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
78 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
79 | * helpers are enough to protect RCU uses inside the exception. So | ||
80 | * just return immediately if we detect we are in an IRQ. | ||
81 | */ | ||
82 | if (in_interrupt()) | ||
83 | return; | ||
84 | |||
85 | /* Kernel threads aren't supposed to go to userspace */ | 63 | /* Kernel threads aren't supposed to go to userspace */ |
86 | WARN_ON_ONCE(!current->mm); | 64 | WARN_ON_ONCE(!current->mm); |
87 | 65 | ||
88 | local_irq_save(flags); | ||
89 | if (!context_tracking_recursion_enter()) | 66 | if (!context_tracking_recursion_enter()) |
90 | goto out_irq_restore; | 67 | return; |
91 | 68 | ||
92 | if ( __this_cpu_read(context_tracking.state) != state) { | 69 | if ( __this_cpu_read(context_tracking.state) != state) { |
93 | if (__this_cpu_read(context_tracking.active)) { | 70 | if (__this_cpu_read(context_tracking.active)) { |
@@ -120,7 +97,27 @@ void context_tracking_enter(enum ctx_state state) | |||
120 | __this_cpu_write(context_tracking.state, state); | 97 | __this_cpu_write(context_tracking.state, state); |
121 | } | 98 | } |
122 | context_tracking_recursion_exit(); | 99 | context_tracking_recursion_exit(); |
123 | out_irq_restore: | 100 | } |
101 | NOKPROBE_SYMBOL(__context_tracking_enter); | ||
102 | EXPORT_SYMBOL_GPL(__context_tracking_enter); | ||
103 | |||
104 | void context_tracking_enter(enum ctx_state state) | ||
105 | { | ||
106 | unsigned long flags; | ||
107 | |||
108 | /* | ||
109 | * Some contexts may involve an exception occurring in an irq, ||
110 | * leading to that nesting: | ||
111 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
112 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() | ||
113 | * helpers are enough to protect RCU uses inside the exception. So | ||
114 | * just return immediately if we detect we are in an IRQ. | ||
115 | */ | ||
116 | if (in_interrupt()) | ||
117 | return; | ||
118 | |||
119 | local_irq_save(flags); | ||
120 | __context_tracking_enter(state); | ||
124 | local_irq_restore(flags); | 121 | local_irq_restore(flags); |
125 | } | 122 | } |
126 | NOKPROBE_SYMBOL(context_tracking_enter); | 123 | NOKPROBE_SYMBOL(context_tracking_enter); |
@@ -128,7 +125,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter); | |||
128 | 125 | ||
129 | void context_tracking_user_enter(void) | 126 | void context_tracking_user_enter(void) |
130 | { | 127 | { |
131 | context_tracking_enter(CONTEXT_USER); | 128 | user_enter(); |
132 | } | 129 | } |
133 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 130 | NOKPROBE_SYMBOL(context_tracking_user_enter); |
134 | 131 | ||
@@ -144,19 +141,10 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); | |||
144 | * This call supports re-entrancy. This way it can be called from any exception | 141 | * This call supports re-entrancy. This way it can be called from any exception |
145 | * handler without needing to know if we came from userspace or not. | 142 | * handler without needing to know if we came from userspace or not. |
146 | */ | 143 | */ |
147 | void context_tracking_exit(enum ctx_state state) | 144 | void __context_tracking_exit(enum ctx_state state) |
148 | { | 145 | { |
149 | unsigned long flags; | ||
150 | |||
151 | if (!context_tracking_is_enabled()) | ||
152 | return; | ||
153 | |||
154 | if (in_interrupt()) | ||
155 | return; | ||
156 | |||
157 | local_irq_save(flags); | ||
158 | if (!context_tracking_recursion_enter()) | 146 | if (!context_tracking_recursion_enter()) |
159 | goto out_irq_restore; | 147 | return; |
160 | 148 | ||
161 | if (__this_cpu_read(context_tracking.state) == state) { | 149 | if (__this_cpu_read(context_tracking.state) == state) { |
162 | if (__this_cpu_read(context_tracking.active)) { | 150 | if (__this_cpu_read(context_tracking.active)) { |
@@ -173,7 +161,19 @@ void context_tracking_exit(enum ctx_state state) | |||
173 | __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); | 161 | __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); |
174 | } | 162 | } |
175 | context_tracking_recursion_exit(); | 163 | context_tracking_recursion_exit(); |
176 | out_irq_restore: | 164 | } |
165 | NOKPROBE_SYMBOL(__context_tracking_exit); | ||
166 | EXPORT_SYMBOL_GPL(__context_tracking_exit); | ||
167 | |||
168 | void context_tracking_exit(enum ctx_state state) | ||
169 | { | ||
170 | unsigned long flags; | ||
171 | |||
172 | if (in_interrupt()) | ||
173 | return; | ||
174 | |||
175 | local_irq_save(flags); | ||
176 | __context_tracking_exit(state); | ||
177 | local_irq_restore(flags); | 177 | local_irq_restore(flags); |
178 | } | 178 | } |
179 | NOKPROBE_SYMBOL(context_tracking_exit); | 179 | NOKPROBE_SYMBOL(context_tracking_exit); |
@@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(context_tracking_exit); | |||
181 | 181 | ||
182 | void context_tracking_user_exit(void) | 182 | void context_tracking_user_exit(void) |
183 | { | 183 | { |
184 | context_tracking_exit(CONTEXT_USER); | 184 | user_exit(); |
185 | } | 185 | } |
186 | NOKPROBE_SYMBOL(context_tracking_user_exit); | 186 | NOKPROBE_SYMBOL(context_tracking_user_exit); |
187 | 187 | ||
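Illustration (not from the patch): the context_tracking.c hunks split each entry point into a __-prefixed worker plus a wrapper that keeps the in_interrupt() check and IRQ save/restore, so callers that already did those checks can call the worker directly. A userspace-style sketch of the wrapper/worker split, with a plain flag standing in for the static key:

#include <stdbool.h>
#include <stdio.h>

static bool tracking_enabled = true;	/* stand-in for the static key */
static int percpu_state;		/* stand-in for context_tracking.state */

static void __ctx_enter(int state)
{
	/* caller guarantees: enabled, not in interrupt, irqs off */
	if (percpu_state != state) {
		percpu_state = state;
		printf("entered state %d\n", state);
	}
}

static void ctx_enter(int state)
{
	if (!tracking_enabled)		/* cheap early-out in the wrapper */
		return;

	/* local_irq_save(flags) would go here in the real thing */
	__ctx_enter(state);
	/* local_irq_restore(flags) */
}

int main(void)
{
	ctx_enter(1);
	ctx_enter(1);	/* second call is a no-op: state unchanged */
	return 0;
}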
diff --git a/kernel/cpu.c b/kernel/cpu.c index 82cf9dff4295..85ff5e26e23b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -102,19 +102,6 @@ void get_online_cpus(void) | |||
102 | } | 102 | } |
103 | EXPORT_SYMBOL_GPL(get_online_cpus); | 103 | EXPORT_SYMBOL_GPL(get_online_cpus); |
104 | 104 | ||
105 | bool try_get_online_cpus(void) | ||
106 | { | ||
107 | if (cpu_hotplug.active_writer == current) | ||
108 | return true; | ||
109 | if (!mutex_trylock(&cpu_hotplug.lock)) | ||
110 | return false; | ||
111 | cpuhp_lock_acquire_tryread(); | ||
112 | atomic_inc(&cpu_hotplug.refcount); | ||
113 | mutex_unlock(&cpu_hotplug.lock); | ||
114 | return true; | ||
115 | } | ||
116 | EXPORT_SYMBOL_GPL(try_get_online_cpus); | ||
117 | |||
118 | void put_online_cpus(void) | 105 | void put_online_cpus(void) |
119 | { | 106 | { |
120 | int refcount; | 107 | int refcount; |
@@ -304,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu) | |||
304 | { | 291 | { |
305 | struct task_struct *g, *p; | 292 | struct task_struct *g, *p; |
306 | 293 | ||
307 | read_lock_irq(&tasklist_lock); | 294 | read_lock(&tasklist_lock); |
308 | do_each_thread(g, p) { | 295 | for_each_process_thread(g, p) { |
309 | if (!p->on_rq) | 296 | if (!p->on_rq) |
310 | continue; | 297 | continue; |
311 | /* | 298 | /* |
@@ -320,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu) | |||
320 | 307 | ||
321 | pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", | 308 | pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", |
322 | p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); | 309 | p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); |
323 | } while_each_thread(g, p); | 310 | } |
324 | read_unlock_irq(&tasklist_lock); | 311 | read_unlock(&tasklist_lock); |
325 | } | 312 | } |
326 | 313 | ||
327 | struct take_cpu_down_param { | 314 | struct take_cpu_down_param { |
@@ -344,7 +331,7 @@ static int take_cpu_down(void *_param) | |||
344 | /* Give up timekeeping duties */ | 331 | /* Give up timekeeping duties */ |
345 | tick_handover_do_timer(); | 332 | tick_handover_do_timer(); |
346 | /* Park the stopper thread */ | 333 | /* Park the stopper thread */ |
347 | kthread_park(current); | 334 | stop_machine_park((long)param->hcpu); |
348 | return 0; | 335 | return 0; |
349 | } | 336 | } |
350 | 337 | ||
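Illustration (not from the patch): check_for_tasks() above moves from the paired do_each_thread()/while_each_thread() macros to the single for_each_process_thread() iterator, which takes an ordinary braced body. A self-contained sketch of that flat-iterator style over invented data:

#include <stdio.h>

struct thread { int tid; int on_rq; };
struct proc   { struct thread threads[2]; int nthreads; };

static struct proc procs[] = {
	{ .threads = { { .tid = 1, .on_rq = 0 }, { .tid = 2, .on_rq = 1 } }, .nthreads = 2 },
	{ .threads = { { .tid = 3, .on_rq = 1 } }, .nthreads = 1 },
};
#define NPROCS (sizeof(procs) / sizeof(procs[0]))

/* flat iterator: usable with a plain braced body, and `continue` simply
 * moves on to the next thread */
#define for_each_proc_thread(p, t)					\
	for ((p) = procs; (p) < procs + NPROCS; (p)++)			\
		for ((t) = (p)->threads; (t) < (p)->threads + (p)->nthreads; (t)++)

int main(void)
{
	struct proc *p;
	struct thread *t;

	for_each_proc_thread(p, t) {
		if (!t->on_rq)
			continue;
		printf("thread %d is still on a runqueue\n", t->tid);
	}
	return 0;
}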
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f0acff0f66c9..10ae73611d80 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) | |||
473 | 473 | ||
474 | /* On legacy hierarchy, we must be a subset of our parent cpuset. */ | 474 | /* On legacy hierarchy, we must be a subset of our parent cpuset. */ |
475 | ret = -EACCES; | 475 | ret = -EACCES; |
476 | if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) | 476 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
477 | !is_cpuset_subset(trial, par)) | ||
477 | goto out; | 478 | goto out; |
478 | 479 | ||
479 | /* | 480 | /* |
@@ -497,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) | |||
497 | * be changed to have empty cpus_allowed or mems_allowed. | 498 | * be changed to have empty cpus_allowed or mems_allowed. |
498 | */ | 499 | */ |
499 | ret = -ENOSPC; | 500 | ret = -ENOSPC; |
500 | if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { | 501 | if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) { |
501 | if (!cpumask_empty(cur->cpus_allowed) && | 502 | if (!cpumask_empty(cur->cpus_allowed) && |
502 | cpumask_empty(trial->cpus_allowed)) | 503 | cpumask_empty(trial->cpus_allowed)) |
503 | goto out; | 504 | goto out; |
@@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) | |||
879 | * If it becomes empty, inherit the effective mask of the | 880 | * If it becomes empty, inherit the effective mask of the |
880 | * parent, which is guaranteed to have some CPUs. | 881 | * parent, which is guaranteed to have some CPUs. |
881 | */ | 882 | */ |
882 | if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) | 883 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
884 | cpumask_empty(new_cpus)) | ||
883 | cpumask_copy(new_cpus, parent->effective_cpus); | 885 | cpumask_copy(new_cpus, parent->effective_cpus); |
884 | 886 | ||
885 | /* Skip the whole subtree if the cpumask remains the same. */ | 887 | /* Skip the whole subtree if the cpumask remains the same. */ |
@@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) | |||
896 | cpumask_copy(cp->effective_cpus, new_cpus); | 898 | cpumask_copy(cp->effective_cpus, new_cpus); |
897 | spin_unlock_irq(&callback_lock); | 899 | spin_unlock_irq(&callback_lock); |
898 | 900 | ||
899 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | 901 | WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
900 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); | 902 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); |
901 | 903 | ||
902 | update_tasks_cpumask(cp); | 904 | update_tasks_cpumask(cp); |
@@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) | |||
1135 | * If it becomes empty, inherit the effective mask of the | 1137 | * If it becomes empty, inherit the effective mask of the |
1136 | * parent, which is guaranteed to have some MEMs. | 1138 | * parent, which is guaranteed to have some MEMs. |
1137 | */ | 1139 | */ |
1138 | if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) | 1140 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
1141 | nodes_empty(*new_mems)) | ||
1139 | *new_mems = parent->effective_mems; | 1142 | *new_mems = parent->effective_mems; |
1140 | 1143 | ||
1141 | /* Skip the whole subtree if the nodemask remains the same. */ | 1144 | /* Skip the whole subtree if the nodemask remains the same. */ |
@@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) | |||
1152 | cp->effective_mems = *new_mems; | 1155 | cp->effective_mems = *new_mems; |
1153 | spin_unlock_irq(&callback_lock); | 1156 | spin_unlock_irq(&callback_lock); |
1154 | 1157 | ||
1155 | WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && | 1158 | WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
1156 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); | 1159 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); |
1157 | 1160 | ||
1158 | update_tasks_nodemask(cp); | 1161 | update_tasks_nodemask(cp); |
@@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, | |||
1440 | 1443 | ||
1441 | /* allow moving tasks into an empty cpuset if on default hierarchy */ | 1444 | /* allow moving tasks into an empty cpuset if on default hierarchy */ |
1442 | ret = -ENOSPC; | 1445 | ret = -ENOSPC; |
1443 | if (!cgroup_on_dfl(css->cgroup) && | 1446 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
1444 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) | 1447 | (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) |
1445 | goto out_unlock; | 1448 | goto out_unlock; |
1446 | 1449 | ||
@@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, | |||
1484 | { | 1487 | { |
1485 | /* static buf protected by cpuset_mutex */ | 1488 | /* static buf protected by cpuset_mutex */ |
1486 | static nodemask_t cpuset_attach_nodemask_to; | 1489 | static nodemask_t cpuset_attach_nodemask_to; |
1487 | struct mm_struct *mm; | ||
1488 | struct task_struct *task; | 1490 | struct task_struct *task; |
1489 | struct task_struct *leader = cgroup_taskset_first(tset); | 1491 | struct task_struct *leader; |
1490 | struct cpuset *cs = css_cs(css); | 1492 | struct cpuset *cs = css_cs(css); |
1491 | struct cpuset *oldcs = cpuset_attach_old_cs; | 1493 | struct cpuset *oldcs = cpuset_attach_old_cs; |
1492 | 1494 | ||
@@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css, | |||
1512 | } | 1514 | } |
1513 | 1515 | ||
1514 | /* | 1516 | /* |
1515 | * Change mm, possibly for multiple threads in a threadgroup. This is | 1517 | * Change mm for all threadgroup leaders. This is expensive and may |
1516 | * expensive and may sleep. | 1518 | * sleep and should be moved outside migration path proper. |
1517 | */ | 1519 | */ |
1518 | cpuset_attach_nodemask_to = cs->effective_mems; | 1520 | cpuset_attach_nodemask_to = cs->effective_mems; |
1519 | mm = get_task_mm(leader); | 1521 | cgroup_taskset_for_each_leader(leader, tset) { |
1520 | if (mm) { | 1522 | struct mm_struct *mm = get_task_mm(leader); |
1521 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); | 1523 | |
1522 | 1524 | if (mm) { | |
1523 | /* | 1525 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
1524 | * old_mems_allowed is the same with mems_allowed here, except | 1526 | |
1525 | * if this task is being moved automatically due to hotplug. | 1527 | /* |
1526 | * In that case @mems_allowed has been updated and is empty, | 1528 | * old_mems_allowed is the same with mems_allowed |
1527 | * so @old_mems_allowed is the right nodesets that we migrate | 1529 | * here, except if this task is being moved |
1528 | * mm from. | 1530 | * automatically due to hotplug. In that case |
1529 | */ | 1531 | * @mems_allowed has been updated and is empty, so |
1530 | if (is_memory_migrate(cs)) { | 1532 | * @old_mems_allowed is the right nodesets that we |
1531 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, | 1533 | * migrate mm from. |
1532 | &cpuset_attach_nodemask_to); | 1534 | */ |
1535 | if (is_memory_migrate(cs)) { | ||
1536 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, | ||
1537 | &cpuset_attach_nodemask_to); | ||
1538 | } | ||
1539 | mmput(mm); | ||
1533 | } | 1540 | } |
1534 | mmput(mm); | ||
1535 | } | 1541 | } |
1536 | 1542 | ||
1537 | cs->old_mems_allowed = cpuset_attach_nodemask_to; | 1543 | cs->old_mems_allowed = cpuset_attach_nodemask_to; |
@@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, | |||
1594 | case FILE_MEMORY_PRESSURE_ENABLED: | 1600 | case FILE_MEMORY_PRESSURE_ENABLED: |
1595 | cpuset_memory_pressure_enabled = !!val; | 1601 | cpuset_memory_pressure_enabled = !!val; |
1596 | break; | 1602 | break; |
1597 | case FILE_MEMORY_PRESSURE: | ||
1598 | retval = -EACCES; | ||
1599 | break; | ||
1600 | case FILE_SPREAD_PAGE: | 1603 | case FILE_SPREAD_PAGE: |
1601 | retval = update_flag(CS_SPREAD_PAGE, cs, val); | 1604 | retval = update_flag(CS_SPREAD_PAGE, cs, val); |
1602 | break; | 1605 | break; |
@@ -1863,9 +1866,6 @@ static struct cftype files[] = { | |||
1863 | { | 1866 | { |
1864 | .name = "memory_pressure", | 1867 | .name = "memory_pressure", |
1865 | .read_u64 = cpuset_read_u64, | 1868 | .read_u64 = cpuset_read_u64, |
1866 | .write_u64 = cpuset_write_u64, | ||
1867 | .private = FILE_MEMORY_PRESSURE, | ||
1868 | .mode = S_IRUGO, | ||
1869 | }, | 1869 | }, |
1870 | 1870 | ||
1871 | { | 1871 | { |
@@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
1952 | cpuset_inc(); | 1952 | cpuset_inc(); |
1953 | 1953 | ||
1954 | spin_lock_irq(&callback_lock); | 1954 | spin_lock_irq(&callback_lock); |
1955 | if (cgroup_on_dfl(cs->css.cgroup)) { | 1955 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { |
1956 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); | 1956 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); |
1957 | cs->effective_mems = parent->effective_mems; | 1957 | cs->effective_mems = parent->effective_mems; |
1958 | } | 1958 | } |
@@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) | |||
2029 | mutex_lock(&cpuset_mutex); | 2029 | mutex_lock(&cpuset_mutex); |
2030 | spin_lock_irq(&callback_lock); | 2030 | spin_lock_irq(&callback_lock); |
2031 | 2031 | ||
2032 | if (cgroup_on_dfl(root_css->cgroup)) { | 2032 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { |
2033 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); | 2033 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); |
2034 | top_cpuset.mems_allowed = node_possible_map; | 2034 | top_cpuset.mems_allowed = node_possible_map; |
2035 | } else { | 2035 | } else { |
@@ -2210,7 +2210,7 @@ retry: | |||
2210 | cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); | 2210 | cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); |
2211 | mems_updated = !nodes_equal(new_mems, cs->effective_mems); | 2211 | mems_updated = !nodes_equal(new_mems, cs->effective_mems); |
2212 | 2212 | ||
2213 | if (cgroup_on_dfl(cs->css.cgroup)) | 2213 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) |
2214 | hotplug_update_tasks(cs, &new_cpus, &new_mems, | 2214 | hotplug_update_tasks(cs, &new_cpus, &new_mems, |
2215 | cpus_updated, mems_updated); | 2215 | cpus_updated, mems_updated); |
2216 | else | 2216 | else |
@@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) | |||
2241 | static cpumask_t new_cpus; | 2241 | static cpumask_t new_cpus; |
2242 | static nodemask_t new_mems; | 2242 | static nodemask_t new_mems; |
2243 | bool cpus_updated, mems_updated; | 2243 | bool cpus_updated, mems_updated; |
2244 | bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); | 2244 | bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); |
2245 | 2245 | ||
2246 | mutex_lock(&cpuset_mutex); | 2246 | mutex_lock(&cpuset_mutex); |
2247 | 2247 | ||
@@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, | |||
2598 | } | 2598 | } |
2599 | 2599 | ||
2600 | /** | 2600 | /** |
2601 | * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed | 2601 | * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed |
2602 | * @tsk: pointer to task_struct of some task. | ||
2603 | * | 2602 | * |
2604 | * Description: Prints @task's name, cpuset name, and cached copy of its | 2603 | * Description: Prints current's name, cpuset name, and cached copy of its |
2605 | * mems_allowed to the kernel log. | 2604 | * mems_allowed to the kernel log. |
2606 | */ | 2605 | */ |
2607 | void cpuset_print_task_mems_allowed(struct task_struct *tsk) | 2606 | void cpuset_print_current_mems_allowed(void) |
2608 | { | 2607 | { |
2609 | struct cgroup *cgrp; | 2608 | struct cgroup *cgrp; |
2610 | 2609 | ||
2611 | rcu_read_lock(); | 2610 | rcu_read_lock(); |
2612 | 2611 | ||
2613 | cgrp = task_cs(tsk)->css.cgroup; | 2612 | cgrp = task_cs(current)->css.cgroup; |
2614 | pr_info("%s cpuset=", tsk->comm); | 2613 | pr_info("%s cpuset=", current->comm); |
2615 | pr_cont_cgroup_name(cgrp); | 2614 | pr_cont_cgroup_name(cgrp); |
2616 | pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); | 2615 | pr_cont(" mems_allowed=%*pbl\n", |
2616 | nodemask_pr_args(¤t->mems_allowed)); | ||
2617 | 2617 | ||
2618 | rcu_read_unlock(); | 2618 | rcu_read_unlock(); |
2619 | } | 2619 | } |
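Illustration (not from the patch): the cpuset_attach() hunk now rebinds the mm of every threadgroup leader in the migrating set rather than only the first task returned by the iterator. A sketch of that per-leader loop with made-up task and mm stand-ins:

#include <stdio.h>

struct mm   { int id; };
struct task { int pid; struct mm *mm; };

static struct mm mm_a = { 1 }, mm_b = { 2 };
static struct task leaders[] = {
	{ .pid = 100, .mm = &mm_a },
	{ .pid = 200, .mm = &mm_b },
	{ .pid = 300, .mm = NULL },	/* kernel-thread-like leader: no mm */
};

static void rebind_mm(struct mm *mm, int new_mems)
{
	printf("rebinding mm %d to mems 0x%x\n", mm->id, new_mems);
}

int main(void)
{
	int new_mems = 0x3;

	/* old behaviour: only leaders[0]; new behaviour: every leader */
	for (size_t i = 0; i < sizeof(leaders) / sizeof(leaders[0]); i++) {
		struct mm *mm = leaders[i].mm;	/* get_task_mm() stand-in */

		if (!mm)
			continue;
		rebind_mm(mm, new_mems);
		/* mmput(mm) would drop the reference here */
	}
	return 0;
}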
diff --git a/kernel/events/core.c b/kernel/events/core.c index f548f69c4299..1a734e0adfa7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -196,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | |||
196 | static int perf_sample_allowed_ns __read_mostly = | 196 | static int perf_sample_allowed_ns __read_mostly = |
197 | DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; | 197 | DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; |
198 | 198 | ||
199 | void update_perf_cpu_limits(void) | 199 | static void update_perf_cpu_limits(void) |
200 | { | 200 | { |
201 | u64 tmp = perf_sample_period_ns; | 201 | u64 tmp = perf_sample_period_ns; |
202 | 202 | ||
@@ -472,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, | |||
472 | * mode SWOUT : schedule out everything | 472 | * mode SWOUT : schedule out everything |
473 | * mode SWIN : schedule in based on cgroup for next | 473 | * mode SWIN : schedule in based on cgroup for next |
474 | */ | 474 | */ |
475 | void perf_cgroup_switch(struct task_struct *task, int mode) | 475 | static void perf_cgroup_switch(struct task_struct *task, int mode) |
476 | { | 476 | { |
477 | struct perf_cpu_context *cpuctx; | 477 | struct perf_cpu_context *cpuctx; |
478 | struct pmu *pmu; | 478 | struct pmu *pmu; |
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event) | |||
1243 | PERF_EVENT_STATE_INACTIVE; | 1243 | PERF_EVENT_STATE_INACTIVE; |
1244 | } | 1244 | } |
1245 | 1245 | ||
1246 | /* | 1246 | static void __perf_event_read_size(struct perf_event *event, int nr_siblings) |
1247 | * Called at perf_event creation and when events are attached/detached from a | ||
1248 | * group. | ||
1249 | */ | ||
1250 | static void perf_event__read_size(struct perf_event *event) | ||
1251 | { | 1247 | { |
1252 | int entry = sizeof(u64); /* value */ | 1248 | int entry = sizeof(u64); /* value */ |
1253 | int size = 0; | 1249 | int size = 0; |
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event) | |||
1263 | entry += sizeof(u64); | 1259 | entry += sizeof(u64); |
1264 | 1260 | ||
1265 | if (event->attr.read_format & PERF_FORMAT_GROUP) { | 1261 | if (event->attr.read_format & PERF_FORMAT_GROUP) { |
1266 | nr += event->group_leader->nr_siblings; | 1262 | nr += nr_siblings; |
1267 | size += sizeof(u64); | 1263 | size += sizeof(u64); |
1268 | } | 1264 | } |
1269 | 1265 | ||
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event) | |||
1271 | event->read_size = size; | 1267 | event->read_size = size; |
1272 | } | 1268 | } |
1273 | 1269 | ||
1274 | static void perf_event__header_size(struct perf_event *event) | 1270 | static void __perf_event_header_size(struct perf_event *event, u64 sample_type) |
1275 | { | 1271 | { |
1276 | struct perf_sample_data *data; | 1272 | struct perf_sample_data *data; |
1277 | u64 sample_type = event->attr.sample_type; | ||
1278 | u16 size = 0; | 1273 | u16 size = 0; |
1279 | 1274 | ||
1280 | perf_event__read_size(event); | ||
1281 | |||
1282 | if (sample_type & PERF_SAMPLE_IP) | 1275 | if (sample_type & PERF_SAMPLE_IP) |
1283 | size += sizeof(data->ip); | 1276 | size += sizeof(data->ip); |
1284 | 1277 | ||
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event) | |||
1303 | event->header_size = size; | 1296 | event->header_size = size; |
1304 | } | 1297 | } |
1305 | 1298 | ||
1299 | /* | ||
1300 | * Called at perf_event creation and when events are attached/detached from a | ||
1301 | * group. | ||
1302 | */ | ||
1303 | static void perf_event__header_size(struct perf_event *event) | ||
1304 | { | ||
1305 | __perf_event_read_size(event, | ||
1306 | event->group_leader->nr_siblings); | ||
1307 | __perf_event_header_size(event, event->attr.sample_type); | ||
1308 | } | ||
1309 | |||
1306 | static void perf_event__id_header_size(struct perf_event *event) | 1310 | static void perf_event__id_header_size(struct perf_event *event) |
1307 | { | 1311 | { |
1308 | struct perf_sample_data *data; | 1312 | struct perf_sample_data *data; |
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event) | |||
1330 | event->id_header_size = size; | 1334 | event->id_header_size = size; |
1331 | } | 1335 | } |
1332 | 1336 | ||
1337 | static bool perf_event_validate_size(struct perf_event *event) | ||
1338 | { | ||
1339 | /* | ||
1340 | * The values computed here will be over-written when we actually | ||
1341 | * attach the event. | ||
1342 | */ | ||
1343 | __perf_event_read_size(event, event->group_leader->nr_siblings + 1); | ||
1344 | __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); | ||
1345 | perf_event__id_header_size(event); | ||
1346 | |||
1347 | /* | ||
1348 | * Sum the lot; should not exceed the 64k limit we have on records. | ||
1349 | * Conservative limit to allow for callchains and other variable fields. | ||
1350 | */ | ||
1351 | if (event->read_size + event->header_size + | ||
1352 | event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) | ||
1353 | return false; | ||
1354 | |||
1355 | return true; | ||
1356 | } | ||
1357 | |||
1333 | static void perf_group_attach(struct perf_event *event) | 1358 | static void perf_group_attach(struct perf_event *event) |
1334 | { | 1359 | { |
1335 | struct perf_event *group_leader = event->group_leader, *pos; | 1360 | struct perf_event *group_leader = event->group_leader, *pos; |
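Illustration (not from the patch): perf_event_validate_size() above recomputes the read and header sizes as if the new event had already joined its group (hence nr_siblings + 1) and rejects the open when the total record would approach the 64 KiB limit. The arithmetic in isolation, with simplified format bits and cap:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FMT_TOTAL_TIME_ENABLED	(1u << 0)
#define FMT_TOTAL_TIME_RUNNING	(1u << 1)
#define FMT_ID			(1u << 2)
#define FMT_GROUP		(1u << 3)

static uint32_t read_size(uint32_t fmt, int nr_siblings)
{
	uint32_t entry = sizeof(uint64_t);	/* the value itself */
	uint32_t size = 0;
	int nr = 1;				/* the event being read */

	if (fmt & FMT_TOTAL_TIME_ENABLED)
		size += sizeof(uint64_t);
	if (fmt & FMT_TOTAL_TIME_RUNNING)
		size += sizeof(uint64_t);
	if (fmt & FMT_ID)
		entry += sizeof(uint64_t);
	if (fmt & FMT_GROUP) {
		nr += nr_siblings;
		size += sizeof(uint64_t);	/* the nr field */
	}
	return size + nr * entry;
}

static bool validate(uint32_t fmt, int nr_siblings, uint32_t header_size)
{
	/* pretend this event has already joined the group, hence +1 */
	uint32_t total = read_size(fmt, nr_siblings + 1) + header_size;

	return total < 16 * 1024;	/* conservative cap below 64 KiB */
}

int main(void)
{
	printf("small group ok: %d\n", validate(FMT_GROUP | FMT_ID, 10, 128));
	printf("huge group ok:  %d\n", validate(FMT_GROUP | FMT_ID, 2000, 128));
	return 0;
}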
@@ -1914,7 +1939,7 @@ group_sched_in(struct perf_event *group_event, | |||
1914 | if (group_event->state == PERF_EVENT_STATE_OFF) | 1939 | if (group_event->state == PERF_EVENT_STATE_OFF) |
1915 | return 0; | 1940 | return 0; |
1916 | 1941 | ||
1917 | pmu->start_txn(pmu); | 1942 | pmu->start_txn(pmu, PERF_PMU_TXN_ADD); |
1918 | 1943 | ||
1919 | if (event_sched_in(group_event, cpuctx, ctx)) { | 1944 | if (event_sched_in(group_event, cpuctx, ctx)) { |
1920 | pmu->cancel_txn(pmu); | 1945 | pmu->cancel_txn(pmu); |
@@ -3184,14 +3209,22 @@ void perf_event_exec(void) | |||
3184 | rcu_read_unlock(); | 3209 | rcu_read_unlock(); |
3185 | } | 3210 | } |
3186 | 3211 | ||
3212 | struct perf_read_data { | ||
3213 | struct perf_event *event; | ||
3214 | bool group; | ||
3215 | int ret; | ||
3216 | }; | ||
3217 | |||
3187 | /* | 3218 | /* |
3188 | * Cross CPU call to read the hardware event | 3219 | * Cross CPU call to read the hardware event |
3189 | */ | 3220 | */ |
3190 | static void __perf_event_read(void *info) | 3221 | static void __perf_event_read(void *info) |
3191 | { | 3222 | { |
3192 | struct perf_event *event = info; | 3223 | struct perf_read_data *data = info; |
3224 | struct perf_event *sub, *event = data->event; | ||
3193 | struct perf_event_context *ctx = event->ctx; | 3225 | struct perf_event_context *ctx = event->ctx; |
3194 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 3226 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
3227 | struct pmu *pmu = event->pmu; | ||
3195 | 3228 | ||
3196 | /* | 3229 | /* |
3197 | * If this is a task context, we need to check whether it is | 3230 | * If this is a task context, we need to check whether it is |
@@ -3208,9 +3241,35 @@ static void __perf_event_read(void *info) | |||
3208 | update_context_time(ctx); | 3241 | update_context_time(ctx); |
3209 | update_cgrp_time_from_event(event); | 3242 | update_cgrp_time_from_event(event); |
3210 | } | 3243 | } |
3244 | |||
3211 | update_event_times(event); | 3245 | update_event_times(event); |
3212 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 3246 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
3213 | event->pmu->read(event); | 3247 | goto unlock; |
3248 | |||
3249 | if (!data->group) { | ||
3250 | pmu->read(event); | ||
3251 | data->ret = 0; | ||
3252 | goto unlock; | ||
3253 | } | ||
3254 | |||
3255 | pmu->start_txn(pmu, PERF_PMU_TXN_READ); | ||
3256 | |||
3257 | pmu->read(event); | ||
3258 | |||
3259 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | ||
3260 | update_event_times(sub); | ||
3261 | if (sub->state == PERF_EVENT_STATE_ACTIVE) { | ||
3262 | /* | ||
3263 | * Use sibling's PMU rather than @event's since | ||
3264 | * sibling could be on different (eg: software) PMU. | ||
3265 | */ | ||
3266 | sub->pmu->read(sub); | ||
3267 | } | ||
3268 | } | ||
3269 | |||
3270 | data->ret = pmu->commit_txn(pmu); | ||
3271 | |||
3272 | unlock: | ||
3214 | raw_spin_unlock(&ctx->lock); | 3273 | raw_spin_unlock(&ctx->lock); |
3215 | } | 3274 | } |
3216 | 3275 | ||
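Illustration (not from the patch): the reworked __perf_event_read() brackets a group read in a PMU transaction - start with PERF_PMU_TXN_READ, read the leader and each sibling, then take the commit status as the result. The general shape of that batched read with invented ops:

#include <stdio.h>

enum txn_flag { TXN_ADD = 1, TXN_READ = 2 };

struct pmu_ops {
	void (*start_txn)(enum txn_flag flags);
	void (*read)(int counter);
	int  (*commit_txn)(void);
};

static void demo_start(enum txn_flag flags) { printf("txn start (%d)\n", flags); }
static void demo_read(int counter)          { printf("read counter %d\n", counter); }
static int  demo_commit(void)               { printf("txn commit\n"); return 0; }

static struct pmu_ops demo_pmu = {
	.start_txn  = demo_start,
	.read       = demo_read,
	.commit_txn = demo_commit,
};

static int read_group(struct pmu_ops *pmu, const int *counters, int n)
{
	pmu->start_txn(TXN_READ);
	for (int i = 0; i < n; i++)
		pmu->read(counters[i]);	/* leader first, then siblings */
	return pmu->commit_txn();	/* one status for the whole batch */
}

int main(void)
{
	int group[] = { 7, 8, 9 };

	return read_group(&demo_pmu, group, 3);
}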
@@ -3275,15 +3334,23 @@ u64 perf_event_read_local(struct perf_event *event) | |||
3275 | return val; | 3334 | return val; |
3276 | } | 3335 | } |
3277 | 3336 | ||
3278 | static u64 perf_event_read(struct perf_event *event) | 3337 | static int perf_event_read(struct perf_event *event, bool group) |
3279 | { | 3338 | { |
3339 | int ret = 0; | ||
3340 | |||
3280 | /* | 3341 | /* |
3281 | * If event is enabled and currently active on a CPU, update the | 3342 | * If event is enabled and currently active on a CPU, update the |
3282 | * value in the event structure: | 3343 | * value in the event structure: |
3283 | */ | 3344 | */ |
3284 | if (event->state == PERF_EVENT_STATE_ACTIVE) { | 3345 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
3346 | struct perf_read_data data = { | ||
3347 | .event = event, | ||
3348 | .group = group, | ||
3349 | .ret = 0, | ||
3350 | }; | ||
3285 | smp_call_function_single(event->oncpu, | 3351 | smp_call_function_single(event->oncpu, |
3286 | __perf_event_read, event, 1); | 3352 | __perf_event_read, &data, 1); |
3353 | ret = data.ret; | ||
3287 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { | 3354 | } else if (event->state == PERF_EVENT_STATE_INACTIVE) { |
3288 | struct perf_event_context *ctx = event->ctx; | 3355 | struct perf_event_context *ctx = event->ctx; |
3289 | unsigned long flags; | 3356 | unsigned long flags; |
@@ -3298,11 +3365,14 @@ static u64 perf_event_read(struct perf_event *event) | |||
3298 | update_context_time(ctx); | 3365 | update_context_time(ctx); |
3299 | update_cgrp_time_from_event(event); | 3366 | update_cgrp_time_from_event(event); |
3300 | } | 3367 | } |
3301 | update_event_times(event); | 3368 | if (group) |
3369 | update_group_times(event); | ||
3370 | else | ||
3371 | update_event_times(event); | ||
3302 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 3372 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
3303 | } | 3373 | } |
3304 | 3374 | ||
3305 | return perf_event_count(event); | 3375 | return ret; |
3306 | } | 3376 | } |
3307 | 3377 | ||
3308 | /* | 3378 | /* |
@@ -3744,7 +3814,7 @@ static void put_event(struct perf_event *event) | |||
3744 | * see the comment there. | 3814 | * see the comment there. |
3745 | * | 3815 | * |
3746 | * 2) there is a lock-inversion with mmap_sem through | 3816 | * 2) there is a lock-inversion with mmap_sem through |
3747 | * perf_event_read_group(), which takes faults while | 3817 | * perf_read_group(), which takes faults while |
3748 | * holding ctx->mutex, however this is called after | 3818 | * holding ctx->mutex, however this is called after |
3749 | * the last filedesc died, so there is no possibility | 3819 | * the last filedesc died, so there is no possibility |
3750 | * to trigger the AB-BA case. | 3820 | * to trigger the AB-BA case. |
@@ -3818,14 +3888,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | |||
3818 | *running = 0; | 3888 | *running = 0; |
3819 | 3889 | ||
3820 | mutex_lock(&event->child_mutex); | 3890 | mutex_lock(&event->child_mutex); |
3821 | total += perf_event_read(event); | 3891 | |
3892 | (void)perf_event_read(event, false); | ||
3893 | total += perf_event_count(event); | ||
3894 | |||
3822 | *enabled += event->total_time_enabled + | 3895 | *enabled += event->total_time_enabled + |
3823 | atomic64_read(&event->child_total_time_enabled); | 3896 | atomic64_read(&event->child_total_time_enabled); |
3824 | *running += event->total_time_running + | 3897 | *running += event->total_time_running + |
3825 | atomic64_read(&event->child_total_time_running); | 3898 | atomic64_read(&event->child_total_time_running); |
3826 | 3899 | ||
3827 | list_for_each_entry(child, &event->child_list, child_list) { | 3900 | list_for_each_entry(child, &event->child_list, child_list) { |
3828 | total += perf_event_read(child); | 3901 | (void)perf_event_read(child, false); |
3902 | total += perf_event_count(child); | ||
3829 | *enabled += child->total_time_enabled; | 3903 | *enabled += child->total_time_enabled; |
3830 | *running += child->total_time_running; | 3904 | *running += child->total_time_running; |
3831 | } | 3905 | } |
@@ -3835,55 +3909,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) | |||
3835 | } | 3909 | } |
3836 | EXPORT_SYMBOL_GPL(perf_event_read_value); | 3910 | EXPORT_SYMBOL_GPL(perf_event_read_value); |
3837 | 3911 | ||
3838 | static int perf_event_read_group(struct perf_event *event, | 3912 | static int __perf_read_group_add(struct perf_event *leader, |
3839 | u64 read_format, char __user *buf) | 3913 | u64 read_format, u64 *values) |
3840 | { | 3914 | { |
3841 | struct perf_event *leader = event->group_leader, *sub; | 3915 | struct perf_event *sub; |
3842 | struct perf_event_context *ctx = leader->ctx; | 3916 | int n = 1; /* skip @nr */ |
3843 | int n = 0, size = 0, ret; | 3917 | int ret; |
3844 | u64 count, enabled, running; | ||
3845 | u64 values[5]; | ||
3846 | 3918 | ||
3847 | lockdep_assert_held(&ctx->mutex); | 3919 | ret = perf_event_read(leader, true); |
3920 | if (ret) | ||
3921 | return ret; | ||
3922 | |||
3923 | /* | ||
3924 | * Since we co-schedule groups, {enabled,running} times of siblings | ||
3925 | * will be identical to those of the leader, so we only publish one | ||
3926 | * set. | ||
3927 | */ | ||
3928 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | ||
3929 | values[n++] += leader->total_time_enabled + | ||
3930 | atomic64_read(&leader->child_total_time_enabled); | ||
3931 | } | ||
3848 | 3932 | ||
3849 | count = perf_event_read_value(leader, &enabled, &running); | 3933 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { |
3934 | values[n++] += leader->total_time_running + | ||
3935 | atomic64_read(&leader->child_total_time_running); | ||
3936 | } | ||
3850 | 3937 | ||
3851 | values[n++] = 1 + leader->nr_siblings; | 3938 | /* |
3852 | if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | 3939 | * Write {count,id} tuples for every sibling. |
3853 | values[n++] = enabled; | 3940 | */ |
3854 | if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | 3941 | values[n++] += perf_event_count(leader); |
3855 | values[n++] = running; | ||
3856 | values[n++] = count; | ||
3857 | if (read_format & PERF_FORMAT_ID) | 3942 | if (read_format & PERF_FORMAT_ID) |
3858 | values[n++] = primary_event_id(leader); | 3943 | values[n++] = primary_event_id(leader); |
3859 | 3944 | ||
3860 | size = n * sizeof(u64); | 3945 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
3946 | values[n++] += perf_event_count(sub); | ||
3947 | if (read_format & PERF_FORMAT_ID) | ||
3948 | values[n++] = primary_event_id(sub); | ||
3949 | } | ||
3861 | 3950 | ||
3862 | if (copy_to_user(buf, values, size)) | 3951 | return 0; |
3863 | return -EFAULT; | 3952 | } |
3864 | 3953 | ||
3865 | ret = size; | 3954 | static int perf_read_group(struct perf_event *event, |
3955 | u64 read_format, char __user *buf) | ||
3956 | { | ||
3957 | struct perf_event *leader = event->group_leader, *child; | ||
3958 | struct perf_event_context *ctx = leader->ctx; | ||
3959 | int ret; | ||
3960 | u64 *values; | ||
3866 | 3961 | ||
3867 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 3962 | lockdep_assert_held(&ctx->mutex); |
3868 | n = 0; | ||
3869 | 3963 | ||
3870 | values[n++] = perf_event_read_value(sub, &enabled, &running); | 3964 | values = kzalloc(event->read_size, GFP_KERNEL); |
3871 | if (read_format & PERF_FORMAT_ID) | 3965 | if (!values) |
3872 | values[n++] = primary_event_id(sub); | 3966 | return -ENOMEM; |
3873 | 3967 | ||
3874 | size = n * sizeof(u64); | 3968 | values[0] = 1 + leader->nr_siblings; |
3875 | 3969 | ||
3876 | if (copy_to_user(buf + ret, values, size)) { | 3970 | /* |
3877 | return -EFAULT; | 3971 | * By locking the child_mutex of the leader we effectively |
3878 | } | 3972 | * lock the child list of all siblings.. XXX explain how. |
3973 | */ | ||
3974 | mutex_lock(&leader->child_mutex); | ||
3879 | 3975 | ||
3880 | ret += size; | 3976 | ret = __perf_read_group_add(leader, read_format, values); |
3977 | if (ret) | ||
3978 | goto unlock; | ||
3979 | |||
3980 | list_for_each_entry(child, &leader->child_list, child_list) { | ||
3981 | ret = __perf_read_group_add(child, read_format, values); | ||
3982 | if (ret) | ||
3983 | goto unlock; | ||
3881 | } | 3984 | } |
3882 | 3985 | ||
3986 | mutex_unlock(&leader->child_mutex); | ||
3987 | |||
3988 | ret = event->read_size; | ||
3989 | if (copy_to_user(buf, values, event->read_size)) | ||
3990 | ret = -EFAULT; | ||
3991 | goto out; | ||
3992 | |||
3993 | unlock: | ||
3994 | mutex_unlock(&leader->child_mutex); | ||
3995 | out: | ||
3996 | kfree(values); | ||
3883 | return ret; | 3997 | return ret; |
3884 | } | 3998 | } |
3885 | 3999 | ||
3886 | static int perf_event_read_one(struct perf_event *event, | 4000 | static int perf_read_one(struct perf_event *event, |
3887 | u64 read_format, char __user *buf) | 4001 | u64 read_format, char __user *buf) |
3888 | { | 4002 | { |
3889 | u64 enabled, running; | 4003 | u64 enabled, running; |
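Illustration (not from the patch): perf_read_group() above fills one kzalloc'd buffer of event->read_size bytes - slot 0 holds the member count, optional time fields follow, then a {count,id} tuple per member - and repeats the fill for each child event so counts accumulate in place. A standalone sketch of that layout and accumulation, with made-up values:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FMT_ID (1u << 0)

/* one pass over a group: accumulate counts, (re)write ids */
static void fill_group(uint64_t *values, uint32_t fmt,
		       const uint64_t *counts, const uint64_t *ids, int nr)
{
	int n = 1;			/* slot 0 already holds nr of members */

	for (int i = 0; i < nr; i++) {
		values[n++] += counts[i];	/* accumulates across events */
		if (fmt & FMT_ID)
			values[n++] = ids[i];
	}
}

int main(void)
{
	uint64_t parent_counts[] = { 100, 40 }, child_counts[] = { 7, 3 };
	uint64_t ids[] = { 11, 12 };
	int nr = 2;
	uint64_t *values = calloc(1 + 2 * nr, sizeof(*values));

	if (!values)
		return 1;
	values[0] = nr;				/* member count */
	fill_group(values, FMT_ID, parent_counts, ids, nr);
	fill_group(values, FMT_ID, child_counts, ids, nr);	/* a child event */

	for (int i = 0; i < 1 + 2 * nr; i++)
		printf("values[%d] = %llu\n", i, (unsigned long long)values[i]);
	free(values);
	return 0;
}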
@@ -3921,7 +4035,7 @@ static bool is_event_hup(struct perf_event *event) | |||
3921 | * Read the performance event - simple non blocking version for now | 4035 | * Read the performance event - simple non blocking version for now |
3922 | */ | 4036 | */ |
3923 | static ssize_t | 4037 | static ssize_t |
3924 | perf_read_hw(struct perf_event *event, char __user *buf, size_t count) | 4038 | __perf_read(struct perf_event *event, char __user *buf, size_t count) |
3925 | { | 4039 | { |
3926 | u64 read_format = event->attr.read_format; | 4040 | u64 read_format = event->attr.read_format; |
3927 | int ret; | 4041 | int ret; |
@@ -3939,9 +4053,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count) | |||
3939 | 4053 | ||
3940 | WARN_ON_ONCE(event->ctx->parent_ctx); | 4054 | WARN_ON_ONCE(event->ctx->parent_ctx); |
3941 | if (read_format & PERF_FORMAT_GROUP) | 4055 | if (read_format & PERF_FORMAT_GROUP) |
3942 | ret = perf_event_read_group(event, read_format, buf); | 4056 | ret = perf_read_group(event, read_format, buf); |
3943 | else | 4057 | else |
3944 | ret = perf_event_read_one(event, read_format, buf); | 4058 | ret = perf_read_one(event, read_format, buf); |
3945 | 4059 | ||
3946 | return ret; | 4060 | return ret; |
3947 | } | 4061 | } |
@@ -3954,7 +4068,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
3954 | int ret; | 4068 | int ret; |
3955 | 4069 | ||
3956 | ctx = perf_event_ctx_lock(event); | 4070 | ctx = perf_event_ctx_lock(event); |
3957 | ret = perf_read_hw(event, buf, count); | 4071 | ret = __perf_read(event, buf, count); |
3958 | perf_event_ctx_unlock(event, ctx); | 4072 | perf_event_ctx_unlock(event, ctx); |
3959 | 4073 | ||
3960 | return ret; | 4074 | return ret; |
@@ -3985,7 +4099,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
3985 | 4099 | ||
3986 | static void _perf_event_reset(struct perf_event *event) | 4100 | static void _perf_event_reset(struct perf_event *event) |
3987 | { | 4101 | { |
3988 | (void)perf_event_read(event); | 4102 | (void)perf_event_read(event, false); |
3989 | local64_set(&event->count, 0); | 4103 | local64_set(&event->count, 0); |
3990 | perf_event_update_userpage(event); | 4104 | perf_event_update_userpage(event); |
3991 | } | 4105 | } |
@@ -5261,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
5261 | 5375 | ||
5262 | if (sample_type & PERF_SAMPLE_RAW) { | 5376 | if (sample_type & PERF_SAMPLE_RAW) { |
5263 | if (data->raw) { | 5377 | if (data->raw) { |
5264 | perf_output_put(handle, data->raw->size); | 5378 | u32 raw_size = data->raw->size; |
5265 | __output_copy(handle, data->raw->data, | 5379 | u32 real_size = round_up(raw_size + sizeof(u32), |
5266 | data->raw->size); | 5380 | sizeof(u64)) - sizeof(u32); |
5381 | u64 zero = 0; | ||
5382 | |||
5383 | perf_output_put(handle, real_size); | ||
5384 | __output_copy(handle, data->raw->data, raw_size); | ||
5385 | if (real_size - raw_size) | ||
5386 | __output_copy(handle, &zero, real_size - raw_size); | ||
5267 | } else { | 5387 | } else { |
5268 | struct { | 5388 | struct { |
5269 | u32 size; | 5389 | u32 size; |
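Illustration (not from the patch): the PERF_SAMPLE_RAW hunk rounds the size word plus payload up to a u64 boundary and zero-fills the gap so records stay 8-byte aligned. The padding arithmetic on its own:

#include <stdint.h>
#include <stdio.h>

static uint32_t round_up_u64(uint32_t x)
{
	return (x + 7u) & ~7u;	/* round up to a multiple of 8 */
}

int main(void)
{
	for (uint32_t raw_size = 1; raw_size <= 8; raw_size++) {
		uint32_t real_size = round_up_u64(raw_size + sizeof(uint32_t))
				     - sizeof(uint32_t);
		uint32_t pad = real_size - raw_size;

		printf("raw=%u -> stored size=%u, zero padding=%u bytes\n",
		       raw_size, real_size, pad);
	}
	return 0;
}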
@@ -5395,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
5395 | else | 5515 | else |
5396 | size += sizeof(u32); | 5516 | size += sizeof(u32); |
5397 | 5517 | ||
5398 | WARN_ON_ONCE(size & (sizeof(u64)-1)); | 5518 | header->size += round_up(size, sizeof(u64)); |
5399 | header->size += size; | ||
5400 | } | 5519 | } |
5401 | 5520 | ||
5402 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { | 5521 | if (sample_type & PERF_SAMPLE_BRANCH_STACK) { |
@@ -7267,24 +7386,49 @@ static void perf_pmu_nop_void(struct pmu *pmu) | |||
7267 | { | 7386 | { |
7268 | } | 7387 | } |
7269 | 7388 | ||
7389 | static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags) | ||
7390 | { | ||
7391 | } | ||
7392 | |||
7270 | static int perf_pmu_nop_int(struct pmu *pmu) | 7393 | static int perf_pmu_nop_int(struct pmu *pmu) |
7271 | { | 7394 | { |
7272 | return 0; | 7395 | return 0; |
7273 | } | 7396 | } |
7274 | 7397 | ||
7275 | static void perf_pmu_start_txn(struct pmu *pmu) | 7398 | static DEFINE_PER_CPU(unsigned int, nop_txn_flags); |
7399 | |||
7400 | static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) | ||
7276 | { | 7401 | { |
7402 | __this_cpu_write(nop_txn_flags, flags); | ||
7403 | |||
7404 | if (flags & ~PERF_PMU_TXN_ADD) | ||
7405 | return; | ||
7406 | |||
7277 | perf_pmu_disable(pmu); | 7407 | perf_pmu_disable(pmu); |
7278 | } | 7408 | } |
7279 | 7409 | ||
7280 | static int perf_pmu_commit_txn(struct pmu *pmu) | 7410 | static int perf_pmu_commit_txn(struct pmu *pmu) |
7281 | { | 7411 | { |
7412 | unsigned int flags = __this_cpu_read(nop_txn_flags); | ||
7413 | |||
7414 | __this_cpu_write(nop_txn_flags, 0); | ||
7415 | |||
7416 | if (flags & ~PERF_PMU_TXN_ADD) | ||
7417 | return 0; | ||
7418 | |||
7282 | perf_pmu_enable(pmu); | 7419 | perf_pmu_enable(pmu); |
7283 | return 0; | 7420 | return 0; |
7284 | } | 7421 | } |
7285 | 7422 | ||
7286 | static void perf_pmu_cancel_txn(struct pmu *pmu) | 7423 | static void perf_pmu_cancel_txn(struct pmu *pmu) |
7287 | { | 7424 | { |
7425 | unsigned int flags = __this_cpu_read(nop_txn_flags); | ||
7426 | |||
7427 | __this_cpu_write(nop_txn_flags, 0); | ||
7428 | |||
7429 | if (flags & ~PERF_PMU_TXN_ADD) | ||
7430 | return; | ||
7431 | |||
7288 | perf_pmu_enable(pmu); | 7432 | perf_pmu_enable(pmu); |
7289 | } | 7433 | } |
7290 | 7434 | ||
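Illustration (not from the patch): the nop transaction helpers now stash the flags passed to start_txn and only disable/re-enable the PMU for an ADD transaction, letting READ transactions pass through untouched. The same logic with a plain global standing in for the per-cpu variable:

#include <stdio.h>

#define TXN_ADD  1u
#define TXN_READ 2u

static unsigned int txn_flags;	/* stand-in for the per-cpu variable */

static void pmu_disable(void) { printf("pmu disabled\n"); }
static void pmu_enable(void)  { printf("pmu enabled\n"); }

static void nop_start_txn(unsigned int flags)
{
	txn_flags = flags;
	if (flags & ~TXN_ADD)	/* anything but ADD: do nothing */
		return;
	pmu_disable();
}

static int nop_commit_txn(void)
{
	unsigned int flags = txn_flags;

	txn_flags = 0;
	if (flags & ~TXN_ADD)
		return 0;
	pmu_enable();
	return 0;
}

int main(void)
{
	nop_start_txn(TXN_READ);	/* no PMU toggling */
	nop_commit_txn();
	nop_start_txn(TXN_ADD);		/* disables, then re-enables */
	nop_commit_txn();
	return 0;
}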
@@ -7523,7 +7667,7 @@ got_cpu_context: | |||
7523 | pmu->commit_txn = perf_pmu_commit_txn; | 7667 | pmu->commit_txn = perf_pmu_commit_txn; |
7524 | pmu->cancel_txn = perf_pmu_cancel_txn; | 7668 | pmu->cancel_txn = perf_pmu_cancel_txn; |
7525 | } else { | 7669 | } else { |
7526 | pmu->start_txn = perf_pmu_nop_void; | 7670 | pmu->start_txn = perf_pmu_nop_txn; |
7527 | pmu->commit_txn = perf_pmu_nop_int; | 7671 | pmu->commit_txn = perf_pmu_nop_int; |
7528 | pmu->cancel_txn = perf_pmu_nop_void; | 7672 | pmu->cancel_txn = perf_pmu_nop_void; |
7529 | } | 7673 | } |
@@ -7611,7 +7755,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | |||
7611 | return ret; | 7755 | return ret; |
7612 | } | 7756 | } |
7613 | 7757 | ||
7614 | struct pmu *perf_init_event(struct perf_event *event) | 7758 | static struct pmu *perf_init_event(struct perf_event *event) |
7615 | { | 7759 | { |
7616 | struct pmu *pmu = NULL; | 7760 | struct pmu *pmu = NULL; |
7617 | int idx; | 7761 | int idx; |
@@ -8297,13 +8441,35 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8297 | 8441 | ||
8298 | if (move_group) { | 8442 | if (move_group) { |
8299 | gctx = group_leader->ctx; | 8443 | gctx = group_leader->ctx; |
8444 | mutex_lock_double(&gctx->mutex, &ctx->mutex); | ||
8445 | } else { | ||
8446 | mutex_lock(&ctx->mutex); | ||
8447 | } | ||
8448 | |||
8449 | if (!perf_event_validate_size(event)) { | ||
8450 | err = -E2BIG; | ||
8451 | goto err_locked; | ||
8452 | } | ||
8453 | |||
8454 | /* | ||
8455 | * Must be under the same ctx::mutex as perf_install_in_context(), | ||
8456 | * because we need to serialize with concurrent event creation. | ||
8457 | */ | ||
8458 | if (!exclusive_event_installable(event, ctx)) { | ||
8459 | /* exclusive and group stuff are assumed mutually exclusive */ | ||
8460 | WARN_ON_ONCE(move_group); | ||
8300 | 8461 | ||
8462 | err = -EBUSY; | ||
8463 | goto err_locked; | ||
8464 | } | ||
8465 | |||
8466 | WARN_ON_ONCE(ctx->parent_ctx); | ||
8467 | |||
8468 | if (move_group) { | ||
8301 | /* | 8469 | /* |
8302 | * See perf_event_ctx_lock() for comments on the details | 8470 | * See perf_event_ctx_lock() for comments on the details |
8303 | * of swizzling perf_event::ctx. | 8471 | * of swizzling perf_event::ctx. |
8304 | */ | 8472 | */ |
8305 | mutex_lock_double(&gctx->mutex, &ctx->mutex); | ||
8306 | |||
8307 | perf_remove_from_context(group_leader, false); | 8473 | perf_remove_from_context(group_leader, false); |
8308 | 8474 | ||
8309 | list_for_each_entry(sibling, &group_leader->sibling_list, | 8475 | list_for_each_entry(sibling, &group_leader->sibling_list, |
@@ -8311,13 +8477,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8311 | perf_remove_from_context(sibling, false); | 8477 | perf_remove_from_context(sibling, false); |
8312 | put_ctx(gctx); | 8478 | put_ctx(gctx); |
8313 | } | 8479 | } |
8314 | } else { | ||
8315 | mutex_lock(&ctx->mutex); | ||
8316 | } | ||
8317 | 8480 | ||
8318 | WARN_ON_ONCE(ctx->parent_ctx); | ||
8319 | |||
8320 | if (move_group) { | ||
8321 | /* | 8481 | /* |
8322 | * Wait for everybody to stop referencing the events through | 8482 | * Wait for everybody to stop referencing the events through |
8323 | * the old lists, before installing it on new lists. | 8483 | * the old lists, before installing it on new lists. |
@@ -8349,22 +8509,29 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8349 | perf_event__state_init(group_leader); | 8509 | perf_event__state_init(group_leader); |
8350 | perf_install_in_context(ctx, group_leader, group_leader->cpu); | 8510 | perf_install_in_context(ctx, group_leader, group_leader->cpu); |
8351 | get_ctx(ctx); | 8511 | get_ctx(ctx); |
8352 | } | ||
8353 | 8512 | ||
8354 | if (!exclusive_event_installable(event, ctx)) { | 8513 | /* |
8355 | err = -EBUSY; | 8514 | * Now that all events are installed in @ctx, nothing |
8356 | mutex_unlock(&ctx->mutex); | 8515 | * references @gctx anymore, so drop the last reference we have |
8357 | fput(event_file); | 8516 | * on it. |
8358 | goto err_context; | 8517 | */ |
8518 | put_ctx(gctx); | ||
8359 | } | 8519 | } |
8360 | 8520 | ||
8521 | /* | ||
8522 | * Precalculate sample_data sizes; do while holding ctx::mutex such | ||
8523 | * that we're serialized against further additions and before | ||
8524 | * perf_install_in_context() which is the point the event is active and | ||
8525 | * can use these values. | ||
8526 | */ | ||
8527 | perf_event__header_size(event); | ||
8528 | perf_event__id_header_size(event); | ||
8529 | |||
8361 | perf_install_in_context(ctx, event, event->cpu); | 8530 | perf_install_in_context(ctx, event, event->cpu); |
8362 | perf_unpin_context(ctx); | 8531 | perf_unpin_context(ctx); |
8363 | 8532 | ||
8364 | if (move_group) { | 8533 | if (move_group) |
8365 | mutex_unlock(&gctx->mutex); | 8534 | mutex_unlock(&gctx->mutex); |
8366 | put_ctx(gctx); | ||
8367 | } | ||
8368 | mutex_unlock(&ctx->mutex); | 8535 | mutex_unlock(&ctx->mutex); |
8369 | 8536 | ||
8370 | put_online_cpus(); | 8537 | put_online_cpus(); |
@@ -8376,12 +8543,6 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8376 | mutex_unlock(¤t->perf_event_mutex); | 8543 | mutex_unlock(¤t->perf_event_mutex); |
8377 | 8544 | ||
8378 | /* | 8545 | /* |
8379 | * Precalculate sample_data sizes | ||
8380 | */ | ||
8381 | perf_event__header_size(event); | ||
8382 | perf_event__id_header_size(event); | ||
8383 | |||
8384 | /* | ||
8385 | * Drop the reference on the group_event after placing the | 8546 | * Drop the reference on the group_event after placing the |
8386 | * new event on the sibling_list. This ensures destruction | 8547 | * new event on the sibling_list. This ensures destruction |
8387 | * of the group leader will find the pointer to itself in | 8548 | * of the group leader will find the pointer to itself in |
@@ -8391,6 +8552,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
8391 | fd_install(event_fd, event_file); | 8552 | fd_install(event_fd, event_file); |
8392 | return event_fd; | 8553 | return event_fd; |
8393 | 8554 | ||
8555 | err_locked: | ||
8556 | if (move_group) | ||
8557 | mutex_unlock(&gctx->mutex); | ||
8558 | mutex_unlock(&ctx->mutex); | ||
8559 | /* err_file: */ | ||
8560 | fput(event_file); | ||
8394 | err_context: | 8561 | err_context: |
8395 | perf_unpin_context(ctx); | 8562 | perf_unpin_context(ctx); |
8396 | put_ctx(ctx); | 8563 | put_ctx(ctx); |
@@ -9293,25 +9460,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, | |||
9293 | task_function_call(task, __perf_cgroup_move, task); | 9460 | task_function_call(task, __perf_cgroup_move, task); |
9294 | } | 9461 | } |
9295 | 9462 | ||
9296 | static void perf_cgroup_exit(struct cgroup_subsys_state *css, | ||
9297 | struct cgroup_subsys_state *old_css, | ||
9298 | struct task_struct *task) | ||
9299 | { | ||
9300 | /* | ||
9301 | * cgroup_exit() is called in the copy_process() failure path. | ||
9302 | * Ignore this case since the task hasn't ran yet, this avoids | ||
9303 | * trying to poke a half freed task state from generic code. | ||
9304 | */ | ||
9305 | if (!(task->flags & PF_EXITING)) | ||
9306 | return; | ||
9307 | |||
9308 | task_function_call(task, __perf_cgroup_move, task); | ||
9309 | } | ||
9310 | |||
9311 | struct cgroup_subsys perf_event_cgrp_subsys = { | 9463 | struct cgroup_subsys perf_event_cgrp_subsys = { |
9312 | .css_alloc = perf_cgroup_css_alloc, | 9464 | .css_alloc = perf_cgroup_css_alloc, |
9313 | .css_free = perf_cgroup_css_free, | 9465 | .css_free = perf_cgroup_css_free, |
9314 | .exit = perf_cgroup_exit, | ||
9315 | .attach = perf_cgroup_attach, | 9466 | .attach = perf_cgroup_attach, |
9316 | }; | 9467 | }; |
9317 | #endif /* CONFIG_CGROUP_PERF */ | 9468 | #endif /* CONFIG_CGROUP_PERF */ |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 182bc30899d5..b5d1ea79c595 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle, | |||
141 | perf_output_get_handle(handle); | 141 | perf_output_get_handle(handle); |
142 | 142 | ||
143 | do { | 143 | do { |
144 | tail = READ_ONCE_CTRL(rb->user_page->data_tail); | 144 | tail = READ_ONCE(rb->user_page->data_tail); |
145 | offset = head = local_read(&rb->head); | 145 | offset = head = local_read(&rb->head); |
146 | if (!rb->overwrite && | 146 | if (!rb->overwrite && |
147 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) | 147 | unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) |
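Illustration (not from the patch): the ring_buffer.c change reads the consumer's data_tail with plain READ_ONCE() before the CIRC_SPACE() room check. A self-contained sketch of that producer-side space check; a volatile read stands in for READ_ONCE, and the barriers the kernel also relies on are not shown:

#include <stdbool.h>
#include <stdio.h>

#define BUF_SIZE 64u	/* power of two, like the perf data area */

/* free space in a circular buffer of size BUF_SIZE (mirrors CIRC_SPACE) */
static unsigned circ_space(unsigned head, unsigned tail)
{
	return (tail - (head + 1)) & (BUF_SIZE - 1);
}

static volatile unsigned data_tail;	/* advanced by the consumer */

static bool try_reserve(unsigned *head, unsigned size)
{
	unsigned tail = data_tail;	/* one sample of the consumer's tail */

	if (circ_space(*head, tail) < size)
		return false;		/* would overwrite unread data */
	*head += size;
	return true;
}

int main(void)
{
	unsigned head = 0;

	data_tail = 0;
	printf("reserve 32: %d\n", try_reserve(&head, 32));
	printf("reserve 32: %d\n", try_reserve(&head, 32));	/* no room left */
	data_tail = 32;						/* consumer caught up */
	printf("reserve 32: %d\n", try_reserve(&head, 32));
	return 0;
}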
diff --git a/kernel/exit.c b/kernel/exit.c index ea95ee1b5ef7..07110c6020a0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -706,10 +706,12 @@ void do_exit(long code) | |||
706 | smp_mb(); | 706 | smp_mb(); |
707 | raw_spin_unlock_wait(&tsk->pi_lock); | 707 | raw_spin_unlock_wait(&tsk->pi_lock); |
708 | 708 | ||
709 | if (unlikely(in_atomic())) | 709 | if (unlikely(in_atomic())) { |
710 | pr_info("note: %s[%d] exited with preempt_count %d\n", | 710 | pr_info("note: %s[%d] exited with preempt_count %d\n", |
711 | current->comm, task_pid_nr(current), | 711 | current->comm, task_pid_nr(current), |
712 | preempt_count()); | 712 | preempt_count()); |
713 | preempt_count_set(PREEMPT_ENABLED); | ||
714 | } | ||
713 | 715 | ||
714 | /* sync mm's RSS info before statistics gathering */ | 716 | /* sync mm's RSS info before statistics gathering */ |
715 | if (tsk->mm) | 717 | if (tsk->mm) |
@@ -761,7 +763,9 @@ void do_exit(long code) | |||
761 | */ | 763 | */ |
762 | flush_ptrace_hw_breakpoint(tsk); | 764 | flush_ptrace_hw_breakpoint(tsk); |
763 | 765 | ||
766 | TASKS_RCU(preempt_disable()); | ||
764 | TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); | 767 | TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); |
768 | TASKS_RCU(preempt_enable()); | ||
765 | exit_notify(tsk, group_dead); | 769 | exit_notify(tsk, group_dead); |
766 | proc_exit_connector(tsk); | 770 | proc_exit_connector(tsk); |
767 | #ifdef CONFIG_NUMA | 771 | #ifdef CONFIG_NUMA |
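Illustration (not from the patch): the do_exit() hunk now resets a leaked preempt count to its baseline after logging it, instead of carrying the bogus value into the final teardown. The minimal shape of that log-then-reset fixup, with a plain integer standing in for PREEMPT_ENABLED:

#include <stdio.h>

#define BASELINE 0	/* stand-in for the PREEMPT_ENABLED baseline */

static int preempt_count = 2;	/* pretend someone leaked two disables */

int main(void)
{
	if (preempt_count != BASELINE) {
		printf("note: exiting with preempt_count %d\n", preempt_count);
		preempt_count = BASELINE;	/* reset so teardown can proceed */
	}
	return 0;
}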
diff --git a/kernel/fork.c b/kernel/fork.c index 7d5f0f118a63..f97f2c449f5c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk) | |||
251 | WARN_ON(atomic_read(&tsk->usage)); | 251 | WARN_ON(atomic_read(&tsk->usage)); |
252 | WARN_ON(tsk == current); | 252 | WARN_ON(tsk == current); |
253 | 253 | ||
254 | cgroup_free(tsk); | ||
254 | task_numa_free(tsk); | 255 | task_numa_free(tsk); |
255 | security_task_free(tsk); | 256 | security_task_free(tsk); |
256 | exit_creds(tsk); | 257 | exit_creds(tsk); |
@@ -454,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
454 | tmp->vm_mm = mm; | 455 | tmp->vm_mm = mm; |
455 | if (anon_vma_fork(tmp, mpnt)) | 456 | if (anon_vma_fork(tmp, mpnt)) |
456 | goto fail_nomem_anon_vma_fork; | 457 | goto fail_nomem_anon_vma_fork; |
457 | tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); | 458 | tmp->vm_flags &= |
459 | ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP); | ||
458 | tmp->vm_next = tmp->vm_prev = NULL; | 460 | tmp->vm_next = tmp->vm_prev = NULL; |
459 | tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; | 461 | tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; |
460 | file = tmp->vm_file; | 462 | file = tmp->vm_file; |
@@ -1101,7 +1103,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) | |||
1101 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | 1103 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
1102 | if (cpu_limit != RLIM_INFINITY) { | 1104 | if (cpu_limit != RLIM_INFINITY) { |
1103 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); | 1105 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); |
1104 | sig->cputimer.running = 1; | 1106 | sig->cputimer.running = true; |
1105 | } | 1107 | } |
1106 | 1108 | ||
1107 | /* The timer lists. */ | 1109 | /* The timer lists. */ |
diff --git a/kernel/futex.c b/kernel/futex.c index 6e443efc65f4..684d7549825a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -255,9 +255,18 @@ struct futex_hash_bucket { | |||
255 | struct plist_head chain; | 255 | struct plist_head chain; |
256 | } ____cacheline_aligned_in_smp; | 256 | } ____cacheline_aligned_in_smp; |
257 | 257 | ||
258 | static unsigned long __read_mostly futex_hashsize; | 258 | /* |
259 | * The base of the bucket array and its size are always used together | ||
260 | * (after initialization only in hash_futex()), so ensure that they | ||
261 | * reside in the same cacheline. | ||
262 | */ | ||
263 | static struct { | ||
264 | struct futex_hash_bucket *queues; | ||
265 | unsigned long hashsize; | ||
266 | } __futex_data __read_mostly __aligned(2*sizeof(long)); | ||
267 | #define futex_queues (__futex_data.queues) | ||
268 | #define futex_hashsize (__futex_data.hashsize) | ||
259 | 269 | ||
260 | static struct futex_hash_bucket *futex_queues; | ||
261 | 270 | ||
262 | /* | 271 | /* |
263 | * Fault injections for futexes. | 272 | * Fault injections for futexes. |
@@ -267,10 +276,10 @@ static struct futex_hash_bucket *futex_queues; | |||
267 | static struct { | 276 | static struct { |
268 | struct fault_attr attr; | 277 | struct fault_attr attr; |
269 | 278 | ||
270 | u32 ignore_private; | 279 | bool ignore_private; |
271 | } fail_futex = { | 280 | } fail_futex = { |
272 | .attr = FAULT_ATTR_INITIALIZER, | 281 | .attr = FAULT_ATTR_INITIALIZER, |
273 | .ignore_private = 0, | 282 | .ignore_private = false, |
274 | }; | 283 | }; |
275 | 284 | ||
276 | static int __init setup_fail_futex(char *str) | 285 | static int __init setup_fail_futex(char *str) |
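The packing idiom introduced above generalizes: two read-mostly globals that are always dereferenced together can be wrapped in a single aligned struct, with #defines preserving the old identifiers so no call site changes. A minimal sketch of the same pattern under invented names (struct bucket, lookup_*):

    /* Keep the table base and its size in the same cacheline; the
     * #defines keep existing code that uses the old names compiling. */
    static struct {
            struct bucket   *table;         /* hash table base */
            unsigned long   size;           /* number of buckets */
    } __lookup_data __read_mostly __aligned(2 * sizeof(long));
    #define lookup_table    (__lookup_data.table)
    #define lookup_size     (__lookup_data.size)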
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9a76e3beda54..3b48dab80164 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
@@ -30,6 +30,10 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ | |||
30 | config GENERIC_PENDING_IRQ | 30 | config GENERIC_PENDING_IRQ |
31 | bool | 31 | bool |
32 | 32 | ||
33 | # Support for generic irq migrating off cpu before the cpu is offline. | ||
34 | config GENERIC_IRQ_MIGRATION | ||
35 | bool | ||
36 | |||
33 | # Alpha specific irq affinity mechanism | 37 | # Alpha specific irq affinity mechanism |
34 | config AUTO_IRQ_AFFINITY | 38 | config AUTO_IRQ_AFFINITY |
35 | bool | 39 | bool |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index d12123526e2b..2fc9cbdf35b6 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
@@ -5,5 +5,6 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | |||
5 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o | 5 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o |
6 | obj-$(CONFIG_PROC_FS) += proc.o | 6 | obj-$(CONFIG_PROC_FS) += proc.o |
7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
8 | obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o | ||
8 | obj-$(CONFIG_PM_SLEEP) += pm.o | 9 | obj-$(CONFIG_PM_SLEEP) += pm.o |
9 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o | 10 | obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e28169dd1c36..15206453b12a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -21,6 +21,20 @@ | |||
21 | 21 | ||
22 | #include "internals.h" | 22 | #include "internals.h" |
23 | 23 | ||
24 | static irqreturn_t bad_chained_irq(int irq, void *dev_id) | ||
25 | { | ||
26 | WARN_ONCE(1, "Chained irq %d should not call an action\n", irq); | ||
27 | return IRQ_NONE; | ||
28 | } | ||
29 | |||
30 | /* | ||
31 | * Chained handlers should never call action on their IRQ. This default | ||
32 | * action will emit a warning if such a thing happens. | ||
33 | */ | ||
34 | struct irqaction chained_action = { | ||
35 | .handler = bad_chained_irq, | ||
36 | }; | ||
37 | |||
24 | /** | 38 | /** |
25 | * irq_set_chip - set the irq chip for an irq | 39 | * irq_set_chip - set the irq chip for an irq |
26 | * @irq: irq number | 40 | * @irq: irq number |
@@ -227,6 +241,13 @@ void irq_enable(struct irq_desc *desc) | |||
227 | * disabled. If an interrupt happens, then the interrupt flow | 241 | * disabled. If an interrupt happens, then the interrupt flow |
228 | * handler masks the line at the hardware level and marks it | 242 | * handler masks the line at the hardware level and marks it |
229 | * pending. | 243 | * pending. |
244 | * | ||
245 | * If the interrupt chip does not implement the irq_disable callback, | ||
246 | * a driver can disable the lazy approach for a particular irq line by | ||
247 | * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can | ||
248 | * be used for devices which cannot disable the interrupt at the | ||
249 | * device level under certain circumstances and have to use | ||
250 | * disable_irq[_nosync] instead. | ||
230 | */ | 251 | */ |
231 | void irq_disable(struct irq_desc *desc) | 252 | void irq_disable(struct irq_desc *desc) |
232 | { | 253 | { |
@@ -234,6 +255,8 @@ void irq_disable(struct irq_desc *desc) | |||
234 | if (desc->irq_data.chip->irq_disable) { | 255 | if (desc->irq_data.chip->irq_disable) { |
235 | desc->irq_data.chip->irq_disable(&desc->irq_data); | 256 | desc->irq_data.chip->irq_disable(&desc->irq_data); |
236 | irq_state_set_masked(desc); | 257 | irq_state_set_masked(desc); |
258 | } else if (irq_settings_disable_unlazy(desc)) { | ||
259 | mask_irq(desc); | ||
237 | } | 260 | } |
238 | } | 261 | } |
239 | 262 | ||
@@ -669,7 +692,7 @@ void handle_percpu_irq(struct irq_desc *desc) | |||
669 | if (chip->irq_ack) | 692 | if (chip->irq_ack) |
670 | chip->irq_ack(&desc->irq_data); | 693 | chip->irq_ack(&desc->irq_data); |
671 | 694 | ||
672 | handle_irq_event_percpu(desc, desc->action); | 695 | handle_irq_event_percpu(desc); |
673 | 696 | ||
674 | if (chip->irq_eoi) | 697 | if (chip->irq_eoi) |
675 | chip->irq_eoi(&desc->irq_data); | 698 | chip->irq_eoi(&desc->irq_data); |
@@ -746,6 +769,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | |||
746 | if (desc->irq_data.chip != &no_irq_chip) | 769 | if (desc->irq_data.chip != &no_irq_chip) |
747 | mask_ack_irq(desc); | 770 | mask_ack_irq(desc); |
748 | irq_state_set_disabled(desc); | 771 | irq_state_set_disabled(desc); |
772 | if (is_chained) | ||
773 | desc->action = NULL; | ||
749 | desc->depth = 1; | 774 | desc->depth = 1; |
750 | } | 775 | } |
751 | desc->handle_irq = handle; | 776 | desc->handle_irq = handle; |
@@ -755,6 +780,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, | |||
755 | irq_settings_set_noprobe(desc); | 780 | irq_settings_set_noprobe(desc); |
756 | irq_settings_set_norequest(desc); | 781 | irq_settings_set_norequest(desc); |
757 | irq_settings_set_nothread(desc); | 782 | irq_settings_set_nothread(desc); |
783 | desc->action = &chained_action; | ||
758 | irq_startup(desc, true); | 784 | irq_startup(desc, true); |
759 | } | 785 | } |
760 | } | 786 | } |
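As the comment added to irq_disable() above spells out, a driver can opt a line out of lazy disabling with irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY). A hedged sketch of a probe path doing that (the foo_* names and the platform device are placeholders, not from this series):

    static irqreturn_t foo_isr(int irq, void *dev_id)
    {
            /* ... handle the device interrupt ... */
            return IRQ_HANDLED;
    }

    static int foo_probe(struct platform_device *pdev)
    {
            int irq = platform_get_irq(pdev, 0);

            if (irq < 0)
                    return irq;
            /* Device cannot gate its own interrupt: mask at the chip on
             * disable_irq() instead of relying on lazy masking. */
            irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY);
            return devm_request_irq(&pdev->dev, irq, foo_isr, 0, "foo", pdev);
    }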
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c new file mode 100644 index 000000000000..011f8c4c63da --- /dev/null +++ b/kernel/irq/cpuhotplug.c | |||
@@ -0,0 +1,82 @@ | |||
1 | /* | ||
2 | * Generic cpu hotunplug interrupt migration code copied from the | ||
3 | * arch/arm implementation | ||
4 | * | ||
5 | * Copyright (C) Russell King | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License version 2 as | ||
9 | * published by the Free Software Foundation. | ||
10 | */ | ||
11 | #include <linux/interrupt.h> | ||
12 | #include <linux/ratelimit.h> | ||
13 | #include <linux/irq.h> | ||
14 | |||
15 | #include "internals.h" | ||
16 | |||
17 | static bool migrate_one_irq(struct irq_desc *desc) | ||
18 | { | ||
19 | struct irq_data *d = irq_desc_get_irq_data(desc); | ||
20 | const struct cpumask *affinity = d->common->affinity; | ||
21 | struct irq_chip *c; | ||
22 | bool ret = false; | ||
23 | |||
24 | /* | ||
25 | * If this is a per-CPU interrupt, or the affinity does not | ||
26 | * include this CPU, then we have nothing to do. | ||
27 | */ | ||
28 | if (irqd_is_per_cpu(d) || | ||
29 | !cpumask_test_cpu(smp_processor_id(), affinity)) | ||
30 | return false; | ||
31 | |||
32 | if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { | ||
33 | affinity = cpu_online_mask; | ||
34 | ret = true; | ||
35 | } | ||
36 | |||
37 | c = irq_data_get_irq_chip(d); | ||
38 | if (!c->irq_set_affinity) { | ||
39 | pr_debug("IRQ%u: unable to set affinity\n", d->irq); | ||
40 | } else { | ||
41 | int r = irq_do_set_affinity(d, affinity, false); | ||
42 | if (r) | ||
43 | pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", | ||
44 | d->irq, r); | ||
45 | } | ||
46 | |||
47 | return ret; | ||
48 | } | ||
49 | |||
50 | /** | ||
51 | * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu | ||
52 | * | ||
53 | * The current CPU has been marked offline. Migrate IRQs off this CPU. | ||
54 | * If the affinity settings do not allow other CPUs, force them onto any | ||
55 | * available CPU. | ||
56 | * | ||
57 | * Note: we must iterate over all IRQs, whether they have an attached | ||
58 | * action structure or not, as we need to get chained interrupts too. | ||
59 | */ | ||
60 | void irq_migrate_all_off_this_cpu(void) | ||
61 | { | ||
62 | unsigned int irq; | ||
63 | struct irq_desc *desc; | ||
64 | unsigned long flags; | ||
65 | |||
66 | local_irq_save(flags); | ||
67 | |||
68 | for_each_active_irq(irq) { | ||
69 | bool affinity_broken; | ||
70 | |||
71 | desc = irq_to_desc(irq); | ||
72 | raw_spin_lock(&desc->lock); | ||
73 | affinity_broken = migrate_one_irq(desc); | ||
74 | raw_spin_unlock(&desc->lock); | ||
75 | |||
76 | if (affinity_broken) | ||
77 | pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", | ||
78 | irq, smp_processor_id()); | ||
79 | } | ||
80 | |||
81 | local_irq_restore(flags); | ||
82 | } | ||
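irq_migrate_all_off_this_cpu() is intended to be called from an architecture's CPU-offline path, after the outgoing CPU has been cleared from cpu_online_mask so that forced affinity lands on a CPU that stays up. A rough sketch of such a caller, modeled on the arm code this file was copied from (arch-specific details elided, names assumed):

    int __cpu_disable(void)
    {
            unsigned int cpu = smp_processor_id();

            set_cpu_online(cpu, false);     /* no new IRQs may target us */
            /* ... arch-specific teardown elided ... */
            irq_migrate_all_off_this_cpu();
            return 0;
    }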
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index de41a68fc038..a302cf9a2126 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -22,7 +22,6 @@ | |||
22 | 22 | ||
23 | /** | 23 | /** |
24 | * handle_bad_irq - handle spurious and unhandled irqs | 24 | * handle_bad_irq - handle spurious and unhandled irqs |
25 | * @irq: the interrupt number | ||
26 | * @desc: description of the interrupt | 25 | * @desc: description of the interrupt |
27 | * | 26 | * |
28 | * Handles spurious and unhandled IRQs. It also prints a debug message. | 27 | * Handles spurious and unhandled IRQs. It also prints a debug message. |
@@ -35,6 +34,7 @@ void handle_bad_irq(struct irq_desc *desc) | |||
35 | kstat_incr_irqs_this_cpu(desc); | 34 | kstat_incr_irqs_this_cpu(desc); |
36 | ack_bad_irq(irq); | 35 | ack_bad_irq(irq); |
37 | } | 36 | } |
37 | EXPORT_SYMBOL_GPL(handle_bad_irq); | ||
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Special, empty irq handler: | 40 | * Special, empty irq handler: |
@@ -132,11 +132,11 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) | |||
132 | wake_up_process(action->thread); | 132 | wake_up_process(action->thread); |
133 | } | 133 | } |
134 | 134 | ||
135 | irqreturn_t | 135 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) |
136 | handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | ||
137 | { | 136 | { |
138 | irqreturn_t retval = IRQ_NONE; | 137 | irqreturn_t retval = IRQ_NONE; |
139 | unsigned int flags = 0, irq = desc->irq_data.irq; | 138 | unsigned int flags = 0, irq = desc->irq_data.irq; |
139 | struct irqaction *action = desc->action; | ||
140 | 140 | ||
141 | do { | 141 | do { |
142 | irqreturn_t res; | 142 | irqreturn_t res; |
@@ -184,14 +184,13 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | |||
184 | 184 | ||
185 | irqreturn_t handle_irq_event(struct irq_desc *desc) | 185 | irqreturn_t handle_irq_event(struct irq_desc *desc) |
186 | { | 186 | { |
187 | struct irqaction *action = desc->action; | ||
188 | irqreturn_t ret; | 187 | irqreturn_t ret; |
189 | 188 | ||
190 | desc->istate &= ~IRQS_PENDING; | 189 | desc->istate &= ~IRQS_PENDING; |
191 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 190 | irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
192 | raw_spin_unlock(&desc->lock); | 191 | raw_spin_unlock(&desc->lock); |
193 | 192 | ||
194 | ret = handle_irq_event_percpu(desc, action); | 193 | ret = handle_irq_event_percpu(desc); |
195 | 194 | ||
196 | raw_spin_lock(&desc->lock); | 195 | raw_spin_lock(&desc->lock); |
197 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); | 196 | irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5ef0c2dbe930..05c2188271b8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -18,6 +18,8 @@ | |||
18 | 18 | ||
19 | extern bool noirqdebug; | 19 | extern bool noirqdebug; |
20 | 20 | ||
21 | extern struct irqaction chained_action; | ||
22 | |||
21 | /* | 23 | /* |
22 | * Bits used by threaded handlers: | 24 | * Bits used by threaded handlers: |
23 | * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run | 25 | * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run |
@@ -81,7 +83,7 @@ extern void irq_mark_irq(unsigned int irq); | |||
81 | 83 | ||
82 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 84 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
83 | 85 | ||
84 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); | 86 | irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); |
85 | irqreturn_t handle_irq_event(struct irq_desc *desc); | 87 | irqreturn_t handle_irq_event(struct irq_desc *desc); |
86 | 88 | ||
87 | /* Resending of interrupts :*/ | 89 | /* Resending of interrupts :*/ |
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index dc9d27c0c158..22aa9612ef7c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -27,6 +27,57 @@ static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, | |||
27 | irq_hw_number_t hwirq, int node); | 27 | irq_hw_number_t hwirq, int node); |
28 | static void irq_domain_check_hierarchy(struct irq_domain *domain); | 28 | static void irq_domain_check_hierarchy(struct irq_domain *domain); |
29 | 29 | ||
30 | struct irqchip_fwid { | ||
31 | struct fwnode_handle fwnode; | ||
32 | char *name; | ||
33 | void *data; | ||
34 | }; | ||
35 | |||
36 | /** | ||
37 | * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for | ||
38 | * identifying an irq domain | ||
39 | * @data: optional user-provided data | ||
40 | * | ||
41 | * Allocate a struct irqchip_fwid, and return a pointer to the embedded | ||
42 | * fwnode_handle (or NULL on failure). | ||
43 | */ | ||
44 | struct fwnode_handle *irq_domain_alloc_fwnode(void *data) | ||
45 | { | ||
46 | struct irqchip_fwid *fwid; | ||
47 | char *name; | ||
48 | |||
49 | fwid = kzalloc(sizeof(*fwid), GFP_KERNEL); | ||
50 | name = kasprintf(GFP_KERNEL, "irqchip@%p", data); | ||
51 | |||
52 | if (!fwid || !name) { | ||
53 | kfree(fwid); | ||
54 | kfree(name); | ||
55 | return NULL; | ||
56 | } | ||
57 | |||
58 | fwid->name = name; | ||
59 | fwid->data = data; | ||
60 | fwid->fwnode.type = FWNODE_IRQCHIP; | ||
61 | return &fwid->fwnode; | ||
62 | } | ||
63 | |||
64 | /** | ||
65 | * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle | ||
66 | * | ||
67 | * Free a fwnode_handle allocated with irq_domain_alloc_fwnode. | ||
68 | */ | ||
69 | void irq_domain_free_fwnode(struct fwnode_handle *fwnode) | ||
70 | { | ||
71 | struct irqchip_fwid *fwid; | ||
72 | |||
73 | if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) | ||
74 | return; | ||
75 | |||
76 | fwid = container_of(fwnode, struct irqchip_fwid, fwnode); | ||
77 | kfree(fwid->name); | ||
78 | kfree(fwid); | ||
79 | } | ||
80 | |||
30 | /** | 81 | /** |
31 | * __irq_domain_add() - Allocate a new irq_domain data structure | 82 | * __irq_domain_add() - Allocate a new irq_domain data structure |
32 | * @of_node: optional device-tree node of the interrupt controller | 83 | * @of_node: optional device-tree node of the interrupt controller |
@@ -40,23 +91,28 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain); | |||
40 | * Allocates and initializes an irq_domain structure. | 91 | * Allocates and initializes an irq_domain structure. |
41 | * Returns pointer to IRQ domain, or NULL on failure. | 92 | * Returns pointer to IRQ domain, or NULL on failure. |
42 | */ | 93 | */ |
43 | struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, | 94 | struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, |
44 | irq_hw_number_t hwirq_max, int direct_max, | 95 | irq_hw_number_t hwirq_max, int direct_max, |
45 | const struct irq_domain_ops *ops, | 96 | const struct irq_domain_ops *ops, |
46 | void *host_data) | 97 | void *host_data) |
47 | { | 98 | { |
48 | struct irq_domain *domain; | 99 | struct irq_domain *domain; |
100 | struct device_node *of_node; | ||
101 | |||
102 | of_node = to_of_node(fwnode); | ||
49 | 103 | ||
50 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), | 104 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), |
51 | GFP_KERNEL, of_node_to_nid(of_node)); | 105 | GFP_KERNEL, of_node_to_nid(of_node)); |
52 | if (WARN_ON(!domain)) | 106 | if (WARN_ON(!domain)) |
53 | return NULL; | 107 | return NULL; |
54 | 108 | ||
109 | of_node_get(of_node); | ||
110 | |||
55 | /* Fill structure */ | 111 | /* Fill structure */ |
56 | INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); | 112 | INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); |
57 | domain->ops = ops; | 113 | domain->ops = ops; |
58 | domain->host_data = host_data; | 114 | domain->host_data = host_data; |
59 | domain->of_node = of_node_get(of_node); | 115 | domain->fwnode = fwnode; |
60 | domain->hwirq_max = hwirq_max; | 116 | domain->hwirq_max = hwirq_max; |
61 | domain->revmap_size = size; | 117 | domain->revmap_size = size; |
62 | domain->revmap_direct_max_irq = direct_max; | 118 | domain->revmap_direct_max_irq = direct_max; |
@@ -102,7 +158,7 @@ void irq_domain_remove(struct irq_domain *domain) | |||
102 | 158 | ||
103 | pr_debug("Removed domain %s\n", domain->name); | 159 | pr_debug("Removed domain %s\n", domain->name); |
104 | 160 | ||
105 | of_node_put(domain->of_node); | 161 | of_node_put(irq_domain_get_of_node(domain)); |
106 | kfree(domain); | 162 | kfree(domain); |
107 | } | 163 | } |
108 | EXPORT_SYMBOL_GPL(irq_domain_remove); | 164 | EXPORT_SYMBOL_GPL(irq_domain_remove); |
@@ -133,7 +189,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, | |||
133 | { | 189 | { |
134 | struct irq_domain *domain; | 190 | struct irq_domain *domain; |
135 | 191 | ||
136 | domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); | 192 | domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); |
137 | if (!domain) | 193 | if (!domain) |
138 | return NULL; | 194 | return NULL; |
139 | 195 | ||
@@ -177,7 +233,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
177 | { | 233 | { |
178 | struct irq_domain *domain; | 234 | struct irq_domain *domain; |
179 | 235 | ||
180 | domain = __irq_domain_add(of_node, first_hwirq + size, | 236 | domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size, |
181 | first_hwirq + size, 0, ops, host_data); | 237 | first_hwirq + size, 0, ops, host_data); |
182 | if (domain) | 238 | if (domain) |
183 | irq_domain_associate_many(domain, first_irq, first_hwirq, size); | 239 | irq_domain_associate_many(domain, first_irq, first_hwirq, size); |
@@ -187,12 +243,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, | |||
187 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); | 243 | EXPORT_SYMBOL_GPL(irq_domain_add_legacy); |
188 | 244 | ||
189 | /** | 245 | /** |
190 | * irq_find_matching_host() - Locates a domain for a given device node | 246 | * irq_find_matching_fwnode() - Locates a domain for a given fwnode |
191 | * @node: device-tree node of the interrupt controller | 247 | * @fwnode: FW descriptor of the interrupt controller |
192 | * @bus_token: domain-specific data | 248 | * @bus_token: domain-specific data |
193 | */ | 249 | */ |
194 | struct irq_domain *irq_find_matching_host(struct device_node *node, | 250 | struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, |
195 | enum irq_domain_bus_token bus_token) | 251 | enum irq_domain_bus_token bus_token) |
196 | { | 252 | { |
197 | struct irq_domain *h, *found = NULL; | 253 | struct irq_domain *h, *found = NULL; |
198 | int rc; | 254 | int rc; |
@@ -209,9 +265,9 @@ struct irq_domain *irq_find_matching_host(struct device_node *node, | |||
209 | mutex_lock(&irq_domain_mutex); | 265 | mutex_lock(&irq_domain_mutex); |
210 | list_for_each_entry(h, &irq_domain_list, link) { | 266 | list_for_each_entry(h, &irq_domain_list, link) { |
211 | if (h->ops->match) | 267 | if (h->ops->match) |
212 | rc = h->ops->match(h, node, bus_token); | 268 | rc = h->ops->match(h, to_of_node(fwnode), bus_token); |
213 | else | 269 | else |
214 | rc = ((h->of_node != NULL) && (h->of_node == node) && | 270 | rc = ((fwnode != NULL) && (h->fwnode == fwnode) && |
215 | ((bus_token == DOMAIN_BUS_ANY) || | 271 | ((bus_token == DOMAIN_BUS_ANY) || |
216 | (h->bus_token == bus_token))); | 272 | (h->bus_token == bus_token))); |
217 | 273 | ||
@@ -223,7 +279,7 @@ struct irq_domain *irq_find_matching_host(struct device_node *node, | |||
223 | mutex_unlock(&irq_domain_mutex); | 279 | mutex_unlock(&irq_domain_mutex); |
224 | return found; | 280 | return found; |
225 | } | 281 | } |
226 | EXPORT_SYMBOL_GPL(irq_find_matching_host); | 282 | EXPORT_SYMBOL_GPL(irq_find_matching_fwnode); |
227 | 283 | ||
228 | /** | 284 | /** |
229 | * irq_set_default_host() - Set a "default" irq domain | 285 | * irq_set_default_host() - Set a "default" irq domain |
@@ -336,10 +392,12 @@ EXPORT_SYMBOL_GPL(irq_domain_associate); | |||
336 | void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, | 392 | void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, |
337 | irq_hw_number_t hwirq_base, int count) | 393 | irq_hw_number_t hwirq_base, int count) |
338 | { | 394 | { |
395 | struct device_node *of_node; | ||
339 | int i; | 396 | int i; |
340 | 397 | ||
398 | of_node = irq_domain_get_of_node(domain); | ||
341 | pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, | 399 | pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, |
342 | of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); | 400 | of_node_full_name(of_node), irq_base, (int)hwirq_base, count); |
343 | 401 | ||
344 | for (i = 0; i < count; i++) { | 402 | for (i = 0; i < count; i++) { |
345 | irq_domain_associate(domain, irq_base + i, hwirq_base + i); | 403 | irq_domain_associate(domain, irq_base + i, hwirq_base + i); |
@@ -359,12 +417,14 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many); | |||
359 | */ | 417 | */ |
360 | unsigned int irq_create_direct_mapping(struct irq_domain *domain) | 418 | unsigned int irq_create_direct_mapping(struct irq_domain *domain) |
361 | { | 419 | { |
420 | struct device_node *of_node; | ||
362 | unsigned int virq; | 421 | unsigned int virq; |
363 | 422 | ||
364 | if (domain == NULL) | 423 | if (domain == NULL) |
365 | domain = irq_default_domain; | 424 | domain = irq_default_domain; |
366 | 425 | ||
367 | virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); | 426 | of_node = irq_domain_get_of_node(domain); |
427 | virq = irq_alloc_desc_from(1, of_node_to_nid(of_node)); | ||
368 | if (!virq) { | 428 | if (!virq) { |
369 | pr_debug("create_direct virq allocation failed\n"); | 429 | pr_debug("create_direct virq allocation failed\n"); |
370 | return 0; | 430 | return 0; |
@@ -399,6 +459,7 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); | |||
399 | unsigned int irq_create_mapping(struct irq_domain *domain, | 459 | unsigned int irq_create_mapping(struct irq_domain *domain, |
400 | irq_hw_number_t hwirq) | 460 | irq_hw_number_t hwirq) |
401 | { | 461 | { |
462 | struct device_node *of_node; | ||
402 | int virq; | 463 | int virq; |
403 | 464 | ||
404 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); | 465 | pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); |
@@ -412,6 +473,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
412 | } | 473 | } |
413 | pr_debug("-> using domain @%p\n", domain); | 474 | pr_debug("-> using domain @%p\n", domain); |
414 | 475 | ||
476 | of_node = irq_domain_get_of_node(domain); | ||
477 | |||
415 | /* Check if mapping already exists */ | 478 | /* Check if mapping already exists */ |
416 | virq = irq_find_mapping(domain, hwirq); | 479 | virq = irq_find_mapping(domain, hwirq); |
417 | if (virq) { | 480 | if (virq) { |
@@ -420,8 +483,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
420 | } | 483 | } |
421 | 484 | ||
422 | /* Allocate a virtual interrupt number */ | 485 | /* Allocate a virtual interrupt number */ |
423 | virq = irq_domain_alloc_descs(-1, 1, hwirq, | 486 | virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node)); |
424 | of_node_to_nid(domain->of_node)); | ||
425 | if (virq <= 0) { | 487 | if (virq <= 0) { |
426 | pr_debug("-> virq allocation failed\n"); | 488 | pr_debug("-> virq allocation failed\n"); |
427 | return 0; | 489 | return 0; |
@@ -433,7 +495,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, | |||
433 | } | 495 | } |
434 | 496 | ||
435 | pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", | 497 | pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", |
436 | hwirq, of_node_full_name(domain->of_node), virq); | 498 | hwirq, of_node_full_name(of_node), virq); |
437 | 499 | ||
438 | return virq; | 500 | return virq; |
439 | } | 501 | } |
@@ -460,10 +522,12 @@ EXPORT_SYMBOL_GPL(irq_create_mapping); | |||
460 | int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, | 522 | int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, |
461 | irq_hw_number_t hwirq_base, int count) | 523 | irq_hw_number_t hwirq_base, int count) |
462 | { | 524 | { |
525 | struct device_node *of_node; | ||
463 | int ret; | 526 | int ret; |
464 | 527 | ||
528 | of_node = irq_domain_get_of_node(domain); | ||
465 | ret = irq_alloc_descs(irq_base, irq_base, count, | 529 | ret = irq_alloc_descs(irq_base, irq_base, count, |
466 | of_node_to_nid(domain->of_node)); | 530 | of_node_to_nid(of_node)); |
467 | if (unlikely(ret < 0)) | 531 | if (unlikely(ret < 0)) |
468 | return ret; | 532 | return ret; |
469 | 533 | ||
@@ -472,28 +536,56 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, | |||
472 | } | 536 | } |
473 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); | 537 | EXPORT_SYMBOL_GPL(irq_create_strict_mappings); |
474 | 538 | ||
475 | unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | 539 | static int irq_domain_translate(struct irq_domain *d, |
540 | struct irq_fwspec *fwspec, | ||
541 | irq_hw_number_t *hwirq, unsigned int *type) | ||
542 | { | ||
543 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
544 | if (d->ops->translate) | ||
545 | return d->ops->translate(d, fwspec, hwirq, type); | ||
546 | #endif | ||
547 | if (d->ops->xlate) | ||
548 | return d->ops->xlate(d, to_of_node(fwspec->fwnode), | ||
549 | fwspec->param, fwspec->param_count, | ||
550 | hwirq, type); | ||
551 | |||
552 | /* If domain has no translation, then we assume interrupt line */ | ||
553 | *hwirq = fwspec->param[0]; | ||
554 | return 0; | ||
555 | } | ||
556 | |||
557 | static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, | ||
558 | struct irq_fwspec *fwspec) | ||
559 | { | ||
560 | int i; | ||
561 | |||
562 | fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; | ||
563 | fwspec->param_count = irq_data->args_count; | ||
564 | |||
565 | for (i = 0; i < irq_data->args_count; i++) | ||
566 | fwspec->param[i] = irq_data->args[i]; | ||
567 | } | ||
568 | |||
569 | unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) | ||
476 | { | 570 | { |
477 | struct irq_domain *domain; | 571 | struct irq_domain *domain; |
478 | irq_hw_number_t hwirq; | 572 | irq_hw_number_t hwirq; |
479 | unsigned int type = IRQ_TYPE_NONE; | 573 | unsigned int type = IRQ_TYPE_NONE; |
480 | int virq; | 574 | int virq; |
481 | 575 | ||
482 | domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; | 576 | if (fwspec->fwnode) |
577 | domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); | ||
578 | else | ||
579 | domain = irq_default_domain; | ||
580 | |||
483 | if (!domain) { | 581 | if (!domain) { |
484 | pr_warn("no irq domain found for %s !\n", | 582 | pr_warn("no irq domain found for %s !\n", |
485 | of_node_full_name(irq_data->np)); | 583 | of_node_full_name(to_of_node(fwspec->fwnode))); |
486 | return 0; | 584 | return 0; |
487 | } | 585 | } |
488 | 586 | ||
489 | /* If domain has no translation, then we assume interrupt line */ | 587 | if (irq_domain_translate(domain, fwspec, &hwirq, &type)) |
490 | if (domain->ops->xlate == NULL) | 588 | return 0; |
491 | hwirq = irq_data->args[0]; | ||
492 | else { | ||
493 | if (domain->ops->xlate(domain, irq_data->np, irq_data->args, | ||
494 | irq_data->args_count, &hwirq, &type)) | ||
495 | return 0; | ||
496 | } | ||
497 | 589 | ||
498 | if (irq_domain_is_hierarchy(domain)) { | 590 | if (irq_domain_is_hierarchy(domain)) { |
499 | /* | 591 | /* |
@@ -504,7 +596,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | |||
504 | if (virq) | 596 | if (virq) |
505 | return virq; | 597 | return virq; |
506 | 598 | ||
507 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); | 599 | virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec); |
508 | if (virq <= 0) | 600 | if (virq <= 0) |
509 | return 0; | 601 | return 0; |
510 | } else { | 602 | } else { |
@@ -520,6 +612,15 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | |||
520 | irq_set_irq_type(virq, type); | 612 | irq_set_irq_type(virq, type); |
521 | return virq; | 613 | return virq; |
522 | } | 614 | } |
615 | EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping); | ||
616 | |||
617 | unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) | ||
618 | { | ||
619 | struct irq_fwspec fwspec; | ||
620 | |||
621 | of_phandle_args_to_fwspec(irq_data, &fwspec); | ||
622 | return irq_create_fwspec_mapping(&fwspec); | ||
623 | } | ||
523 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); | 624 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); |
524 | 625 | ||
525 | /** | 626 | /** |
@@ -590,14 +691,16 @@ static int virq_debug_show(struct seq_file *m, void *private) | |||
590 | "name", "mapped", "linear-max", "direct-max", "devtree-node"); | 691 | "name", "mapped", "linear-max", "direct-max", "devtree-node"); |
591 | mutex_lock(&irq_domain_mutex); | 692 | mutex_lock(&irq_domain_mutex); |
592 | list_for_each_entry(domain, &irq_domain_list, link) { | 693 | list_for_each_entry(domain, &irq_domain_list, link) { |
694 | struct device_node *of_node; | ||
593 | int count = 0; | 695 | int count = 0; |
696 | of_node = irq_domain_get_of_node(domain); | ||
594 | radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) | 697 | radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) |
595 | count++; | 698 | count++; |
596 | seq_printf(m, "%c%-16s %6u %10u %10u %s\n", | 699 | seq_printf(m, "%c%-16s %6u %10u %10u %s\n", |
597 | domain == irq_default_domain ? '*' : ' ', domain->name, | 700 | domain == irq_default_domain ? '*' : ' ', domain->name, |
598 | domain->revmap_size + count, domain->revmap_size, | 701 | domain->revmap_size + count, domain->revmap_size, |
599 | domain->revmap_direct_max_irq, | 702 | domain->revmap_direct_max_irq, |
600 | domain->of_node ? of_node_full_name(domain->of_node) : ""); | 703 | of_node ? of_node_full_name(of_node) : ""); |
601 | } | 704 | } |
602 | mutex_unlock(&irq_domain_mutex); | 705 | mutex_unlock(&irq_domain_mutex); |
603 | 706 | ||
@@ -751,11 +854,11 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, | |||
751 | 854 | ||
752 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | 855 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY |
753 | /** | 856 | /** |
754 | * irq_domain_add_hierarchy - Add an irqdomain into the hierarchy | 857 | * irq_domain_create_hierarchy - Add an irqdomain into the hierarchy |
755 | * @parent: Parent irq domain to associate with the new domain | 858 | * @parent: Parent irq domain to associate with the new domain |
756 | * @flags: Irq domain flags associated to the domain | 859 | * @flags: Irq domain flags associated to the domain |
757 | * @size: Size of the domain. See below | 860 | * @size: Size of the domain. See below |
758 | * @node: Optional device-tree node of the interrupt controller | 861 | * @fwnode: Optional fwnode of the interrupt controller |
759 | * @ops: Pointer to the interrupt domain callbacks | 862 | * @ops: Pointer to the interrupt domain callbacks |
760 | * @host_data: Controller private data pointer | 863 | * @host_data: Controller private data pointer |
761 | * | 864 | * |
@@ -765,19 +868,19 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt, | |||
765 | * domain flags are set. | 868 | * domain flags are set. |
766 | * Returns pointer to IRQ domain, or NULL on failure. | 869 | * Returns pointer to IRQ domain, or NULL on failure. |
767 | */ | 870 | */ |
768 | struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, | 871 | struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, |
769 | unsigned int flags, | 872 | unsigned int flags, |
770 | unsigned int size, | 873 | unsigned int size, |
771 | struct device_node *node, | 874 | struct fwnode_handle *fwnode, |
772 | const struct irq_domain_ops *ops, | 875 | const struct irq_domain_ops *ops, |
773 | void *host_data) | 876 | void *host_data) |
774 | { | 877 | { |
775 | struct irq_domain *domain; | 878 | struct irq_domain *domain; |
776 | 879 | ||
777 | if (size) | 880 | if (size) |
778 | domain = irq_domain_add_linear(node, size, ops, host_data); | 881 | domain = irq_domain_create_linear(fwnode, size, ops, host_data); |
779 | else | 882 | else |
780 | domain = irq_domain_add_tree(node, ops, host_data); | 883 | domain = irq_domain_create_tree(fwnode, ops, host_data); |
781 | if (domain) { | 884 | if (domain) { |
782 | domain->parent = parent; | 885 | domain->parent = parent; |
783 | domain->flags |= flags; | 886 | domain->flags |= flags; |
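For irqchips that have no device-tree node (ACPI-described or purely synthetic ones), the new fwnode helpers give the domain an identity that irq_find_matching_fwnode() can later look up. A hedged sketch of a domain setup built on them (foo_create_domain and its arguments are placeholders):

    static struct irq_domain *foo_create_domain(unsigned int nr_irqs,
                                                const struct irq_domain_ops *ops,
                                                void *host_data)
    {
            struct fwnode_handle *fwnode;
            struct irq_domain *domain;

            /* The data pointer is only used to derive a unique name. */
            fwnode = irq_domain_alloc_fwnode(host_data);
            if (!fwnode)
                    return NULL;

            domain = irq_domain_create_linear(fwnode, nr_irqs, ops, host_data);
            if (!domain)
                    irq_domain_free_fwnode(fwnode);
            return domain;
    }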
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9a59f6cabd2..0eebaeef317b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -258,37 +258,6 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) | |||
258 | } | 258 | } |
259 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | 259 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); |
260 | 260 | ||
261 | /** | ||
262 | * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt | ||
263 | * @irq: interrupt number to set affinity | ||
264 | * @vcpu_info: vCPU specific data | ||
265 | * | ||
266 | * This function uses the vCPU specific data to set the vCPU | ||
267 | * affinity for an irq. The vCPU specific data is passed from | ||
268 | * outside, such as KVM. One example code path is as below: | ||
269 | * KVM -> IOMMU -> irq_set_vcpu_affinity(). | ||
270 | */ | ||
271 | int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) | ||
272 | { | ||
273 | unsigned long flags; | ||
274 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); | ||
275 | struct irq_data *data; | ||
276 | struct irq_chip *chip; | ||
277 | int ret = -ENOSYS; | ||
278 | |||
279 | if (!desc) | ||
280 | return -EINVAL; | ||
281 | |||
282 | data = irq_desc_get_irq_data(desc); | ||
283 | chip = irq_data_get_irq_chip(data); | ||
284 | if (chip && chip->irq_set_vcpu_affinity) | ||
285 | ret = chip->irq_set_vcpu_affinity(data, vcpu_info); | ||
286 | irq_put_desc_unlock(desc, flags); | ||
287 | |||
288 | return ret; | ||
289 | } | ||
290 | EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); | ||
291 | |||
292 | static void irq_affinity_notify(struct work_struct *work) | 261 | static void irq_affinity_notify(struct work_struct *work) |
293 | { | 262 | { |
294 | struct irq_affinity_notify *notify = | 263 | struct irq_affinity_notify *notify = |
@@ -424,6 +393,37 @@ setup_affinity(struct irq_desc *desc, struct cpumask *mask) | |||
424 | } | 393 | } |
425 | #endif | 394 | #endif |
426 | 395 | ||
396 | /** | ||
397 | * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt | ||
398 | * @irq: interrupt number to set affinity | ||
399 | * @vcpu_info: vCPU specific data | ||
400 | * | ||
401 | * This function uses the vCPU specific data to set the vCPU | ||
402 | * affinity for an irq. The vCPU specific data is passed from | ||
403 | * outside, such as KVM. One example code path is as below: | ||
404 | * KVM -> IOMMU -> irq_set_vcpu_affinity(). | ||
405 | */ | ||
406 | int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info) | ||
407 | { | ||
408 | unsigned long flags; | ||
409 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); | ||
410 | struct irq_data *data; | ||
411 | struct irq_chip *chip; | ||
412 | int ret = -ENOSYS; | ||
413 | |||
414 | if (!desc) | ||
415 | return -EINVAL; | ||
416 | |||
417 | data = irq_desc_get_irq_data(desc); | ||
418 | chip = irq_data_get_irq_chip(data); | ||
419 | if (chip && chip->irq_set_vcpu_affinity) | ||
420 | ret = chip->irq_set_vcpu_affinity(data, vcpu_info); | ||
421 | irq_put_desc_unlock(desc, flags); | ||
422 | |||
423 | return ret; | ||
424 | } | ||
425 | EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity); | ||
426 | |||
427 | void __disable_irq(struct irq_desc *desc) | 427 | void __disable_irq(struct irq_desc *desc) |
428 | { | 428 | { |
429 | if (!desc->depth++) | 429 | if (!desc->depth++) |
@@ -730,6 +730,12 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) | |||
730 | return IRQ_NONE; | 730 | return IRQ_NONE; |
731 | } | 731 | } |
732 | 732 | ||
733 | static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id) | ||
734 | { | ||
735 | WARN(1, "Secondary action handler called for irq %d\n", irq); | ||
736 | return IRQ_NONE; | ||
737 | } | ||
738 | |||
733 | static int irq_wait_for_interrupt(struct irqaction *action) | 739 | static int irq_wait_for_interrupt(struct irqaction *action) |
734 | { | 740 | { |
735 | set_current_state(TASK_INTERRUPTIBLE); | 741 | set_current_state(TASK_INTERRUPTIBLE); |
@@ -756,7 +762,8 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
756 | static void irq_finalize_oneshot(struct irq_desc *desc, | 762 | static void irq_finalize_oneshot(struct irq_desc *desc, |
757 | struct irqaction *action) | 763 | struct irqaction *action) |
758 | { | 764 | { |
759 | if (!(desc->istate & IRQS_ONESHOT)) | 765 | if (!(desc->istate & IRQS_ONESHOT) || |
766 | action->handler == irq_forced_secondary_handler) | ||
760 | return; | 767 | return; |
761 | again: | 768 | again: |
762 | chip_bus_lock(desc); | 769 | chip_bus_lock(desc); |
@@ -910,6 +917,18 @@ static void irq_thread_dtor(struct callback_head *unused) | |||
910 | irq_finalize_oneshot(desc, action); | 917 | irq_finalize_oneshot(desc, action); |
911 | } | 918 | } |
912 | 919 | ||
920 | static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action) | ||
921 | { | ||
922 | struct irqaction *secondary = action->secondary; | ||
923 | |||
924 | if (WARN_ON_ONCE(!secondary)) | ||
925 | return; | ||
926 | |||
927 | raw_spin_lock_irq(&desc->lock); | ||
928 | __irq_wake_thread(desc, secondary); | ||
929 | raw_spin_unlock_irq(&desc->lock); | ||
930 | } | ||
931 | |||
913 | /* | 932 | /* |
914 | * Interrupt handler thread | 933 | * Interrupt handler thread |
915 | */ | 934 | */ |
@@ -940,6 +959,8 @@ static int irq_thread(void *data) | |||
940 | action_ret = handler_fn(desc, action); | 959 | action_ret = handler_fn(desc, action); |
941 | if (action_ret == IRQ_HANDLED) | 960 | if (action_ret == IRQ_HANDLED) |
942 | atomic_inc(&desc->threads_handled); | 961 | atomic_inc(&desc->threads_handled); |
962 | if (action_ret == IRQ_WAKE_THREAD) | ||
963 | irq_wake_secondary(desc, action); | ||
943 | 964 | ||
944 | wake_threads_waitq(desc); | 965 | wake_threads_waitq(desc); |
945 | } | 966 | } |
@@ -984,20 +1005,36 @@ void irq_wake_thread(unsigned int irq, void *dev_id) | |||
984 | } | 1005 | } |
985 | EXPORT_SYMBOL_GPL(irq_wake_thread); | 1006 | EXPORT_SYMBOL_GPL(irq_wake_thread); |
986 | 1007 | ||
987 | static void irq_setup_forced_threading(struct irqaction *new) | 1008 | static int irq_setup_forced_threading(struct irqaction *new) |
988 | { | 1009 | { |
989 | if (!force_irqthreads) | 1010 | if (!force_irqthreads) |
990 | return; | 1011 | return 0; |
991 | if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) | 1012 | if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) |
992 | return; | 1013 | return 0; |
993 | 1014 | ||
994 | new->flags |= IRQF_ONESHOT; | 1015 | new->flags |= IRQF_ONESHOT; |
995 | 1016 | ||
996 | if (!new->thread_fn) { | 1017 | /* |
997 | set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); | 1018 | * Handle the case where we have a real primary handler and a |
998 | new->thread_fn = new->handler; | 1019 | * thread handler. We force thread them as well by creating a |
999 | new->handler = irq_default_primary_handler; | 1020 | * secondary action. |
1021 | */ | ||
1022 | if (new->handler != irq_default_primary_handler && new->thread_fn) { | ||
1023 | /* Allocate the secondary action */ | ||
1024 | new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL); | ||
1025 | if (!new->secondary) | ||
1026 | return -ENOMEM; | ||
1027 | new->secondary->handler = irq_forced_secondary_handler; | ||
1028 | new->secondary->thread_fn = new->thread_fn; | ||
1029 | new->secondary->dev_id = new->dev_id; | ||
1030 | new->secondary->irq = new->irq; | ||
1031 | new->secondary->name = new->name; | ||
1000 | } | 1032 | } |
1033 | /* Deal with the primary handler */ | ||
1034 | set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); | ||
1035 | new->thread_fn = new->handler; | ||
1036 | new->handler = irq_default_primary_handler; | ||
1037 | return 0; | ||
1001 | } | 1038 | } |
1002 | 1039 | ||
1003 | static int irq_request_resources(struct irq_desc *desc) | 1040 | static int irq_request_resources(struct irq_desc *desc) |
@@ -1017,6 +1054,48 @@ static void irq_release_resources(struct irq_desc *desc) | |||
1017 | c->irq_release_resources(d); | 1054 | c->irq_release_resources(d); |
1018 | } | 1055 | } |
1019 | 1056 | ||
1057 | static int | ||
1058 | setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) | ||
1059 | { | ||
1060 | struct task_struct *t; | ||
1061 | struct sched_param param = { | ||
1062 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
1063 | }; | ||
1064 | |||
1065 | if (!secondary) { | ||
1066 | t = kthread_create(irq_thread, new, "irq/%d-%s", irq, | ||
1067 | new->name); | ||
1068 | } else { | ||
1069 | t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq, | ||
1070 | new->name); | ||
1071 | param.sched_priority -= 1; | ||
1072 | } | ||
1073 | |||
1074 | if (IS_ERR(t)) | ||
1075 | return PTR_ERR(t); | ||
1076 | |||
1077 | sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); | ||
1078 | |||
1079 | /* | ||
1080 | * We keep the reference to the task struct even if | ||
1081 | * the thread dies to avoid that the interrupt code | ||
1082 | * references an already freed task_struct. | ||
1083 | */ | ||
1084 | get_task_struct(t); | ||
1085 | new->thread = t; | ||
1086 | /* | ||
1087 | * Tell the thread to set its affinity. This is | ||
1088 | * important for shared interrupt handlers as we do | ||
1089 | * not invoke setup_affinity() for the secondary | ||
1090 | * handlers as everything is already set up. Even for | ||
1091 | * interrupts marked with IRQF_NO_BALANCE this is | ||
1092 | * correct as we want the thread to move to the cpu(s) | ||
1093 | * on which the requesting code placed the interrupt. | ||
1094 | */ | ||
1095 | set_bit(IRQTF_AFFINITY, &new->thread_flags); | ||
1096 | return 0; | ||
1097 | } | ||
1098 | |||
1020 | /* | 1099 | /* |
1021 | * Internal function to register an irqaction - typically used to | 1100 | * Internal function to register an irqaction - typically used to |
1022 | * allocate special interrupts that are part of the architecture. | 1101 | * allocate special interrupts that are part of the architecture. |
@@ -1037,6 +1116,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1037 | if (!try_module_get(desc->owner)) | 1116 | if (!try_module_get(desc->owner)) |
1038 | return -ENODEV; | 1117 | return -ENODEV; |
1039 | 1118 | ||
1119 | new->irq = irq; | ||
1120 | |||
1040 | /* | 1121 | /* |
1041 | * Check whether the interrupt nests into another interrupt | 1122 | * Check whether the interrupt nests into another interrupt |
1042 | * thread. | 1123 | * thread. |
@@ -1054,8 +1135,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1054 | */ | 1135 | */ |
1055 | new->handler = irq_nested_primary_handler; | 1136 | new->handler = irq_nested_primary_handler; |
1056 | } else { | 1137 | } else { |
1057 | if (irq_settings_can_thread(desc)) | 1138 | if (irq_settings_can_thread(desc)) { |
1058 | irq_setup_forced_threading(new); | 1139 | ret = irq_setup_forced_threading(new); |
1140 | if (ret) | ||
1141 | goto out_mput; | ||
1142 | } | ||
1059 | } | 1143 | } |
1060 | 1144 | ||
1061 | /* | 1145 | /* |
@@ -1064,37 +1148,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1064 | * thread. | 1148 | * thread. |
1065 | */ | 1149 | */ |
1066 | if (new->thread_fn && !nested) { | 1150 | if (new->thread_fn && !nested) { |
1067 | struct task_struct *t; | 1151 | ret = setup_irq_thread(new, irq, false); |
1068 | static const struct sched_param param = { | 1152 | if (ret) |
1069 | .sched_priority = MAX_USER_RT_PRIO/2, | ||
1070 | }; | ||
1071 | |||
1072 | t = kthread_create(irq_thread, new, "irq/%d-%s", irq, | ||
1073 | new->name); | ||
1074 | if (IS_ERR(t)) { | ||
1075 | ret = PTR_ERR(t); | ||
1076 | goto out_mput; | 1153 | goto out_mput; |
1154 | if (new->secondary) { | ||
1155 | ret = setup_irq_thread(new->secondary, irq, true); | ||
1156 | if (ret) | ||
1157 | goto out_thread; | ||
1077 | } | 1158 | } |
1078 | |||
1079 | sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m); | ||
1080 | |||
1081 | /* | ||
1082 | * We keep the reference to the task struct even if | ||
1083 | * the thread dies to avoid that the interrupt code | ||
1084 | * references an already freed task_struct. | ||
1085 | */ | ||
1086 | get_task_struct(t); | ||
1087 | new->thread = t; | ||
1088 | /* | ||
1089 | * Tell the thread to set its affinity. This is | ||
1090 | * important for shared interrupt handlers as we do | ||
1091 | * not invoke setup_affinity() for the secondary | ||
1092 | * handlers as everything is already set up. Even for | ||
1093 | * interrupts marked with IRQF_NO_BALANCE this is | ||
1094 | * correct as we want the thread to move to the cpu(s) | ||
1095 | * on which the requesting code placed the interrupt. | ||
1096 | */ | ||
1097 | set_bit(IRQTF_AFFINITY, &new->thread_flags); | ||
1098 | } | 1159 | } |
1099 | 1160 | ||
1100 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { | 1161 | if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { |
@@ -1267,7 +1328,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1267 | irq, nmsk, omsk); | 1328 | irq, nmsk, omsk); |
1268 | } | 1329 | } |
1269 | 1330 | ||
1270 | new->irq = irq; | ||
1271 | *old_ptr = new; | 1331 | *old_ptr = new; |
1272 | 1332 | ||
1273 | irq_pm_install_action(desc, new); | 1333 | irq_pm_install_action(desc, new); |
@@ -1293,6 +1353,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1293 | */ | 1353 | */ |
1294 | if (new->thread) | 1354 | if (new->thread) |
1295 | wake_up_process(new->thread); | 1355 | wake_up_process(new->thread); |
1356 | if (new->secondary) | ||
1357 | wake_up_process(new->secondary->thread); | ||
1296 | 1358 | ||
1297 | register_irq_proc(irq, desc); | 1359 | register_irq_proc(irq, desc); |
1298 | new->dir = NULL; | 1360 | new->dir = NULL; |
@@ -1323,6 +1385,13 @@ out_thread: | |||
1323 | kthread_stop(t); | 1385 | kthread_stop(t); |
1324 | put_task_struct(t); | 1386 | put_task_struct(t); |
1325 | } | 1387 | } |
1388 | if (new->secondary && new->secondary->thread) { | ||
1389 | struct task_struct *t = new->secondary->thread; | ||
1390 | |||
1391 | new->secondary->thread = NULL; | ||
1392 | kthread_stop(t); | ||
1393 | put_task_struct(t); | ||
1394 | } | ||
1326 | out_mput: | 1395 | out_mput: |
1327 | module_put(desc->owner); | 1396 | module_put(desc->owner); |
1328 | return ret; | 1397 | return ret; |
@@ -1394,6 +1463,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1394 | 1463 | ||
1395 | /* If this was the last handler, shut down the IRQ line: */ | 1464 | /* If this was the last handler, shut down the IRQ line: */ |
1396 | if (!desc->action) { | 1465 | if (!desc->action) { |
1466 | irq_settings_clr_disable_unlazy(desc); | ||
1397 | irq_shutdown(desc); | 1467 | irq_shutdown(desc); |
1398 | irq_release_resources(desc); | 1468 | irq_release_resources(desc); |
1399 | } | 1469 | } |
@@ -1430,9 +1500,14 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1430 | if (action->thread) { | 1500 | if (action->thread) { |
1431 | kthread_stop(action->thread); | 1501 | kthread_stop(action->thread); |
1432 | put_task_struct(action->thread); | 1502 | put_task_struct(action->thread); |
1503 | if (action->secondary && action->secondary->thread) { | ||
1504 | kthread_stop(action->secondary->thread); | ||
1505 | put_task_struct(action->secondary->thread); | ||
1506 | } | ||
1433 | } | 1507 | } |
1434 | 1508 | ||
1435 | module_put(desc->owner); | 1509 | module_put(desc->owner); |
1510 | kfree(action->secondary); | ||
1436 | return action; | 1511 | return action; |
1437 | } | 1512 | } |
1438 | 1513 | ||
@@ -1576,8 +1651,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1576 | retval = __setup_irq(irq, desc, action); | 1651 | retval = __setup_irq(irq, desc, action); |
1577 | chip_bus_sync_unlock(desc); | 1652 | chip_bus_sync_unlock(desc); |
1578 | 1653 | ||
1579 | if (retval) | 1654 | if (retval) { |
1655 | kfree(action->secondary); | ||
1580 | kfree(action); | 1656 | kfree(action); |
1657 | } | ||
1581 | 1658 | ||
1582 | #ifdef CONFIG_DEBUG_SHIRQ_FIXME | 1659 | #ifdef CONFIG_DEBUG_SHIRQ_FIXME |
1583 | if (!retval && (irqflags & IRQF_SHARED)) { | 1660 | if (!retval && (irqflags & IRQF_SHARED)) { |
@@ -1761,6 +1838,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) | |||
1761 | kfree(__free_percpu_irq(irq, dev_id)); | 1838 | kfree(__free_percpu_irq(irq, dev_id)); |
1762 | chip_bus_sync_unlock(desc); | 1839 | chip_bus_sync_unlock(desc); |
1763 | } | 1840 | } |
1841 | EXPORT_SYMBOL_GPL(free_percpu_irq); | ||
1764 | 1842 | ||
1765 | /** | 1843 | /** |
1766 | * setup_percpu_irq - setup a per-cpu interrupt | 1844 | * setup_percpu_irq - setup a per-cpu interrupt |
@@ -1790,9 +1868,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act) | |||
1790 | * @devname: An ascii name for the claiming device | 1868 | * @devname: An ascii name for the claiming device |
1791 | * @dev_id: A percpu cookie passed back to the handler function | 1869 | * @dev_id: A percpu cookie passed back to the handler function |
1792 | * | 1870 | * |
1793 | * This call allocates interrupt resources, but doesn't | 1871 | * This call allocates interrupt resources and enables the |
1794 | * automatically enable the interrupt. It has to be done on each | 1872 | * interrupt on the local CPU. If the interrupt is supposed to be |
1795 | * CPU using enable_percpu_irq(). | 1873 | * enabled on other CPUs, it has to be done on each CPU using |
1874 | * enable_percpu_irq(). | ||
1796 | * | 1875 | * |
1797 | * Dev_id must be globally unique. It is a per-cpu variable, and | 1876 | * Dev_id must be globally unique. It is a per-cpu variable, and |
1798 | * the handler gets called with the interrupted CPU's instance of | 1877 | * the handler gets called with the interrupted CPU's instance of |
@@ -1831,6 +1910,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
1831 | 1910 | ||
1832 | return retval; | 1911 | return retval; |
1833 | } | 1912 | } |
1913 | EXPORT_SYMBOL_GPL(request_percpu_irq); | ||
1834 | 1914 | ||
1835 | /** | 1915 | /** |
1836 | * irq_get_irqchip_state - returns the irqchip state of an interrupt. | 1916 | * irq_get_irqchip_state - returns the irqchip state of an interrupt. |
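With request_percpu_irq() and free_percpu_irq() now exported to modules, a modular per-CPU device driver (a timer or PMU, say) can manage its PPI directly. A minimal hedged sketch; struct foo_state and foo_percpu_isr are invented, and enabling the interrupt on the other CPUs is assumed to happen from the driver's hotplug callbacks:

    struct foo_state {
            unsigned long   count;
    };

    static struct foo_state __percpu *foo_state;

    static irqreturn_t foo_percpu_isr(int irq, void *dev_id)
    {
            struct foo_state *s = dev_id;   /* this CPU's instance */

            s->count++;
            return IRQ_HANDLED;
    }

    static int foo_init(unsigned int irq)
    {
            int err;

            foo_state = alloc_percpu(struct foo_state);
            if (!foo_state)
                    return -ENOMEM;

            err = request_percpu_irq(irq, foo_percpu_isr, "foo", foo_state);
            if (err) {
                    free_percpu(foo_state);
                    return err;
            }
            /* Enabled on the requesting CPU already; the other CPUs still
             * need enable_percpu_irq() from their bring-up path. */
            return 0;
    }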
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7e6512b9dc1f..6b0c0b74a2a1 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
@@ -228,22 +228,18 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) | |||
228 | { | 228 | { |
229 | struct irq_chip *chip = info->chip; | 229 | struct irq_chip *chip = info->chip; |
230 | 230 | ||
231 | BUG_ON(!chip); | 231 | BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask); |
232 | if (!chip->irq_mask) | ||
233 | chip->irq_mask = pci_msi_mask_irq; | ||
234 | if (!chip->irq_unmask) | ||
235 | chip->irq_unmask = pci_msi_unmask_irq; | ||
236 | if (!chip->irq_set_affinity) | 232 | if (!chip->irq_set_affinity) |
237 | chip->irq_set_affinity = msi_domain_set_affinity; | 233 | chip->irq_set_affinity = msi_domain_set_affinity; |
238 | } | 234 | } |
239 | 235 | ||
240 | /** | 236 | /** |
241 | * msi_create_irq_domain - Create a MSI interrupt domain | 237 | * msi_create_irq_domain - Create a MSI interrupt domain |
242 | * @of_node: Optional device-tree node of the interrupt controller | 238 | * @fwnode: Optional fwnode of the interrupt controller |
243 | * @info: MSI domain info | 239 | * @info: MSI domain info |
244 | * @parent: Parent irq domain | 240 | * @parent: Parent irq domain |
245 | */ | 241 | */ |
246 | struct irq_domain *msi_create_irq_domain(struct device_node *node, | 242 | struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, |
247 | struct msi_domain_info *info, | 243 | struct msi_domain_info *info, |
248 | struct irq_domain *parent) | 244 | struct irq_domain *parent) |
249 | { | 245 | { |
@@ -252,8 +248,8 @@ struct irq_domain *msi_create_irq_domain(struct device_node *node, | |||
252 | if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) | 248 | if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) |
253 | msi_domain_update_chip_ops(info); | 249 | msi_domain_update_chip_ops(info); |
254 | 250 | ||
255 | return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, | 251 | return irq_domain_create_hierarchy(parent, 0, 0, fwnode, |
256 | info); | 252 | &msi_domain_ops, info); |
257 | } | 253 | } |
258 | 254 | ||
259 | /** | 255 | /** |
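Two things change for users of msi_create_irq_domain(): it now takes a fwnode_handle, and because the PCI-specific defaults were removed, the irq_chip in msi_domain_info must carry its own irq_mask/irq_unmask. A hedged PCI-MSI-flavoured sketch (foo_* names and the surrounding init code are placeholders):

    static struct irq_chip foo_msi_chip = {
            .name           = "foo-MSI",
            .irq_mask       = pci_msi_mask_irq,     /* no longer filled in by the core */
            .irq_unmask     = pci_msi_unmask_irq,
    };

    static struct msi_domain_info foo_msi_info = {
            .flags  = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS,
            .chip   = &foo_msi_chip,
    };

    /* fwnode typically comes from of_node_to_fwnode(node) or
     * irq_domain_alloc_fwnode(); parent is the underlying irq domain. */
    msi_domain = msi_create_irq_domain(fwnode, &foo_msi_info, parent);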
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 21c62617a35a..e80c4400118a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc) | |||
21 | desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; | 21 | desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; |
22 | desc->depth++; | 22 | desc->depth++; |
23 | irq_disable(desc); | 23 | irq_disable(desc); |
24 | pm_system_wakeup(); | 24 | pm_system_irq_wakeup(irq_desc_get_irq(desc)); |
25 | return true; | 25 | return true; |
26 | } | 26 | } |
27 | return false; | 27 | return false; |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e3a8c9577ba6..a916cf144b65 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/seq_file.h> | 12 | #include <linux/seq_file.h> |
13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
15 | #include <linux/mutex.h> | ||
15 | 16 | ||
16 | #include "internals.h" | 17 | #include "internals.h" |
17 | 18 | ||
@@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
323 | 324 | ||
324 | void register_irq_proc(unsigned int irq, struct irq_desc *desc) | 325 | void register_irq_proc(unsigned int irq, struct irq_desc *desc) |
325 | { | 326 | { |
327 | static DEFINE_MUTEX(register_lock); | ||
326 | char name [MAX_NAMELEN]; | 328 | char name [MAX_NAMELEN]; |
327 | 329 | ||
328 | if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) | 330 | if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) |
329 | return; | 331 | return; |
330 | 332 | ||
333 | /* | ||
334 | * irq directories are registered only when a handler is | ||
335 | * added, not when the descriptor is created, so multiple | ||
336 | * tasks might try to register at the same time. | ||
337 | */ | ||
338 | mutex_lock(®ister_lock); | ||
339 | |||
340 | if (desc->dir) | ||
341 | goto out_unlock; | ||
342 | |||
331 | memset(name, 0, MAX_NAMELEN); | 343 | memset(name, 0, MAX_NAMELEN); |
332 | sprintf(name, "%d", irq); | 344 | sprintf(name, "%d", irq); |
333 | 345 | ||
334 | /* create /proc/irq/1234 */ | 346 | /* create /proc/irq/1234 */ |
335 | desc->dir = proc_mkdir(name, root_irq_dir); | 347 | desc->dir = proc_mkdir(name, root_irq_dir); |
336 | if (!desc->dir) | 348 | if (!desc->dir) |
337 | return; | 349 | goto out_unlock; |
338 | 350 | ||
339 | #ifdef CONFIG_SMP | 351 | #ifdef CONFIG_SMP |
340 | /* create /proc/irq/<irq>/smp_affinity */ | 352 | /* create /proc/irq/<irq>/smp_affinity */ |
@@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
355 | 367 | ||
356 | proc_create_data("spurious", 0444, desc->dir, | 368 | proc_create_data("spurious", 0444, desc->dir, |
357 | &irq_spurious_proc_fops, (void *)(long)irq); | 369 | &irq_spurious_proc_fops, (void *)(long)irq); |
370 | |||
371 | out_unlock: | ||
372 | mutex_unlock(®ister_lock); | ||
358 | } | 373 | } |
359 | 374 | ||
360 | void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | 375 | void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) |
@@ -460,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v) | |||
460 | for_each_online_cpu(j) | 475 | for_each_online_cpu(j) |
461 | any_count |= kstat_irqs_cpu(i, j); | 476 | any_count |= kstat_irqs_cpu(i, j); |
462 | action = desc->action; | 477 | action = desc->action; |
463 | if (!action && !any_count) | 478 | if ((!action || action == &chained_action) && !any_count) |
464 | goto out; | 479 | goto out; |
465 | 480 | ||
466 | seq_printf(p, "%*d: ", prec, i); | 481 | seq_printf(p, "%*d: ", prec, i); |
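
The proc.c hunk serializes register_irq_proc() with a function-local mutex and re-checks desc->dir only once the lock is held, because two tasks adding handlers can race to create /proc/irq/<irq>. The same check-under-lock shape in a small userspace analog (illustrative C with pthreads; `registered` stands in for desc->dir and malloc() for proc_mkdir()):

    #include <pthread.h>
    #include <stdlib.h>

    static void *registered;                /* analogue of desc->dir */

    static void register_once(void)
    {
            static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

            pthread_mutex_lock(&lock);
            if (registered)                 /* another task won the race */
                    goto out_unlock;
            registered = malloc(64);        /* analogue of proc_mkdir() */
    out_unlock:
            pthread_mutex_unlock(&lock);
    }
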
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 3320b84cc60f..320579d89091 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h | |||
@@ -15,6 +15,7 @@ enum { | |||
15 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, | 15 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, |
16 | _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, | 16 | _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, |
17 | _IRQ_IS_POLLED = IRQ_IS_POLLED, | 17 | _IRQ_IS_POLLED = IRQ_IS_POLLED, |
18 | _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, | ||
18 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, | 19 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, |
19 | }; | 20 | }; |
20 | 21 | ||
@@ -28,6 +29,7 @@ enum { | |||
28 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | 29 | #define IRQ_NESTED_THREAD GOT_YOU_MORON |
29 | #define IRQ_PER_CPU_DEVID GOT_YOU_MORON | 30 | #define IRQ_PER_CPU_DEVID GOT_YOU_MORON |
30 | #define IRQ_IS_POLLED GOT_YOU_MORON | 31 | #define IRQ_IS_POLLED GOT_YOU_MORON |
32 | #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON | ||
31 | #undef IRQF_MODIFY_MASK | 33 | #undef IRQF_MODIFY_MASK |
32 | #define IRQF_MODIFY_MASK GOT_YOU_MORON | 34 | #define IRQF_MODIFY_MASK GOT_YOU_MORON |
33 | 35 | ||
@@ -154,3 +156,13 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc) | |||
154 | { | 156 | { |
155 | return desc->status_use_accessors & _IRQ_IS_POLLED; | 157 | return desc->status_use_accessors & _IRQ_IS_POLLED; |
156 | } | 158 | } |
159 | |||
160 | static inline bool irq_settings_disable_unlazy(struct irq_desc *desc) | ||
161 | { | ||
162 | return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY; | ||
163 | } | ||
164 | |||
165 | static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc) | ||
166 | { | ||
167 | desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY; | ||
168 | } | ||
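
settings.h gains accessors for the new _IRQ_DISABLE_UNLAZY bit, which lets a driver opt out of lazy disabling (where disable_irq() defers the chip-level mask until another interrupt actually arrives). A hedged usage sketch, assuming the existing irq_set_status_flags()/irq_clear_status_flags() helpers are the intended way for a driver to request it:

    /* Ask the core to mask the line immediately in disable_irq(). */
    irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY);

    /* ... later, restore the default lazy behaviour ... */
    irq_clear_status_flags(irq, IRQ_DISABLE_UNLAZY);
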
diff --git a/kernel/kexec.c b/kernel/kexec.c index 4c5edc357923..d873b64fbddc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -6,6 +6,8 @@ | |||
6 | * Version 2. See the file COPYING for more details. | 6 | * Version 2. See the file COPYING for more details. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
10 | |||
9 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
10 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
11 | #include <linux/file.h> | 13 | #include <linux/file.h> |
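
kexec.c (and the two kexec files below) define pr_fmt() in terms of KBUILD_MODNAME so every pr_*() call in the file is prefixed consistently without repeating the string. A minimal sketch of the mechanism; the actual prefix comes from the object name the build system assigns:

    /* Must be defined before any header that pulls in printk.h. */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

    #include <linux/printk.h>

    static void report(int err)
    {
            /* Prints e.g. "kexec: something failed: -12" */
            pr_warn("something failed: %d\n", err);
    }
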
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 201b45327804..11b64a63c0f8 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Version 2. See the file COPYING for more details. | 6 | * Version 2. See the file COPYING for more details. |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #define pr_fmt(fmt) "kexec: " fmt | 9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
10 | 10 | ||
11 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void) | |||
1027 | 1027 | ||
1028 | crash_notes = __alloc_percpu(size, align); | 1028 | crash_notes = __alloc_percpu(size, align); |
1029 | if (!crash_notes) { | 1029 | if (!crash_notes) { |
1030 | pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); | 1030 | pr_warn("Memory allocation for saving cpu register states failed\n"); |
1031 | return -ENOMEM; | 1031 | return -ENOMEM; |
1032 | } | 1032 | } |
1033 | return 0; | 1033 | return 0; |
@@ -1149,7 +1149,7 @@ static int __init parse_crashkernel_simple(char *cmdline, | |||
1149 | if (*cur == '@') | 1149 | if (*cur == '@') |
1150 | *crash_base = memparse(cur+1, &cur); | 1150 | *crash_base = memparse(cur+1, &cur); |
1151 | else if (*cur != ' ' && *cur != '\0') { | 1151 | else if (*cur != ' ' && *cur != '\0') { |
1152 | pr_warn("crashkernel: unrecognized char\n"); | 1152 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); |
1153 | return -EINVAL; | 1153 | return -EINVAL; |
1154 | } | 1154 | } |
1155 | 1155 | ||
@@ -1186,12 +1186,12 @@ static int __init parse_crashkernel_suffix(char *cmdline, | |||
1186 | 1186 | ||
1187 | /* check with suffix */ | 1187 | /* check with suffix */ |
1188 | if (strncmp(cur, suffix, strlen(suffix))) { | 1188 | if (strncmp(cur, suffix, strlen(suffix))) { |
1189 | pr_warn("crashkernel: unrecognized char\n"); | 1189 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); |
1190 | return -EINVAL; | 1190 | return -EINVAL; |
1191 | } | 1191 | } |
1192 | cur += strlen(suffix); | 1192 | cur += strlen(suffix); |
1193 | if (*cur != ' ' && *cur != '\0') { | 1193 | if (*cur != ' ' && *cur != '\0') { |
1194 | pr_warn("crashkernel: unrecognized char\n"); | 1194 | pr_warn("crashkernel: unrecognized char: %c\n", *cur); |
1195 | return -EINVAL; | 1195 | return -EINVAL; |
1196 | } | 1196 | } |
1197 | 1197 | ||
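
The crashkernel warnings above now name the offending character, which matters because parse_crashkernel_simple() accepts the grammar "size[@offset]" and rejects anything memparse() does not consume. A userspace sketch of that split, using strtoull() in place of memparse() (so no K/M/G suffixes; feed it e.g. "0x10000000@0x4000000"); it illustrates the grammar, not the kernel routine:

    #include <stdio.h>
    #include <stdlib.h>

    /* Parse "size[@offset]" with plain numeric values. */
    static int parse_simple(const char *s, unsigned long long *size,
                            unsigned long long *base)
    {
            char *cur;

            *size = strtoull(s, &cur, 0);
            if (cur == s)
                    return -1;              /* no size at all */

            *base = 0;
            if (*cur == '@') {
                    *base = strtoull(cur + 1, &cur, 0);
            } else if (*cur != ' ' && *cur != '\0') {
                    fprintf(stderr, "crashkernel: unrecognized char: %c\n", *cur);
                    return -1;
            }
            return 0;
    }
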
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6a9a3f2a0e8e..b70ada0028d2 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c | |||
@@ -9,6 +9,8 @@ | |||
9 | * Version 2. See the file COPYING for more details. | 9 | * Version 2. See the file COPYING for more details. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
13 | |||
12 | #include <linux/capability.h> | 14 | #include <linux/capability.h> |
13 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
14 | #include <linux/file.h> | 16 | #include <linux/file.h> |
diff --git a/kernel/kmod.c b/kernel/kmod.c index da98d0593de2..0277d1216f80 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work) | |||
327 | call_usermodehelper_exec_sync(sub_info); | 327 | call_usermodehelper_exec_sync(sub_info); |
328 | } else { | 328 | } else { |
329 | pid_t pid; | 329 | pid_t pid; |
330 | 330 | /* | |
331 | * Use CLONE_PARENT to reparent it to kthreadd; we do not | ||
332 | * want to pollute current->children, and we need a parent | ||
333 | * that always ignores SIGCHLD to ensure auto-reaping. | ||
334 | */ | ||
331 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, | 335 | pid = kernel_thread(call_usermodehelper_exec_async, sub_info, |
332 | SIGCHLD); | 336 | CLONE_PARENT | SIGCHLD); |
333 | if (pid < 0) { | 337 | if (pid < 0) { |
334 | sub_info->retval = pid; | 338 | sub_info->retval = pid; |
335 | umh_complete(sub_info); | 339 | umh_complete(sub_info); |
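
The kmod.c hunk reparents the async usermode helper to kthreadd with CLONE_PARENT because kthreadd ignores SIGCHLD, so exited helpers are reaped automatically instead of lingering as zombies. The auto-reap behaviour itself is plain POSIX semantics, shown here in userspace:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    int main(void)
    {
            /* A parent with SIGCHLD set to SIG_IGN never accumulates zombies:
             * the kernel reaps exiting children automatically. */
            signal(SIGCHLD, SIG_IGN);

            for (int i = 0; i < 4; i++) {
                    pid_t pid = fork();
                    if (pid == 0)
                            _exit(0);       /* child exits; no <defunct> entry remains */
                    printf("spawned %d\n", (int)pid);
            }
            sleep(2);                       /* check with ps while we wait */
            return 0;
    }
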
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8acfbf773e06..deae3907ac1e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
@@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) | |||
2738 | return; | 2738 | return; |
2739 | 2739 | ||
2740 | /* no reclaim without waiting on it */ | 2740 | /* no reclaim without waiting on it */ |
2741 | if (!(gfp_mask & __GFP_WAIT)) | 2741 | if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) |
2742 | return; | 2742 | return; |
2743 | 2743 | ||
2744 | /* this guy won't enter reclaim */ | 2744 | /* this guy won't enter reclaim */ |
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock); | |||
3068 | static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | 3068 | static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, |
3069 | int trylock, int read, int check, int hardirqs_off, | 3069 | int trylock, int read, int check, int hardirqs_off, |
3070 | struct lockdep_map *nest_lock, unsigned long ip, | 3070 | struct lockdep_map *nest_lock, unsigned long ip, |
3071 | int references) | 3071 | int references, int pin_count) |
3072 | { | 3072 | { |
3073 | struct task_struct *curr = current; | 3073 | struct task_struct *curr = current; |
3074 | struct lock_class *class = NULL; | 3074 | struct lock_class *class = NULL; |
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3157 | hlock->waittime_stamp = 0; | 3157 | hlock->waittime_stamp = 0; |
3158 | hlock->holdtime_stamp = lockstat_clock(); | 3158 | hlock->holdtime_stamp = lockstat_clock(); |
3159 | #endif | 3159 | #endif |
3160 | hlock->pin_count = 0; | 3160 | hlock->pin_count = pin_count; |
3161 | 3161 | ||
3162 | if (check && !mark_irqflags(curr, hlock)) | 3162 | if (check && !mark_irqflags(curr, hlock)) |
3163 | return 0; | 3163 | return 0; |
@@ -3343,7 +3343,7 @@ found_it: | |||
3343 | hlock_class(hlock)->subclass, hlock->trylock, | 3343 | hlock_class(hlock)->subclass, hlock->trylock, |
3344 | hlock->read, hlock->check, hlock->hardirqs_off, | 3344 | hlock->read, hlock->check, hlock->hardirqs_off, |
3345 | hlock->nest_lock, hlock->acquire_ip, | 3345 | hlock->nest_lock, hlock->acquire_ip, |
3346 | hlock->references)) | 3346 | hlock->references, hlock->pin_count)) |
3347 | return 0; | 3347 | return 0; |
3348 | } | 3348 | } |
3349 | 3349 | ||
@@ -3433,7 +3433,7 @@ found_it: | |||
3433 | hlock_class(hlock)->subclass, hlock->trylock, | 3433 | hlock_class(hlock)->subclass, hlock->trylock, |
3434 | hlock->read, hlock->check, hlock->hardirqs_off, | 3434 | hlock->read, hlock->check, hlock->hardirqs_off, |
3435 | hlock->nest_lock, hlock->acquire_ip, | 3435 | hlock->nest_lock, hlock->acquire_ip, |
3436 | hlock->references)) | 3436 | hlock->references, hlock->pin_count)) |
3437 | return 0; | 3437 | return 0; |
3438 | } | 3438 | } |
3439 | 3439 | ||
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3583 | current->lockdep_recursion = 1; | 3583 | current->lockdep_recursion = 1; |
3584 | trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); | 3584 | trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); |
3585 | __lock_acquire(lock, subclass, trylock, read, check, | 3585 | __lock_acquire(lock, subclass, trylock, read, check, |
3586 | irqs_disabled_flags(flags), nest_lock, ip, 0); | 3586 | irqs_disabled_flags(flags), nest_lock, ip, 0, 0); |
3587 | current->lockdep_recursion = 0; | 3587 | current->lockdep_recursion = 0; |
3588 | raw_local_irq_restore(flags); | 3588 | raw_local_irq_restore(flags); |
3589 | } | 3589 | } |
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 32244186f1f2..8ef1919d63b2 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c | |||
@@ -17,12 +17,14 @@ | |||
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2014 | 18 | * Copyright (C) IBM Corporation, 2014 |
19 | * | 19 | * |
20 | * Author: Paul E. McKenney <paulmck@us.ibm.com> | 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> |
21 | * Davidlohr Bueso <dave@stgolabs.net> | ||
21 | * Based on kernel/rcu/torture.c. | 22 | * Based on kernel/rcu/torture.c. |
22 | */ | 23 | */ |
23 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
24 | #include <linux/module.h> | 25 | #include <linux/module.h> |
25 | #include <linux/kthread.h> | 26 | #include <linux/kthread.h> |
27 | #include <linux/sched/rt.h> | ||
26 | #include <linux/spinlock.h> | 28 | #include <linux/spinlock.h> |
27 | #include <linux/rwlock.h> | 29 | #include <linux/rwlock.h> |
28 | #include <linux/mutex.h> | 30 | #include <linux/mutex.h> |
@@ -34,6 +36,7 @@ | |||
34 | #include <linux/moduleparam.h> | 36 | #include <linux/moduleparam.h> |
35 | #include <linux/delay.h> | 37 | #include <linux/delay.h> |
36 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <linux/percpu-rwsem.h> | ||
37 | #include <linux/torture.h> | 40 | #include <linux/torture.h> |
38 | 41 | ||
39 | MODULE_LICENSE("GPL"); | 42 | MODULE_LICENSE("GPL"); |
@@ -91,11 +94,13 @@ struct lock_torture_ops { | |||
91 | void (*init)(void); | 94 | void (*init)(void); |
92 | int (*writelock)(void); | 95 | int (*writelock)(void); |
93 | void (*write_delay)(struct torture_random_state *trsp); | 96 | void (*write_delay)(struct torture_random_state *trsp); |
97 | void (*task_boost)(struct torture_random_state *trsp); | ||
94 | void (*writeunlock)(void); | 98 | void (*writeunlock)(void); |
95 | int (*readlock)(void); | 99 | int (*readlock)(void); |
96 | void (*read_delay)(struct torture_random_state *trsp); | 100 | void (*read_delay)(struct torture_random_state *trsp); |
97 | void (*readunlock)(void); | 101 | void (*readunlock)(void); |
98 | unsigned long flags; | 102 | |
103 | unsigned long flags; /* for irq spinlocks */ | ||
99 | const char *name; | 104 | const char *name; |
100 | }; | 105 | }; |
101 | 106 | ||
@@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void) | |||
139 | /* BUGGY, do not use in real life!!! */ | 144 | /* BUGGY, do not use in real life!!! */ |
140 | } | 145 | } |
141 | 146 | ||
147 | static void torture_boost_dummy(struct torture_random_state *trsp) | ||
148 | { | ||
149 | /* Only rtmutexes care about priority */ | ||
150 | } | ||
151 | |||
142 | static struct lock_torture_ops lock_busted_ops = { | 152 | static struct lock_torture_ops lock_busted_ops = { |
143 | .writelock = torture_lock_busted_write_lock, | 153 | .writelock = torture_lock_busted_write_lock, |
144 | .write_delay = torture_lock_busted_write_delay, | 154 | .write_delay = torture_lock_busted_write_delay, |
155 | .task_boost = torture_boost_dummy, | ||
145 | .writeunlock = torture_lock_busted_write_unlock, | 156 | .writeunlock = torture_lock_busted_write_unlock, |
146 | .readlock = NULL, | 157 | .readlock = NULL, |
147 | .read_delay = NULL, | 158 | .read_delay = NULL, |
@@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) | |||
185 | static struct lock_torture_ops spin_lock_ops = { | 196 | static struct lock_torture_ops spin_lock_ops = { |
186 | .writelock = torture_spin_lock_write_lock, | 197 | .writelock = torture_spin_lock_write_lock, |
187 | .write_delay = torture_spin_lock_write_delay, | 198 | .write_delay = torture_spin_lock_write_delay, |
199 | .task_boost = torture_boost_dummy, | ||
188 | .writeunlock = torture_spin_lock_write_unlock, | 200 | .writeunlock = torture_spin_lock_write_unlock, |
189 | .readlock = NULL, | 201 | .readlock = NULL, |
190 | .read_delay = NULL, | 202 | .read_delay = NULL, |
@@ -211,6 +223,7 @@ __releases(torture_spinlock) | |||
211 | static struct lock_torture_ops spin_lock_irq_ops = { | 223 | static struct lock_torture_ops spin_lock_irq_ops = { |
212 | .writelock = torture_spin_lock_write_lock_irq, | 224 | .writelock = torture_spin_lock_write_lock_irq, |
213 | .write_delay = torture_spin_lock_write_delay, | 225 | .write_delay = torture_spin_lock_write_delay, |
226 | .task_boost = torture_boost_dummy, | ||
214 | .writeunlock = torture_lock_spin_write_unlock_irq, | 227 | .writeunlock = torture_lock_spin_write_unlock_irq, |
215 | .readlock = NULL, | 228 | .readlock = NULL, |
216 | .read_delay = NULL, | 229 | .read_delay = NULL, |
@@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) | |||
275 | static struct lock_torture_ops rw_lock_ops = { | 288 | static struct lock_torture_ops rw_lock_ops = { |
276 | .writelock = torture_rwlock_write_lock, | 289 | .writelock = torture_rwlock_write_lock, |
277 | .write_delay = torture_rwlock_write_delay, | 290 | .write_delay = torture_rwlock_write_delay, |
291 | .task_boost = torture_boost_dummy, | ||
278 | .writeunlock = torture_rwlock_write_unlock, | 292 | .writeunlock = torture_rwlock_write_unlock, |
279 | .readlock = torture_rwlock_read_lock, | 293 | .readlock = torture_rwlock_read_lock, |
280 | .read_delay = torture_rwlock_read_delay, | 294 | .read_delay = torture_rwlock_read_delay, |
@@ -315,6 +329,7 @@ __releases(torture_rwlock) | |||
315 | static struct lock_torture_ops rw_lock_irq_ops = { | 329 | static struct lock_torture_ops rw_lock_irq_ops = { |
316 | .writelock = torture_rwlock_write_lock_irq, | 330 | .writelock = torture_rwlock_write_lock_irq, |
317 | .write_delay = torture_rwlock_write_delay, | 331 | .write_delay = torture_rwlock_write_delay, |
332 | .task_boost = torture_boost_dummy, | ||
318 | .writeunlock = torture_rwlock_write_unlock_irq, | 333 | .writeunlock = torture_rwlock_write_unlock_irq, |
319 | .readlock = torture_rwlock_read_lock_irq, | 334 | .readlock = torture_rwlock_read_lock_irq, |
320 | .read_delay = torture_rwlock_read_delay, | 335 | .read_delay = torture_rwlock_read_delay, |
@@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex) | |||
354 | static struct lock_torture_ops mutex_lock_ops = { | 369 | static struct lock_torture_ops mutex_lock_ops = { |
355 | .writelock = torture_mutex_lock, | 370 | .writelock = torture_mutex_lock, |
356 | .write_delay = torture_mutex_delay, | 371 | .write_delay = torture_mutex_delay, |
372 | .task_boost = torture_boost_dummy, | ||
357 | .writeunlock = torture_mutex_unlock, | 373 | .writeunlock = torture_mutex_unlock, |
358 | .readlock = NULL, | 374 | .readlock = NULL, |
359 | .read_delay = NULL, | 375 | .read_delay = NULL, |
@@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = { | |||
361 | .name = "mutex_lock" | 377 | .name = "mutex_lock" |
362 | }; | 378 | }; |
363 | 379 | ||
380 | #ifdef CONFIG_RT_MUTEXES | ||
381 | static DEFINE_RT_MUTEX(torture_rtmutex); | ||
382 | |||
383 | static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) | ||
384 | { | ||
385 | rt_mutex_lock(&torture_rtmutex); | ||
386 | return 0; | ||
387 | } | ||
388 | |||
389 | static void torture_rtmutex_boost(struct torture_random_state *trsp) | ||
390 | { | ||
391 | int policy; | ||
392 | struct sched_param param; | ||
393 | const unsigned int factor = 50000; /* yes, quite arbitrary */ | ||
394 | |||
395 | if (!rt_task(current)) { | ||
396 | /* | ||
397 | * (1) Boost priority once every ~50k operations. When the | ||
398 | * task tries to take the lock, the rtmutex will account | ||
399 | * for the new priority and do any corresponding pi-dance. | ||
400 | */ | ||
401 | if (!(torture_random(trsp) % | ||
402 | (cxt.nrealwriters_stress * factor))) { | ||
403 | policy = SCHED_FIFO; | ||
404 | param.sched_priority = MAX_RT_PRIO - 1; | ||
405 | } else /* common case, do nothing */ | ||
406 | return; | ||
407 | } else { | ||
408 | /* | ||
409 | * The task will remain boosted for another ~500k operations, | ||
410 | * then be restored to its original prio, and so forth. | ||
411 | * | ||
412 | * When @trsp is NULL, we want to force-reset the task's prio | ||
413 | * before stopping the kthread. | ||
414 | */ | ||
415 | if (!trsp || !(torture_random(trsp) % | ||
416 | (cxt.nrealwriters_stress * factor * 2))) { | ||
417 | policy = SCHED_NORMAL; | ||
418 | param.sched_priority = 0; | ||
419 | } else /* common case, do nothing */ | ||
420 | return; | ||
421 | } | ||
422 | |||
423 | sched_setscheduler_nocheck(current, policy, ¶m); | ||
424 | } | ||
425 | |||
426 | static void torture_rtmutex_delay(struct torture_random_state *trsp) | ||
427 | { | ||
428 | const unsigned long shortdelay_us = 2; | ||
429 | const unsigned long longdelay_ms = 100; | ||
430 | |||
431 | /* | ||
432 | * We want a short delay mostly to emulate likely code, and | ||
433 | * we want a long delay occasionally to force massive contention. | ||
434 | */ | ||
435 | if (!(torture_random(trsp) % | ||
436 | (cxt.nrealwriters_stress * 2000 * longdelay_ms))) | ||
437 | mdelay(longdelay_ms); | ||
438 | if (!(torture_random(trsp) % | ||
439 | (cxt.nrealwriters_stress * 2 * shortdelay_us))) | ||
440 | udelay(shortdelay_us); | ||
441 | #ifdef CONFIG_PREEMPT | ||
442 | if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) | ||
443 | preempt_schedule(); /* Allow test to be preempted. */ | ||
444 | #endif | ||
445 | } | ||
446 | |||
447 | static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) | ||
448 | { | ||
449 | rt_mutex_unlock(&torture_rtmutex); | ||
450 | } | ||
451 | |||
452 | static struct lock_torture_ops rtmutex_lock_ops = { | ||
453 | .writelock = torture_rtmutex_lock, | ||
454 | .write_delay = torture_rtmutex_delay, | ||
455 | .task_boost = torture_rtmutex_boost, | ||
456 | .writeunlock = torture_rtmutex_unlock, | ||
457 | .readlock = NULL, | ||
458 | .read_delay = NULL, | ||
459 | .readunlock = NULL, | ||
460 | .name = "rtmutex_lock" | ||
461 | }; | ||
462 | #endif | ||
463 | |||
364 | static DECLARE_RWSEM(torture_rwsem); | 464 | static DECLARE_RWSEM(torture_rwsem); |
365 | static int torture_rwsem_down_write(void) __acquires(torture_rwsem) | 465 | static int torture_rwsem_down_write(void) __acquires(torture_rwsem) |
366 | { | 466 | { |
@@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) | |||
419 | static struct lock_torture_ops rwsem_lock_ops = { | 519 | static struct lock_torture_ops rwsem_lock_ops = { |
420 | .writelock = torture_rwsem_down_write, | 520 | .writelock = torture_rwsem_down_write, |
421 | .write_delay = torture_rwsem_write_delay, | 521 | .write_delay = torture_rwsem_write_delay, |
522 | .task_boost = torture_boost_dummy, | ||
422 | .writeunlock = torture_rwsem_up_write, | 523 | .writeunlock = torture_rwsem_up_write, |
423 | .readlock = torture_rwsem_down_read, | 524 | .readlock = torture_rwsem_down_read, |
424 | .read_delay = torture_rwsem_read_delay, | 525 | .read_delay = torture_rwsem_read_delay, |
@@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = { | |||
426 | .name = "rwsem_lock" | 527 | .name = "rwsem_lock" |
427 | }; | 528 | }; |
428 | 529 | ||
530 | #include <linux/percpu-rwsem.h> | ||
531 | static struct percpu_rw_semaphore pcpu_rwsem; | ||
532 | |||
533 | void torture_percpu_rwsem_init(void) | ||
534 | { | ||
535 | BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); | ||
536 | } | ||
537 | |||
538 | static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) | ||
539 | { | ||
540 | percpu_down_write(&pcpu_rwsem); | ||
541 | return 0; | ||
542 | } | ||
543 | |||
544 | static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) | ||
545 | { | ||
546 | percpu_up_write(&pcpu_rwsem); | ||
547 | } | ||
548 | |||
549 | static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) | ||
550 | { | ||
551 | percpu_down_read(&pcpu_rwsem); | ||
552 | return 0; | ||
553 | } | ||
554 | |||
555 | static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) | ||
556 | { | ||
557 | percpu_up_read(&pcpu_rwsem); | ||
558 | } | ||
559 | |||
560 | static struct lock_torture_ops percpu_rwsem_lock_ops = { | ||
561 | .init = torture_percpu_rwsem_init, | ||
562 | .writelock = torture_percpu_rwsem_down_write, | ||
563 | .write_delay = torture_rwsem_write_delay, | ||
564 | .task_boost = torture_boost_dummy, | ||
565 | .writeunlock = torture_percpu_rwsem_up_write, | ||
566 | .readlock = torture_percpu_rwsem_down_read, | ||
567 | .read_delay = torture_rwsem_read_delay, | ||
568 | .readunlock = torture_percpu_rwsem_up_read, | ||
569 | .name = "percpu_rwsem_lock" | ||
570 | }; | ||
571 | |||
429 | /* | 572 | /* |
430 | * Lock torture writer kthread. Repeatedly acquires and releases | 573 | * Lock torture writer kthread. Repeatedly acquires and releases |
431 | * the lock, checking for duplicate acquisitions. | 574 | * the lock, checking for duplicate acquisitions. |
@@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg) | |||
442 | if ((torture_random(&rand) & 0xfffff) == 0) | 585 | if ((torture_random(&rand) & 0xfffff) == 0) |
443 | schedule_timeout_uninterruptible(1); | 586 | schedule_timeout_uninterruptible(1); |
444 | 587 | ||
588 | cxt.cur_ops->task_boost(&rand); | ||
445 | cxt.cur_ops->writelock(); | 589 | cxt.cur_ops->writelock(); |
446 | if (WARN_ON_ONCE(lock_is_write_held)) | 590 | if (WARN_ON_ONCE(lock_is_write_held)) |
447 | lwsp->n_lock_fail++; | 591 | lwsp->n_lock_fail++; |
@@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg) | |||
456 | 600 | ||
457 | stutter_wait("lock_torture_writer"); | 601 | stutter_wait("lock_torture_writer"); |
458 | } while (!torture_must_stop()); | 602 | } while (!torture_must_stop()); |
603 | |||
604 | cxt.cur_ops->task_boost(NULL); /* reset prio */ | ||
459 | torture_kthread_stopping("lock_torture_writer"); | 605 | torture_kthread_stopping("lock_torture_writer"); |
460 | return 0; | 606 | return 0; |
461 | } | 607 | } |
@@ -642,7 +788,11 @@ static int __init lock_torture_init(void) | |||
642 | &spin_lock_ops, &spin_lock_irq_ops, | 788 | &spin_lock_ops, &spin_lock_irq_ops, |
643 | &rw_lock_ops, &rw_lock_irq_ops, | 789 | &rw_lock_ops, &rw_lock_irq_ops, |
644 | &mutex_lock_ops, | 790 | &mutex_lock_ops, |
791 | #ifdef CONFIG_RT_MUTEXES | ||
792 | &rtmutex_lock_ops, | ||
793 | #endif | ||
645 | &rwsem_lock_ops, | 794 | &rwsem_lock_ops, |
795 | &percpu_rwsem_lock_ops, | ||
646 | }; | 796 | }; |
647 | 797 | ||
648 | if (!torture_init_begin(torture_type, verbose, &torture_runnable)) | 798 | if (!torture_init_begin(torture_type, verbose, &torture_runnable)) |
@@ -661,11 +811,11 @@ static int __init lock_torture_init(void) | |||
661 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) | 811 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) |
662 | pr_alert(" %s", torture_ops[i]->name); | 812 | pr_alert(" %s", torture_ops[i]->name); |
663 | pr_alert("\n"); | 813 | pr_alert("\n"); |
664 | torture_init_end(); | 814 | firsterr = -EINVAL; |
665 | return -EINVAL; | 815 | goto unwind; |
666 | } | 816 | } |
667 | if (cxt.cur_ops->init) | 817 | if (cxt.cur_ops->init) |
668 | cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 818 | cxt.cur_ops->init(); |
669 | 819 | ||
670 | if (nwriters_stress >= 0) | 820 | if (nwriters_stress >= 0) |
671 | cxt.nrealwriters_stress = nwriters_stress; | 821 | cxt.nrealwriters_stress = nwriters_stress; |
@@ -676,6 +826,10 @@ static int __init lock_torture_init(void) | |||
676 | if (strncmp(torture_type, "mutex", 5) == 0) | 826 | if (strncmp(torture_type, "mutex", 5) == 0) |
677 | cxt.debug_lock = true; | 827 | cxt.debug_lock = true; |
678 | #endif | 828 | #endif |
829 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
830 | if (strncmp(torture_type, "rtmutex", 7) == 0) | ||
831 | cxt.debug_lock = true; | ||
832 | #endif | ||
679 | #ifdef CONFIG_DEBUG_SPINLOCK | 833 | #ifdef CONFIG_DEBUG_SPINLOCK |
680 | if ((strncmp(torture_type, "spin", 4) == 0) || | 834 | if ((strncmp(torture_type, "spin", 4) == 0) || |
681 | (strncmp(torture_type, "rw_lock", 7) == 0)) | 835 | (strncmp(torture_type, "rw_lock", 7) == 0)) |
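
torture_rtmutex_boost() above flips the writer kthread between SCHED_NORMAL and SCHED_FIFO via sched_setscheduler_nocheck() so the rtmutex priority-inheritance machinery actually gets exercised. The same toggle is available from userspace through pthreads (needs CAP_SYS_NICE or an RLIMIT_RTPRIO allowance; a sketch, not the in-kernel path):

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>

    /* Boost the calling thread to SCHED_FIFO, or drop back to SCHED_OTHER. */
    static int set_boost(int boosted)
    {
            struct sched_param sp = { 0 };
            int policy = SCHED_OTHER;

            if (boosted) {
                    policy = SCHED_FIFO;
                    sp.sched_priority = sched_get_priority_max(SCHED_FIFO);
            }
            return pthread_setschedparam(pthread_self(), policy, &sp);
    }

    int main(void)
    {
            int ret = set_boost(1);

            if (ret)
                    fprintf(stderr, "boost failed: %s\n", strerror(ret));
            /* ... contend on a PTHREAD_PRIO_INHERIT mutex here ... */
            set_boost(0);
            return 0;
    }
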
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index fd91aaa4554c..5b9102a47ea5 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
@@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
67 | node->locked = 0; | 67 | node->locked = 0; |
68 | node->next = NULL; | 68 | node->next = NULL; |
69 | 69 | ||
70 | prev = xchg(lock, node); | 70 | prev = xchg_acquire(lock, node); |
71 | if (likely(prev == NULL)) { | 71 | if (likely(prev == NULL)) { |
72 | /* | 72 | /* |
73 | * Lock acquired, don't need to set node->locked to 1. Threads | 73 | * Lock acquired, don't need to set node->locked to 1. Threads |
@@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
98 | /* | 98 | /* |
99 | * Release the lock by setting it to NULL | 99 | * Release the lock by setting it to NULL |
100 | */ | 100 | */ |
101 | if (likely(cmpxchg(lock, node, NULL) == node)) | 101 | if (likely(cmpxchg_release(lock, node, NULL) == node)) |
102 | return; | 102 | return; |
103 | /* Wait until the next pointer is set */ | 103 | /* Wait until the next pointer is set */ |
104 | while (!(next = READ_ONCE(node->next))) | 104 | while (!(next = READ_ONCE(node->next))) |
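
This mcs_spinlock hunk, and the mutex, osq_lock, rwsem-xadd and rtmutex hunks that follow, all relax fully-ordered xchg()/cmpxchg() to _acquire/_release variants: taking a lock only needs ACQUIRE ordering and dropping it only needs RELEASE. The same split in portable C11 atomics, as a tiny test-and-set lock (a sketch of the ordering argument, not of the kernel's queued locks):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool locked;

    static void lock(void)
    {
            /* ACQUIRE: critical-section accesses cannot be hoisted above this RMW. */
            while (atomic_exchange_explicit(&locked, true, memory_order_acquire))
                    ;       /* spin */
    }

    static void unlock(void)
    {
            /* RELEASE: critical-section accesses cannot sink below this store. */
            atomic_store_explicit(&locked, false, memory_order_release);
    }
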
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4cccea6b8934..0551c219c40e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
277 | static inline bool mutex_try_to_acquire(struct mutex *lock) | 277 | static inline bool mutex_try_to_acquire(struct mutex *lock) |
278 | { | 278 | { |
279 | return !mutex_is_locked(lock) && | 279 | return !mutex_is_locked(lock) && |
280 | (atomic_cmpxchg(&lock->count, 1, 0) == 1); | 280 | (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); |
281 | } | 281 | } |
282 | 282 | ||
283 | /* | 283 | /* |
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
529 | * Once more, try to acquire the lock. Only try-lock the mutex if | 529 | * Once more, try to acquire the lock. Only try-lock the mutex if |
530 | * it is unlocked to reduce unnecessary xchg() operations. | 530 | * it is unlocked to reduce unnecessary xchg() operations. |
531 | */ | 531 | */ |
532 | if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) | 532 | if (!mutex_is_locked(lock) && |
533 | (atomic_xchg_acquire(&lock->count, 0) == 1)) | ||
533 | goto skip_wait; | 534 | goto skip_wait; |
534 | 535 | ||
535 | debug_mutex_lock_common(lock, &waiter); | 536 | debug_mutex_lock_common(lock, &waiter); |
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
553 | * non-negative in order to avoid unnecessary xchg operations: | 554 | * non-negative in order to avoid unnecessary xchg operations: |
554 | */ | 555 | */ |
555 | if (atomic_read(&lock->count) >= 0 && | 556 | if (atomic_read(&lock->count) >= 0 && |
556 | (atomic_xchg(&lock->count, -1) == 1)) | 557 | (atomic_xchg_acquire(&lock->count, -1) == 1)) |
557 | break; | 558 | break; |
558 | 559 | ||
559 | /* | 560 | /* |
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
867 | 868 | ||
868 | spin_lock_mutex(&lock->wait_lock, flags); | 869 | spin_lock_mutex(&lock->wait_lock, flags); |
869 | 870 | ||
870 | prev = atomic_xchg(&lock->count, -1); | 871 | prev = atomic_xchg_acquire(&lock->count, -1); |
871 | if (likely(prev == 1)) { | 872 | if (likely(prev == 1)) { |
872 | mutex_set_owner(lock); | 873 | mutex_set_owner(lock); |
873 | mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); | 874 | mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); |
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index dc85ee23a26f..d092a0c9c2d4 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c | |||
@@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, | |||
50 | 50 | ||
51 | for (;;) { | 51 | for (;;) { |
52 | if (atomic_read(&lock->tail) == curr && | 52 | if (atomic_read(&lock->tail) == curr && |
53 | atomic_cmpxchg(&lock->tail, curr, old) == curr) { | 53 | atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { |
54 | /* | 54 | /* |
55 | * We were the last queued, we moved @lock back. @prev | 55 | * We were the last queued, we moved @lock back. @prev |
56 | * will now observe @lock and will complete its | 56 | * will now observe @lock and will complete its |
@@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
92 | node->next = NULL; | 92 | node->next = NULL; |
93 | node->cpu = curr; | 93 | node->cpu = curr; |
94 | 94 | ||
95 | old = atomic_xchg(&lock->tail, curr); | 95 | /* |
96 | * ACQUIRE semantics, pairs with the corresponding RELEASE | ||
97 | * in unlock()'s uncontended path, or the fastpath. | ||
98 | */ | ||
99 | old = atomic_xchg_acquire(&lock->tail, curr); | ||
96 | if (old == OSQ_UNLOCKED_VAL) | 100 | if (old == OSQ_UNLOCKED_VAL) |
97 | return true; | 101 | return true; |
98 | 102 | ||
@@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock) | |||
184 | /* | 188 | /* |
185 | * Fast path for the uncontended case. | 189 | * Fast path for the uncontended case. |
186 | */ | 190 | */ |
187 | if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) | 191 | if (likely(atomic_cmpxchg_release(&lock->tail, curr, |
192 | OSQ_UNLOCKED_VAL) == curr)) | ||
188 | return; | 193 | return; |
189 | 194 | ||
190 | /* | 195 | /* |
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f32567254867..f231e0bb311c 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
@@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, | |||
17 | 17 | ||
18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ | 18 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ |
19 | __init_rwsem(&brw->rw_sem, name, rwsem_key); | 19 | __init_rwsem(&brw->rw_sem, name, rwsem_key); |
20 | atomic_set(&brw->write_ctr, 0); | 20 | rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); |
21 | atomic_set(&brw->slow_read_ctr, 0); | 21 | atomic_set(&brw->slow_read_ctr, 0); |
22 | init_waitqueue_head(&brw->write_waitq); | 22 | init_waitqueue_head(&brw->write_waitq); |
23 | return 0; | 23 | return 0; |
24 | } | 24 | } |
25 | EXPORT_SYMBOL_GPL(__percpu_init_rwsem); | ||
25 | 26 | ||
26 | void percpu_free_rwsem(struct percpu_rw_semaphore *brw) | 27 | void percpu_free_rwsem(struct percpu_rw_semaphore *brw) |
27 | { | 28 | { |
29 | /* | ||
30 | * XXX: temporary kludge. The error path in alloc_super() | ||
31 | * assumes that percpu_free_rwsem() is safe after kzalloc(). | ||
32 | */ | ||
33 | if (!brw->fast_read_ctr) | ||
34 | return; | ||
35 | |||
36 | rcu_sync_dtor(&brw->rss); | ||
28 | free_percpu(brw->fast_read_ctr); | 37 | free_percpu(brw->fast_read_ctr); |
29 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ | 38 | brw->fast_read_ctr = NULL; /* catch use after free bugs */ |
30 | } | 39 | } |
31 | 40 | ||
32 | /* | 41 | /* |
33 | * This is the fast-path for down_read/up_read, it only needs to ensure | 42 | * This is the fast-path for down_read/up_read. If it succeeds we rely |
34 | * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the | 43 | * on the barriers provided by rcu_sync_enter/exit; see the comments in |
35 | * fast per-cpu counter. The writer uses synchronize_sched_expedited() to | 44 | * percpu_down_write() and percpu_up_write(). |
36 | * serialize with the preempt-disabled section below. | ||
37 | * | ||
38 | * The nontrivial part is that we should guarantee acquire/release semantics | ||
39 | * in case when | ||
40 | * | ||
41 | * R_W: down_write() comes after up_read(), the writer should see all | ||
42 | * changes done by the reader | ||
43 | * or | ||
44 | * W_R: down_read() comes after up_write(), the reader should see all | ||
45 | * changes done by the writer | ||
46 | * | 45 | * |
47 | * If this helper fails the callers rely on the normal rw_semaphore and | 46 | * If this helper fails the callers rely on the normal rw_semaphore and |
48 | * atomic_dec_and_test(), so in this case we have the necessary barriers. | 47 | * atomic_dec_and_test(), so in this case we have the necessary barriers. |
49 | * | ||
50 | * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or | ||
51 | * __this_cpu_add() below can be reordered with any LOAD/STORE done by the | ||
52 | * reader inside the critical section. See the comments in down_write and | ||
53 | * up_write below. | ||
54 | */ | 48 | */ |
55 | static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) | 49 | static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) |
56 | { | 50 | { |
57 | bool success = false; | 51 | bool success; |
58 | 52 | ||
59 | preempt_disable(); | 53 | preempt_disable(); |
60 | if (likely(!atomic_read(&brw->write_ctr))) { | 54 | success = rcu_sync_is_idle(&brw->rss); |
55 | if (likely(success)) | ||
61 | __this_cpu_add(*brw->fast_read_ctr, val); | 56 | __this_cpu_add(*brw->fast_read_ctr, val); |
62 | success = true; | ||
63 | } | ||
64 | preempt_enable(); | 57 | preempt_enable(); |
65 | 58 | ||
66 | return success; | 59 | return success; |
@@ -77,16 +70,17 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) | |||
77 | void percpu_down_read(struct percpu_rw_semaphore *brw) | 70 | void percpu_down_read(struct percpu_rw_semaphore *brw) |
78 | { | 71 | { |
79 | might_sleep(); | 72 | might_sleep(); |
80 | if (likely(update_fast_ctr(brw, +1))) { | 73 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); |
81 | rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); | 74 | |
75 | if (likely(update_fast_ctr(brw, +1))) | ||
82 | return; | 76 | return; |
83 | } | ||
84 | 77 | ||
85 | down_read(&brw->rw_sem); | 78 | /* Avoid rwsem_acquire_read() and rwsem_release() */ |
79 | __down_read(&brw->rw_sem); | ||
86 | atomic_inc(&brw->slow_read_ctr); | 80 | atomic_inc(&brw->slow_read_ctr); |
87 | /* avoid up_read()->rwsem_release() */ | ||
88 | __up_read(&brw->rw_sem); | 81 | __up_read(&brw->rw_sem); |
89 | } | 82 | } |
83 | EXPORT_SYMBOL_GPL(percpu_down_read); | ||
90 | 84 | ||
91 | int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) | 85 | int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) |
92 | { | 86 | { |
@@ -112,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw) | |||
112 | if (atomic_dec_and_test(&brw->slow_read_ctr)) | 106 | if (atomic_dec_and_test(&brw->slow_read_ctr)) |
113 | wake_up_all(&brw->write_waitq); | 107 | wake_up_all(&brw->write_waitq); |
114 | } | 108 | } |
109 | EXPORT_SYMBOL_GPL(percpu_up_read); | ||
115 | 110 | ||
116 | static int clear_fast_ctr(struct percpu_rw_semaphore *brw) | 111 | static int clear_fast_ctr(struct percpu_rw_semaphore *brw) |
117 | { | 112 | { |
@@ -126,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) | |||
126 | return sum; | 121 | return sum; |
127 | } | 122 | } |
128 | 123 | ||
129 | /* | ||
130 | * A writer increments ->write_ctr to force the readers to switch to the | ||
131 | * slow mode, note the atomic_read() check in update_fast_ctr(). | ||
132 | * | ||
133 | * After that the readers can only inc/dec the slow ->slow_read_ctr counter, | ||
134 | * ->fast_read_ctr is stable. Once the writer moves its sum into the slow | ||
135 | * counter it represents the number of active readers. | ||
136 | * | ||
137 | * Finally the writer takes ->rw_sem for writing and blocks the new readers, | ||
138 | * then waits until the slow counter becomes zero. | ||
139 | */ | ||
140 | void percpu_down_write(struct percpu_rw_semaphore *brw) | 124 | void percpu_down_write(struct percpu_rw_semaphore *brw) |
141 | { | 125 | { |
142 | /* tell update_fast_ctr() there is a pending writer */ | ||
143 | atomic_inc(&brw->write_ctr); | ||
144 | /* | 126 | /* |
145 | * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read | 127 | * Make rcu_sync_is_idle() == F and thus disable the fast-path in |
146 | * so that update_fast_ctr() can't succeed. | 128 | * percpu_down_read() and percpu_up_read(), and wait for gp pass. |
147 | * | ||
148 | * 2. Ensures we see the result of every previous this_cpu_add() in | ||
149 | * update_fast_ctr(). | ||
150 | * | 129 | * |
151 | * 3. Ensures that if any reader has exited its critical section via | 130 | * The latter synchronises us with the preceding readers which used |
152 | * fast-path, it executes a full memory barrier before we return. | 131 | * the fast-path, so we cannot miss the result of __this_cpu_add() |
153 | * See R_W case in the comment above update_fast_ctr(). | 132 | * or anything else inside their critical sections. |
154 | */ | 133 | */ |
155 | synchronize_sched_expedited(); | 134 | rcu_sync_enter(&brw->rss); |
156 | 135 | ||
157 | /* exclude other writers, and block the new readers completely */ | 136 | /* exclude other writers, and block the new readers completely */ |
158 | down_write(&brw->rw_sem); | 137 | down_write(&brw->rw_sem); |
@@ -163,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) | |||
163 | /* wait for all readers to complete their percpu_up_read() */ | 142 | /* wait for all readers to complete their percpu_up_read() */ |
164 | wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); | 143 | wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); |
165 | } | 144 | } |
145 | EXPORT_SYMBOL_GPL(percpu_down_write); | ||
166 | 146 | ||
167 | void percpu_up_write(struct percpu_rw_semaphore *brw) | 147 | void percpu_up_write(struct percpu_rw_semaphore *brw) |
168 | { | 148 | { |
169 | /* release the lock, but the readers can't use the fast-path */ | 149 | /* release the lock, but the readers can't use the fast-path */ |
170 | up_write(&brw->rw_sem); | 150 | up_write(&brw->rw_sem); |
171 | /* | 151 | /* |
172 | * Insert the barrier before the next fast-path in down_read, | 152 | * Enable the fast-path in percpu_down_read() and percpu_up_read() |
173 | * see W_R case in the comment above update_fast_ctr(). | 153 | * but only after another gp pass; this adds the necessary barrier |
154 | * to ensure the reader can't miss the changes done by us. | ||
174 | */ | 155 | */ |
175 | synchronize_sched_expedited(); | 156 | rcu_sync_exit(&brw->rss); |
176 | /* the last writer unblocks update_fast_ctr() */ | ||
177 | atomic_dec(&brw->write_ctr); | ||
178 | } | 157 | } |
158 | EXPORT_SYMBOL_GPL(percpu_up_write); | ||
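
The rewritten percpu-rwsem replaces the write_ctr plus synchronize_sched_expedited() scheme with an rcu_sync gate: readers test rcu_sync_is_idle() inside a preempt-disabled section, and writers toggle the gate with rcu_sync_enter()/rcu_sync_exit(), each of which implies a grace period that orders the writer against every fast-path reader. The reader-side shape, condensed from the hunk above into one illustrative helper (my_down_read_fast() is not a real kernel symbol):

    /* Returns true if the lock-free fast path was taken; the caller falls
     * back to brw->rw_sem (and slow_read_ctr) when it returns false. */
    static bool my_down_read_fast(struct percpu_rw_semaphore *brw)
    {
            bool fast;

            preempt_disable();
            fast = rcu_sync_is_idle(&brw->rss);          /* no writer in flight? */
            if (fast)
                    __this_cpu_add(*brw->fast_read_ctr, 1);
            preempt_enable();
            return fast;
    }

The writer simply brackets its down_write()/up_write() and the slow_read_ctr drain with rcu_sync_enter() and rcu_sync_exit(), exactly as the hunk shows.
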
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index f17a3e3b3550..fec082338668 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
@@ -86,7 +86,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) | |||
86 | /* | 86 | /* |
87 | * Put the reader into the wait queue | 87 | * Put the reader into the wait queue |
88 | */ | 88 | */ |
89 | arch_spin_lock(&lock->lock); | 89 | arch_spin_lock(&lock->wait_lock); |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * The ACQUIRE semantics of the following spinning code ensure | 92 | * The ACQUIRE semantics of the following spinning code ensure |
@@ -99,7 +99,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) | |||
99 | /* | 99 | /* |
100 | * Signal the next one in queue to become queue head | 100 | * Signal the next one in queue to become queue head |
101 | */ | 101 | */ |
102 | arch_spin_unlock(&lock->lock); | 102 | arch_spin_unlock(&lock->wait_lock); |
103 | } | 103 | } |
104 | EXPORT_SYMBOL(queued_read_lock_slowpath); | 104 | EXPORT_SYMBOL(queued_read_lock_slowpath); |
105 | 105 | ||
@@ -112,7 +112,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) | |||
112 | u32 cnts; | 112 | u32 cnts; |
113 | 113 | ||
114 | /* Put the writer into the wait queue */ | 114 | /* Put the writer into the wait queue */ |
115 | arch_spin_lock(&lock->lock); | 115 | arch_spin_lock(&lock->wait_lock); |
116 | 116 | ||
117 | /* Try to acquire the lock directly if no reader is present */ | 117 | /* Try to acquire the lock directly if no reader is present */ |
118 | if (!atomic_read(&lock->cnts) && | 118 | if (!atomic_read(&lock->cnts) && |
@@ -144,6 +144,6 @@ void queued_write_lock_slowpath(struct qrwlock *lock) | |||
144 | cpu_relax_lowlatency(); | 144 | cpu_relax_lowlatency(); |
145 | } | 145 | } |
146 | unlock: | 146 | unlock: |
147 | arch_spin_unlock(&lock->lock); | 147 | arch_spin_unlock(&lock->wait_lock); |
148 | } | 148 | } |
149 | EXPORT_SYMBOL(queued_write_lock_slowpath); | 149 | EXPORT_SYMBOL(queued_write_lock_slowpath); |
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index c8e6e9a596f5..f0450ff4829b 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h | |||
@@ -267,7 +267,6 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) | |||
267 | } | 267 | } |
268 | 268 | ||
269 | if (!lp) { /* ONCE */ | 269 | if (!lp) { /* ONCE */ |
270 | WRITE_ONCE(pn->state, vcpu_hashed); | ||
271 | lp = pv_hash(lock, pn); | 270 | lp = pv_hash(lock, pn); |
272 | 271 | ||
273 | /* | 272 | /* |
@@ -275,11 +274,9 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) | |||
275 | * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() | 274 | * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() |
276 | * we'll be sure to be able to observe our hash entry. | 275 | * we'll be sure to be able to observe our hash entry. |
277 | * | 276 | * |
278 | * [S] pn->state | ||
279 | * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL | 277 | * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL |
280 | * MB RMB | 278 | * MB RMB |
281 | * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> | 279 | * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> |
282 | * [L] pn->state | ||
283 | * | 280 | * |
284 | * Matches the smp_rmb() in __pv_queued_spin_unlock(). | 281 | * Matches the smp_rmb() in __pv_queued_spin_unlock(). |
285 | */ | 282 | */ |
@@ -364,8 +361,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) | |||
364 | * vCPU is harmless other than the additional latency in completing | 361 | * vCPU is harmless other than the additional latency in completing |
365 | * the unlock. | 362 | * the unlock. |
366 | */ | 363 | */ |
367 | if (READ_ONCE(node->state) == vcpu_hashed) | 364 | pv_kick(node->cpu); |
368 | pv_kick(node->cpu); | ||
369 | } | 365 | } |
370 | /* | 366 | /* |
371 | * Include the architecture specific callee-save thunk of the | 367 | * Include the architecture specific callee-save thunk of the |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 7781d801212f..8251e75dd9c0 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) | |||
74 | * set up. | 74 | * set up. |
75 | */ | 75 | */ |
76 | #ifndef CONFIG_DEBUG_RT_MUTEXES | 76 | #ifndef CONFIG_DEBUG_RT_MUTEXES |
77 | # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) | 77 | # define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c) |
78 | # define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c) | ||
79 | # define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c) | ||
80 | |||
81 | /* | ||
82 | * Callers must hold the ->wait_lock -- which is the whole purpose as we force | ||
83 | * all future threads that attempt to [Rmw] the lock to the slowpath. As such | ||
84 | * relaxed semantics suffice. | ||
85 | */ | ||
78 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | 86 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) |
79 | { | 87 | { |
80 | unsigned long owner, *p = (unsigned long *) &lock->owner; | 88 | unsigned long owner, *p = (unsigned long *) &lock->owner; |
81 | 89 | ||
82 | do { | 90 | do { |
83 | owner = *p; | 91 | owner = *p; |
84 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); | 92 | } while (cmpxchg_relaxed(p, owner, |
93 | owner | RT_MUTEX_HAS_WAITERS) != owner); | ||
85 | } | 94 | } |
86 | 95 | ||
87 | /* | 96 | /* |
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) | |||
121 | * lock(wait_lock); | 130 | * lock(wait_lock); |
122 | * acquire(lock); | 131 | * acquire(lock); |
123 | */ | 132 | */ |
124 | return rt_mutex_cmpxchg(lock, owner, NULL); | 133 | return rt_mutex_cmpxchg_release(lock, owner, NULL); |
125 | } | 134 | } |
126 | 135 | ||
127 | #else | 136 | #else |
128 | # define rt_mutex_cmpxchg(l,c,n) (0) | 137 | # define rt_mutex_cmpxchg_relaxed(l,c,n) (0) |
138 | # define rt_mutex_cmpxchg_acquire(l,c,n) (0) | ||
139 | # define rt_mutex_cmpxchg_release(l,c,n) (0) | ||
140 | |||
129 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | 141 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) |
130 | { | 142 | { |
131 | lock->owner = (struct task_struct *) | 143 | lock->owner = (struct task_struct *) |
@@ -158,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, | |||
158 | * then right waiter has a dl_prio() too. | 170 | * then right waiter has a dl_prio() too. |
159 | */ | 171 | */ |
160 | if (dl_prio(left->prio)) | 172 | if (dl_prio(left->prio)) |
161 | return (left->task->dl.deadline < right->task->dl.deadline); | 173 | return dl_time_before(left->task->dl.deadline, |
174 | right->task->dl.deadline); | ||
162 | 175 | ||
163 | return 0; | 176 | return 0; |
164 | } | 177 | } |
@@ -1321,7 +1334,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state, | |||
1321 | struct hrtimer_sleeper *timeout, | 1334 | struct hrtimer_sleeper *timeout, |
1322 | enum rtmutex_chainwalk chwalk)) | 1335 | enum rtmutex_chainwalk chwalk)) |
1323 | { | 1336 | { |
1324 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { | 1337 | if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { |
1325 | rt_mutex_deadlock_account_lock(lock, current); | 1338 | rt_mutex_deadlock_account_lock(lock, current); |
1326 | return 0; | 1339 | return 0; |
1327 | } else | 1340 | } else |
@@ -1337,7 +1350,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | |||
1337 | enum rtmutex_chainwalk chwalk)) | 1350 | enum rtmutex_chainwalk chwalk)) |
1338 | { | 1351 | { |
1339 | if (chwalk == RT_MUTEX_MIN_CHAINWALK && | 1352 | if (chwalk == RT_MUTEX_MIN_CHAINWALK && |
1340 | likely(rt_mutex_cmpxchg(lock, NULL, current))) { | 1353 | likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { |
1341 | rt_mutex_deadlock_account_lock(lock, current); | 1354 | rt_mutex_deadlock_account_lock(lock, current); |
1342 | return 0; | 1355 | return 0; |
1343 | } else | 1356 | } else |
@@ -1348,7 +1361,7 @@ static inline int | |||
1348 | rt_mutex_fasttrylock(struct rt_mutex *lock, | 1361 | rt_mutex_fasttrylock(struct rt_mutex *lock, |
1349 | int (*slowfn)(struct rt_mutex *lock)) | 1362 | int (*slowfn)(struct rt_mutex *lock)) |
1350 | { | 1363 | { |
1351 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { | 1364 | if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { |
1352 | rt_mutex_deadlock_account_lock(lock, current); | 1365 | rt_mutex_deadlock_account_lock(lock, current); |
1353 | return 1; | 1366 | return 1; |
1354 | } | 1367 | } |
@@ -1362,7 +1375,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, | |||
1362 | { | 1375 | { |
1363 | WAKE_Q(wake_q); | 1376 | WAKE_Q(wake_q); |
1364 | 1377 | ||
1365 | if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { | 1378 | if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { |
1366 | rt_mutex_deadlock_account_unlock(current); | 1379 | rt_mutex_deadlock_account_unlock(current); |
1367 | 1380 | ||
1368 | } else { | 1381 | } else { |
@@ -1484,7 +1497,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); | |||
1484 | bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, | 1497 | bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, |
1485 | struct wake_q_head *wqh) | 1498 | struct wake_q_head *wqh) |
1486 | { | 1499 | { |
1487 | if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { | 1500 | if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { |
1488 | rt_mutex_deadlock_account_unlock(current); | 1501 | rt_mutex_deadlock_account_unlock(current); |
1489 | return false; | 1502 | return false; |
1490 | } | 1503 | } |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 0f189714e457..a4d4de05b2d1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
262 | * to reduce unnecessary expensive cmpxchg() operations. | 262 | * to reduce unnecessary expensive cmpxchg() operations. |
263 | */ | 263 | */ |
264 | if (count == RWSEM_WAITING_BIAS && | 264 | if (count == RWSEM_WAITING_BIAS && |
265 | cmpxchg(&sem->count, RWSEM_WAITING_BIAS, | 265 | cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, |
266 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 266 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { |
267 | if (!list_is_singular(&sem->wait_list)) | 267 | if (!list_is_singular(&sem->wait_list)) |
268 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 268 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); |
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | |||
285 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 285 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) |
286 | return false; | 286 | return false; |
287 | 287 | ||
288 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | 288 | old = cmpxchg_acquire(&sem->count, count, |
289 | count + RWSEM_ACTIVE_WRITE_BIAS); | ||
289 | if (old == count) { | 290 | if (old == count) { |
290 | rwsem_set_owner(sem); | 291 | rwsem_set_owner(sem); |
291 | return true; | 292 | return true; |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 72b0c66628b6..7658d32c5c78 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -24,6 +24,16 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) | |||
24 | } | 24 | } |
25 | #endif | 25 | #endif |
26 | 26 | ||
27 | static void *try_ram_remap(resource_size_t offset, size_t size) | ||
28 | { | ||
29 | struct page *page = pfn_to_page(offset >> PAGE_SHIFT); | ||
30 | |||
31 | /* In the simple case just return the existing linear address */ | ||
32 | if (!PageHighMem(page)) | ||
33 | return __va(offset); | ||
34 | return NULL; /* fallback to ioremap_cache */ | ||
35 | } | ||
36 | |||
27 | /** | 37 | /** |
28 | * memremap() - remap an iomem_resource as cacheable memory | 38 | * memremap() - remap an iomem_resource as cacheable memory |
29 | * @offset: iomem resource start address | 39 | * @offset: iomem resource start address |
@@ -66,8 +76,8 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) | |||
66 | * the requested range is potentially in "System RAM" | 76 | * the requested range is potentially in "System RAM" |
67 | */ | 77 | */ |
68 | if (is_ram == REGION_INTERSECTS) | 78 | if (is_ram == REGION_INTERSECTS) |
69 | addr = __va(offset); | 79 | addr = try_ram_remap(offset, size); |
70 | else | 80 | if (!addr) |
71 | addr = ioremap_cache(offset, size); | 81 | addr = ioremap_cache(offset, size); |
72 | } | 82 | } |
73 | 83 | ||
@@ -114,9 +124,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, | |||
114 | { | 124 | { |
115 | void **ptr, *addr; | 125 | void **ptr, *addr; |
116 | 126 | ||
117 | ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); | 127 | ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, |
128 | dev_to_node(dev)); | ||
118 | if (!ptr) | 129 | if (!ptr) |
119 | return NULL; | 130 | return ERR_PTR(-ENOMEM); |
120 | 131 | ||
121 | addr = memremap(offset, size, flags); | 132 | addr = memremap(offset, size, flags); |
122 | if (addr) { | 133 | if (addr) { |
@@ -131,9 +142,8 @@ EXPORT_SYMBOL(devm_memremap); | |||
131 | 142 | ||
132 | void devm_memunmap(struct device *dev, void *addr) | 143 | void devm_memunmap(struct device *dev, void *addr) |
133 | { | 144 | { |
134 | WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, | 145 | WARN_ON(devres_release(dev, devm_memremap_release, |
135 | addr)); | 146 | devm_memremap_match, addr)); |
136 | memunmap(addr); | ||
137 | } | 147 | } |
138 | EXPORT_SYMBOL(devm_memunmap); | 148 | EXPORT_SYMBOL(devm_memunmap); |
139 | 149 | ||
@@ -166,8 +176,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) | |||
166 | if (is_ram == REGION_INTERSECTS) | 176 | if (is_ram == REGION_INTERSECTS) |
167 | return __va(res->start); | 177 | return __va(res->start); |
168 | 178 | ||
169 | page_map = devres_alloc(devm_memremap_pages_release, | 179 | page_map = devres_alloc_node(devm_memremap_pages_release, |
170 | sizeof(*page_map), GFP_KERNEL); | 180 | sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); |
171 | if (!page_map) | 181 | if (!page_map) |
172 | return ERR_PTR(-ENOMEM); | 182 | return ERR_PTR(-ENOMEM); |
173 | 183 | ||
@@ -175,7 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) | |||
175 | 185 | ||
176 | nid = dev_to_node(dev); | 186 | nid = dev_to_node(dev); |
177 | if (nid < 0) | 187 | if (nid < 0) |
178 | nid = 0; | 188 | nid = numa_mem_id(); |
179 | 189 | ||
180 | error = arch_add_memory(nid, res->start, resource_size(res), true); | 190 | error = arch_add_memory(nid, res->start, resource_size(res), true); |
181 | if (error) { | 191 | if (error) { |
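
The memremap() change first asks try_ram_remap() for an existing linear mapping and only falls back to ioremap_cache() when that returns NULL (for example, for highmem pages). A minimal sketch of that "try the cheap path first" shape in plain C; try_direct_map() and slow_remap() are made-up stand-ins, not the kernel API:

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-ins for try_ram_remap()/ioremap_cache(); names and
 * behaviour here are illustrative only. */
static void *try_direct_map(unsigned long long offset, size_t size)
{
	(void)offset;
	(void)size;
	return NULL;			/* pretend the cheap path is unavailable */
}

static void *slow_remap(unsigned long long offset, size_t size)
{
	static char backing[4096];	/* fake mapping for the sketch */

	(void)offset;
	return size <= sizeof(backing) ? backing : NULL;
}

static void *remap(unsigned long long offset, size_t size)
{
	void *addr = try_direct_map(offset, size);	/* fast path first */

	if (!addr)
		addr = slow_remap(offset, size);	/* fall back */
	return addr;
}

int main(void)
{
	printf("mapped at %p\n", remap(0x1000, 64));
	return 0;
}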
diff --git a/kernel/module.c b/kernel/module.c index b86b7bf1be38..8f051a106676 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -1063,11 +1063,15 @@ void symbol_put_addr(void *addr) | |||
1063 | if (core_kernel_text(a)) | 1063 | if (core_kernel_text(a)) |
1064 | return; | 1064 | return; |
1065 | 1065 | ||
1066 | /* module_text_address is safe here: we're supposed to have reference | 1066 | /* |
1067 | * to module from symbol_get, so it can't go away. */ | 1067 | * Even though we hold a reference on the module; we still need to |
1068 | * disable preemption in order to safely traverse the data structure. | ||
1069 | */ | ||
1070 | preempt_disable(); | ||
1068 | modaddr = __module_text_address(a); | 1071 | modaddr = __module_text_address(a); |
1069 | BUG_ON(!modaddr); | 1072 | BUG_ON(!modaddr); |
1070 | module_put(modaddr); | 1073 | module_put(modaddr); |
1074 | preempt_enable(); | ||
1071 | } | 1075 | } |
1072 | EXPORT_SYMBOL_GPL(symbol_put_addr); | 1076 | EXPORT_SYMBOL_GPL(symbol_put_addr); |
1073 | 1077 | ||
diff --git a/kernel/module_signing.c b/kernel/module_signing.c index bd62f5cda746..6528a79d998d 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c | |||
@@ -10,6 +10,7 @@ | |||
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
13 | #include <linux/errno.h> | ||
13 | #include <keys/system_keyring.h> | 14 | #include <keys/system_keyring.h> |
14 | #include <crypto/public_key.h> | 15 | #include <crypto/public_key.h> |
15 | #include "module-internal.h" | 16 | #include "module-internal.h" |
diff --git a/kernel/panic.c b/kernel/panic.c index 04e91ff7560b..4579dbb7ed87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/sysrq.h> | 23 | #include <linux/sysrq.h> |
24 | #include <linux/init.h> | 24 | #include <linux/init.h> |
25 | #include <linux/nmi.h> | 25 | #include <linux/nmi.h> |
26 | #include <linux/console.h> | ||
26 | 27 | ||
27 | #define PANIC_TIMER_STEP 100 | 28 | #define PANIC_TIMER_STEP 100 |
28 | #define PANIC_BLINK_SPD 18 | 29 | #define PANIC_BLINK_SPD 18 |
@@ -147,6 +148,15 @@ void panic(const char *fmt, ...) | |||
147 | 148 | ||
148 | bust_spinlocks(0); | 149 | bust_spinlocks(0); |
149 | 150 | ||
151 | /* | ||
152 | * We may have ended up stopping the CPU holding the lock (in | ||
153 | * smp_send_stop()) while still having some valuable data in the console | ||
154 | * buffer. Try to acquire the lock then release it regardless of the | ||
155 | * result. The release will also print the buffers out. | ||
156 | */ | ||
157 | console_trylock(); | ||
158 | console_unlock(); | ||
159 | |||
150 | if (!panic_blink) | 160 | if (!panic_blink) |
151 | panic_blink = no_blink; | 161 | panic_blink = no_blink; |
152 | 162 | ||
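
The panic() hunk takes the console lock with console_trylock() and immediately drops it, because the unlock path is what pushes any still-buffered records out to the consoles even if the CPU that previously held the lock has been stopped. A loose pthread analogy of "trylock, then let the unlock path flush" with a toy buffer; this is not the printk code:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;
static char log_buf[256];

static void log_unlock_and_flush(void)
{
	/* In this analogy, releasing the lock is what prints the buffer,
	 * just as console_unlock() prints the pending records. */
	if (log_buf[0]) {
		fputs(log_buf, stdout);
		log_buf[0] = '\0';
	}
	pthread_mutex_unlock(&log_lock);
}

static void emergency_flush(void)
{
	/* Best effort: if the lock is free, take it and let the unlock path
	 * push out whatever is buffered.  If it is contended, skip the
	 * flush, mirroring console_trylock() failing. */
	if (pthread_mutex_trylock(&log_lock) == 0)
		log_unlock_and_flush();
}

int main(void)
{
	snprintf(log_buf, sizeof(log_buf), "buffered panic message\n");
	emergency_flush();
	return 0;
}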
diff --git a/kernel/params.c b/kernel/params.c index b6554aa71094..a6d6149c0fe6 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -223,7 +223,7 @@ char *parse_args(const char *doing, | |||
223 | int (*unknown)(char *param, char *val, | 223 | int (*unknown)(char *param, char *val, |
224 | const char *doing, void *arg)) | 224 | const char *doing, void *arg)) |
225 | { | 225 | { |
226 | char *param, *val; | 226 | char *param, *val, *err = NULL; |
227 | 227 | ||
228 | /* Chew leading spaces */ | 228 | /* Chew leading spaces */ |
229 | args = skip_spaces(args); | 229 | args = skip_spaces(args); |
@@ -238,7 +238,7 @@ char *parse_args(const char *doing, | |||
238 | args = next_arg(args, ¶m, &val); | 238 | args = next_arg(args, ¶m, &val); |
239 | /* Stop at -- */ | 239 | /* Stop at -- */ |
240 | if (!val && strcmp(param, "--") == 0) | 240 | if (!val && strcmp(param, "--") == 0) |
241 | return args; | 241 | return err ?: args; |
242 | irq_was_disabled = irqs_disabled(); | 242 | irq_was_disabled = irqs_disabled(); |
243 | ret = parse_one(param, val, doing, params, num, | 243 | ret = parse_one(param, val, doing, params, num, |
244 | min_level, max_level, arg, unknown); | 244 | min_level, max_level, arg, unknown); |
@@ -247,24 +247,25 @@ char *parse_args(const char *doing, | |||
247 | doing, param); | 247 | doing, param); |
248 | 248 | ||
249 | switch (ret) { | 249 | switch (ret) { |
250 | case 0: | ||
251 | continue; | ||
250 | case -ENOENT: | 252 | case -ENOENT: |
251 | pr_err("%s: Unknown parameter `%s'\n", doing, param); | 253 | pr_err("%s: Unknown parameter `%s'\n", doing, param); |
252 | return ERR_PTR(ret); | 254 | break; |
253 | case -ENOSPC: | 255 | case -ENOSPC: |
254 | pr_err("%s: `%s' too large for parameter `%s'\n", | 256 | pr_err("%s: `%s' too large for parameter `%s'\n", |
255 | doing, val ?: "", param); | 257 | doing, val ?: "", param); |
256 | return ERR_PTR(ret); | ||
257 | case 0: | ||
258 | break; | 258 | break; |
259 | default: | 259 | default: |
260 | pr_err("%s: `%s' invalid for parameter `%s'\n", | 260 | pr_err("%s: `%s' invalid for parameter `%s'\n", |
261 | doing, val ?: "", param); | 261 | doing, val ?: "", param); |
262 | return ERR_PTR(ret); | 262 | break; |
263 | } | 263 | } |
264 | |||
265 | err = ERR_PTR(ret); | ||
264 | } | 266 | } |
265 | 267 | ||
266 | /* All parsed OK. */ | 268 | return err; |
267 | return NULL; | ||
268 | } | 269 | } |
269 | 270 | ||
270 | /* Lazy bastard, eh? */ | 271 | /* Lazy bastard, eh? */ |
@@ -325,10 +326,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) | |||
325 | } | 326 | } |
326 | EXPORT_SYMBOL(param_get_charp); | 327 | EXPORT_SYMBOL(param_get_charp); |
327 | 328 | ||
328 | static void param_free_charp(void *arg) | 329 | void param_free_charp(void *arg) |
329 | { | 330 | { |
330 | maybe_kfree_parameter(*((char **)arg)); | 331 | maybe_kfree_parameter(*((char **)arg)); |
331 | } | 332 | } |
333 | EXPORT_SYMBOL(param_free_charp); | ||
332 | 334 | ||
333 | const struct kernel_param_ops param_ops_charp = { | 335 | const struct kernel_param_ops param_ops_charp = { |
334 | .set = param_set_charp, | 336 | .set = param_set_charp, |
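
With this change parse_args() no longer bails out on the first bad parameter: it logs each problem, remembers the error, keeps parsing the rest, and only returns an error (or the "--" remainder) once the whole command line has been walked. A small sketch of that control flow, using a hypothetical parse_one()-style helper:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical per-parameter handler standing in for parse_one(). */
static int parse_one_kv(const char *param)
{
	if (!strchr(param, '='))
		return -EINVAL;		/* malformed entry */
	return 0;
}

/* Parse every entry, remember the error, and report it only after the whole
 * list has been walked -- the shape of the parse_args() change above. */
static int parse_all(const char *params[], int n)
{
	int err = 0;
	int i;

	for (i = 0; i < n; i++) {
		int ret = parse_one_kv(params[i]);

		if (ret) {
			fprintf(stderr, "bad parameter '%s'\n", params[i]);
			err = ret;	/* remember the error, keep parsing */
		}
	}
	return err;
}

int main(void)
{
	const char *argv_like[] = { "foo=1", "broken", "bar=2" };

	return parse_all(argv_like, 3) ? 1 : 0;
}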
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 690f78f210f2..b7342a24f559 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -733,7 +733,7 @@ int hibernate(void) | |||
733 | * contents of memory is restored from the saved image. | 733 | * contents of memory is restored from the saved image. |
734 | * | 734 | * |
735 | * If this is successful, control reappears in the restored target kernel in | 735 | * If this is successful, control reappears in the restored target kernel in |
736 | * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine | 736 | * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine |
737 | * attempts to recover gracefully and make the kernel return to the normal mode | 737 | * attempts to recover gracefully and make the kernel return to the normal mode |
738 | * of operation. | 738 | * of operation. |
739 | */ | 739 | */ |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 63d395b5df93..b2dd4d999900 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -272,6 +272,22 @@ static inline void pm_print_times_init(void) | |||
272 | { | 272 | { |
273 | pm_print_times_enabled = !!initcall_debug; | 273 | pm_print_times_enabled = !!initcall_debug; |
274 | } | 274 | } |
275 | |||
276 | static ssize_t pm_wakeup_irq_show(struct kobject *kobj, | ||
277 | struct kobj_attribute *attr, | ||
278 | char *buf) | ||
279 | { | ||
280 | return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; | ||
281 | } | ||
282 | |||
283 | static ssize_t pm_wakeup_irq_store(struct kobject *kobj, | ||
284 | struct kobj_attribute *attr, | ||
285 | const char *buf, size_t n) | ||
286 | { | ||
287 | return -EINVAL; | ||
288 | } | ||
289 | power_attr(pm_wakeup_irq); | ||
290 | |||
275 | #else /* !CONFIG_PM_SLEEP_DEBUG */ | 291 | #else /* !CONFIG_PM_SLEEP_DEBUG */ |
276 | static inline void pm_print_times_init(void) {} | 292 | static inline void pm_print_times_init(void) {} |
277 | #endif /* CONFIG_PM_SLEEP_DEBUG */ | 293 | #endif /* CONFIG_PM_SLEEP_DEBUG */ |
@@ -604,6 +620,7 @@ static struct attribute * g[] = { | |||
604 | #endif | 620 | #endif |
605 | #ifdef CONFIG_PM_SLEEP_DEBUG | 621 | #ifdef CONFIG_PM_SLEEP_DEBUG |
606 | &pm_print_times_attr.attr, | 622 | &pm_print_times_attr.attr, |
623 | &pm_wakeup_irq_attr.attr, | ||
607 | #endif | 624 | #endif |
608 | #endif | 625 | #endif |
609 | #ifdef CONFIG_FREEZER | 626 | #ifdef CONFIG_FREEZER |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4e1e2f..3a970604308f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) | |||
1779 | while (to_alloc-- > 0) { | 1779 | while (to_alloc-- > 0) { |
1780 | struct page *page; | 1780 | struct page *page; |
1781 | 1781 | ||
1782 | page = alloc_image_page(__GFP_HIGHMEM); | 1782 | page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); |
1783 | memory_bm_set_bit(bm, page_to_pfn(page)); | 1783 | memory_bm_set_bit(bm, page_to_pfn(page)); |
1784 | } | 1784 | } |
1785 | return nr_highmem; | 1785 | return nr_highmem; |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7e4cda4a8dd9..f9fe133c13e2 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -35,6 +35,9 @@ | |||
35 | const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; | 35 | const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; |
36 | const char *pm_states[PM_SUSPEND_MAX]; | 36 | const char *pm_states[PM_SUSPEND_MAX]; |
37 | 37 | ||
38 | unsigned int pm_suspend_global_flags; | ||
39 | EXPORT_SYMBOL_GPL(pm_suspend_global_flags); | ||
40 | |||
38 | static const struct platform_suspend_ops *suspend_ops; | 41 | static const struct platform_suspend_ops *suspend_ops; |
39 | static const struct platform_freeze_ops *freeze_ops; | 42 | static const struct platform_freeze_ops *freeze_ops; |
40 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); | 43 | static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); |
@@ -493,6 +496,7 @@ static int enter_state(suspend_state_t state) | |||
493 | #endif | 496 | #endif |
494 | 497 | ||
495 | pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); | 498 | pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); |
499 | pm_suspend_clear_flags(); | ||
496 | error = suspend_prepare(state); | 500 | error = suspend_prepare(state); |
497 | if (error) | 501 | if (error) |
498 | goto Unlock; | 502 | goto Unlock; |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b2066fb5b10f..12cd989dadf6 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, | |||
257 | struct bio *bio; | 257 | struct bio *bio; |
258 | int error = 0; | 258 | int error = 0; |
259 | 259 | ||
260 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | 260 | bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); |
261 | bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); | 261 | bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); |
262 | bio->bi_bdev = hib_resume_bdev; | 262 | bio->bi_bdev = hib_resume_bdev; |
263 | 263 | ||
@@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) | |||
356 | return -ENOSPC; | 356 | return -ENOSPC; |
357 | 357 | ||
358 | if (hb) { | 358 | if (hb) { |
359 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | | 359 | src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | |
360 | __GFP_NORETRY); | 360 | __GFP_NORETRY); |
361 | if (src) { | 361 | if (src) { |
362 | copy_page(src, buf); | 362 | copy_page(src, buf); |
@@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) | |||
364 | ret = hib_wait_io(hb); /* Free pages */ | 364 | ret = hib_wait_io(hb); /* Free pages */ |
365 | if (ret) | 365 | if (ret) |
366 | return ret; | 366 | return ret; |
367 | src = (void *)__get_free_page(__GFP_WAIT | | 367 | src = (void *)__get_free_page(__GFP_RECLAIM | |
368 | __GFP_NOWARN | | 368 | __GFP_NOWARN | |
369 | __GFP_NORETRY); | 369 | __GFP_NORETRY); |
370 | if (src) { | 370 | if (src) { |
@@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
672 | nr_threads = num_online_cpus() - 1; | 672 | nr_threads = num_online_cpus() - 1; |
673 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | 673 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); |
674 | 674 | ||
675 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 675 | page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); |
676 | if (!page) { | 676 | if (!page) { |
677 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 677 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
678 | ret = -ENOMEM; | 678 | ret = -ENOMEM; |
@@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
975 | last = tmp; | 975 | last = tmp; |
976 | 976 | ||
977 | tmp->map = (struct swap_map_page *) | 977 | tmp->map = (struct swap_map_page *) |
978 | __get_free_page(__GFP_WAIT | __GFP_HIGH); | 978 | __get_free_page(__GFP_RECLAIM | __GFP_HIGH); |
979 | if (!tmp->map) { | 979 | if (!tmp->map) { |
980 | release_swap_reader(handle); | 980 | release_swap_reader(handle); |
981 | return -ENOMEM; | 981 | return -ENOMEM; |
@@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
1242 | 1242 | ||
1243 | for (i = 0; i < read_pages; i++) { | 1243 | for (i = 0; i < read_pages; i++) { |
1244 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | 1244 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? |
1245 | __GFP_WAIT | __GFP_HIGH : | 1245 | __GFP_RECLAIM | __GFP_HIGH : |
1246 | __GFP_WAIT | __GFP_NOWARN | | 1246 | __GFP_RECLAIM | __GFP_NOWARN | |
1247 | __GFP_NORETRY); | 1247 | __GFP_NORETRY); |
1248 | 1248 | ||
1249 | if (!page[i]) { | 1249 | if (!page[i]) { |
1250 | if (i < LZO_CMP_PAGES) { | 1250 | if (i < LZO_CMP_PAGES) { |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8f0324ef72ab..2ce8826f1053 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -269,6 +269,9 @@ static u32 clear_idx; | |||
269 | #define PREFIX_MAX 32 | 269 | #define PREFIX_MAX 32 |
270 | #define LOG_LINE_MAX (1024 - PREFIX_MAX) | 270 | #define LOG_LINE_MAX (1024 - PREFIX_MAX) |
271 | 271 | ||
272 | #define LOG_LEVEL(v) ((v) & 0x07) | ||
273 | #define LOG_FACILITY(v) ((v) >> 3 & 0xff) | ||
274 | |||
272 | /* record buffer */ | 275 | /* record buffer */ |
273 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) | 276 | #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) |
274 | #define LOG_ALIGN 4 | 277 | #define LOG_ALIGN 4 |
@@ -517,6 +520,7 @@ int check_syslog_permissions(int type, int source) | |||
517 | ok: | 520 | ok: |
518 | return security_syslog(type); | 521 | return security_syslog(type); |
519 | } | 522 | } |
523 | EXPORT_SYMBOL_GPL(check_syslog_permissions); | ||
520 | 524 | ||
521 | static void append_char(char **pp, char *e, char c) | 525 | static void append_char(char **pp, char *e, char c) |
522 | { | 526 | { |
@@ -611,7 +615,6 @@ struct devkmsg_user { | |||
611 | static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) | 615 | static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) |
612 | { | 616 | { |
613 | char *buf, *line; | 617 | char *buf, *line; |
614 | int i; | ||
615 | int level = default_message_loglevel; | 618 | int level = default_message_loglevel; |
616 | int facility = 1; /* LOG_USER */ | 619 | int facility = 1; /* LOG_USER */ |
617 | size_t len = iov_iter_count(from); | 620 | size_t len = iov_iter_count(from); |
@@ -641,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) | |||
641 | line = buf; | 644 | line = buf; |
642 | if (line[0] == '<') { | 645 | if (line[0] == '<') { |
643 | char *endp = NULL; | 646 | char *endp = NULL; |
647 | unsigned int u; | ||
644 | 648 | ||
645 | i = simple_strtoul(line+1, &endp, 10); | 649 | u = simple_strtoul(line + 1, &endp, 10); |
646 | if (endp && endp[0] == '>') { | 650 | if (endp && endp[0] == '>') { |
647 | level = i & 7; | 651 | level = LOG_LEVEL(u); |
648 | if (i >> 3) | 652 | if (LOG_FACILITY(u) != 0) |
649 | facility = i >> 3; | 653 | facility = LOG_FACILITY(u); |
650 | endp++; | 654 | endp++; |
651 | len -= endp - line; | 655 | len -= endp - line; |
652 | line = endp; | 656 | line = endp; |
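
devkmsg_write() now decodes the "<N>" syslog prefix through the new LOG_LEVEL()/LOG_FACILITY() helpers: the low three bits carry the level and the remaining bits the facility, with the facility left at LOG_USER when it decodes to zero. The same decoding as a standalone program; the default values below are illustrative:

#include <stdio.h>
#include <stdlib.h>

#define LOG_LEVEL(v)	((v) & 0x07)		/* low 3 bits */
#define LOG_FACILITY(v)	((v) >> 3 & 0xff)	/* next 8 bits */

/* Decode a "<prio>" prefix the way devkmsg_write() does: prio = facility*8 + level. */
static void decode_prefix(const char *line)
{
	unsigned int level = 6;			/* assumed default: info */
	unsigned int facility = 1;		/* default: LOG_USER */

	if (line[0] == '<') {
		char *endp = NULL;
		unsigned int u = strtoul(line + 1, &endp, 10);

		if (endp && endp[0] == '>') {
			level = LOG_LEVEL(u);
			if (LOG_FACILITY(u) != 0)
				facility = LOG_FACILITY(u);
			line = endp + 1;
		}
	}
	printf("facility=%u level=%u text=\"%s\"\n", facility, level, line);
}

int main(void)
{
	decode_prefix("<30>dhclient: renewing lease");	/* facility 3, level 6 */
	decode_prefix("plain message");
	return 0;
}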
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 787320de68e0..b760bae64cf1 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -1016,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request, | |||
1016 | break; | 1016 | break; |
1017 | } | 1017 | } |
1018 | #endif | 1018 | #endif |
1019 | |||
1020 | case PTRACE_SECCOMP_GET_FILTER: | ||
1021 | ret = seccomp_get_filter(child, addr, datavp); | ||
1022 | break; | ||
1023 | |||
1019 | default: | 1024 | default: |
1020 | break; | 1025 | break; |
1021 | } | 1026 | } |
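
The new PTRACE_SECCOMP_GET_FILTER request lets a suitably privileged tracer dump a stopped tracee's classic-BPF seccomp program. Below is a hedged userspace sketch of how a tracer might call it, following the "NULL buffer returns the instruction count" convention described in ptrace(2); the fallback request value, the assumption that the tracee is already attached and stopped, and the minimal error handling are all assumptions, not a reference implementation:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <linux/filter.h>

#ifndef PTRACE_SECCOMP_GET_FILTER
#define PTRACE_SECCOMP_GET_FILTER 0x420c	/* assumed value if the header lacks it */
#endif

static int dump_seccomp_filter(pid_t pid)
{
	long insns, i;
	struct sock_filter *prog;

	/* First call with a NULL buffer: returns the instruction count. */
	insns = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL);
	if (insns <= 0)
		return -1;

	prog = calloc(insns, sizeof(*prog));
	if (!prog)
		return -1;

	/* Second call copies the BPF program into the buffer. */
	if (ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, prog) < 0) {
		free(prog);
		return -1;
	}

	for (i = 0; i < insns; i++)
		printf("%ld: code=0x%04x jt=%u jf=%u k=0x%08x\n",
		       i, prog[i].code, prog[i].jt, prog[i].jf, prog[i].k);
	free(prog);
	return 0;
}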
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 50a808424b06..61a16569ffbf 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile | |||
@@ -1,4 +1,4 @@ | |||
1 | obj-y += update.o | 1 | obj-y += update.o sync.o |
2 | obj-$(CONFIG_SRCU) += srcu.o | 2 | obj-$(CONFIG_SRCU) += srcu.o |
3 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 3 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
4 | obj-$(CONFIG_TREE_RCU) += tree.o | 4 | obj-$(CONFIG_TREE_RCU) += tree.o |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 77192953dee5..d89328e260df 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
@@ -252,7 +252,7 @@ struct rcu_torture_ops { | |||
252 | void (*exp_sync)(void); | 252 | void (*exp_sync)(void); |
253 | unsigned long (*get_state)(void); | 253 | unsigned long (*get_state)(void); |
254 | void (*cond_sync)(unsigned long oldstate); | 254 | void (*cond_sync)(unsigned long oldstate); |
255 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 255 | call_rcu_func_t call; |
256 | void (*cb_barrier)(void); | 256 | void (*cb_barrier)(void); |
257 | void (*fqs)(void); | 257 | void (*fqs)(void); |
258 | void (*stats)(void); | 258 | void (*stats)(void); |
@@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void) | |||
448 | } | 448 | } |
449 | 449 | ||
450 | static void | 450 | static void |
451 | call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 451 | call_rcu_busted(struct rcu_head *head, rcu_callback_t func) |
452 | { | 452 | { |
453 | /* This is a deliberate bug for testing purposes only! */ | 453 | /* This is a deliberate bug for testing purposes only! */ |
454 | func(head); | 454 | func(head); |
@@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void) | |||
523 | } | 523 | } |
524 | 524 | ||
525 | static void srcu_torture_call(struct rcu_head *head, | 525 | static void srcu_torture_call(struct rcu_head *head, |
526 | void (*func)(struct rcu_head *head)) | 526 | rcu_callback_t func) |
527 | { | 527 | { |
528 | call_srcu(srcu_ctlp, head, func); | 528 | call_srcu(srcu_ctlp, head, func); |
529 | } | 529 | } |
@@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void) | |||
695 | 695 | ||
696 | #define RCUTORTURE_TASKS_OPS | 696 | #define RCUTORTURE_TASKS_OPS |
697 | 697 | ||
698 | static bool torturing_tasks(void) | 698 | static bool __maybe_unused torturing_tasks(void) |
699 | { | 699 | { |
700 | return false; | 700 | return false; |
701 | } | 701 | } |
@@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg) | |||
768 | } | 768 | } |
769 | call_rcu_time = jiffies; | 769 | call_rcu_time = jiffies; |
770 | } | 770 | } |
771 | cond_resched_rcu_qs(); | ||
772 | stutter_wait("rcu_torture_boost"); | 771 | stutter_wait("rcu_torture_boost"); |
773 | if (torture_must_stop()) | 772 | if (torture_must_stop()) |
774 | goto checkwait; | 773 | goto checkwait; |
@@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg) | |||
1208 | __this_cpu_inc(rcu_torture_batch[completed]); | 1207 | __this_cpu_inc(rcu_torture_batch[completed]); |
1209 | preempt_enable(); | 1208 | preempt_enable(); |
1210 | cur_ops->readunlock(idx); | 1209 | cur_ops->readunlock(idx); |
1211 | cond_resched_rcu_qs(); | ||
1212 | stutter_wait("rcu_torture_reader"); | 1210 | stutter_wait("rcu_torture_reader"); |
1213 | } while (!torture_must_stop()); | 1211 | } while (!torture_must_stop()); |
1214 | if (irqreader && cur_ops->irq_capable) { | 1212 | if (irqreader && cur_ops->irq_capable) { |
@@ -1742,15 +1740,15 @@ rcu_torture_init(void) | |||
1742 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) | 1740 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) |
1743 | pr_alert(" %s", torture_ops[i]->name); | 1741 | pr_alert(" %s", torture_ops[i]->name); |
1744 | pr_alert("\n"); | 1742 | pr_alert("\n"); |
1745 | torture_init_end(); | 1743 | firsterr = -EINVAL; |
1746 | return -EINVAL; | 1744 | goto unwind; |
1747 | } | 1745 | } |
1748 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | 1746 | if (cur_ops->fqs == NULL && fqs_duration != 0) { |
1749 | pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); | 1747 | pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); |
1750 | fqs_duration = 0; | 1748 | fqs_duration = 0; |
1751 | } | 1749 | } |
1752 | if (cur_ops->init) | 1750 | if (cur_ops->init) |
1753 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | 1751 | cur_ops->init(); |
1754 | 1752 | ||
1755 | if (nreaders >= 0) { | 1753 | if (nreaders >= 0) { |
1756 | nrealreaders = nreaders; | 1754 | nrealreaders = nreaders; |
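
Several of the RCU hunks replace spelled-out callback prototypes with the rcu_callback_t and call_rcu_func_t typedefs, so a torture-ops field becomes "call_rcu_func_t call;" instead of a full function-pointer declaration. A toy example of the same cleanup; the struct rcu_head and fake_call_rcu() here are stand-ins, not kernel code:

#include <stdio.h>

struct rcu_head {
	struct rcu_head *next;
};

typedef void (*rcu_callback_t)(struct rcu_head *head);
typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);

/* Before: void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
 * After:  call_rcu_func_t call;  -- same type, far less noise. */
static void free_cb(struct rcu_head *head)
{
	printf("callback for %p\n", (void *)head);
}

static void fake_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	func(head);		/* invoke immediately in this toy model */
}

int main(void)
{
	struct rcu_head h = { .next = NULL };
	call_rcu_func_t call = fake_call_rcu;

	call(&h, free_cb);
	return 0;
}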
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index d3fcb2ec8536..a63a1ea5a41b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
@@ -298,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
298 | int idx; | 298 | int idx; |
299 | 299 | ||
300 | idx = READ_ONCE(sp->completed) & 0x1; | 300 | idx = READ_ONCE(sp->completed) & 0x1; |
301 | preempt_disable(); | ||
302 | __this_cpu_inc(sp->per_cpu_ref->c[idx]); | 301 | __this_cpu_inc(sp->per_cpu_ref->c[idx]); |
303 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | 302 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
304 | __this_cpu_inc(sp->per_cpu_ref->seq[idx]); | 303 | __this_cpu_inc(sp->per_cpu_ref->seq[idx]); |
305 | preempt_enable(); | ||
306 | return idx; | 304 | return idx; |
307 | } | 305 | } |
308 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | 306 | EXPORT_SYMBOL_GPL(__srcu_read_lock); |
@@ -387,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp) | |||
387 | * srcu_struct structure. | 385 | * srcu_struct structure. |
388 | */ | 386 | */ |
389 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | 387 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, |
390 | void (*func)(struct rcu_head *head)) | 388 | rcu_callback_t func) |
391 | { | 389 | { |
392 | unsigned long flags; | 390 | unsigned long flags; |
393 | 391 | ||
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c new file mode 100644 index 000000000000..be922c9f3d37 --- /dev/null +++ b/kernel/rcu/sync.c | |||
@@ -0,0 +1,223 @@ | |||
1 | /* | ||
2 | * RCU-based infrastructure for lightweight reader-writer locking | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, you can access it online at | ||
16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
17 | * | ||
18 | * Copyright (c) 2015, Red Hat, Inc. | ||
19 | * | ||
20 | * Author: Oleg Nesterov <oleg@redhat.com> | ||
21 | */ | ||
22 | |||
23 | #include <linux/rcu_sync.h> | ||
24 | #include <linux/sched.h> | ||
25 | |||
26 | #ifdef CONFIG_PROVE_RCU | ||
27 | #define __INIT_HELD(func) .held = func, | ||
28 | #else | ||
29 | #define __INIT_HELD(func) | ||
30 | #endif | ||
31 | |||
32 | static const struct { | ||
33 | void (*sync)(void); | ||
34 | void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); | ||
35 | void (*wait)(void); | ||
36 | #ifdef CONFIG_PROVE_RCU | ||
37 | int (*held)(void); | ||
38 | #endif | ||
39 | } gp_ops[] = { | ||
40 | [RCU_SYNC] = { | ||
41 | .sync = synchronize_rcu, | ||
42 | .call = call_rcu, | ||
43 | .wait = rcu_barrier, | ||
44 | __INIT_HELD(rcu_read_lock_held) | ||
45 | }, | ||
46 | [RCU_SCHED_SYNC] = { | ||
47 | .sync = synchronize_sched, | ||
48 | .call = call_rcu_sched, | ||
49 | .wait = rcu_barrier_sched, | ||
50 | __INIT_HELD(rcu_read_lock_sched_held) | ||
51 | }, | ||
52 | [RCU_BH_SYNC] = { | ||
53 | .sync = synchronize_rcu_bh, | ||
54 | .call = call_rcu_bh, | ||
55 | .wait = rcu_barrier_bh, | ||
56 | __INIT_HELD(rcu_read_lock_bh_held) | ||
57 | }, | ||
58 | }; | ||
59 | |||
60 | enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; | ||
61 | enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; | ||
62 | |||
63 | #define rss_lock gp_wait.lock | ||
64 | |||
65 | #ifdef CONFIG_PROVE_RCU | ||
66 | void rcu_sync_lockdep_assert(struct rcu_sync *rsp) | ||
67 | { | ||
68 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), | ||
69 | "suspicious rcu_sync_is_idle() usage"); | ||
70 | } | ||
71 | #endif | ||
72 | |||
73 | /** | ||
74 | * rcu_sync_init() - Initialize an rcu_sync structure | ||
75 | * @rsp: Pointer to rcu_sync structure to be initialized | ||
76 | * @type: Flavor of RCU with which to synchronize rcu_sync structure | ||
77 | */ | ||
78 | void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) | ||
79 | { | ||
80 | memset(rsp, 0, sizeof(*rsp)); | ||
81 | init_waitqueue_head(&rsp->gp_wait); | ||
82 | rsp->gp_type = type; | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * rcu_sync_enter() - Force readers onto slowpath | ||
87 | * @rsp: Pointer to rcu_sync structure to use for synchronization | ||
88 | * | ||
89 | * This function is used by updaters who need readers to make use of | ||
90 | * a slowpath during the update. After this function returns, all | ||
91 | * subsequent calls to rcu_sync_is_idle() will return false, which | ||
92 | * tells readers to stay off their fastpaths. A later call to | ||
93 | * rcu_sync_exit() re-enables reader slowpaths. | ||
94 | * | ||
95 | * When called in isolation, rcu_sync_enter() must wait for a grace | ||
96 | * period; however, closely spaced calls to rcu_sync_enter() can | ||
97 | * optimize away the grace-period wait via a state machine implemented | ||
98 | * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). | ||
99 | */ | ||
100 | void rcu_sync_enter(struct rcu_sync *rsp) | ||
101 | { | ||
102 | bool need_wait, need_sync; | ||
103 | |||
104 | spin_lock_irq(&rsp->rss_lock); | ||
105 | need_wait = rsp->gp_count++; | ||
106 | need_sync = rsp->gp_state == GP_IDLE; | ||
107 | if (need_sync) | ||
108 | rsp->gp_state = GP_PENDING; | ||
109 | spin_unlock_irq(&rsp->rss_lock); | ||
110 | |||
111 | BUG_ON(need_wait && need_sync); | ||
112 | |||
113 | if (need_sync) { | ||
114 | gp_ops[rsp->gp_type].sync(); | ||
115 | rsp->gp_state = GP_PASSED; | ||
116 | wake_up_all(&rsp->gp_wait); | ||
117 | } else if (need_wait) { | ||
118 | wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); | ||
119 | } else { | ||
120 | /* | ||
121 | * Possible when there's a pending CB from a rcu_sync_exit(). | ||
122 | * Nobody has yet been allowed the 'fast' path and thus we can | ||
123 | * avoid doing any sync(). The callback will get 'dropped'. | ||
124 | */ | ||
125 | BUG_ON(rsp->gp_state != GP_PASSED); | ||
126 | } | ||
127 | } | ||
128 | |||
129 | /** | ||
130 | * rcu_sync_func() - Callback function managing reader access to fastpath | ||
131 | * @rsp: Pointer to rcu_sync structure to use for synchronization | ||
132 | * | ||
133 | * This function is passed to one of the call_rcu() functions by | ||
134 | * rcu_sync_exit(), so that it is invoked after a grace period following | ||
135 | * that invocation of rcu_sync_exit(). It takes action based on events that | ||
136 | * have taken place in the meantime, so that closely spaced rcu_sync_enter() | ||
137 | * and rcu_sync_exit() pairs need not wait for a grace period. | ||
138 | * | ||
139 | * If another rcu_sync_enter() is invoked before the grace period | ||
140 | * ended, reset state to allow the next rcu_sync_exit() to let the | ||
141 | * readers back onto their fastpaths (after a grace period). If both | ||
142 | * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked | ||
143 | * before the grace period ended, re-invoke call_rcu() on behalf of that | ||
144 | * rcu_sync_exit(). Otherwise, set all state back to idle so that readers | ||
145 | * can again use their fastpaths. | ||
146 | */ | ||
147 | static void rcu_sync_func(struct rcu_head *rcu) | ||
148 | { | ||
149 | struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); | ||
150 | unsigned long flags; | ||
151 | |||
152 | BUG_ON(rsp->gp_state != GP_PASSED); | ||
153 | BUG_ON(rsp->cb_state == CB_IDLE); | ||
154 | |||
155 | spin_lock_irqsave(&rsp->rss_lock, flags); | ||
156 | if (rsp->gp_count) { | ||
157 | /* | ||
158 | * A new rcu_sync_enter() has happened; drop the callback. | ||
159 | */ | ||
160 | rsp->cb_state = CB_IDLE; | ||
161 | } else if (rsp->cb_state == CB_REPLAY) { | ||
162 | /* | ||
163 | * A new rcu_sync_exit() has happened; requeue the callback | ||
164 | * to catch a later GP. | ||
165 | */ | ||
166 | rsp->cb_state = CB_PENDING; | ||
167 | gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); | ||
168 | } else { | ||
169 | /* | ||
170 | * We're at least a GP after rcu_sync_exit(); everybody will now | ||
171 | * have observed the write side critical section. Let 'em rip! | ||
172 | */ | ||
173 | rsp->cb_state = CB_IDLE; | ||
174 | rsp->gp_state = GP_IDLE; | ||
175 | } | ||
176 | spin_unlock_irqrestore(&rsp->rss_lock, flags); | ||
177 | } | ||
178 | |||
179 | /** | ||
180 | * rcu_sync_exit() - Allow readers back onto fast path after grace period | ||
181 | * @rsp: Pointer to rcu_sync structure to use for synchronization | ||
182 | * | ||
183 | * This function is used by updaters who have completed, and can therefore | ||
184 | * now allow readers to make use of their fastpaths after a grace period | ||
185 | * has elapsed. After this grace period has completed, all subsequent | ||
186 | * calls to rcu_sync_is_idle() will return true, which tells readers that | ||
187 | * they can once again use their fastpaths. | ||
188 | */ | ||
189 | void rcu_sync_exit(struct rcu_sync *rsp) | ||
190 | { | ||
191 | spin_lock_irq(&rsp->rss_lock); | ||
192 | if (!--rsp->gp_count) { | ||
193 | if (rsp->cb_state == CB_IDLE) { | ||
194 | rsp->cb_state = CB_PENDING; | ||
195 | gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); | ||
196 | } else if (rsp->cb_state == CB_PENDING) { | ||
197 | rsp->cb_state = CB_REPLAY; | ||
198 | } | ||
199 | } | ||
200 | spin_unlock_irq(&rsp->rss_lock); | ||
201 | } | ||
202 | |||
203 | /** | ||
204 | * rcu_sync_dtor() - Clean up an rcu_sync structure | ||
205 | * @rsp: Pointer to rcu_sync structure to be cleaned up | ||
206 | */ | ||
207 | void rcu_sync_dtor(struct rcu_sync *rsp) | ||
208 | { | ||
209 | int cb_state; | ||
210 | |||
211 | BUG_ON(rsp->gp_count); | ||
212 | |||
213 | spin_lock_irq(&rsp->rss_lock); | ||
214 | if (rsp->cb_state == CB_REPLAY) | ||
215 | rsp->cb_state = CB_PENDING; | ||
216 | cb_state = rsp->cb_state; | ||
217 | spin_unlock_irq(&rsp->rss_lock); | ||
218 | |||
219 | if (cb_state != CB_IDLE) { | ||
220 | gp_ops[rsp->gp_type].wait(); | ||
221 | BUG_ON(rsp->cb_state != CB_IDLE); | ||
222 | } | ||
223 | } | ||
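
The new kernel/rcu/sync.c gives updaters a way to force readers off a fastpath: readers test rcu_sync_is_idle() inside an RCU read-side section, writers bracket their updates with rcu_sync_enter()/rcu_sync_exit(), and the CB_PENDING/CB_REPLAY callback machinery lets closely spaced enter/exit pairs share a single grace period. The following is only a coarse userspace model of the gp_count bookkeeping (a mutex instead of RCU, no deferred callback) to show the intended reader/writer pattern; it is not the kernel state machine:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct sync_gate {
	pthread_mutex_t lock;
	int gp_count;		/* nested writers, like rsp->gp_count */
	bool readers_fast;	/* rcu_sync_is_idle() analogue */
};

static struct sync_gate gate = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.readers_fast = true,
};

static bool sync_is_idle(struct sync_gate *g)
{
	return g->readers_fast;	/* readers: cheap flag test on the fastpath */
}

static void sync_enter(struct sync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	if (g->gp_count++ == 0)
		g->readers_fast = false;  /* first writer: close the fastpath */
	pthread_mutex_unlock(&g->lock);
	/* The real code waits for a grace period here so every reader has
	 * observed readers_fast == false before the writer proceeds. */
}

static void sync_exit(struct sync_gate *g)
{
	pthread_mutex_lock(&g->lock);
	if (--g->gp_count == 0)
		g->readers_fast = true;   /* last writer: reopen the fastpath */
	pthread_mutex_unlock(&g->lock);
	/* The real code defers this reopen to an RCU callback, which is what
	 * lets closely spaced enter/exit pairs share one grace period. */
}

int main(void)
{
	printf("idle before: %d\n", sync_is_idle(&gate));
	sync_enter(&gate);
	printf("idle during: %d\n", sync_is_idle(&gate));
	sync_exit(&gate);
	printf("idle after:  %d\n", sync_is_idle(&gate));
	return 0;
}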
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d0471056d0af..944b1b491ed8 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
@@ -44,7 +44,7 @@ struct rcu_ctrlblk; | |||
44 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 44 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
45 | static void rcu_process_callbacks(struct softirq_action *unused); | 45 | static void rcu_process_callbacks(struct softirq_action *unused); |
46 | static void __call_rcu(struct rcu_head *head, | 46 | static void __call_rcu(struct rcu_head *head, |
47 | void (*func)(struct rcu_head *rcu), | 47 | rcu_callback_t func, |
48 | struct rcu_ctrlblk *rcp); | 48 | struct rcu_ctrlblk *rcp); |
49 | 49 | ||
50 | #include "tiny_plugin.h" | 50 | #include "tiny_plugin.h" |
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
203 | * Helper function for call_rcu() and call_rcu_bh(). | 203 | * Helper function for call_rcu() and call_rcu_bh(). |
204 | */ | 204 | */ |
205 | static void __call_rcu(struct rcu_head *head, | 205 | static void __call_rcu(struct rcu_head *head, |
206 | void (*func)(struct rcu_head *rcu), | 206 | rcu_callback_t func, |
207 | struct rcu_ctrlblk *rcp) | 207 | struct rcu_ctrlblk *rcp) |
208 | { | 208 | { |
209 | unsigned long flags; | 209 | unsigned long flags; |
@@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head, | |||
229 | * period. But since we have but one CPU, that would be after any | 229 | * period. But since we have but one CPU, that would be after any |
230 | * quiescent state. | 230 | * quiescent state. |
231 | */ | 231 | */ |
232 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 232 | void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) |
233 | { | 233 | { |
234 | __call_rcu(head, func, &rcu_sched_ctrlblk); | 234 | __call_rcu(head, func, &rcu_sched_ctrlblk); |
235 | } | 235 | } |
@@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); | |||
239 | * Post an RCU bottom-half callback to be invoked after any subsequent | 239 | * Post an RCU bottom-half callback to be invoked after any subsequent |
240 | * quiescent state. | 240 | * quiescent state. |
241 | */ | 241 | */ |
242 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 242 | void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) |
243 | { | 243 | { |
244 | __call_rcu(head, func, &rcu_bh_ctrlblk); | 244 | __call_rcu(head, func, &rcu_bh_ctrlblk); |
245 | } | 245 | } |
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9f75f25cc5d9..f07343b54fe5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree"); | |||
71 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 71 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
72 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 72 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
73 | static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; | 73 | static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; |
74 | static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; | ||
75 | 74 | ||
76 | /* | 75 | /* |
77 | * In order to export the rcu_state name to the tracing tools, it | 76 | * In order to export the rcu_state name to the tracing tools, it |
@@ -98,7 +97,7 @@ struct rcu_state sname##_state = { \ | |||
98 | .level = { &sname##_state.node[0] }, \ | 97 | .level = { &sname##_state.node[0] }, \ |
99 | .rda = &sname##_data, \ | 98 | .rda = &sname##_data, \ |
100 | .call = cr, \ | 99 | .call = cr, \ |
101 | .fqs_state = RCU_GP_IDLE, \ | 100 | .gp_state = RCU_GP_IDLE, \ |
102 | .gpnum = 0UL - 300UL, \ | 101 | .gpnum = 0UL - 300UL, \ |
103 | .completed = 0UL - 300UL, \ | 102 | .completed = 0UL - 300UL, \ |
104 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ | 103 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ |
@@ -161,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); | |||
161 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 160 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); |
162 | static void invoke_rcu_core(void); | 161 | static void invoke_rcu_core(void); |
163 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 162 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
163 | static void rcu_report_exp_rdp(struct rcu_state *rsp, | ||
164 | struct rcu_data *rdp, bool wake); | ||
164 | 165 | ||
165 | /* rcuc/rcub kthread realtime priority */ | 166 | /* rcuc/rcub kthread realtime priority */ |
166 | #ifdef CONFIG_RCU_KTHREAD_PRIO | 167 | #ifdef CONFIG_RCU_KTHREAD_PRIO |
@@ -245,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) | |||
245 | */ | 246 | */ |
246 | void rcu_sched_qs(void) | 247 | void rcu_sched_qs(void) |
247 | { | 248 | { |
248 | if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { | 249 | unsigned long flags; |
250 | |||
251 | if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { | ||
249 | trace_rcu_grace_period(TPS("rcu_sched"), | 252 | trace_rcu_grace_period(TPS("rcu_sched"), |
250 | __this_cpu_read(rcu_sched_data.gpnum), | 253 | __this_cpu_read(rcu_sched_data.gpnum), |
251 | TPS("cpuqs")); | 254 | TPS("cpuqs")); |
252 | __this_cpu_write(rcu_sched_data.passed_quiesce, 1); | 255 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); |
256 | if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) | ||
257 | return; | ||
258 | local_irq_save(flags); | ||
259 | if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { | ||
260 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); | ||
261 | rcu_report_exp_rdp(&rcu_sched_state, | ||
262 | this_cpu_ptr(&rcu_sched_data), | ||
263 | true); | ||
264 | } | ||
265 | local_irq_restore(flags); | ||
253 | } | 266 | } |
254 | } | 267 | } |
255 | 268 | ||
256 | void rcu_bh_qs(void) | 269 | void rcu_bh_qs(void) |
257 | { | 270 | { |
258 | if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { | 271 | if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { |
259 | trace_rcu_grace_period(TPS("rcu_bh"), | 272 | trace_rcu_grace_period(TPS("rcu_bh"), |
260 | __this_cpu_read(rcu_bh_data.gpnum), | 273 | __this_cpu_read(rcu_bh_data.gpnum), |
261 | TPS("cpuqs")); | 274 | TPS("cpuqs")); |
262 | __this_cpu_write(rcu_bh_data.passed_quiesce, 1); | 275 | __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); |
263 | } | 276 | } |
264 | } | 277 | } |
265 | 278 | ||
@@ -337,12 +350,14 @@ static void rcu_momentary_dyntick_idle(void) | |||
337 | */ | 350 | */ |
338 | void rcu_note_context_switch(void) | 351 | void rcu_note_context_switch(void) |
339 | { | 352 | { |
353 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | ||
340 | trace_rcu_utilization(TPS("Start context switch")); | 354 | trace_rcu_utilization(TPS("Start context switch")); |
341 | rcu_sched_qs(); | 355 | rcu_sched_qs(); |
342 | rcu_preempt_note_context_switch(); | 356 | rcu_preempt_note_context_switch(); |
343 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 357 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) |
344 | rcu_momentary_dyntick_idle(); | 358 | rcu_momentary_dyntick_idle(); |
345 | trace_rcu_utilization(TPS("End context switch")); | 359 | trace_rcu_utilization(TPS("End context switch")); |
360 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | ||
346 | } | 361 | } |
347 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 362 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
348 | 363 | ||
@@ -353,12 +368,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); | |||
353 | * RCU flavors in desperate need of a quiescent state, which will normally | 368 | * RCU flavors in desperate need of a quiescent state, which will normally |
354 | * be none of them). Either way, do a lightweight quiescent state for | 369 | * be none of them). Either way, do a lightweight quiescent state for |
355 | * all RCU flavors. | 370 | * all RCU flavors. |
371 | * | ||
372 | * The barrier() calls are redundant in the common case when this is | ||
373 | * called externally, but just in case this is called from within this | ||
374 | * file. | ||
375 | * | ||
356 | */ | 376 | */ |
357 | void rcu_all_qs(void) | 377 | void rcu_all_qs(void) |
358 | { | 378 | { |
379 | barrier(); /* Avoid RCU read-side critical sections leaking down. */ | ||
359 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | 380 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) |
360 | rcu_momentary_dyntick_idle(); | 381 | rcu_momentary_dyntick_idle(); |
361 | this_cpu_inc(rcu_qs_ctr); | 382 | this_cpu_inc(rcu_qs_ctr); |
383 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | ||
362 | } | 384 | } |
363 | EXPORT_SYMBOL_GPL(rcu_all_qs); | 385 | EXPORT_SYMBOL_GPL(rcu_all_qs); |
364 | 386 | ||
@@ -1744,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1744 | */ | 1766 | */ |
1745 | rdp->gpnum = rnp->gpnum; | 1767 | rdp->gpnum = rnp->gpnum; |
1746 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | 1768 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); |
1747 | rdp->passed_quiesce = 0; | 1769 | rdp->cpu_no_qs.b.norm = true; |
1748 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 1770 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); |
1749 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | 1771 | rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); |
1750 | zero_cpu_stall_ticks(rdp); | 1772 | zero_cpu_stall_ticks(rdp); |
1751 | WRITE_ONCE(rdp->gpwrap, false); | 1773 | WRITE_ONCE(rdp->gpwrap, false); |
1752 | } | 1774 | } |
@@ -1927,16 +1949,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) | |||
1927 | /* | 1949 | /* |
1928 | * Do one round of quiescent-state forcing. | 1950 | * Do one round of quiescent-state forcing. |
1929 | */ | 1951 | */ |
1930 | static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | 1952 | static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) |
1931 | { | 1953 | { |
1932 | int fqs_state = fqs_state_in; | ||
1933 | bool isidle = false; | 1954 | bool isidle = false; |
1934 | unsigned long maxj; | 1955 | unsigned long maxj; |
1935 | struct rcu_node *rnp = rcu_get_root(rsp); | 1956 | struct rcu_node *rnp = rcu_get_root(rsp); |
1936 | 1957 | ||
1937 | WRITE_ONCE(rsp->gp_activity, jiffies); | 1958 | WRITE_ONCE(rsp->gp_activity, jiffies); |
1938 | rsp->n_force_qs++; | 1959 | rsp->n_force_qs++; |
1939 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1960 | if (first_time) { |
1940 | /* Collect dyntick-idle snapshots. */ | 1961 | /* Collect dyntick-idle snapshots. */ |
1941 | if (is_sysidle_rcu_state(rsp)) { | 1962 | if (is_sysidle_rcu_state(rsp)) { |
1942 | isidle = true; | 1963 | isidle = true; |
@@ -1945,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1945 | force_qs_rnp(rsp, dyntick_save_progress_counter, | 1966 | force_qs_rnp(rsp, dyntick_save_progress_counter, |
1946 | &isidle, &maxj); | 1967 | &isidle, &maxj); |
1947 | rcu_sysidle_report_gp(rsp, isidle, maxj); | 1968 | rcu_sysidle_report_gp(rsp, isidle, maxj); |
1948 | fqs_state = RCU_FORCE_QS; | ||
1949 | } else { | 1969 | } else { |
1950 | /* Handle dyntick-idle and offline CPUs. */ | 1970 | /* Handle dyntick-idle and offline CPUs. */ |
1951 | isidle = true; | 1971 | isidle = true; |
@@ -1959,7 +1979,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1959 | READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); | 1979 | READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); |
1960 | raw_spin_unlock_irq(&rnp->lock); | 1980 | raw_spin_unlock_irq(&rnp->lock); |
1961 | } | 1981 | } |
1962 | return fqs_state; | ||
1963 | } | 1982 | } |
1964 | 1983 | ||
1965 | /* | 1984 | /* |
@@ -2023,7 +2042,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
2023 | /* Declare grace period done. */ | 2042 | /* Declare grace period done. */ |
2024 | WRITE_ONCE(rsp->completed, rsp->gpnum); | 2043 | WRITE_ONCE(rsp->completed, rsp->gpnum); |
2025 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); | 2044 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); |
2026 | rsp->fqs_state = RCU_GP_IDLE; | 2045 | rsp->gp_state = RCU_GP_IDLE; |
2027 | rdp = this_cpu_ptr(rsp->rda); | 2046 | rdp = this_cpu_ptr(rsp->rda); |
2028 | /* Advance CBs to reduce false positives below. */ | 2047 | /* Advance CBs to reduce false positives below. */ |
2029 | needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; | 2048 | needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; |
@@ -2041,7 +2060,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
2041 | */ | 2060 | */ |
2042 | static int __noreturn rcu_gp_kthread(void *arg) | 2061 | static int __noreturn rcu_gp_kthread(void *arg) |
2043 | { | 2062 | { |
2044 | int fqs_state; | 2063 | bool first_gp_fqs; |
2045 | int gf; | 2064 | int gf; |
2046 | unsigned long j; | 2065 | unsigned long j; |
2047 | int ret; | 2066 | int ret; |
@@ -2073,7 +2092,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2073 | } | 2092 | } |
2074 | 2093 | ||
2075 | /* Handle quiescent-state forcing. */ | 2094 | /* Handle quiescent-state forcing. */ |
2076 | fqs_state = RCU_SAVE_DYNTICK; | 2095 | first_gp_fqs = true; |
2077 | j = jiffies_till_first_fqs; | 2096 | j = jiffies_till_first_fqs; |
2078 | if (j > HZ) { | 2097 | if (j > HZ) { |
2079 | j = HZ; | 2098 | j = HZ; |
@@ -2101,7 +2120,8 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
2101 | trace_rcu_grace_period(rsp->name, | 2120 | trace_rcu_grace_period(rsp->name, |
2102 | READ_ONCE(rsp->gpnum), | 2121 | READ_ONCE(rsp->gpnum), |
2103 | TPS("fqsstart")); | 2122 | TPS("fqsstart")); |
2104 | fqs_state = rcu_gp_fqs(rsp, fqs_state); | 2123 | rcu_gp_fqs(rsp, first_gp_fqs); |
2124 | first_gp_fqs = false; | ||
2105 | trace_rcu_grace_period(rsp->name, | 2125 | trace_rcu_grace_period(rsp->name, |
2106 | READ_ONCE(rsp->gpnum), | 2126 | READ_ONCE(rsp->gpnum), |
2107 | TPS("fqsend")); | 2127 | TPS("fqsend")); |
@@ -2337,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
2337 | rnp = rdp->mynode; | 2357 | rnp = rdp->mynode; |
2338 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2358 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2339 | smp_mb__after_unlock_lock(); | 2359 | smp_mb__after_unlock_lock(); |
2340 | if ((rdp->passed_quiesce == 0 && | 2360 | if ((rdp->cpu_no_qs.b.norm && |
2341 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || | 2361 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || |
2342 | rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || | 2362 | rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || |
2343 | rdp->gpwrap) { | 2363 | rdp->gpwrap) { |
@@ -2348,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
2348 | * We will instead need a new quiescent state that lies | 2368 | * We will instead need a new quiescent state that lies |
2349 | * within the current grace period. | 2369 | * within the current grace period. |
2350 | */ | 2370 | */ |
2351 | rdp->passed_quiesce = 0; /* need qs for new gp. */ | 2371 | rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ |
2352 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 2372 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); |
2353 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2373 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2354 | return; | 2374 | return; |
@@ -2357,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
2357 | if ((rnp->qsmask & mask) == 0) { | 2377 | if ((rnp->qsmask & mask) == 0) { |
2358 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2378 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
2359 | } else { | 2379 | } else { |
2360 | rdp->qs_pending = 0; | 2380 | rdp->core_needs_qs = 0; |
2361 | 2381 | ||
2362 | /* | 2382 | /* |
2363 | * This GP can't end until cpu checks in, so all of our | 2383 | * This GP can't end until cpu checks in, so all of our |
@@ -2388,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2388 | * Does this CPU still need to do its part for current grace period? | 2408 | * Does this CPU still need to do its part for current grace period? |
2389 | * If no, return and let the other CPUs do their part as well. | 2409 | * If no, return and let the other CPUs do their part as well. |
2390 | */ | 2410 | */ |
2391 | if (!rdp->qs_pending) | 2411 | if (!rdp->core_needs_qs) |
2392 | return; | 2412 | return; |
2393 | 2413 | ||
2394 | /* | 2414 | /* |
2395 | * Was there a quiescent state since the beginning of the grace | 2415 | * Was there a quiescent state since the beginning of the grace |
2396 | * period? If no, then exit and wait for the next call. | 2416 | * period? If no, then exit and wait for the next call. |
2397 | */ | 2417 | */ |
2398 | if (!rdp->passed_quiesce && | 2418 | if (rdp->cpu_no_qs.b.norm && |
2399 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) | 2419 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) |
2400 | return; | 2420 | return; |
2401 | 2421 | ||
@@ -3017,7 +3037,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) | |||
3017 | * is expected to specify a CPU. | 3037 | * is expected to specify a CPU. |
3018 | */ | 3038 | */ |
3019 | static void | 3039 | static void |
3020 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 3040 | __call_rcu(struct rcu_head *head, rcu_callback_t func, |
3021 | struct rcu_state *rsp, int cpu, bool lazy) | 3041 | struct rcu_state *rsp, int cpu, bool lazy) |
3022 | { | 3042 | { |
3023 | unsigned long flags; | 3043 | unsigned long flags; |
@@ -3088,7 +3108,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
3088 | /* | 3108 | /* |
3089 | * Queue an RCU-sched callback for invocation after a grace period. | 3109 | * Queue an RCU-sched callback for invocation after a grace period. |
3090 | */ | 3110 | */ |
3091 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 3111 | void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) |
3092 | { | 3112 | { |
3093 | __call_rcu(head, func, &rcu_sched_state, -1, 0); | 3113 | __call_rcu(head, func, &rcu_sched_state, -1, 0); |
3094 | } | 3114 | } |
@@ -3097,7 +3117,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); | |||
3097 | /* | 3117 | /* |
3098 | * Queue an RCU callback for invocation after a quicker grace period. | 3118 | * Queue an RCU callback for invocation after a quicker grace period. |
3099 | */ | 3119 | */ |
3100 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 3120 | void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) |
3101 | { | 3121 | { |
3102 | __call_rcu(head, func, &rcu_bh_state, -1, 0); | 3122 | __call_rcu(head, func, &rcu_bh_state, -1, 0); |
3103 | } | 3123 | } |
@@ -3111,7 +3131,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
3111 | * function may only be called from __kfree_rcu(). | 3131 | * function may only be called from __kfree_rcu(). |
3112 | */ | 3132 | */ |
3113 | void kfree_call_rcu(struct rcu_head *head, | 3133 | void kfree_call_rcu(struct rcu_head *head, |
3114 | void (*func)(struct rcu_head *rcu)) | 3134 | rcu_callback_t func) |
3115 | { | 3135 | { |
3116 | __call_rcu(head, func, rcu_state_p, -1, 1); | 3136 | __call_rcu(head, func, rcu_state_p, -1, 1); |
3117 | } | 3137 | } |
@@ -3379,6 +3399,191 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) | |||
3379 | return rcu_seq_done(&rsp->expedited_sequence, s); | 3399 | return rcu_seq_done(&rsp->expedited_sequence, s); |
3380 | } | 3400 | } |
3381 | 3401 | ||
3402 | /* | ||
3403 | * Reset the ->expmaskinit values in the rcu_node tree to reflect any | ||
3404 | * recent CPU-online activity. Note that these masks are not cleared | ||
3405 | * when CPUs go offline, so they reflect the union of all CPUs that have | ||
3406 | * ever been online. This means that this function normally takes its | ||
3407 | * no-work-to-do fastpath. | ||
3408 | */ | ||
3409 | static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) | ||
3410 | { | ||
3411 | bool done; | ||
3412 | unsigned long flags; | ||
3413 | unsigned long mask; | ||
3414 | unsigned long oldmask; | ||
3415 | int ncpus = READ_ONCE(rsp->ncpus); | ||
3416 | struct rcu_node *rnp; | ||
3417 | struct rcu_node *rnp_up; | ||
3418 | |||
3419 | /* If no new CPUs onlined since last time, nothing to do. */ | ||
3420 | if (likely(ncpus == rsp->ncpus_snap)) | ||
3421 | return; | ||
3422 | rsp->ncpus_snap = ncpus; | ||
3423 | |||
3424 | /* | ||
3425 | * Each pass through the following loop propagates newly onlined | ||
3426 | * CPUs for the current rcu_node structure up the rcu_node tree. | ||
3427 | */ | ||
3428 | rcu_for_each_leaf_node(rsp, rnp) { | ||
3429 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
3430 | smp_mb__after_unlock_lock(); | ||
3431 | if (rnp->expmaskinit == rnp->expmaskinitnext) { | ||
3432 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3433 | continue; /* No new CPUs, nothing to do. */ | ||
3434 | } | ||
3435 | |||
3436 | /* Update this node's mask, track old value for propagation. */ | ||
3437 | oldmask = rnp->expmaskinit; | ||
3438 | rnp->expmaskinit = rnp->expmaskinitnext; | ||
3439 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3440 | |||
3441 | /* If was already nonzero, nothing to propagate. */ | ||
3442 | if (oldmask) | ||
3443 | continue; | ||
3444 | |||
3445 | /* Propagate the new CPU up the tree. */ | ||
3446 | mask = rnp->grpmask; | ||
3447 | rnp_up = rnp->parent; | ||
3448 | done = false; | ||
3449 | while (rnp_up) { | ||
3450 | raw_spin_lock_irqsave(&rnp_up->lock, flags); | ||
3451 | smp_mb__after_unlock_lock(); | ||
3452 | if (rnp_up->expmaskinit) | ||
3453 | done = true; | ||
3454 | rnp_up->expmaskinit |= mask; | ||
3455 | raw_spin_unlock_irqrestore(&rnp_up->lock, flags); | ||
3456 | if (done) | ||
3457 | break; | ||
3458 | mask = rnp_up->grpmask; | ||
3459 | rnp_up = rnp_up->parent; | ||
3460 | } | ||
3461 | } | ||
3462 | } | ||
3463 | |||
3464 | /* | ||
3465 | * Reset the ->expmask values in the rcu_node tree in preparation for | ||
3466 | * a new expedited grace period. | ||
3467 | */ | ||
3468 | static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) | ||
3469 | { | ||
3470 | unsigned long flags; | ||
3471 | struct rcu_node *rnp; | ||
3472 | |||
3473 | sync_exp_reset_tree_hotplug(rsp); | ||
3474 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
3475 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
3476 | smp_mb__after_unlock_lock(); | ||
3477 | WARN_ON_ONCE(rnp->expmask); | ||
3478 | rnp->expmask = rnp->expmaskinit; | ||
3479 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3480 | } | ||
3481 | } | ||
3482 | |||
3483 | /* | ||
3484 | * Return non-zero if there is no RCU expedited grace period in progress | ||
3485 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
3486 | * tasks covered by the specified rcu_node structure have done their bit | ||
3487 | * for the current expedited grace period. Works only for preemptible | ||
3488 | * RCU -- other RCU implementations use other means. |||
3489 | * | ||
3490 | * Caller must hold the root rcu_node's exp_funnel_mutex. | ||
3491 | */ | ||
3492 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
3493 | { | ||
3494 | return rnp->exp_tasks == NULL && | ||
3495 | READ_ONCE(rnp->expmask) == 0; | ||
3496 | } | ||
3497 | |||
3498 | /* | ||
3499 | * Report the exit from RCU read-side critical section for the last task | ||
3500 | * that queued itself during or before the current expedited preemptible-RCU | ||
3501 | * grace period. This event is reported either to the rcu_node structure on | ||
3502 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
3503 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
3504 | * iteratively!) | ||
3505 | * | ||
3506 | * Caller must hold the root rcu_node's exp_funnel_mutex and the | ||
3507 | * specified rcu_node structure's ->lock. | ||
3508 | */ | ||
3509 | static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
3510 | bool wake, unsigned long flags) | ||
3511 | __releases(rnp->lock) | ||
3512 | { | ||
3513 | unsigned long mask; | ||
3514 | |||
3515 | for (;;) { | ||
3516 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
3517 | if (!rnp->expmask) | ||
3518 | rcu_initiate_boost(rnp, flags); | ||
3519 | else | ||
3520 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3521 | break; | ||
3522 | } | ||
3523 | if (rnp->parent == NULL) { | ||
3524 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3525 | if (wake) { | ||
3526 | smp_mb(); /* EGP done before wake_up(). */ | ||
3527 | wake_up(&rsp->expedited_wq); | ||
3528 | } | ||
3529 | break; | ||
3530 | } | ||
3531 | mask = rnp->grpmask; | ||
3532 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
3533 | rnp = rnp->parent; | ||
3534 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
3535 | smp_mb__after_unlock_lock(); | ||
3536 | WARN_ON_ONCE(!(rnp->expmask & mask)); | ||
3537 | rnp->expmask &= ~mask; | ||
3538 | } | ||
3539 | } | ||
3540 | |||
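__rcu_report_exp_rnp() is the mirror image of the hotplug propagation above: it clears this node's bit in its parent and keeps climbing only while each level becomes completely quiet, waking the waiter once the root empties. A lock-free toy version of that shape (not kernel code; the wakeup is just a printf):

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned long expmask;	/* children still being waited on */
	unsigned long grpmask;	/* this node's bit in its parent */
};

/* Report that everything under @rnp is done; climb while levels empty out. */
static void report_up(struct node *rnp)
{
	unsigned long mask;

	for (;;) {
		if (rnp->expmask)	/* still waiting on other children */
			return;
		if (!rnp->parent) {	/* root went empty: wake the waiter */
			printf("expedited grace period complete\n");
			return;
		}
		mask = rnp->grpmask;
		rnp = rnp->parent;
		rnp->expmask &= ~mask;
	}
}

int main(void)
{
	struct node root = { NULL, 0x1, 0 };
	struct node leaf = { &root, 0x0, 0x1 };

	report_up(&leaf);	/* clears root's bit, then reports completion */
	return 0;
}
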
3541 | /* | ||
3542 | * Report expedited quiescent state for specified node. This is a | ||
3543 | * lock-acquisition wrapper function for __rcu_report_exp_rnp(). | ||
3544 | * | ||
3545 | * Caller must hold the root rcu_node's exp_funnel_mutex. | ||
3546 | */ | ||
3547 | static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, | ||
3548 | struct rcu_node *rnp, bool wake) | ||
3549 | { | ||
3550 | unsigned long flags; | ||
3551 | |||
3552 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
3553 | smp_mb__after_unlock_lock(); | ||
3554 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); | ||
3555 | } | ||
3556 | |||
3557 | /* | ||
3558 | * Report expedited quiescent state for multiple CPUs, all covered by the | ||
3559 | * specified leaf rcu_node structure. Caller must hold the root | ||
3560 | * rcu_node's exp_funnel_mutex. | ||
3561 | */ | ||
3562 | static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, | ||
3563 | unsigned long mask, bool wake) | ||
3564 | { | ||
3565 | unsigned long flags; | ||
3566 | |||
3567 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
3568 | smp_mb__after_unlock_lock(); | ||
3569 | if (!(rnp->expmask & mask)) { | ||
3570 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3571 | return; | ||
3572 | } | ||
3573 | rnp->expmask &= ~mask; | ||
3574 | __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ | ||
3575 | } | ||
3576 | |||
3577 | /* | ||
3578 | * Report expedited quiescent state for specified rcu_data (CPU). | ||
3579 | * Caller must hold the root rcu_node's exp_funnel_mutex. | ||
3580 | */ | ||
3581 | static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, | ||
3582 | bool wake) | ||
3583 | { | ||
3584 | rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); | ||
3585 | } | ||
3586 | |||
3382 | /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ | 3587 | /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ |
3383 | static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, | 3588 | static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, |
3384 | struct rcu_data *rdp, | 3589 | struct rcu_data *rdp, |
@@ -3455,16 +3660,111 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) | |||
3455 | } | 3660 | } |
3456 | 3661 | ||
3457 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | 3662 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ |
3458 | static int synchronize_sched_expedited_cpu_stop(void *data) | 3663 | static void sync_sched_exp_handler(void *data) |
3459 | { | 3664 | { |
3460 | struct rcu_data *rdp = data; | 3665 | struct rcu_data *rdp; |
3461 | struct rcu_state *rsp = rdp->rsp; | 3666 | struct rcu_node *rnp; |
3667 | struct rcu_state *rsp = data; | ||
3462 | 3668 | ||
3463 | /* We are here: If we are last, do the wakeup. */ | 3669 | rdp = this_cpu_ptr(rsp->rda); |
3464 | rdp->exp_done = true; | 3670 | rnp = rdp->mynode; |
3465 | if (atomic_dec_and_test(&rsp->expedited_need_qs)) | 3671 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || |
3466 | wake_up(&rsp->expedited_wq); | 3672 | __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) |
3467 | return 0; | 3673 | return; |
3674 | __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); | ||
3675 | resched_cpu(smp_processor_id()); | ||
3676 | } | ||
3677 | |||
3678 | /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ | ||
3679 | static void sync_sched_exp_online_cleanup(int cpu) | ||
3680 | { | ||
3681 | struct rcu_data *rdp; | ||
3682 | int ret; | ||
3683 | struct rcu_node *rnp; | ||
3684 | struct rcu_state *rsp = &rcu_sched_state; | ||
3685 | |||
3686 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3687 | rnp = rdp->mynode; | ||
3688 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) | ||
3689 | return; | ||
3690 | ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); | ||
3691 | WARN_ON_ONCE(ret); | ||
3692 | } | ||
3693 | |||
3694 | /* | ||
3695 | * Select the nodes that the upcoming expedited grace period needs | ||
3696 | * to wait for. | ||
3697 | */ | ||
3698 | static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, | ||
3699 | smp_call_func_t func) | ||
3700 | { | ||
3701 | int cpu; | ||
3702 | unsigned long flags; | ||
3703 | unsigned long mask; | ||
3704 | unsigned long mask_ofl_test; | ||
3705 | unsigned long mask_ofl_ipi; | ||
3706 | int ret; | ||
3707 | struct rcu_node *rnp; | ||
3708 | |||
3709 | sync_exp_reset_tree(rsp); | ||
3710 | rcu_for_each_leaf_node(rsp, rnp) { | ||
3711 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
3712 | smp_mb__after_unlock_lock(); | ||
3713 | |||
3714 | /* Each pass checks a CPU for identity, offline, and idle. */ | ||
3715 | mask_ofl_test = 0; | ||
3716 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { | ||
3717 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3718 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
3719 | |||
3720 | if (raw_smp_processor_id() == cpu || | ||
3721 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
3722 | mask_ofl_test |= rdp->grpmask; | ||
3723 | } | ||
3724 | mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; | ||
3725 | |||
3726 | /* | ||
3727 | * Need to wait for any blocked tasks as well. Note that | ||
3728 | * additional blocking tasks will also block the expedited | ||
3729 | * GP until such time as the ->expmask bits are cleared. | ||
3730 | */ | ||
3731 | if (rcu_preempt_has_tasks(rnp)) | ||
3732 | rnp->exp_tasks = rnp->blkd_tasks.next; | ||
3733 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3734 | |||
3735 | /* IPI the remaining CPUs for expedited quiescent state. */ | ||
3736 | mask = 1; | ||
3737 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | ||
3738 | if (!(mask_ofl_ipi & mask)) | ||
3739 | continue; | ||
3740 | retry_ipi: | ||
3741 | ret = smp_call_function_single(cpu, func, rsp, 0); | ||
3742 | if (!ret) { | ||
3743 | mask_ofl_ipi &= ~mask; | ||
3744 | } else { | ||
3745 | /* Failed, raced with offline. */ | ||
3746 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
3747 | if (cpu_online(cpu) && | ||
3748 | (rnp->expmask & mask)) { | ||
3749 | raw_spin_unlock_irqrestore(&rnp->lock, | ||
3750 | flags); | ||
3751 | schedule_timeout_uninterruptible(1); | ||
3752 | if (cpu_online(cpu) && | ||
3753 | (rnp->expmask & mask)) | ||
3754 | goto retry_ipi; | ||
3755 | raw_spin_lock_irqsave(&rnp->lock, | ||
3756 | flags); | ||
3757 | } | ||
3758 | if (!(rnp->expmask & mask)) | ||
3759 | mask_ofl_ipi &= ~mask; | ||
3760 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
3761 | } | ||
3762 | } | ||
3763 | /* Report quiescent states for those that went offline. */ | ||
3764 | mask_ofl_test |= mask_ofl_ipi; | ||
3765 | if (mask_ofl_test) | ||
3766 | rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); | ||
3767 | } | ||
3468 | } | 3768 | } |
3469 | 3769 | ||
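The selection pass in sync_rcu_exp_select_cpus() boils down to splitting a leaf's expmask into two sets: CPUs that can be counted as quiescent immediately (idle, offline, or the CPU doing the selection) and CPUs that must be sent an IPI. A self-contained sketch of just that bitmask bookkeeping (the idleness predicate is stubbed out; no IPIs, no locking):

#include <stdbool.h>
#include <stdio.h>

/* Stub predicate: pretend even-numbered CPUs are idle. */
static bool cpu_is_idle(int cpu)
{
	return (cpu & 1) == 0;
}

int main(void)
{
	unsigned long expmask = 0x3f;	/* CPUs 0-5 need to check in */
	unsigned long mask_test = 0;	/* counted as quiescent right away */
	unsigned long mask_ipi;		/* must be poked remotely */
	int self = 3;			/* the CPU running the selection */
	int cpu;

	for (cpu = 0; cpu < 6; cpu++) {
		unsigned long bit = 1UL << cpu;

		if (!(expmask & bit))
			continue;
		if (cpu == self || cpu_is_idle(cpu))
			mask_test |= bit;
	}
	mask_ipi = expmask & ~mask_test;

	printf("immediate=%#lx ipi=%#lx\n", mask_test, mask_ipi);
	return 0;
}
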
3470 | static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | 3770 | static void synchronize_sched_expedited_wait(struct rcu_state *rsp) |
@@ -3472,7 +3772,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
3472 | int cpu; | 3772 | int cpu; |
3473 | unsigned long jiffies_stall; | 3773 | unsigned long jiffies_stall; |
3474 | unsigned long jiffies_start; | 3774 | unsigned long jiffies_start; |
3475 | struct rcu_data *rdp; | 3775 | unsigned long mask; |
3776 | struct rcu_node *rnp; | ||
3777 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
3476 | int ret; | 3778 | int ret; |
3477 | 3779 | ||
3478 | jiffies_stall = rcu_jiffies_till_stall_check(); | 3780 | jiffies_stall = rcu_jiffies_till_stall_check(); |
@@ -3481,33 +3783,43 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
3481 | for (;;) { | 3783 | for (;;) { |
3482 | ret = wait_event_interruptible_timeout( | 3784 | ret = wait_event_interruptible_timeout( |
3483 | rsp->expedited_wq, | 3785 | rsp->expedited_wq, |
3484 | !atomic_read(&rsp->expedited_need_qs), | 3786 | sync_rcu_preempt_exp_done(rnp_root), |
3485 | jiffies_stall); | 3787 | jiffies_stall); |
3486 | if (ret > 0) | 3788 | if (ret > 0) |
3487 | return; | 3789 | return; |
3488 | if (ret < 0) { | 3790 | if (ret < 0) { |
3489 | /* Hit a signal, disable CPU stall warnings. */ | 3791 | /* Hit a signal, disable CPU stall warnings. */ |
3490 | wait_event(rsp->expedited_wq, | 3792 | wait_event(rsp->expedited_wq, |
3491 | !atomic_read(&rsp->expedited_need_qs)); | 3793 | sync_rcu_preempt_exp_done(rnp_root)); |
3492 | return; | 3794 | return; |
3493 | } | 3795 | } |
3494 | pr_err("INFO: %s detected expedited stalls on CPUs: {", | 3796 | pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", |
3495 | rsp->name); | 3797 | rsp->name); |
3496 | for_each_online_cpu(cpu) { | 3798 | rcu_for_each_leaf_node(rsp, rnp) { |
3497 | rdp = per_cpu_ptr(rsp->rda, cpu); | 3799 | (void)rcu_print_task_exp_stall(rnp); |
3498 | 3800 | mask = 1; | |
3499 | if (rdp->exp_done) | 3801 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { |
3500 | continue; | 3802 | struct rcu_data *rdp; |
3501 | pr_cont(" %d", cpu); | 3803 | |
3804 | if (!(rnp->expmask & mask)) | ||
3805 | continue; | ||
3806 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3807 | pr_cont(" %d-%c%c%c", cpu, | ||
3808 | "O."[cpu_online(cpu)], | ||
3809 | "o."[!!(rdp->grpmask & rnp->expmaskinit)], | ||
3810 | "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); | ||
3811 | } | ||
3812 | mask <<= 1; | ||
3502 | } | 3813 | } |
3503 | pr_cont(" } %lu jiffies s: %lu\n", | 3814 | pr_cont(" } %lu jiffies s: %lu\n", |
3504 | jiffies - jiffies_start, rsp->expedited_sequence); | 3815 | jiffies - jiffies_start, rsp->expedited_sequence); |
3505 | for_each_online_cpu(cpu) { | 3816 | rcu_for_each_leaf_node(rsp, rnp) { |
3506 | rdp = per_cpu_ptr(rsp->rda, cpu); | 3817 | mask = 1; |
3507 | 3818 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { | |
3508 | if (rdp->exp_done) | 3819 | if (!(rnp->expmask & mask)) |
3509 | continue; | 3820 | continue; |
3510 | dump_cpu_task(cpu); | 3821 | dump_cpu_task(cpu); |
3822 | } | ||
3511 | } | 3823 | } |
3512 | jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; | 3824 | jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; |
3513 | } | 3825 | } |
@@ -3531,7 +3843,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) | |||
3531 | */ | 3843 | */ |
3532 | void synchronize_sched_expedited(void) | 3844 | void synchronize_sched_expedited(void) |
3533 | { | 3845 | { |
3534 | int cpu; | ||
3535 | unsigned long s; | 3846 | unsigned long s; |
3536 | struct rcu_node *rnp; | 3847 | struct rcu_node *rnp; |
3537 | struct rcu_state *rsp = &rcu_sched_state; | 3848 | struct rcu_state *rsp = &rcu_sched_state; |
@@ -3539,48 +3850,16 @@ void synchronize_sched_expedited(void) | |||
3539 | /* Take a snapshot of the sequence number. */ | 3850 | /* Take a snapshot of the sequence number. */ |
3540 | s = rcu_exp_gp_seq_snap(rsp); | 3851 | s = rcu_exp_gp_seq_snap(rsp); |
3541 | 3852 | ||
3542 | if (!try_get_online_cpus()) { | ||
3543 | /* CPU hotplug operation in flight, fall back to normal GP. */ | ||
3544 | wait_rcu_gp(call_rcu_sched); | ||
3545 | atomic_long_inc(&rsp->expedited_normal); | ||
3546 | return; | ||
3547 | } | ||
3548 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | ||
3549 | |||
3550 | rnp = exp_funnel_lock(rsp, s); | 3853 | rnp = exp_funnel_lock(rsp, s); |
3551 | if (rnp == NULL) { | 3854 | if (rnp == NULL) |
3552 | put_online_cpus(); | ||
3553 | return; /* Someone else did our work for us. */ | 3855 | return; /* Someone else did our work for us. */ |
3554 | } | ||
3555 | 3856 | ||
3556 | rcu_exp_gp_seq_start(rsp); | 3857 | rcu_exp_gp_seq_start(rsp); |
3557 | 3858 | sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); | |
3558 | /* Stop each CPU that is online, non-idle, and not us. */ | 3859 | synchronize_sched_expedited_wait(rsp); |
3559 | init_waitqueue_head(&rsp->expedited_wq); | ||
3560 | atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ | ||
3561 | for_each_online_cpu(cpu) { | ||
3562 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
3563 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
3564 | |||
3565 | rdp->exp_done = false; | ||
3566 | |||
3567 | /* Skip our CPU and any idle CPUs. */ | ||
3568 | if (raw_smp_processor_id() == cpu || | ||
3569 | !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) | ||
3570 | continue; | ||
3571 | atomic_inc(&rsp->expedited_need_qs); | ||
3572 | stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, | ||
3573 | rdp, &rdp->exp_stop_work); | ||
3574 | } | ||
3575 | |||
3576 | /* Remove extra count and, if necessary, wait for CPUs to stop. */ | ||
3577 | if (!atomic_dec_and_test(&rsp->expedited_need_qs)) | ||
3578 | synchronize_sched_expedited_wait(rsp); | ||
3579 | 3860 | ||
3580 | rcu_exp_gp_seq_end(rsp); | 3861 | rcu_exp_gp_seq_end(rsp); |
3581 | mutex_unlock(&rnp->exp_funnel_mutex); | 3862 | mutex_unlock(&rnp->exp_funnel_mutex); |
3582 | |||
3583 | put_online_cpus(); | ||
3584 | } | 3863 | } |
3585 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | 3864 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); |
3586 | 3865 | ||
@@ -3606,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
3606 | 3885 | ||
3607 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 3886 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
3608 | if (rcu_scheduler_fully_active && | 3887 | if (rcu_scheduler_fully_active && |
3609 | rdp->qs_pending && !rdp->passed_quiesce && | 3888 | rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && |
3610 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { | 3889 | rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { |
3611 | rdp->n_rp_qs_pending++; | 3890 | rdp->n_rp_core_needs_qs++; |
3612 | } else if (rdp->qs_pending && | 3891 | } else if (rdp->core_needs_qs && |
3613 | (rdp->passed_quiesce || | 3892 | (!rdp->cpu_no_qs.b.norm || |
3614 | rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { | 3893 | rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { |
3615 | rdp->n_rp_report_qs++; | 3894 | rdp->n_rp_report_qs++; |
3616 | return 1; | 3895 | return 1; |
@@ -3901,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3901 | 4180 | ||
3902 | /* Set up local state, ensuring consistent view of global state. */ | 4181 | /* Set up local state, ensuring consistent view of global state. */ |
3903 | raw_spin_lock_irqsave(&rnp->lock, flags); | 4182 | raw_spin_lock_irqsave(&rnp->lock, flags); |
3904 | rdp->beenonline = 1; /* We have now been online. */ | ||
3905 | rdp->qlen_last_fqs_check = 0; | 4183 | rdp->qlen_last_fqs_check = 0; |
3906 | rdp->n_force_qs_snap = rsp->n_force_qs; | 4184 | rdp->n_force_qs_snap = rsp->n_force_qs; |
3907 | rdp->blimit = blimit; | 4185 | rdp->blimit = blimit; |
@@ -3923,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
3923 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 4201 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
3924 | smp_mb__after_unlock_lock(); | 4202 | smp_mb__after_unlock_lock(); |
3925 | rnp->qsmaskinitnext |= mask; | 4203 | rnp->qsmaskinitnext |= mask; |
4204 | rnp->expmaskinitnext |= mask; | ||
4205 | if (!rdp->beenonline) | ||
4206 | WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); | ||
4207 | rdp->beenonline = true; /* We have now been online. */ | ||
3926 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ | 4208 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ |
3927 | rdp->completed = rnp->completed; | 4209 | rdp->completed = rnp->completed; |
3928 | rdp->passed_quiesce = false; | 4210 | rdp->cpu_no_qs.b.norm = true; |
3929 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); | 4211 | rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); |
3930 | rdp->qs_pending = false; | 4212 | rdp->core_needs_qs = false; |
3931 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 4213 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); |
3932 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 4214 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
3933 | } | 4215 | } |
@@ -3960,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self, | |||
3960 | break; | 4242 | break; |
3961 | case CPU_ONLINE: | 4243 | case CPU_ONLINE: |
3962 | case CPU_DOWN_FAILED: | 4244 | case CPU_DOWN_FAILED: |
4245 | sync_sched_exp_online_cleanup(cpu); | ||
3963 | rcu_boost_kthread_setaffinity(rnp, -1); | 4246 | rcu_boost_kthread_setaffinity(rnp, -1); |
3964 | break; | 4247 | break; |
3965 | case CPU_DOWN_PREPARE: | 4248 | case CPU_DOWN_PREPARE: |
@@ -3971,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self, | |||
3971 | rcu_cleanup_dying_cpu(rsp); | 4254 | rcu_cleanup_dying_cpu(rsp); |
3972 | break; | 4255 | break; |
3973 | case CPU_DYING_IDLE: | 4256 | case CPU_DYING_IDLE: |
4257 | /* QS for any half-done expedited RCU-sched GP. */ | ||
4258 | preempt_disable(); | ||
4259 | rcu_report_exp_rdp(&rcu_sched_state, | ||
4260 | this_cpu_ptr(rcu_sched_state.rda), true); | ||
4261 | preempt_enable(); | ||
4262 | |||
3974 | for_each_rcu_flavor(rsp) { | 4263 | for_each_rcu_flavor(rsp) { |
3975 | rcu_cleanup_dying_idle_cpu(cpu, rsp); | 4264 | rcu_cleanup_dying_idle_cpu(cpu, rsp); |
3976 | } | 4265 | } |
@@ -4102,7 +4391,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
4102 | static const char * const buf[] = RCU_NODE_NAME_INIT; | 4391 | static const char * const buf[] = RCU_NODE_NAME_INIT; |
4103 | static const char * const fqs[] = RCU_FQS_NAME_INIT; | 4392 | static const char * const fqs[] = RCU_FQS_NAME_INIT; |
4104 | static const char * const exp[] = RCU_EXP_NAME_INIT; | 4393 | static const char * const exp[] = RCU_EXP_NAME_INIT; |
4105 | static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; | ||
4106 | static u8 fl_mask = 0x1; | 4394 | static u8 fl_mask = 0x1; |
4107 | 4395 | ||
4108 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ | 4396 | int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ |
@@ -4162,18 +4450,13 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
4162 | INIT_LIST_HEAD(&rnp->blkd_tasks); | 4450 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
4163 | rcu_init_one_nocb(rnp); | 4451 | rcu_init_one_nocb(rnp); |
4164 | mutex_init(&rnp->exp_funnel_mutex); | 4452 | mutex_init(&rnp->exp_funnel_mutex); |
4165 | if (rsp == &rcu_sched_state) | 4453 | lockdep_set_class_and_name(&rnp->exp_funnel_mutex, |
4166 | lockdep_set_class_and_name( | 4454 | &rcu_exp_class[i], exp[i]); |
4167 | &rnp->exp_funnel_mutex, | ||
4168 | &rcu_exp_sched_class[i], exp_sched[i]); | ||
4169 | else | ||
4170 | lockdep_set_class_and_name( | ||
4171 | &rnp->exp_funnel_mutex, | ||
4172 | &rcu_exp_class[i], exp[i]); | ||
4173 | } | 4455 | } |
4174 | } | 4456 | } |
4175 | 4457 | ||
4176 | init_waitqueue_head(&rsp->gp_wq); | 4458 | init_waitqueue_head(&rsp->gp_wq); |
4459 | init_waitqueue_head(&rsp->expedited_wq); | ||
4177 | rnp = rsp->level[rcu_num_lvls - 1]; | 4460 | rnp = rsp->level[rcu_num_lvls - 1]; |
4178 | for_each_possible_cpu(i) { | 4461 | for_each_possible_cpu(i) { |
4179 | while (i > rnp->grphi) | 4462 | while (i > rnp->grphi) |
@@ -4216,13 +4499,12 @@ static void __init rcu_init_geometry(void) | |||
4216 | rcu_fanout_leaf, nr_cpu_ids); | 4499 | rcu_fanout_leaf, nr_cpu_ids); |
4217 | 4500 | ||
4218 | /* | 4501 | /* |
4219 | * The boot-time rcu_fanout_leaf parameter is only permitted | 4502 | * The boot-time rcu_fanout_leaf parameter must be at least two |
4220 | * to increase the leaf-level fanout, not decrease it. Of course, | 4503 | * and cannot exceed the number of bits in the rcu_node masks. |
4221 | * the leaf-level fanout cannot exceed the number of bits in | 4504 | * Complain and fall back to the compile-time values if this |
4222 | * the rcu_node masks. Complain and fall back to the compile- | 4505 | * limit is exceeded. |
4223 | * time values if these limits are exceeded. | ||
4224 | */ | 4506 | */ |
4225 | if (rcu_fanout_leaf < RCU_FANOUT_LEAF || | 4507 | if (rcu_fanout_leaf < 2 || |
4226 | rcu_fanout_leaf > sizeof(unsigned long) * 8) { | 4508 | rcu_fanout_leaf > sizeof(unsigned long) * 8) { |
4227 | rcu_fanout_leaf = RCU_FANOUT_LEAF; | 4509 | rcu_fanout_leaf = RCU_FANOUT_LEAF; |
4228 | WARN_ON(1); | 4510 | WARN_ON(1); |
@@ -4239,10 +4521,13 @@ static void __init rcu_init_geometry(void) | |||
4239 | 4521 | ||
4240 | /* | 4522 | /* |
4241 | * The tree must be able to accommodate the configured number of CPUs. | 4523 | * The tree must be able to accommodate the configured number of CPUs. |
4242 | * If this limit is exceeded than we have a serious problem elsewhere. | 4524 | * If this limit is exceeded, fall back to the compile-time values. |
4243 | */ | 4525 | */ |
4244 | if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) | 4526 | if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { |
4245 | panic("rcu_init_geometry: rcu_capacity[] is too small"); | 4527 | rcu_fanout_leaf = RCU_FANOUT_LEAF; |
4528 | WARN_ON(1); | ||
4529 | return; | ||
4530 | } | ||
4246 | 4531 | ||
4247 | /* Calculate the number of levels in the tree. */ | 4532 | /* Calculate the number of levels in the tree. */ |
4248 | for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { | 4533 | for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { |
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2e991f8361e4..9fb4e238d4dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
@@ -70,8 +70,6 @@ | |||
70 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } | 70 | # define RCU_NODE_NAME_INIT { "rcu_node_0" } |
71 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } | 71 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } |
72 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } | 72 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } |
73 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
74 | { "rcu_node_exp_sched_0" } | ||
75 | #elif NR_CPUS <= RCU_FANOUT_2 | 73 | #elif NR_CPUS <= RCU_FANOUT_2 |
76 | # define RCU_NUM_LVLS 2 | 74 | # define RCU_NUM_LVLS 2 |
77 | # define NUM_RCU_LVL_0 1 | 75 | # define NUM_RCU_LVL_0 1 |
@@ -81,8 +79,6 @@ | |||
81 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } | 79 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } |
82 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } | 80 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } |
83 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } | 81 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } |
84 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
85 | { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } | ||
86 | #elif NR_CPUS <= RCU_FANOUT_3 | 82 | #elif NR_CPUS <= RCU_FANOUT_3 |
87 | # define RCU_NUM_LVLS 3 | 83 | # define RCU_NUM_LVLS 3 |
88 | # define NUM_RCU_LVL_0 1 | 84 | # define NUM_RCU_LVL_0 1 |
@@ -93,8 +89,6 @@ | |||
93 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } | 89 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } |
94 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } | 90 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } |
95 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } | 91 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } |
96 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
97 | { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } | ||
98 | #elif NR_CPUS <= RCU_FANOUT_4 | 92 | #elif NR_CPUS <= RCU_FANOUT_4 |
99 | # define RCU_NUM_LVLS 4 | 93 | # define RCU_NUM_LVLS 4 |
100 | # define NUM_RCU_LVL_0 1 | 94 | # define NUM_RCU_LVL_0 1 |
@@ -106,8 +100,6 @@ | |||
106 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } | 100 | # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } |
107 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } | 101 | # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } |
108 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } | 102 | # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } |
109 | # define RCU_EXP_SCHED_NAME_INIT \ | ||
110 | { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } | ||
111 | #else | 103 | #else |
112 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | 104 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" |
113 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | 105 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ |
@@ -171,16 +163,21 @@ struct rcu_node { | |||
171 | /* an rcu_data structure, otherwise, each */ | 163 | /* an rcu_data structure, otherwise, each */ |
172 | /* bit corresponds to a child rcu_node */ | 164 | /* bit corresponds to a child rcu_node */ |
173 | /* structure. */ | 165 | /* structure. */ |
174 | unsigned long expmask; /* Groups that have ->blkd_tasks */ | ||
175 | /* elements that need to drain to allow the */ | ||
176 | /* current expedited grace period to */ | ||
177 | /* complete (only for PREEMPT_RCU). */ | ||
178 | unsigned long qsmaskinit; | 166 | unsigned long qsmaskinit; |
179 | /* Per-GP initial value for qsmask & expmask. */ | 167 | /* Per-GP initial value for qsmask. */ |
180 | /* Initialized from ->qsmaskinitnext at the */ | 168 | /* Initialized from ->qsmaskinitnext at the */ |
181 | /* beginning of each grace period. */ | 169 | /* beginning of each grace period. */ |
182 | unsigned long qsmaskinitnext; | 170 | unsigned long qsmaskinitnext; |
183 | /* Online CPUs for next grace period. */ | 171 | /* Online CPUs for next grace period. */ |
172 | unsigned long expmask; /* CPUs or groups that need to check in */ | ||
173 | /* to allow the current expedited GP */ | ||
174 | /* to complete. */ | ||
175 | unsigned long expmaskinit; | ||
176 | /* Per-GP initial values for expmask. */ | ||
177 | /* Initialized from ->expmaskinitnext at the */ | ||
178 | /* beginning of each expedited GP. */ | ||
179 | unsigned long expmaskinitnext; | ||
180 | /* Online CPUs for next expedited GP. */ | ||
184 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 181 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
185 | /* Only one bit will be set in this mask. */ | 182 | /* Only one bit will be set in this mask. */ |
186 | int grplo; /* lowest-numbered CPU or group here. */ | 183 | int grplo; /* lowest-numbered CPU or group here. */ |
@@ -281,6 +278,18 @@ struct rcu_node { | |||
281 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | 278 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ |
282 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | 279 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) |
283 | 280 | ||
281 | /* | ||
282 | * Union to allow "aggregate OR" operation on the need for a quiescent | ||
283 | * state by the normal and expedited grace periods. | ||
284 | */ | ||
285 | union rcu_noqs { | ||
286 | struct { | ||
287 | u8 norm; | ||
288 | u8 exp; | ||
289 | } b; /* Bits. */ | ||
290 | u16 s; /* Set of bits, aggregate OR here. */ | ||
291 | }; | ||
292 | |||
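The union above lets the hot path ask "does this CPU still owe any quiescent state?" with a single load of .s instead of testing .b.norm and .b.exp separately. A small stand-alone demonstration of that layout (same shape, user-space C):

#include <stdint.h>
#include <stdio.h>

union noqs {
	struct {
		uint8_t norm;	/* normal GP still needs a QS from us */
		uint8_t exp;	/* expedited GP still needs a QS from us */
	} b;
	uint16_t s;		/* both flags, readable in one load */
};

int main(void)
{
	union noqs q = { .b = { .norm = 1, .exp = 0 } };

	if (q.s)		/* true while either flag is set */
		printf("quiescent state still owed (s=%#x)\n", q.s);

	q.b.norm = 0;		/* normal QS reported */
	printf("all clear: %s\n", q.s ? "no" : "yes");
	return 0;
}
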
284 | /* Index values for nxttail array in struct rcu_data. */ | 293 | /* Index values for nxttail array in struct rcu_data. */ |
285 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | 294 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ |
286 | #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ | 295 | #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ |
@@ -297,8 +306,8 @@ struct rcu_data { | |||
297 | /* is aware of having started. */ | 306 | /* is aware of having started. */ |
298 | unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ | 307 | unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ |
299 | /* for rcu_all_qs() invocations. */ | 308 | /* for rcu_all_qs() invocations. */ |
300 | bool passed_quiesce; /* User-mode/idle loop etc. */ | 309 | union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ |
301 | bool qs_pending; /* Core waits for quiesc state. */ | 310 | bool core_needs_qs; /* Core waits for quiesc state. */ |
302 | bool beenonline; /* CPU online at least once. */ | 311 | bool beenonline; /* CPU online at least once. */ |
303 | bool gpwrap; /* Possible gpnum/completed wrap. */ | 312 | bool gpwrap; /* Possible gpnum/completed wrap. */ |
304 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 313 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
@@ -307,9 +316,6 @@ struct rcu_data { | |||
307 | /* ticks this CPU has handled */ | 316 | /* ticks this CPU has handled */ |
308 | /* during and after the last grace */ | 317 | /* during and after the last grace */ |
309 | /* period it is aware of. */ | 318 | /* period it is aware of. */ |
310 | struct cpu_stop_work exp_stop_work; | ||
311 | /* Expedited grace-period control */ | ||
312 | /* for CPU stopping. */ | ||
313 | 319 | ||
314 | /* 2) batch handling */ | 320 | /* 2) batch handling */ |
315 | /* | 321 | /* |
@@ -363,7 +369,7 @@ struct rcu_data { | |||
363 | 369 | ||
364 | /* 5) __rcu_pending() statistics. */ | 370 | /* 5) __rcu_pending() statistics. */ |
365 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ | 371 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ |
366 | unsigned long n_rp_qs_pending; | 372 | unsigned long n_rp_core_needs_qs; |
367 | unsigned long n_rp_report_qs; | 373 | unsigned long n_rp_report_qs; |
368 | unsigned long n_rp_cb_ready; | 374 | unsigned long n_rp_cb_ready; |
369 | unsigned long n_rp_cpu_needs_gp; | 375 | unsigned long n_rp_cpu_needs_gp; |
@@ -378,7 +384,6 @@ struct rcu_data { | |||
378 | struct rcu_head oom_head; | 384 | struct rcu_head oom_head; |
379 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 385 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
380 | struct mutex exp_funnel_mutex; | 386 | struct mutex exp_funnel_mutex; |
381 | bool exp_done; /* Expedited QS for this CPU? */ | ||
382 | 387 | ||
383 | /* 7) Callback offloading. */ | 388 | /* 7) Callback offloading. */ |
384 | #ifdef CONFIG_RCU_NOCB_CPU | 389 | #ifdef CONFIG_RCU_NOCB_CPU |
@@ -412,13 +417,6 @@ struct rcu_data { | |||
412 | struct rcu_state *rsp; | 417 | struct rcu_state *rsp; |
413 | }; | 418 | }; |
414 | 419 | ||
415 | /* Values for fqs_state field in struct rcu_state. */ | ||
416 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | ||
417 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | ||
418 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | ||
419 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | ||
420 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | ||
421 | |||
422 | /* Values for nocb_defer_wakeup field in struct rcu_data. */ | 420 | /* Values for nocb_defer_wakeup field in struct rcu_data. */ |
423 | #define RCU_NOGP_WAKE_NOT 0 | 421 | #define RCU_NOGP_WAKE_NOT 0 |
424 | #define RCU_NOGP_WAKE 1 | 422 | #define RCU_NOGP_WAKE 1 |
@@ -464,14 +462,13 @@ struct rcu_state { | |||
464 | /* shut bogus gcc warning) */ | 462 | /* shut bogus gcc warning) */ |
465 | u8 flavor_mask; /* bit in flavor mask. */ | 463 | u8 flavor_mask; /* bit in flavor mask. */ |
466 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | 464 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
467 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 465 | call_rcu_func_t call; /* call_rcu() flavor. */ |
468 | void (*func)(struct rcu_head *head)); | 466 | int ncpus; /* # CPUs seen so far. */ |
469 | 467 | ||
470 | /* The following fields are guarded by the root rcu_node's lock. */ | 468 | /* The following fields are guarded by the root rcu_node's lock. */ |
471 | 469 | ||
472 | u8 fqs_state ____cacheline_internodealigned_in_smp; | 470 | u8 boost ____cacheline_internodealigned_in_smp; |
473 | /* Force QS state. */ | 471 | /* Subject to priority boost. */ |
474 | u8 boost; /* Subject to priority boost. */ | ||
475 | unsigned long gpnum; /* Current gp number. */ | 472 | unsigned long gpnum; /* Current gp number. */ |
476 | unsigned long completed; /* # of last completed gp. */ | 473 | unsigned long completed; /* # of last completed gp. */ |
477 | struct task_struct *gp_kthread; /* Task for grace periods. */ | 474 | struct task_struct *gp_kthread; /* Task for grace periods. */ |
@@ -508,6 +505,7 @@ struct rcu_state { | |||
508 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | 505 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ |
509 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ | 506 | atomic_t expedited_need_qs; /* # CPUs left to check in. */ |
510 | wait_queue_head_t expedited_wq; /* Wait for check-ins. */ | 507 | wait_queue_head_t expedited_wq; /* Wait for check-ins. */ |
508 | int ncpus_snap; /* # CPUs seen last time. */ | ||
511 | 509 | ||
512 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 510 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
513 | /* force_quiescent_state(). */ | 511 | /* force_quiescent_state(). */ |
@@ -538,8 +536,8 @@ struct rcu_state { | |||
538 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ | 536 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ |
539 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | 537 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ |
540 | 538 | ||
541 | /* Values for rcu_state structure's gp_flags field. */ | 539 | /* Values for rcu_state structure's gp_state field. */ |
542 | #define RCU_GP_WAIT_INIT 0 /* Initial state. */ | 540 | #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ |
543 | #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ | 541 | #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ |
544 | #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ | 542 | #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ |
545 | #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ | 543 | #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ |
@@ -582,9 +580,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | |||
582 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 580 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
583 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 581 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
584 | static int rcu_print_task_stall(struct rcu_node *rnp); | 582 | static int rcu_print_task_stall(struct rcu_node *rnp); |
583 | static int rcu_print_task_exp_stall(struct rcu_node *rnp); | ||
585 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 584 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
586 | static void rcu_preempt_check_callbacks(void); | 585 | static void rcu_preempt_check_callbacks(void); |
587 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 586 | void call_rcu(struct rcu_head *head, rcu_callback_t func); |
588 | static void __init __rcu_init_preempt(void); | 587 | static void __init __rcu_init_preempt(void); |
589 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 588 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
590 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 589 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b2bf3963a0ae..630c19772630 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
@@ -101,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | |||
101 | static struct rcu_state *const rcu_state_p = &rcu_preempt_state; | 101 | static struct rcu_state *const rcu_state_p = &rcu_preempt_state; |
102 | static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; | 102 | static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; |
103 | 103 | ||
104 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | ||
105 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 104 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
106 | bool wake); | 105 | bool wake); |
107 | 106 | ||
@@ -114,6 +113,147 @@ static void __init rcu_bootup_announce(void) | |||
114 | rcu_bootup_announce_oddness(); | 113 | rcu_bootup_announce_oddness(); |
115 | } | 114 | } |
116 | 115 | ||
116 | /* Flags for rcu_preempt_ctxt_queue() decision table. */ | ||
117 | #define RCU_GP_TASKS 0x8 | ||
118 | #define RCU_EXP_TASKS 0x4 | ||
119 | #define RCU_GP_BLKD 0x2 | ||
120 | #define RCU_EXP_BLKD 0x1 | ||
121 | |||
122 | /* | ||
123 | * Queues a task preempted within an RCU-preempt read-side critical | ||
124 | * section into the appropriate location within the ->blkd_tasks list, | ||
125 | * depending on the states of any ongoing normal and expedited grace | ||
126 | * periods. The ->gp_tasks pointer indicates which element the normal | ||
127 | * grace period is waiting on (NULL if none), and the ->exp_tasks pointer | ||
128 | * indicates which element the expedited grace period is waiting on (again, | ||
129 | * NULL if none). If a grace period is waiting on a given element in the | ||
130 | * ->blkd_tasks list, it also waits on all subsequent elements. Thus, | ||
131 | * adding a task to the tail of the list blocks any grace period that is | ||
132 | * already waiting on one of the elements. In contrast, adding a task | ||
133 | * to the head of the list won't block any grace period that is already | ||
134 | * waiting on one of the elements. | ||
135 | * | ||
136 | * This queuing is imprecise, and can sometimes make an ongoing grace | ||
137 | * period wait for a task that is not strictly speaking blocking it. | ||
138 | * Given the choice, we needlessly block a normal grace period rather than | ||
139 | * blocking an expedited grace period. | ||
140 | * | ||
141 | * Note that an endless sequence of expedited grace periods still cannot | ||
142 | * indefinitely postpone a normal grace period. Eventually, all of the | ||
143 | * fixed number of preempted tasks blocking the normal grace period that are | ||
144 | * not also blocking the expedited grace period will resume and complete | ||
145 | * their RCU read-side critical sections. At that point, the ->gp_tasks | ||
146 | * pointer will equal the ->exp_tasks pointer, at which point the end of | ||
147 | * the corresponding expedited grace period will also be the end of the | ||
148 | * normal grace period. | ||
149 | */ | ||
150 | static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, | ||
151 | unsigned long flags) __releases(rnp->lock) | ||
152 | { | ||
153 | int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + | ||
154 | (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + | ||
155 | (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) + | ||
156 | (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); | ||
157 | struct task_struct *t = current; | ||
158 | |||
159 | /* | ||
160 | * Decide where to queue the newly blocked task. In theory, | ||
161 | * this could be an if-statement. In practice, when I tried | ||
162 | * that, it was quite messy. | ||
163 | */ | ||
164 | switch (blkd_state) { | ||
165 | case 0: | ||
166 | case RCU_EXP_TASKS: | ||
167 | case RCU_EXP_TASKS + RCU_GP_BLKD: | ||
168 | case RCU_GP_TASKS: | ||
169 | case RCU_GP_TASKS + RCU_EXP_TASKS: | ||
170 | |||
171 | /* | ||
172 | * Blocking neither GP, or first task blocking the normal | ||
173 | * GP but not blocking the already-waiting expedited GP. | ||
174 | * Queue at the head of the list to avoid unnecessarily | ||
175 | * blocking the already-waiting GPs. | ||
176 | */ | ||
177 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
178 | break; | ||
179 | |||
180 | case RCU_EXP_BLKD: | ||
181 | case RCU_GP_BLKD: | ||
182 | case RCU_GP_BLKD + RCU_EXP_BLKD: | ||
183 | case RCU_GP_TASKS + RCU_EXP_BLKD: | ||
184 | case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: | ||
185 | case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: | ||
186 | |||
187 | /* | ||
188 | * First task arriving that blocks either GP, or first task | ||
189 | * arriving that blocks the expedited GP (with the normal | ||
190 | * GP already waiting), or a task arriving that blocks | ||
191 | * both GPs with both GPs already waiting. Queue at the | ||
192 | * tail of the list to avoid any GP waiting on any of the | ||
193 | * already queued tasks that are not blocking it. | ||
194 | */ | ||
195 | list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
196 | break; | ||
197 | |||
198 | case RCU_EXP_TASKS + RCU_EXP_BLKD: | ||
199 | case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: | ||
200 | case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: | ||
201 | |||
202 | /* | ||
203 | * Second or subsequent task blocking the expedited GP. | ||
204 | * The task either does not block the normal GP, or is the | ||
205 | * first task blocking the normal GP. Queue just after | ||
206 | * the first task blocking the expedited GP. | ||
207 | */ | ||
208 | list_add(&t->rcu_node_entry, rnp->exp_tasks); | ||
209 | break; | ||
210 | |||
211 | case RCU_GP_TASKS + RCU_GP_BLKD: | ||
212 | case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: | ||
213 | |||
214 | /* | ||
215 | * Second or subsequent task blocking the normal GP. | ||
216 | * The task does not block the expedited GP. Queue just | ||
217 | * after the first task blocking the normal GP. | ||
218 | */ | ||
219 | list_add(&t->rcu_node_entry, rnp->gp_tasks); | ||
220 | break; | ||
221 | |||
222 | default: | ||
223 | |||
224 | /* Yet another exercise in excessive paranoia. */ | ||
225 | WARN_ON_ONCE(1); | ||
226 | break; | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * We have now queued the task. If it was the first one to | ||
231 | * block either grace period, update the ->gp_tasks and/or | ||
232 | * ->exp_tasks pointers, respectively, to reference the newly | ||
233 | * blocked tasks. | ||
234 | */ | ||
235 | if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) | ||
236 | rnp->gp_tasks = &t->rcu_node_entry; | ||
237 | if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) | ||
238 | rnp->exp_tasks = &t->rcu_node_entry; | ||
239 | raw_spin_unlock(&rnp->lock); | ||
240 | |||
241 | /* | ||
242 | * Report the quiescent state for the expedited GP. This expedited | ||
243 | * GP should not be able to end until we report, so there should be | ||
244 | * no need to check for a subsequent expedited GP. (Though we are | ||
245 | * still in a quiescent state in any case.) | ||
246 | */ | ||
247 | if (blkd_state & RCU_EXP_BLKD && | ||
248 | t->rcu_read_unlock_special.b.exp_need_qs) { | ||
249 | t->rcu_read_unlock_special.b.exp_need_qs = false; | ||
250 | rcu_report_exp_rdp(rdp->rsp, rdp, true); | ||
251 | } else { | ||
252 | WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); | ||
253 | } | ||
254 | local_irq_restore(flags); | ||
255 | } | ||
256 | |||
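The switch in rcu_preempt_ctxt_queue() keys off a four-bit state built from two "a GP is already waiting on this list" flags and two "this task blocks that GP" flags; packing the conditions into a bitmask keeps the legal combinations readable. A toy version of just the encoding step (the flag values mirror the definitions above; the queuing itself is omitted):

#include <stdbool.h>
#include <stdio.h>

#define GP_TASKS  0x8	/* a normal GP is waiting on queued tasks */
#define EXP_TASKS 0x4	/* an expedited GP is waiting on queued tasks */
#define GP_BLKD   0x2	/* this task blocks the normal GP */
#define EXP_BLKD  0x1	/* this task blocks the expedited GP */

static int blkd_state(bool gp_tasks, bool exp_tasks, bool gp_blkd, bool exp_blkd)
{
	return (gp_tasks ? GP_TASKS : 0) +
	       (exp_tasks ? EXP_TASKS : 0) +
	       (gp_blkd ? GP_BLKD : 0) +
	       (exp_blkd ? EXP_BLKD : 0);
}

int main(void)
{
	/* First task to block only the normal GP: queued at the tail. */
	printf("state=%#x\n", blkd_state(false, false, true, false));
	/* Task blocking the expedited GP with exp_tasks already set:
	 * queued just after the first expedited blocker. */
	printf("state=%#x\n", blkd_state(false, true, false, true));
	return 0;
}
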
117 | /* | 257 | /* |
118 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | 258 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
119 | * that this just means that the task currently running on the CPU is | 259 | * that this just means that the task currently running on the CPU is |
@@ -125,11 +265,11 @@ static void __init rcu_bootup_announce(void) | |||
125 | */ | 265 | */ |
126 | static void rcu_preempt_qs(void) | 266 | static void rcu_preempt_qs(void) |
127 | { | 267 | { |
128 | if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { | 268 | if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { |
129 | trace_rcu_grace_period(TPS("rcu_preempt"), | 269 | trace_rcu_grace_period(TPS("rcu_preempt"), |
130 | __this_cpu_read(rcu_data_p->gpnum), | 270 | __this_cpu_read(rcu_data_p->gpnum), |
131 | TPS("cpuqs")); | 271 | TPS("cpuqs")); |
132 | __this_cpu_write(rcu_data_p->passed_quiesce, 1); | 272 | __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); |
133 | barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ | 273 | barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ |
134 | current->rcu_read_unlock_special.b.need_qs = false; | 274 | current->rcu_read_unlock_special.b.need_qs = false; |
135 | } | 275 | } |
@@ -167,42 +307,18 @@ static void rcu_preempt_note_context_switch(void) | |||
167 | t->rcu_blocked_node = rnp; | 307 | t->rcu_blocked_node = rnp; |
168 | 308 | ||
169 | /* | 309 | /* |
170 | * If this CPU has already checked in, then this task | 310 | * Verify the CPU's sanity, trace the preemption, and |
171 | * will hold up the next grace period rather than the | 311 | * then queue the task as required based on the states |
172 | * current grace period. Queue the task accordingly. | 312 | * of any ongoing and expedited grace periods. |
173 | * If the task is queued for the current grace period | ||
174 | * (i.e., this CPU has not yet passed through a quiescent | ||
175 | * state for the current grace period), then as long | ||
176 | * as that task remains queued, the current grace period | ||
177 | * cannot end. Note that there is some uncertainty as | ||
178 | * to exactly when the current grace period started. | ||
179 | * We take a conservative approach, which can result | ||
180 | * in unnecessarily waiting on tasks that started very | ||
181 | * slightly after the current grace period began. C'est | ||
182 | * la vie!!! | ||
183 | * | ||
184 | * But first, note that the current CPU must still be | ||
185 | * on line! | ||
186 | */ | 313 | */ |
187 | WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); | 314 | WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); |
188 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 315 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
189 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | ||
190 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | ||
191 | rnp->gp_tasks = &t->rcu_node_entry; | ||
192 | if (IS_ENABLED(CONFIG_RCU_BOOST) && | ||
193 | rnp->boost_tasks != NULL) | ||
194 | rnp->boost_tasks = rnp->gp_tasks; | ||
195 | } else { | ||
196 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
197 | if (rnp->qsmask & rdp->grpmask) | ||
198 | rnp->gp_tasks = &t->rcu_node_entry; | ||
199 | } | ||
200 | trace_rcu_preempt_task(rdp->rsp->name, | 316 | trace_rcu_preempt_task(rdp->rsp->name, |
201 | t->pid, | 317 | t->pid, |
202 | (rnp->qsmask & rdp->grpmask) | 318 | (rnp->qsmask & rdp->grpmask) |
203 | ? rnp->gpnum | 319 | ? rnp->gpnum |
204 | : rnp->gpnum + 1); | 320 | : rnp->gpnum + 1); |
205 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 321 | rcu_preempt_ctxt_queue(rnp, rdp, flags); |
206 | } else if (t->rcu_read_lock_nesting < 0 && | 322 | } else if (t->rcu_read_lock_nesting < 0 && |
207 | t->rcu_read_unlock_special.s) { | 323 | t->rcu_read_unlock_special.s) { |
208 | 324 | ||
@@ -272,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
272 | unsigned long flags; | 388 | unsigned long flags; |
273 | struct list_head *np; | 389 | struct list_head *np; |
274 | bool drop_boost_mutex = false; | 390 | bool drop_boost_mutex = false; |
391 | struct rcu_data *rdp; | ||
275 | struct rcu_node *rnp; | 392 | struct rcu_node *rnp; |
276 | union rcu_special special; | 393 | union rcu_special special; |
277 | 394 | ||
@@ -282,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
282 | local_irq_save(flags); | 399 | local_irq_save(flags); |
283 | 400 | ||
284 | /* | 401 | /* |
285 | * If RCU core is waiting for this CPU to exit critical section, | 402 | * If RCU core is waiting for this CPU to exit its critical section, |
286 | * let it know that we have done so. Because irqs are disabled, | 403 | * report the fact that it has exited. Because irqs are disabled, |
287 | * t->rcu_read_unlock_special cannot change. | 404 | * t->rcu_read_unlock_special cannot change. |
288 | */ | 405 | */ |
289 | special = t->rcu_read_unlock_special; | 406 | special = t->rcu_read_unlock_special; |
@@ -296,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
296 | } | 413 | } |
297 | } | 414 | } |
298 | 415 | ||
416 | /* | ||
417 | * Respond to a request for an expedited grace period, but only if | ||
418 | * we were not preempted, meaning that we were running on the same | ||
419 | * CPU throughout. If we were preempted, the exp_need_qs flag | ||
420 | * would have been cleared at the time of the first preemption, | ||
421 | * and the quiescent state would be reported when we were dequeued. | ||
422 | */ | ||
423 | if (special.b.exp_need_qs) { | ||
424 | WARN_ON_ONCE(special.b.blocked); | ||
425 | t->rcu_read_unlock_special.b.exp_need_qs = false; | ||
426 | rdp = this_cpu_ptr(rcu_state_p->rda); | ||
427 | rcu_report_exp_rdp(rcu_state_p, rdp, true); | ||
428 | if (!t->rcu_read_unlock_special.s) { | ||
429 | local_irq_restore(flags); | ||
430 | return; | ||
431 | } | ||
432 | } | ||
433 | |||
299 | /* Hardware IRQ handlers cannot block, complain if they get here. */ | 434 | /* Hardware IRQ handlers cannot block, complain if they get here. */ |
300 | if (in_irq() || in_serving_softirq()) { | 435 | if (in_irq() || in_serving_softirq()) { |
301 | lockdep_rcu_suspicious(__FILE__, __LINE__, | 436 | lockdep_rcu_suspicious(__FILE__, __LINE__, |
302 | "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); | 437 | "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); |
303 | pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", | 438 | pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", |
304 | t->rcu_read_unlock_special.s, | 439 | t->rcu_read_unlock_special.s, |
305 | t->rcu_read_unlock_special.b.blocked, | 440 | t->rcu_read_unlock_special.b.blocked, |
441 | t->rcu_read_unlock_special.b.exp_need_qs, | ||
306 | t->rcu_read_unlock_special.b.need_qs); | 442 | t->rcu_read_unlock_special.b.need_qs); |
307 | local_irq_restore(flags); | 443 | local_irq_restore(flags); |
308 | return; | 444 | return; |
@@ -329,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
329 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 465 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
330 | } | 466 | } |
331 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | 467 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); |
332 | empty_exp = !rcu_preempted_readers_exp(rnp); | 468 | empty_exp = sync_rcu_preempt_exp_done(rnp); |
333 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 469 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
334 | np = rcu_next_node_entry(t, rnp); | 470 | np = rcu_next_node_entry(t, rnp); |
335 | list_del_init(&t->rcu_node_entry); | 471 | list_del_init(&t->rcu_node_entry); |
@@ -353,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
353 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 489 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
354 | * so we must take a snapshot of the expedited state. | 490 | * so we must take a snapshot of the expedited state. |
355 | */ | 491 | */ |
356 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | 492 | empty_exp_now = sync_rcu_preempt_exp_done(rnp); |
357 | if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { | 493 | if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { |
358 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), | 494 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), |
359 | rnp->gpnum, | 495 | rnp->gpnum, |
@@ -450,6 +586,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
450 | } | 586 | } |
451 | 587 | ||
452 | /* | 588 | /* |
589 | * Scan the current list of tasks blocked within RCU read-side critical | ||
590 | * sections, printing out the tid of each that is blocking the current | ||
591 | * expedited grace period. | ||
592 | */ | ||
593 | static int rcu_print_task_exp_stall(struct rcu_node *rnp) | ||
594 | { | ||
595 | struct task_struct *t; | ||
596 | int ndetected = 0; | ||
597 | |||
598 | if (!rnp->exp_tasks) | ||
599 | return 0; | ||
600 | t = list_entry(rnp->exp_tasks->prev, | ||
601 | struct task_struct, rcu_node_entry); | ||
602 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
603 | pr_cont(" P%d", t->pid); | ||
604 | ndetected++; | ||
605 | } | ||
606 | return ndetected; | ||
607 | } | ||
608 | |||
609 | /* | ||
453 | * Check that the list of blocked tasks for the newly completed grace | 610 | * Check that the list of blocked tasks for the newly completed grace |
454 | * period is in fact empty. It is a serious bug to complete a grace | 611 | * period is in fact empty. It is a serious bug to complete a grace |
455 | * period that still has RCU readers blocked! This function must be | 612 | * period that still has RCU readers blocked! This function must be |
@@ -483,8 +640,8 @@ static void rcu_preempt_check_callbacks(void) | |||
483 | return; | 640 | return; |
484 | } | 641 | } |
485 | if (t->rcu_read_lock_nesting > 0 && | 642 | if (t->rcu_read_lock_nesting > 0 && |
486 | __this_cpu_read(rcu_data_p->qs_pending) && | 643 | __this_cpu_read(rcu_data_p->core_needs_qs) && |
487 | !__this_cpu_read(rcu_data_p->passed_quiesce)) | 644 | __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) |
488 | t->rcu_read_unlock_special.b.need_qs = true; | 645 | t->rcu_read_unlock_special.b.need_qs = true; |
489 | } | 646 | } |
490 | 647 | ||
@@ -500,7 +657,7 @@ static void rcu_preempt_do_callbacks(void) | |||
500 | /* | 657 | /* |
501 | * Queue a preemptible-RCU callback for invocation after a grace period. | 658 | * Queue a preemptible-RCU callback for invocation after a grace period. |
502 | */ | 659 | */ |
503 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 660 | void call_rcu(struct rcu_head *head, rcu_callback_t func) |
504 | { | 661 | { |
505 | __call_rcu(head, func, rcu_state_p, -1, 0); | 662 | __call_rcu(head, func, rcu_state_p, -1, 0); |
506 | } | 663 | } |
@@ -535,155 +692,41 @@ void synchronize_rcu(void) | |||
535 | } | 692 | } |
536 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 693 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
537 | 694 | ||
538 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
539 | |||
540 | /* | ||
541 | * Return non-zero if there are any tasks in RCU read-side critical | ||
542 | * sections blocking the current preemptible-RCU expedited grace period. | ||
543 | * If there is no preemptible-RCU expedited grace period currently in | ||
544 | * progress, returns zero unconditionally. | ||
545 | */ | ||
546 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) | ||
547 | { | ||
548 | return rnp->exp_tasks != NULL; | ||
549 | } | ||
550 | |||
551 | /* | ||
552 | * return non-zero if there is no RCU expedited grace period in progress | ||
553 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
554 | * tasks covered by the specified rcu_node structure have done their bit | ||
555 | * for the current expedited grace period. Works only for preemptible | ||
556 | * RCU -- other RCU implementation use other means. | ||
557 | * | ||
558 | * Caller must hold the root rcu_node's exp_funnel_mutex. | ||
559 | */ | ||
560 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
561 | { | ||
562 | return !rcu_preempted_readers_exp(rnp) && | ||
563 | READ_ONCE(rnp->expmask) == 0; | ||
564 | } | ||
565 | |||
566 | /* | ||
567 | * Report the exit from RCU read-side critical section for the last task | ||
568 | * that queued itself during or before the current expedited preemptible-RCU | ||
569 | * grace period. This event is reported either to the rcu_node structure on | ||
570 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
571 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
572 | * iteratively!) | ||
573 | * | ||
574 | * Caller must hold the root rcu_node's exp_funnel_mutex. | ||
575 | */ | ||
576 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
577 | bool wake) | ||
578 | { | ||
579 | unsigned long flags; | ||
580 | unsigned long mask; | ||
581 | |||
582 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
583 | smp_mb__after_unlock_lock(); | ||
584 | for (;;) { | ||
585 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
586 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
587 | break; | ||
588 | } | ||
589 | if (rnp->parent == NULL) { | ||
590 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
591 | if (wake) { | ||
592 | smp_mb(); /* EGP done before wake_up(). */ | ||
593 | wake_up(&sync_rcu_preempt_exp_wq); | ||
594 | } | ||
595 | break; | ||
596 | } | ||
597 | mask = rnp->grpmask; | ||
598 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
599 | rnp = rnp->parent; | ||
600 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
601 | smp_mb__after_unlock_lock(); | ||
602 | rnp->expmask &= ~mask; | ||
603 | } | ||
604 | } | ||
605 | |||
606 | /* | 695 | /* |
607 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | 696 | * Remote handler for smp_call_function_single(). If there is an |
608 | * grace period for the specified rcu_node structure, phase 1. If there | 697 | * RCU read-side critical section in effect, request that the |
609 | * are such tasks, set the ->expmask bits up the rcu_node tree and also | 698 | * next rcu_read_unlock() record the quiescent state up the |
610 | * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 | 699 | * ->expmask fields in the rcu_node tree. Otherwise, immediately |
611 | * that work is needed here. | 700 | * report the quiescent state. |
612 | * | ||
613 | * Caller must hold the root rcu_node's exp_funnel_mutex. | ||
614 | */ | 701 | */ |
615 | static void | 702 | static void sync_rcu_exp_handler(void *info) |
616 | sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) | ||
617 | { | 703 | { |
618 | unsigned long flags; | 704 | struct rcu_data *rdp; |
619 | unsigned long mask; | 705 | struct rcu_state *rsp = info; |
620 | struct rcu_node *rnp_up; | 706 | struct task_struct *t = current; |
621 | |||
622 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
623 | smp_mb__after_unlock_lock(); | ||
624 | WARN_ON_ONCE(rnp->expmask); | ||
625 | WARN_ON_ONCE(rnp->exp_tasks); | ||
626 | if (!rcu_preempt_has_tasks(rnp)) { | ||
627 | /* No blocked tasks, nothing to do. */ | ||
628 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
629 | return; | ||
630 | } | ||
631 | /* Call for Phase 2 and propagate ->expmask bits up the tree. */ | ||
632 | rnp->expmask = 1; | ||
633 | rnp_up = rnp; | ||
634 | while (rnp_up->parent) { | ||
635 | mask = rnp_up->grpmask; | ||
636 | rnp_up = rnp_up->parent; | ||
637 | if (rnp_up->expmask & mask) | ||
638 | break; | ||
639 | raw_spin_lock(&rnp_up->lock); /* irqs already off */ | ||
640 | smp_mb__after_unlock_lock(); | ||
641 | rnp_up->expmask |= mask; | ||
642 | raw_spin_unlock(&rnp_up->lock); /* irqs still off */ | ||
643 | } | ||
644 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
645 | } | ||
646 | |||
647 | /* | ||
648 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | ||
649 | * grace period for the specified rcu_node structure, phase 2. If the | ||
650 | * leaf rcu_node structure has its ->expmask field set, check for tasks. | ||
651 | * If there are some, clear ->expmask and set ->exp_tasks accordingly, | ||
652 | * then initiate RCU priority boosting. Otherwise, clear ->expmask and | ||
653 | * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, | ||
654 | * enabling rcu_read_unlock_special() to do the bit-clearing. | ||
655 | * | ||
656 | * Caller must hold the root rcu_node's exp_funnel_mutex. | ||
657 | */ | ||
658 | static void | ||
659 | sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) | ||
660 | { | ||
661 | unsigned long flags; | ||
662 | |||
663 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
664 | smp_mb__after_unlock_lock(); | ||
665 | if (!rnp->expmask) { | ||
666 | /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ | ||
667 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
668 | return; | ||
669 | } | ||
670 | |||
671 | /* Phase 1 is over. */ | ||
672 | rnp->expmask = 0; | ||
673 | 707 | ||
674 | /* | 708 | /* |
675 | * If there are still blocked tasks, set up ->exp_tasks so that | 709 | * Within an RCU read-side critical section, request that the next |
676 | * rcu_read_unlock_special() will wake us and then boost them. | 710 | * rcu_read_unlock() report. Unless this RCU read-side critical |
711 | * section has already blocked, in which case it is already set | ||
712 | * up for the expedited grace period to wait on it. | ||
677 | */ | 713 | */ |
678 | if (rcu_preempt_has_tasks(rnp)) { | 714 | if (t->rcu_read_lock_nesting > 0 && |
679 | rnp->exp_tasks = rnp->blkd_tasks.next; | 715 | !t->rcu_read_unlock_special.b.blocked) { |
680 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 716 | t->rcu_read_unlock_special.b.exp_need_qs = true; |
681 | return; | 717 | return; |
682 | } | 718 | } |
683 | 719 | ||
684 | /* No longer any blocked tasks, so undo bit setting. */ | 720 | /* |
685 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 721 | * We are either exiting an RCU read-side critical section (negative |
686 | rcu_report_exp_rnp(rsp, rnp, false); | 722 | * values of t->rcu_read_lock_nesting) or are not in one at all |
723 | * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU | ||
724 | * read-side critical section that blocked before this expedited | ||
725 | * grace period started. Either way, we can immediately report | ||
726 | * the quiescent state. | ||
727 | */ | ||
728 | rdp = this_cpu_ptr(rsp->rda); | ||
729 | rcu_report_exp_rdp(rsp, rdp, true); | ||
687 | } | 730 | } |
688 | 731 | ||
689 | /** | 732 | /** |
@@ -713,24 +756,12 @@ void synchronize_rcu_expedited(void) | |||
713 | 756 | ||
714 | rcu_exp_gp_seq_start(rsp); | 757 | rcu_exp_gp_seq_start(rsp); |
715 | 758 | ||
716 | /* force all RCU readers onto ->blkd_tasks lists. */ | 759 | /* Initialize the rcu_node tree in preparation for the wait. */ |
717 | synchronize_sched_expedited(); | 760 | sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); |
718 | |||
719 | /* | ||
720 | * Snapshot current state of ->blkd_tasks lists into ->expmask. | ||
721 | * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() | ||
722 | * to start clearing them. Doing this in one phase leads to | ||
723 | * strange races between setting and clearing bits, so just say "no"! | ||
724 | */ | ||
725 | rcu_for_each_leaf_node(rsp, rnp) | ||
726 | sync_rcu_preempt_exp_init1(rsp, rnp); | ||
727 | rcu_for_each_leaf_node(rsp, rnp) | ||
728 | sync_rcu_preempt_exp_init2(rsp, rnp); | ||
729 | 761 | ||
730 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ | 762 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
731 | rnp = rcu_get_root(rsp); | 763 | rnp = rcu_get_root(rsp); |
732 | wait_event(sync_rcu_preempt_exp_wq, | 764 | synchronize_sched_expedited_wait(rsp); |
733 | sync_rcu_preempt_exp_done(rnp)); | ||
734 | 765 | ||
735 | /* Clean up and exit. */ | 766 | /* Clean up and exit. */ |
736 | rcu_exp_gp_seq_end(rsp); | 767 | rcu_exp_gp_seq_end(rsp); |
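The rewritten expedited path above drops the two-phase ->expmask setup: sync_rcu_exp_select_cpus() IPIs the CPUs and sync_rcu_exp_handler() decides, per CPU, whether to report a quiescent state at once or defer it to the next rcu_read_unlock(). A rough userspace model of just that decision; the struct fields and helper names below are stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the per-task RCU state; not the kernel layout. */
struct fake_task {
	int rcu_read_lock_nesting;	/* >0 inside rcu_read_lock() */
	bool blocked;			/* already queued on ->blkd_tasks */
	bool exp_need_qs;		/* ask rcu_read_unlock() to report */
};

/* Model of the choice made by the IPI handler on each target CPU. */
static const char *exp_handler_decision(struct fake_task *t)
{
	if (t->rcu_read_lock_nesting > 0 && !t->blocked) {
		/* Defer: the eventual rcu_read_unlock() will report. */
		t->exp_need_qs = true;
		return "defer to rcu_read_unlock()";
	}
	/*
	 * Idle, exiting a critical section, or already blocked (and thus
	 * already tracked via ->exp_tasks): report the quiescent state now.
	 */
	return "report quiescent state immediately";
}

int main(void)
{
	struct fake_task in_cs = { .rcu_read_lock_nesting = 1 };
	struct fake_task idle  = { .rcu_read_lock_nesting = 0 };
	struct fake_task blkd  = { .rcu_read_lock_nesting = 1, .blocked = true };

	printf("reader in critical section:  %s\n", exp_handler_decision(&in_cs));
	printf("CPU not in critical section: %s\n", exp_handler_decision(&idle));
	printf("reader that already blocked: %s\n", exp_handler_decision(&blkd));
	return 0;
}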
@@ -835,6 +866,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp) | |||
835 | } | 866 | } |
836 | 867 | ||
837 | /* | 868 | /* |
869 | * Because preemptible RCU does not exist, we never have to check for | ||
870 | * tasks blocked within RCU read-side critical sections that are | ||
871 | * blocking the current expedited grace period. | ||
872 | */ | ||
873 | static int rcu_print_task_exp_stall(struct rcu_node *rnp) | ||
874 | { | ||
875 | return 0; | ||
876 | } | ||
877 | |||
878 | /* | ||
838 | * Because there is no preemptible RCU, there can be no readers blocked, | 879 | * Because there is no preemptible RCU, there can be no readers blocked, |
839 | * so there is no need to check for blocked tasks. So check only for | 880 | * so there is no need to check for blocked tasks. So check only for |
840 | * bogus qsmask values. | 881 | * bogus qsmask values. |
@@ -1702,8 +1743,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
1702 | ticks_value = rsp->gpnum - rdp->gpnum; | 1743 | ticks_value = rsp->gpnum - rdp->gpnum; |
1703 | } | 1744 | } |
1704 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | 1745 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); |
1705 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", | 1746 | pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", |
1706 | cpu, ticks_value, ticks_title, | 1747 | cpu, |
1748 | "O."[!!cpu_online(cpu)], | ||
1749 | "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], | ||
1750 | "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], | ||
1751 | ticks_value, ticks_title, | ||
1707 | atomic_read(&rdtp->dynticks) & 0xfff, | 1752 | atomic_read(&rdtp->dynticks) & 0xfff, |
1708 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | 1753 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, |
1709 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | 1754 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), |
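The new per-CPU stall line encodes three booleans as single characters with expressions such as "O."[!!cpu_online(cpu)], that is, indexing a two-character string literal with 0 or 1. A tiny standalone demonstration of the idiom:

#include <stdio.h>
#include <stdbool.h>

/*
 * "O."[!!cond] indexes a two-character string literal with 0 or 1:
 * index 0 ('O') when cond is false, index 1 ('.') when it is true,
 * so the letter flags the abnormal (false) case and '.' means "fine".
 */
int main(void)
{
	bool online = false;
	bool in_qsmaskinit = true;
	bool in_qsmaskinitnext = true;

	printf("%d-%c%c%c\n", 3,
	       "O."[!!online],
	       "o."[!!in_qsmaskinit],
	       "N."[!!in_qsmaskinitnext]);	/* prints: 3-O.. */
	return 0;
}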
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 6fc4c5ff3bb5..ef7093cc9b5c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
@@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
117 | 117 | ||
118 | if (!rdp->beenonline) | 118 | if (!rdp->beenonline) |
119 | return; | 119 | return; |
120 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", | 120 | seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", |
121 | rdp->cpu, | 121 | rdp->cpu, |
122 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 122 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
123 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), | 123 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), |
124 | rdp->passed_quiesce, | 124 | rdp->cpu_no_qs.b.norm, |
125 | rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), | 125 | rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), |
126 | rdp->qs_pending); | 126 | rdp->core_needs_qs); |
127 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 127 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
128 | atomic_read(&rdp->dynticks->dynticks), | 128 | atomic_read(&rdp->dynticks->dynticks), |
129 | rdp->dynticks->dynticks_nesting, | 129 | rdp->dynticks->dynticks_nesting, |
@@ -268,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
268 | gpnum = rsp->gpnum; | 268 | gpnum = rsp->gpnum; |
269 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", | 269 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", |
270 | ulong2long(rsp->completed), ulong2long(gpnum), | 270 | ulong2long(rsp->completed), ulong2long(gpnum), |
271 | rsp->fqs_state, | 271 | rsp->gp_state, |
272 | (long)(rsp->jiffies_force_qs - jiffies), | 272 | (long)(rsp->jiffies_force_qs - jiffies), |
273 | (int)(jiffies & 0xffff)); | 273 | (int)(jiffies & 0xffff)); |
274 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | 274 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
@@ -361,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | |||
361 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 361 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
362 | rdp->n_rcu_pending); | 362 | rdp->n_rcu_pending); |
363 | seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", | 363 | seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", |
364 | rdp->n_rp_qs_pending, | 364 | rdp->n_rp_core_needs_qs, |
365 | rdp->n_rp_report_qs, | 365 | rdp->n_rp_report_qs, |
366 | rdp->n_rp_cb_ready, | 366 | rdp->n_rp_cb_ready, |
367 | rdp->n_rp_cpu_needs_gp); | 367 | rdp->n_rp_cpu_needs_gp); |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 7a0b3bc7c5ed..5f748c5a40f0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
@@ -534,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void); | |||
534 | * Post an RCU-tasks callback. First call must be from process context | 534 | * Post an RCU-tasks callback. First call must be from process context |
535 | * after the scheduler is fully operational. | 535 | * after the scheduler is fully operational. |
536 | */ | 536 | */ |
537 | void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) | 537 | void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) |
538 | { | 538 | { |
539 | unsigned long flags; | 539 | unsigned long flags; |
540 | bool needwake; | 540 | bool needwake; |
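Several hunks in this series swap the spelled-out function-pointer parameter for rcu_callback_t. Assuming the typedef has the obvious shape (a pointer to a function taking a struct rcu_head *), the change is purely about readable signatures; a self-contained sketch:

#include <stdio.h>

/* Minimal stand-ins; the real definitions live in the kernel headers. */
struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};

/* Presumed shape of the typedef used by the new prototypes. */
typedef void (*rcu_callback_t)(struct rcu_head *head);

/* Before: the function-pointer type is spelled out in every prototype. */
static void post_callback_old(struct rcu_head *head,
			      void (*func)(struct rcu_head *head))
{
	head->func = func;
}

/* After: the same signature, shorter and harder to get subtly wrong. */
static void post_callback_new(struct rcu_head *head, rcu_callback_t func)
{
	head->func = func;
}

static void my_cb(struct rcu_head *head)
{
	printf("callback invoked for %p\n", (void *)head);
}

int main(void)
{
	struct rcu_head rh = { 0 };

	post_callback_old(&rh, my_cb);
	post_callback_new(&rh, my_cb);
	rh.func(&rh);
	return 0;
}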
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f9c92884817..4d568ac9319e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p) | |||
817 | /* | 817 | /* |
818 | * SCHED_IDLE tasks get minimal weight: | 818 | * SCHED_IDLE tasks get minimal weight: |
819 | */ | 819 | */ |
820 | if (p->policy == SCHED_IDLE) { | 820 | if (idle_policy(p->policy)) { |
821 | load->weight = scale_load(WEIGHT_IDLEPRIO); | 821 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
822 | load->inv_weight = WMULT_IDLEPRIO; | 822 | load->inv_weight = WMULT_IDLEPRIO; |
823 | return; | 823 | return; |
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p) | |||
827 | load->inv_weight = prio_to_wmult[prio]; | 827 | load->inv_weight = prio_to_wmult[prio]; |
828 | } | 828 | } |
829 | 829 | ||
830 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 830 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
831 | { | 831 | { |
832 | update_rq_clock(rq); | 832 | update_rq_clock(rq); |
833 | sched_info_queued(rq, p); | 833 | if (!(flags & ENQUEUE_RESTORE)) |
834 | sched_info_queued(rq, p); | ||
834 | p->sched_class->enqueue_task(rq, p, flags); | 835 | p->sched_class->enqueue_task(rq, p, flags); |
835 | } | 836 | } |
836 | 837 | ||
837 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 838 | static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
838 | { | 839 | { |
839 | update_rq_clock(rq); | 840 | update_rq_clock(rq); |
840 | sched_info_dequeued(rq, p); | 841 | if (!(flags & DEQUEUE_SAVE)) |
842 | sched_info_dequeued(rq, p); | ||
841 | p->sched_class->dequeue_task(rq, p, flags); | 843 | p->sched_class->dequeue_task(rq, p, flags); |
842 | } | 844 | } |
843 | 845 | ||
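The DEQUEUE_SAVE / ENQUEUE_RESTORE flags added above let a dequeue/enqueue pair that merely brackets an attribute change (priority, nice, cgroup, NUMA node) skip the sched_info hooks, so the round trip is not accounted as a real sleep and wakeup. A toy model of the flag check; names and flag values are illustrative only:

#include <stdio.h>

#define ENQUEUE_RESTORE	0x01	/* illustrative values, not the kernel's */
#define DEQUEUE_SAVE	0x01

static int queued_events;	/* stands in for sched_info bookkeeping */
static int on_rq;

static void enqueue_task(int flags)
{
	if (!(flags & ENQUEUE_RESTORE))
		queued_events++;	/* only genuine wakeups are counted */
	on_rq = 1;
}

static void dequeue_task(int flags)
{
	if (!(flags & DEQUEUE_SAVE))
		queued_events--;	/* only genuine sleeps are counted */
	on_rq = 0;
}

int main(void)
{
	enqueue_task(0);		/* real wakeup */
	dequeue_task(DEQUEUE_SAVE);	/* bracket an attribute change */
	/* ... change priority / cgroup here ... */
	enqueue_task(ENQUEUE_RESTORE);
	printf("events=%d on_rq=%d\n", queued_events, on_rq); /* events=1 on_rq=1 */
	return 0;
}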
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1178 | * holding rq->lock. | 1180 | * holding rq->lock. |
1179 | */ | 1181 | */ |
1180 | lockdep_assert_held(&rq->lock); | 1182 | lockdep_assert_held(&rq->lock); |
1181 | dequeue_task(rq, p, 0); | 1183 | dequeue_task(rq, p, DEQUEUE_SAVE); |
1182 | } | 1184 | } |
1183 | if (running) | 1185 | if (running) |
1184 | put_prev_task(rq, p); | 1186 | put_prev_task(rq, p); |
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
1188 | if (running) | 1190 | if (running) |
1189 | p->sched_class->set_curr_task(rq); | 1191 | p->sched_class->set_curr_task(rq); |
1190 | if (queued) | 1192 | if (queued) |
1191 | enqueue_task(rq, p, 0); | 1193 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
1192 | } | 1194 | } |
1193 | 1195 | ||
1194 | /* | 1196 | /* |
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1292 | 1294 | ||
1293 | if (task_cpu(p) != new_cpu) { | 1295 | if (task_cpu(p) != new_cpu) { |
1294 | if (p->sched_class->migrate_task_rq) | 1296 | if (p->sched_class->migrate_task_rq) |
1295 | p->sched_class->migrate_task_rq(p, new_cpu); | 1297 | p->sched_class->migrate_task_rq(p); |
1296 | p->se.nr_migrations++; | 1298 | p->se.nr_migrations++; |
1297 | perf_event_task_migrate(p); | 1299 | perf_event_task_migrate(p); |
1298 | } | 1300 | } |
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data) | |||
1333 | struct rq *src_rq, *dst_rq; | 1335 | struct rq *src_rq, *dst_rq; |
1334 | int ret = -EAGAIN; | 1336 | int ret = -EAGAIN; |
1335 | 1337 | ||
1338 | if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) | ||
1339 | return -EAGAIN; | ||
1340 | |||
1336 | src_rq = cpu_rq(arg->src_cpu); | 1341 | src_rq = cpu_rq(arg->src_cpu); |
1337 | dst_rq = cpu_rq(arg->dst_cpu); | 1342 | dst_rq = cpu_rq(arg->dst_cpu); |
1338 | 1343 | ||
1339 | double_raw_lock(&arg->src_task->pi_lock, | 1344 | double_raw_lock(&arg->src_task->pi_lock, |
1340 | &arg->dst_task->pi_lock); | 1345 | &arg->dst_task->pi_lock); |
1341 | double_rq_lock(src_rq, dst_rq); | 1346 | double_rq_lock(src_rq, dst_rq); |
1347 | |||
1342 | if (task_cpu(arg->dst_task) != arg->dst_cpu) | 1348 | if (task_cpu(arg->dst_task) != arg->dst_cpu) |
1343 | goto unlock; | 1349 | goto unlock; |
1344 | 1350 | ||
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1574 | goto out; | 1580 | goto out; |
1575 | } | 1581 | } |
1576 | 1582 | ||
1583 | /* No more Mr. Nice Guy. */ | ||
1577 | switch (state) { | 1584 | switch (state) { |
1578 | case cpuset: | 1585 | case cpuset: |
1579 | /* No more Mr. Nice Guy. */ | 1586 | if (IS_ENABLED(CONFIG_CPUSETS)) { |
1580 | cpuset_cpus_allowed_fallback(p); | 1587 | cpuset_cpus_allowed_fallback(p); |
1581 | state = possible; | 1588 | state = possible; |
1582 | break; | 1589 | break; |
1583 | 1590 | } | |
1591 | /* fall-through */ | ||
1584 | case possible: | 1592 | case possible: |
1585 | do_set_cpus_allowed(p, cpu_possible_mask); | 1593 | do_set_cpus_allowed(p, cpu_possible_mask); |
1586 | state = fail; | 1594 | state = fail; |
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) | |||
1692 | #endif /* CONFIG_SCHEDSTATS */ | 1700 | #endif /* CONFIG_SCHEDSTATS */ |
1693 | } | 1701 | } |
1694 | 1702 | ||
1695 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | 1703 | static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) |
1696 | { | 1704 | { |
1697 | activate_task(rq, p, en_flags); | 1705 | activate_task(rq, p, en_flags); |
1698 | p->on_rq = TASK_ON_RQ_QUEUED; | 1706 | p->on_rq = TASK_ON_RQ_QUEUED; |
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2114 | #endif /* CONFIG_NUMA_BALANCING */ | 2122 | #endif /* CONFIG_NUMA_BALANCING */ |
2115 | } | 2123 | } |
2116 | 2124 | ||
2125 | DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); | ||
2126 | |||
2117 | #ifdef CONFIG_NUMA_BALANCING | 2127 | #ifdef CONFIG_NUMA_BALANCING |
2118 | #ifdef CONFIG_SCHED_DEBUG | 2128 | |
2119 | void set_numabalancing_state(bool enabled) | 2129 | void set_numabalancing_state(bool enabled) |
2120 | { | 2130 | { |
2121 | if (enabled) | 2131 | if (enabled) |
2122 | sched_feat_set("NUMA"); | 2132 | static_branch_enable(&sched_numa_balancing); |
2123 | else | 2133 | else |
2124 | sched_feat_set("NO_NUMA"); | 2134 | static_branch_disable(&sched_numa_balancing); |
2125 | } | 2135 | } |
2126 | #else | ||
2127 | __read_mostly bool numabalancing_enabled; | ||
2128 | |||
2129 | void set_numabalancing_state(bool enabled) | ||
2130 | { | ||
2131 | numabalancing_enabled = enabled; | ||
2132 | } | ||
2133 | #endif /* CONFIG_SCHED_DEBUG */ | ||
2134 | 2136 | ||
2135 | #ifdef CONFIG_PROC_SYSCTL | 2137 | #ifdef CONFIG_PROC_SYSCTL |
2136 | int sysctl_numa_balancing(struct ctl_table *table, int write, | 2138 | int sysctl_numa_balancing(struct ctl_table *table, int write, |
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, | |||
2138 | { | 2140 | { |
2139 | struct ctl_table t; | 2141 | struct ctl_table t; |
2140 | int err; | 2142 | int err; |
2141 | int state = numabalancing_enabled; | 2143 | int state = static_branch_likely(&sched_numa_balancing); |
2142 | 2144 | ||
2143 | if (write && !capable(CAP_SYS_ADMIN)) | 2145 | if (write && !capable(CAP_SYS_ADMIN)) |
2144 | return -EPERM; | 2146 | return -EPERM; |
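DEFINE_STATIC_KEY_FALSE() plus static_branch_likely() replaces the old sched_feat()/bool plumbing: the read side becomes a patched jump rather than a memory load, and set_numabalancing_state() simply flips the key. The code patching itself cannot be shown in userspace, so the sketch below only models the semantics the callers rely on; the struct and helpers are stand-ins:

#include <stdbool.h>
#include <stdio.h>

/*
 * Functional model only: a real static key patches the branch site with
 * a jump label; here it is just a bool behind the same three operations.
 */
struct static_key_false { bool enabled; };

#define DEFINE_STATIC_KEY_FALSE(name) struct static_key_false name = { false }

static void static_branch_enable(struct static_key_false *k)  { k->enabled = true; }
static void static_branch_disable(struct static_key_false *k) { k->enabled = false; }
static bool static_branch_likely(struct static_key_false *k)  { return k->enabled; }

DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);

static void set_numabalancing_state(bool enabled)
{
	if (enabled)
		static_branch_enable(&sched_numa_balancing);
	else
		static_branch_disable(&sched_numa_balancing);
}

static void task_numa_fault(void)
{
	if (!static_branch_likely(&sched_numa_balancing))
		return;			/* fast path when disabled */
	printf("NUMA fault accounting runs\n");
}

int main(void)
{
	task_numa_fault();		/* disabled: prints nothing */
	set_numabalancing_state(true);
	task_numa_fault();		/* enabled: does the work */
	return 0;
}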
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p) | |||
2349 | struct rq *rq; | 2351 | struct rq *rq; |
2350 | 2352 | ||
2351 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2353 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2354 | /* Initialize new task's runnable average */ | ||
2355 | init_entity_runnable_average(&p->se); | ||
2352 | #ifdef CONFIG_SMP | 2356 | #ifdef CONFIG_SMP |
2353 | /* | 2357 | /* |
2354 | * Fork balancing, do it here and not earlier because: | 2358 | * Fork balancing, do it here and not earlier because: |
@@ -2358,16 +2362,21 @@ void wake_up_new_task(struct task_struct *p) | |||
2358 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2362 | set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2359 | #endif | 2363 | #endif |
2360 | 2364 | ||
2361 | /* Initialize new task's runnable average */ | ||
2362 | init_entity_runnable_average(&p->se); | ||
2363 | rq = __task_rq_lock(p); | 2365 | rq = __task_rq_lock(p); |
2364 | activate_task(rq, p, 0); | 2366 | activate_task(rq, p, 0); |
2365 | p->on_rq = TASK_ON_RQ_QUEUED; | 2367 | p->on_rq = TASK_ON_RQ_QUEUED; |
2366 | trace_sched_wakeup_new(p); | 2368 | trace_sched_wakeup_new(p); |
2367 | check_preempt_curr(rq, p, WF_FORK); | 2369 | check_preempt_curr(rq, p, WF_FORK); |
2368 | #ifdef CONFIG_SMP | 2370 | #ifdef CONFIG_SMP |
2369 | if (p->sched_class->task_woken) | 2371 | if (p->sched_class->task_woken) { |
2372 | /* | ||
2373 | * Nothing relies on rq->lock after this, so it's fine to | ||
2374 | * drop it. | ||
2375 | */ | ||
2376 | lockdep_unpin_lock(&rq->lock); | ||
2370 | p->sched_class->task_woken(rq, p); | 2377 | p->sched_class->task_woken(rq, p); |
2378 | lockdep_pin_lock(&rq->lock); | ||
2379 | } | ||
2371 | #endif | 2380 | #endif |
2372 | task_rq_unlock(rq, p, &flags); | 2381 | task_rq_unlock(rq, p, &flags); |
2373 | } | 2382 | } |
@@ -2476,7 +2485,6 @@ static inline void | |||
2476 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 2485 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
2477 | struct task_struct *next) | 2486 | struct task_struct *next) |
2478 | { | 2487 | { |
2479 | trace_sched_switch(prev, next); | ||
2480 | sched_info_switch(rq, prev, next); | 2488 | sched_info_switch(rq, prev, next); |
2481 | perf_event_task_sched_out(prev, next); | 2489 | perf_event_task_sched_out(prev, next); |
2482 | fire_sched_out_preempt_notifiers(prev, next); | 2490 | fire_sched_out_preempt_notifiers(prev, next); |
@@ -2510,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
2510 | struct mm_struct *mm = rq->prev_mm; | 2518 | struct mm_struct *mm = rq->prev_mm; |
2511 | long prev_state; | 2519 | long prev_state; |
2512 | 2520 | ||
2521 | /* | ||
2522 | * The previous task will have left us with a preempt_count of 2 | ||
2523 | * because it left us after: | ||
2524 | * | ||
2525 | * schedule() | ||
2526 | * preempt_disable(); // 1 | ||
2527 | * __schedule() | ||
2528 | * raw_spin_lock_irq(&rq->lock) // 2 | ||
2529 | * | ||
2530 | * Also, see FORK_PREEMPT_COUNT. | ||
2531 | */ | ||
2532 | if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, | ||
2533 | "corrupted preempt_count: %s/%d/0x%x\n", | ||
2534 | current->comm, current->pid, preempt_count())) | ||
2535 | preempt_count_set(FORK_PREEMPT_COUNT); | ||
2536 | |||
2513 | rq->prev_mm = NULL; | 2537 | rq->prev_mm = NULL; |
2514 | 2538 | ||
2515 | /* | 2539 | /* |
@@ -2517,11 +2541,11 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
2517 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls | 2541 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls |
2518 | * schedule one last time. The schedule call will never return, and | 2542 | * schedule one last time. The schedule call will never return, and |
2519 | * the scheduled task must drop that reference. | 2543 | * the scheduled task must drop that reference. |
2520 | * The test for TASK_DEAD must occur while the runqueue locks are | 2544 | * |
2521 | * still held, otherwise prev could be scheduled on another cpu, die | 2545 | * We must observe prev->state before clearing prev->on_cpu (in |
2522 | * there before we look at prev->state, and then the reference would | 2546 | * finish_lock_switch), otherwise a concurrent wakeup can get prev |
2523 | * be dropped twice. | 2547 | * running on another CPU and we could race with its RUNNING -> DEAD |
2524 | * Manfred Spraul <manfred@colorfullife.com> | 2548 | * transition, resulting in a double drop. |
2525 | */ | 2549 | */ |
2526 | prev_state = prev->state; | 2550 | prev_state = prev->state; |
2527 | vtime_task_switch(prev); | 2551 | vtime_task_switch(prev); |
@@ -2594,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
2594 | { | 2618 | { |
2595 | struct rq *rq; | 2619 | struct rq *rq; |
2596 | 2620 | ||
2597 | /* finish_task_switch() drops rq->lock and enables preemtion */ | 2621 | /* |
2598 | preempt_disable(); | 2622 | * New tasks start with FORK_PREEMPT_COUNT, see there and |
2623 | * finish_task_switch() for details. | ||
2624 | * | ||
2625 | * finish_task_switch() will drop rq->lock() and lower preempt_count | ||
2626 | * and the preempt_enable() will end up enabling preemption (on | ||
2627 | * PREEMPT_COUNT kernels). | ||
2628 | */ | ||
2629 | |||
2599 | rq = finish_task_switch(prev); | 2630 | rq = finish_task_switch(prev); |
2600 | balance_callback(rq); | 2631 | balance_callback(rq); |
2601 | preempt_enable(); | 2632 | preempt_enable(); |
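The WARN_ONCE() added to finish_task_switch() above checks that the previous task left __schedule() with exactly two increments on its preempt count: one from schedule()'s preempt_disable() and one from taking rq->lock. A small model of that arithmetic, assuming PREEMPT_DISABLE_OFFSET is 1 as on preemption-counting kernels:

#include <assert.h>
#include <stdio.h>

#define PREEMPT_DISABLE_OFFSET 1	/* assumed value for CONFIG_PREEMPT_COUNT */

static int preempt_count;

static void preempt_disable(void)   { preempt_count += PREEMPT_DISABLE_OFFSET; }
static void raw_spin_lock_irq(void) { preempt_count += PREEMPT_DISABLE_OFFSET; }

int main(void)
{
	preempt_disable();	/* schedule(): preempt_disable()      -> 1 */
	raw_spin_lock_irq();	/* __schedule(): rq->lock acquisition -> 2 */

	/* This is the invariant finish_task_switch() now checks. */
	assert(preempt_count == 2 * PREEMPT_DISABLE_OFFSET);
	printf("preempt_count at finish_task_switch(): %d\n", preempt_count);
	return 0;
}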
@@ -2953,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
2953 | static inline void schedule_debug(struct task_struct *prev) | 2984 | static inline void schedule_debug(struct task_struct *prev) |
2954 | { | 2985 | { |
2955 | #ifdef CONFIG_SCHED_STACK_END_CHECK | 2986 | #ifdef CONFIG_SCHED_STACK_END_CHECK |
2956 | BUG_ON(unlikely(task_stack_end_corrupted(prev))); | 2987 | BUG_ON(task_stack_end_corrupted(prev)); |
2957 | #endif | 2988 | #endif |
2958 | /* | 2989 | |
2959 | * Test if we are atomic. Since do_exit() needs to call into | 2990 | if (unlikely(in_atomic_preempt_off())) { |
2960 | * schedule() atomically, we ignore that path. Otherwise whine | ||
2961 | * if we are scheduling when we should not. | ||
2962 | */ | ||
2963 | if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) | ||
2964 | __schedule_bug(prev); | 2991 | __schedule_bug(prev); |
2992 | preempt_count_set(PREEMPT_DISABLED); | ||
2993 | } | ||
2965 | rcu_sleep_check(); | 2994 | rcu_sleep_check(); |
2966 | 2995 | ||
2967 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 2996 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
@@ -3047,7 +3076,7 @@ again: | |||
3047 | * | 3076 | * |
3048 | * WARNING: must be called with preemption disabled! | 3077 | * WARNING: must be called with preemption disabled! |
3049 | */ | 3078 | */ |
3050 | static void __sched __schedule(void) | 3079 | static void __sched notrace __schedule(bool preempt) |
3051 | { | 3080 | { |
3052 | struct task_struct *prev, *next; | 3081 | struct task_struct *prev, *next; |
3053 | unsigned long *switch_count; | 3082 | unsigned long *switch_count; |
@@ -3059,6 +3088,17 @@ static void __sched __schedule(void) | |||
3059 | rcu_note_context_switch(); | 3088 | rcu_note_context_switch(); |
3060 | prev = rq->curr; | 3089 | prev = rq->curr; |
3061 | 3090 | ||
3091 | /* | ||
3092 | * do_exit() calls schedule() with preemption disabled as an exception; | ||
3093 | * however we must fix that up, otherwise the next task will see an | ||
3094 | * inconsistent (higher) preempt count. | ||
3095 | * | ||
3096 | * It also avoids the below schedule_debug() test from complaining | ||
3097 | * about this. | ||
3098 | */ | ||
3099 | if (unlikely(prev->state == TASK_DEAD)) | ||
3100 | preempt_enable_no_resched_notrace(); | ||
3101 | |||
3062 | schedule_debug(prev); | 3102 | schedule_debug(prev); |
3063 | 3103 | ||
3064 | if (sched_feat(HRTICK)) | 3104 | if (sched_feat(HRTICK)) |
@@ -3076,7 +3116,7 @@ static void __sched __schedule(void) | |||
3076 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | 3116 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ |
3077 | 3117 | ||
3078 | switch_count = &prev->nivcsw; | 3118 | switch_count = &prev->nivcsw; |
3079 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3119 | if (!preempt && prev->state) { |
3080 | if (unlikely(signal_pending_state(prev->state, prev))) { | 3120 | if (unlikely(signal_pending_state(prev->state, prev))) { |
3081 | prev->state = TASK_RUNNING; | 3121 | prev->state = TASK_RUNNING; |
3082 | } else { | 3122 | } else { |
@@ -3112,6 +3152,7 @@ static void __sched __schedule(void) | |||
3112 | rq->curr = next; | 3152 | rq->curr = next; |
3113 | ++*switch_count; | 3153 | ++*switch_count; |
3114 | 3154 | ||
3155 | trace_sched_switch(preempt, prev, next); | ||
3115 | rq = context_switch(rq, prev, next); /* unlocks the rq */ | 3156 | rq = context_switch(rq, prev, next); /* unlocks the rq */ |
3116 | cpu = cpu_of(rq); | 3157 | cpu = cpu_of(rq); |
3117 | } else { | 3158 | } else { |
@@ -3141,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void) | |||
3141 | sched_submit_work(tsk); | 3182 | sched_submit_work(tsk); |
3142 | do { | 3183 | do { |
3143 | preempt_disable(); | 3184 | preempt_disable(); |
3144 | __schedule(); | 3185 | __schedule(false); |
3145 | sched_preempt_enable_no_resched(); | 3186 | sched_preempt_enable_no_resched(); |
3146 | } while (need_resched()); | 3187 | } while (need_resched()); |
3147 | } | 3188 | } |
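With the new preempt argument replacing the PREEMPT_ACTIVE test, whether a non-running prev is taken off the runqueue depends only on how __schedule() was entered: voluntary callers pass false and may deactivate it, every preemption path passes true and must not. A one-function sketch of that predicate (names invented for illustration):

#include <stdbool.h>
#include <stdio.h>

#define TASK_RUNNING 0	/* prev->state == 0 means "still runnable" */

/*
 * Sketch of the test that used to be
 *     prev->state && !(preempt_count() & PREEMPT_ACTIVE)
 * and is now simply
 *     !preempt && prev->state
 */
static bool should_deactivate(bool preempt, long prev_state)
{
	return !preempt && prev_state != TASK_RUNNING;
}

int main(void)
{
	/* Voluntary schedule() after setting TASK_INTERRUPTIBLE: sleep. */
	printf("%d\n", should_deactivate(false, 1));	/* 1: dequeue prev */
	/* Preemption while the task was part-way into going to sleep. */
	printf("%d\n", should_deactivate(true, 1));	/* 0: keep it queued */
	/* Either entry with a runnable task: never dequeued. */
	printf("%d\n", should_deactivate(false, 0));	/* 0 */
	return 0;
}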
@@ -3181,9 +3222,9 @@ void __sched schedule_preempt_disabled(void) | |||
3181 | static void __sched notrace preempt_schedule_common(void) | 3222 | static void __sched notrace preempt_schedule_common(void) |
3182 | { | 3223 | { |
3183 | do { | 3224 | do { |
3184 | preempt_active_enter(); | 3225 | preempt_disable_notrace(); |
3185 | __schedule(); | 3226 | __schedule(true); |
3186 | preempt_active_exit(); | 3227 | preempt_enable_no_resched_notrace(); |
3187 | 3228 | ||
3188 | /* | 3229 | /* |
3189 | * Check again in case we missed a preemption opportunity | 3230 | * Check again in case we missed a preemption opportunity |
@@ -3234,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) | |||
3234 | return; | 3275 | return; |
3235 | 3276 | ||
3236 | do { | 3277 | do { |
3237 | /* | 3278 | preempt_disable_notrace(); |
3238 | * Use raw __prempt_count() ops that don't call function. | ||
3239 | * We can't call functions before disabling preemption which | ||
3240 | * disarm preemption tracing recursions. | ||
3241 | */ | ||
3242 | __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); | ||
3243 | barrier(); | ||
3244 | /* | 3279 | /* |
3245 | * Needs preempt disabled in case user_exit() is traced | 3280 | * Needs preempt disabled in case user_exit() is traced |
3246 | * and the tracer calls preempt_enable_notrace() causing | 3281 | * and the tracer calls preempt_enable_notrace() causing |
3247 | * an infinite recursion. | 3282 | * an infinite recursion. |
3248 | */ | 3283 | */ |
3249 | prev_ctx = exception_enter(); | 3284 | prev_ctx = exception_enter(); |
3250 | __schedule(); | 3285 | __schedule(true); |
3251 | exception_exit(prev_ctx); | 3286 | exception_exit(prev_ctx); |
3252 | 3287 | ||
3253 | barrier(); | 3288 | preempt_enable_no_resched_notrace(); |
3254 | __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); | ||
3255 | } while (need_resched()); | 3289 | } while (need_resched()); |
3256 | } | 3290 | } |
3257 | EXPORT_SYMBOL_GPL(preempt_schedule_notrace); | 3291 | EXPORT_SYMBOL_GPL(preempt_schedule_notrace); |
@@ -3274,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) | |||
3274 | prev_state = exception_enter(); | 3308 | prev_state = exception_enter(); |
3275 | 3309 | ||
3276 | do { | 3310 | do { |
3277 | preempt_active_enter(); | 3311 | preempt_disable(); |
3278 | local_irq_enable(); | 3312 | local_irq_enable(); |
3279 | __schedule(); | 3313 | __schedule(true); |
3280 | local_irq_disable(); | 3314 | local_irq_disable(); |
3281 | preempt_active_exit(); | 3315 | sched_preempt_enable_no_resched(); |
3282 | } while (need_resched()); | 3316 | } while (need_resched()); |
3283 | 3317 | ||
3284 | exception_exit(prev_state); | 3318 | exception_exit(prev_state); |
@@ -3306,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function); | |||
3306 | */ | 3340 | */ |
3307 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3341 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3308 | { | 3342 | { |
3309 | int oldprio, queued, running, enqueue_flag = 0; | 3343 | int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; |
3310 | struct rq *rq; | 3344 | struct rq *rq; |
3311 | const struct sched_class *prev_class; | 3345 | const struct sched_class *prev_class; |
3312 | 3346 | ||
@@ -3338,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3338 | queued = task_on_rq_queued(p); | 3372 | queued = task_on_rq_queued(p); |
3339 | running = task_current(rq, p); | 3373 | running = task_current(rq, p); |
3340 | if (queued) | 3374 | if (queued) |
3341 | dequeue_task(rq, p, 0); | 3375 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3342 | if (running) | 3376 | if (running) |
3343 | put_prev_task(rq, p); | 3377 | put_prev_task(rq, p); |
3344 | 3378 | ||
@@ -3356,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3356 | if (!dl_prio(p->normal_prio) || | 3390 | if (!dl_prio(p->normal_prio) || |
3357 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { | 3391 | (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { |
3358 | p->dl.dl_boosted = 1; | 3392 | p->dl.dl_boosted = 1; |
3359 | enqueue_flag = ENQUEUE_REPLENISH; | 3393 | enqueue_flag |= ENQUEUE_REPLENISH; |
3360 | } else | 3394 | } else |
3361 | p->dl.dl_boosted = 0; | 3395 | p->dl.dl_boosted = 0; |
3362 | p->sched_class = &dl_sched_class; | 3396 | p->sched_class = &dl_sched_class; |
@@ -3364,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3364 | if (dl_prio(oldprio)) | 3398 | if (dl_prio(oldprio)) |
3365 | p->dl.dl_boosted = 0; | 3399 | p->dl.dl_boosted = 0; |
3366 | if (oldprio < prio) | 3400 | if (oldprio < prio) |
3367 | enqueue_flag = ENQUEUE_HEAD; | 3401 | enqueue_flag |= ENQUEUE_HEAD; |
3368 | p->sched_class = &rt_sched_class; | 3402 | p->sched_class = &rt_sched_class; |
3369 | } else { | 3403 | } else { |
3370 | if (dl_prio(oldprio)) | 3404 | if (dl_prio(oldprio)) |
@@ -3416,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3416 | } | 3450 | } |
3417 | queued = task_on_rq_queued(p); | 3451 | queued = task_on_rq_queued(p); |
3418 | if (queued) | 3452 | if (queued) |
3419 | dequeue_task(rq, p, 0); | 3453 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3420 | 3454 | ||
3421 | p->static_prio = NICE_TO_PRIO(nice); | 3455 | p->static_prio = NICE_TO_PRIO(nice); |
3422 | set_load_weight(p); | 3456 | set_load_weight(p); |
@@ -3425,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3425 | delta = p->prio - old_prio; | 3459 | delta = p->prio - old_prio; |
3426 | 3460 | ||
3427 | if (queued) { | 3461 | if (queued) { |
3428 | enqueue_task(rq, p, 0); | 3462 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
3429 | /* | 3463 | /* |
3430 | * If the task increased its priority or is running and | 3464 | * If the task increased its priority or is running and |
3431 | * lowered its priority, then reschedule its CPU: | 3465 | * lowered its priority, then reschedule its CPU: |
@@ -3746,10 +3780,7 @@ recheck: | |||
3746 | } else { | 3780 | } else { |
3747 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); | 3781 | reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); |
3748 | 3782 | ||
3749 | if (policy != SCHED_DEADLINE && | 3783 | if (!valid_policy(policy)) |
3750 | policy != SCHED_FIFO && policy != SCHED_RR && | ||
3751 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
3752 | policy != SCHED_IDLE) | ||
3753 | return -EINVAL; | 3784 | return -EINVAL; |
3754 | } | 3785 | } |
3755 | 3786 | ||
@@ -3805,7 +3836,7 @@ recheck: | |||
3805 | * Treat SCHED_IDLE as nice 20. Only allow a switch to | 3836 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
3806 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. | 3837 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
3807 | */ | 3838 | */ |
3808 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { | 3839 | if (idle_policy(p->policy) && !idle_policy(policy)) { |
3809 | if (!can_nice(p, task_nice(p))) | 3840 | if (!can_nice(p, task_nice(p))) |
3810 | return -EPERM; | 3841 | return -EPERM; |
3811 | } | 3842 | } |
@@ -3930,7 +3961,7 @@ change: | |||
3930 | queued = task_on_rq_queued(p); | 3961 | queued = task_on_rq_queued(p); |
3931 | running = task_current(rq, p); | 3962 | running = task_current(rq, p); |
3932 | if (queued) | 3963 | if (queued) |
3933 | dequeue_task(rq, p, 0); | 3964 | dequeue_task(rq, p, DEQUEUE_SAVE); |
3934 | if (running) | 3965 | if (running) |
3935 | put_prev_task(rq, p); | 3966 | put_prev_task(rq, p); |
3936 | 3967 | ||
@@ -3940,11 +3971,15 @@ change: | |||
3940 | if (running) | 3971 | if (running) |
3941 | p->sched_class->set_curr_task(rq); | 3972 | p->sched_class->set_curr_task(rq); |
3942 | if (queued) { | 3973 | if (queued) { |
3974 | int enqueue_flags = ENQUEUE_RESTORE; | ||
3943 | /* | 3975 | /* |
3944 | * We enqueue to tail when the priority of a task is | 3976 | * We enqueue to tail when the priority of a task is |
3945 | * increased (user space view). | 3977 | * increased (user space view). |
3946 | */ | 3978 | */ |
3947 | enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); | 3979 | if (oldprio <= p->prio) |
3980 | enqueue_flags |= ENQUEUE_HEAD; | ||
3981 | |||
3982 | enqueue_task(rq, p, enqueue_flags); | ||
3948 | } | 3983 | } |
3949 | 3984 | ||
3950 | check_class_changed(rq, p, prev_class, oldprio); | 3985 | check_class_changed(rq, p, prev_class, oldprio); |
@@ -4022,6 +4057,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, | |||
4022 | { | 4057 | { |
4023 | return _sched_setscheduler(p, policy, param, false); | 4058 | return _sched_setscheduler(p, policy, param, false); |
4024 | } | 4059 | } |
4060 | EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); | ||
4025 | 4061 | ||
4026 | static int | 4062 | static int |
4027 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4063 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
@@ -4934,7 +4970,15 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4934 | idle->state = TASK_RUNNING; | 4970 | idle->state = TASK_RUNNING; |
4935 | idle->se.exec_start = sched_clock(); | 4971 | idle->se.exec_start = sched_clock(); |
4936 | 4972 | ||
4937 | do_set_cpus_allowed(idle, cpumask_of(cpu)); | 4973 | #ifdef CONFIG_SMP |
4974 | /* | ||
4975 | * It's possible that init_idle() gets called multiple times on a task, | ||
4976 | * in that case do_set_cpus_allowed() will not do the right thing. | ||
4977 | * | ||
4978 | * And since this is boot we can forgo the serialization. | ||
4979 | */ | ||
4980 | set_cpus_allowed_common(idle, cpumask_of(cpu)); | ||
4981 | #endif | ||
4938 | /* | 4982 | /* |
4939 | * We're having a chicken and egg problem, even though we are | 4983 | * We're having a chicken and egg problem, even though we are |
4940 | * holding rq->lock, the cpu isn't yet set to this cpu so the | 4984 | * holding rq->lock, the cpu isn't yet set to this cpu so the |
@@ -4951,7 +4995,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4951 | 4995 | ||
4952 | rq->curr = rq->idle = idle; | 4996 | rq->curr = rq->idle = idle; |
4953 | idle->on_rq = TASK_ON_RQ_QUEUED; | 4997 | idle->on_rq = TASK_ON_RQ_QUEUED; |
4954 | #if defined(CONFIG_SMP) | 4998 | #ifdef CONFIG_SMP |
4955 | idle->on_cpu = 1; | 4999 | idle->on_cpu = 1; |
4956 | #endif | 5000 | #endif |
4957 | raw_spin_unlock(&rq->lock); | 5001 | raw_spin_unlock(&rq->lock); |
@@ -4966,7 +5010,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
4966 | idle->sched_class = &idle_sched_class; | 5010 | idle->sched_class = &idle_sched_class; |
4967 | ftrace_graph_init_idle_task(idle, cpu); | 5011 | ftrace_graph_init_idle_task(idle, cpu); |
4968 | vtime_init_idle(idle, cpu); | 5012 | vtime_init_idle(idle, cpu); |
4969 | #if defined(CONFIG_SMP) | 5013 | #ifdef CONFIG_SMP |
4970 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); | 5014 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); |
4971 | #endif | 5015 | #endif |
4972 | } | 5016 | } |
@@ -5085,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5085 | running = task_current(rq, p); | 5129 | running = task_current(rq, p); |
5086 | 5130 | ||
5087 | if (queued) | 5131 | if (queued) |
5088 | dequeue_task(rq, p, 0); | 5132 | dequeue_task(rq, p, DEQUEUE_SAVE); |
5089 | if (running) | 5133 | if (running) |
5090 | put_prev_task(rq, p); | 5134 | put_prev_task(rq, p); |
5091 | 5135 | ||
@@ -5094,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5094 | if (running) | 5138 | if (running) |
5095 | p->sched_class->set_curr_task(rq); | 5139 | p->sched_class->set_curr_task(rq); |
5096 | if (queued) | 5140 | if (queued) |
5097 | enqueue_task(rq, p, 0); | 5141 | enqueue_task(rq, p, ENQUEUE_RESTORE); |
5098 | task_rq_unlock(rq, p, &flags); | 5142 | task_rq_unlock(rq, p, &flags); |
5099 | } | 5143 | } |
5100 | #endif /* CONFIG_NUMA_BALANCING */ | 5144 | #endif /* CONFIG_NUMA_BALANCING */ |
@@ -5515,21 +5559,27 @@ static void set_cpu_rq_start_time(void) | |||
5515 | static int sched_cpu_active(struct notifier_block *nfb, | 5559 | static int sched_cpu_active(struct notifier_block *nfb, |
5516 | unsigned long action, void *hcpu) | 5560 | unsigned long action, void *hcpu) |
5517 | { | 5561 | { |
5562 | int cpu = (long)hcpu; | ||
5563 | |||
5518 | switch (action & ~CPU_TASKS_FROZEN) { | 5564 | switch (action & ~CPU_TASKS_FROZEN) { |
5519 | case CPU_STARTING: | 5565 | case CPU_STARTING: |
5520 | set_cpu_rq_start_time(); | 5566 | set_cpu_rq_start_time(); |
5521 | return NOTIFY_OK; | 5567 | return NOTIFY_OK; |
5568 | |||
5522 | case CPU_ONLINE: | 5569 | case CPU_ONLINE: |
5523 | /* | 5570 | /* |
5524 | * At this point a starting CPU has marked itself as online via | 5571 | * At this point a starting CPU has marked itself as online via |
5525 | * set_cpu_online(). But it might not yet have marked itself | 5572 | * set_cpu_online(). But it might not yet have marked itself |
5526 | * as active, which is essential from here on. | 5573 | * as active, which is essential from here on. |
5527 | * | ||
5528 | * Thus, fall-through and help the starting CPU along. | ||
5529 | */ | 5574 | */ |
5575 | set_cpu_active(cpu, true); | ||
5576 | stop_machine_unpark(cpu); | ||
5577 | return NOTIFY_OK; | ||
5578 | |||
5530 | case CPU_DOWN_FAILED: | 5579 | case CPU_DOWN_FAILED: |
5531 | set_cpu_active((long)hcpu, true); | 5580 | set_cpu_active(cpu, true); |
5532 | return NOTIFY_OK; | 5581 | return NOTIFY_OK; |
5582 | |||
5533 | default: | 5583 | default: |
5534 | return NOTIFY_DONE; | 5584 | return NOTIFY_DONE; |
5535 | } | 5585 | } |
@@ -6461,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6461 | { NULL, }, | 6511 | { NULL, }, |
6462 | }; | 6512 | }; |
6463 | 6513 | ||
6464 | struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6514 | static struct sched_domain_topology_level *sched_domain_topology = |
6515 | default_topology; | ||
6465 | 6516 | ||
6466 | #define for_each_sd_topology(tl) \ | 6517 | #define for_each_sd_topology(tl) \ |
6467 | for (tl = sched_domain_topology; tl->mask; tl++) | 6518 | for (tl = sched_domain_topology; tl->mask; tl++) |
@@ -7230,9 +7281,6 @@ void __init sched_init_smp(void) | |||
7230 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7281 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7231 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7282 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7232 | 7283 | ||
7233 | /* nohz_full won't take effect without isolating the cpus. */ | ||
7234 | tick_nohz_full_add_cpus_to(cpu_isolated_map); | ||
7235 | |||
7236 | sched_init_numa(); | 7284 | sched_init_numa(); |
7237 | 7285 | ||
7238 | /* | 7286 | /* |
@@ -7465,7 +7513,7 @@ void __init sched_init(void) | |||
7465 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP | 7513 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP |
7466 | static inline int preempt_count_equals(int preempt_offset) | 7514 | static inline int preempt_count_equals(int preempt_offset) |
7467 | { | 7515 | { |
7468 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 7516 | int nested = preempt_count() + rcu_preempt_depth(); |
7469 | 7517 | ||
7470 | return (nested == preempt_offset); | 7518 | return (nested == preempt_offset); |
7471 | } | 7519 | } |
@@ -7712,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7712 | queued = task_on_rq_queued(tsk); | 7760 | queued = task_on_rq_queued(tsk); |
7713 | 7761 | ||
7714 | if (queued) | 7762 | if (queued) |
7715 | dequeue_task(rq, tsk, 0); | 7763 | dequeue_task(rq, tsk, DEQUEUE_SAVE); |
7716 | if (unlikely(running)) | 7764 | if (unlikely(running)) |
7717 | put_prev_task(rq, tsk); | 7765 | put_prev_task(rq, tsk); |
7718 | 7766 | ||
@@ -7728,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7728 | 7776 | ||
7729 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7777 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7730 | if (tsk->sched_class->task_move_group) | 7778 | if (tsk->sched_class->task_move_group) |
7731 | tsk->sched_class->task_move_group(tsk, queued); | 7779 | tsk->sched_class->task_move_group(tsk); |
7732 | else | 7780 | else |
7733 | #endif | 7781 | #endif |
7734 | set_task_rq(tsk, task_cpu(tsk)); | 7782 | set_task_rq(tsk, task_cpu(tsk)); |
@@ -7736,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7736 | if (unlikely(running)) | 7784 | if (unlikely(running)) |
7737 | tsk->sched_class->set_curr_task(rq); | 7785 | tsk->sched_class->set_curr_task(rq); |
7738 | if (queued) | 7786 | if (queued) |
7739 | enqueue_task(rq, tsk, 0); | 7787 | enqueue_task(rq, tsk, ENQUEUE_RESTORE); |
7740 | 7788 | ||
7741 | task_rq_unlock(rq, tsk, &flags); | 7789 | task_rq_unlock(rq, tsk, &flags); |
7742 | } | 7790 | } |
@@ -8196,21 +8244,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, | |||
8196 | sched_move_task(task); | 8244 | sched_move_task(task); |
8197 | } | 8245 | } |
8198 | 8246 | ||
8199 | static void cpu_cgroup_exit(struct cgroup_subsys_state *css, | ||
8200 | struct cgroup_subsys_state *old_css, | ||
8201 | struct task_struct *task) | ||
8202 | { | ||
8203 | /* | ||
8204 | * cgroup_exit() is called in the copy_process() failure path. | ||
8205 | * Ignore this case since the task hasn't ran yet, this avoids | ||
8206 | * trying to poke a half freed task state from generic code. | ||
8207 | */ | ||
8208 | if (!(task->flags & PF_EXITING)) | ||
8209 | return; | ||
8210 | |||
8211 | sched_move_task(task); | ||
8212 | } | ||
8213 | |||
8214 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8247 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8215 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, | 8248 | static int cpu_shares_write_u64(struct cgroup_subsys_state *css, |
8216 | struct cftype *cftype, u64 shareval) | 8249 | struct cftype *cftype, u64 shareval) |
@@ -8542,7 +8575,6 @@ struct cgroup_subsys cpu_cgrp_subsys = { | |||
8542 | .fork = cpu_cgroup_fork, | 8575 | .fork = cpu_cgroup_fork, |
8543 | .can_attach = cpu_cgroup_can_attach, | 8576 | .can_attach = cpu_cgroup_can_attach, |
8544 | .attach = cpu_cgroup_attach, | 8577 | .attach = cpu_cgroup_attach, |
8545 | .exit = cpu_cgroup_exit, | ||
8546 | .legacy_cftypes = cpu_files, | 8578 | .legacy_cftypes = cpu_files, |
8547 | .early_init = 1, | 8579 | .early_init = 1, |
8548 | }; | 8580 | }; |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index c6acb07466bb..5a75b08cfd85 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -31,11 +31,6 @@ static inline int right_child(int i) | |||
31 | return (i << 1) + 2; | 31 | return (i << 1) + 2; |
32 | } | 32 | } |
33 | 33 | ||
34 | static inline int dl_time_before(u64 a, u64 b) | ||
35 | { | ||
36 | return (s64)(a - b) < 0; | ||
37 | } | ||
38 | |||
39 | static void cpudl_exchange(struct cpudl *cp, int a, int b) | 34 | static void cpudl_exchange(struct cpudl *cp, int a, int b) |
40 | { | 35 | { |
41 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; | 36 | int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; |
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 1a0a6ef2fbe1..fcbdf83fed7e 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
@@ -2,6 +2,7 @@ | |||
2 | #define _LINUX_CPUDL_H | 2 | #define _LINUX_CPUDL_H |
3 | 3 | ||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | #include <linux/sched/deadline.h> | ||
5 | 6 | ||
6 | #define IDX_INVALID -1 | 7 | #define IDX_INVALID -1 |
7 | 8 | ||
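The local dl_time_before() copy is dropped in favour of the shared helper that the new <linux/sched/deadline.h> include provides; the definition removed above, (s64)(a - b) < 0, is the usual wraparound-safe ordering for free-running 64-bit clocks. A standalone demonstration:

#include <stdint.h>
#include <stdio.h>

/* Same shape as the helper removed from cpudeadline.c. */
static inline int dl_time_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t t1 = 100, t2 = 200;
	/* Deadlines that straddle a 64-bit wraparound still order correctly. */
	uint64_t near_wrap = UINT64_MAX - 50, after_wrap = 25;

	printf("%d\n", dl_time_before(t1, t2));			/* 1 */
	printf("%d\n", dl_time_before(t2, t1));			/* 0 */
	printf("%d\n", dl_time_before(near_wrap, after_wrap));	/* 1: still "before" */
	printf("%d\n", near_wrap < after_wrap);	/* 0: plain < misorders across wrap */
	return 0;
}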
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8cbc3db671df..26a54461bf59 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
444 | *ut = p->utime; | 444 | *ut = p->utime; |
445 | *st = p->stime; | 445 | *st = p->stime; |
446 | } | 446 | } |
447 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); | ||
447 | 448 | ||
448 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 449 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
449 | { | 450 | { |
@@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
652 | task_cputime(p, &cputime.utime, &cputime.stime); | 653 | task_cputime(p, &cputime.utime, &cputime.stime); |
653 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); | 654 | cputime_adjust(&cputime, &p->prev_cputime, ut, st); |
654 | } | 655 | } |
656 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); | ||
655 | 657 | ||
656 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 658 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) |
657 | { | 659 | { |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc8f01083527..8b0a15e285f9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
668 | * Queueing this task back might have overloaded rq, check if we need | 668 | * Queueing this task back might have overloaded rq, check if we need |
669 | * to kick someone away. | 669 | * to kick someone away. |
670 | */ | 670 | */ |
671 | if (has_pushable_dl_tasks(rq)) | 671 | if (has_pushable_dl_tasks(rq)) { |
672 | /* | ||
673 | * Nothing relies on rq->lock after this, so it's safe to drop | ||
674 | * rq->lock. | ||
675 | */ | ||
676 | lockdep_unpin_lock(&rq->lock); | ||
672 | push_dl_task(rq); | 677 | push_dl_task(rq); |
678 | lockdep_pin_lock(&rq->lock); | ||
679 | } | ||
673 | #endif | 680 | #endif |
674 | 681 | ||
675 | unlock: | 682 | unlock: |
@@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) | |||
1066 | int target = find_later_rq(p); | 1073 | int target = find_later_rq(p); |
1067 | 1074 | ||
1068 | if (target != -1 && | 1075 | if (target != -1 && |
1069 | dl_time_before(p->dl.deadline, | 1076 | (dl_time_before(p->dl.deadline, |
1070 | cpu_rq(target)->dl.earliest_dl.curr)) | 1077 | cpu_rq(target)->dl.earliest_dl.curr) || |
1078 | (cpu_rq(target)->dl.dl_nr_running == 0))) | ||
1071 | cpu = target; | 1079 | cpu = target; |
1072 | } | 1080 | } |
1073 | rcu_read_unlock(); | 1081 | rcu_read_unlock(); |
@@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | |||
1417 | 1425 | ||
1418 | later_rq = cpu_rq(cpu); | 1426 | later_rq = cpu_rq(cpu); |
1419 | 1427 | ||
1420 | if (!dl_time_before(task->dl.deadline, | 1428 | if (later_rq->dl.dl_nr_running && |
1429 | !dl_time_before(task->dl.deadline, | ||
1421 | later_rq->dl.earliest_dl.curr)) { | 1430 | later_rq->dl.earliest_dl.curr)) { |
1422 | /* | 1431 | /* |
1423 | * Target rq has tasks of equal or earlier deadline, | 1432 | * Target rq has tasks of equal or earlier deadline, |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e2e3483b1ec..824aa9f501a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p); | |||
661 | 661 | ||
662 | /* | 662 | /* |
663 | * We choose a half-life close to 1 scheduling period. | 663 | * We choose a half-life close to 1 scheduling period. |
664 | * Note: The tables below are dependent on this value. | 664 | * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are |
665 | * dependent on this value. | ||
665 | */ | 666 | */ |
666 | #define LOAD_AVG_PERIOD 32 | 667 | #define LOAD_AVG_PERIOD 32 |
667 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ | 668 | #define LOAD_AVG_MAX 47742 /* maximum possible load avg */ |
668 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ | 669 | #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ |
669 | 670 | ||
670 | /* Give new sched_entity start runnable values to heavy its load in infant time */ | 671 | /* Give new sched_entity start runnable values to heavy its load in infant time */ |
671 | void init_entity_runnable_average(struct sched_entity *se) | 672 | void init_entity_runnable_average(struct sched_entity *se) |
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
682 | sa->load_avg = scale_load_down(se->load.weight); | 683 | sa->load_avg = scale_load_down(se->load.weight); |
683 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; | 684 | sa->load_sum = sa->load_avg * LOAD_AVG_MAX; |
684 | sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); | 685 | sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); |
685 | sa->util_sum = LOAD_AVG_MAX; | 686 | sa->util_sum = sa->util_avg * LOAD_AVG_MAX; |
686 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | 687 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ |
687 | } | 688 | } |
688 | 689 | ||
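The one-line fix above keeps the initial PELT sums consistent with their averages: a saturated sum equals avg * LOAD_AVG_MAX, and util_sum had been hard-coded to LOAD_AVG_MAX no matter what util_avg was set to. A small numeric check, taking SCHED_LOAD_SCALE as 1024 for illustration:

#include <stdio.h>

#define LOAD_AVG_MAX		47742	/* saturated geometric sum */
#define SCHED_LOAD_SCALE	1024	/* illustrative unit scale */

int main(void)
{
	unsigned long weight   = 1024;			/* nice-0 weight */
	unsigned long load_avg = weight;
	unsigned long util_avg = SCHED_LOAD_SCALE;

	/* avg and sum must satisfy avg == sum / LOAD_AVG_MAX at saturation. */
	unsigned long load_sum = load_avg * LOAD_AVG_MAX;
	unsigned long util_sum = util_avg * LOAD_AVG_MAX;	/* the fix */
	unsigned long old_util_sum = LOAD_AVG_MAX;		/* the old init */

	printf("load: %lu == %lu\n", load_avg, load_sum / LOAD_AVG_MAX);
	printf("util: %lu == %lu\n", util_avg, util_sum / LOAD_AVG_MAX);
	printf("old util init decoded back to: %lu (should be %lu)\n",
	       old_util_sum / LOAD_AVG_MAX, util_avg);	/* 1, not 1024 */
	return 0;
}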
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
2069 | int local = !!(flags & TNF_FAULT_LOCAL); | 2070 | int local = !!(flags & TNF_FAULT_LOCAL); |
2070 | int priv; | 2071 | int priv; |
2071 | 2072 | ||
2072 | if (!numabalancing_enabled) | 2073 | if (!static_branch_likely(&sched_numa_balancing)) |
2073 | return; | 2074 | return; |
2074 | 2075 | ||
2075 | /* for example, ksmd faulting in a user's mm */ | 2076 | /* for example, ksmd faulting in a user's mm */ |
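The numabalancing_enabled flag is replaced by a static key throughout this file; a minimal kernel-context sketch of the pattern with illustrative names (not buildable as a userspace program — the real key, sched_numa_balancing, is declared in kernel/sched/sched.h later in this diff):

```c
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_key);

static void do_rare_work(void) { /* ... */ }

static void hot_path(void)
{
	/* compiles to a patched jump/no-op instead of a load + test */
	if (static_branch_unlikely(&example_key))
		do_rare_work();
}

static void enable_feature(void)
{
	static_branch_enable(&example_key);	/* patches every call site */
}
```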
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work) | |||
2157 | struct vm_area_struct *vma; | 2158 | struct vm_area_struct *vma; |
2158 | unsigned long start, end; | 2159 | unsigned long start, end; |
2159 | unsigned long nr_pte_updates = 0; | 2160 | unsigned long nr_pte_updates = 0; |
2160 | long pages; | 2161 | long pages, virtpages; |
2161 | 2162 | ||
2162 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | 2163 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); |
2163 | 2164 | ||
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work) | |||
2203 | start = mm->numa_scan_offset; | 2204 | start = mm->numa_scan_offset; |
2204 | pages = sysctl_numa_balancing_scan_size; | 2205 | pages = sysctl_numa_balancing_scan_size; |
2205 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ | 2206 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ |
2207 | virtpages = pages * 8; /* Scan up to this much virtual space */ | ||
2206 | if (!pages) | 2208 | if (!pages) |
2207 | return; | 2209 | return; |
2208 | 2210 | ||
2211 | |||
2209 | down_read(&mm->mmap_sem); | 2212 | down_read(&mm->mmap_sem); |
2210 | vma = find_vma(mm, start); | 2213 | vma = find_vma(mm, start); |
2211 | if (!vma) { | 2214 | if (!vma) { |
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work) | |||
2240 | start = max(start, vma->vm_start); | 2243 | start = max(start, vma->vm_start); |
2241 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | 2244 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); |
2242 | end = min(end, vma->vm_end); | 2245 | end = min(end, vma->vm_end); |
2243 | nr_pte_updates += change_prot_numa(vma, start, end); | 2246 | nr_pte_updates = change_prot_numa(vma, start, end); |
2244 | 2247 | ||
2245 | /* | 2248 | /* |
2246 | * Scan sysctl_numa_balancing_scan_size but ensure that | 2249 | * Try to scan sysctl_numa_balancing_size worth of |
2247 | * at least one PTE is updated so that unused virtual | 2250 | * hpages that have at least one present PTE that |
2248 | * address space is quickly skipped. | 2251 | * is not already pte-numa. If the VMA contains |
2252 | * areas that are unused or already full of prot_numa | ||
2253 | * PTEs, scan up to virtpages, to skip through those | ||
2254 | * areas faster. | ||
2249 | */ | 2255 | */ |
2250 | if (nr_pte_updates) | 2256 | if (nr_pte_updates) |
2251 | pages -= (end - start) >> PAGE_SHIFT; | 2257 | pages -= (end - start) >> PAGE_SHIFT; |
2258 | virtpages -= (end - start) >> PAGE_SHIFT; | ||
2252 | 2259 | ||
2253 | start = end; | 2260 | start = end; |
2254 | if (pages <= 0) | 2261 | if (pages <= 0 || virtpages <= 0) |
2255 | goto out; | 2262 | goto out; |
2256 | 2263 | ||
2257 | cond_resched(); | 2264 | cond_resched(); |
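A worked example of the new cap, assuming the default scan size of 256MB and 4KiB pages (both are defaults, not guaranteed on every configuration):

$$
\text{pages}=\frac{256\cdot 2^{20}}{2^{12}}=65536,\qquad
\text{virtpages}=8\times 65536=524288\ \text{pages}\;=\;2\,\text{GiB of virtual address space}
$$

so a scan pass now gives up after walking 2GiB of (possibly empty or already prot_numa) virtual space, even if fewer than 256MB worth of PTEs were actually updated.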
@@ -2363,7 +2370,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | |||
2363 | */ | 2370 | */ |
2364 | tg_weight = atomic_long_read(&tg->load_avg); | 2371 | tg_weight = atomic_long_read(&tg->load_avg); |
2365 | tg_weight -= cfs_rq->tg_load_avg_contrib; | 2372 | tg_weight -= cfs_rq->tg_load_avg_contrib; |
2366 | tg_weight += cfs_rq_load_avg(cfs_rq); | 2373 | tg_weight += cfs_rq->load.weight; |
2367 | 2374 | ||
2368 | return tg_weight; | 2375 | return tg_weight; |
2369 | } | 2376 | } |
@@ -2373,7 +2380,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
2373 | long tg_weight, load, shares; | 2380 | long tg_weight, load, shares; |
2374 | 2381 | ||
2375 | tg_weight = calc_tg_weight(tg, cfs_rq); | 2382 | tg_weight = calc_tg_weight(tg, cfs_rq); |
2376 | load = cfs_rq_load_avg(cfs_rq); | 2383 | load = cfs_rq->load.weight; |
2377 | 2384 | ||
2378 | shares = (tg->shares * load); | 2385 | shares = (tg->shares * load); |
2379 | if (tg_weight) | 2386 | if (tg_weight) |
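With load now taken from the instantaneous cfs_rq->load.weight (call it $W_i$ for CPU $i$), the share handed to this cfs_rq works out to:

$$
\text{shares}_i=\frac{\text{tg->shares}\times W_i}{\text{tg\_weight}},\qquad
\text{tg\_weight}\;=\;\text{tg->load\_avg}\;-\;\text{tg\_load\_avg\_contrib}_i\;+\;W_i
$$

i.e. the group-wide sum with this CPU's possibly stale averaged contribution swapped out for its current weight; calc_cfs_shares() then clamps the result to [MIN_SHARES, tg->shares] just below the shown context.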
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n) | |||
2515 | return contrib + runnable_avg_yN_sum[n]; | 2522 | return contrib + runnable_avg_yN_sum[n]; |
2516 | } | 2523 | } |
2517 | 2524 | ||
2525 | #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10 | ||
2526 | #error "load tracking assumes 2^10 as unit" | ||
2527 | #endif | ||
2528 | |||
2529 | #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) | ||
2530 | |||
2518 | /* | 2531 | /* |
2519 | * We can represent the historical contribution to runnable average as the | 2532 | * We can represent the historical contribution to runnable average as the |
2520 | * coefficients of a geometric series. To do this we sub-divide our runnable | 2533 | * coefficients of a geometric series. To do this we sub-divide our runnable |
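cap_scale(), added just above, is the building block for the frequency/CPU invariance applied below: it scales a value by a factor expressed against SCHED_CAPACITY_SCALE (1024). A trivial standalone illustration:

```c
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define cap_scale(v, s)		((v) * (s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	unsigned long delta = 1000;	/* elapsed time, in PELT's us units */
	unsigned long scale_freq = 512;	/* CPU running at half its max frequency */

	/* 1000us of wall time only counts as 500us of max-frequency work */
	printf("%lu\n", cap_scale(delta, scale_freq));
	return 0;
}
```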
@@ -2547,10 +2560,10 @@ static __always_inline int | |||
2547 | __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | 2560 | __update_load_avg(u64 now, int cpu, struct sched_avg *sa, |
2548 | unsigned long weight, int running, struct cfs_rq *cfs_rq) | 2561 | unsigned long weight, int running, struct cfs_rq *cfs_rq) |
2549 | { | 2562 | { |
2550 | u64 delta, periods; | 2563 | u64 delta, scaled_delta, periods; |
2551 | u32 contrib; | 2564 | u32 contrib; |
2552 | int delta_w, decayed = 0; | 2565 | unsigned int delta_w, scaled_delta_w, decayed = 0; |
2553 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | 2566 | unsigned long scale_freq, scale_cpu; |
2554 | 2567 | ||
2555 | delta = now - sa->last_update_time; | 2568 | delta = now - sa->last_update_time; |
2556 | /* | 2569 | /* |
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2571 | return 0; | 2584 | return 0; |
2572 | sa->last_update_time = now; | 2585 | sa->last_update_time = now; |
2573 | 2586 | ||
2587 | scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
2588 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | ||
2589 | |||
2574 | /* delta_w is the amount already accumulated against our next period */ | 2590 | /* delta_w is the amount already accumulated against our next period */ |
2575 | delta_w = sa->period_contrib; | 2591 | delta_w = sa->period_contrib; |
2576 | if (delta + delta_w >= 1024) { | 2592 | if (delta + delta_w >= 1024) { |
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2585 | * period and accrue it. | 2601 | * period and accrue it. |
2586 | */ | 2602 | */ |
2587 | delta_w = 1024 - delta_w; | 2603 | delta_w = 1024 - delta_w; |
2604 | scaled_delta_w = cap_scale(delta_w, scale_freq); | ||
2588 | if (weight) { | 2605 | if (weight) { |
2589 | sa->load_sum += weight * delta_w; | 2606 | sa->load_sum += weight * scaled_delta_w; |
2590 | if (cfs_rq) | 2607 | if (cfs_rq) { |
2591 | cfs_rq->runnable_load_sum += weight * delta_w; | 2608 | cfs_rq->runnable_load_sum += |
2609 | weight * scaled_delta_w; | ||
2610 | } | ||
2592 | } | 2611 | } |
2593 | if (running) | 2612 | if (running) |
2594 | sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; | 2613 | sa->util_sum += scaled_delta_w * scale_cpu; |
2595 | 2614 | ||
2596 | delta -= delta_w; | 2615 | delta -= delta_w; |
2597 | 2616 | ||
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2608 | 2627 | ||
2609 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2628 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ |
2610 | contrib = __compute_runnable_contrib(periods); | 2629 | contrib = __compute_runnable_contrib(periods); |
2630 | contrib = cap_scale(contrib, scale_freq); | ||
2611 | if (weight) { | 2631 | if (weight) { |
2612 | sa->load_sum += weight * contrib; | 2632 | sa->load_sum += weight * contrib; |
2613 | if (cfs_rq) | 2633 | if (cfs_rq) |
2614 | cfs_rq->runnable_load_sum += weight * contrib; | 2634 | cfs_rq->runnable_load_sum += weight * contrib; |
2615 | } | 2635 | } |
2616 | if (running) | 2636 | if (running) |
2617 | sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; | 2637 | sa->util_sum += contrib * scale_cpu; |
2618 | } | 2638 | } |
2619 | 2639 | ||
2620 | /* Remainder of delta accrued against u_0` */ | 2640 | /* Remainder of delta accrued against u_0` */ |
2641 | scaled_delta = cap_scale(delta, scale_freq); | ||
2621 | if (weight) { | 2642 | if (weight) { |
2622 | sa->load_sum += weight * delta; | 2643 | sa->load_sum += weight * scaled_delta; |
2623 | if (cfs_rq) | 2644 | if (cfs_rq) |
2624 | cfs_rq->runnable_load_sum += weight * delta; | 2645 | cfs_rq->runnable_load_sum += weight * scaled_delta; |
2625 | } | 2646 | } |
2626 | if (running) | 2647 | if (running) |
2627 | sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; | 2648 | sa->util_sum += scaled_delta * scale_cpu; |
2628 | 2649 | ||
2629 | sa->period_contrib += delta; | 2650 | sa->period_contrib += delta; |
2630 | 2651 | ||
@@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2634 | cfs_rq->runnable_load_avg = | 2655 | cfs_rq->runnable_load_avg = |
2635 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); | 2656 | div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); |
2636 | } | 2657 | } |
2637 | sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; | 2658 | sa->util_avg = sa->util_sum / LOAD_AVG_MAX; |
2638 | } | 2659 | } |
2639 | 2660 | ||
2640 | return decayed; | 2661 | return decayed; |
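Summarizing the new accrual rules in this function, with scale_freq and scale_cpu both expressed against 1024 (so $f_{curr}/f_{max}=\text{scale\_freq}/1024$ and $C_{cpu}=\text{arch\_scale\_cpu\_capacity}()\le 1024$):

$$
\text{load\_sum}\mathrel{+}=w\cdot\delta\cdot\frac{f_{curr}}{f_{max}},\qquad
\text{util\_sum}\mathrel{+}=\delta\cdot\frac{f_{curr}}{f_{max}}\cdot C_{cpu},\qquad
\text{util\_avg}=\frac{\text{util\_sum}}{\text{LOAD\_AVG\_MAX}}\in[0,\,C_{cpu}]
$$

which is why the old << SCHED_LOAD_SHIFT disappears from util_avg: util_sum now already carries the capacity unit.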
@@ -2664,20 +2685,20 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | |||
2664 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ | 2685 | /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ |
2665 | static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | 2686 | static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) |
2666 | { | 2687 | { |
2667 | int decayed; | ||
2668 | struct sched_avg *sa = &cfs_rq->avg; | 2688 | struct sched_avg *sa = &cfs_rq->avg; |
2689 | int decayed, removed = 0; | ||
2669 | 2690 | ||
2670 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { | 2691 | if (atomic_long_read(&cfs_rq->removed_load_avg)) { |
2671 | long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); | 2692 | long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); |
2672 | sa->load_avg = max_t(long, sa->load_avg - r, 0); | 2693 | sa->load_avg = max_t(long, sa->load_avg - r, 0); |
2673 | sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); | 2694 | sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); |
2695 | removed = 1; | ||
2674 | } | 2696 | } |
2675 | 2697 | ||
2676 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { | 2698 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { |
2677 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); | 2699 | long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); |
2678 | sa->util_avg = max_t(long, sa->util_avg - r, 0); | 2700 | sa->util_avg = max_t(long, sa->util_avg - r, 0); |
2679 | sa->util_sum = max_t(s32, sa->util_sum - | 2701 | sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); |
2680 | ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0); | ||
2681 | } | 2702 | } |
2682 | 2703 | ||
2683 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | 2704 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, |
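The removed_load_avg / removed_util_avg handshake used here is a post-and-drain pattern: entities leaving the cfs_rq post their contribution with an atomic add, and the next update drains the whole batch with an exchange-to-zero, which is also what the new removed flag reports. A userspace model with C11 atomics (the kernel uses atomic_long_t, but the structure is the same):

```c
#include <stdatomic.h>
#include <stdio.h>

static atomic_long removed_load_avg;
static long load_avg = 1000;

static void remove_entity(long se_load)	/* e.g. a task migrating away */
{
	atomic_fetch_add(&removed_load_avg, se_load);
}

static int update_cfs_rq(void)		/* periodic aggregation */
{
	long r = atomic_exchange(&removed_load_avg, 0);

	if (!r)
		return 0;
	load_avg = load_avg > r ? load_avg - r : 0;
	return 1;			/* "removed": force a tg_load_avg update */
}

int main(void)
{
	remove_entity(300);
	remove_entity(200);
	printf("removed=%d load_avg=%ld\n", update_cfs_rq(), load_avg); /* 1, 500 */
	return 0;
}
```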
@@ -2688,40 +2709,77 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) | |||
2688 | cfs_rq->load_last_update_time_copy = sa->last_update_time; | 2709 | cfs_rq->load_last_update_time_copy = sa->last_update_time; |
2689 | #endif | 2710 | #endif |
2690 | 2711 | ||
2691 | return decayed; | 2712 | return decayed || removed; |
2692 | } | 2713 | } |
2693 | 2714 | ||
2694 | /* Update task and its cfs_rq load average */ | 2715 | /* Update task and its cfs_rq load average */ |
2695 | static inline void update_load_avg(struct sched_entity *se, int update_tg) | 2716 | static inline void update_load_avg(struct sched_entity *se, int update_tg) |
2696 | { | 2717 | { |
2697 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2718 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
2698 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2699 | u64 now = cfs_rq_clock_task(cfs_rq); | 2719 | u64 now = cfs_rq_clock_task(cfs_rq); |
2720 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
2700 | 2721 | ||
2701 | /* | 2722 | /* |
2702 | * Track task load average for carrying it to new CPU after migrated, and | 2723 | * Track task load average for carrying it to new CPU after migrated, and |
2703 | * track group sched_entity load average for task_h_load calc in migration | 2724 | * track group sched_entity load average for task_h_load calc in migration |
2704 | */ | 2725 | */ |
2705 | __update_load_avg(now, cpu, &se->avg, | 2726 | __update_load_avg(now, cpu, &se->avg, |
2706 | se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); | 2727 | se->on_rq * scale_load_down(se->load.weight), |
2728 | cfs_rq->curr == se, NULL); | ||
2707 | 2729 | ||
2708 | if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) | 2730 | if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) |
2709 | update_tg_load_avg(cfs_rq, 0); | 2731 | update_tg_load_avg(cfs_rq, 0); |
2710 | } | 2732 | } |
2711 | 2733 | ||
2734 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2735 | { | ||
2736 | if (!sched_feat(ATTACH_AGE_LOAD)) | ||
2737 | goto skip_aging; | ||
2738 | |||
2739 | /* | ||
2740 | * If we got migrated (either between CPUs or between cgroups) we'll | ||
2741 | * have aged the average right before clearing @last_update_time. | ||
2742 | */ | ||
2743 | if (se->avg.last_update_time) { | ||
2744 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | ||
2745 | &se->avg, 0, 0, NULL); | ||
2746 | |||
2747 | /* | ||
2748 | * XXX: we could have just aged the entire load away if we've been | ||
2749 | * absent from the fair class for too long. | ||
2750 | */ | ||
2751 | } | ||
2752 | |||
2753 | skip_aging: | ||
2754 | se->avg.last_update_time = cfs_rq->avg.last_update_time; | ||
2755 | cfs_rq->avg.load_avg += se->avg.load_avg; | ||
2756 | cfs_rq->avg.load_sum += se->avg.load_sum; | ||
2757 | cfs_rq->avg.util_avg += se->avg.util_avg; | ||
2758 | cfs_rq->avg.util_sum += se->avg.util_sum; | ||
2759 | } | ||
2760 | |||
2761 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
2762 | { | ||
2763 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | ||
2764 | &se->avg, se->on_rq * scale_load_down(se->load.weight), | ||
2765 | cfs_rq->curr == se, NULL); | ||
2766 | |||
2767 | cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); | ||
2768 | cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); | ||
2769 | cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | ||
2770 | cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | ||
2771 | } | ||
2772 | |||
2712 | /* Add the load generated by se into cfs_rq's load average */ | 2773 | /* Add the load generated by se into cfs_rq's load average */ |
2713 | static inline void | 2774 | static inline void |
2714 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2775 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2715 | { | 2776 | { |
2716 | struct sched_avg *sa = &se->avg; | 2777 | struct sched_avg *sa = &se->avg; |
2717 | u64 now = cfs_rq_clock_task(cfs_rq); | 2778 | u64 now = cfs_rq_clock_task(cfs_rq); |
2718 | int migrated = 0, decayed; | 2779 | int migrated, decayed; |
2719 | 2780 | ||
2720 | if (sa->last_update_time == 0) { | 2781 | migrated = !sa->last_update_time; |
2721 | sa->last_update_time = now; | 2782 | if (!migrated) { |
2722 | migrated = 1; | ||
2723 | } | ||
2724 | else { | ||
2725 | __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | 2783 | __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, |
2726 | se->on_rq * scale_load_down(se->load.weight), | 2784 | se->on_rq * scale_load_down(se->load.weight), |
2727 | cfs_rq->curr == se, NULL); | 2785 | cfs_rq->curr == se, NULL); |
@@ -2732,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2732 | cfs_rq->runnable_load_avg += sa->load_avg; | 2790 | cfs_rq->runnable_load_avg += sa->load_avg; |
2733 | cfs_rq->runnable_load_sum += sa->load_sum; | 2791 | cfs_rq->runnable_load_sum += sa->load_sum; |
2734 | 2792 | ||
2735 | if (migrated) { | 2793 | if (migrated) |
2736 | cfs_rq->avg.load_avg += sa->load_avg; | 2794 | attach_entity_load_avg(cfs_rq, se); |
2737 | cfs_rq->avg.load_sum += sa->load_sum; | ||
2738 | cfs_rq->avg.util_avg += sa->util_avg; | ||
2739 | cfs_rq->avg.util_sum += sa->util_sum; | ||
2740 | } | ||
2741 | 2795 | ||
2742 | if (decayed || migrated) | 2796 | if (decayed || migrated) |
2743 | update_tg_load_avg(cfs_rq, 0); | 2797 | update_tg_load_avg(cfs_rq, 0); |
@@ -2752,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
2752 | cfs_rq->runnable_load_avg = | 2806 | cfs_rq->runnable_load_avg = |
2753 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); | 2807 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); |
2754 | cfs_rq->runnable_load_sum = | 2808 | cfs_rq->runnable_load_sum = |
2755 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); | 2809 | max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); |
2756 | } | 2810 | } |
2757 | 2811 | ||
2758 | /* | 2812 | /* |
@@ -2820,6 +2874,11 @@ static inline void | |||
2820 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 2874 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
2821 | static inline void remove_entity_load_avg(struct sched_entity *se) {} | 2875 | static inline void remove_entity_load_avg(struct sched_entity *se) {} |
2822 | 2876 | ||
2877 | static inline void | ||
2878 | attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | ||
2879 | static inline void | ||
2880 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | ||
2881 | |||
2823 | static inline int idle_balance(struct rq *rq) | 2882 | static inline int idle_balance(struct rq *rq) |
2824 | { | 2883 | { |
2825 | return 0; | 2884 | return 0; |
@@ -4816,32 +4875,39 @@ next: | |||
4816 | done: | 4875 | done: |
4817 | return target; | 4876 | return target; |
4818 | } | 4877 | } |
4878 | |||
4819 | /* | 4879 | /* |
4820 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | 4880 | * cpu_util returns the amount of capacity of a CPU that is used by CFS |
4821 | * tasks. The unit of the return value must be the one of capacity so we can | 4881 | * tasks. The unit of the return value must be the one of capacity so we can |
4822 | * compare the usage with the capacity of the CPU that is available for CFS | 4882 | * compare the utilization with the capacity of the CPU that is available for |
4823 | * task (ie cpu_capacity). | 4883 | * CFS task (ie cpu_capacity). |
4824 | * cfs.avg.util_avg is the sum of running time of runnable tasks on a | 4884 | * |
4825 | * CPU. It represents the amount of utilization of a CPU in the range | 4885 | * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the |
4826 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | 4886 | * recent utilization of currently non-runnable tasks on a CPU. It represents |
4827 | * capacity of the CPU because it's about the running time on this CPU. | 4887 | * the amount of utilization of a CPU in the range [0..capacity_orig] where |
4828 | * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE | 4888 | * capacity_orig is the cpu_capacity available at the highest frequency |
4829 | * because of unfortunate rounding in util_avg or just | 4889 | * (arch_scale_freq_capacity()). |
4830 | * after migrating tasks until the average stabilizes with the new running | 4890 | * The utilization of a CPU converges towards a sum equal to or less than the |
4831 | * time. So we need to check that the usage stays into the range | 4891 | * current capacity (capacity_curr <= capacity_orig) of the CPU because it is |
4832 | * [0..cpu_capacity_orig] and cap if necessary. | 4892 | * the running time on this CPU scaled by capacity_curr. |
4833 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | 4893 | * |
4834 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | 4894 | * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even |
4895 | * higher than capacity_orig because of unfortunate rounding in | ||
4896 | * cfs.avg.util_avg or just after migrating tasks and new task wakeups until | ||
4897 | * the average stabilizes with the new running time. We need to check that the | ||
4898 | * utilization stays within the range of [0..capacity_orig] and cap it if | ||
4899 | * necessary. Without utilization capping, a group could be seen as overloaded | ||
4900 | * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of | ||
4901 | * available capacity. We allow utilization to overshoot capacity_curr (but not | ||
4902 | * capacity_orig) as it useful for predicting the capacity required after task | ||
4903 | * migrations (scheduler-driven DVFS). | ||
4835 | */ | 4904 | */ |
4836 | static int get_cpu_usage(int cpu) | 4905 | static int cpu_util(int cpu) |
4837 | { | 4906 | { |
4838 | unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; | 4907 | unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; |
4839 | unsigned long capacity = capacity_orig_of(cpu); | 4908 | unsigned long capacity = capacity_orig_of(cpu); |
4840 | 4909 | ||
4841 | if (usage >= SCHED_LOAD_SCALE) | 4910 | return (util >= capacity) ? capacity : util; |
4842 | return capacity; | ||
4843 | |||
4844 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
4845 | } | 4911 | } |
4846 | 4912 | ||
4847 | /* | 4913 | /* |
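Since util_avg is now kept directly in capacity units, the renamed helper reduces to a clamp:

$$
\text{cpu\_util}(cpu)=\min\big(\text{cfs.avg.util\_avg},\ \text{capacity\_orig}(cpu)\big)
$$

whereas the old get_cpu_usage() first capped at SCHED_LOAD_SCALE and then rescaled with (usage * capacity) >> SCHED_LOAD_SHIFT; that rescaling step is no longer needed.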
@@ -4944,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
4944 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no | 5010 | * previous cpu. However, the caller only guarantees p->pi_lock is held; no |
4945 | * other assumptions, including the state of rq->lock, should be made. | 5011 | * other assumptions, including the state of rq->lock, should be made. |
4946 | */ | 5012 | */ |
4947 | static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) | 5013 | static void migrate_task_rq_fair(struct task_struct *p) |
4948 | { | 5014 | { |
4949 | /* | 5015 | /* |
4950 | * We are supposed to update the task to "current" time, then its up to date | 5016 | * We are supposed to update the task to "current" time, then its up to date |
@@ -5524,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) | |||
5524 | unsigned long src_faults, dst_faults; | 5590 | unsigned long src_faults, dst_faults; |
5525 | int src_nid, dst_nid; | 5591 | int src_nid, dst_nid; |
5526 | 5592 | ||
5527 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) | 5593 | if (!static_branch_likely(&sched_numa_balancing)) |
5528 | return -1; | 5594 | return -1; |
5529 | 5595 | ||
5530 | if (!sched_feat(NUMA)) | 5596 | if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) |
5531 | return -1; | 5597 | return -1; |
5532 | 5598 | ||
5533 | src_nid = cpu_to_node(env->src_cpu); | 5599 | src_nid = cpu_to_node(env->src_cpu); |
@@ -5933,7 +5999,7 @@ struct sg_lb_stats { | |||
5933 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5999 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ |
5934 | unsigned long load_per_task; | 6000 | unsigned long load_per_task; |
5935 | unsigned long group_capacity; | 6001 | unsigned long group_capacity; |
5936 | unsigned long group_usage; /* Total usage of the group */ | 6002 | unsigned long group_util; /* Total utilization of the group */ |
5937 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 6003 | unsigned int sum_nr_running; /* Nr tasks running in the group */ |
5938 | unsigned int idle_cpus; | 6004 | unsigned int idle_cpus; |
5939 | unsigned int group_weight; | 6005 | unsigned int group_weight; |
@@ -6009,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
6009 | return load_idx; | 6075 | return load_idx; |
6010 | } | 6076 | } |
6011 | 6077 | ||
6012 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
6013 | { | ||
6014 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | ||
6015 | return sd->smt_gain / sd->span_weight; | ||
6016 | |||
6017 | return SCHED_CAPACITY_SCALE; | ||
6018 | } | ||
6019 | |||
6020 | unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
6021 | { | ||
6022 | return default_scale_cpu_capacity(sd, cpu); | ||
6023 | } | ||
6024 | |||
6025 | static unsigned long scale_rt_capacity(int cpu) | 6078 | static unsigned long scale_rt_capacity(int cpu) |
6026 | { | 6079 | { |
6027 | struct rq *rq = cpu_rq(cpu); | 6080 | struct rq *rq = cpu_rq(cpu); |
@@ -6051,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu) | |||
6051 | 6104 | ||
6052 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6105 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) |
6053 | { | 6106 | { |
6054 | unsigned long capacity = SCHED_CAPACITY_SCALE; | 6107 | unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); |
6055 | struct sched_group *sdg = sd->groups; | 6108 | struct sched_group *sdg = sd->groups; |
6056 | 6109 | ||
6057 | if (sched_feat(ARCH_CAPACITY)) | ||
6058 | capacity *= arch_scale_cpu_capacity(sd, cpu); | ||
6059 | else | ||
6060 | capacity *= default_scale_cpu_capacity(sd, cpu); | ||
6061 | |||
6062 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
6063 | |||
6064 | cpu_rq(cpu)->cpu_capacity_orig = capacity; | 6110 | cpu_rq(cpu)->cpu_capacity_orig = capacity; |
6065 | 6111 | ||
6066 | capacity *= scale_rt_capacity(cpu); | 6112 | capacity *= scale_rt_capacity(cpu); |
@@ -6186,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
6186 | * group_has_capacity returns true if the group has spare capacity that could | 6232 | * group_has_capacity returns true if the group has spare capacity that could |
6187 | * be used by some tasks. | 6233 | * be used by some tasks. |
6188 | * We consider that a group has spare capacity if the * number of task is | 6234 | * We consider that a group has spare capacity if the * number of task is |
6189 | * smaller than the number of CPUs or if the usage is lower than the available | 6235 | * smaller than the number of CPUs or if the utilization is lower than the |
6190 | * capacity for CFS tasks. | 6236 | * available capacity for CFS tasks. |
6191 | * For the latter, we use a threshold to stabilize the state, to take into | 6237 | * For the latter, we use a threshold to stabilize the state, to take into |
6192 | * account the variance of the tasks' load and to return true if the available | 6238 | * account the variance of the tasks' load and to return true if the available |
6193 | * capacity in meaningful for the load balancer. | 6239 | * capacity in meaningful for the load balancer. |
@@ -6201,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | |||
6201 | return true; | 6247 | return true; |
6202 | 6248 | ||
6203 | if ((sgs->group_capacity * 100) > | 6249 | if ((sgs->group_capacity * 100) > |
6204 | (sgs->group_usage * env->sd->imbalance_pct)) | 6250 | (sgs->group_util * env->sd->imbalance_pct)) |
6205 | return true; | 6251 | return true; |
6206 | 6252 | ||
6207 | return false; | 6253 | return false; |
@@ -6222,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | |||
6222 | return false; | 6268 | return false; |
6223 | 6269 | ||
6224 | if ((sgs->group_capacity * 100) < | 6270 | if ((sgs->group_capacity * 100) < |
6225 | (sgs->group_usage * env->sd->imbalance_pct)) | 6271 | (sgs->group_util * env->sd->imbalance_pct)) |
6226 | return true; | 6272 | return true; |
6227 | 6273 | ||
6228 | return false; | 6274 | return false; |
6229 | } | 6275 | } |
6230 | 6276 | ||
6231 | static enum group_type group_classify(struct lb_env *env, | 6277 | static inline enum |
6232 | struct sched_group *group, | 6278 | group_type group_classify(struct sched_group *group, |
6233 | struct sg_lb_stats *sgs) | 6279 | struct sg_lb_stats *sgs) |
6234 | { | 6280 | { |
6235 | if (sgs->group_no_capacity) | 6281 | if (sgs->group_no_capacity) |
6236 | return group_overloaded; | 6282 | return group_overloaded; |
@@ -6270,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6270 | load = source_load(i, load_idx); | 6316 | load = source_load(i, load_idx); |
6271 | 6317 | ||
6272 | sgs->group_load += load; | 6318 | sgs->group_load += load; |
6273 | sgs->group_usage += get_cpu_usage(i); | 6319 | sgs->group_util += cpu_util(i); |
6274 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6320 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
6275 | 6321 | ||
6276 | if (rq->nr_running > 1) | 6322 | if (rq->nr_running > 1) |
@@ -6295,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
6295 | sgs->group_weight = group->group_weight; | 6341 | sgs->group_weight = group->group_weight; |
6296 | 6342 | ||
6297 | sgs->group_no_capacity = group_is_overloaded(env, sgs); | 6343 | sgs->group_no_capacity = group_is_overloaded(env, sgs); |
6298 | sgs->group_type = group_classify(env, group, sgs); | 6344 | sgs->group_type = group_classify(group, sgs); |
6299 | } | 6345 | } |
6300 | 6346 | ||
6301 | /** | 6347 | /** |
@@ -6429,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
6429 | group_has_capacity(env, &sds->local_stat) && | 6475 | group_has_capacity(env, &sds->local_stat) && |
6430 | (sgs->sum_nr_running > 1)) { | 6476 | (sgs->sum_nr_running > 1)) { |
6431 | sgs->group_no_capacity = 1; | 6477 | sgs->group_no_capacity = 1; |
6432 | sgs->group_type = group_overloaded; | 6478 | sgs->group_type = group_classify(sg, sgs); |
6433 | } | 6479 | } |
6434 | 6480 | ||
6435 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6481 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { |
@@ -7609,8 +7655,22 @@ out: | |||
7609 | * When the cpu is attached to null domain for ex, it will not be | 7655 | * When the cpu is attached to null domain for ex, it will not be |
7610 | * updated. | 7656 | * updated. |
7611 | */ | 7657 | */ |
7612 | if (likely(update_next_balance)) | 7658 | if (likely(update_next_balance)) { |
7613 | rq->next_balance = next_balance; | 7659 | rq->next_balance = next_balance; |
7660 | |||
7661 | #ifdef CONFIG_NO_HZ_COMMON | ||
7662 | /* | ||
7663 | * If this CPU has been elected to perform the nohz idle | ||
7664 | * balance. Other idle CPUs have already rebalanced with | ||
7665 | * nohz_idle_balance() and nohz.next_balance has been | ||
7666 | * updated accordingly. This CPU is now running the idle load | ||
7667 | * balance for itself and we need to update the | ||
7668 | * nohz.next_balance accordingly. | ||
7669 | */ | ||
7670 | if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) | ||
7671 | nohz.next_balance = rq->next_balance; | ||
7672 | #endif | ||
7673 | } | ||
7614 | } | 7674 | } |
7615 | 7675 | ||
7616 | #ifdef CONFIG_NO_HZ_COMMON | 7676 | #ifdef CONFIG_NO_HZ_COMMON |
@@ -7623,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
7623 | int this_cpu = this_rq->cpu; | 7683 | int this_cpu = this_rq->cpu; |
7624 | struct rq *rq; | 7684 | struct rq *rq; |
7625 | int balance_cpu; | 7685 | int balance_cpu; |
7686 | /* Earliest time when we have to do rebalance again */ | ||
7687 | unsigned long next_balance = jiffies + 60*HZ; | ||
7688 | int update_next_balance = 0; | ||
7626 | 7689 | ||
7627 | if (idle != CPU_IDLE || | 7690 | if (idle != CPU_IDLE || |
7628 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) | 7691 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) |
@@ -7654,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
7654 | rebalance_domains(rq, CPU_IDLE); | 7717 | rebalance_domains(rq, CPU_IDLE); |
7655 | } | 7718 | } |
7656 | 7719 | ||
7657 | if (time_after(this_rq->next_balance, rq->next_balance)) | 7720 | if (time_after(next_balance, rq->next_balance)) { |
7658 | this_rq->next_balance = rq->next_balance; | 7721 | next_balance = rq->next_balance; |
7722 | update_next_balance = 1; | ||
7723 | } | ||
7659 | } | 7724 | } |
7660 | nohz.next_balance = this_rq->next_balance; | 7725 | |
7726 | /* | ||
7727 | * next_balance will be updated only when there is a need. | ||
7728 | * When the CPU is attached to null domain for ex, it will not be | ||
7729 | * updated. | ||
7730 | */ | ||
7731 | if (likely(update_next_balance)) | ||
7732 | nohz.next_balance = next_balance; | ||
7661 | end: | 7733 | end: |
7662 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | 7734 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); |
7663 | } | 7735 | } |
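Both nohz hunks use the same track-the-earliest-and-publish-only-if-needed bookkeeping; a standalone model of it (values are made up, only the wrap-safe comparison mirrors the kernel's time_after()):

```c
#include <stdbool.h>
#include <stdio.h>

/* wrap-safe "a is after b", same idea as the kernel's time_after() */
static bool time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

int main(void)
{
	unsigned long rq_next_balance[] = { 1010, 990, 1005 };	/* hypothetical */
	unsigned long next_balance = 1000 + 60 * 100;		/* "far in the future" */
	bool update_next_balance = false;

	for (int i = 0; i < 3; i++) {
		if (time_after(next_balance, rq_next_balance[i])) {
			next_balance = rq_next_balance[i];
			update_next_balance = true;
		}
	}
	/* only publish when at least one runqueue was actually visited */
	if (update_next_balance)
		printf("nohz.next_balance = %lu\n", next_balance);	/* 990 */
	return 0;
}
```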
@@ -7810,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
7810 | entity_tick(cfs_rq, se, queued); | 7882 | entity_tick(cfs_rq, se, queued); |
7811 | } | 7883 | } |
7812 | 7884 | ||
7813 | if (numabalancing_enabled) | 7885 | if (static_branch_unlikely(&sched_numa_balancing)) |
7814 | task_tick_numa(rq, curr); | 7886 | task_tick_numa(rq, curr); |
7815 | } | 7887 | } |
7816 | 7888 | ||
@@ -7886,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) | |||
7886 | check_preempt_curr(rq, p, 0); | 7958 | check_preempt_curr(rq, p, 0); |
7887 | } | 7959 | } |
7888 | 7960 | ||
7889 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | 7961 | static inline bool vruntime_normalized(struct task_struct *p) |
7890 | { | 7962 | { |
7891 | struct sched_entity *se = &p->se; | 7963 | struct sched_entity *se = &p->se; |
7892 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
7893 | 7964 | ||
7894 | /* | 7965 | /* |
7895 | * Ensure the task's vruntime is normalized, so that when it's | 7966 | * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, |
7896 | * switched back to the fair class the enqueue_entity(.flags=0) will | 7967 | * the dequeue_entity(.flags=0) will already have normalized the |
7897 | * do the right thing. | 7968 | * vruntime. |
7969 | */ | ||
7970 | if (p->on_rq) | ||
7971 | return true; | ||
7972 | |||
7973 | /* | ||
7974 | * When !on_rq, vruntime of the task has usually NOT been normalized. | ||
7975 | * But there are some cases where it has already been normalized: | ||
7898 | * | 7976 | * |
7899 | * If it's queued, then the dequeue_entity(.flags=0) will already | 7977 | * - A forked child which is waiting for being woken up by |
7900 | * have normalized the vruntime, if it's !queued, then only when | 7978 | * wake_up_new_task(). |
7901 | * the task is sleeping will it still have non-normalized vruntime. | 7979 | * - A task which has been woken up by try_to_wake_up() and |
7980 | * waiting for actually being woken up by sched_ttwu_pending(). | ||
7902 | */ | 7981 | */ |
7903 | if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { | 7982 | if (!se->sum_exec_runtime || p->state == TASK_WAKING) |
7983 | return true; | ||
7984 | |||
7985 | return false; | ||
7986 | } | ||
7987 | |||
7988 | static void detach_task_cfs_rq(struct task_struct *p) | ||
7989 | { | ||
7990 | struct sched_entity *se = &p->se; | ||
7991 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
7992 | |||
7993 | if (!vruntime_normalized(p)) { | ||
7904 | /* | 7994 | /* |
7905 | * Fix up our vruntime so that the current sleep doesn't | 7995 | * Fix up our vruntime so that the current sleep doesn't |
7906 | * cause 'unlimited' sleep bonus. | 7996 | * cause 'unlimited' sleep bonus. |
@@ -7909,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
7909 | se->vruntime -= cfs_rq->min_vruntime; | 7999 | se->vruntime -= cfs_rq->min_vruntime; |
7910 | } | 8000 | } |
7911 | 8001 | ||
7912 | #ifdef CONFIG_SMP | ||
7913 | /* Catch up with the cfs_rq and remove our load when we leave */ | 8002 | /* Catch up with the cfs_rq and remove our load when we leave */ |
7914 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, | 8003 | detach_entity_load_avg(cfs_rq, se); |
7915 | se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); | ||
7916 | |||
7917 | cfs_rq->avg.load_avg = | ||
7918 | max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); | ||
7919 | cfs_rq->avg.load_sum = | ||
7920 | max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); | ||
7921 | cfs_rq->avg.util_avg = | ||
7922 | max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); | ||
7923 | cfs_rq->avg.util_sum = | ||
7924 | max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); | ||
7925 | #endif | ||
7926 | } | 8004 | } |
7927 | 8005 | ||
7928 | /* | 8006 | static void attach_task_cfs_rq(struct task_struct *p) |
7929 | * We switched to the sched_fair class. | ||
7930 | */ | ||
7931 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | ||
7932 | { | 8007 | { |
7933 | struct sched_entity *se = &p->se; | 8008 | struct sched_entity *se = &p->se; |
8009 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
7934 | 8010 | ||
7935 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8011 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7936 | /* | 8012 | /* |
@@ -7940,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) | |||
7940 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 8016 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
7941 | #endif | 8017 | #endif |
7942 | 8018 | ||
7943 | if (!task_on_rq_queued(p)) { | 8019 | /* Synchronize task with its cfs_rq */ |
8020 | attach_entity_load_avg(cfs_rq, se); | ||
8021 | |||
8022 | if (!vruntime_normalized(p)) | ||
8023 | se->vruntime += cfs_rq->min_vruntime; | ||
8024 | } | ||
8025 | |||
8026 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
8027 | { | ||
8028 | detach_task_cfs_rq(p); | ||
8029 | } | ||
8030 | |||
8031 | static void switched_to_fair(struct rq *rq, struct task_struct *p) | ||
8032 | { | ||
8033 | attach_task_cfs_rq(p); | ||
7944 | 8034 | ||
8035 | if (task_on_rq_queued(p)) { | ||
7945 | /* | 8036 | /* |
7946 | * Ensure the task has a non-normalized vruntime when it is switched | 8037 | * We were most likely switched from sched_rt, so |
7947 | * back to the fair class with !queued, so that enqueue_entity() at | 8038 | * kick off the schedule if running, otherwise just see |
7948 | * wake-up time will do the right thing. | 8039 | * if we can still preempt the current task. |
7949 | * | ||
7950 | * If it's queued, then the enqueue_entity(.flags=0) makes the task | ||
7951 | * has non-normalized vruntime, if it's !queued, then it still has | ||
7952 | * normalized vruntime. | ||
7953 | */ | 8040 | */ |
7954 | if (p->state != TASK_RUNNING) | 8041 | if (rq->curr == p) |
7955 | se->vruntime += cfs_rq_of(se)->min_vruntime; | 8042 | resched_curr(rq); |
7956 | return; | 8043 | else |
8044 | check_preempt_curr(rq, p, 0); | ||
7957 | } | 8045 | } |
7958 | |||
7959 | /* | ||
7960 | * We were most likely switched from sched_rt, so | ||
7961 | * kick off the schedule if running, otherwise just see | ||
7962 | * if we can still preempt the current task. | ||
7963 | */ | ||
7964 | if (rq->curr == p) | ||
7965 | resched_curr(rq); | ||
7966 | else | ||
7967 | check_preempt_curr(rq, p, 0); | ||
7968 | } | 8046 | } |
7969 | 8047 | ||
7970 | /* Account for a task changing its policy or group. | 8048 | /* Account for a task changing its policy or group. |
@@ -7999,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
7999 | } | 8077 | } |
8000 | 8078 | ||
8001 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8079 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8002 | static void task_move_group_fair(struct task_struct *p, int queued) | 8080 | static void task_move_group_fair(struct task_struct *p) |
8003 | { | 8081 | { |
8004 | struct sched_entity *se = &p->se; | 8082 | detach_task_cfs_rq(p); |
8005 | struct cfs_rq *cfs_rq; | ||
8006 | |||
8007 | /* | ||
8008 | * If the task was not on the rq at the time of this cgroup movement | ||
8009 | * it must have been asleep, sleeping tasks keep their ->vruntime | ||
8010 | * absolute on their old rq until wakeup (needed for the fair sleeper | ||
8011 | * bonus in place_entity()). | ||
8012 | * | ||
8013 | * If it was on the rq, we've just 'preempted' it, which does convert | ||
8014 | * ->vruntime to a relative base. | ||
8015 | * | ||
8016 | * Make sure both cases convert their relative position when migrating | ||
8017 | * to another cgroup's rq. This does somewhat interfere with the | ||
8018 | * fair sleeper stuff for the first placement, but who cares. | ||
8019 | */ | ||
8020 | /* | ||
8021 | * When !queued, vruntime of the task has usually NOT been normalized. | ||
8022 | * But there are some cases where it has already been normalized: | ||
8023 | * | ||
8024 | * - Moving a forked child which is waiting for being woken up by | ||
8025 | * wake_up_new_task(). | ||
8026 | * - Moving a task which has been woken up by try_to_wake_up() and | ||
8027 | * waiting for actually being woken up by sched_ttwu_pending(). | ||
8028 | * | ||
8029 | * To prevent boost or penalty in the new cfs_rq caused by delta | ||
8030 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | ||
8031 | */ | ||
8032 | if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) | ||
8033 | queued = 1; | ||
8034 | |||
8035 | if (!queued) | ||
8036 | se->vruntime -= cfs_rq_of(se)->min_vruntime; | ||
8037 | set_task_rq(p, task_cpu(p)); | 8083 | set_task_rq(p, task_cpu(p)); |
8038 | se->depth = se->parent ? se->parent->depth + 1 : 0; | ||
8039 | if (!queued) { | ||
8040 | cfs_rq = cfs_rq_of(se); | ||
8041 | se->vruntime += cfs_rq->min_vruntime; | ||
8042 | 8084 | ||
8043 | #ifdef CONFIG_SMP | 8085 | #ifdef CONFIG_SMP |
8044 | /* Virtually synchronize task with its new cfs_rq */ | 8086 | /* Tell se's cfs_rq has been changed -- migrated */ |
8045 | p->se.avg.last_update_time = cfs_rq->avg.last_update_time; | 8087 | p->se.avg.last_update_time = 0; |
8046 | cfs_rq->avg.load_avg += p->se.avg.load_avg; | ||
8047 | cfs_rq->avg.load_sum += p->se.avg.load_sum; | ||
8048 | cfs_rq->avg.util_avg += p->se.avg.util_avg; | ||
8049 | cfs_rq->avg.util_sum += p->se.avg.util_sum; | ||
8050 | #endif | 8088 | #endif |
8051 | } | 8089 | attach_task_cfs_rq(p); |
8052 | } | 8090 | } |
8053 | 8091 | ||
8054 | void free_fair_sched_group(struct task_group *tg) | 8092 | void free_fair_sched_group(struct task_group *tg) |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 83a50e7ca533..69631fa46c2f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
36 | */ | 36 | */ |
37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) | 37 | SCHED_FEAT(WAKEUP_PREEMPTION, true) |
38 | 38 | ||
39 | /* | ||
40 | * Use arch dependent cpu capacity functions | ||
41 | */ | ||
42 | SCHED_FEAT(ARCH_CAPACITY, true) | ||
43 | |||
44 | SCHED_FEAT(HRTICK, false) | 39 | SCHED_FEAT(HRTICK, false) |
45 | SCHED_FEAT(DOUBLE_TICK, false) | 40 | SCHED_FEAT(DOUBLE_TICK, false) |
46 | SCHED_FEAT(LB_BIAS, true) | 41 | SCHED_FEAT(LB_BIAS, true) |
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) | |||
72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 67 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 68 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
74 | SCHED_FEAT(LB_MIN, false) | 69 | SCHED_FEAT(LB_MIN, false) |
70 | SCHED_FEAT(ATTACH_AGE_LOAD, true) | ||
75 | 71 | ||
76 | /* | ||
77 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
78 | * at runtime if running on a NUMA machine. Can be controlled via | ||
79 | * numa_balancing= | ||
80 | */ | ||
81 | #ifdef CONFIG_NUMA_BALANCING | ||
82 | |||
83 | /* | ||
84 | * NUMA will favor moving tasks towards nodes where a higher number of | ||
85 | * hinting faults are recorded during active load balancing. It will | ||
86 | * resist moving tasks towards nodes where a lower number of hinting | ||
87 | * faults have been recorded. | ||
88 | */ | ||
89 | SCHED_FEAT(NUMA, true) | ||
90 | #endif | ||
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f177c73ae19..4a2ef5a02fd3 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void) | |||
57 | rcu_idle_enter(); | 57 | rcu_idle_enter(); |
58 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 58 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
59 | local_irq_enable(); | 59 | local_irq_enable(); |
60 | stop_critical_timings(); | ||
60 | while (!tif_need_resched() && | 61 | while (!tif_need_resched() && |
61 | (cpu_idle_force_poll || tick_check_broadcast_expired())) | 62 | (cpu_idle_force_poll || tick_check_broadcast_expired())) |
62 | cpu_relax(); | 63 | cpu_relax(); |
64 | start_critical_timings(); | ||
63 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 65 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
64 | rcu_idle_exit(); | 66 | rcu_idle_exit(); |
65 | return 1; | 67 | return 1; |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d2ea59364a1c..e3cc16312046 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) | |||
635 | /* | 635 | /* |
636 | * We ran out of runtime, see if we can borrow some from our neighbours. | 636 | * We ran out of runtime, see if we can borrow some from our neighbours. |
637 | */ | 637 | */ |
638 | static int do_balance_runtime(struct rt_rq *rt_rq) | 638 | static void do_balance_runtime(struct rt_rq *rt_rq) |
639 | { | 639 | { |
640 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 640 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
641 | struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; | 641 | struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; |
642 | int i, weight, more = 0; | 642 | int i, weight; |
643 | u64 rt_period; | 643 | u64 rt_period; |
644 | 644 | ||
645 | weight = cpumask_weight(rd->span); | 645 | weight = cpumask_weight(rd->span); |
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq) | |||
673 | diff = rt_period - rt_rq->rt_runtime; | 673 | diff = rt_period - rt_rq->rt_runtime; |
674 | iter->rt_runtime -= diff; | 674 | iter->rt_runtime -= diff; |
675 | rt_rq->rt_runtime += diff; | 675 | rt_rq->rt_runtime += diff; |
676 | more = 1; | ||
677 | if (rt_rq->rt_runtime == rt_period) { | 676 | if (rt_rq->rt_runtime == rt_period) { |
678 | raw_spin_unlock(&iter->rt_runtime_lock); | 677 | raw_spin_unlock(&iter->rt_runtime_lock); |
679 | break; | 678 | break; |
@@ -683,8 +682,6 @@ next: | |||
683 | raw_spin_unlock(&iter->rt_runtime_lock); | 682 | raw_spin_unlock(&iter->rt_runtime_lock); |
684 | } | 683 | } |
685 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 684 | raw_spin_unlock(&rt_b->rt_runtime_lock); |
686 | |||
687 | return more; | ||
688 | } | 685 | } |
689 | 686 | ||
690 | /* | 687 | /* |
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq) | |||
796 | } | 793 | } |
797 | } | 794 | } |
798 | 795 | ||
799 | static int balance_runtime(struct rt_rq *rt_rq) | 796 | static void balance_runtime(struct rt_rq *rt_rq) |
800 | { | 797 | { |
801 | int more = 0; | ||
802 | |||
803 | if (!sched_feat(RT_RUNTIME_SHARE)) | 798 | if (!sched_feat(RT_RUNTIME_SHARE)) |
804 | return more; | 799 | return; |
805 | 800 | ||
806 | if (rt_rq->rt_time > rt_rq->rt_runtime) { | 801 | if (rt_rq->rt_time > rt_rq->rt_runtime) { |
807 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 802 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
808 | more = do_balance_runtime(rt_rq); | 803 | do_balance_runtime(rt_rq); |
809 | raw_spin_lock(&rt_rq->rt_runtime_lock); | 804 | raw_spin_lock(&rt_rq->rt_runtime_lock); |
810 | } | 805 | } |
811 | |||
812 | return more; | ||
813 | } | 806 | } |
814 | #else /* !CONFIG_SMP */ | 807 | #else /* !CONFIG_SMP */ |
815 | static inline int balance_runtime(struct rt_rq *rt_rq) | 808 | static inline void balance_runtime(struct rt_rq *rt_rq) {} |
816 | { | ||
817 | return 0; | ||
818 | } | ||
819 | #endif /* CONFIG_SMP */ | 809 | #endif /* CONFIG_SMP */ |
820 | 810 | ||
821 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | 811 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68cda117574c..efd3bfc7e347 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { } | |||
84 | */ | 84 | */ |
85 | #define RUNTIME_INF ((u64)~0ULL) | 85 | #define RUNTIME_INF ((u64)~0ULL) |
86 | 86 | ||
87 | static inline int idle_policy(int policy) | ||
88 | { | ||
89 | return policy == SCHED_IDLE; | ||
90 | } | ||
87 | static inline int fair_policy(int policy) | 91 | static inline int fair_policy(int policy) |
88 | { | 92 | { |
89 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; | 93 | return policy == SCHED_NORMAL || policy == SCHED_BATCH; |
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy) | |||
98 | { | 102 | { |
99 | return policy == SCHED_DEADLINE; | 103 | return policy == SCHED_DEADLINE; |
100 | } | 104 | } |
105 | static inline bool valid_policy(int policy) | ||
106 | { | ||
107 | return idle_policy(policy) || fair_policy(policy) || | ||
108 | rt_policy(policy) || dl_policy(policy); | ||
109 | } | ||
101 | 110 | ||
102 | static inline int task_has_rt_policy(struct task_struct *p) | 111 | static inline int task_has_rt_policy(struct task_struct *p) |
103 | { | 112 | { |
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p) | |||
109 | return dl_policy(p->policy); | 118 | return dl_policy(p->policy); |
110 | } | 119 | } |
111 | 120 | ||
112 | static inline bool dl_time_before(u64 a, u64 b) | ||
113 | { | ||
114 | return (s64)(a - b) < 0; | ||
115 | } | ||
116 | |||
117 | /* | 121 | /* |
118 | * Tells if entity @a should preempt entity @b. | 122 | * Tells if entity @a should preempt entity @b. |
119 | */ | 123 | */ |
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
1003 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 1007 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
1004 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 1008 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
1005 | 1009 | ||
1006 | #ifdef CONFIG_NUMA_BALANCING | 1010 | extern struct static_key_false sched_numa_balancing; |
1007 | #define sched_feat_numa(x) sched_feat(x) | ||
1008 | #ifdef CONFIG_SCHED_DEBUG | ||
1009 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
1010 | #else | ||
1011 | extern bool numabalancing_enabled; | ||
1012 | #endif /* CONFIG_SCHED_DEBUG */ | ||
1013 | #else | ||
1014 | #define sched_feat_numa(x) (0) | ||
1015 | #define numabalancing_enabled (0) | ||
1016 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1017 | 1011 | ||
1018 | static inline u64 global_rt_period(void) | 1012 | static inline u64 global_rt_period(void) |
1019 | { | 1013 | { |
@@ -1078,9 +1072,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
1078 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | 1072 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
1079 | * We must ensure this doesn't happen until the switch is completely | 1073 | * We must ensure this doesn't happen until the switch is completely |
1080 | * finished. | 1074 | * finished. |
1075 | * | ||
1076 | * Pairs with the control dependency and rmb in try_to_wake_up(). | ||
1081 | */ | 1077 | */ |
1082 | smp_wmb(); | 1078 | smp_store_release(&prev->on_cpu, 0); |
1083 | prev->on_cpu = 0; | ||
1084 | #endif | 1079 | #endif |
1085 | #ifdef CONFIG_DEBUG_SPINLOCK | 1080 | #ifdef CONFIG_DEBUG_SPINLOCK |
1086 | /* this is a valid case when another task releases the spinlock */ | 1081 | /* this is a valid case when another task releases the spinlock */ |
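A userspace model (C11 atomics plus pthreads) of the ordering guarantee behind replacing smp_wmb() and a plain store with smp_store_release(&prev->on_cpu, 0): everything written while on_cpu was set must be visible to whoever observes it cleared. The waiter side below uses an acquire load for simplicity; the kernel's try_to_wake_up() relies on a control dependency plus smp_rmb() instead, as the new comment notes.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;
static int task_state;			/* protected by the on_cpu handshake */

static void *finish_switch(void *arg)
{
	task_state = 42;		/* writes done while still "on cpu" */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, finish_switch, NULL);

	/* the waiter: spin until the release store is observed */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;

	printf("%d\n", task_state);	/* guaranteed to print 42 */
	pthread_join(t, NULL);
	return 0;
}
```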
@@ -1156,16 +1151,18 @@ static const u32 prio_to_wmult[40] = { | |||
1156 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 1151 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
1157 | }; | 1152 | }; |
1158 | 1153 | ||
1159 | #define ENQUEUE_WAKEUP 1 | 1154 | #define ENQUEUE_WAKEUP 0x01 |
1160 | #define ENQUEUE_HEAD 2 | 1155 | #define ENQUEUE_HEAD 0x02 |
1161 | #ifdef CONFIG_SMP | 1156 | #ifdef CONFIG_SMP |
1162 | #define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ | 1157 | #define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ |
1163 | #else | 1158 | #else |
1164 | #define ENQUEUE_WAKING 0 | 1159 | #define ENQUEUE_WAKING 0x00 |
1165 | #endif | 1160 | #endif |
1166 | #define ENQUEUE_REPLENISH 8 | 1161 | #define ENQUEUE_REPLENISH 0x08 |
1162 | #define ENQUEUE_RESTORE 0x10 | ||
1167 | 1163 | ||
1168 | #define DEQUEUE_SLEEP 1 | 1164 | #define DEQUEUE_SLEEP 0x01 |
1165 | #define DEQUEUE_SAVE 0x02 | ||
1169 | 1166 | ||
1170 | #define RETRY_TASK ((void *)-1UL) | 1167 | #define RETRY_TASK ((void *)-1UL) |
1171 | 1168 | ||
@@ -1193,7 +1190,7 @@ struct sched_class { | |||
1193 | 1190 | ||
1194 | #ifdef CONFIG_SMP | 1191 | #ifdef CONFIG_SMP |
1195 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1192 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
1196 | void (*migrate_task_rq)(struct task_struct *p, int next_cpu); | 1193 | void (*migrate_task_rq)(struct task_struct *p); |
1197 | 1194 | ||
1198 | void (*task_waking) (struct task_struct *task); | 1195 | void (*task_waking) (struct task_struct *task); |
1199 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1196 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); |
@@ -1226,7 +1223,7 @@ struct sched_class { | |||
1226 | void (*update_curr) (struct rq *rq); | 1223 | void (*update_curr) (struct rq *rq); |
1227 | 1224 | ||
1228 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1225 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1229 | void (*task_move_group) (struct task_struct *p, int on_rq); | 1226 | void (*task_move_group) (struct task_struct *p); |
1230 | #endif | 1227 | #endif |
1231 | }; | 1228 | }; |
1232 | 1229 | ||
@@ -1404,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | |||
1404 | } | 1401 | } |
1405 | #endif | 1402 | #endif |
1406 | 1403 | ||
1404 | #ifndef arch_scale_cpu_capacity | ||
1405 | static __always_inline | ||
1406 | unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
1407 | { | ||
1408 | if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | ||
1409 | return sd->smt_gain / sd->span_weight; | ||
1410 | |||
1411 | return SCHED_CAPACITY_SCALE; | ||
1412 | } | ||
1413 | #endif | ||
1414 | |||
1407 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1415 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) |
1408 | { | 1416 | { |
1409 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); | 1417 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); |
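As a worked example of the inlined default above (using the smt_gain of 1178, roughly 1.15 * 1024, that the topology code assigned to SMT levels at the time — an assumption, not shown in this diff):

$$
\text{capacity per SMT-2 sibling}=\frac{\text{smt\_gain}}{\text{span\_weight}}=\frac{1178}{2}=589\approx 0.58\times\text{SCHED\_CAPACITY\_SCALE}
$$

while any non-SMT level simply reports SCHED_CAPACITY_SCALE = 1024, and an architecture can still override the whole helper through the #ifndef guard.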
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 272d9322bc5d..052e02672d12 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) | |||
106 | } | 106 | } |
107 | EXPORT_SYMBOL_GPL(__wake_up_locked); | 107 | EXPORT_SYMBOL_GPL(__wake_up_locked); |
108 | 108 | ||
109 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, | 109 | void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) |
110 | void *key) | ||
111 | { | 110 | { |
112 | __wake_up_common(q, mode, nr, 0, key); | 111 | __wake_up_common(q, mode, 1, 0, key); |
113 | } | 112 | } |
114 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); | 113 | EXPORT_SYMBOL_GPL(__wake_up_locked_key); |
115 | 114 | ||
@@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | |||
284 | if (!list_empty(&wait->task_list)) | 283 | if (!list_empty(&wait->task_list)) |
285 | list_del_init(&wait->task_list); | 284 | list_del_init(&wait->task_list); |
286 | else if (waitqueue_active(q)) | 285 | else if (waitqueue_active(q)) |
287 | __wake_up_locked_key(q, mode, 1, key); | 286 | __wake_up_locked_key(q, mode, key); |
288 | spin_unlock_irqrestore(&q->lock, flags); | 287 | spin_unlock_irqrestore(&q->lock, flags); |
289 | } | 288 | } |
290 | EXPORT_SYMBOL(abort_exclusive_wait); | 289 | EXPORT_SYMBOL(abort_exclusive_wait); |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5bd4779282df..580ac2d4024f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
@@ -347,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) | |||
347 | { | 347 | { |
348 | struct seccomp_filter *sfilter; | 348 | struct seccomp_filter *sfilter; |
349 | int ret; | 349 | int ret; |
350 | const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE); | ||
350 | 351 | ||
351 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) | 352 | if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) |
352 | return ERR_PTR(-EINVAL); | 353 | return ERR_PTR(-EINVAL); |
@@ -370,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) | |||
370 | return ERR_PTR(-ENOMEM); | 371 | return ERR_PTR(-ENOMEM); |
371 | 372 | ||
372 | ret = bpf_prog_create_from_user(&sfilter->prog, fprog, | 373 | ret = bpf_prog_create_from_user(&sfilter->prog, fprog, |
373 | seccomp_check_filter); | 374 | seccomp_check_filter, save_orig); |
374 | if (ret < 0) { | 375 | if (ret < 0) { |
375 | kfree(sfilter); | 376 | kfree(sfilter); |
376 | return ERR_PTR(ret); | 377 | return ERR_PTR(ret); |
@@ -469,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk) | |||
469 | static inline void seccomp_filter_free(struct seccomp_filter *filter) | 470 | static inline void seccomp_filter_free(struct seccomp_filter *filter) |
470 | { | 471 | { |
471 | if (filter) { | 472 | if (filter) { |
472 | bpf_prog_free(filter->prog); | 473 | bpf_prog_destroy(filter->prog); |
473 | kfree(filter); | 474 | kfree(filter); |
474 | } | 475 | } |
475 | } | 476 | } |
@@ -867,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) | |||
867 | /* prctl interface doesn't have flags, so they are always zero. */ | 868 | /* prctl interface doesn't have flags, so they are always zero. */ |
868 | return do_seccomp(op, 0, uargs); | 869 | return do_seccomp(op, 0, uargs); |
869 | } | 870 | } |
871 | |||
872 | #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) | ||
873 | long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, | ||
874 | void __user *data) | ||
875 | { | ||
876 | struct seccomp_filter *filter; | ||
877 | struct sock_fprog_kern *fprog; | ||
878 | long ret; | ||
879 | unsigned long count = 0; | ||
880 | |||
881 | if (!capable(CAP_SYS_ADMIN) || | ||
882 | current->seccomp.mode != SECCOMP_MODE_DISABLED) { | ||
883 | return -EACCES; | ||
884 | } | ||
885 | |||
886 | spin_lock_irq(&task->sighand->siglock); | ||
887 | if (task->seccomp.mode != SECCOMP_MODE_FILTER) { | ||
888 | ret = -EINVAL; | ||
889 | goto out; | ||
890 | } | ||
891 | |||
892 | filter = task->seccomp.filter; | ||
893 | while (filter) { | ||
894 | filter = filter->prev; | ||
895 | count++; | ||
896 | } | ||
897 | |||
898 | if (filter_off >= count) { | ||
899 | ret = -ENOENT; | ||
900 | goto out; | ||
901 | } | ||
902 | count -= filter_off; | ||
903 | |||
904 | filter = task->seccomp.filter; | ||
905 | while (filter && count > 1) { | ||
906 | filter = filter->prev; | ||
907 | count--; | ||
908 | } | ||
909 | |||
910 | if (WARN_ON(count != 1 || !filter)) { | ||
911 | /* The filter tree shouldn't shrink while we're using it. */ | ||
912 | ret = -ENOENT; | ||
913 | goto out; | ||
914 | } | ||
915 | |||
916 | fprog = filter->prog->orig_prog; | ||
917 | if (!fprog) { | ||
918 | /* This must be a new non-cBPF filter, since we save | ||
919 | * every cBPF filter's orig_prog above when | ||
920 | * CONFIG_CHECKPOINT_RESTORE is enabled. | ||
921 | */ | ||
922 | ret = -EMEDIUMTYPE; | ||
923 | goto out; | ||
924 | } | ||
925 | |||
926 | ret = fprog->len; | ||
927 | if (!data) | ||
928 | goto out; | ||
929 | |||
930 | get_seccomp_filter(task); | ||
931 | spin_unlock_irq(&task->sighand->siglock); | ||
932 | |||
933 | if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) | ||
934 | ret = -EFAULT; | ||
935 | |||
936 | put_seccomp_filter(task); | ||
937 | return ret; | ||
938 | |||
939 | out: | ||
940 | spin_unlock_irq(&task->sighand->siglock); | ||
941 | return ret; | ||
942 | } | ||
943 | #endif | ||
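
The new seccomp_get_filter() path can only return filters whose classic-BPF origin was preserved at install time (orig_prog, kept when CONFIG_CHECKPOINT_RESTORE is set, per the save_orig flag above). A minimal userspace sketch of installing such a classic filter is shown below; the filter contents and the blocked syscall are illustrative and not taken from this patch.

/* Hedged illustration: install a classic-BPF seccomp filter from userspace.
 * With CONFIG_CHECKPOINT_RESTORE the kernel now keeps this program as
 * filter->prog->orig_prog, which is what seccomp_get_filter() copies back.
 * A real filter should also validate seccomp_data.arch; omitted for brevity.
 */
#include <stddef.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		/* load the syscall number from seccomp_data */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* fail getpid() with ENOSYS, allow everything else */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | ENOSYS),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len    = sizeof(insns) / sizeof(insns[0]),
		.filter = insns,
	};

	/* required so an unprivileged task may install a filter */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return 1;

	printf("getpid() -> %ld (errno %d)\n", (long)syscall(__NR_getpid), errno);
	return 0;
}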
diff --git a/kernel/signal.c b/kernel/signal.c index 0f6bbbe77b46..c0b01fe24bbd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) | |||
503 | return !tsk->ptrace; | 503 | return !tsk->ptrace; |
504 | } | 504 | } |
505 | 505 | ||
506 | /* | ||
507 | * Notify the system that a driver wants to block all signals for this | ||
508 | * process, and wants to be notified if any signals at all were to be | ||
509 | * sent/acted upon. If the notifier routine returns non-zero, then the | ||
510 | * signal will be acted upon after all. If the notifier routine returns 0, | ||
511 | * then the signal will be blocked. Only one block per process is | ||
512 | * allowed. priv is a pointer to private data that the notifier routine | ||
513 | * can use to determine if the signal should be blocked or not. | ||
514 | */ | ||
515 | void | ||
516 | block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) | ||
517 | { | ||
518 | unsigned long flags; | ||
519 | |||
520 | spin_lock_irqsave(¤t->sighand->siglock, flags); | ||
521 | current->notifier_mask = mask; | ||
522 | current->notifier_data = priv; | ||
523 | current->notifier = notifier; | ||
524 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); | ||
525 | } | ||
526 | |||
527 | /* Notify the system that blocking has ended. */ | ||
528 | |||
529 | void | ||
530 | unblock_all_signals(void) | ||
531 | { | ||
532 | unsigned long flags; | ||
533 | |||
534 | spin_lock_irqsave(¤t->sighand->siglock, flags); | ||
535 | current->notifier = NULL; | ||
536 | current->notifier_data = NULL; | ||
537 | recalc_sigpending(); | ||
538 | spin_unlock_irqrestore(¤t->sighand->siglock, flags); | ||
539 | } | ||
540 | |||
541 | static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) | 506 | static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) |
542 | { | 507 | { |
543 | struct sigqueue *q, *first = NULL; | 508 | struct sigqueue *q, *first = NULL; |
@@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
580 | { | 545 | { |
581 | int sig = next_signal(pending, mask); | 546 | int sig = next_signal(pending, mask); |
582 | 547 | ||
583 | if (sig) { | 548 | if (sig) |
584 | if (current->notifier) { | ||
585 | if (sigismember(current->notifier_mask, sig)) { | ||
586 | if (!(current->notifier)(current->notifier_data)) { | ||
587 | clear_thread_flag(TIF_SIGPENDING); | ||
588 | return 0; | ||
589 | } | ||
590 | } | ||
591 | } | ||
592 | |||
593 | collect_signal(sig, pending, info); | 549 | collect_signal(sig, pending, info); |
594 | } | ||
595 | |||
596 | return sig; | 550 | return sig; |
597 | } | 551 | } |
598 | 552 | ||
@@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) | |||
834 | sigset_t flush; | 788 | sigset_t flush; |
835 | 789 | ||
836 | if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { | 790 | if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { |
837 | if (signal->flags & SIGNAL_GROUP_COREDUMP) | 791 | if (!(signal->flags & SIGNAL_GROUP_EXIT)) |
838 | return sig == SIGKILL; | 792 | return sig == SIGKILL; |
839 | /* | 793 | /* |
840 | * The process is in the middle of dying, nothing to do. | 794 | * The process is in the middle of dying, nothing to do. |
@@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig); | |||
2483 | EXPORT_SYMBOL(send_sig); | 2437 | EXPORT_SYMBOL(send_sig); |
2484 | EXPORT_SYMBOL(send_sig_info); | 2438 | EXPORT_SYMBOL(send_sig_info); |
2485 | EXPORT_SYMBOL(sigprocmask); | 2439 | EXPORT_SYMBOL(sigprocmask); |
2486 | EXPORT_SYMBOL(block_all_signals); | ||
2487 | EXPORT_SYMBOL(unblock_all_signals); | ||
2488 | |||
2489 | 2440 | ||
2490 | /* | 2441 | /* |
2491 | * System call entry points. | 2442 | * System call entry points. |
diff --git a/kernel/smp.c b/kernel/smp.c index 07854477c164..d903c02223af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), | |||
669 | cpumask_var_t cpus; | 669 | cpumask_var_t cpus; |
670 | int cpu, ret; | 670 | int cpu, ret; |
671 | 671 | ||
672 | might_sleep_if(gfp_flags & __GFP_WAIT); | 672 | might_sleep_if(gfpflags_allow_blocking(gfp_flags)); |
673 | 673 | ||
674 | if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { | 674 | if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { |
675 | preempt_disable(); | 675 | preempt_disable(); |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index a818cbc73e14..d264f59bff56 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp | |||
222 | { | 222 | { |
223 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | 223 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); |
224 | 224 | ||
225 | if (ht->pre_unpark) | 225 | if (!ht->selfparking) |
226 | ht->pre_unpark(cpu); | 226 | kthread_unpark(tsk); |
227 | kthread_unpark(tsk); | ||
228 | } | 227 | } |
229 | 228 | ||
230 | void smpboot_unpark_threads(unsigned int cpu) | 229 | void smpboot_unpark_threads(unsigned int cpu) |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 12484e5d5c88..867bc20e1ef1 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) | |||
73 | } | 73 | } |
74 | } | 74 | } |
75 | 75 | ||
76 | static void __cpu_stop_queue_work(struct cpu_stopper *stopper, | ||
77 | struct cpu_stop_work *work) | ||
78 | { | ||
79 | list_add_tail(&work->list, &stopper->works); | ||
80 | wake_up_process(stopper->thread); | ||
81 | } | ||
82 | |||
76 | /* queue @work to @stopper. if offline, @work is completed immediately */ | 83 | /* queue @work to @stopper. if offline, @work is completed immediately */ |
77 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) | 84 | static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) |
78 | { | 85 | { |
79 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 86 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
80 | |||
81 | unsigned long flags; | 87 | unsigned long flags; |
82 | 88 | ||
83 | spin_lock_irqsave(&stopper->lock, flags); | 89 | spin_lock_irqsave(&stopper->lock, flags); |
84 | 90 | if (stopper->enabled) | |
85 | if (stopper->enabled) { | 91 | __cpu_stop_queue_work(stopper, work); |
86 | list_add_tail(&work->list, &stopper->works); | 92 | else |
87 | wake_up_process(stopper->thread); | ||
88 | } else | ||
89 | cpu_stop_signal_done(work->done, false); | 93 | cpu_stop_signal_done(work->done, false); |
90 | |||
91 | spin_unlock_irqrestore(&stopper->lock, flags); | 94 | spin_unlock_irqrestore(&stopper->lock, flags); |
92 | } | 95 | } |
93 | 96 | ||
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data) | |||
213 | return err; | 216 | return err; |
214 | } | 217 | } |
215 | 218 | ||
219 | static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, | ||
220 | int cpu2, struct cpu_stop_work *work2) | ||
221 | { | ||
222 | struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); | ||
223 | struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); | ||
224 | int err; | ||
225 | |||
226 | lg_double_lock(&stop_cpus_lock, cpu1, cpu2); | ||
227 | spin_lock_irq(&stopper1->lock); | ||
228 | spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); | ||
229 | |||
230 | err = -ENOENT; | ||
231 | if (!stopper1->enabled || !stopper2->enabled) | ||
232 | goto unlock; | ||
233 | |||
234 | err = 0; | ||
235 | __cpu_stop_queue_work(stopper1, work1); | ||
236 | __cpu_stop_queue_work(stopper2, work2); | ||
237 | unlock: | ||
238 | spin_unlock(&stopper2->lock); | ||
239 | spin_unlock_irq(&stopper1->lock); | ||
240 | lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); | ||
241 | |||
242 | return err; | ||
243 | } | ||
216 | /** | 244 | /** |
217 | * stop_two_cpus - stops two cpus | 245 | * stop_two_cpus - stops two cpus |
218 | * @cpu1: the cpu to stop | 246 | * @cpu1: the cpu to stop |
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * | |||
247 | cpu_stop_init_done(&done, 2); | 275 | cpu_stop_init_done(&done, 2); |
248 | set_state(&msdata, MULTI_STOP_PREPARE); | 276 | set_state(&msdata, MULTI_STOP_PREPARE); |
249 | 277 | ||
250 | /* | 278 | if (cpu1 > cpu2) |
251 | * If we observe both CPUs active we know _cpu_down() cannot yet have | 279 | swap(cpu1, cpu2); |
252 | * queued its stop_machine works and therefore ours will get executed | 280 | if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { |
253 | * first. Or its not either one of our CPUs that's getting unplugged, | ||
254 | * in which case we don't care. | ||
255 | * | ||
256 | * This relies on the stopper workqueues to be FIFO. | ||
257 | */ | ||
258 | if (!cpu_active(cpu1) || !cpu_active(cpu2)) { | ||
259 | preempt_enable(); | 281 | preempt_enable(); |
260 | return -ENOENT; | 282 | return -ENOENT; |
261 | } | 283 | } |
262 | 284 | ||
263 | lg_double_lock(&stop_cpus_lock, cpu1, cpu2); | ||
264 | cpu_stop_queue_work(cpu1, &work1); | ||
265 | cpu_stop_queue_work(cpu2, &work2); | ||
266 | lg_double_unlock(&stop_cpus_lock, cpu1, cpu2); | ||
267 | |||
268 | preempt_enable(); | 285 | preempt_enable(); |
269 | 286 | ||
270 | wait_for_completion(&done.completion); | 287 | wait_for_completion(&done.completion); |
@@ -452,6 +469,18 @@ repeat: | |||
452 | } | 469 | } |
453 | } | 470 | } |
454 | 471 | ||
472 | void stop_machine_park(int cpu) | ||
473 | { | ||
474 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | ||
475 | /* | ||
476 | * Lockless. cpu_stopper_thread() will take stopper->lock and flush | ||
477 | * the pending works before it parks, until then it is fine to queue | ||
478 | * the new works. | ||
479 | */ | ||
480 | stopper->enabled = false; | ||
481 | kthread_park(stopper->thread); | ||
482 | } | ||
483 | |||
455 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); | 484 | extern void sched_set_stop_task(int cpu, struct task_struct *stop); |
456 | 485 | ||
457 | static void cpu_stop_create(unsigned int cpu) | 486 | static void cpu_stop_create(unsigned int cpu) |
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu) | |||
462 | static void cpu_stop_park(unsigned int cpu) | 491 | static void cpu_stop_park(unsigned int cpu) |
463 | { | 492 | { |
464 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 493 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
465 | struct cpu_stop_work *work, *tmp; | ||
466 | unsigned long flags; | ||
467 | 494 | ||
468 | /* drain remaining works */ | 495 | WARN_ON(!list_empty(&stopper->works)); |
469 | spin_lock_irqsave(&stopper->lock, flags); | ||
470 | list_for_each_entry_safe(work, tmp, &stopper->works, list) { | ||
471 | list_del_init(&work->list); | ||
472 | cpu_stop_signal_done(work->done, false); | ||
473 | } | ||
474 | stopper->enabled = false; | ||
475 | spin_unlock_irqrestore(&stopper->lock, flags); | ||
476 | } | 496 | } |
477 | 497 | ||
478 | static void cpu_stop_unpark(unsigned int cpu) | 498 | void stop_machine_unpark(int cpu) |
479 | { | 499 | { |
480 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); | 500 | struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); |
481 | 501 | ||
482 | spin_lock_irq(&stopper->lock); | ||
483 | stopper->enabled = true; | 502 | stopper->enabled = true; |
484 | spin_unlock_irq(&stopper->lock); | 503 | kthread_unpark(stopper->thread); |
485 | } | 504 | } |
486 | 505 | ||
487 | static struct smp_hotplug_thread cpu_stop_threads = { | 506 | static struct smp_hotplug_thread cpu_stop_threads = { |
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = { | |||
490 | .thread_fn = cpu_stopper_thread, | 509 | .thread_fn = cpu_stopper_thread, |
491 | .thread_comm = "migration/%u", | 510 | .thread_comm = "migration/%u", |
492 | .create = cpu_stop_create, | 511 | .create = cpu_stop_create, |
493 | .setup = cpu_stop_unpark, | ||
494 | .park = cpu_stop_park, | 512 | .park = cpu_stop_park, |
495 | .pre_unpark = cpu_stop_unpark, | ||
496 | .selfparking = true, | 513 | .selfparking = true, |
497 | }; | 514 | }; |
498 | 515 | ||
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void) | |||
508 | } | 525 | } |
509 | 526 | ||
510 | BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); | 527 | BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); |
528 | stop_machine_unpark(raw_smp_processor_id()); | ||
511 | stop_machine_initialized = true; | 529 | stop_machine_initialized = true; |
512 | return 0; | 530 | return 0; |
513 | } | 531 | } |
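
cpu_stop_queue_two_works() avoids an ABBA deadlock by always taking stopper1 before stopper2, and stop_two_cpus() guarantees a stable order by swapping so that cpu1 < cpu2. The same ordered-locking idea can be shown with a small userspace analogue; the pthread mutexes below stand in for the per-CPU stopper locks and are not part of this patch.

/* Hedged userspace analogue of the ordered double-lock in
 * cpu_stop_queue_two_works(): lock the lower-numbered "CPU" first so two
 * concurrent callers can never deadlock on each other's second lock.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_FAKE_CPUS 4

static pthread_mutex_t stopper_lock[NR_FAKE_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void queue_two_works(int cpu1, int cpu2)
{
	int tmp;

	if (cpu1 > cpu2) {		/* mirror the swap() in stop_two_cpus() */
		tmp = cpu1;
		cpu1 = cpu2;
		cpu2 = tmp;
	}

	pthread_mutex_lock(&stopper_lock[cpu1]);	/* always lower first */
	pthread_mutex_lock(&stopper_lock[cpu2]);

	printf("queued work on %d and %d under both locks\n", cpu1, cpu2);

	pthread_mutex_unlock(&stopper_lock[cpu2]);
	pthread_mutex_unlock(&stopper_lock[cpu1]);
}

int main(void)
{
	queue_two_works(3, 1);	/* caller order does not matter */
	queue_two_works(1, 3);
	return 0;
}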
diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f671a5c..6af9212ab5aa 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
222 | goto out_unlock; /* No processes for this user */ | 222 | goto out_unlock; /* No processes for this user */ |
223 | } | 223 | } |
224 | do_each_thread(g, p) { | 224 | do_each_thread(g, p) { |
225 | if (uid_eq(task_uid(p), uid)) | 225 | if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) |
226 | error = set_one_prio(p, niceval, error); | 226 | error = set_one_prio(p, niceval, error); |
227 | } while_each_thread(g, p); | 227 | } while_each_thread(g, p); |
228 | if (!uid_eq(uid, cred->uid)) | 228 | if (!uid_eq(uid, cred->uid)) |
@@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
290 | goto out_unlock; /* No processes for this user */ | 290 | goto out_unlock; /* No processes for this user */ |
291 | } | 291 | } |
292 | do_each_thread(g, p) { | 292 | do_each_thread(g, p) { |
293 | if (uid_eq(task_uid(p), uid)) { | 293 | if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { |
294 | niceval = nice_to_rlimit(task_nice(p)); | 294 | niceval = nice_to_rlimit(task_nice(p)); |
295 | if (niceval > retval) | 295 | if (niceval > retval) |
296 | retval = niceval; | 296 | retval = niceval; |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a02decf15583..0623787ec67a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock); | |||
194 | cond_syscall(sys_munlock); | 194 | cond_syscall(sys_munlock); |
195 | cond_syscall(sys_mlockall); | 195 | cond_syscall(sys_mlockall); |
196 | cond_syscall(sys_munlockall); | 196 | cond_syscall(sys_munlockall); |
197 | cond_syscall(sys_mlock2); | ||
197 | cond_syscall(sys_mincore); | 198 | cond_syscall(sys_mincore); |
198 | cond_syscall(sys_madvise); | 199 | cond_syscall(sys_madvise); |
199 | cond_syscall(sys_mremap); | 200 | cond_syscall(sys_mremap); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e69201d8094e..dc6858d6639e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -64,6 +64,7 @@ | |||
64 | #include <linux/binfmts.h> | 64 | #include <linux/binfmts.h> |
65 | #include <linux/sched/sysctl.h> | 65 | #include <linux/sched/sysctl.h> |
66 | #include <linux/kexec.h> | 66 | #include <linux/kexec.h> |
67 | #include <linux/bpf.h> | ||
67 | 68 | ||
68 | #include <asm/uaccess.h> | 69 | #include <asm/uaccess.h> |
69 | #include <asm/processor.h> | 70 | #include <asm/processor.h> |
@@ -887,6 +888,17 @@ static struct ctl_table kern_table[] = { | |||
887 | .extra1 = &zero, | 888 | .extra1 = &zero, |
888 | .extra2 = &one, | 889 | .extra2 = &one, |
889 | }, | 890 | }, |
891 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
892 | { | ||
893 | .procname = "hardlockup_panic", | ||
894 | .data = &hardlockup_panic, | ||
895 | .maxlen = sizeof(int), | ||
896 | .mode = 0644, | ||
897 | .proc_handler = proc_dointvec_minmax, | ||
898 | .extra1 = &zero, | ||
899 | .extra2 = &one, | ||
900 | }, | ||
901 | #endif | ||
890 | #ifdef CONFIG_SMP | 902 | #ifdef CONFIG_SMP |
891 | { | 903 | { |
892 | .procname = "softlockup_all_cpu_backtrace", | 904 | .procname = "softlockup_all_cpu_backtrace", |
@@ -897,6 +909,15 @@ static struct ctl_table kern_table[] = { | |||
897 | .extra1 = &zero, | 909 | .extra1 = &zero, |
898 | .extra2 = &one, | 910 | .extra2 = &one, |
899 | }, | 911 | }, |
912 | { | ||
913 | .procname = "hardlockup_all_cpu_backtrace", | ||
914 | .data = &sysctl_hardlockup_all_cpu_backtrace, | ||
915 | .maxlen = sizeof(int), | ||
916 | .mode = 0644, | ||
917 | .proc_handler = proc_dointvec_minmax, | ||
918 | .extra1 = &zero, | ||
919 | .extra2 = &one, | ||
920 | }, | ||
900 | #endif /* CONFIG_SMP */ | 921 | #endif /* CONFIG_SMP */ |
901 | #endif | 922 | #endif |
902 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 923 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
@@ -1139,6 +1160,18 @@ static struct ctl_table kern_table[] = { | |||
1139 | .proc_handler = timer_migration_handler, | 1160 | .proc_handler = timer_migration_handler, |
1140 | }, | 1161 | }, |
1141 | #endif | 1162 | #endif |
1163 | #ifdef CONFIG_BPF_SYSCALL | ||
1164 | { | ||
1165 | .procname = "unprivileged_bpf_disabled", | ||
1166 | .data = &sysctl_unprivileged_bpf_disabled, | ||
1167 | .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), | ||
1168 | .mode = 0644, | ||
1169 | /* only handle a transition from default "0" to "1" */ | ||
1170 | .proc_handler = proc_dointvec_minmax, | ||
1171 | .extra1 = &one, | ||
1172 | .extra2 = &one, | ||
1173 | }, | ||
1174 | #endif | ||
1142 | { } | 1175 | { } |
1143 | }; | 1176 | }; |
1144 | 1177 | ||
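
Pinning both extra1 and extra2 to &one for unprivileged_bpf_disabled means proc_dointvec_minmax() only ever accepts the value 1, so the knob can move from its default 0 to 1 but never back without a reboot. A hedged sketch of what an administration tool might do follows; the proc path matches the sysctl name, the rest of the program is illustrative.

/* Hedged sketch: flip the one-way kernel.unprivileged_bpf_disabled knob.
 * Writing "1" succeeds once; writing "0" afterwards is rejected because the
 * sysctl's minimum and maximum are both pinned to 1.
 */
#include <stdio.h>
#include <errno.h>

static int write_knob(const char *val)
{
	FILE *f = fopen("/proc/sys/kernel/unprivileged_bpf_disabled", "w");
	int ret = 0;

	if (!f)
		return -errno;
	if (fputs(val, f) < 0)
		ret = -errno;
	if (fclose(f) != 0 && ret == 0)
		ret = -errno;		/* kernel's -EINVAL surfaces here */
	return ret;
}

int main(void)
{
	printf("disable unprivileged bpf(): %d\n", write_knob("1"));
	printf("try to re-enable:           %d\n", write_knob("0"));
	return 0;
}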
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 841b72f720e8..1347882d131e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data) | |||
217 | continue; | 217 | continue; |
218 | 218 | ||
219 | /* Check the deviation from the watchdog clocksource. */ | 219 | /* Check the deviation from the watchdog clocksource. */ |
220 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 220 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { |
221 | pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", | 221 | pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", |
222 | cs->name); | 222 | cs->name); |
223 | pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", | 223 | pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", |
@@ -479,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
479 | * return half the number of nanoseconds the hardware counter can technically | 479 | * return half the number of nanoseconds the hardware counter can technically |
480 | * cover. This is done so that we can potentially detect problems caused by | 480 | * cover. This is done so that we can potentially detect problems caused by |
481 | * delayed timers or bad hardware, which might result in time intervals that | 481 | * delayed timers or bad hardware, which might result in time intervals that |
482 | * are larger then what the math used can handle without overflows. | 482 | * are larger than what the math used can handle without overflows. |
483 | */ | 483 | */ |
484 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) | 484 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) |
485 | { | 485 | { |
@@ -595,16 +595,15 @@ static void __clocksource_select(bool skipcur) | |||
595 | */ | 595 | */ |
596 | static void clocksource_select(void) | 596 | static void clocksource_select(void) |
597 | { | 597 | { |
598 | return __clocksource_select(false); | 598 | __clocksource_select(false); |
599 | } | 599 | } |
600 | 600 | ||
601 | static void clocksource_select_fallback(void) | 601 | static void clocksource_select_fallback(void) |
602 | { | 602 | { |
603 | return __clocksource_select(true); | 603 | __clocksource_select(true); |
604 | } | 604 | } |
605 | 605 | ||
606 | #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ | 606 | #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ |
607 | |||
608 | static inline void clocksource_select(void) { } | 607 | static inline void clocksource_select(void) { } |
609 | static inline void clocksource_select_fallback(void) { } | 608 | static inline void clocksource_select_fallback(void) { } |
610 | 609 | ||
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 457a373e2181..435b8850dd80 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
@@ -59,7 +59,7 @@ | |||
59 | /* | 59 | /* |
60 | * The timer bases: | 60 | * The timer bases: |
61 | * | 61 | * |
62 | * There are more clockids then hrtimer bases. Thus, we index | 62 | * There are more clockids than hrtimer bases. Thus, we index |
63 | * into the timer bases by the hrtimer_base_type enum. When trying | 63 | * into the timer bases by the hrtimer_base_type enum. When trying |
64 | * to reach a base using a clockid, hrtimer_clockid_to_base() | 64 | * to reach a base using a clockid, hrtimer_clockid_to_base() |
65 | * is used to convert from clockid to the proper hrtimer_base_type. | 65 | * is used to convert from clockid to the proper hrtimer_base_type. |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index df68cb875248..149cc8086aea 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -99,7 +99,7 @@ static time64_t ntp_next_leap_sec = TIME64_MAX; | |||
99 | static int pps_valid; /* signal watchdog counter */ | 99 | static int pps_valid; /* signal watchdog counter */ |
100 | static long pps_tf[3]; /* phase median filter */ | 100 | static long pps_tf[3]; /* phase median filter */ |
101 | static long pps_jitter; /* current jitter (ns) */ | 101 | static long pps_jitter; /* current jitter (ns) */ |
102 | static struct timespec pps_fbase; /* beginning of the last freq interval */ | 102 | static struct timespec64 pps_fbase; /* beginning of the last freq interval */ |
103 | static int pps_shift; /* current interval duration (s) (shift) */ | 103 | static int pps_shift; /* current interval duration (s) (shift) */ |
104 | static int pps_intcnt; /* interval counter */ | 104 | static int pps_intcnt; /* interval counter */ |
105 | static s64 pps_freq; /* frequency offset (scaled ns/s) */ | 105 | static s64 pps_freq; /* frequency offset (scaled ns/s) */ |
@@ -509,7 +509,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); | |||
509 | static void sync_cmos_clock(struct work_struct *work) | 509 | static void sync_cmos_clock(struct work_struct *work) |
510 | { | 510 | { |
511 | struct timespec64 now; | 511 | struct timespec64 now; |
512 | struct timespec next; | 512 | struct timespec64 next; |
513 | int fail = 1; | 513 | int fail = 1; |
514 | 514 | ||
515 | /* | 515 | /* |
@@ -559,7 +559,7 @@ static void sync_cmos_clock(struct work_struct *work) | |||
559 | next.tv_nsec -= NSEC_PER_SEC; | 559 | next.tv_nsec -= NSEC_PER_SEC; |
560 | } | 560 | } |
561 | queue_delayed_work(system_power_efficient_wq, | 561 | queue_delayed_work(system_power_efficient_wq, |
562 | &sync_cmos_work, timespec_to_jiffies(&next)); | 562 | &sync_cmos_work, timespec64_to_jiffies(&next)); |
563 | } | 563 | } |
564 | 564 | ||
565 | void ntp_notify_cmos_timer(void) | 565 | void ntp_notify_cmos_timer(void) |
@@ -773,13 +773,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) | |||
773 | * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] | 773 | * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] |
774 | * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ | 774 | * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ |
775 | struct pps_normtime { | 775 | struct pps_normtime { |
776 | __kernel_time_t sec; /* seconds */ | 776 | s64 sec; /* seconds */ |
777 | long nsec; /* nanoseconds */ | 777 | long nsec; /* nanoseconds */ |
778 | }; | 778 | }; |
779 | 779 | ||
780 | /* normalize the timestamp so that nsec is in the | 780 | /* normalize the timestamp so that nsec is in the |
781 | ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ | 781 | ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ |
782 | static inline struct pps_normtime pps_normalize_ts(struct timespec ts) | 782 | static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) |
783 | { | 783 | { |
784 | struct pps_normtime norm = { | 784 | struct pps_normtime norm = { |
785 | .sec = ts.tv_sec, | 785 | .sec = ts.tv_sec, |
@@ -861,7 +861,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) | |||
861 | pps_errcnt++; | 861 | pps_errcnt++; |
862 | pps_dec_freq_interval(); | 862 | pps_dec_freq_interval(); |
863 | printk_deferred(KERN_ERR | 863 | printk_deferred(KERN_ERR |
864 | "hardpps: PPSERROR: interval too long - %ld s\n", | 864 | "hardpps: PPSERROR: interval too long - %lld s\n", |
865 | freq_norm.sec); | 865 | freq_norm.sec); |
866 | return 0; | 866 | return 0; |
867 | } | 867 | } |
@@ -948,7 +948,7 @@ static void hardpps_update_phase(long error) | |||
948 | * This code is based on David Mills's reference nanokernel | 948 | * This code is based on David Mills's reference nanokernel |
949 | * implementation. It was mostly rewritten but keeps the same idea. | 949 | * implementation. It was mostly rewritten but keeps the same idea. |
950 | */ | 950 | */ |
951 | void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | 951 | void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) |
952 | { | 952 | { |
953 | struct pps_normtime pts_norm, freq_norm; | 953 | struct pps_normtime pts_norm, freq_norm; |
954 | 954 | ||
@@ -969,7 +969,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | |||
969 | } | 969 | } |
970 | 970 | ||
971 | /* ok, now we have a base for frequency calculation */ | 971 | /* ok, now we have a base for frequency calculation */ |
972 | freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); | 972 | freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); |
973 | 973 | ||
974 | /* check that the signal is in the range | 974 | /* check that the signal is in the range |
975 | * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ | 975 | * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ |
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 65430504ca26..af924470eac0 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h | |||
@@ -9,5 +9,5 @@ extern ktime_t ntp_get_next_leap(void); | |||
9 | extern int second_overflow(unsigned long secs); | 9 | extern int second_overflow(unsigned long secs); |
10 | extern int ntp_validate_timex(struct timex *); | 10 | extern int ntp_validate_timex(struct timex *); |
11 | extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); | 11 | extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); |
12 | extern void __hardpps(const struct timespec *, const struct timespec *); | 12 | extern void __hardpps(const struct timespec64 *, const struct timespec64 *); |
13 | #endif /* _LINUX_NTP_INTERNAL_H */ | 13 | #endif /* _LINUX_NTP_INTERNAL_H */ |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 892e3dae0aac..f5e86d282d52 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -249,7 +249,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
249 | * but barriers are not required because update_gt_cputime() | 249 | * but barriers are not required because update_gt_cputime() |
250 | * can handle concurrent updates. | 250 | * can handle concurrent updates. |
251 | */ | 251 | */ |
252 | WRITE_ONCE(cputimer->running, 1); | 252 | WRITE_ONCE(cputimer->running, true); |
253 | } | 253 | } |
254 | sample_cputime_atomic(times, &cputimer->cputime_atomic); | 254 | sample_cputime_atomic(times, &cputimer->cputime_atomic); |
255 | } | 255 | } |
@@ -864,6 +864,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
864 | unsigned long long expires; | 864 | unsigned long long expires; |
865 | unsigned long soft; | 865 | unsigned long soft; |
866 | 866 | ||
867 | /* | ||
868 | * If cputime_expires is zero, then there are no active | ||
869 | * per thread CPU timers. | ||
870 | */ | ||
871 | if (task_cputime_zero(&tsk->cputime_expires)) | ||
872 | return; | ||
873 | |||
867 | expires = check_timers_list(timers, firing, prof_ticks(tsk)); | 874 | expires = check_timers_list(timers, firing, prof_ticks(tsk)); |
868 | tsk_expires->prof_exp = expires_to_cputime(expires); | 875 | tsk_expires->prof_exp = expires_to_cputime(expires); |
869 | 876 | ||
@@ -911,7 +918,7 @@ static inline void stop_process_timers(struct signal_struct *sig) | |||
911 | struct thread_group_cputimer *cputimer = &sig->cputimer; | 918 | struct thread_group_cputimer *cputimer = &sig->cputimer; |
912 | 919 | ||
913 | /* Turn off cputimer->running. This is done without locking. */ | 920 | /* Turn off cputimer->running. This is done without locking. */ |
914 | WRITE_ONCE(cputimer->running, 0); | 921 | WRITE_ONCE(cputimer->running, false); |
915 | } | 922 | } |
916 | 923 | ||
917 | static u32 onecputick; | 924 | static u32 onecputick; |
@@ -962,6 +969,19 @@ static void check_process_timers(struct task_struct *tsk, | |||
962 | unsigned long soft; | 969 | unsigned long soft; |
963 | 970 | ||
964 | /* | 971 | /* |
972 | * If cputimer is not running, then there are no active | ||
973 | * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). | ||
974 | */ | ||
975 | if (!READ_ONCE(tsk->signal->cputimer.running)) | ||
976 | return; | ||
977 | |||
978 | /* | ||
979 | * Signify that a thread is checking for process timers. | ||
980 | * Write access to this field is protected by the sighand lock. | ||
981 | */ | ||
982 | sig->cputimer.checking_timer = true; | ||
983 | |||
984 | /* | ||
965 | * Collect the current process totals. | 985 | * Collect the current process totals. |
966 | */ | 986 | */ |
967 | thread_group_cputimer(tsk, &cputime); | 987 | thread_group_cputimer(tsk, &cputime); |
@@ -1015,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk, | |||
1015 | sig->cputime_expires.sched_exp = sched_expires; | 1035 | sig->cputime_expires.sched_exp = sched_expires; |
1016 | if (task_cputime_zero(&sig->cputime_expires)) | 1036 | if (task_cputime_zero(&sig->cputime_expires)) |
1017 | stop_process_timers(sig); | 1037 | stop_process_timers(sig); |
1038 | |||
1039 | sig->cputimer.checking_timer = false; | ||
1018 | } | 1040 | } |
1019 | 1041 | ||
1020 | /* | 1042 | /* |
@@ -1117,24 +1139,33 @@ static inline int task_cputime_expired(const struct task_cputime *sample, | |||
1117 | static inline int fastpath_timer_check(struct task_struct *tsk) | 1139 | static inline int fastpath_timer_check(struct task_struct *tsk) |
1118 | { | 1140 | { |
1119 | struct signal_struct *sig; | 1141 | struct signal_struct *sig; |
1120 | cputime_t utime, stime; | ||
1121 | |||
1122 | task_cputime(tsk, &utime, &stime); | ||
1123 | 1142 | ||
1124 | if (!task_cputime_zero(&tsk->cputime_expires)) { | 1143 | if (!task_cputime_zero(&tsk->cputime_expires)) { |
1125 | struct task_cputime task_sample = { | 1144 | struct task_cputime task_sample; |
1126 | .utime = utime, | ||
1127 | .stime = stime, | ||
1128 | .sum_exec_runtime = tsk->se.sum_exec_runtime | ||
1129 | }; | ||
1130 | 1145 | ||
1146 | task_cputime(tsk, &task_sample.utime, &task_sample.stime); | ||
1147 | task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; | ||
1131 | if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) | 1148 | if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) |
1132 | return 1; | 1149 | return 1; |
1133 | } | 1150 | } |
1134 | 1151 | ||
1135 | sig = tsk->signal; | 1152 | sig = tsk->signal; |
1136 | /* Check if cputimer is running. This is accessed without locking. */ | 1153 | /* |
1137 | if (READ_ONCE(sig->cputimer.running)) { | 1154 | * Check if thread group timers expired when the cputimer is |
1155 | * running and no other thread in the group is already checking | ||
1156 | * for thread group cputimers. These fields are read without the | ||
1157 | * sighand lock. However, this is fine because this is meant to | ||
1158 | * be a fastpath heuristic to determine whether we should try to | ||
1159 | * acquire the sighand lock to check/handle timers. | ||
1160 | * | ||
1161 | * In the worst case scenario, if 'running' or 'checking_timer' gets | ||
1162 | * set but the current thread doesn't see the change yet, we'll wait | ||
1163 | * until the next thread in the group gets a scheduler interrupt to | ||
1164 | * handle the timer. This isn't an issue in practice because these | ||
1165 | * types of delays with signals actually getting sent are expected. | ||
1166 | */ | ||
1167 | if (READ_ONCE(sig->cputimer.running) && | ||
1168 | !READ_ONCE(sig->cputimer.checking_timer)) { | ||
1138 | struct task_cputime group_sample; | 1169 | struct task_cputime group_sample; |
1139 | 1170 | ||
1140 | sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); | 1171 | sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); |
@@ -1174,12 +1205,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1174 | * put them on the firing list. | 1205 | * put them on the firing list. |
1175 | */ | 1206 | */ |
1176 | check_thread_timers(tsk, &firing); | 1207 | check_thread_timers(tsk, &firing); |
1177 | /* | 1208 | |
1178 | * If there are any active process wide timers (POSIX 1.b, itimers, | 1209 | check_process_timers(tsk, &firing); |
1179 | * RLIMIT_CPU) cputimer must be running. | ||
1180 | */ | ||
1181 | if (READ_ONCE(tsk->signal->cputimer.running)) | ||
1182 | check_process_timers(tsk, &firing); | ||
1183 | 1210 | ||
1184 | /* | 1211 | /* |
1185 | * We must release these locks before taking any timer's lock. | 1212 | * We must release these locks before taking any timer's lock. |
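
The reworked fastpath skips the sighand lock unless group timers are both armed ('running') and nobody else is already servicing them ('checking_timer'); both flags are read racily on purpose, since a missed update only delays handling until the next tick. The pattern can be sketched in userspace with C11 atomics; all names below are illustrative.

/* Hedged analogue of the running/checking_timer fastpath: many threads may
 * observe the flags locklessly, but only one at a time drops into the
 * expensive, lock-protected slow path that expires group timers.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>
#include <stdio.h>

static atomic_bool running = true;	/* group timers armed */
static atomic_bool checking_timer;	/* someone already in the slow path */
static pthread_mutex_t sighand_lock = PTHREAD_MUTEX_INITIALIZER;

static void scheduler_tick(int tid)
{
	/* lockless fastpath: both reads may be stale, which is tolerated */
	if (!atomic_load(&running) || atomic_load(&checking_timer))
		return;

	pthread_mutex_lock(&sighand_lock);
	atomic_store(&checking_timer, true);

	printf("thread %d expiring group timers\n", tid);
	atomic_store(&running, false);		/* all timers consumed */

	atomic_store(&checking_timer, false);
	pthread_mutex_unlock(&sighand_lock);
}

int main(void)
{
	for (int tid = 0; tid < 3; tid++)
		scheduler_tick(tid);		/* only the first does real work */
	return 0;
}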
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index c7388dee8635..c48688904f9f 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc | |||
@@ -39,7 +39,7 @@ define fmuls(b,n,d) { | |||
39 | } | 39 | } |
40 | 40 | ||
41 | define timeconst(hz) { | 41 | define timeconst(hz) { |
42 | print "/* Automatically generated by kernel/timeconst.bc */\n" | 42 | print "/* Automatically generated by kernel/time/timeconst.bc */\n" |
43 | print "/* Time conversion constants for HZ == ", hz, " */\n" | 43 | print "/* Time conversion constants for HZ == ", hz, " */\n" |
44 | print "\n" | 44 | print "\n" |
45 | 45 | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3739ac6aa473..d563c1960302 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -849,7 +849,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); | |||
849 | #ifdef CONFIG_NTP_PPS | 849 | #ifdef CONFIG_NTP_PPS |
850 | 850 | ||
851 | /** | 851 | /** |
852 | * getnstime_raw_and_real - get day and raw monotonic time in timespec format | 852 | * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format |
853 | * @ts_raw: pointer to the timespec to be set to raw monotonic time | 853 | * @ts_raw: pointer to the timespec to be set to raw monotonic time |
854 | * @ts_real: pointer to the timespec to be set to the time of day | 854 | * @ts_real: pointer to the timespec to be set to the time of day |
855 | * | 855 | * |
@@ -857,7 +857,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds); | |||
857 | * same time atomically and stores the resulting timestamps in timespec | 857 | * same time atomically and stores the resulting timestamps in timespec |
858 | * format. | 858 | * format. |
859 | */ | 859 | */ |
860 | void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | 860 | void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) |
861 | { | 861 | { |
862 | struct timekeeper *tk = &tk_core.timekeeper; | 862 | struct timekeeper *tk = &tk_core.timekeeper; |
863 | unsigned long seq; | 863 | unsigned long seq; |
@@ -868,7 +868,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
868 | do { | 868 | do { |
869 | seq = read_seqcount_begin(&tk_core.seq); | 869 | seq = read_seqcount_begin(&tk_core.seq); |
870 | 870 | ||
871 | *ts_raw = timespec64_to_timespec(tk->raw_time); | 871 | *ts_raw = tk->raw_time; |
872 | ts_real->tv_sec = tk->xtime_sec; | 872 | ts_real->tv_sec = tk->xtime_sec; |
873 | ts_real->tv_nsec = 0; | 873 | ts_real->tv_nsec = 0; |
874 | 874 | ||
@@ -877,10 +877,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
877 | 877 | ||
878 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 878 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
879 | 879 | ||
880 | timespec_add_ns(ts_raw, nsecs_raw); | 880 | timespec64_add_ns(ts_raw, nsecs_raw); |
881 | timespec_add_ns(ts_real, nsecs_real); | 881 | timespec64_add_ns(ts_real, nsecs_real); |
882 | } | 882 | } |
883 | EXPORT_SYMBOL(getnstime_raw_and_real); | 883 | EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); |
884 | 884 | ||
885 | #endif /* CONFIG_NTP_PPS */ | 885 | #endif /* CONFIG_NTP_PPS */ |
886 | 886 | ||
@@ -1251,7 +1251,7 @@ void __init timekeeping_init(void) | |||
1251 | set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); | 1251 | set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); |
1252 | tk_set_wall_to_mono(tk, tmp); | 1252 | tk_set_wall_to_mono(tk, tmp); |
1253 | 1253 | ||
1254 | timekeeping_update(tk, TK_MIRROR); | 1254 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); |
1255 | 1255 | ||
1256 | write_seqcount_end(&tk_core.seq); | 1256 | write_seqcount_end(&tk_core.seq); |
1257 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1257 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); |
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, | |||
1614 | negative = (tick_error < 0); | 1614 | negative = (tick_error < 0); |
1615 | 1615 | ||
1616 | /* Sort out the magnitude of the correction */ | 1616 | /* Sort out the magnitude of the correction */ |
1617 | tick_error = abs64(tick_error); | 1617 | tick_error = abs(tick_error); |
1618 | for (adj = 0; tick_error > interval; adj++) | 1618 | for (adj = 0; tick_error > interval; adj++) |
1619 | tick_error >>= 1; | 1619 | tick_error >>= 1; |
1620 | 1620 | ||
@@ -1674,7 +1674,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
1674 | /** | 1674 | /** |
1675 | * accumulate_nsecs_to_secs - Accumulates nsecs into secs | 1675 | * accumulate_nsecs_to_secs - Accumulates nsecs into secs |
1676 | * | 1676 | * |
1677 | * Helper function that accumulates a the nsecs greater then a second | 1677 | * Helper function that accumulates the nsecs greater than a second |
1678 | * from the xtime_nsec field to the xtime_secs field. | 1678 | * from the xtime_nsec field to the xtime_secs field. |
1679 | * It also calls into the NTP code to handle leapsecond processing. | 1679 | * It also calls into the NTP code to handle leapsecond processing. |
1680 | * | 1680 | * |
@@ -1726,7 +1726,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
1726 | cycle_t interval = tk->cycle_interval << shift; | 1726 | cycle_t interval = tk->cycle_interval << shift; |
1727 | u64 raw_nsecs; | 1727 | u64 raw_nsecs; |
1728 | 1728 | ||
1729 | /* If the offset is smaller then a shifted interval, do nothing */ | 1729 | /* If the offset is smaller than a shifted interval, do nothing */ |
1730 | if (offset < interval) | 1730 | if (offset < interval) |
1731 | return offset; | 1731 | return offset; |
1732 | 1732 | ||
@@ -2025,7 +2025,7 @@ int do_adjtimex(struct timex *txc) | |||
2025 | /** | 2025 | /** |
2026 | * hardpps() - Accessor function to NTP __hardpps function | 2026 | * hardpps() - Accessor function to NTP __hardpps function |
2027 | */ | 2027 | */ |
2028 | void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) | 2028 | void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) |
2029 | { | 2029 | { |
2030 | unsigned long flags; | 2030 | unsigned long flags; |
2031 | 2031 | ||
diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 84190f02b521..74591ba9474f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
@@ -461,10 +461,17 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) | |||
461 | 461 | ||
462 | static void timer_stats_account_timer(struct timer_list *timer) | 462 | static void timer_stats_account_timer(struct timer_list *timer) |
463 | { | 463 | { |
464 | if (likely(!timer->start_site)) | 464 | void *site; |
465 | |||
466 | /* | ||
467 | * start_site can be concurrently reset by | ||
468 | * timer_stats_timer_clear_start_info() | ||
469 | */ | ||
470 | site = READ_ONCE(timer->start_site); | ||
471 | if (likely(!site)) | ||
465 | return; | 472 | return; |
466 | 473 | ||
467 | timer_stats_update_stats(timer, timer->start_pid, timer->start_site, | 474 | timer_stats_update_stats(timer, timer->start_pid, site, |
468 | timer->function, timer->start_comm, | 475 | timer->function, timer->start_comm, |
469 | timer->flags); | 476 | timer->flags); |
470 | } | 477 | } |
@@ -867,7 +874,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
867 | if (mask == 0) | 874 | if (mask == 0) |
868 | return expires; | 875 | return expires; |
869 | 876 | ||
870 | bit = find_last_bit(&mask, BITS_PER_LONG); | 877 | bit = __fls(mask); |
871 | 878 | ||
872 | mask = (1UL << bit) - 1; | 879 | mask = (1UL << bit) - 1; |
873 | 880 | ||
diff --git a/kernel/torture.c b/kernel/torture.c index 3e4840633d3e..44aa462d033f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
@@ -523,6 +523,7 @@ static int stutter; | |||
523 | */ | 523 | */ |
524 | void stutter_wait(const char *title) | 524 | void stutter_wait(const char *title) |
525 | { | 525 | { |
526 | cond_resched_rcu_qs(); | ||
526 | while (READ_ONCE(stutter_pause_test) || | 527 | while (READ_ONCE(stutter_pause_test) || |
527 | (torture_runnable && !READ_ONCE(*torture_runnable))) { | 528 | (torture_runnable && !READ_ONCE(*torture_runnable))) { |
528 | if (stutter_pause_test) | 529 | if (stutter_pause_test) |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8d6363f42169..e45db6b0d878 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -434,7 +434,7 @@ config UPROBE_EVENT | |||
434 | 434 | ||
435 | config BPF_EVENTS | 435 | config BPF_EVENTS |
436 | depends on BPF_SYSCALL | 436 | depends on BPF_SYSCALL |
437 | depends on KPROBE_EVENT || UPROBE_EVENT | 437 | depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS |
438 | bool | 438 | bool |
439 | default y | 439 | default y |
440 | help | 440 | help |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b2fcf472774e..a990824c8604 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
437 | struct block_device *bdev, | 437 | struct block_device *bdev, |
438 | struct blk_user_trace_setup *buts) | 438 | struct blk_user_trace_setup *buts) |
439 | { | 439 | { |
440 | struct blk_trace *old_bt, *bt = NULL; | 440 | struct blk_trace *bt = NULL; |
441 | struct dentry *dir = NULL; | 441 | struct dentry *dir = NULL; |
442 | int ret; | 442 | int ret; |
443 | 443 | ||
@@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
519 | bt->trace_state = Blktrace_setup; | 519 | bt->trace_state = Blktrace_setup; |
520 | 520 | ||
521 | ret = -EBUSY; | 521 | ret = -EBUSY; |
522 | old_bt = xchg(&q->blk_trace, bt); | 522 | if (cmpxchg(&q->blk_trace, NULL, bt)) |
523 | if (old_bt) { | ||
524 | (void) xchg(&q->blk_trace, old_bt); | ||
525 | goto err; | 523 | goto err; |
526 | } | ||
527 | 524 | ||
528 | if (atomic_inc_return(&blk_probes_ref) == 1) | 525 | if (atomic_inc_return(&blk_probes_ref) == 1) |
529 | blk_register_tracepoints(); | 526 | blk_register_tracepoints(); |
@@ -1482,7 +1479,7 @@ static int blk_trace_remove_queue(struct request_queue *q) | |||
1482 | static int blk_trace_setup_queue(struct request_queue *q, | 1479 | static int blk_trace_setup_queue(struct request_queue *q, |
1483 | struct block_device *bdev) | 1480 | struct block_device *bdev) |
1484 | { | 1481 | { |
1485 | struct blk_trace *old_bt, *bt = NULL; | 1482 | struct blk_trace *bt = NULL; |
1486 | int ret = -ENOMEM; | 1483 | int ret = -ENOMEM; |
1487 | 1484 | ||
1488 | bt = kzalloc(sizeof(*bt), GFP_KERNEL); | 1485 | bt = kzalloc(sizeof(*bt), GFP_KERNEL); |
@@ -1498,12 +1495,9 @@ static int blk_trace_setup_queue(struct request_queue *q, | |||
1498 | 1495 | ||
1499 | blk_trace_setup_lba(bt, bdev); | 1496 | blk_trace_setup_lba(bt, bdev); |
1500 | 1497 | ||
1501 | old_bt = xchg(&q->blk_trace, bt); | 1498 | ret = -EBUSY; |
1502 | if (old_bt != NULL) { | 1499 | if (cmpxchg(&q->blk_trace, NULL, bt)) |
1503 | (void)xchg(&q->blk_trace, old_bt); | ||
1504 | ret = -EBUSY; | ||
1505 | goto free_bt; | 1500 | goto free_bt; |
1506 | } | ||
1507 | 1501 | ||
1508 | if (atomic_inc_return(&blk_probes_ref) == 1) | 1502 | if (atomic_inc_return(&blk_probes_ref) == 1) |
1509 | blk_register_tracepoints(); | 1503 | blk_register_tracepoints(); |
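
Replacing xchg() with cmpxchg(&q->blk_trace, NULL, bt) closes the window where two concurrent setups each swapped their blk_trace in and then swapped the other's back. The claim-a-free-slot idiom looks like this in portable C11; the kernel primitive differs only in spelling, and the stub type below is illustrative.

/* Hedged sketch of the cmpxchg registration pattern: the slot is taken only
 * if it is still NULL, so a concurrent second caller simply fails with
 * -EBUSY instead of momentarily replacing the first caller's pointer.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

struct blk_trace_stub { int id; };

static _Atomic(struct blk_trace_stub *) q_blk_trace;

static int blk_trace_claim(struct blk_trace_stub *bt)
{
	struct blk_trace_stub *expected = NULL;

	/* succeeds for exactly one caller; everyone else sees non-NULL */
	if (!atomic_compare_exchange_strong(&q_blk_trace, &expected, bt))
		return -EBUSY;
	return 0;
}

int main(void)
{
	struct blk_trace_stub first = { .id = 1 }, second = { .id = 2 };

	printf("first setup:  %d\n", blk_trace_claim(&first));	/* 0 */
	printf("second setup: %d\n", blk_trace_claim(&second));	/* -EBUSY */
	return 0;
}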
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0fe96c7c8803..4228fd3682c3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) | |||
199 | if (!event) | 199 | if (!event) |
200 | return -ENOENT; | 200 | return -ENOENT; |
201 | 201 | ||
202 | /* make sure event is local and doesn't have pmu::count */ | ||
203 | if (event->oncpu != smp_processor_id() || | ||
204 | event->pmu->count) | ||
205 | return -EINVAL; | ||
206 | |||
202 | /* | 207 | /* |
203 | * we don't know if the function is run successfully by the | 208 | * we don't know if the function is run successfully by the |
204 | * return value. It can be judged in other places, such as | 209 | * return value. It can be judged in other places, such as |
@@ -207,14 +212,58 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) | |||
207 | return perf_event_read_local(event); | 212 | return perf_event_read_local(event); |
208 | } | 213 | } |
209 | 214 | ||
210 | const struct bpf_func_proto bpf_perf_event_read_proto = { | 215 | static const struct bpf_func_proto bpf_perf_event_read_proto = { |
211 | .func = bpf_perf_event_read, | 216 | .func = bpf_perf_event_read, |
212 | .gpl_only = false, | 217 | .gpl_only = true, |
213 | .ret_type = RET_INTEGER, | 218 | .ret_type = RET_INTEGER, |
214 | .arg1_type = ARG_CONST_MAP_PTR, | 219 | .arg1_type = ARG_CONST_MAP_PTR, |
215 | .arg2_type = ARG_ANYTHING, | 220 | .arg2_type = ARG_ANYTHING, |
216 | }; | 221 | }; |
217 | 222 | ||
223 | static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) | ||
224 | { | ||
225 | struct pt_regs *regs = (struct pt_regs *) (long) r1; | ||
226 | struct bpf_map *map = (struct bpf_map *) (long) r2; | ||
227 | struct bpf_array *array = container_of(map, struct bpf_array, map); | ||
228 | void *data = (void *) (long) r4; | ||
229 | struct perf_sample_data sample_data; | ||
230 | struct perf_event *event; | ||
231 | struct perf_raw_record raw = { | ||
232 | .size = size, | ||
233 | .data = data, | ||
234 | }; | ||
235 | |||
236 | if (unlikely(index >= array->map.max_entries)) | ||
237 | return -E2BIG; | ||
238 | |||
239 | event = (struct perf_event *)array->ptrs[index]; | ||
240 | if (unlikely(!event)) | ||
241 | return -ENOENT; | ||
242 | |||
243 | if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || | ||
244 | event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) | ||
245 | return -EINVAL; | ||
246 | |||
247 | if (unlikely(event->oncpu != smp_processor_id())) | ||
248 | return -EOPNOTSUPP; | ||
249 | |||
250 | perf_sample_data_init(&sample_data, 0, 0); | ||
251 | sample_data.raw = &raw; | ||
252 | perf_event_output(event, &sample_data, regs); | ||
253 | return 0; | ||
254 | } | ||
255 | |||
256 | static const struct bpf_func_proto bpf_perf_event_output_proto = { | ||
257 | .func = bpf_perf_event_output, | ||
258 | .gpl_only = true, | ||
259 | .ret_type = RET_INTEGER, | ||
260 | .arg1_type = ARG_PTR_TO_CTX, | ||
261 | .arg2_type = ARG_CONST_MAP_PTR, | ||
262 | .arg3_type = ARG_ANYTHING, | ||
263 | .arg4_type = ARG_PTR_TO_STACK, | ||
264 | .arg5_type = ARG_CONST_STACK_SIZE, | ||
265 | }; | ||
266 | |||
218 | static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) | 267 | static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) |
219 | { | 268 | { |
220 | switch (func_id) { | 269 | switch (func_id) { |
@@ -242,6 +291,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func | |||
242 | return &bpf_get_smp_processor_id_proto; | 291 | return &bpf_get_smp_processor_id_proto; |
243 | case BPF_FUNC_perf_event_read: | 292 | case BPF_FUNC_perf_event_read: |
244 | return &bpf_perf_event_read_proto; | 293 | return &bpf_perf_event_read_proto; |
294 | case BPF_FUNC_perf_event_output: | ||
295 | return &bpf_perf_event_output_proto; | ||
245 | default: | 296 | default: |
246 | return NULL; | 297 | return NULL; |
247 | } | 298 | } |
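
bpf_perf_event_output() lets a tracing program push an arbitrary raw sample into a perf event stored in a BPF_MAP_TYPE_PERF_EVENT_ARRAY; because the kernel insists the target event is bound to the current CPU, programs typically index the map with the CPU id. A hedged sketch of the program side follows, assuming the usual samples/bpf-style helper header; the map name, section names and traced kprobe are illustrative, not taken from this patch.

/* Hedged sketch of a kprobe program using the new helper.  Userspace is
 * expected to open one PERF_COUNT_SW_BPF_OUTPUT event per CPU and store it
 * at map index == cpu, matching the oncpu check in bpf_perf_event_output().
 */
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"	/* assumed samples/bpf helper header */

struct bpf_map_def SEC("maps") events = {
	.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size    = sizeof(int),
	.value_size  = sizeof(u32),
	.max_entries = 128,	/* >= number of possible CPUs */
};

struct event {
	u64 pid_tgid;
	u64 cookie;
};

SEC("kprobe/sys_write")
int trace_sys_write(struct pt_regs *ctx)
{
	struct event e = {
		.pid_tgid = bpf_get_current_pid_tgid(),
		.cookie   = 0xdead,
	};

	/* index selects the per-CPU perf event; size is the raw record length */
	bpf_perf_event_output(ctx, &events, bpf_get_smp_processor_id(),
			      &e, sizeof(e));
	return 0;
}

char _license[] SEC("license") = "GPL";	/* the helper is gpl_only */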
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ea2725053771..3f743b147247 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -5708,7 +5708,7 @@ free: | |||
5708 | } | 5708 | } |
5709 | 5709 | ||
5710 | static void | 5710 | static void |
5711 | ftrace_graph_probe_sched_switch(void *ignore, | 5711 | ftrace_graph_probe_sched_switch(void *ignore, bool preempt, |
5712 | struct task_struct *prev, struct task_struct *next) | 5712 | struct task_struct *prev, struct task_struct *next) |
5713 | { | 5713 | { |
5714 | unsigned long long timestamp; | 5714 | unsigned long long timestamp; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bee1e1530052..6bbc5f652355 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -506,7 +506,7 @@ check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task) | |||
506 | } | 506 | } |
507 | 507 | ||
508 | static void | 508 | static void |
509 | event_filter_pid_sched_switch_probe_pre(void *data, | 509 | event_filter_pid_sched_switch_probe_pre(void *data, bool preempt, |
510 | struct task_struct *prev, struct task_struct *next) | 510 | struct task_struct *prev, struct task_struct *next) |
511 | { | 511 | { |
512 | struct trace_array *tr = data; | 512 | struct trace_array *tr = data; |
@@ -520,7 +520,7 @@ event_filter_pid_sched_switch_probe_pre(void *data, | |||
520 | } | 520 | } |
521 | 521 | ||
522 | static void | 522 | static void |
523 | event_filter_pid_sched_switch_probe_post(void *data, | 523 | event_filter_pid_sched_switch_probe_post(void *data, bool preempt, |
524 | struct task_struct *prev, struct task_struct *next) | 524 | struct task_struct *prev, struct task_struct *next) |
525 | { | 525 | { |
526 | struct trace_array *tr = data; | 526 | struct trace_array *tr = data; |
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index f270088e9929..4c896a0101bd 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c | |||
@@ -16,7 +16,8 @@ static int sched_ref; | |||
16 | static DEFINE_MUTEX(sched_register_mutex); | 16 | static DEFINE_MUTEX(sched_register_mutex); |
17 | 17 | ||
18 | static void | 18 | static void |
19 | probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) | 19 | probe_sched_switch(void *ignore, bool preempt, |
20 | struct task_struct *prev, struct task_struct *next) | ||
20 | { | 21 | { |
21 | if (unlikely(!sched_ref)) | 22 | if (unlikely(!sched_ref)) |
22 | return; | 23 | return; |
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 855c2c7612e8..9d4399b553a3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -424,7 +424,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, | |||
424 | } | 424 | } |
425 | 425 | ||
426 | static void notrace | 426 | static void notrace |
427 | probe_wakeup_sched_switch(void *ignore, | 427 | probe_wakeup_sched_switch(void *ignore, bool preempt, |
428 | struct task_struct *prev, struct task_struct *next) | 428 | struct task_struct *prev, struct task_struct *next) |
429 | { | 429 | { |
430 | struct trace_array_cpu *data; | 430 | struct trace_array_cpu *data; |
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0bd212af406c..dda9e6742950 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -91,9 +91,19 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
91 | if (!object_is_on_stack(stack)) | 91 | if (!object_is_on_stack(stack)) |
92 | return; | 92 | return; |
93 | 93 | ||
94 | /* Can't do this from NMI context (can cause deadlocks) */ | ||
95 | if (in_nmi()) | ||
96 | return; | ||
97 | |||
94 | local_irq_save(flags); | 98 | local_irq_save(flags); |
95 | arch_spin_lock(&stack_trace_max_lock); | 99 | arch_spin_lock(&stack_trace_max_lock); |
96 | 100 | ||
101 | /* | ||
102 | * RCU may not be watching, make it see us. | ||
103 | * The stack trace code uses rcu_sched. | ||
104 | */ | ||
105 | rcu_irq_enter(); | ||
106 | |||
97 | /* In case another CPU set the tracer_frame on us */ | 107 | /* In case another CPU set the tracer_frame on us */ |
98 | if (unlikely(!frame_size)) | 108 | if (unlikely(!frame_size)) |
99 | this_size -= tracer_frame; | 109 | this_size -= tracer_frame; |
@@ -175,6 +185,7 @@ check_stack(unsigned long ip, unsigned long *stack) | |||
175 | } | 185 | } |
176 | 186 | ||
177 | out: | 187 | out: |
188 | rcu_irq_exit(); | ||
178 | arch_spin_unlock(&stack_trace_max_lock); | 189 | arch_spin_unlock(&stack_trace_max_lock); |
179 | local_irq_restore(flags); | 190 | local_irq_restore(flags); |
180 | } | 191 | } |
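
Two guards are added to check_stack() above: bail out entirely when running in NMI context, and bracket the locked region with rcu_irq_enter()/rcu_irq_exit() so RCU is watching while the stack is walked. A rough userspace model of that control flow, with the kernel primitives replaced by stubs:

#include <stdio.h>
#include <stdbool.h>

/* Stubs standing in for in_nmi(), arch_spin_lock/unlock() and
 * rcu_irq_enter()/rcu_irq_exit(); they only trace the control flow. */
static bool stub_in_nmi;
static bool in_nmi(void)        { return stub_in_nmi; }
static void lock_region(void)   { puts("lock"); }
static void unlock_region(void) { puts("unlock"); }
static void rcu_enter(void)     { puts("rcu enter"); }
static void rcu_exit(void)      { puts("rcu exit"); }

static void check_stack_model(void)
{
	/* Can't take the lock from NMI context: just return. */
	if (in_nmi())
		return;

	lock_region();
	rcu_enter();		/* make sure "RCU" is watching while we work */

	puts("walk the stack, record the new maximum");

	rcu_exit();		/* paired exit before dropping the lock */
	unlock_region();
}

int main(void)
{
	check_stack_model();	/* normal path: enter/exit are paired */
	stub_in_nmi = true;
	check_stack_model();	/* NMI path: nothing is taken at all */
	return 0;
}
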
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 64ed1c37bd1f..18f34cf75f74 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10; | |||
57 | 57 | ||
58 | #ifdef CONFIG_SMP | 58 | #ifdef CONFIG_SMP |
59 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 59 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
60 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; | ||
60 | #else | 61 | #else |
61 | #define sysctl_softlockup_all_cpu_backtrace 0 | 62 | #define sysctl_softlockup_all_cpu_backtrace 0 |
63 | #define sysctl_hardlockup_all_cpu_backtrace 0 | ||
62 | #endif | 64 | #endif |
63 | static struct cpumask watchdog_cpumask __read_mostly; | 65 | static struct cpumask watchdog_cpumask __read_mostly; |
64 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | 66 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); |
@@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn; | |||
110 | * Should we panic when a soft-lockup or hard-lockup occurs: | 112 | * Should we panic when a soft-lockup or hard-lockup occurs: |
111 | */ | 113 | */ |
112 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 114 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
113 | static int hardlockup_panic = | 115 | unsigned int __read_mostly hardlockup_panic = |
114 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 116 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; |
117 | static unsigned long hardlockup_allcpu_dumped; | ||
115 | /* | 118 | /* |
116 | * We may not want to enable hard lockup detection by default in all cases, | 119 | * We may not want to enable hard lockup detection by default in all cases, |
117 | * for example when running the kernel as a guest on a hypervisor. In these | 120 | * for example when running the kernel as a guest on a hypervisor. In these |
@@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str) | |||
173 | return 1; | 176 | return 1; |
174 | } | 177 | } |
175 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); | 178 | __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); |
179 | static int __init hardlockup_all_cpu_backtrace_setup(char *str) | ||
180 | { | ||
181 | sysctl_hardlockup_all_cpu_backtrace = | ||
182 | !!simple_strtol(str, NULL, 0); | ||
183 | return 1; | ||
184 | } | ||
185 | __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); | ||
176 | #endif | 186 | #endif |
177 | 187 | ||
178 | /* | 188 | /* |
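
The new hardlockup_all_cpu_backtrace= boot parameter mirrors the existing softlockup one: the argument is parsed with simple_strtol() and squashed to 0/1. A small sketch of the same normalization using the standard strtol() as a stand-in:

#include <stdio.h>
#include <stdlib.h>

/* Parse a boot-parameter style value and squash it to 0 or 1,
 * the same "!!strtol(...)" idiom used in the setup function above. */
static int parse_bool_param(const char *str)
{
	return !!strtol(str, NULL, 0);
}

int main(void)
{
	printf("%d %d %d\n",
	       parse_bool_param("0"),	/* 0 */
	       parse_bool_param("1"),	/* 1 */
	       parse_bool_param("15"));	/* any non-zero value -> 1 */
	return 0;
}
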
@@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void) | |||
263 | 273 | ||
264 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 274 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
265 | /* watchdog detector functions */ | 275 | /* watchdog detector functions */ |
266 | static int is_hardlockup(void) | 276 | static bool is_hardlockup(void) |
267 | { | 277 | { |
268 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); | 278 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
269 | 279 | ||
270 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) | 280 | if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) |
271 | return 1; | 281 | return true; |
272 | 282 | ||
273 | __this_cpu_write(hrtimer_interrupts_saved, hrint); | 283 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
274 | return 0; | 284 | return false; |
275 | } | 285 | } |
276 | #endif | 286 | #endif |
277 | 287 | ||
@@ -279,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts) | |||
279 | { | 289 | { |
280 | unsigned long now = get_timestamp(); | 290 | unsigned long now = get_timestamp(); |
281 | 291 | ||
282 | if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { | 292 | if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ |
283 | /* Warn about unreasonable delays. */ | 293 | /* Warn about unreasonable delays. */ |
284 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 294 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
285 | return now - touch_ts; | 295 | return now - touch_ts; |
@@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
318 | */ | 328 | */ |
319 | if (is_hardlockup()) { | 329 | if (is_hardlockup()) { |
320 | int this_cpu = smp_processor_id(); | 330 | int this_cpu = smp_processor_id(); |
331 | struct pt_regs *regs = get_irq_regs(); | ||
321 | 332 | ||
322 | /* only print hardlockups once */ | 333 | /* only print hardlockups once */ |
323 | if (__this_cpu_read(hard_watchdog_warn) == true) | 334 | if (__this_cpu_read(hard_watchdog_warn) == true) |
324 | return; | 335 | return; |
325 | 336 | ||
326 | if (hardlockup_panic) | 337 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); |
327 | panic("Watchdog detected hard LOCKUP on cpu %d", | 338 | print_modules(); |
328 | this_cpu); | 339 | print_irqtrace_events(current); |
340 | if (regs) | ||
341 | show_regs(regs); | ||
329 | else | 342 | else |
330 | WARN(1, "Watchdog detected hard LOCKUP on cpu %d", | 343 | dump_stack(); |
331 | this_cpu); | 344 | |
345 | /* | ||
346 | * Perform all-CPU dump only once to avoid multiple hardlockups | ||
347 | * generating interleaving traces | ||
348 | */ | ||
349 | if (sysctl_hardlockup_all_cpu_backtrace && | ||
350 | !test_and_set_bit(0, &hardlockup_allcpu_dumped)) | ||
351 | trigger_allbutself_cpu_backtrace(); | ||
352 | |||
353 | if (hardlockup_panic) | ||
354 | panic("Hard LOCKUP"); | ||
332 | 355 | ||
333 | __this_cpu_write(hard_watchdog_warn, true); | 356 | __this_cpu_write(hard_watchdog_warn, true); |
334 | return; | 357 | return; |
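
The overflow callback now always prints a full report (modules, irqtrace, regs or stack), optionally triggers an all-CPU backtrace exactly once via test_and_set_bit() on hardlockup_allcpu_dumped, and only then panics if hardlockup_panic is set. A userspace model of the "dump all CPUs only once" part, using a C11 atomic flag in place of test_and_set_bit():

#include <stdio.h>
#include <stdatomic.h>

/* Stand-ins for the sysctl knob and the once-only marker. */
static int sysctl_hardlockup_all_cpu_backtrace = 1;
static atomic_flag allcpu_dumped = ATOMIC_FLAG_INIT;

static void report_hardlockup(int cpu)
{
	printf("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
	puts("  (modules, irqtrace, regs/stack dump would go here)");

	/* Only the first reporter triggers the all-CPU backtrace, so two
	 * simultaneous hardlockups don't interleave their traces. */
	if (sysctl_hardlockup_all_cpu_backtrace &&
	    !atomic_flag_test_and_set(&allcpu_dumped))
		puts("  triggering backtrace on all other CPUs (once)");
}

int main(void)
{
	report_hardlockup(2);	/* first lockup: full report plus all-CPU dump */
	report_hardlockup(5);	/* second lockup: skips the all-CPU dump */
	return 0;
}
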
@@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void) | |||
347 | static int watchdog_nmi_enable(unsigned int cpu); | 370 | static int watchdog_nmi_enable(unsigned int cpu); |
348 | static void watchdog_nmi_disable(unsigned int cpu); | 371 | static void watchdog_nmi_disable(unsigned int cpu); |
349 | 372 | ||
373 | static int watchdog_enable_all_cpus(void); | ||
374 | static void watchdog_disable_all_cpus(void); | ||
375 | |||
350 | /* watchdog kicker functions */ | 376 | /* watchdog kicker functions */ |
351 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 377 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
352 | { | 378 | { |
@@ -651,37 +677,41 @@ static struct smp_hotplug_thread watchdog_threads = { | |||
651 | 677 | ||
652 | /* | 678 | /* |
653 | * park all watchdog threads that are specified in 'watchdog_cpumask' | 679 | * park all watchdog threads that are specified in 'watchdog_cpumask' |
680 | * | ||
681 | * This function returns an error if kthread_park() of a watchdog thread | ||
682 | * fails. In this situation, the watchdog threads of some CPUs can already | ||
683 | * be parked and the watchdog threads of other CPUs can still be runnable. | ||
684 | * Callers are expected to handle this special condition as appropriate in | ||
685 | * their context. | ||
686 | * | ||
687 | * This function may only be called in a context that is protected against | ||
688 | * races with CPU hotplug - for example, via get_online_cpus(). | ||
654 | */ | 689 | */ |
655 | static int watchdog_park_threads(void) | 690 | static int watchdog_park_threads(void) |
656 | { | 691 | { |
657 | int cpu, ret = 0; | 692 | int cpu, ret = 0; |
658 | 693 | ||
659 | get_online_cpus(); | ||
660 | for_each_watchdog_cpu(cpu) { | 694 | for_each_watchdog_cpu(cpu) { |
661 | ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); | 695 | ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); |
662 | if (ret) | 696 | if (ret) |
663 | break; | 697 | break; |
664 | } | 698 | } |
665 | if (ret) { | ||
666 | for_each_watchdog_cpu(cpu) | ||
667 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
668 | } | ||
669 | put_online_cpus(); | ||
670 | 699 | ||
671 | return ret; | 700 | return ret; |
672 | } | 701 | } |
673 | 702 | ||
674 | /* | 703 | /* |
675 | * unpark all watchdog threads that are specified in 'watchdog_cpumask' | 704 | * unpark all watchdog threads that are specified in 'watchdog_cpumask' |
705 | * | ||
706 | * This function may only be called in a context that is protected against | ||
707 | * races with CPU hotplug - for example, via get_online_cpus(). | ||
676 | */ | 708 | */ |
677 | static void watchdog_unpark_threads(void) | 709 | static void watchdog_unpark_threads(void) |
678 | { | 710 | { |
679 | int cpu; | 711 | int cpu; |
680 | 712 | ||
681 | get_online_cpus(); | ||
682 | for_each_watchdog_cpu(cpu) | 713 | for_each_watchdog_cpu(cpu) |
683 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | 714 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); |
684 | put_online_cpus(); | ||
685 | } | 715 | } |
686 | 716 | ||
687 | /* | 717 | /* |
@@ -691,6 +721,7 @@ int lockup_detector_suspend(void) | |||
691 | { | 721 | { |
692 | int ret = 0; | 722 | int ret = 0; |
693 | 723 | ||
724 | get_online_cpus(); | ||
694 | mutex_lock(&watchdog_proc_mutex); | 725 | mutex_lock(&watchdog_proc_mutex); |
695 | /* | 726 | /* |
696 | * Multiple suspend requests can be active in parallel (counted by | 727 | * Multiple suspend requests can be active in parallel (counted by |
@@ -704,6 +735,11 @@ int lockup_detector_suspend(void) | |||
704 | 735 | ||
705 | if (ret == 0) | 736 | if (ret == 0) |
706 | watchdog_suspended++; | 737 | watchdog_suspended++; |
738 | else { | ||
739 | watchdog_disable_all_cpus(); | ||
740 | pr_err("Failed to suspend lockup detectors, disabled\n"); | ||
741 | watchdog_enabled = 0; | ||
742 | } | ||
707 | 743 | ||
708 | mutex_unlock(&watchdog_proc_mutex); | 744 | mutex_unlock(&watchdog_proc_mutex); |
709 | 745 | ||
@@ -726,12 +762,20 @@ void lockup_detector_resume(void) | |||
726 | watchdog_unpark_threads(); | 762 | watchdog_unpark_threads(); |
727 | 763 | ||
728 | mutex_unlock(&watchdog_proc_mutex); | 764 | mutex_unlock(&watchdog_proc_mutex); |
765 | put_online_cpus(); | ||
729 | } | 766 | } |
730 | 767 | ||
731 | static void update_watchdog_all_cpus(void) | 768 | static int update_watchdog_all_cpus(void) |
732 | { | 769 | { |
733 | watchdog_park_threads(); | 770 | int ret; |
771 | |||
772 | ret = watchdog_park_threads(); | ||
773 | if (ret) | ||
774 | return ret; | ||
775 | |||
734 | watchdog_unpark_threads(); | 776 | watchdog_unpark_threads(); |
777 | |||
778 | return 0; | ||
735 | } | 779 | } |
736 | 780 | ||
737 | static int watchdog_enable_all_cpus(void) | 781 | static int watchdog_enable_all_cpus(void) |
@@ -750,15 +794,20 @@ static int watchdog_enable_all_cpus(void) | |||
750 | * Enable/disable the lockup detectors or | 794 | * Enable/disable the lockup detectors or |
751 | * change the sample period 'on the fly'. | 795 | * change the sample period 'on the fly'. |
752 | */ | 796 | */ |
753 | update_watchdog_all_cpus(); | 797 | err = update_watchdog_all_cpus(); |
798 | |||
799 | if (err) { | ||
800 | watchdog_disable_all_cpus(); | ||
801 | pr_err("Failed to update lockup detectors, disabled\n"); | ||
802 | } | ||
754 | } | 803 | } |
755 | 804 | ||
805 | if (err) | ||
806 | watchdog_enabled = 0; | ||
807 | |||
756 | return err; | 808 | return err; |
757 | } | 809 | } |
758 | 810 | ||
759 | /* prepare/enable/disable routines */ | ||
760 | /* sysctl functions */ | ||
761 | #ifdef CONFIG_SYSCTL | ||
762 | static void watchdog_disable_all_cpus(void) | 811 | static void watchdog_disable_all_cpus(void) |
763 | { | 812 | { |
764 | if (watchdog_running) { | 813 | if (watchdog_running) { |
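
update_watchdog_all_cpus() now propagates a kthread_park() failure, and watchdog_enable_all_cpus() reacts by disabling the detectors outright and clearing watchdog_enabled rather than continuing with a partially parked set of threads. A compact, simplified model of that error path with stubbed park/unpark calls; the names are stand-ins, not the kernel functions.

#include <stdio.h>

static int watchdog_enabled = 3;	/* stand-in for the enable bits */
static int park_should_fail;		/* forces the error path below */

static int park_threads(void)    { return park_should_fail ? -1 : 0; }
static void unpark_threads(void) { puts("unpark all watchdog threads"); }
static void disable_all(void)    { puts("disable all watchdog threads"); }

/* Park then unpark; on park failure just report it to the caller. */
static int update_all_cpus(void)
{
	int ret = park_threads();
	if (ret)
		return ret;
	unpark_threads();
	return 0;
}

static int enable_all_cpus(void)
{
	int err = update_all_cpus();
	if (err) {
		/* A half-parked state is not usable: turn everything off. */
		disable_all();
		puts("Failed to update lockup detectors, disabled");
		watchdog_enabled = 0;
	}
	return err;
}

int main(void)
{
	enable_all_cpus();		/* normal update */
	park_should_fail = 1;
	enable_all_cpus();		/* failure: detectors end up off */
	printf("watchdog_enabled=%d\n", watchdog_enabled);
	return 0;
}
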
@@ -767,6 +816,8 @@ static void watchdog_disable_all_cpus(void) | |||
767 | } | 816 | } |
768 | } | 817 | } |
769 | 818 | ||
819 | #ifdef CONFIG_SYSCTL | ||
820 | |||
770 | /* | 821 | /* |
771 | * Update the run state of the lockup detectors. | 822 | * Update the run state of the lockup detectors. |
772 | */ | 823 | */ |
@@ -808,6 +859,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, | |||
808 | int err, old, new; | 859 | int err, old, new; |
809 | int *watchdog_param = (int *)table->data; | 860 | int *watchdog_param = (int *)table->data; |
810 | 861 | ||
862 | get_online_cpus(); | ||
811 | mutex_lock(&watchdog_proc_mutex); | 863 | mutex_lock(&watchdog_proc_mutex); |
812 | 864 | ||
813 | if (watchdog_suspended) { | 865 | if (watchdog_suspended) { |
@@ -849,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, | |||
849 | } while (cmpxchg(&watchdog_enabled, old, new) != old); | 901 | } while (cmpxchg(&watchdog_enabled, old, new) != old); |
850 | 902 | ||
851 | /* | 903 | /* |
852 | * Update the run state of the lockup detectors. | 904 | * Update the run state of the lockup detectors. There is _no_ |
853 | * Restore 'watchdog_enabled' on failure. | 905 | * need to check the value returned by proc_watchdog_update() |
906 | * and to restore the previous value of 'watchdog_enabled' as | ||
907 | * both lockup detectors are disabled if proc_watchdog_update() | ||
908 | * returns an error. | ||
854 | */ | 909 | */ |
855 | err = proc_watchdog_update(); | 910 | err = proc_watchdog_update(); |
856 | if (err) | ||
857 | watchdog_enabled = old; | ||
858 | } | 911 | } |
859 | out: | 912 | out: |
860 | mutex_unlock(&watchdog_proc_mutex); | 913 | mutex_unlock(&watchdog_proc_mutex); |
914 | put_online_cpus(); | ||
861 | return err; | 915 | return err; |
862 | } | 916 | } |
863 | 917 | ||
@@ -899,6 +953,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, | |||
899 | { | 953 | { |
900 | int err, old; | 954 | int err, old; |
901 | 955 | ||
956 | get_online_cpus(); | ||
902 | mutex_lock(&watchdog_proc_mutex); | 957 | mutex_lock(&watchdog_proc_mutex); |
903 | 958 | ||
904 | if (watchdog_suspended) { | 959 | if (watchdog_suspended) { |
@@ -914,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, | |||
914 | goto out; | 969 | goto out; |
915 | 970 | ||
916 | /* | 971 | /* |
917 | * Update the sample period. | 972 | * Update the sample period. Restore on failure. |
918 | * Restore 'watchdog_thresh' on failure. | ||
919 | */ | 973 | */ |
920 | set_sample_period(); | 974 | set_sample_period(); |
921 | err = proc_watchdog_update(); | 975 | err = proc_watchdog_update(); |
922 | if (err) | 976 | if (err) { |
923 | watchdog_thresh = old; | 977 | watchdog_thresh = old; |
978 | set_sample_period(); | ||
979 | } | ||
924 | out: | 980 | out: |
925 | mutex_unlock(&watchdog_proc_mutex); | 981 | mutex_unlock(&watchdog_proc_mutex); |
982 | put_online_cpus(); | ||
926 | return err; | 983 | return err; |
927 | } | 984 | } |
928 | 985 | ||
@@ -937,6 +994,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
937 | { | 994 | { |
938 | int err; | 995 | int err; |
939 | 996 | ||
997 | get_online_cpus(); | ||
940 | mutex_lock(&watchdog_proc_mutex); | 998 | mutex_lock(&watchdog_proc_mutex); |
941 | 999 | ||
942 | if (watchdog_suspended) { | 1000 | if (watchdog_suspended) { |
@@ -964,6 +1022,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, | |||
964 | } | 1022 | } |
965 | out: | 1023 | out: |
966 | mutex_unlock(&watchdog_proc_mutex); | 1024 | mutex_unlock(&watchdog_proc_mutex); |
1025 | put_online_cpus(); | ||
967 | return err; | 1026 | return err; |
968 | } | 1027 | } |
969 | 1028 | ||
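
All three proc handlers above now take get_online_cpus() before watchdog_proc_mutex and release them in reverse order, since watchdog_park_threads()/watchdog_unpark_threads() no longer protect themselves against CPU hotplug. A loose userspace sketch of that acquire/release ordering using pthread primitives; the lock names are stand-ins and the rwlock only approximates the hotplug read lock.

#include <stdio.h>
#include <pthread.h>

/* Stand-ins: a read-side "hotplug" lock and the proc mutex. */
static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t  proc_mutex   = PTHREAD_MUTEX_INITIALIZER;

static int proc_handler_model(void)
{
	int err = 0;

	pthread_rwlock_rdlock(&hotplug_lock);	/* like get_online_cpus() */
	pthread_mutex_lock(&proc_mutex);

	puts("read/update watchdog parameters, park/unpark threads");

	pthread_mutex_unlock(&proc_mutex);
	pthread_rwlock_unlock(&hotplug_lock);	/* like put_online_cpus() */
	return err;
}

int main(void)
{
	return proc_handler_model();
}
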
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ca71582fcfab..c579dbab2e36 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, | |||
1458 | timer_stats_timer_set_start_info(&dwork->timer); | 1458 | timer_stats_timer_set_start_info(&dwork->timer); |
1459 | 1459 | ||
1460 | dwork->wq = wq; | 1460 | dwork->wq = wq; |
1461 | /* timer isn't guaranteed to run in this cpu, record earlier */ | ||
1462 | if (cpu == WORK_CPU_UNBOUND) | ||
1463 | cpu = raw_smp_processor_id(); | ||
1461 | dwork->cpu = cpu; | 1464 | dwork->cpu = cpu; |
1462 | timer->expires = jiffies + delay; | 1465 | timer->expires = jiffies + delay; |
1463 | 1466 | ||
1464 | if (unlikely(cpu != WORK_CPU_UNBOUND)) | 1467 | add_timer_on(timer, cpu); |
1465 | add_timer_on(timer, cpu); | ||
1466 | else | ||
1467 | add_timer(timer); | ||
1468 | } | 1468 | } |
1469 | 1469 | ||
1470 | /** | 1470 | /** |
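
Because the timer isn't guaranteed to fire on the queueing CPU, __queue_delayed_work() now resolves the target CPU up front: WORK_CPU_UNBOUND is replaced with the current CPU and add_timer_on() is used unconditionally. A small model of that decision; sched_getcpu() stands in for raw_smp_processor_id() and the work struct is illustrative.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>

#define WORK_CPU_UNBOUND (-1)

struct delayed_work_model { int cpu; long expires; };

/* Resolve the target CPU before arming the timer, so dwork->cpu is
 * always a real CPU even if the timer later runs elsewhere. */
static void queue_delayed_model(struct delayed_work_model *dwork,
				int cpu, long delay, long now)
{
	if (cpu == WORK_CPU_UNBOUND)
		cpu = sched_getcpu();	/* stand-in for raw_smp_processor_id() */
	dwork->cpu = cpu;
	dwork->expires = now + delay;
	printf("arm timer on cpu %d, expires at %ld\n", dwork->cpu, dwork->expires);
}

int main(void)
{
	struct delayed_work_model dw;
	queue_delayed_model(&dw, WORK_CPU_UNBOUND, 100, 0);	/* current CPU */
	queue_delayed_model(&dw, 3, 100, 0);			/* explicit CPU 3 */
	return 0;
}
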
@@ -3199,6 +3199,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
3199 | u32 hash = wqattrs_hash(attrs); | 3199 | u32 hash = wqattrs_hash(attrs); |
3200 | struct worker_pool *pool; | 3200 | struct worker_pool *pool; |
3201 | int node; | 3201 | int node; |
3202 | int target_node = NUMA_NO_NODE; | ||
3202 | 3203 | ||
3203 | lockdep_assert_held(&wq_pool_mutex); | 3204 | lockdep_assert_held(&wq_pool_mutex); |
3204 | 3205 | ||
@@ -3210,13 +3211,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
3210 | } | 3211 | } |
3211 | } | 3212 | } |
3212 | 3213 | ||
3214 | /* if cpumask is contained inside a NUMA node, we belong to that node */ | ||
3215 | if (wq_numa_enabled) { | ||
3216 | for_each_node(node) { | ||
3217 | if (cpumask_subset(attrs->cpumask, | ||
3218 | wq_numa_possible_cpumask[node])) { | ||
3219 | target_node = node; | ||
3220 | break; | ||
3221 | } | ||
3222 | } | ||
3223 | } | ||
3224 | |||
3213 | /* nope, create a new one */ | 3225 | /* nope, create a new one */ |
3214 | pool = kzalloc(sizeof(*pool), GFP_KERNEL); | 3226 | pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node); |
3215 | if (!pool || init_worker_pool(pool) < 0) | 3227 | if (!pool || init_worker_pool(pool) < 0) |
3216 | goto fail; | 3228 | goto fail; |
3217 | 3229 | ||
3218 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ | 3230 | lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ |
3219 | copy_workqueue_attrs(pool->attrs, attrs); | 3231 | copy_workqueue_attrs(pool->attrs, attrs); |
3232 | pool->node = target_node; | ||
3220 | 3233 | ||
3221 | /* | 3234 | /* |
3222 | * no_numa isn't a worker_pool attribute, always clear it. See | 3235 | * no_numa isn't a worker_pool attribute, always clear it. See |
@@ -3224,17 +3237,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) | |||
3224 | */ | 3237 | */ |
3225 | pool->attrs->no_numa = false; | 3238 | pool->attrs->no_numa = false; |
3226 | 3239 | ||
3227 | /* if cpumask is contained inside a NUMA node, we belong to that node */ | ||
3228 | if (wq_numa_enabled) { | ||
3229 | for_each_node(node) { | ||
3230 | if (cpumask_subset(pool->attrs->cpumask, | ||
3231 | wq_numa_possible_cpumask[node])) { | ||
3232 | pool->node = node; | ||
3233 | break; | ||
3234 | } | ||
3235 | } | ||
3236 | } | ||
3237 | |||
3238 | if (worker_pool_assign_id(pool) < 0) | 3240 | if (worker_pool_assign_id(pool) < 0) |
3239 | goto fail; | 3241 | goto fail; |
3240 | 3242 | ||