path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/audit.c | 48
-rw-r--r--  kernel/audit.h | 2
-rw-r--r--  kernel/audit_tree.c | 6
-rw-r--r--  kernel/auditfilter.c | 14
-rw-r--r--  kernel/bpf/Makefile | 4
-rw-r--r--  kernel/bpf/arraymap.c | 26
-rw-r--r--  kernel/bpf/core.c | 33
-rw-r--r--  kernel/bpf/hashtab.c | 18
-rw-r--r--  kernel/bpf/helpers.c | 7
-rw-r--r--  kernel/bpf/inode.c | 387
-rw-r--r--  kernel/bpf/syscall.c | 174
-rw-r--r--  kernel/bpf/verifier.c | 116
-rw-r--r--  kernel/cgroup.c | 1195
-rw-r--r--  kernel/cgroup_pids.c | 8
-rw-r--r--  kernel/context_tracking.c | 80
-rw-r--r--  kernel/cpu.c | 23
-rw-r--r--  kernel/cpuset.c | 86
-rw-r--r--  kernel/events/core.c | 361
-rw-r--r--  kernel/events/ring_buffer.c | 2
-rw-r--r--  kernel/exit.c | 6
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/irq/Kconfig | 4
-rw-r--r--  kernel/irq/Makefile | 1
-rw-r--r--  kernel/irq/chip.c | 28
-rw-r--r--  kernel/irq/cpuhotplug.c | 82
-rw-r--r--  kernel/irq/handle.c | 9
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdomain.c | 177
-rw-r--r--  kernel/irq/manage.c | 230
-rw-r--r--  kernel/irq/msi.c | 14
-rw-r--r--  kernel/irq/pm.c | 2
-rw-r--r--  kernel/irq/proc.c | 21
-rw-r--r--  kernel/irq/settings.h | 12
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kexec_core.c | 10
-rw-r--r--  kernel/kexec_file.c | 2
-rw-r--r--  kernel/kmod.c | 8
-rw-r--r--  kernel/locking/lockdep.c | 12
-rw-r--r--  kernel/locking/locktorture.c | 164
-rw-r--r--  kernel/locking/mcs_spinlock.h | 4
-rw-r--r--  kernel/locking/mutex.c | 9
-rw-r--r--  kernel/locking/osq_lock.c | 11
-rw-r--r--  kernel/locking/percpu-rwsem.c | 90
-rw-r--r--  kernel/locking/qrwlock.c | 8
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 6
-rw-r--r--  kernel/locking/rtmutex.c | 33
-rw-r--r--  kernel/locking/rwsem-xadd.c | 5
-rw-r--r--  kernel/memremap.c | 30
-rw-r--r--  kernel/module.c | 8
-rw-r--r--  kernel/module_signing.c | 1
-rw-r--r--  kernel/panic.c | 10
-rw-r--r--  kernel/params.c | 20
-rw-r--r--  kernel/power/hibernate.c | 2
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/power/swap.c | 16
-rw-r--r--  kernel/printk/printk.c | 14
-rw-r--r--  kernel/ptrace.c | 5
-rw-r--r--  kernel/rcu/Makefile | 2
-rw-r--r--  kernel/rcu/rcutorture.c | 16
-rw-r--r--  kernel/rcu/srcu.c | 4
-rw-r--r--  kernel/rcu/sync.c | 223
-rw-r--r--  kernel/rcu/tiny.c | 8
-rw-r--r--  kernel/rcu/tree.c | 507
-rw-r--r--  kernel/rcu/tree.h | 69
-rw-r--r--  kernel/rcu/tree_plugin.h | 437
-rw-r--r--  kernel/rcu/tree_trace.c | 10
-rw-r--r--  kernel/rcu/update.c | 2
-rw-r--r--  kernel/sched/core.c | 248
-rw-r--r--  kernel/sched/cpudeadline.c | 5
-rw-r--r--  kernel/sched/cpudeadline.h | 1
-rw-r--r--  kernel/sched/cputime.c | 2
-rw-r--r--  kernel/sched/deadline.c | 17
-rw-r--r--  kernel/sched/fair.c | 428
-rw-r--r--  kernel/sched/features.h | 21
-rw-r--r--  kernel/sched/idle.c | 2
-rw-r--r--  kernel/sched/rt.c | 22
-rw-r--r--  kernel/sched/sched.h | 60
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/seccomp.c | 78
-rw-r--r--  kernel/signal.c | 53
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/smpboot.c | 5
-rw-r--r--  kernel/stop_machine.c | 90
-rw-r--r--  kernel/sys.c | 4
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 33
-rw-r--r--  kernel/time/clocksource.c | 9
-rw-r--r--  kernel/time/hrtimer.c | 2
-rw-r--r--  kernel/time/ntp.c | 16
-rw-r--r--  kernel/time/ntp_internal.h | 2
-rw-r--r--  kernel/time/posix-cpu-timers.c | 63
-rw-r--r--  kernel/time/timeconst.bc | 2
-rw-r--r--  kernel/time/timekeeping.c | 22
-rw-r--r--  kernel/time/timer.c | 13
-rw-r--r--  kernel/torture.c | 1
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 16
-rw-r--r--  kernel/trace/bpf_trace.c | 55
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 3
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 11
-rw-r--r--  kernel/watchdog.c | 121
-rw-r--r--  kernel/workqueue.c | 34
109 files changed, 4399 insertions, 2035 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 790d83c7d160..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,4 +5,3 @@ config_data.h
5config_data.gz 5config_data.gz
6timeconst.h 6timeconst.h
7hz.bc 7hz.bc
8x509_certificate_list
diff --git a/kernel/audit.c b/kernel/audit.c
index 662c007635fb..5ffcbd354a52 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb)
407static void kauditd_send_skb(struct sk_buff *skb) 407static void kauditd_send_skb(struct sk_buff *skb)
408{ 408{
409 int err; 409 int err;
410 int attempts = 0;
411#define AUDITD_RETRIES 5
412
413restart:
410 /* take a reference in case we can't send it and we want to hold it */ 414 /* take a reference in case we can't send it and we want to hold it */
411 skb_get(skb); 415 skb_get(skb);
412 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); 416 err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
413 if (err < 0) { 417 if (err < 0) {
414 BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ 418 pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
419 audit_pid, err);
415 if (audit_pid) { 420 if (audit_pid) {
416 pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); 421 if (err == -ECONNREFUSED || err == -EPERM
417 audit_log_lost("auditd disappeared"); 422 || ++attempts >= AUDITD_RETRIES) {
418 audit_pid = 0; 423 char s[32];
419 audit_sock = NULL; 424
425 snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
426 audit_log_lost(s);
427 audit_pid = 0;
428 audit_sock = NULL;
429 } else {
430 pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
431 attempts, audit_pid);
432 set_current_state(TASK_INTERRUPTIBLE);
433 schedule();
434 __set_current_state(TASK_RUNNING);
435 goto restart;
436 }
420 } 437 }
421 /* we might get lucky and get this in the next auditd */ 438 /* we might get lucky and get this in the next auditd */
422 audit_hold_skb(skb); 439 audit_hold_skb(skb);
@@ -684,25 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
684 return err; 701 return err;
685} 702}
686 703
687static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) 704static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
688{ 705{
689 int rc = 0;
690 uid_t uid = from_kuid(&init_user_ns, current_uid()); 706 uid_t uid = from_kuid(&init_user_ns, current_uid());
691 pid_t pid = task_tgid_nr(current); 707 pid_t pid = task_tgid_nr(current);
692 708
693 if (!audit_enabled && msg_type != AUDIT_USER_AVC) { 709 if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
694 *ab = NULL; 710 *ab = NULL;
695 return rc; 711 return;
696 } 712 }
697 713
698 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 714 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
699 if (unlikely(!*ab)) 715 if (unlikely(!*ab))
700 return rc; 716 return;
701 audit_log_format(*ab, "pid=%d uid=%u", pid, uid); 717 audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
702 audit_log_session_info(*ab); 718 audit_log_session_info(*ab);
703 audit_log_task_context(*ab); 719 audit_log_task_context(*ab);
704
705 return rc;
706} 720}
707 721
708int is_audit_feature_set(int i) 722int is_audit_feature_set(int i)
@@ -1357,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1357 if (unlikely(audit_filter_type(type))) 1371 if (unlikely(audit_filter_type(type)))
1358 return NULL; 1372 return NULL;
1359 1373
1360 if (gfp_mask & __GFP_WAIT) { 1374 if (gfp_mask & __GFP_DIRECT_RECLAIM) {
1361 if (audit_pid && audit_pid == current->pid) 1375 if (audit_pid && audit_pid == current->pid)
1362 gfp_mask &= ~__GFP_WAIT; 1376 gfp_mask &= ~__GFP_DIRECT_RECLAIM;
1363 else 1377 else
1364 reserve = 0; 1378 reserve = 0;
1365 } 1379 }
1366 1380
1367 while (audit_backlog_limit 1381 while (audit_backlog_limit
1368 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { 1382 && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
1369 if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { 1383 if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
1370 long sleep_time; 1384 long sleep_time;
1371 1385
1372 sleep_time = timeout_start + audit_backlog_wait_time - jiffies; 1386 sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
@@ -1566,14 +1580,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
1566 * @string: string to be checked 1580 * @string: string to be checked
1567 * @len: max length of the string to check 1581 * @len: max length of the string to check
1568 */ 1582 */
1569int audit_string_contains_control(const char *string, size_t len) 1583bool audit_string_contains_control(const char *string, size_t len)
1570{ 1584{
1571 const unsigned char *p; 1585 const unsigned char *p;
1572 for (p = string; p < (const unsigned char *)string + len; p++) { 1586 for (p = string; p < (const unsigned char *)string + len; p++) {
1573 if (*p == '"' || *p < 0x21 || *p > 0x7e) 1587 if (*p == '"' || *p < 0x21 || *p > 0x7e)
1574 return 1; 1588 return true;
1575 } 1589 }
1576 return 0; 1590 return false;
1577} 1591}
1578 1592
1579/** 1593/**
diff --git a/kernel/audit.h b/kernel/audit.h
index dadf86a0e59e..de6cbb7cf547 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -301,7 +301,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
301#ifdef CONFIG_AUDIT_TREE 301#ifdef CONFIG_AUDIT_TREE
302extern struct audit_chunk *audit_tree_lookup(const struct inode *); 302extern struct audit_chunk *audit_tree_lookup(const struct inode *);
303extern void audit_put_chunk(struct audit_chunk *); 303extern void audit_put_chunk(struct audit_chunk *);
304extern int audit_tree_match(struct audit_chunk *, struct audit_tree *); 304extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
305extern int audit_make_tree(struct audit_krule *, char *, u32); 305extern int audit_make_tree(struct audit_krule *, char *, u32);
306extern int audit_add_tree_rule(struct audit_krule *); 306extern int audit_add_tree_rule(struct audit_krule *);
307extern int audit_remove_tree_rule(struct audit_krule *); 307extern int audit_remove_tree_rule(struct audit_krule *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 94ecdabda8e6..5efe9b299a12 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
197 return NULL; 197 return NULL;
198} 198}
199 199
200int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) 200bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
201{ 201{
202 int n; 202 int n;
203 for (n = 0; n < chunk->count; n++) 203 for (n = 0; n < chunk->count; n++)
204 if (chunk->owners[n].owner == tree) 204 if (chunk->owners[n].owner == tree)
205 return 1; 205 return true;
206 return 0; 206 return false;
207} 207}
208 208
209/* tagging and untagging inodes with trees */ 209/* tagging and untagging inodes with trees */
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7714d93edb85..b8ff9e193753 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -39,13 +39,13 @@
39 * Locking model: 39 * Locking model:
40 * 40 *
41 * audit_filter_mutex: 41 * audit_filter_mutex:
42 * Synchronizes writes and blocking reads of audit's filterlist 42 * Synchronizes writes and blocking reads of audit's filterlist
43 * data. Rcu is used to traverse the filterlist and access 43 * data. Rcu is used to traverse the filterlist and access
44 * contents of structs audit_entry, audit_watch and opaque 44 * contents of structs audit_entry, audit_watch and opaque
45 * LSM rules during filtering. If modified, these structures 45 * LSM rules during filtering. If modified, these structures
46 * must be copied and replace their counterparts in the filterlist. 46 * must be copied and replace their counterparts in the filterlist.
47 * An audit_parent struct is not accessed during filtering, so may 47 * An audit_parent struct is not accessed during filtering, so may
48 * be written directly provided audit_filter_mutex is held. 48 * be written directly provided audit_filter_mutex is held.
49 */ 49 */
50 50
51/* Audit filter lists, defined in <linux/audit.h> */ 51/* Audit filter lists, defined in <linux/audit.h> */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6983be12bd3..13272582eee0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,2 +1,4 @@
1obj-y := core.o 1obj-y := core.o
2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o 2
3obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
4obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace107f236..3f4c99e06c6b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/filter.h> 17#include <linux/filter.h>
18#include <linux/perf_event.h>
18 19
19/* Called from syscall */ 20/* Called from syscall */
20static struct bpf_map *array_map_alloc(union bpf_attr *attr) 21static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
48 array->map.key_size = attr->key_size; 49 array->map.key_size = attr->key_size;
49 array->map.value_size = attr->value_size; 50 array->map.value_size = attr->value_size;
50 array->map.max_entries = attr->max_entries; 51 array->map.max_entries = attr->max_entries;
51 52 array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
52 array->elem_size = elem_size; 53 array->elem_size = elem_size;
53 54
54 return &array->map; 55 return &array->map;
@@ -291,14 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
291 292
292 attr = perf_event_attrs(event); 293 attr = perf_event_attrs(event);
293 if (IS_ERR(attr)) 294 if (IS_ERR(attr))
294 return (void *)attr; 295 goto err;
295 296
296 if (attr->type != PERF_TYPE_RAW && 297 if (attr->inherit)
297 attr->type != PERF_TYPE_HARDWARE) { 298 goto err;
298 perf_event_release_kernel(event); 299
299 return ERR_PTR(-EINVAL); 300 if (attr->type == PERF_TYPE_RAW)
300 } 301 return event;
301 return event; 302
303 if (attr->type == PERF_TYPE_HARDWARE)
304 return event;
305
306 if (attr->type == PERF_TYPE_SOFTWARE &&
307 attr->config == PERF_COUNT_SW_BPF_OUTPUT)
308 return event;
309err:
310 perf_event_release_kernel(event);
311 return ERR_PTR(-EINVAL);
302} 312}
303 313
304static void perf_event_fd_array_put_ptr(void *ptr) 314static void perf_event_fd_array_put_ptr(void *ptr)
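
The updated perf_event_fd_array_get_ptr() above now accepts raw, hardware, and software BPF-output events, and rejects any event with attr->inherit set. A minimal user-space sketch of opening an event that passes this filter is shown below; it is illustrative only, assumes uapi headers that already define PERF_COUNT_SW_BPF_OUTPUT, and omits the map update that would actually store the returned fd.

/* Hedged sketch: open a software BPF-output perf event whose fd is
 * acceptable to perf_event_fd_array_get_ptr().  attr.inherit stays 0,
 * since inherited events are now rejected.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static int open_bpf_output_event(int cpu)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_BPF_OUTPUT;
        attr.sample_type = PERF_SAMPLE_RAW;

        /* pid = -1, group_fd = -1: a plain per-CPU event, no flags */
        return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

int main(void)
{
        int fd = open_bpf_output_event(0);

        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        printf("fd %d can be placed in a BPF_MAP_TYPE_PERF_EVENT_ARRAY slot\n", fd);
        return 0;
}
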
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 67c380cfa9ca..334b1bdd572c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
82 if (fp == NULL) 82 if (fp == NULL)
83 return NULL; 83 return NULL;
84 84
85 kmemcheck_annotate_bitfield(fp, meta);
86
85 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); 87 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
86 if (aux == NULL) { 88 if (aux == NULL) {
87 vfree(fp); 89 vfree(fp);
@@ -90,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
90 92
91 fp->pages = size / PAGE_SIZE; 93 fp->pages = size / PAGE_SIZE;
92 fp->aux = aux; 94 fp->aux = aux;
95 fp->aux->prog = fp;
93 96
94 return fp; 97 return fp;
95} 98}
@@ -110,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
110 113
111 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); 114 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
112 if (fp != NULL) { 115 if (fp != NULL) {
116 kmemcheck_annotate_bitfield(fp, meta);
117
113 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); 118 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
114 fp->pages = size / PAGE_SIZE; 119 fp->pages = size / PAGE_SIZE;
120 fp->aux->prog = fp;
115 121
116 /* We keep fp->aux from fp_old around in the new 122 /* We keep fp->aux from fp_old around in the new
117 * reallocated structure. 123 * reallocated structure.
@@ -722,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp)
722 struct bpf_prog_aux *aux = fp->aux; 728 struct bpf_prog_aux *aux = fp->aux;
723 729
724 INIT_WORK(&aux->work, bpf_prog_free_deferred); 730 INIT_WORK(&aux->work, bpf_prog_free_deferred);
725 aux->prog = fp;
726 schedule_work(&aux->work); 731 schedule_work(&aux->work);
727} 732}
728EXPORT_SYMBOL_GPL(bpf_prog_free); 733EXPORT_SYMBOL_GPL(bpf_prog_free);
729 734
 735/* RNG for unprivileged user space with separated state from prandom_u32(). */
736static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
737
738void bpf_user_rnd_init_once(void)
739{
740 prandom_init_once(&bpf_user_rnd_state);
741}
742
743u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
744{
745 /* Should someone ever have the rather unwise idea to use some
746 * of the registers passed into this function, then note that
747 * this function is called from native eBPF and classic-to-eBPF
748 * transformations. Register assignments from both sides are
749 * different, f.e. classic always sets fn(ctx, A, X) here.
750 */
751 struct rnd_state *state;
752 u32 res;
753
754 state = &get_cpu_var(bpf_user_rnd_state);
755 res = prandom_u32_state(state);
756 put_cpu_var(state);
757
758 return res;
759}
760
730/* Weak definitions of helper functions in case we don't have bpf syscall. */ 761/* Weak definitions of helper functions in case we don't have bpf syscall. */
731const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; 762const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
732const struct bpf_func_proto bpf_map_update_elem_proto __weak; 763const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..19909b22b4f8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@
17struct bpf_htab { 17struct bpf_htab {
18 struct bpf_map map; 18 struct bpf_map map;
19 struct hlist_head *buckets; 19 struct hlist_head *buckets;
20 spinlock_t lock; 20 raw_spinlock_t lock;
21 u32 count; /* number of elements in this hashtable */ 21 u32 count; /* number of elements in this hashtable */
22 u32 n_buckets; /* number of hash buckets */ 22 u32 n_buckets; /* number of hash buckets */
23 u32 elem_size; /* size of each element in bytes */ 23 u32 elem_size; /* size of each element in bytes */
@@ -82,12 +82,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
82 for (i = 0; i < htab->n_buckets; i++) 82 for (i = 0; i < htab->n_buckets; i++)
83 INIT_HLIST_HEAD(&htab->buckets[i]); 83 INIT_HLIST_HEAD(&htab->buckets[i]);
84 84
85 spin_lock_init(&htab->lock); 85 raw_spin_lock_init(&htab->lock);
86 htab->count = 0; 86 htab->count = 0;
87 87
88 htab->elem_size = sizeof(struct htab_elem) + 88 htab->elem_size = sizeof(struct htab_elem) +
89 round_up(htab->map.key_size, 8) + 89 round_up(htab->map.key_size, 8) +
90 htab->map.value_size; 90 htab->map.value_size;
91
92 htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
93 htab->elem_size * htab->map.max_entries,
94 PAGE_SIZE) >> PAGE_SHIFT;
91 return &htab->map; 95 return &htab->map;
92 96
93free_htab: 97free_htab:
@@ -230,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
230 l_new->hash = htab_map_hash(l_new->key, key_size); 234 l_new->hash = htab_map_hash(l_new->key, key_size);
231 235
232 /* bpf_map_update_elem() can be called in_irq() */ 236 /* bpf_map_update_elem() can be called in_irq() */
233 spin_lock_irqsave(&htab->lock, flags); 237 raw_spin_lock_irqsave(&htab->lock, flags);
234 238
235 head = select_bucket(htab, l_new->hash); 239 head = select_bucket(htab, l_new->hash);
236 240
@@ -266,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
266 } else { 270 } else {
267 htab->count++; 271 htab->count++;
268 } 272 }
269 spin_unlock_irqrestore(&htab->lock, flags); 273 raw_spin_unlock_irqrestore(&htab->lock, flags);
270 274
271 return 0; 275 return 0;
272err: 276err:
273 spin_unlock_irqrestore(&htab->lock, flags); 277 raw_spin_unlock_irqrestore(&htab->lock, flags);
274 kfree(l_new); 278 kfree(l_new);
275 return ret; 279 return ret;
276} 280}
@@ -291,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
291 295
292 hash = htab_map_hash(key, key_size); 296 hash = htab_map_hash(key, key_size);
293 297
294 spin_lock_irqsave(&htab->lock, flags); 298 raw_spin_lock_irqsave(&htab->lock, flags);
295 299
296 head = select_bucket(htab, hash); 300 head = select_bucket(htab, hash);
297 301
@@ -304,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
304 ret = 0; 308 ret = 0;
305 } 309 }
306 310
307 spin_unlock_irqrestore(&htab->lock, flags); 311 raw_spin_unlock_irqrestore(&htab->lock, flags);
308 return ret; 312 return ret;
309} 313}
310 314
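
Besides switching to a raw spinlock, the hunks above start charging hash maps against RLIMIT_MEMLOCK by precomputing map.pages from the bucket array plus a worst-case elem_size * max_entries. A stand-alone back-of-the-envelope sketch of that arithmetic follows; the per-element header and bucket-head sizes are illustrative assumptions, not the kernel's exact structure sizes.

/* Hedged arithmetic sketch of the new map.pages accounting.  The overheads
 * below are assumptions for illustration only (struct htab_elem and
 * struct hlist_head sizes are not part of the uapi).
 */
#include <stdio.h>

#define PAGE_SZ 4096UL
#define ROUND_UP(x, a) ((((x) + (a) - 1) / (a)) * (a))

int main(void)
{
        unsigned long key_size = 8, value_size = 16, max_entries = 1024;
        unsigned long n_buckets = 1024;      /* assumed: max_entries rounded up to a power of two */
        unsigned long hlist_head_sz = 8;     /* assumed: one pointer on 64-bit */
        unsigned long elem_hdr = 24;         /* assumed struct htab_elem overhead */

        unsigned long elem_size = elem_hdr + ROUND_UP(key_size, 8) + value_size;
        unsigned long bytes = n_buckets * hlist_head_sz + elem_size * max_entries;

        /* 14 pages for this configuration under the assumptions above */
        printf("charged pages: %lu\n", ROUND_UP(bytes, PAGE_SZ) / PAGE_SZ);
        return 0;
}
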
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec09421e..4504ca66118d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
93 .arg2_type = ARG_PTR_TO_MAP_KEY, 93 .arg2_type = ARG_PTR_TO_MAP_KEY,
94}; 94};
95 95
96static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
97{
98 return prandom_u32();
99}
100
101const struct bpf_func_proto bpf_get_prandom_u32_proto = { 96const struct bpf_func_proto bpf_get_prandom_u32_proto = {
102 .func = bpf_get_prandom_u32, 97 .func = bpf_user_rnd_u32,
103 .gpl_only = false, 98 .gpl_only = false,
104 .ret_type = RET_INTEGER, 99 .ret_type = RET_INTEGER,
105}; 100};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 000000000000..be6d726e31c9
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,387 @@
+/*
+ * Minimal file system backend for holding eBPF maps and programs,
+ * used by bpf(2) object pinning.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <daniel@iogearbox.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+enum bpf_type {
+        BPF_TYPE_UNSPEC = 0,
+        BPF_TYPE_PROG,
+        BPF_TYPE_MAP,
+};
+
+static void *bpf_any_get(void *raw, enum bpf_type type)
+{
+        switch (type) {
+        case BPF_TYPE_PROG:
+                atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+                break;
+        case BPF_TYPE_MAP:
+                atomic_inc(&((struct bpf_map *)raw)->refcnt);
+                break;
+        default:
+                WARN_ON_ONCE(1);
+                break;
+        }
+
+        return raw;
+}
+
+static void bpf_any_put(void *raw, enum bpf_type type)
+{
+        switch (type) {
+        case BPF_TYPE_PROG:
+                bpf_prog_put(raw);
+                break;
+        case BPF_TYPE_MAP:
+                bpf_map_put(raw);
+                break;
+        default:
+                WARN_ON_ONCE(1);
+                break;
+        }
+}
+
+static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
+{
+        void *raw;
+
+        *type = BPF_TYPE_MAP;
+        raw = bpf_map_get(ufd);
+        if (IS_ERR(raw)) {
+                *type = BPF_TYPE_PROG;
+                raw = bpf_prog_get(ufd);
+        }
+
+        return raw;
+}
+
+static const struct inode_operations bpf_dir_iops;
+
+static const struct inode_operations bpf_prog_iops = { };
+static const struct inode_operations bpf_map_iops = { };
+
+static struct inode *bpf_get_inode(struct super_block *sb,
+                                   const struct inode *dir,
+                                   umode_t mode)
+{
+        struct inode *inode;
+
+        switch (mode & S_IFMT) {
+        case S_IFDIR:
+        case S_IFREG:
+                break;
+        default:
+                return ERR_PTR(-EINVAL);
+        }
+
+        inode = new_inode(sb);
+        if (!inode)
+                return ERR_PTR(-ENOSPC);
+
+        inode->i_ino = get_next_ino();
+        inode->i_atime = CURRENT_TIME;
+        inode->i_mtime = inode->i_atime;
+        inode->i_ctime = inode->i_atime;
+
+        inode_init_owner(inode, dir, mode);
+
+        return inode;
+}
+
+static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
+{
+        *type = BPF_TYPE_UNSPEC;
+        if (inode->i_op == &bpf_prog_iops)
+                *type = BPF_TYPE_PROG;
+        else if (inode->i_op == &bpf_map_iops)
+                *type = BPF_TYPE_MAP;
+        else
+                return -EACCES;
+
+        return 0;
+}
+
+static bool bpf_dname_reserved(const struct dentry *dentry)
+{
+        return strchr(dentry->d_name.name, '.');
+}
+
+static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+        struct inode *inode;
+
+        if (bpf_dname_reserved(dentry))
+                return -EPERM;
+
+        inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+
+        inode->i_op = &bpf_dir_iops;
+        inode->i_fop = &simple_dir_operations;
+
+        inc_nlink(inode);
+        inc_nlink(dir);
+
+        d_instantiate(dentry, inode);
+        dget(dentry);
+
+        return 0;
+}
+
+static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
+                         umode_t mode, const struct inode_operations *iops)
+{
+        struct inode *inode;
+
+        if (bpf_dname_reserved(dentry))
+                return -EPERM;
+
+        inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+
+        inode->i_op = iops;
+        inode->i_private = dentry->d_fsdata;
+
+        d_instantiate(dentry, inode);
+        dget(dentry);
+
+        return 0;
+}
+
+static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
+                     dev_t devt)
+{
+        enum bpf_type type = MINOR(devt);
+
+        if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
+            dentry->d_fsdata == NULL)
+                return -EPERM;
+
+        switch (type) {
+        case BPF_TYPE_PROG:
+                return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
+        case BPF_TYPE_MAP:
+                return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
+        default:
+                return -EPERM;
+        }
+}
+
+static const struct inode_operations bpf_dir_iops = {
+        .lookup = simple_lookup,
+        .mknod = bpf_mkobj,
+        .mkdir = bpf_mkdir,
+        .rmdir = simple_rmdir,
+        .unlink = simple_unlink,
+};
+
+static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+                          enum bpf_type type)
+{
+        struct dentry *dentry;
+        struct inode *dir;
+        struct path path;
+        umode_t mode;
+        dev_t devt;
+        int ret;
+
+        dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+        if (IS_ERR(dentry))
+                return PTR_ERR(dentry);
+
+        mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+        devt = MKDEV(UNNAMED_MAJOR, type);
+
+        ret = security_path_mknod(&path, dentry, mode, devt);
+        if (ret)
+                goto out;
+
+        dir = d_inode(path.dentry);
+        if (dir->i_op != &bpf_dir_iops) {
+                ret = -EPERM;
+                goto out;
+        }
+
+        dentry->d_fsdata = raw;
+        ret = vfs_mknod(dir, dentry, mode, devt);
+        dentry->d_fsdata = NULL;
+out:
+        done_path_create(&path, dentry);
+        return ret;
+}
+
+int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+{
+        struct filename *pname;
+        enum bpf_type type;
+        void *raw;
+        int ret;
+
+        pname = getname(pathname);
+        if (IS_ERR(pname))
+                return PTR_ERR(pname);
+
+        raw = bpf_fd_probe_obj(ufd, &type);
+        if (IS_ERR(raw)) {
+                ret = PTR_ERR(raw);
+                goto out;
+        }
+
+        ret = bpf_obj_do_pin(pname, raw, type);
+        if (ret != 0)
+                bpf_any_put(raw, type);
+out:
+        putname(pname);
+        return ret;
+}
+
+static void *bpf_obj_do_get(const struct filename *pathname,
+                            enum bpf_type *type)
+{
+        struct inode *inode;
+        struct path path;
+        void *raw;
+        int ret;
+
+        ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+        if (ret)
+                return ERR_PTR(ret);
+
+        inode = d_backing_inode(path.dentry);
+        ret = inode_permission(inode, MAY_WRITE);
+        if (ret)
+                goto out;
+
+        ret = bpf_inode_type(inode, type);
+        if (ret)
+                goto out;
+
+        raw = bpf_any_get(inode->i_private, *type);
+        touch_atime(&path);
+
+        path_put(&path);
+        return raw;
+out:
+        path_put(&path);
+        return ERR_PTR(ret);
+}
+
+int bpf_obj_get_user(const char __user *pathname)
+{
+        enum bpf_type type = BPF_TYPE_UNSPEC;
+        struct filename *pname;
+        int ret = -ENOENT;
+        void *raw;
+
+        pname = getname(pathname);
+        if (IS_ERR(pname))
+                return PTR_ERR(pname);
+
+        raw = bpf_obj_do_get(pname, &type);
+        if (IS_ERR(raw)) {
+                ret = PTR_ERR(raw);
+                goto out;
+        }
+
+        if (type == BPF_TYPE_PROG)
+                ret = bpf_prog_new_fd(raw);
+        else if (type == BPF_TYPE_MAP)
+                ret = bpf_map_new_fd(raw);
+        else
+                goto out;
+
+        if (ret < 0)
+                bpf_any_put(raw, type);
+out:
+        putname(pname);
+        return ret;
+}
+
+static void bpf_evict_inode(struct inode *inode)
+{
+        enum bpf_type type;
+
+        truncate_inode_pages_final(&inode->i_data);
+        clear_inode(inode);
+
+        if (!bpf_inode_type(inode, &type))
+                bpf_any_put(inode->i_private, type);
+}
+
+static const struct super_operations bpf_super_ops = {
+        .statfs = simple_statfs,
+        .drop_inode = generic_delete_inode,
+        .evict_inode = bpf_evict_inode,
+};
+
+static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+{
+        static struct tree_descr bpf_rfiles[] = { { "" } };
+        struct inode *inode;
+        int ret;
+
+        ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
+        if (ret)
+                return ret;
+
+        sb->s_op = &bpf_super_ops;
+
+        inode = sb->s_root->d_inode;
+        inode->i_op = &bpf_dir_iops;
+        inode->i_mode &= ~S_IALLUGO;
+        inode->i_mode |= S_ISVTX | S_IRWXUGO;
+
+        return 0;
+}
+
+static struct dentry *bpf_mount(struct file_system_type *type, int flags,
+                                const char *dev_name, void *data)
+{
+        return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+}
+
+static struct file_system_type bpf_fs_type = {
+        .owner = THIS_MODULE,
+        .name = "bpf",
+        .mount = bpf_mount,
+        .kill_sb = kill_litter_super,
+        .fs_flags = FS_USERNS_MOUNT,
+};
+
+MODULE_ALIAS_FS("bpf");
+
+static int __init bpf_init(void)
+{
+        int ret;
+
+        ret = sysfs_create_mount_point(fs_kobj, "bpf");
+        if (ret)
+                return ret;
+
+        ret = register_filesystem(&bpf_fs_type);
+        if (ret)
+                sysfs_remove_mount_point(fs_kobj, "bpf");
+
+        return ret;
+}
+fs_initcall(bpf_init);
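
The backend above only becomes reachable once the "bpf" filesystem is mounted. A hedged user-space sketch of mounting it on the /sys/fs/bpf mount point registered by bpf_init() follows; the target directory is just the conventional location, any empty directory would do, and error handling is minimal.

/* Illustrative sketch: mount the new "bpf" pseudo filesystem so pinned
 * programs and maps show up as files.  Requires a suitably privileged caller.
 */
#include <sys/mount.h>
#include <stdio.h>

int main(void)
{
        if (mount("bpf", "/sys/fs/bpf", "bpf", 0, NULL)) {
                perror("mount bpf");
                return 1;
        }
        printf("bpf filesystem mounted on /sys/fs/bpf\n");
        return 0;
}
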
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35bac8e8b071..0d3313d02a7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
18#include <linux/filter.h> 18#include <linux/filter.h>
19#include <linux/version.h> 19#include <linux/version.h>
20 20
21int sysctl_unprivileged_bpf_disabled __read_mostly;
22
21static LIST_HEAD(bpf_map_types); 23static LIST_HEAD(bpf_map_types);
22 24
23static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 25static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
44 list_add(&tl->list_node, &bpf_map_types); 46 list_add(&tl->list_node, &bpf_map_types);
45} 47}
46 48
49static int bpf_map_charge_memlock(struct bpf_map *map)
50{
51 struct user_struct *user = get_current_user();
52 unsigned long memlock_limit;
53
54 memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
55
56 atomic_long_add(map->pages, &user->locked_vm);
57
58 if (atomic_long_read(&user->locked_vm) > memlock_limit) {
59 atomic_long_sub(map->pages, &user->locked_vm);
60 free_uid(user);
61 return -EPERM;
62 }
63 map->user = user;
64 return 0;
65}
66
67static void bpf_map_uncharge_memlock(struct bpf_map *map)
68{
69 struct user_struct *user = map->user;
70
71 atomic_long_sub(map->pages, &user->locked_vm);
72 free_uid(user);
73}
74
47/* called from workqueue */ 75/* called from workqueue */
48static void bpf_map_free_deferred(struct work_struct *work) 76static void bpf_map_free_deferred(struct work_struct *work)
49{ 77{
50 struct bpf_map *map = container_of(work, struct bpf_map, work); 78 struct bpf_map *map = container_of(work, struct bpf_map, work);
51 79
80 bpf_map_uncharge_memlock(map);
52 /* implementation dependent freeing */ 81 /* implementation dependent freeing */
53 map->ops->map_free(map); 82 map->ops->map_free(map);
54} 83}
@@ -82,6 +111,12 @@ static const struct file_operations bpf_map_fops = {
82 .release = bpf_map_release, 111 .release = bpf_map_release,
83}; 112};
84 113
114int bpf_map_new_fd(struct bpf_map *map)
115{
116 return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
117 O_RDWR | O_CLOEXEC);
118}
119
85/* helper macro to check that unused fields 'union bpf_attr' are zero */ 120/* helper macro to check that unused fields 'union bpf_attr' are zero */
86#define CHECK_ATTR(CMD) \ 121#define CHECK_ATTR(CMD) \
87 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ 122 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
@@ -108,8 +143,11 @@ static int map_create(union bpf_attr *attr)
108 143
109 atomic_set(&map->refcnt, 1); 144 atomic_set(&map->refcnt, 1);
110 145
111 err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); 146 err = bpf_map_charge_memlock(map);
147 if (err)
148 goto free_map;
112 149
150 err = bpf_map_new_fd(map);
113 if (err < 0) 151 if (err < 0)
114 /* failed to allocate fd */ 152 /* failed to allocate fd */
115 goto free_map; 153 goto free_map;
@@ -124,19 +162,29 @@ free_map:
124/* if error is returned, fd is released. 162/* if error is returned, fd is released.
125 * On success caller should complete fd access with matching fdput() 163 * On success caller should complete fd access with matching fdput()
126 */ 164 */
127struct bpf_map *bpf_map_get(struct fd f) 165struct bpf_map *__bpf_map_get(struct fd f)
128{ 166{
129 struct bpf_map *map;
130
131 if (!f.file) 167 if (!f.file)
132 return ERR_PTR(-EBADF); 168 return ERR_PTR(-EBADF);
133
134 if (f.file->f_op != &bpf_map_fops) { 169 if (f.file->f_op != &bpf_map_fops) {
135 fdput(f); 170 fdput(f);
136 return ERR_PTR(-EINVAL); 171 return ERR_PTR(-EINVAL);
137 } 172 }
138 173
139 map = f.file->private_data; 174 return f.file->private_data;
175}
176
177struct bpf_map *bpf_map_get(u32 ufd)
178{
179 struct fd f = fdget(ufd);
180 struct bpf_map *map;
181
182 map = __bpf_map_get(f);
183 if (IS_ERR(map))
184 return map;
185
186 atomic_inc(&map->refcnt);
187 fdput(f);
140 188
141 return map; 189 return map;
142} 190}
@@ -164,7 +212,7 @@ static int map_lookup_elem(union bpf_attr *attr)
164 return -EINVAL; 212 return -EINVAL;
165 213
166 f = fdget(ufd); 214 f = fdget(ufd);
167 map = bpf_map_get(f); 215 map = __bpf_map_get(f);
168 if (IS_ERR(map)) 216 if (IS_ERR(map))
169 return PTR_ERR(map); 217 return PTR_ERR(map);
170 218
@@ -223,7 +271,7 @@ static int map_update_elem(union bpf_attr *attr)
223 return -EINVAL; 271 return -EINVAL;
224 272
225 f = fdget(ufd); 273 f = fdget(ufd);
226 map = bpf_map_get(f); 274 map = __bpf_map_get(f);
227 if (IS_ERR(map)) 275 if (IS_ERR(map))
228 return PTR_ERR(map); 276 return PTR_ERR(map);
229 277
@@ -276,7 +324,7 @@ static int map_delete_elem(union bpf_attr *attr)
276 return -EINVAL; 324 return -EINVAL;
277 325
278 f = fdget(ufd); 326 f = fdget(ufd);
279 map = bpf_map_get(f); 327 map = __bpf_map_get(f);
280 if (IS_ERR(map)) 328 if (IS_ERR(map))
281 return PTR_ERR(map); 329 return PTR_ERR(map);
282 330
@@ -317,7 +365,7 @@ static int map_get_next_key(union bpf_attr *attr)
317 return -EINVAL; 365 return -EINVAL;
318 366
319 f = fdget(ufd); 367 f = fdget(ufd);
320 map = bpf_map_get(f); 368 map = __bpf_map_get(f);
321 if (IS_ERR(map)) 369 if (IS_ERR(map))
322 return PTR_ERR(map); 370 return PTR_ERR(map);
323 371
@@ -402,6 +450,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
402 */ 450 */
403 BUG_ON(!prog->aux->ops->get_func_proto); 451 BUG_ON(!prog->aux->ops->get_func_proto);
404 452
453 if (insn->imm == BPF_FUNC_get_route_realm)
454 prog->dst_needed = 1;
455 if (insn->imm == BPF_FUNC_get_prandom_u32)
456 bpf_user_rnd_init_once();
405 if (insn->imm == BPF_FUNC_tail_call) { 457 if (insn->imm == BPF_FUNC_tail_call) {
406 /* mark bpf_tail_call as different opcode 458 /* mark bpf_tail_call as different opcode
407 * to avoid conditional branch in 459 * to avoid conditional branch in
@@ -436,29 +488,51 @@ static void free_used_maps(struct bpf_prog_aux *aux)
436 kfree(aux->used_maps); 488 kfree(aux->used_maps);
437} 489}
438 490
439static void __prog_put_rcu(struct rcu_head *rcu) 491static int bpf_prog_charge_memlock(struct bpf_prog *prog)
492{
493 struct user_struct *user = get_current_user();
494 unsigned long memlock_limit;
495
496 memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
497
498 atomic_long_add(prog->pages, &user->locked_vm);
499 if (atomic_long_read(&user->locked_vm) > memlock_limit) {
500 atomic_long_sub(prog->pages, &user->locked_vm);
501 free_uid(user);
502 return -EPERM;
503 }
504 prog->aux->user = user;
505 return 0;
506}
507
508static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
509{
510 struct user_struct *user = prog->aux->user;
511
512 atomic_long_sub(prog->pages, &user->locked_vm);
513 free_uid(user);
514}
515
516static void __prog_put_common(struct rcu_head *rcu)
440{ 517{
441 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 518 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
442 519
443 free_used_maps(aux); 520 free_used_maps(aux);
521 bpf_prog_uncharge_memlock(aux->prog);
444 bpf_prog_free(aux->prog); 522 bpf_prog_free(aux->prog);
445} 523}
446 524
447/* version of bpf_prog_put() that is called after a grace period */ 525/* version of bpf_prog_put() that is called after a grace period */
448void bpf_prog_put_rcu(struct bpf_prog *prog) 526void bpf_prog_put_rcu(struct bpf_prog *prog)
449{ 527{
450 if (atomic_dec_and_test(&prog->aux->refcnt)) { 528 if (atomic_dec_and_test(&prog->aux->refcnt))
451 prog->aux->prog = prog; 529 call_rcu(&prog->aux->rcu, __prog_put_common);
452 call_rcu(&prog->aux->rcu, __prog_put_rcu);
453 }
454} 530}
455 531
456void bpf_prog_put(struct bpf_prog *prog) 532void bpf_prog_put(struct bpf_prog *prog)
457{ 533{
458 if (atomic_dec_and_test(&prog->aux->refcnt)) { 534 if (atomic_dec_and_test(&prog->aux->refcnt))
459 free_used_maps(prog->aux); 535 __prog_put_common(&prog->aux->rcu);
460 bpf_prog_free(prog);
461 }
462} 536}
463EXPORT_SYMBOL_GPL(bpf_prog_put); 537EXPORT_SYMBOL_GPL(bpf_prog_put);
464 538
@@ -474,21 +548,22 @@ static const struct file_operations bpf_prog_fops = {
474 .release = bpf_prog_release, 548 .release = bpf_prog_release,
475}; 549};
476 550
477static struct bpf_prog *get_prog(struct fd f) 551int bpf_prog_new_fd(struct bpf_prog *prog)
478{ 552{
479 struct bpf_prog *prog; 553 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
554 O_RDWR | O_CLOEXEC);
555}
480 556
557static struct bpf_prog *__bpf_prog_get(struct fd f)
558{
481 if (!f.file) 559 if (!f.file)
482 return ERR_PTR(-EBADF); 560 return ERR_PTR(-EBADF);
483
484 if (f.file->f_op != &bpf_prog_fops) { 561 if (f.file->f_op != &bpf_prog_fops) {
485 fdput(f); 562 fdput(f);
486 return ERR_PTR(-EINVAL); 563 return ERR_PTR(-EINVAL);
487 } 564 }
488 565
489 prog = f.file->private_data; 566 return f.file->private_data;
490
491 return prog;
492} 567}
493 568
494/* called by sockets/tracing/seccomp before attaching program to an event 569/* called by sockets/tracing/seccomp before attaching program to an event
@@ -499,13 +574,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
499 struct fd f = fdget(ufd); 574 struct fd f = fdget(ufd);
500 struct bpf_prog *prog; 575 struct bpf_prog *prog;
501 576
502 prog = get_prog(f); 577 prog = __bpf_prog_get(f);
503
504 if (IS_ERR(prog)) 578 if (IS_ERR(prog))
505 return prog; 579 return prog;
506 580
507 atomic_inc(&prog->aux->refcnt); 581 atomic_inc(&prog->aux->refcnt);
508 fdput(f); 582 fdput(f);
583
509 return prog; 584 return prog;
510} 585}
511EXPORT_SYMBOL_GPL(bpf_prog_get); 586EXPORT_SYMBOL_GPL(bpf_prog_get);
@@ -540,11 +615,18 @@ static int bpf_prog_load(union bpf_attr *attr)
540 attr->kern_version != LINUX_VERSION_CODE) 615 attr->kern_version != LINUX_VERSION_CODE)
541 return -EINVAL; 616 return -EINVAL;
542 617
618 if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
619 return -EPERM;
620
543 /* plain bpf_prog allocation */ 621 /* plain bpf_prog allocation */
544 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 622 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
545 if (!prog) 623 if (!prog)
546 return -ENOMEM; 624 return -ENOMEM;
547 625
626 err = bpf_prog_charge_memlock(prog);
627 if (err)
628 goto free_prog_nouncharge;
629
548 prog->len = attr->insn_cnt; 630 prog->len = attr->insn_cnt;
549 631
550 err = -EFAULT; 632 err = -EFAULT;
@@ -553,10 +635,10 @@ static int bpf_prog_load(union bpf_attr *attr)
553 goto free_prog; 635 goto free_prog;
554 636
555 prog->orig_prog = NULL; 637 prog->orig_prog = NULL;
556 prog->jited = false; 638 prog->jited = 0;
557 639
558 atomic_set(&prog->aux->refcnt, 1); 640 atomic_set(&prog->aux->refcnt, 1);
559 prog->gpl_compatible = is_gpl; 641 prog->gpl_compatible = is_gpl ? 1 : 0;
560 642
561 /* find program type: socket_filter vs tracing_filter */ 643 /* find program type: socket_filter vs tracing_filter */
562 err = find_prog_type(type, prog); 644 err = find_prog_type(type, prog);
@@ -576,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr)
576 if (err < 0) 658 if (err < 0)
577 goto free_used_maps; 659 goto free_used_maps;
578 660
579 err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); 661 err = bpf_prog_new_fd(prog);
580 if (err < 0) 662 if (err < 0)
581 /* failed to allocate fd */ 663 /* failed to allocate fd */
582 goto free_used_maps; 664 goto free_used_maps;
@@ -586,20 +668,36 @@ static int bpf_prog_load(union bpf_attr *attr)
586free_used_maps: 668free_used_maps:
587 free_used_maps(prog->aux); 669 free_used_maps(prog->aux);
588free_prog: 670free_prog:
671 bpf_prog_uncharge_memlock(prog);
672free_prog_nouncharge:
589 bpf_prog_free(prog); 673 bpf_prog_free(prog);
590 return err; 674 return err;
591} 675}
592 676
677#define BPF_OBJ_LAST_FIELD bpf_fd
678
679static int bpf_obj_pin(const union bpf_attr *attr)
680{
681 if (CHECK_ATTR(BPF_OBJ))
682 return -EINVAL;
683
684 return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
685}
686
687static int bpf_obj_get(const union bpf_attr *attr)
688{
689 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
690 return -EINVAL;
691
692 return bpf_obj_get_user(u64_to_ptr(attr->pathname));
693}
694
593SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 695SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
594{ 696{
595 union bpf_attr attr = {}; 697 union bpf_attr attr = {};
596 int err; 698 int err;
597 699
598 /* the syscall is limited to root temporarily. This restriction will be 700 if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
599 * lifted when security audit is clean. Note that eBPF+tracing must have
600 * this restriction, since it may pass kernel data to user space
601 */
602 if (!capable(CAP_SYS_ADMIN))
603 return -EPERM; 701 return -EPERM;
604 702
605 if (!access_ok(VERIFY_READ, uattr, 1)) 703 if (!access_ok(VERIFY_READ, uattr, 1))
@@ -654,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
654 case BPF_PROG_LOAD: 752 case BPF_PROG_LOAD:
655 err = bpf_prog_load(&attr); 753 err = bpf_prog_load(&attr);
656 break; 754 break;
755 case BPF_OBJ_PIN:
756 err = bpf_obj_pin(&attr);
757 break;
758 case BPF_OBJ_GET:
759 err = bpf_obj_get(&attr);
760 break;
657 default: 761 default:
658 err = -EINVAL; 762 err = -EINVAL;
659 break; 763 break;
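
BPF_OBJ_PIN and BPF_OBJ_GET are driven through the existing bpf(2) multiplexer using only the new pathname and bpf_fd attributes. The sketch below is illustrative: it assumes uapi headers that already carry these commands, issues raw syscalls since no wrapper library is assumed, and the path under /sys/fs/bpf is only an example location.

/* Hedged sketch: create an array map, pin it, then re-open it by path. */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static uint64_t ptr_to_u64(const void *ptr)
{
        return (uint64_t)(unsigned long)ptr;
}

static int sys_bpf(int cmd, union bpf_attr *attr)
{
        return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
        union bpf_attr attr;
        int map_fd, pinned_fd;

        memset(&attr, 0, sizeof(attr));
        attr.map_type = BPF_MAP_TYPE_ARRAY;
        attr.key_size = 4;
        attr.value_size = 8;
        attr.max_entries = 16;
        map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
        if (map_fd < 0) {
                perror("BPF_MAP_CREATE");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.pathname = ptr_to_u64("/sys/fs/bpf/example_map");  /* example path */
        attr.bpf_fd = map_fd;
        if (sys_bpf(BPF_OBJ_PIN, &attr)) {
                perror("BPF_OBJ_PIN");
                return 1;
        }

        memset(&attr, 0, sizeof(attr));
        attr.pathname = ptr_to_u64("/sys/fs/bpf/example_map");
        pinned_fd = sys_bpf(BPF_OBJ_GET, &attr);
        if (pinned_fd < 0) {
                perror("BPF_OBJ_GET");
                return 1;
        }

        printf("map pinned and re-opened as fd %d\n", pinned_fd);
        return 0;
}

In practice a second process would perform only the BPF_OBJ_GET step to obtain a fresh fd for the already-pinned object, which is the point of the new commands.
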
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23000d6..c6073056badf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
199 struct verifier_state_list **explored_states; /* search pruning optimization */ 199 struct verifier_state_list **explored_states; /* search pruning optimization */
200 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ 200 struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
201 u32 used_map_cnt; /* number of used maps */ 201 u32 used_map_cnt; /* number of used maps */
202 bool allow_ptr_leaks;
202}; 203};
203 204
204/* verbose verifier prints what it's seeing 205/* verbose verifier prints what it's seeing
@@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock);
213 * verbose() is used to dump the verification trace to the log, so the user 214 * verbose() is used to dump the verification trace to the log, so the user
214 * can figure out what's wrong with the program 215 * can figure out what's wrong with the program
215 */ 216 */
216static void verbose(const char *fmt, ...) 217static __printf(1, 2) void verbose(const char *fmt, ...)
217{ 218{
218 va_list args; 219 va_list args;
219 220
@@ -244,6 +245,7 @@ static const struct {
244} func_limit[] = { 245} func_limit[] = {
245 {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, 246 {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
246 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, 247 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
248 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
247}; 249};
248 250
249static void print_verifier_state(struct verifier_env *env) 251static void print_verifier_state(struct verifier_env *env)
@@ -538,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size)
538 return -EINVAL; 540 return -EINVAL;
539} 541}
540 542
543static bool is_spillable_regtype(enum bpf_reg_type type)
544{
545 switch (type) {
546 case PTR_TO_MAP_VALUE:
547 case PTR_TO_MAP_VALUE_OR_NULL:
548 case PTR_TO_STACK:
549 case PTR_TO_CTX:
550 case FRAME_PTR:
551 case CONST_PTR_TO_MAP:
552 return true;
553 default:
554 return false;
555 }
556}
557
541/* check_stack_read/write functions track spill/fill of registers, 558/* check_stack_read/write functions track spill/fill of registers,
542 * stack boundary and alignment are checked in check_mem_access() 559 * stack boundary and alignment are checked in check_mem_access()
543 */ 560 */
@@ -550,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
550 */ 567 */
551 568
552 if (value_regno >= 0 && 569 if (value_regno >= 0 &&
553 (state->regs[value_regno].type == PTR_TO_MAP_VALUE || 570 is_spillable_regtype(state->regs[value_regno].type)) {
554 state->regs[value_regno].type == PTR_TO_STACK ||
555 state->regs[value_regno].type == PTR_TO_CTX)) {
556 571
557 /* register containing pointer is being spilled into stack */ 572 /* register containing pointer is being spilled into stack */
558 if (size != BPF_REG_SIZE) { 573 if (size != BPF_REG_SIZE) {
@@ -643,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
643 return -EACCES; 658 return -EACCES;
644} 659}
645 660
661static bool is_pointer_value(struct verifier_env *env, int regno)
662{
663 if (env->allow_ptr_leaks)
664 return false;
665
666 switch (env->cur_state.regs[regno].type) {
667 case UNKNOWN_VALUE:
668 case CONST_IMM:
669 return false;
670 default:
671 return true;
672 }
673}
674
646/* check whether memory at (regno + off) is accessible for t = (read | write) 675/* check whether memory at (regno + off) is accessible for t = (read | write)
647 * if t==write, value_regno is a register which value is stored into memory 676 * if t==write, value_regno is a register which value is stored into memory
648 * if t==read, value_regno is a register which will receive the value from memory 677 * if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +698,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
669 } 698 }
670 699
671 if (state->regs[regno].type == PTR_TO_MAP_VALUE) { 700 if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
701 if (t == BPF_WRITE && value_regno >= 0 &&
702 is_pointer_value(env, value_regno)) {
703 verbose("R%d leaks addr into map\n", value_regno);
704 return -EACCES;
705 }
672 err = check_map_access(env, regno, off, size); 706 err = check_map_access(env, regno, off, size);
673 if (!err && t == BPF_READ && value_regno >= 0) 707 if (!err && t == BPF_READ && value_regno >= 0)
674 mark_reg_unknown_value(state->regs, value_regno); 708 mark_reg_unknown_value(state->regs, value_regno);
675 709
676 } else if (state->regs[regno].type == PTR_TO_CTX) { 710 } else if (state->regs[regno].type == PTR_TO_CTX) {
711 if (t == BPF_WRITE && value_regno >= 0 &&
712 is_pointer_value(env, value_regno)) {
713 verbose("R%d leaks addr into ctx\n", value_regno);
714 return -EACCES;
715 }
677 err = check_ctx_access(env, off, size, t); 716 err = check_ctx_access(env, off, size, t);
678 if (!err && t == BPF_READ && value_regno >= 0) 717 if (!err && t == BPF_READ && value_regno >= 0)
679 mark_reg_unknown_value(state->regs, value_regno); 718 mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +723,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
684 verbose("invalid stack off=%d size=%d\n", off, size); 723 verbose("invalid stack off=%d size=%d\n", off, size);
685 return -EACCES; 724 return -EACCES;
686 } 725 }
687 if (t == BPF_WRITE) 726 if (t == BPF_WRITE) {
727 if (!env->allow_ptr_leaks &&
728 state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
729 size != BPF_REG_SIZE) {
730 verbose("attempt to corrupt spilled pointer on stack\n");
731 return -EACCES;
732 }
688 err = check_stack_write(state, off, size, value_regno); 733 err = check_stack_write(state, off, size, value_regno);
689 else 734 } else {
690 err = check_stack_read(state, off, size, value_regno); 735 err = check_stack_read(state, off, size, value_regno);
736 }
691 } else { 737 } else {
692 verbose("R%d invalid mem access '%s'\n", 738 verbose("R%d invalid mem access '%s'\n",
693 regno, reg_type_str[state->regs[regno].type]); 739 regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
775 return -EACCES; 821 return -EACCES;
776 } 822 }
777 823
778 if (arg_type == ARG_ANYTHING) 824 if (arg_type == ARG_ANYTHING) {
825 if (is_pointer_value(env, regno)) {
826 verbose("R%d leaks addr into helper function\n", regno);
827 return -EACCES;
828 }
779 return 0; 829 return 0;
830 }
780 831
781 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || 832 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
782 arg_type == ARG_PTR_TO_MAP_VALUE) { 833 arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -860,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
860 * don't allow any other map type to be passed into 911 * don't allow any other map type to be passed into
861 * the special func; 912 * the special func;
862 */ 913 */
863 if (bool_map != bool_func) 914 if (bool_func && bool_map != bool_func)
864 return -EINVAL; 915 return -EINVAL;
865 } 916 }
866 917
@@ -950,8 +1001,9 @@ static int check_call(struct verifier_env *env, int func_id)
950} 1001}
951 1002
952/* check validity of 32-bit and 64-bit arithmetic operations */ 1003/* check validity of 32-bit and 64-bit arithmetic operations */
953static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) 1004static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
954{ 1005{
1006 struct reg_state *regs = env->cur_state.regs;
955 u8 opcode = BPF_OP(insn->code); 1007 u8 opcode = BPF_OP(insn->code);
956 int err; 1008 int err;
957 1009
@@ -976,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
976 if (err) 1028 if (err)
977 return err; 1029 return err;
978 1030
1031 if (is_pointer_value(env, insn->dst_reg)) {
1032 verbose("R%d pointer arithmetic prohibited\n",
1033 insn->dst_reg);
1034 return -EACCES;
1035 }
1036
979 /* check dest operand */ 1037 /* check dest operand */
980 err = check_reg_arg(regs, insn->dst_reg, DST_OP); 1038 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
981 if (err) 1039 if (err)
@@ -1012,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
1012 */ 1070 */
1013 regs[insn->dst_reg] = regs[insn->src_reg]; 1071 regs[insn->dst_reg] = regs[insn->src_reg];
1014 } else { 1072 } else {
1073 if (is_pointer_value(env, insn->src_reg)) {
1074 verbose("R%d partial copy of pointer\n",
1075 insn->src_reg);
1076 return -EACCES;
1077 }
1015 regs[insn->dst_reg].type = UNKNOWN_VALUE; 1078 regs[insn->dst_reg].type = UNKNOWN_VALUE;
1016 regs[insn->dst_reg].map_ptr = NULL; 1079 regs[insn->dst_reg].map_ptr = NULL;
1017 } 1080 }
@@ -1061,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
1061 /* pattern match 'bpf_add Rx, imm' instruction */ 1124 /* pattern match 'bpf_add Rx, imm' instruction */
1062 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && 1125 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
1063 regs[insn->dst_reg].type == FRAME_PTR && 1126 regs[insn->dst_reg].type == FRAME_PTR &&
1064 BPF_SRC(insn->code) == BPF_K) 1127 BPF_SRC(insn->code) == BPF_K) {
1065 stack_relative = true; 1128 stack_relative = true;
1129 } else if (is_pointer_value(env, insn->dst_reg)) {
1130 verbose("R%d pointer arithmetic prohibited\n",
1131 insn->dst_reg);
1132 return -EACCES;
1133 } else if (BPF_SRC(insn->code) == BPF_X &&
1134 is_pointer_value(env, insn->src_reg)) {
1135 verbose("R%d pointer arithmetic prohibited\n",
1136 insn->src_reg);
1137 return -EACCES;
1138 }
1066 1139
1067 /* check dest operand */ 1140 /* check dest operand */
1068 err = check_reg_arg(regs, insn->dst_reg, DST_OP); 1141 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
1101 err = check_reg_arg(regs, insn->src_reg, SRC_OP); 1174 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1102 if (err) 1175 if (err)
1103 return err; 1176 return err;
1177
1178 if (is_pointer_value(env, insn->src_reg)) {
1179 verbose("R%d pointer comparison prohibited\n",
1180 insn->src_reg);
1181 return -EACCES;
1182 }
1104 } else { 1183 } else {
1105 if (insn->src_reg != BPF_REG_0) { 1184 if (insn->src_reg != BPF_REG_0) {
1106 verbose("BPF_JMP uses reserved fields\n"); 1185 verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
1155 regs[insn->dst_reg].type = CONST_IMM; 1234 regs[insn->dst_reg].type = CONST_IMM;
1156 regs[insn->dst_reg].imm = 0; 1235 regs[insn->dst_reg].imm = 0;
1157 } 1236 }
1237 } else if (is_pointer_value(env, insn->dst_reg)) {
1238 verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
1239 return -EACCES;
1158 } else if (BPF_SRC(insn->code) == BPF_K && 1240 } else if (BPF_SRC(insn->code) == BPF_K &&
1159 (opcode == BPF_JEQ || opcode == BPF_JNE)) { 1241 (opcode == BPF_JEQ || opcode == BPF_JNE)) {
1160 1242
@@ -1658,7 +1740,7 @@ static int do_check(struct verifier_env *env)
1658 } 1740 }
1659 1741
1660 if (class == BPF_ALU || class == BPF_ALU64) { 1742 if (class == BPF_ALU || class == BPF_ALU64) {
1661 err = check_alu_op(regs, insn); 1743 err = check_alu_op(env, insn);
1662 if (err) 1744 if (err)
1663 return err; 1745 return err;
1664 1746
@@ -1816,6 +1898,11 @@ static int do_check(struct verifier_env *env)
1816 if (err) 1898 if (err)
1817 return err; 1899 return err;
1818 1900
1901 if (is_pointer_value(env, BPF_REG_0)) {
1902 verbose("R0 leaks addr as return value\n");
1903 return -EACCES;
1904 }
1905
1819process_bpf_exit: 1906process_bpf_exit:
1820 insn_idx = pop_stack(env, &prev_insn_idx); 1907 insn_idx = pop_stack(env, &prev_insn_idx);
1821 if (insn_idx < 0) { 1908 if (insn_idx < 0) {
@@ -1902,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
1902 } 1989 }
1903 1990
1904 f = fdget(insn->imm); 1991 f = fdget(insn->imm);
1905 1992 map = __bpf_map_get(f);
1906 map = bpf_map_get(f);
1907 if (IS_ERR(map)) { 1993 if (IS_ERR(map)) {
1908 verbose("fd %d is not pointing to valid bpf_map\n", 1994 verbose("fd %d is not pointing to valid bpf_map\n",
1909 insn->imm); 1995 insn->imm);
@@ -2024,7 +2110,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
2024 2110
2025 cnt = env->prog->aux->ops-> 2111 cnt = env->prog->aux->ops->
2026 convert_ctx_access(type, insn->dst_reg, insn->src_reg, 2112 convert_ctx_access(type, insn->dst_reg, insn->src_reg,
2027 insn->off, insn_buf); 2113 insn->off, insn_buf, env->prog);
2028 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { 2114 if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
2029 verbose("bpf verifier is misconfigured\n"); 2115 verbose("bpf verifier is misconfigured\n");
2030 return -EINVAL; 2116 return -EINVAL;
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
2144 if (ret < 0) 2230 if (ret < 0)
2145 goto skip_full_check; 2231 goto skip_full_check;
2146 2232
2233 env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
2234
2147 ret = do_check(env); 2235 ret = do_check(env);
2148 2236
2149skip_full_check: 2237skip_full_check:
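
The verifier hunks above all share one shape: bpf_check() captures a single allow_ptr_leaks flag from capable(CAP_SYS_ADMIN), and every per-instruction check that could expose a kernel address (pointer arithmetic, pointer comparison, returning a pointer in R0) consults it and rejects the program with -EACCES when the loader is unprivileged. Below is a minimal user-space sketch of that gating shape; all toy_* names are hypothetical and this is not the BPF verifier itself.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the privilege gate: one flag decided at verification start,
 * consulted by every check that could leak a kernel address. */
enum toy_reg_type { TOY_SCALAR, TOY_POINTER };

struct toy_env {
	bool allow_ptr_leaks;	/* capable(CAP_SYS_ADMIN) in the real code */
};

static bool is_pointer_value(const struct toy_env *env, enum toy_reg_type t)
{
	return !env->allow_ptr_leaks && t == TOY_POINTER;
}

static int check_cond_jmp(const struct toy_env *env, enum toy_reg_type src)
{
	if (is_pointer_value(env, src)) {
		fprintf(stderr, "pointer comparison prohibited\n");
		return -1;	/* stands in for -EACCES */
	}
	return 0;
}

int main(void)
{
	struct toy_env unpriv = { .allow_ptr_leaks = false };
	struct toy_env admin  = { .allow_ptr_leaks = true  };

	printf("unprivileged: %d\n", check_cond_jmp(&unpriv, TOY_POINTER)); /* -1 */
	printf("privileged:   %d\n", check_cond_jmp(&admin, TOY_POINTER));  /* 0 */
	return 0;
}

The real check_alu_op()/check_cond_jmp_op() additionally track per-register types across instructions; the sketch only models the privilege gate itself.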
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2cf0f79f1fc9..f1603c153890 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,6 @@
45#include <linux/sched.h> 45#include <linux/sched.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/spinlock.h> 47#include <linux/spinlock.h>
48#include <linux/rwsem.h>
49#include <linux/percpu-rwsem.h> 48#include <linux/percpu-rwsem.h>
50#include <linux/string.h> 49#include <linux/string.h>
51#include <linux/sort.h> 50#include <linux/sort.h>
@@ -76,7 +75,7 @@
76 * cgroup_mutex is the master lock. Any modification to cgroup or its 75 * cgroup_mutex is the master lock. Any modification to cgroup or its
77 * hierarchy must be performed while holding it. 76 * hierarchy must be performed while holding it.
78 * 77 *
79 * css_set_rwsem protects task->cgroups pointer, the list of css_set 78 * css_set_lock protects task->cgroups pointer, the list of css_set
80 * objects, and the chain of tasks off each css_set. 79 * objects, and the chain of tasks off each css_set.
81 * 80 *
82 * These locks are exported if CONFIG_PROVE_RCU so that accessors in 81 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -84,12 +83,12 @@
84 */ 83 */
85#ifdef CONFIG_PROVE_RCU 84#ifdef CONFIG_PROVE_RCU
86DEFINE_MUTEX(cgroup_mutex); 85DEFINE_MUTEX(cgroup_mutex);
87DECLARE_RWSEM(css_set_rwsem); 86DEFINE_SPINLOCK(css_set_lock);
88EXPORT_SYMBOL_GPL(cgroup_mutex); 87EXPORT_SYMBOL_GPL(cgroup_mutex);
89EXPORT_SYMBOL_GPL(css_set_rwsem); 88EXPORT_SYMBOL_GPL(css_set_lock);
90#else 89#else
91static DEFINE_MUTEX(cgroup_mutex); 90static DEFINE_MUTEX(cgroup_mutex);
92static DECLARE_RWSEM(css_set_rwsem); 91static DEFINE_SPINLOCK(css_set_lock);
93#endif 92#endif
94 93
95/* 94/*
@@ -139,6 +138,27 @@ static const char *cgroup_subsys_name[] = {
139}; 138};
140#undef SUBSYS 139#undef SUBSYS
141 140
141/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
142#define SUBSYS(_x) \
143 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
144 DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
145 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
146 EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
147#include <linux/cgroup_subsys.h>
148#undef SUBSYS
149
150#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
151static struct static_key_true *cgroup_subsys_enabled_key[] = {
152#include <linux/cgroup_subsys.h>
153};
154#undef SUBSYS
155
156#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
157static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
158#include <linux/cgroup_subsys.h>
159};
160#undef SUBSYS
161
142/* 162/*
143 * The default hierarchy, reserved for the subsystems that are otherwise 163 * The default hierarchy, reserved for the subsystems that are otherwise
144 * unattached - it never has more than a single cgroup, and all tasks are 164 * unattached - it never has more than a single cgroup, and all tasks are
@@ -153,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
153 */ 173 */
154static bool cgrp_dfl_root_visible; 174static bool cgrp_dfl_root_visible;
155 175
156/*
157 * Set by the boot param of the same name and makes subsystems with NULL
158 * ->dfl_files to use ->legacy_files on the default hierarchy.
159 */
160static bool cgroup_legacy_files_on_dfl;
161
162/* some controllers are not supported in the default hierarchy */ 176/* some controllers are not supported in the default hierarchy */
163static unsigned long cgrp_dfl_root_inhibit_ss_mask; 177static unsigned long cgrp_dfl_root_inhibit_ss_mask;
164 178
@@ -186,6 +200,7 @@ static u64 css_serial_nr_next = 1;
186 */ 200 */
187static unsigned long have_fork_callback __read_mostly; 201static unsigned long have_fork_callback __read_mostly;
188static unsigned long have_exit_callback __read_mostly; 202static unsigned long have_exit_callback __read_mostly;
203static unsigned long have_free_callback __read_mostly;
189 204
190/* Ditto for the can_fork callback. */ 205/* Ditto for the can_fork callback. */
191static unsigned long have_canfork_callback __read_mostly; 206static unsigned long have_canfork_callback __read_mostly;
@@ -195,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[];
195 210
196static int rebind_subsystems(struct cgroup_root *dst_root, 211static int rebind_subsystems(struct cgroup_root *dst_root,
197 unsigned long ss_mask); 212 unsigned long ss_mask);
213static void css_task_iter_advance(struct css_task_iter *it);
198static int cgroup_destroy_locked(struct cgroup *cgrp); 214static int cgroup_destroy_locked(struct cgroup *cgrp);
199static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, 215static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
200 bool visible); 216 bool visible);
201static void css_release(struct percpu_ref *ref); 217static void css_release(struct percpu_ref *ref);
202static void kill_css(struct cgroup_subsys_state *css); 218static void kill_css(struct cgroup_subsys_state *css);
203static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 219static int cgroup_addrm_files(struct cgroup_subsys_state *css,
220 struct cgroup *cgrp, struct cftype cfts[],
204 bool is_add); 221 bool is_add);
205 222
223/**
224 * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
225 * @ssid: subsys ID of interest
226 *
227 * cgroup_subsys_enabled() can only be used with literal subsys names, which
228 * is fine for individual subsystems but unsuitable for cgroup core. This
229 * is a slower static_key_enabled() based test indexed by @ssid.
230 */
231static bool cgroup_ssid_enabled(int ssid)
232{
233 return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
234}
235
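
The SUBSYS() x-macro block above stamps out one static key per controller and collects pointers to them in arrays indexed by subsystem ID, which is what lets cgroup_ssid_enabled() test by numeric @ssid while controllers keep the fast literal-name form. A small self-contained sketch of the same x-macro-plus-array idiom, using plain bools in place of static keys and hypothetical names:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for <linux/cgroup_subsys.h>: the list of controllers. */
#define TOY_SUBSYS_LIST(X) X(cpu) X(memory) X(io)

/* One flag per controller, like the per-subsystem static keys. */
#define X(name) static bool name##_enabled = true;
TOY_SUBSYS_LIST(X)
#undef X

/* IDs and an array of pointers indexed by ID, like cgroup_subsys_enabled_key[]. */
#define X(name) name##_id,
enum toy_subsys_id { TOY_SUBSYS_LIST(X) TOY_SUBSYS_COUNT };
#undef X

#define X(name) [name##_id] = &name##_enabled,
static bool *toy_subsys_enabled[] = { TOY_SUBSYS_LIST(X) };
#undef X

/* The slower, ID-indexed test used by "core" code. */
static bool toy_ssid_enabled(int ssid)
{
	return *toy_subsys_enabled[ssid];
}

int main(void)
{
	memory_enabled = false;
	for (int i = 0; i < TOY_SUBSYS_COUNT; i++)
		printf("subsys %d enabled: %d\n", i, toy_ssid_enabled(i));
	return 0;
}

The kernel builds the list by re-#including <linux/cgroup_subsys.h> under different SUBSYS() definitions; the sketch inlines the list for brevity.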
236/**
237 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
238 * @cgrp: the cgroup of interest
239 *
240 * The default hierarchy is the v2 interface of cgroup and this function
241 * can be used to test whether a cgroup is on the default hierarchy for
242 * cases where a subsystem should behave differently depending on the
243 * interface version.
244 *
245 * The set of behaviors which change on the default hierarchy is still
246 * being determined and the mount option is prefixed with __DEVEL__.
247 *
248 * List of changed behaviors:
249 *
250 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
251 * and "name" are disallowed.
252 *
253 * - When mounting an existing superblock, mount options should match.
254 *
255 * - Remount is disallowed.
256 *
257 * - rename(2) is disallowed.
258 *
259 * - "tasks" is removed. Everything should be at process granularity. Use
260 * "cgroup.procs" instead.
261 *
262 * - "cgroup.procs" is not sorted. pids will be unique unless they got
263 * recycled in between reads.
264 *
265 * - "release_agent" and "notify_on_release" are removed. Replacement
266 * notification mechanism will be implemented.
267 *
268 * - "cgroup.clone_children" is removed.
269 *
270 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
271 * and its descendants contain no task; otherwise, 1. The file also
272 * generates kernfs notification which can be monitored through poll and
273 * [di]notify when the value of the file changes.
274 *
275 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
276 * take masks of ancestors with non-empty cpus/mems, instead of being
277 * moved to an ancestor.
278 *
279 * - cpuset: a task can be moved into an empty cpuset, and again it takes
280 * masks of ancestors.
281 *
282 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
283 * is not created.
284 *
285 * - blkcg: blk-throttle becomes properly hierarchical.
286 *
287 * - debug: disallowed on the default hierarchy.
288 */
289static bool cgroup_on_dfl(const struct cgroup *cgrp)
290{
291 return cgrp->root == &cgrp_dfl_root;
292}
293
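
Everything in the list above hangs off the one-line predicate at the end: a cgroup is on the v2 (default) hierarchy iff its root is cgrp_dfl_root, and controllers branch on that test to pick interface-version-specific behaviour. A toy illustration of such a branch follows; the names are hypothetical and not taken from this patch.

#include <stdbool.h>
#include <stdio.h>

struct toy_root { const char *name; };
struct toy_cgroup { struct toy_root *root; };

static struct toy_root toy_dfl_root = { "default (v2)" };
static struct toy_root toy_legacy_root = { "named legacy (v1)" };

static bool toy_on_dfl(const struct toy_cgroup *cgrp)
{
	return cgrp->root == &toy_dfl_root;	/* mirrors cgrp->root == &cgrp_dfl_root */
}

/* A controller picking version-specific behaviour. */
static const char *toy_notify_mode(const struct toy_cgroup *cgrp)
{
	return toy_on_dfl(cgrp) ? "cgroup.events + kernfs notify"
				: "release_agent + notify_on_release";
}

int main(void)
{
	struct toy_cgroup a = { &toy_dfl_root }, b = { &toy_legacy_root };

	printf("%s -> %s\n", a.root->name, toy_notify_mode(&a));
	printf("%s -> %s\n", b.root->name, toy_notify_mode(&b));
	return 0;
}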
206/* IDR wrappers which synchronize using cgroup_idr_lock */ 294/* IDR wrappers which synchronize using cgroup_idr_lock */
207static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, 295static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
208 gfp_t gfp_mask) 296 gfp_t gfp_mask)
@@ -211,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
211 299
212 idr_preload(gfp_mask); 300 idr_preload(gfp_mask);
213 spin_lock_bh(&cgroup_idr_lock); 301 spin_lock_bh(&cgroup_idr_lock);
214 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); 302 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
215 spin_unlock_bh(&cgroup_idr_lock); 303 spin_unlock_bh(&cgroup_idr_lock);
216 idr_preload_end(); 304 idr_preload_end();
217 return ret; 305 return ret;
@@ -335,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
335 return !(cgrp->self.flags & CSS_ONLINE); 423 return !(cgrp->self.flags & CSS_ONLINE);
336} 424}
337 425
426static void cgroup_get(struct cgroup *cgrp)
427{
428 WARN_ON_ONCE(cgroup_is_dead(cgrp));
429 css_get(&cgrp->self);
430}
431
432static bool cgroup_tryget(struct cgroup *cgrp)
433{
434 return css_tryget(&cgrp->self);
435}
436
437static void cgroup_put(struct cgroup *cgrp)
438{
439 css_put(&cgrp->self);
440}
441
338struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) 442struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
339{ 443{
340 struct cgroup *cgrp = of->kn->parent->priv; 444 struct cgroup *cgrp = of->kn->parent->priv;
@@ -484,19 +588,31 @@ struct css_set init_css_set = {
484 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 588 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
485 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 589 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
486 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 590 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
591 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
487}; 592};
488 593
489static int css_set_count = 1; /* 1 for init_css_set */ 594static int css_set_count = 1; /* 1 for init_css_set */
490 595
491/** 596/**
597 * css_set_populated - does a css_set contain any tasks?
598 * @cset: target css_set
599 */
600static bool css_set_populated(struct css_set *cset)
601{
602 lockdep_assert_held(&css_set_lock);
603
604 return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
605}
606
607/**
492 * cgroup_update_populated - update populated count of a cgroup 608
493 * @cgrp: the target cgroup 609 * @cgrp: the target cgroup
494 * @populated: inc or dec populated count 610 * @populated: inc or dec populated count
495 * 611 *
496 * @cgrp is either getting the first task (css_set) or losing the last. 612 * One of the css_sets associated with @cgrp is either getting its first
497 * Update @cgrp->populated_cnt accordingly. The count is propagated 613 * task or losing the last. Update @cgrp->populated_cnt accordingly. The
498 * towards root so that a given cgroup's populated_cnt is zero iff the 614 * count is propagated towards root so that a given cgroup's populated_cnt
499 * cgroup and all its descendants are empty. 615 * is zero iff the cgroup and all its descendants don't contain any tasks.
500 * 616 *
501 * @cgrp's interface file "cgroup.populated" is zero if 617 * @cgrp's interface file "cgroup.populated" is zero if
502 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt 618 * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
@@ -506,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */
506 */ 622 */
507static void cgroup_update_populated(struct cgroup *cgrp, bool populated) 623static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
508{ 624{
509 lockdep_assert_held(&css_set_rwsem); 625 lockdep_assert_held(&css_set_lock);
510 626
511 do { 627 do {
512 bool trigger; 628 bool trigger;
@@ -519,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
519 if (!trigger) 635 if (!trigger)
520 break; 636 break;
521 637
522 if (cgrp->populated_kn) 638 check_for_release(cgrp);
523 kernfs_notify(cgrp->populated_kn); 639 cgroup_file_notify(&cgrp->events_file);
640
524 cgrp = cgroup_parent(cgrp); 641 cgrp = cgroup_parent(cgrp);
525 } while (cgrp); 642 } while (cgrp);
526} 643}
527 644
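
The propagation loop above walks toward the root only for as long as a 0<->nonzero transition keeps happening; the first ancestor whose populated_cnt does not flip stops the walk, and only flips generate the check_for_release()/cgroup.events notifications. A standalone sketch of that loop, with hypothetical toy_* names:

#include <stdbool.h>
#include <stdio.h>

struct toy_cgroup {
	struct toy_cgroup *parent;
	int populated_cnt;
	const char *name;
};

/* Mirrors the propagation loop: stop once an ancestor's state doesn't flip. */
static void toy_update_populated(struct toy_cgroup *cgrp, bool populated)
{
	do {
		bool trigger;

		if (populated)
			trigger = !cgrp->populated_cnt++;
		else
			trigger = !--cgrp->populated_cnt;

		if (!trigger)
			break;

		printf("%s flipped to %s -> notify\n",
		       cgrp->name, populated ? "populated" : "empty");
		cgrp = cgrp->parent;
	} while (cgrp);
}

int main(void)
{
	struct toy_cgroup root = { NULL, 0, "root" };
	struct toy_cgroup mid  = { &root, 0, "mid" };
	struct toy_cgroup leaf = { &mid, 0, "leaf" };

	toy_update_populated(&leaf, true);	/* leaf, mid and root all flip */
	toy_update_populated(&leaf, true);	/* no flip, walk stops at leaf */
	toy_update_populated(&leaf, false);	/* one charge left: no flip */
	toy_update_populated(&leaf, false);	/* everything flips back to empty */
	return 0;
}

Running it shows the second charge and the first uncharge stopping at the leaf, which is exactly the "zero iff the cgroup and all its descendants contain no tasks" invariant described above.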
645/**
646 * css_set_update_populated - update populated state of a css_set
647 * @cset: target css_set
648 * @populated: whether @cset is populated or depopulated
649 *
650 * @cset is either getting the first task or losing the last. Update the
651 * ->populated_cnt of all associated cgroups accordingly.
652 */
653static void css_set_update_populated(struct css_set *cset, bool populated)
654{
655 struct cgrp_cset_link *link;
656
657 lockdep_assert_held(&css_set_lock);
658
659 list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
660 cgroup_update_populated(link->cgrp, populated);
661}
662
663/**
664 * css_set_move_task - move a task from one css_set to another
665 * @task: task being moved
666 * @from_cset: css_set @task currently belongs to (may be NULL)
667 * @to_cset: new css_set @task is being moved to (may be NULL)
668 * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
669 *
670 * Move @task from @from_cset to @to_cset. If @task didn't belong to any
671 * css_set, @from_cset can be NULL. If @task is being disassociated
672 * instead of moved, @to_cset can be NULL.
673 *
674 * This function automatically handles populated_cnt updates and
675 * css_task_iter adjustments but the caller is responsible for managing
676 * @from_cset and @to_cset's reference counts.
677 */
678static void css_set_move_task(struct task_struct *task,
679 struct css_set *from_cset, struct css_set *to_cset,
680 bool use_mg_tasks)
681{
682 lockdep_assert_held(&css_set_lock);
683
684 if (from_cset) {
685 struct css_task_iter *it, *pos;
686
687 WARN_ON_ONCE(list_empty(&task->cg_list));
688
689 /*
690 * @task is leaving, advance task iterators which are
691 * pointing to it so that they can resume at the next
692 * position. Advancing an iterator might remove it from
693 * the list, use safe walk. See css_task_iter_advance*()
694 * for details.
695 */
696 list_for_each_entry_safe(it, pos, &from_cset->task_iters,
697 iters_node)
698 if (it->task_pos == &task->cg_list)
699 css_task_iter_advance(it);
700
701 list_del_init(&task->cg_list);
702 if (!css_set_populated(from_cset))
703 css_set_update_populated(from_cset, false);
704 } else {
705 WARN_ON_ONCE(!list_empty(&task->cg_list));
706 }
707
708 if (to_cset) {
709 /*
710 * We are synchronized through cgroup_threadgroup_rwsem
711 * against PF_EXITING setting such that we can't race
712 * against cgroup_exit() changing the css_set to
713 * init_css_set and dropping the old one.
714 */
715 WARN_ON_ONCE(task->flags & PF_EXITING);
716
717 if (!css_set_populated(to_cset))
718 css_set_update_populated(to_cset, true);
719 rcu_assign_pointer(task->cgroups, to_cset);
720 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
721 &to_cset->tasks);
722 }
723}
724
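
css_set_move_task() folds several previously open-coded steps into one place; the subtle one is advancing any css_task_iter that is currently parked on the departing task before unlinking it. A minimal sketch of that iterator fix-up idiom on a doubly linked intrusive list (hypothetical types, not the kernel's list_head or css_task_iter):

#include <stdio.h>

/* Before unlinking a node, advance any iterator currently parked on it. */
struct node { struct node *prev, *next; int id; };
struct iter { struct node *pos; struct node *head; };

static void list_init(struct node *head) { head->prev = head->next = head; }

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev; n->next = head;
	head->prev->next = n; head->prev = n;
}

static void iter_advance(struct iter *it)
{
	it->pos = it->pos->next;
	if (it->pos == it->head)
		it->pos = NULL;		/* iteration finished */
}

static void list_del_fixup(struct node *n, struct iter *it)
{
	if (it->pos == n)		/* the css_task_iter case */
		iter_advance(it);
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

int main(void)
{
	struct node head, a = {0, 0, 1}, b = {0, 0, 2}, c = {0, 0, 3};
	struct iter it;

	list_init(&head);
	list_add_tail(&a, &head); list_add_tail(&b, &head); list_add_tail(&c, &head);

	it.head = &head; it.pos = &b;		/* iterator parked on b */
	list_del_fixup(&b, &it);		/* b leaves; iterator moves to c */
	printf("iterator now at id %d\n", it.pos ? it.pos->id : -1);	/* 3 */
	return 0;
}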
528/* 725/*
529 * hash table for cgroup groups. This improves the performance to find 726 * hash table for cgroup groups. This improves the performance to find
530 * an existing css_set. This hash doesn't (currently) take into 727 * an existing css_set. This hash doesn't (currently) take into
@@ -552,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
552 struct cgroup_subsys *ss; 749 struct cgroup_subsys *ss;
553 int ssid; 750 int ssid;
554 751
555 lockdep_assert_held(&css_set_rwsem); 752 lockdep_assert_held(&css_set_lock);
556 753
557 if (!atomic_dec_and_test(&cset->refcount)) 754 if (!atomic_dec_and_test(&cset->refcount))
558 return; 755 return;
@@ -564,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset)
564 css_set_count--; 761 css_set_count--;
565 762
566 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { 763 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
567 struct cgroup *cgrp = link->cgrp;
568
569 list_del(&link->cset_link); 764 list_del(&link->cset_link);
570 list_del(&link->cgrp_link); 765 list_del(&link->cgrp_link);
571 766 if (cgroup_parent(link->cgrp))
572 /* @cgrp can't go away while we're holding css_set_rwsem */ 767 cgroup_put(link->cgrp);
573 if (list_empty(&cgrp->cset_links)) {
574 cgroup_update_populated(cgrp, false);
575 check_for_release(cgrp);
576 }
577
578 kfree(link); 768 kfree(link);
579 } 769 }
580 770
@@ -591,9 +781,9 @@ static void put_css_set(struct css_set *cset)
591 if (atomic_add_unless(&cset->refcount, -1, 1)) 781 if (atomic_add_unless(&cset->refcount, -1, 1))
592 return; 782 return;
593 783
594 down_write(&css_set_rwsem); 784 spin_lock_bh(&css_set_lock);
595 put_css_set_locked(cset); 785 put_css_set_locked(cset);
596 up_write(&css_set_rwsem); 786 spin_unlock_bh(&css_set_lock);
597} 787}
598 788
599/* 789/*
@@ -782,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
782 link->cset = cset; 972 link->cset = cset;
783 link->cgrp = cgrp; 973 link->cgrp = cgrp;
784 974
785 if (list_empty(&cgrp->cset_links))
786 cgroup_update_populated(cgrp, true);
787 list_move(&link->cset_link, &cgrp->cset_links);
788
789 /* 975 /*
790 * Always add links to the tail of the list so that the list 976 * Always add links to the tail of the lists so that the lists are
791 * is sorted by order of hierarchy creation 977 * in choronological order.
792 */ 978 */
979 list_move_tail(&link->cset_link, &cgrp->cset_links);
793 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 980 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
981
982 if (cgroup_parent(cgrp))
983 cgroup_get(cgrp);
794} 984}
795 985
796/** 986/**
@@ -816,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
816 1006
817 /* First see if we already have a cgroup group that matches 1007 /* First see if we already have a cgroup group that matches
818 * the desired set */ 1008 * the desired set */
819 down_read(&css_set_rwsem); 1009 spin_lock_bh(&css_set_lock);
820 cset = find_existing_css_set(old_cset, cgrp, template); 1010 cset = find_existing_css_set(old_cset, cgrp, template);
821 if (cset) 1011 if (cset)
822 get_css_set(cset); 1012 get_css_set(cset);
823 up_read(&css_set_rwsem); 1013 spin_unlock_bh(&css_set_lock);
824 1014
825 if (cset) 1015 if (cset)
826 return cset; 1016 return cset;
@@ -841,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
841 INIT_LIST_HEAD(&cset->mg_tasks); 1031 INIT_LIST_HEAD(&cset->mg_tasks);
842 INIT_LIST_HEAD(&cset->mg_preload_node); 1032 INIT_LIST_HEAD(&cset->mg_preload_node);
843 INIT_LIST_HEAD(&cset->mg_node); 1033 INIT_LIST_HEAD(&cset->mg_node);
1034 INIT_LIST_HEAD(&cset->task_iters);
844 INIT_HLIST_NODE(&cset->hlist); 1035 INIT_HLIST_NODE(&cset->hlist);
845 1036
846 /* Copy the set of subsystem state objects generated in 1037 /* Copy the set of subsystem state objects generated in
847 * find_existing_css_set() */ 1038 * find_existing_css_set() */
848 memcpy(cset->subsys, template, sizeof(cset->subsys)); 1039 memcpy(cset->subsys, template, sizeof(cset->subsys));
849 1040
850 down_write(&css_set_rwsem); 1041 spin_lock_bh(&css_set_lock);
851 /* Add reference counts and links from the new css_set. */ 1042 /* Add reference counts and links from the new css_set. */
852 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 1043 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
853 struct cgroup *c = link->cgrp; 1044 struct cgroup *c = link->cgrp;
@@ -869,7 +1060,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
869 list_add_tail(&cset->e_cset_node[ssid], 1060 list_add_tail(&cset->e_cset_node[ssid],
870 &cset->subsys[ssid]->cgroup->e_csets[ssid]); 1061 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
871 1062
872 up_write(&css_set_rwsem); 1063 spin_unlock_bh(&css_set_lock);
873 1064
874 return cset; 1065 return cset;
875} 1066}
@@ -933,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
933 * Release all the links from cset_links to this hierarchy's 1124 * Release all the links from cset_links to this hierarchy's
934 * root cgroup 1125 * root cgroup
935 */ 1126 */
936 down_write(&css_set_rwsem); 1127 spin_lock_bh(&css_set_lock);
937 1128
938 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 1129 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
939 list_del(&link->cset_link); 1130 list_del(&link->cset_link);
940 list_del(&link->cgrp_link); 1131 list_del(&link->cgrp_link);
941 kfree(link); 1132 kfree(link);
942 } 1133 }
943 up_write(&css_set_rwsem); 1134
1135 spin_unlock_bh(&css_set_lock);
944 1136
945 if (!list_empty(&root->root_list)) { 1137 if (!list_empty(&root->root_list)) {
946 list_del(&root->root_list); 1138 list_del(&root->root_list);
@@ -962,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
962 struct cgroup *res = NULL; 1154 struct cgroup *res = NULL;
963 1155
964 lockdep_assert_held(&cgroup_mutex); 1156 lockdep_assert_held(&cgroup_mutex);
965 lockdep_assert_held(&css_set_rwsem); 1157 lockdep_assert_held(&css_set_lock);
966 1158
967 if (cset == &init_css_set) { 1159 if (cset == &init_css_set) {
968 res = &root->cgrp; 1160 res = &root->cgrp;
@@ -985,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
985 1177
986/* 1178/*
987 * Return the cgroup for "task" from the given hierarchy. Must be 1179 * Return the cgroup for "task" from the given hierarchy. Must be
988 * called with cgroup_mutex and css_set_rwsem held. 1180 * called with cgroup_mutex and css_set_lock held.
989 */ 1181 */
990static struct cgroup *task_cgroup_from_root(struct task_struct *task, 1182static struct cgroup *task_cgroup_from_root(struct task_struct *task,
991 struct cgroup_root *root) 1183 struct cgroup_root *root)
@@ -1024,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
1024 * update of a tasks cgroup pointer by cgroup_attach_task() 1216 * update of a tasks cgroup pointer by cgroup_attach_task()
1025 */ 1217 */
1026 1218
1027static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
1028static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 1219static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1029static const struct file_operations proc_cgroupstats_operations; 1220static const struct file_operations proc_cgroupstats_operations;
1030 1221
@@ -1047,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1047 * cgroup_file_mode - deduce file mode of a control file 1238 * cgroup_file_mode - deduce file mode of a control file
1048 * @cft: the control file in question 1239 * @cft: the control file in question
1049 * 1240 *
1050 * returns cft->mode if ->mode is not 0 1241 * S_IRUGO for read, S_IWUSR for write.
1051 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
1052 * returns S_IRUGO if it has only a read handler
1053 * returns S_IWUSR if it has only a write handler
1054 */ 1242 */
1055static umode_t cgroup_file_mode(const struct cftype *cft) 1243static umode_t cgroup_file_mode(const struct cftype *cft)
1056{ 1244{
1057 umode_t mode = 0; 1245 umode_t mode = 0;
1058 1246
1059 if (cft->mode)
1060 return cft->mode;
1061
1062 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 1247 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1063 mode |= S_IRUGO; 1248 mode |= S_IRUGO;
1064 1249
1065 if (cft->write_u64 || cft->write_s64 || cft->write) 1250 if (cft->write_u64 || cft->write_s64 || cft->write) {
1066 mode |= S_IWUSR; 1251 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1252 mode |= S_IWUGO;
1253 else
1254 mode |= S_IWUSR;
1255 }
1067 1256
1068 return mode; 1257 return mode;
1069} 1258}
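
The rewritten cgroup_file_mode() derives the mode purely from the handlers: any read handler contributes S_IRUGO (0444); a write handler contributes S_IWUSR (0200), or S_IWUGO (0222) when CFTYPE_WORLD_WRITABLE is set. Worked out in octal, assuming the standard S_I* values:

#include <stdio.h>

int main(void)
{
	unsigned read_write        = 0444 | 0200;	/* seq_show + write            -> 0644 */
	unsigned world_writable_rw = 0444 | 0222;	/* ... + CFTYPE_WORLD_WRITABLE -> 0666 */
	unsigned read_only         = 0444;		/* seq_show only               -> 0444 */

	printf("%o %o %o\n", read_write, world_writable_rw, read_only);
	return 0;
}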
1070 1259
1071static void cgroup_get(struct cgroup *cgrp)
1072{
1073 WARN_ON_ONCE(cgroup_is_dead(cgrp));
1074 css_get(&cgrp->self);
1075}
1076
1077static bool cgroup_tryget(struct cgroup *cgrp)
1078{
1079 return css_tryget(&cgrp->self);
1080}
1081
1082static void cgroup_put(struct cgroup *cgrp)
1083{
1084 css_put(&cgrp->self);
1085}
1086
1087/** 1260/**
1088 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask 1261 * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
1089 * @cgrp: the target cgroup 1262 * @cgrp: the target cgroup
@@ -1224,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1224} 1397}
1225 1398
1226/** 1399/**
1227 * cgroup_clear_dir - remove subsys files in a cgroup directory 1400 * css_clear_dir - remove subsys files in a cgroup directory
1228 * @cgrp: target cgroup 1401 * @css: target css
1229 * @subsys_mask: mask of the subsystem ids whose files should be removed 1402 * @cgrp_override: specify if target cgroup is different from css->cgroup
1230 */ 1403 */
1231static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1404static void css_clear_dir(struct cgroup_subsys_state *css,
1405 struct cgroup *cgrp_override)
1232{ 1406{
1233 struct cgroup_subsys *ss; 1407 struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1234 int i; 1408 struct cftype *cfts;
1235 1409
1236 for_each_subsys(ss, i) { 1410 list_for_each_entry(cfts, &css->ss->cfts, node)
1237 struct cftype *cfts; 1411 cgroup_addrm_files(css, cgrp, cfts, false);
1412}
1238 1413
1239 if (!(subsys_mask & (1 << i))) 1414/**
1240 continue; 1415 * css_populate_dir - create subsys files in a cgroup directory
1241 list_for_each_entry(cfts, &ss->cfts, node) 1416 * @css: target css
1242 cgroup_addrm_files(cgrp, cfts, false); 1417 * @cgrp_override: specify if target cgroup is different from css->cgroup
1418 *
1419 * On failure, no file is added.
1420 */
1421static int css_populate_dir(struct cgroup_subsys_state *css,
1422 struct cgroup *cgrp_override)
1423{
1424 struct cgroup *cgrp = cgrp_override ?: css->cgroup;
1425 struct cftype *cfts, *failed_cfts;
1426 int ret;
1427
1428 if (!css->ss) {
1429 if (cgroup_on_dfl(cgrp))
1430 cfts = cgroup_dfl_base_files;
1431 else
1432 cfts = cgroup_legacy_base_files;
1433
1434 return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
1243 } 1435 }
1436
1437 list_for_each_entry(cfts, &css->ss->cfts, node) {
1438 ret = cgroup_addrm_files(css, cgrp, cfts, true);
1439 if (ret < 0) {
1440 failed_cfts = cfts;
1441 goto err;
1442 }
1443 }
1444 return 0;
1445err:
1446 list_for_each_entry(cfts, &css->ss->cfts, node) {
1447 if (cfts == failed_cfts)
1448 break;
1449 cgroup_addrm_files(css, cgrp, cfts, false);
1450 }
1451 return ret;
1244} 1452}
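
css_populate_dir() keeps the promise "on failure, no file is added" by remembering which cftype array failed and walking the same list a second time, removing only the arrays that were added before the failing one. The same partial-rollback idiom in isolation, with hypothetical names:

#include <stdio.h>

static int apply_one(int i)
{
	if (i == 3)
		return -1;	/* simulate failure on the 4th entry */
	printf("added entry %d\n", i);
	return 0;
}

static void remove_one(int i)
{
	printf("removed entry %d\n", i);
}

static int apply_all(int n)
{
	int i, failed = -1, ret = 0;

	for (i = 0; i < n; i++) {
		ret = apply_one(i);
		if (ret < 0) {
			failed = i;
			break;
		}
	}
	if (failed < 0)
		return 0;

	/* roll back everything added before the failing entry */
	for (i = 0; i < failed; i++)
		remove_one(i);
	return ret;
}

int main(void)
{
	return apply_all(5) ? 1 : 0;	/* adds 0..2, fails at 3, removes 0..2 */
}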
1245 1453
1246static int rebind_subsystems(struct cgroup_root *dst_root, 1454static int rebind_subsystems(struct cgroup_root *dst_root,
1247 unsigned long ss_mask) 1455 unsigned long ss_mask)
1248{ 1456{
1457 struct cgroup *dcgrp = &dst_root->cgrp;
1249 struct cgroup_subsys *ss; 1458 struct cgroup_subsys *ss;
1250 unsigned long tmp_ss_mask; 1459 unsigned long tmp_ss_mask;
1251 int ssid, i, ret; 1460 int ssid, i, ret;
@@ -1267,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1267 if (dst_root == &cgrp_dfl_root) 1476 if (dst_root == &cgrp_dfl_root)
1268 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; 1477 tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
1269 1478
1270 ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask); 1479 for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
1271 if (ret) { 1480 struct cgroup *scgrp = &ss->root->cgrp;
1272 if (dst_root != &cgrp_dfl_root) 1481 int tssid;
1273 return ret; 1482
1483 ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
1484 if (!ret)
1485 continue;
1274 1486
1275 /* 1487 /*
1276 * Rebinding back to the default root is not allowed to 1488 * Rebinding back to the default root is not allowed to
@@ -1278,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
1278 * be rare. Moving subsystems back and forth even more so. 1490 * be rare. Moving subsystems back and forth even more so.
1279 * Just warn about it and continue. 1491 * Just warn about it and continue.
1280 */ 1492 */
1281 if (cgrp_dfl_root_visible) { 1493 if (dst_root == &cgrp_dfl_root) {
1282 pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", 1494 if (cgrp_dfl_root_visible) {
1283 ret, ss_mask); 1495 pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
1284 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); 1496 ret, ss_mask);
1497 pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
1498 }
1499 continue;
1500 }
1501
1502 for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
1503 if (tssid == ssid)
1504 break;
1505 css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
1285 } 1506 }
1507 return ret;
1286 } 1508 }
1287 1509
1288 /* 1510 /*
1289 * Nothing can fail from this point on. Remove files for the 1511 * Nothing can fail from this point on. Remove files for the
1290 * removed subsystems and rebind each subsystem. 1512 * removed subsystems and rebind each subsystem.
1291 */ 1513 */
1292 for_each_subsys_which(ss, ssid, &ss_mask)
1293 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1294
1295 for_each_subsys_which(ss, ssid, &ss_mask) { 1514 for_each_subsys_which(ss, ssid, &ss_mask) {
1296 struct cgroup_root *src_root; 1515 struct cgroup_root *src_root = ss->root;
1297 struct cgroup_subsys_state *css; 1516 struct cgroup *scgrp = &src_root->cgrp;
1517 struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
1298 struct css_set *cset; 1518 struct css_set *cset;
1299 1519
1300 src_root = ss->root; 1520 WARN_ON(!css || cgroup_css(dcgrp, ss));
1301 css = cgroup_css(&src_root->cgrp, ss);
1302 1521
1303 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); 1522 css_clear_dir(css, NULL);
1304 1523
1305 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); 1524 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
1306 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); 1525 rcu_assign_pointer(dcgrp->subsys[ssid], css);
1307 ss->root = dst_root; 1526 ss->root = dst_root;
1308 css->cgroup = &dst_root->cgrp; 1527 css->cgroup = dcgrp;
1309 1528
1310 down_write(&css_set_rwsem); 1529 spin_lock_bh(&css_set_lock);
1311 hash_for_each(css_set_table, i, cset, hlist) 1530 hash_for_each(css_set_table, i, cset, hlist)
1312 list_move_tail(&cset->e_cset_node[ss->id], 1531 list_move_tail(&cset->e_cset_node[ss->id],
1313 &dst_root->cgrp.e_csets[ss->id]); 1532 &dcgrp->e_csets[ss->id]);
1314 up_write(&css_set_rwsem); 1533 spin_unlock_bh(&css_set_lock);
1315 1534
1316 src_root->subsys_mask &= ~(1 << ssid); 1535 src_root->subsys_mask &= ~(1 << ssid);
1317 src_root->cgrp.subtree_control &= ~(1 << ssid); 1536 scgrp->subtree_control &= ~(1 << ssid);
1318 cgroup_refresh_child_subsys_mask(&src_root->cgrp); 1537 cgroup_refresh_child_subsys_mask(scgrp);
1319 1538
1320 /* default hierarchy doesn't enable controllers by default */ 1539 /* default hierarchy doesn't enable controllers by default */
1321 dst_root->subsys_mask |= 1 << ssid; 1540 dst_root->subsys_mask |= 1 << ssid;
1322 if (dst_root != &cgrp_dfl_root) { 1541 if (dst_root == &cgrp_dfl_root) {
1323 dst_root->cgrp.subtree_control |= 1 << ssid; 1542 static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
1324 cgroup_refresh_child_subsys_mask(&dst_root->cgrp); 1543 } else {
1544 dcgrp->subtree_control |= 1 << ssid;
1545 cgroup_refresh_child_subsys_mask(dcgrp);
1546 static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
1325 } 1547 }
1326 1548
1327 if (ss->bind) 1549 if (ss->bind)
1328 ss->bind(css); 1550 ss->bind(css);
1329 } 1551 }
1330 1552
1331 kernfs_activate(dst_root->cgrp.kn); 1553 kernfs_activate(dcgrp->kn);
1332 return 0; 1554 return 0;
1333} 1555}
1334 1556
@@ -1458,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1458 for_each_subsys(ss, i) { 1680 for_each_subsys(ss, i) {
1459 if (strcmp(token, ss->legacy_name)) 1681 if (strcmp(token, ss->legacy_name))
1460 continue; 1682 continue;
1461 if (ss->disabled) 1683 if (!cgroup_ssid_enabled(i))
1462 continue; 1684 continue;
1463 1685
1464 /* Mutually exclusive option 'all' + subsystem name */ 1686 /* Mutually exclusive option 'all' + subsystem name */
@@ -1489,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1489 */ 1711 */
1490 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1712 if (all_ss || (!one_ss && !opts->none && !opts->name))
1491 for_each_subsys(ss, i) 1713 for_each_subsys(ss, i)
1492 if (!ss->disabled) 1714 if (cgroup_ssid_enabled(i))
1493 opts->subsys_mask |= (1 << i); 1715 opts->subsys_mask |= (1 << i);
1494 1716
1495 /* 1717 /*
@@ -1585,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
1585{ 1807{
1586 struct task_struct *p, *g; 1808 struct task_struct *p, *g;
1587 1809
1588 down_write(&css_set_rwsem); 1810 spin_lock_bh(&css_set_lock);
1589 1811
1590 if (use_task_css_set_links) 1812 if (use_task_css_set_links)
1591 goto out_unlock; 1813 goto out_unlock;
@@ -1615,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void)
1615 if (!(p->flags & PF_EXITING)) { 1837 if (!(p->flags & PF_EXITING)) {
1616 struct css_set *cset = task_css_set(p); 1838 struct css_set *cset = task_css_set(p);
1617 1839
1618 list_add(&p->cg_list, &cset->tasks); 1840 if (!css_set_populated(cset))
1841 css_set_update_populated(cset, true);
1842 list_add_tail(&p->cg_list, &cset->tasks);
1619 get_css_set(cset); 1843 get_css_set(cset);
1620 } 1844 }
1621 spin_unlock_irq(&p->sighand->siglock); 1845 spin_unlock_irq(&p->sighand->siglock);
1622 } while_each_thread(g, p); 1846 } while_each_thread(g, p);
1623 read_unlock(&tasklist_lock); 1847 read_unlock(&tasklist_lock);
1624out_unlock: 1848out_unlock:
1625 up_write(&css_set_rwsem); 1849 spin_unlock_bh(&css_set_lock);
1626} 1850}
1627 1851
1628static void init_cgroup_housekeeping(struct cgroup *cgrp) 1852static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1632,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1632 1856
1633 INIT_LIST_HEAD(&cgrp->self.sibling); 1857 INIT_LIST_HEAD(&cgrp->self.sibling);
1634 INIT_LIST_HEAD(&cgrp->self.children); 1858 INIT_LIST_HEAD(&cgrp->self.children);
1859 INIT_LIST_HEAD(&cgrp->self.files);
1635 INIT_LIST_HEAD(&cgrp->cset_links); 1860 INIT_LIST_HEAD(&cgrp->cset_links);
1636 INIT_LIST_HEAD(&cgrp->pidlists); 1861 INIT_LIST_HEAD(&cgrp->pidlists);
1637 mutex_init(&cgrp->pidlist_mutex); 1862 mutex_init(&cgrp->pidlist_mutex);
@@ -1669,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1669{ 1894{
1670 LIST_HEAD(tmp_links); 1895 LIST_HEAD(tmp_links);
1671 struct cgroup *root_cgrp = &root->cgrp; 1896 struct cgroup *root_cgrp = &root->cgrp;
1672 struct cftype *base_files;
1673 struct css_set *cset; 1897 struct css_set *cset;
1674 int i, ret; 1898 int i, ret;
1675 1899
@@ -1686,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1686 goto out; 1910 goto out;
1687 1911
1688 /* 1912 /*
1689 * We're accessing css_set_count without locking css_set_rwsem here, 1913 * We're accessing css_set_count without locking css_set_lock here,
1690 * but that's OK - it can only be increased by someone holding 1914 * but that's OK - it can only be increased by someone holding
1691 * cgroup_lock, and that's us. The worst that can happen is that we 1915 * cgroup_lock, and that's us. The worst that can happen is that we
1692 * have some link structures left over 1916 * have some link structures left over
@@ -1708,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1708 } 1932 }
1709 root_cgrp->kn = root->kf_root->kn; 1933 root_cgrp->kn = root->kf_root->kn;
1710 1934
1711 if (root == &cgrp_dfl_root) 1935 ret = css_populate_dir(&root_cgrp->self, NULL);
1712 base_files = cgroup_dfl_base_files;
1713 else
1714 base_files = cgroup_legacy_base_files;
1715
1716 ret = cgroup_addrm_files(root_cgrp, base_files, true);
1717 if (ret) 1936 if (ret)
1718 goto destroy_root; 1937 goto destroy_root;
1719 1938
@@ -1733,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1733 * Link the root cgroup in this hierarchy into all the css_set 1952 * Link the root cgroup in this hierarchy into all the css_set
1734 * objects. 1953 * objects.
1735 */ 1954 */
1736 down_write(&css_set_rwsem); 1955 spin_lock_bh(&css_set_lock);
1737 hash_for_each(css_set_table, i, cset, hlist) 1956 hash_for_each(css_set_table, i, cset, hlist) {
1738 link_css_set(&tmp_links, cset, root_cgrp); 1957 link_css_set(&tmp_links, cset, root_cgrp);
1739 up_write(&css_set_rwsem); 1958 if (css_set_populated(cset))
1959 cgroup_update_populated(root_cgrp, true);
1960 }
1961 spin_unlock_bh(&css_set_lock);
1740 1962
1741 BUG_ON(!list_empty(&root_cgrp->self.children)); 1963 BUG_ON(!list_empty(&root_cgrp->self.children));
1742 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1964 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -1969,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1969 char *path = NULL; 2191 char *path = NULL;
1970 2192
1971 mutex_lock(&cgroup_mutex); 2193 mutex_lock(&cgroup_mutex);
1972 down_read(&css_set_rwsem); 2194 spin_lock_bh(&css_set_lock);
1973 2195
1974 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 2196 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1975 2197
@@ -1982,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1982 path = buf; 2204 path = buf;
1983 } 2205 }
1984 2206
1985 up_read(&css_set_rwsem); 2207 spin_unlock_bh(&css_set_lock);
1986 mutex_unlock(&cgroup_mutex); 2208 mutex_unlock(&cgroup_mutex);
1987 return path; 2209 return path;
1988} 2210}
@@ -2010,6 +2232,49 @@ struct cgroup_taskset {
2010 struct task_struct *cur_task; 2232 struct task_struct *cur_task;
2011}; 2233};
2012 2234
2235#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
2236 .src_csets = LIST_HEAD_INIT(tset.src_csets), \
2237 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
2238 .csets = &tset.src_csets, \
2239}
2240
2241/**
2242 * cgroup_taskset_add - try to add a migration target task to a taskset
2243 * @task: target task
2244 * @tset: target taskset
2245 *
2246 * Add @task, which is a migration target, to @tset. This function becomes
2247 * noop if @task doesn't need to be migrated. @task's css_set should have
2248 * been added as a migration source and @task->cg_list will be moved from
2249 * the css_set's tasks list to mg_tasks one.
2250 */
2251static void cgroup_taskset_add(struct task_struct *task,
2252 struct cgroup_taskset *tset)
2253{
2254 struct css_set *cset;
2255
2256 lockdep_assert_held(&css_set_lock);
2257
2258 /* @task either already exited or can't exit until the end */
2259 if (task->flags & PF_EXITING)
2260 return;
2261
2262 /* leave @task alone if post_fork() hasn't linked it yet */
2263 if (list_empty(&task->cg_list))
2264 return;
2265
2266 cset = task_css_set(task);
2267 if (!cset->mg_src_cgrp)
2268 return;
2269
2270 list_move_tail(&task->cg_list, &cset->mg_tasks);
2271 if (list_empty(&cset->mg_node))
2272 list_add_tail(&cset->mg_node, &tset->src_csets);
2273 if (list_empty(&cset->mg_dst_cset->mg_node))
2274 list_move_tail(&cset->mg_dst_cset->mg_node,
2275 &tset->dst_csets);
2276}
2277
2013/** 2278/**
2014 * cgroup_taskset_first - reset taskset and return the first task 2279 * cgroup_taskset_first - reset taskset and return the first task
2015 * @tset: taskset of interest 2280 * @tset: taskset of interest
@@ -2057,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
2057} 2322}
2058 2323
2059/** 2324/**
2060 * cgroup_task_migrate - move a task from one cgroup to another. 2325 * cgroup_taskset_migrate - migrate a taskset to a cgroup
2061 * @old_cgrp: the cgroup @tsk is being migrated from 2326 * @tset: target taskset
2062 * @tsk: the task being migrated 2327 * @dst_cgrp: destination cgroup
2063 * @new_cset: the new css_set @tsk is being attached to
2064 * 2328 *
2065 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. 2329 * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
2330 * ->can_attach callbacks fails and guarantees that either all or none of
2331 * the tasks in @tset are migrated. @tset is consumed regardless of
2332 * success.
2066 */ 2333 */
2067static void cgroup_task_migrate(struct cgroup *old_cgrp, 2334static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
2068 struct task_struct *tsk, 2335 struct cgroup *dst_cgrp)
2069 struct css_set *new_cset)
2070{ 2336{
2071 struct css_set *old_cset; 2337 struct cgroup_subsys_state *css, *failed_css = NULL;
2072 2338 struct task_struct *task, *tmp_task;
2073 lockdep_assert_held(&cgroup_mutex); 2339 struct css_set *cset, *tmp_cset;
2074 lockdep_assert_held(&css_set_rwsem); 2340 int i, ret;
2075 2341
2076 /* 2342 /* methods shouldn't be called if no task is actually migrating */
2077 * We are synchronized through cgroup_threadgroup_rwsem against 2343 if (list_empty(&tset->src_csets))
2078 * PF_EXITING setting such that we can't race against cgroup_exit() 2344 return 0;
2079 * changing the css_set to init_css_set and dropping the old one.
2080 */
2081 WARN_ON_ONCE(tsk->flags & PF_EXITING);
2082 old_cset = task_css_set(tsk);
2083 2345
2084 get_css_set(new_cset); 2346 /* check that we can legitimately attach to the cgroup */
2085 rcu_assign_pointer(tsk->cgroups, new_cset); 2347 for_each_e_css(css, i, dst_cgrp) {
2348 if (css->ss->can_attach) {
2349 ret = css->ss->can_attach(css, tset);
2350 if (ret) {
2351 failed_css = css;
2352 goto out_cancel_attach;
2353 }
2354 }
2355 }
2086 2356
2087 /* 2357 /*
2088 * Use move_tail so that cgroup_taskset_first() still returns the 2358 * Now that we're guaranteed success, proceed to move all tasks to
2089 * leader after migration. This works because cgroup_migrate() 2359 * the new cgroup. There are no failure cases after here, so this
2090 * ensures that the dst_cset of the leader is the first on the 2360 * is the commit point.
2091 * tset's dst_csets list.
2092 */ 2361 */
2093 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); 2362 spin_lock_bh(&css_set_lock);
2363 list_for_each_entry(cset, &tset->src_csets, mg_node) {
2364 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
2365 struct css_set *from_cset = task_css_set(task);
2366 struct css_set *to_cset = cset->mg_dst_cset;
2367
2368 get_css_set(to_cset);
2369 css_set_move_task(task, from_cset, to_cset, true);
2370 put_css_set_locked(from_cset);
2371 }
2372 }
2373 spin_unlock_bh(&css_set_lock);
2094 2374
2095 /* 2375 /*
2096 * We just gained a reference on old_cset by taking it from the 2376 * Migration is committed, all target tasks are now on dst_csets.
2097 * task. As trading it for new_cset is protected by cgroup_mutex, 2377 * Nothing is sensitive to fork() after this point. Notify
2098 * we're safe to drop it here; it will be freed under RCU. 2378 * controllers that migration is complete.
2099 */ 2379 */
2100 put_css_set_locked(old_cset); 2380 tset->csets = &tset->dst_csets;
2381
2382 for_each_e_css(css, i, dst_cgrp)
2383 if (css->ss->attach)
2384 css->ss->attach(css, tset);
2385
2386 ret = 0;
2387 goto out_release_tset;
2388
2389out_cancel_attach:
2390 for_each_e_css(css, i, dst_cgrp) {
2391 if (css == failed_css)
2392 break;
2393 if (css->ss->cancel_attach)
2394 css->ss->cancel_attach(css, tset);
2395 }
2396out_release_tset:
2397 spin_lock_bh(&css_set_lock);
2398 list_splice_init(&tset->dst_csets, &tset->src_csets);
2399 list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
2400 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2401 list_del_init(&cset->mg_node);
2402 }
2403 spin_unlock_bh(&css_set_lock);
2404 return ret;
2101} 2405}
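
cgroup_taskset_migrate() is a two-phase, all-or-nothing commit: every ->can_attach() gets a veto before any task is moved; a veto unwinds only the controllers that had already agreed via ->cancel_attach(); past the commit point the tasks move and the ->attach() notifications cannot fail. A compact sketch of that shape with hypothetical callbacks, not the cgroup API itself:

#include <stdio.h>

struct toy_controller {
	const char *name;
	int (*can_attach)(void);	/* may veto, must not change state */
	void (*attach)(void);		/* commit notification, cannot fail */
	void (*cancel_attach)(void);	/* undo a prior can_attach */
};

static int migrate(struct toy_controller *cs, int n)
{
	int i, ret;

	for (i = 0; i < n; i++) {		/* phase 1: ask everyone */
		if (cs[i].can_attach && (ret = cs[i].can_attach())) {
			while (i-- > 0)		/* unwind those that agreed */
				if (cs[i].cancel_attach)
					cs[i].cancel_attach();
			return ret;
		}
	}

	/* ... commit point: actually move the tasks here ... */

	for (i = 0; i < n; i++)			/* phase 2: notify everyone */
		if (cs[i].attach)
			cs[i].attach();
	return 0;
}

static int ok(void) { return 0; }
static int veto(void) { return -1; }
static void note(void) { puts("attached"); }
static void undo(void) { puts("cancelled"); }

int main(void)
{
	struct toy_controller good[] = { {"a", ok, note, undo}, {"b", ok, note, undo} };
	struct toy_controller bad[]  = { {"a", ok, note, undo}, {"b", veto, note, undo} };

	printf("good: %d\n", migrate(good, 2));	/* prints "attached" twice, 0 */
	printf("bad:  %d\n", migrate(bad, 2));	/* prints "cancelled" once, -1 */
	return 0;
}

With this helper in place, cgroup_migrate() and cgroup_update_dfl_csses() further down both just build a CGROUP_TASKSET_INIT taskset and hand it over, instead of duplicating the veto/unwind logic.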
2102 2406
2103/** 2407/**
@@ -2113,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2113 2417
2114 lockdep_assert_held(&cgroup_mutex); 2418 lockdep_assert_held(&cgroup_mutex);
2115 2419
2116 down_write(&css_set_rwsem); 2420 spin_lock_bh(&css_set_lock);
2117 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 2421 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
2118 cset->mg_src_cgrp = NULL; 2422 cset->mg_src_cgrp = NULL;
2119 cset->mg_dst_cset = NULL; 2423 cset->mg_dst_cset = NULL;
2120 list_del_init(&cset->mg_preload_node); 2424 list_del_init(&cset->mg_preload_node);
2121 put_css_set_locked(cset); 2425 put_css_set_locked(cset);
2122 } 2426 }
2123 up_write(&css_set_rwsem); 2427 spin_unlock_bh(&css_set_lock);
2124} 2428}
2125 2429
2126/** 2430/**
@@ -2146,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
2146 struct cgroup *src_cgrp; 2450 struct cgroup *src_cgrp;
2147 2451
2148 lockdep_assert_held(&cgroup_mutex); 2452 lockdep_assert_held(&cgroup_mutex);
2149 lockdep_assert_held(&css_set_rwsem); 2453 lockdep_assert_held(&css_set_lock);
2150 2454
2151 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 2455 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2152 2456
@@ -2235,9 +2539,9 @@ err:
2235 2539
2236/** 2540/**
2237 * cgroup_migrate - migrate a process or task to a cgroup 2541 * cgroup_migrate - migrate a process or task to a cgroup
2238 * @cgrp: the destination cgroup
2239 * @leader: the leader of the process or the task to migrate 2542 * @leader: the leader of the process or the task to migrate
2240 * @threadgroup: whether @leader points to the whole process or a single task 2543 * @threadgroup: whether @leader points to the whole process or a single task
2544 * @cgrp: the destination cgroup
2241 * 2545 *
2242 * Migrate a process or task denoted by @leader to @cgrp. If migrating a 2546 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
2243 * process, the caller must be holding cgroup_threadgroup_rwsem. The 2547 * process, the caller must be holding cgroup_threadgroup_rwsem. The
@@ -2251,115 +2555,29 @@ err:
2251 * decided for all targets by invoking group_migrate_prepare_dst() before 2555 * decided for all targets by invoking group_migrate_prepare_dst() before
2252 * actually starting migrating. 2556 * actually starting migrating.
2253 */ 2557 */
2254static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, 2558static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
2255 bool threadgroup) 2559 struct cgroup *cgrp)
2256{ 2560{
2257 struct cgroup_taskset tset = { 2561 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2258 .src_csets = LIST_HEAD_INIT(tset.src_csets), 2562 struct task_struct *task;
2259 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
2260 .csets = &tset.src_csets,
2261 };
2262 struct cgroup_subsys_state *css, *failed_css = NULL;
2263 struct css_set *cset, *tmp_cset;
2264 struct task_struct *task, *tmp_task;
2265 int i, ret;
2266 2563
2267 /* 2564 /*
2268 * Prevent freeing of tasks while we take a snapshot. Tasks that are 2565 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2269 * already PF_EXITING could be freed from underneath us unless we 2566 * already PF_EXITING could be freed from underneath us unless we
2270 * take an rcu_read_lock. 2567 * take an rcu_read_lock.
2271 */ 2568 */
2272 down_write(&css_set_rwsem); 2569 spin_lock_bh(&css_set_lock);
2273 rcu_read_lock(); 2570 rcu_read_lock();
2274 task = leader; 2571 task = leader;
2275 do { 2572 do {
2276 /* @task either already exited or can't exit until the end */ 2573 cgroup_taskset_add(task, &tset);
2277 if (task->flags & PF_EXITING)
2278 goto next;
2279
2280 /* leave @task alone if post_fork() hasn't linked it yet */
2281 if (list_empty(&task->cg_list))
2282 goto next;
2283
2284 cset = task_css_set(task);
2285 if (!cset->mg_src_cgrp)
2286 goto next;
2287
2288 /*
2289 * cgroup_taskset_first() must always return the leader.
2290 * Take care to avoid disturbing the ordering.
2291 */
2292 list_move_tail(&task->cg_list, &cset->mg_tasks);
2293 if (list_empty(&cset->mg_node))
2294 list_add_tail(&cset->mg_node, &tset.src_csets);
2295 if (list_empty(&cset->mg_dst_cset->mg_node))
2296 list_move_tail(&cset->mg_dst_cset->mg_node,
2297 &tset.dst_csets);
2298 next:
2299 if (!threadgroup) 2574 if (!threadgroup)
2300 break; 2575 break;
2301 } while_each_thread(leader, task); 2576 } while_each_thread(leader, task);
2302 rcu_read_unlock(); 2577 rcu_read_unlock();
2303 up_write(&css_set_rwsem); 2578 spin_unlock_bh(&css_set_lock);
2304
2305 /* methods shouldn't be called if no task is actually migrating */
2306 if (list_empty(&tset.src_csets))
2307 return 0;
2308
2309 /* check that we can legitimately attach to the cgroup */
2310 for_each_e_css(css, i, cgrp) {
2311 if (css->ss->can_attach) {
2312 ret = css->ss->can_attach(css, &tset);
2313 if (ret) {
2314 failed_css = css;
2315 goto out_cancel_attach;
2316 }
2317 }
2318 }
2319
2320 /*
2321 * Now that we're guaranteed success, proceed to move all tasks to
2322 * the new cgroup. There are no failure cases after here, so this
2323 * is the commit point.
2324 */
2325 down_write(&css_set_rwsem);
2326 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2327 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2328 cgroup_task_migrate(cset->mg_src_cgrp, task,
2329 cset->mg_dst_cset);
2330 }
2331 up_write(&css_set_rwsem);
2332
2333 /*
2334 * Migration is committed, all target tasks are now on dst_csets.
2335 * Nothing is sensitive to fork() after this point. Notify
2336 * controllers that migration is complete.
2337 */
2338 tset.csets = &tset.dst_csets;
2339 2579
2340 for_each_e_css(css, i, cgrp) 2580 return cgroup_taskset_migrate(&tset, cgrp);
2341 if (css->ss->attach)
2342 css->ss->attach(css, &tset);
2343
2344 ret = 0;
2345 goto out_release_tset;
2346
2347out_cancel_attach:
2348 for_each_e_css(css, i, cgrp) {
2349 if (css == failed_css)
2350 break;
2351 if (css->ss->cancel_attach)
2352 css->ss->cancel_attach(css, &tset);
2353 }
2354out_release_tset:
2355 down_write(&css_set_rwsem);
2356 list_splice_init(&tset.dst_csets, &tset.src_csets);
2357 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2358 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2359 list_del_init(&cset->mg_node);
2360 }
2361 up_write(&css_set_rwsem);
2362 return ret;
2363} 2581}
2364 2582
2365/** 2583/**
@@ -2378,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2378 int ret; 2596 int ret;
2379 2597
2380 /* look up all src csets */ 2598 /* look up all src csets */
2381 down_read(&css_set_rwsem); 2599 spin_lock_bh(&css_set_lock);
2382 rcu_read_lock(); 2600 rcu_read_lock();
2383 task = leader; 2601 task = leader;
2384 do { 2602 do {
@@ -2388,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
2388 break; 2606 break;
2389 } while_each_thread(leader, task); 2607 } while_each_thread(leader, task);
2390 rcu_read_unlock(); 2608 rcu_read_unlock();
2391 up_read(&css_set_rwsem); 2609 spin_unlock_bh(&css_set_lock);
2392 2610
2393 /* prepare dst csets and commit */ 2611 /* prepare dst csets and commit */
2394 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); 2612 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2395 if (!ret) 2613 if (!ret)
2396 ret = cgroup_migrate(dst_cgrp, leader, threadgroup); 2614 ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
2397 2615
2398 cgroup_migrate_finish(&preloaded_csets); 2616 cgroup_migrate_finish(&preloaded_csets);
2399 return ret; 2617 return ret;
@@ -2421,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
2421 struct cgroup *cgrp; 2639 struct cgroup *cgrp;
2422 struct inode *inode; 2640 struct inode *inode;
2423 2641
2424 down_read(&css_set_rwsem); 2642 spin_lock_bh(&css_set_lock);
2425 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 2643 cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
2426 up_read(&css_set_rwsem); 2644 spin_unlock_bh(&css_set_lock);
2427 2645
2428 while (!cgroup_is_descendant(dst_cgrp, cgrp)) 2646 while (!cgroup_is_descendant(dst_cgrp, cgrp))
2429 cgrp = cgroup_parent(cgrp); 2647 cgrp = cgroup_parent(cgrp);
2430 2648
2431 ret = -ENOMEM; 2649 ret = -ENOMEM;
2432 inode = kernfs_get_inode(sb, cgrp->procs_kn); 2650 inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
2433 if (inode) { 2651 if (inode) {
2434 ret = inode_permission(inode, MAY_WRITE); 2652 ret = inode_permission(inode, MAY_WRITE);
2435 iput(inode); 2653 iput(inode);
@@ -2520,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2520 if (root == &cgrp_dfl_root) 2738 if (root == &cgrp_dfl_root)
2521 continue; 2739 continue;
2522 2740
2523 down_read(&css_set_rwsem); 2741 spin_lock_bh(&css_set_lock);
2524 from_cgrp = task_cgroup_from_root(from, root); 2742 from_cgrp = task_cgroup_from_root(from, root);
2525 up_read(&css_set_rwsem); 2743 spin_unlock_bh(&css_set_lock);
2526 2744
2527 retval = cgroup_attach_task(from_cgrp, tsk, false); 2745 retval = cgroup_attach_task(from_cgrp, tsk, false);
2528 if (retval) 2746 if (retval)
@@ -2637,6 +2855,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2637static int cgroup_update_dfl_csses(struct cgroup *cgrp) 2855static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2638{ 2856{
2639 LIST_HEAD(preloaded_csets); 2857 LIST_HEAD(preloaded_csets);
2858 struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
2640 struct cgroup_subsys_state *css; 2859 struct cgroup_subsys_state *css;
2641 struct css_set *src_cset; 2860 struct css_set *src_cset;
2642 int ret; 2861 int ret;
@@ -2646,7 +2865,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2646 percpu_down_write(&cgroup_threadgroup_rwsem); 2865 percpu_down_write(&cgroup_threadgroup_rwsem);
2647 2866
2648 /* look up all csses currently attached to @cgrp's subtree */ 2867 /* look up all csses currently attached to @cgrp's subtree */
2649 down_read(&css_set_rwsem); 2868 spin_lock_bh(&css_set_lock);
2650 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { 2869 css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
2651 struct cgrp_cset_link *link; 2870 struct cgrp_cset_link *link;
2652 2871
@@ -2658,57 +2877,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
2658 cgroup_migrate_add_src(link->cset, cgrp, 2877 cgroup_migrate_add_src(link->cset, cgrp,
2659 &preloaded_csets); 2878 &preloaded_csets);
2660 } 2879 }
2661 up_read(&css_set_rwsem); 2880 spin_unlock_bh(&css_set_lock);
2662 2881
2663 /* NULL dst indicates self on default hierarchy */ 2882 /* NULL dst indicates self on default hierarchy */
2664 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); 2883 ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
2665 if (ret) 2884 if (ret)
2666 goto out_finish; 2885 goto out_finish;
2667 2886
2887 spin_lock_bh(&css_set_lock);
2668 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { 2888 list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
2669 struct task_struct *last_task = NULL, *task; 2889 struct task_struct *task, *ntask;
2670 2890
2671 /* src_csets precede dst_csets, break on the first dst_cset */ 2891 /* src_csets precede dst_csets, break on the first dst_cset */
2672 if (!src_cset->mg_src_cgrp) 2892 if (!src_cset->mg_src_cgrp)
2673 break; 2893 break;
2674 2894
2675 /* 2895 /* all tasks in src_csets need to be migrated */
2676 * All tasks in src_cset need to be migrated to the 2896 list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
2677 * matching dst_cset. Empty it process by process. We 2897 cgroup_taskset_add(task, &tset);
2678 * walk tasks but migrate processes. The leader might even
2679 * belong to a different cset but such src_cset would also
2680 * be among the target src_csets because the default
2681 * hierarchy enforces per-process membership.
2682 */
2683 while (true) {
2684 down_read(&css_set_rwsem);
2685 task = list_first_entry_or_null(&src_cset->tasks,
2686 struct task_struct, cg_list);
2687 if (task) {
2688 task = task->group_leader;
2689 WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
2690 get_task_struct(task);
2691 }
2692 up_read(&css_set_rwsem);
2693
2694 if (!task)
2695 break;
2696
2697 /* guard against possible infinite loop */
2698 if (WARN(last_task == task,
2699 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
2700 goto out_finish;
2701 last_task = task;
2702
2703 ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
2704
2705 put_task_struct(task);
2706
2707 if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
2708 goto out_finish;
2709 }
2710 } 2898 }
2899 spin_unlock_bh(&css_set_lock);
2711 2900
2901 ret = cgroup_taskset_migrate(&tset, cgrp);
2712out_finish: 2902out_finish:
2713 cgroup_migrate_finish(&preloaded_csets); 2903 cgroup_migrate_finish(&preloaded_csets);
2714 percpu_up_write(&cgroup_threadgroup_rwsem); 2904 percpu_up_write(&cgroup_threadgroup_rwsem);
@@ -2738,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2738 if (tok[0] == '\0') 2928 if (tok[0] == '\0')
2739 continue; 2929 continue;
2740 for_each_subsys_which(ss, ssid, &tmp_ss_mask) { 2930 for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
2741 if (ss->disabled || strcmp(tok + 1, ss->name)) 2931 if (!cgroup_ssid_enabled(ssid) ||
2932 strcmp(tok + 1, ss->name))
2742 continue; 2933 continue;
2743 2934
2744 if (*tok == '+') { 2935 if (*tok == '+') {
@@ -2862,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2862 ret = create_css(child, ss, 3053 ret = create_css(child, ss,
2863 cgrp->subtree_control & (1 << ssid)); 3054 cgrp->subtree_control & (1 << ssid));
2864 else 3055 else
2865 ret = cgroup_populate_dir(child, 1 << ssid); 3056 ret = css_populate_dir(cgroup_css(child, ss),
3057 NULL);
2866 if (ret) 3058 if (ret)
2867 goto err_undo_css; 3059 goto err_undo_css;
2868 } 3060 }
@@ -2895,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
2895 if (css_disable & (1 << ssid)) { 3087 if (css_disable & (1 << ssid)) {
2896 kill_css(css); 3088 kill_css(css);
2897 } else { 3089 } else {
2898 cgroup_clear_dir(child, 1 << ssid); 3090 css_clear_dir(css, NULL);
2899 if (ss->css_reset) 3091 if (ss->css_reset)
2900 ss->css_reset(css); 3092 ss->css_reset(css);
2901 } 3093 }
@@ -2943,15 +3135,16 @@ err_undo_css:
2943 if (css_enable & (1 << ssid)) 3135 if (css_enable & (1 << ssid))
2944 kill_css(css); 3136 kill_css(css);
2945 else 3137 else
2946 cgroup_clear_dir(child, 1 << ssid); 3138 css_clear_dir(css, NULL);
2947 } 3139 }
2948 } 3140 }
2949 goto out_unlock; 3141 goto out_unlock;
2950} 3142}
2951 3143
2952static int cgroup_populated_show(struct seq_file *seq, void *v) 3144static int cgroup_events_show(struct seq_file *seq, void *v)
2953{ 3145{
2954 seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); 3146 seq_printf(seq, "populated %d\n",
3147 cgroup_is_populated(seq_css(seq)->cgroup));
2955 return 0; 3148 return 0;
2956} 3149}
2957 3150
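Side note on the hunk above: the old cgroup.populated handler becomes cgroup_events_show(), which now emits a "populated <0|1>" key/value line instead of a bare integer (the file itself is renamed to cgroup.events further down in this patch), presumably so more event keys can be added later. As a hedged userspace-side sketch, not part of the patch, reading the new format might look like:

/* Hedged sketch: parse "populated <0|1>" from a cgroup.events file. */
#include <stdio.h>
#include <string.h>

static int cgroup_events_populated(const char *events_path)
{
	char key[32];
	int val = -1;
	FILE *f = fopen(events_path, "r");

	if (!f)
		return -1;
	while (fscanf(f, "%31s %d", key, &val) == 2) {
		if (!strcmp(key, "populated"))
			break;
		val = -1;
	}
	fclose(f);
	return val;	/* 1, 0, or -1 if the key was missing */
}
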
@@ -3094,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
3094 return kernfs_setattr(kn, &iattr); 3287 return kernfs_setattr(kn, &iattr);
3095} 3288}
3096 3289
3097static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 3290static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
3291 struct cftype *cft)
3098{ 3292{
3099 char name[CGROUP_FILE_NAME_MAX]; 3293 char name[CGROUP_FILE_NAME_MAX];
3100 struct kernfs_node *kn; 3294 struct kernfs_node *kn;
@@ -3116,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
3116 return ret; 3310 return ret;
3117 } 3311 }
3118 3312
3119 if (cft->write == cgroup_procs_write) 3313 if (cft->file_offset) {
3120 cgrp->procs_kn = kn; 3314 struct cgroup_file *cfile = (void *)css + cft->file_offset;
3121 else if (cft->seq_show == cgroup_populated_show) 3315
3122 cgrp->populated_kn = kn; 3316 kernfs_get(kn);
3317 cfile->kn = kn;
3318 list_add(&cfile->node, &css->files);
3319 }
3320
3123 return 0; 3321 return 0;
3124} 3322}
3125 3323
3126/** 3324/**
3127 * cgroup_addrm_files - add or remove files to a cgroup directory 3325 * cgroup_addrm_files - add or remove files to a cgroup directory
3128 * @cgrp: the target cgroup 3326 * @css: the target css
3327 * @cgrp: the target cgroup (usually css->cgroup)
3129 * @cfts: array of cftypes to be added 3328 * @cfts: array of cftypes to be added
3130 * @is_add: whether to add or remove 3329 * @is_add: whether to add or remove
3131 * 3330 *
3132 * Depending on @is_add, add or remove files defined by @cfts on @cgrp. 3331 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
3133 * For removals, this function never fails. If addition fails, this 3332 * For removals, this function never fails.
3134 * function doesn't remove files already added. The caller is responsible
3135 * for cleaning up.
3136 */ 3333 */
3137static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 3334static int cgroup_addrm_files(struct cgroup_subsys_state *css,
3335 struct cgroup *cgrp, struct cftype cfts[],
3138 bool is_add) 3336 bool is_add)
3139{ 3337{
3140 struct cftype *cft; 3338 struct cftype *cft, *cft_end = NULL;
3141 int ret; 3339 int ret;
3142 3340
3143 lockdep_assert_held(&cgroup_mutex); 3341 lockdep_assert_held(&cgroup_mutex);
3144 3342
3145 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3343restart:
3344 for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
3146 /* does cft->flags tell us to skip this file on @cgrp? */ 3345 /* does cft->flags tell us to skip this file on @cgrp? */
3147 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3346 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
3148 continue; 3347 continue;
@@ -3154,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
3154 continue; 3353 continue;
3155 3354
3156 if (is_add) { 3355 if (is_add) {
3157 ret = cgroup_add_file(cgrp, cft); 3356 ret = cgroup_add_file(css, cgrp, cft);
3158 if (ret) { 3357 if (ret) {
3159 pr_warn("%s: failed to add %s, err=%d\n", 3358 pr_warn("%s: failed to add %s, err=%d\n",
3160 __func__, cft->name, ret); 3359 __func__, cft->name, ret);
3161 return ret; 3360 cft_end = cft;
3361 is_add = false;
3362 goto restart;
3162 } 3363 }
3163 } else { 3364 } else {
3164 cgroup_rm_file(cgrp, cft); 3365 cgroup_rm_file(cgrp, cft);
@@ -3184,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
3184 if (cgroup_is_dead(cgrp)) 3385 if (cgroup_is_dead(cgrp))
3185 continue; 3386 continue;
3186 3387
3187 ret = cgroup_addrm_files(cgrp, cfts, is_add); 3388 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
3188 if (ret) 3389 if (ret)
3189 break; 3390 break;
3190 } 3391 }
@@ -3296,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3296{ 3497{
3297 int ret; 3498 int ret;
3298 3499
3299 if (ss->disabled) 3500 if (!cgroup_ssid_enabled(ss->id))
3300 return 0; 3501 return 0;
3301 3502
3302 if (!cfts || cfts[0].name[0] == '\0') 3503 if (!cfts || cfts[0].name[0] == '\0')
@@ -3346,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3346{ 3547{
3347 struct cftype *cft; 3548 struct cftype *cft;
3348 3549
3349 /* 3550 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3350 * If legacy_flies_on_dfl, we want to show the legacy files on the 3551 cft->flags |= __CFTYPE_NOT_ON_DFL;
3351 * dfl hierarchy but iff the target subsystem hasn't been updated
3352 * for the dfl hierarchy yet.
3353 */
3354 if (!cgroup_legacy_files_on_dfl ||
3355 ss->dfl_cftypes != ss->legacy_cftypes) {
3356 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3357 cft->flags |= __CFTYPE_NOT_ON_DFL;
3358 }
3359
3360 return cgroup_add_cftypes(ss, cfts); 3552 return cgroup_add_cftypes(ss, cfts);
3361} 3553}
3362 3554
@@ -3371,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
3371 int count = 0; 3563 int count = 0;
3372 struct cgrp_cset_link *link; 3564 struct cgrp_cset_link *link;
3373 3565
3374 down_read(&css_set_rwsem); 3566 spin_lock_bh(&css_set_lock);
3375 list_for_each_entry(link, &cgrp->cset_links, cset_link) 3567 list_for_each_entry(link, &cgrp->cset_links, cset_link)
3376 count += atomic_read(&link->cset->refcount); 3568 count += atomic_read(&link->cset->refcount);
3377 up_read(&css_set_rwsem); 3569 spin_unlock_bh(&css_set_lock);
3378 return count; 3570 return count;
3379} 3571}
3380 3572
@@ -3606,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
3606} 3798}
3607 3799
3608/** 3800/**
3609 * css_advance_task_iter - advance a task iterator to the next css_set 3801 * css_task_iter_advance_css_set - advance a task iterator to the next css_set
3610 * @it: the iterator to advance 3802 * @it: the iterator to advance
3611 * 3803 *
3612 * Advance @it to the next css_set to walk. 3804 * Advance @it to the next css_set to walk.
3613 */ 3805 */
3614static void css_advance_task_iter(struct css_task_iter *it) 3806static void css_task_iter_advance_css_set(struct css_task_iter *it)
3615{ 3807{
3616 struct list_head *l = it->cset_pos; 3808 struct list_head *l = it->cset_pos;
3617 struct cgrp_cset_link *link; 3809 struct cgrp_cset_link *link;
3618 struct css_set *cset; 3810 struct css_set *cset;
3619 3811
3812 lockdep_assert_held(&css_set_lock);
3813
3620 /* Advance to the next non-empty css_set */ 3814 /* Advance to the next non-empty css_set */
3621 do { 3815 do {
3622 l = l->next; 3816 l = l->next;
3623 if (l == it->cset_head) { 3817 if (l == it->cset_head) {
3624 it->cset_pos = NULL; 3818 it->cset_pos = NULL;
3819 it->task_pos = NULL;
3625 return; 3820 return;
3626 } 3821 }
3627 3822
@@ -3632,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
3632 link = list_entry(l, struct cgrp_cset_link, cset_link); 3827 link = list_entry(l, struct cgrp_cset_link, cset_link);
3633 cset = link->cset; 3828 cset = link->cset;
3634 } 3829 }
3635 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 3830 } while (!css_set_populated(cset));
3636 3831
3637 it->cset_pos = l; 3832 it->cset_pos = l;
3638 3833
@@ -3643,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it)
3643 3838
3644 it->tasks_head = &cset->tasks; 3839 it->tasks_head = &cset->tasks;
3645 it->mg_tasks_head = &cset->mg_tasks; 3840 it->mg_tasks_head = &cset->mg_tasks;
3841
3842 /*
3843 * We don't keep css_sets locked across iteration steps and thus
3844 * need to take steps to ensure that iteration can be resumed after
3845 * the lock is re-acquired. Iteration is performed at two levels -
3846 * css_sets and tasks in them.
3847 *
3848 * Once created, a css_set never leaves its cgroup lists, so a
3849 * pinned css_set is guaranteed to stay put and we can resume
3850 * iteration afterwards.
3851 *
3852 * Tasks may leave @cset across iteration steps. This is resolved
3853 * by registering each iterator with the css_set currently being
3854 * walked and making css_set_move_task() advance iterators whose
3855 * next task is leaving.
3856 */
3857 if (it->cur_cset) {
3858 list_del(&it->iters_node);
3859 put_css_set_locked(it->cur_cset);
3860 }
3861 get_css_set(cset);
3862 it->cur_cset = cset;
3863 list_add(&it->iters_node, &cset->task_iters);
3864}
3865
3866static void css_task_iter_advance(struct css_task_iter *it)
3867{
3868 struct list_head *l = it->task_pos;
3869
3870 lockdep_assert_held(&css_set_lock);
3871 WARN_ON_ONCE(!l);
3872
3873 /*
3874 * Advance iterator to find next entry. cset->tasks is consumed
3875 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
3876 * next cset.
3877 */
3878 l = l->next;
3879
3880 if (l == it->tasks_head)
3881 l = it->mg_tasks_head->next;
3882
3883 if (l == it->mg_tasks_head)
3884 css_task_iter_advance_css_set(it);
3885 else
3886 it->task_pos = l;
3646} 3887}
3647 3888
3648/** 3889/**
@@ -3654,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it)
3654 * css_task_iter_next() to walk through the tasks until the function 3895 * css_task_iter_next() to walk through the tasks until the function
3655 * returns NULL. On completion of iteration, css_task_iter_end() must be 3896 * returns NULL. On completion of iteration, css_task_iter_end() must be
3656 * called. 3897 * called.
3657 *
3658 * Note that this function acquires a lock which is released when the
3659 * iteration finishes. The caller can't sleep while iteration is in
3660 * progress.
3661 */ 3898 */
3662void css_task_iter_start(struct cgroup_subsys_state *css, 3899void css_task_iter_start(struct cgroup_subsys_state *css,
3663 struct css_task_iter *it) 3900 struct css_task_iter *it)
3664 __acquires(css_set_rwsem)
3665{ 3901{
3666 /* no one should try to iterate before mounting cgroups */ 3902 /* no one should try to iterate before mounting cgroups */
3667 WARN_ON_ONCE(!use_task_css_set_links); 3903 WARN_ON_ONCE(!use_task_css_set_links);
3668 3904
3669 down_read(&css_set_rwsem); 3905 memset(it, 0, sizeof(*it));
3906
3907 spin_lock_bh(&css_set_lock);
3670 3908
3671 it->ss = css->ss; 3909 it->ss = css->ss;
3672 3910
@@ -3677,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
3677 3915
3678 it->cset_head = it->cset_pos; 3916 it->cset_head = it->cset_pos;
3679 3917
3680 css_advance_task_iter(it); 3918 css_task_iter_advance_css_set(it);
3919
3920 spin_unlock_bh(&css_set_lock);
3681} 3921}
3682 3922
3683/** 3923/**
@@ -3690,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
3690 */ 3930 */
3691struct task_struct *css_task_iter_next(struct css_task_iter *it) 3931struct task_struct *css_task_iter_next(struct css_task_iter *it)
3692{ 3932{
3693 struct task_struct *res; 3933 if (it->cur_task) {
3694 struct list_head *l = it->task_pos; 3934 put_task_struct(it->cur_task);
3935 it->cur_task = NULL;
3936 }
3695 3937
3696 /* If the iterator cg is NULL, we have no tasks */ 3938 spin_lock_bh(&css_set_lock);
3697 if (!it->cset_pos)
3698 return NULL;
3699 res = list_entry(l, struct task_struct, cg_list);
3700 3939
3701 /* 3940 if (it->task_pos) {
3702 * Advance iterator to find next entry. cset->tasks is consumed 3941 it->cur_task = list_entry(it->task_pos, struct task_struct,
3703 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 3942 cg_list);
3704 * next cset. 3943 get_task_struct(it->cur_task);
3705 */ 3944 css_task_iter_advance(it);
3706 l = l->next; 3945 }
3707 3946
3708 if (l == it->tasks_head) 3947 spin_unlock_bh(&css_set_lock);
3709 l = it->mg_tasks_head->next;
3710 3948
3711 if (l == it->mg_tasks_head) 3949 return it->cur_task;
3712 css_advance_task_iter(it);
3713 else
3714 it->task_pos = l;
3715
3716 return res;
3717} 3950}
3718 3951
3719/** 3952/**
@@ -3723,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
3723 * Finish task iteration started by css_task_iter_start(). 3956 * Finish task iteration started by css_task_iter_start().
3724 */ 3957 */
3725void css_task_iter_end(struct css_task_iter *it) 3958void css_task_iter_end(struct css_task_iter *it)
3726 __releases(css_set_rwsem)
3727{ 3959{
3728 up_read(&css_set_rwsem); 3960 if (it->cur_cset) {
3961 spin_lock_bh(&css_set_lock);
3962 list_del(&it->iters_node);
3963 put_css_set_locked(it->cur_cset);
3964 spin_unlock_bh(&css_set_lock);
3965 }
3966
3967 if (it->cur_task)
3968 put_task_struct(it->cur_task);
3729} 3969}
3730 3970
3731/** 3971/**
@@ -3750,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3750 mutex_lock(&cgroup_mutex); 3990 mutex_lock(&cgroup_mutex);
3751 3991
3752 /* all tasks in @from are being moved, all csets are source */ 3992 /* all tasks in @from are being moved, all csets are source */
3753 down_read(&css_set_rwsem); 3993 spin_lock_bh(&css_set_lock);
3754 list_for_each_entry(link, &from->cset_links, cset_link) 3994 list_for_each_entry(link, &from->cset_links, cset_link)
3755 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 3995 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3756 up_read(&css_set_rwsem); 3996 spin_unlock_bh(&css_set_lock);
3757 3997
3758 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); 3998 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3759 if (ret) 3999 if (ret)
@@ -3771,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3771 css_task_iter_end(&it); 4011 css_task_iter_end(&it);
3772 4012
3773 if (task) { 4013 if (task) {
3774 ret = cgroup_migrate(to, task, false); 4014 ret = cgroup_migrate(task, false, to);
3775 put_task_struct(task); 4015 put_task_struct(task);
3776 } 4016 }
3777 } while (task && !ret); 4017 } while (task && !ret);
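The iterator rework above replaces the css_set_rwsem read lock held across the whole walk with per-step css_set_lock acquisition; the iterator now pins the current css_set and task so the walk can resume after the lock is dropped. A minimal caller-side sketch under the new scheme (illustrative only, not taken from the patch):

/*
 * Hedged sketch: walking the tasks of a css with the reworked iterator.
 * css_task_iter_next() returns a task pinned by the iterator, and since
 * no lock is held across steps the loop body may sleep.
 */
static void example_walk_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		pr_info("pid %d\n", task_pid_nr(task));
	css_task_iter_end(&it);
}
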
@@ -4268,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
4268static struct cftype cgroup_dfl_base_files[] = { 4508static struct cftype cgroup_dfl_base_files[] = {
4269 { 4509 {
4270 .name = "cgroup.procs", 4510 .name = "cgroup.procs",
4511 .file_offset = offsetof(struct cgroup, procs_file),
4271 .seq_start = cgroup_pidlist_start, 4512 .seq_start = cgroup_pidlist_start,
4272 .seq_next = cgroup_pidlist_next, 4513 .seq_next = cgroup_pidlist_next,
4273 .seq_stop = cgroup_pidlist_stop, 4514 .seq_stop = cgroup_pidlist_stop,
4274 .seq_show = cgroup_pidlist_show, 4515 .seq_show = cgroup_pidlist_show,
4275 .private = CGROUP_FILE_PROCS, 4516 .private = CGROUP_FILE_PROCS,
4276 .write = cgroup_procs_write, 4517 .write = cgroup_procs_write,
4277 .mode = S_IRUGO | S_IWUSR,
4278 }, 4518 },
4279 { 4519 {
4280 .name = "cgroup.controllers", 4520 .name = "cgroup.controllers",
@@ -4292,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = {
4292 .write = cgroup_subtree_control_write, 4532 .write = cgroup_subtree_control_write,
4293 }, 4533 },
4294 { 4534 {
4295 .name = "cgroup.populated", 4535 .name = "cgroup.events",
4296 .flags = CFTYPE_NOT_ON_ROOT, 4536 .flags = CFTYPE_NOT_ON_ROOT,
4297 .seq_show = cgroup_populated_show, 4537 .file_offset = offsetof(struct cgroup, events_file),
4538 .seq_show = cgroup_events_show,
4298 }, 4539 },
4299 { } /* terminate */ 4540 { } /* terminate */
4300}; 4541};
@@ -4309,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4309 .seq_show = cgroup_pidlist_show, 4550 .seq_show = cgroup_pidlist_show,
4310 .private = CGROUP_FILE_PROCS, 4551 .private = CGROUP_FILE_PROCS,
4311 .write = cgroup_procs_write, 4552 .write = cgroup_procs_write,
4312 .mode = S_IRUGO | S_IWUSR,
4313 }, 4553 },
4314 { 4554 {
4315 .name = "cgroup.clone_children", 4555 .name = "cgroup.clone_children",
@@ -4329,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4329 .seq_show = cgroup_pidlist_show, 4569 .seq_show = cgroup_pidlist_show,
4330 .private = CGROUP_FILE_TASKS, 4570 .private = CGROUP_FILE_TASKS,
4331 .write = cgroup_tasks_write, 4571 .write = cgroup_tasks_write,
4332 .mode = S_IRUGO | S_IWUSR,
4333 }, 4572 },
4334 { 4573 {
4335 .name = "notify_on_release", 4574 .name = "notify_on_release",
@@ -4346,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = {
4346 { } /* terminate */ 4585 { } /* terminate */
4347}; 4586};
4348 4587
4349/**
4350 * cgroup_populate_dir - create subsys files in a cgroup directory
4351 * @cgrp: target cgroup
4352 * @subsys_mask: mask of the subsystem ids whose files should be added
4353 *
4354 * On failure, no file is added.
4355 */
4356static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
4357{
4358 struct cgroup_subsys *ss;
4359 int i, ret = 0;
4360
4361 /* process cftsets of each subsystem */
4362 for_each_subsys(ss, i) {
4363 struct cftype *cfts;
4364
4365 if (!(subsys_mask & (1 << i)))
4366 continue;
4367
4368 list_for_each_entry(cfts, &ss->cfts, node) {
4369 ret = cgroup_addrm_files(cgrp, cfts, true);
4370 if (ret < 0)
4371 goto err;
4372 }
4373 }
4374 return 0;
4375err:
4376 cgroup_clear_dir(cgrp, subsys_mask);
4377 return ret;
4378}
4379
4380/* 4588/*
4381 * css destruction is four-stage process. 4589 * css destruction is four-stage process.
4382 * 4590 *
@@ -4405,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work)
4405 container_of(work, struct cgroup_subsys_state, destroy_work); 4613 container_of(work, struct cgroup_subsys_state, destroy_work);
4406 struct cgroup_subsys *ss = css->ss; 4614 struct cgroup_subsys *ss = css->ss;
4407 struct cgroup *cgrp = css->cgroup; 4615 struct cgroup *cgrp = css->cgroup;
4616 struct cgroup_file *cfile;
4408 4617
4409 percpu_ref_exit(&css->refcnt); 4618 percpu_ref_exit(&css->refcnt);
4410 4619
4620 list_for_each_entry(cfile, &css->files, node)
4621 kernfs_put(cfile->kn);
4622
4411 if (ss) { 4623 if (ss) {
4412 /* css free path */ 4624 /* css free path */
4413 int id = css->id; 4625 int id = css->id;
@@ -4512,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
4512 css->ss = ss; 4724 css->ss = ss;
4513 INIT_LIST_HEAD(&css->sibling); 4725 INIT_LIST_HEAD(&css->sibling);
4514 INIT_LIST_HEAD(&css->children); 4726 INIT_LIST_HEAD(&css->children);
4727 INIT_LIST_HEAD(&css->files);
4515 css->serial_nr = css_serial_nr_next++; 4728 css->serial_nr = css_serial_nr_next++;
4516 4729
4517 if (cgroup_parent(cgrp)) { 4730 if (cgroup_parent(cgrp)) {
@@ -4594,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4594 css->id = err; 4807 css->id = err;
4595 4808
4596 if (visible) { 4809 if (visible) {
4597 err = cgroup_populate_dir(cgrp, 1 << ss->id); 4810 err = css_populate_dir(css, NULL);
4598 if (err) 4811 if (err)
4599 goto err_free_id; 4812 goto err_free_id;
4600 } 4813 }
@@ -4620,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4620 4833
4621err_list_del: 4834err_list_del:
4622 list_del_rcu(&css->sibling); 4835 list_del_rcu(&css->sibling);
4623 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 4836 css_clear_dir(css, NULL);
4624err_free_id: 4837err_free_id:
4625 cgroup_idr_remove(&ss->css_idr, css->id); 4838 cgroup_idr_remove(&ss->css_idr, css->id);
4626err_free_percpu_ref: 4839err_free_percpu_ref:
@@ -4637,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4637 struct cgroup_root *root; 4850 struct cgroup_root *root;
4638 struct cgroup_subsys *ss; 4851 struct cgroup_subsys *ss;
4639 struct kernfs_node *kn; 4852 struct kernfs_node *kn;
4640 struct cftype *base_files;
4641 int ssid, ret; 4853 int ssid, ret;
4642 4854
4643 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. 4855 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4713,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4713 if (ret) 4925 if (ret)
4714 goto out_destroy; 4926 goto out_destroy;
4715 4927
4716 if (cgroup_on_dfl(cgrp)) 4928 ret = css_populate_dir(&cgrp->self, NULL);
4717 base_files = cgroup_dfl_base_files;
4718 else
4719 base_files = cgroup_legacy_base_files;
4720
4721 ret = cgroup_addrm_files(cgrp, base_files, true);
4722 if (ret) 4929 if (ret)
4723 goto out_destroy; 4930 goto out_destroy;
4724 4931
@@ -4805,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css)
4805 * This must happen before css is disassociated with its cgroup. 5012 * This must happen before css is disassociated with its cgroup.
4806 * See seq_css() for details. 5013 * See seq_css() for details.
4807 */ 5014 */
4808 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 5015 css_clear_dir(css, NULL);
4809 5016
4810 /* 5017 /*
4811 * Killing would put the base ref, but we need to keep it alive 5018 * Killing would put the base ref, but we need to keep it alive
@@ -4854,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4854 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 5061 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4855{ 5062{
4856 struct cgroup_subsys_state *css; 5063 struct cgroup_subsys_state *css;
4857 bool empty;
4858 int ssid; 5064 int ssid;
4859 5065
4860 lockdep_assert_held(&cgroup_mutex); 5066 lockdep_assert_held(&cgroup_mutex);
4861 5067
4862 /* 5068 /*
4863 * css_set_rwsem synchronizes access to ->cset_links and prevents 5069 * Only migration can raise populated from zero and we're already
4864 * @cgrp from being removed while put_css_set() is in progress. 5070 * holding cgroup_mutex.
4865 */ 5071 */
4866 down_read(&css_set_rwsem); 5072 if (cgroup_is_populated(cgrp))
4867 empty = list_empty(&cgrp->cset_links);
4868 up_read(&css_set_rwsem);
4869 if (!empty)
4870 return -EBUSY; 5073 return -EBUSY;
4871 5074
4872 /* 5075 /*
@@ -4964,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4964 5167
4965 have_fork_callback |= (bool)ss->fork << ss->id; 5168 have_fork_callback |= (bool)ss->fork << ss->id;
4966 have_exit_callback |= (bool)ss->exit << ss->id; 5169 have_exit_callback |= (bool)ss->exit << ss->id;
5170 have_free_callback |= (bool)ss->free << ss->id;
4967 have_canfork_callback |= (bool)ss->can_fork << ss->id; 5171 have_canfork_callback |= (bool)ss->can_fork << ss->id;
4968 5172
4969 /* At system boot, before all subsystems have been 5173 /* At system boot, before all subsystems have been
@@ -5012,6 +5216,8 @@ int __init cgroup_init_early(void)
5012 return 0; 5216 return 0;
5013} 5217}
5014 5218
5219static unsigned long cgroup_disable_mask __initdata;
5220
5015/** 5221/**
5016 * cgroup_init - cgroup initialization 5222 * cgroup_init - cgroup initialization
5017 * 5223 *
@@ -5022,7 +5228,7 @@ int __init cgroup_init(void)
5022{ 5228{
5023 struct cgroup_subsys *ss; 5229 struct cgroup_subsys *ss;
5024 unsigned long key; 5230 unsigned long key;
5025 int ssid, err; 5231 int ssid;
5026 5232
5027 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); 5233 BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
5028 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); 5234 BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
@@ -5058,14 +5264,15 @@ int __init cgroup_init(void)
5058 * disabled flag and cftype registration needs kmalloc, 5264 * disabled flag and cftype registration needs kmalloc,
5059 * both of which aren't available during early_init. 5265 * both of which aren't available during early_init.
5060 */ 5266 */
5061 if (ss->disabled) 5267 if (cgroup_disable_mask & (1 << ssid)) {
5268 static_branch_disable(cgroup_subsys_enabled_key[ssid]);
5269 printk(KERN_INFO "Disabling %s control group subsystem\n",
5270 ss->name);
5062 continue; 5271 continue;
5272 }
5063 5273
5064 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 5274 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
5065 5275
5066 if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
5067 ss->dfl_cftypes = ss->legacy_cftypes;
5068
5069 if (!ss->dfl_cftypes) 5276 if (!ss->dfl_cftypes)
5070 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; 5277 cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
5071 5278
@@ -5080,17 +5287,10 @@ int __init cgroup_init(void)
5080 ss->bind(init_css_set.subsys[ssid]); 5287 ss->bind(init_css_set.subsys[ssid]);
5081 } 5288 }
5082 5289
5083 err = sysfs_create_mount_point(fs_kobj, "cgroup"); 5290 WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
5084 if (err) 5291 WARN_ON(register_filesystem(&cgroup_fs_type));
5085 return err; 5292 WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
5086
5087 err = register_filesystem(&cgroup_fs_type);
5088 if (err < 0) {
5089 sysfs_remove_mount_point(fs_kobj, "cgroup");
5090 return err;
5091 }
5092 5293
5093 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
5094 return 0; 5294 return 0;
5095} 5295}
5096 5296
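On the enable/disable side, the boot-time ss->disabled flag gives way to cgroup_disable_mask plus a per-subsystem static key that cgroup_init() patches with static_branch_disable(). The test helper cgroup_ssid_enabled() used throughout the patch is defined outside the hunks shown here; as a hedged guess it reduces to a thin wrapper like:

/*
 * Hedged sketch (assumption): cgroup_ssid_enabled() is expected to just
 * test the per-subsystem static key patched in cgroup_init() above.
 */
static bool cgroup_ssid_enabled(int ssid)
{
	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
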
@@ -5137,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5137 goto out; 5337 goto out;
5138 5338
5139 mutex_lock(&cgroup_mutex); 5339 mutex_lock(&cgroup_mutex);
5140 down_read(&css_set_rwsem); 5340 spin_lock_bh(&css_set_lock);
5141 5341
5142 for_each_root(root) { 5342 for_each_root(root) {
5143 struct cgroup_subsys *ss; 5343 struct cgroup_subsys *ss;
@@ -5157,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5157 seq_printf(m, "%sname=%s", count ? "," : "", 5357 seq_printf(m, "%sname=%s", count ? "," : "",
5158 root->name); 5358 root->name);
5159 seq_putc(m, ':'); 5359 seq_putc(m, ':');
5360
5160 cgrp = task_cgroup_from_root(tsk, root); 5361 cgrp = task_cgroup_from_root(tsk, root);
5161 path = cgroup_path(cgrp, buf, PATH_MAX); 5362
5162 if (!path) { 5363 /*
5163 retval = -ENAMETOOLONG; 5364 * On traditional hierarchies, all zombie tasks show up as
5164 goto out_unlock; 5365 * belonging to the root cgroup. On the default hierarchy,
5366 * while a zombie doesn't show up in "cgroup.procs" and
5367 * thus can't be migrated, its /proc/PID/cgroup keeps
5368 * reporting the cgroup it belonged to before exiting. If
5369 * the cgroup is removed before the zombie is reaped,
5370 * " (deleted)" is appended to the cgroup path.
5371 */
5372 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
5373 path = cgroup_path(cgrp, buf, PATH_MAX);
5374 if (!path) {
5375 retval = -ENAMETOOLONG;
5376 goto out_unlock;
5377 }
5378 } else {
5379 path = "/";
5165 } 5380 }
5381
5166 seq_puts(m, path); 5382 seq_puts(m, path);
5167 seq_putc(m, '\n'); 5383
5384 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
5385 seq_puts(m, " (deleted)\n");
5386 else
5387 seq_putc(m, '\n');
5168 } 5388 }
5169 5389
5170 retval = 0; 5390 retval = 0;
5171out_unlock: 5391out_unlock:
5172 up_read(&css_set_rwsem); 5392 spin_unlock_bh(&css_set_lock);
5173 mutex_unlock(&cgroup_mutex); 5393 mutex_unlock(&cgroup_mutex);
5174 kfree(buf); 5394 kfree(buf);
5175out: 5395out:
@@ -5193,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
5193 for_each_subsys(ss, i) 5413 for_each_subsys(ss, i)
5194 seq_printf(m, "%s\t%d\t%d\t%d\n", 5414 seq_printf(m, "%s\t%d\t%d\t%d\n",
5195 ss->legacy_name, ss->root->hierarchy_id, 5415 ss->legacy_name, ss->root->hierarchy_id,
5196 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 5416 atomic_read(&ss->root->nr_cgrps),
5417 cgroup_ssid_enabled(i));
5197 5418
5198 mutex_unlock(&cgroup_mutex); 5419 mutex_unlock(&cgroup_mutex);
5199 return 0; 5420 return 0;
@@ -5314,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child,
5314 * @child during its iteration. 5535 * @child during its iteration.
5315 * 5536 *
5316 * If we won the race, @child is associated with %current's 5537 * If we won the race, @child is associated with %current's
5317 * css_set. Grabbing css_set_rwsem guarantees both that the 5538 * css_set. Grabbing css_set_lock guarantees both that the
5318 * association is stable, and, on completion of the parent's 5539 * association is stable, and, on completion of the parent's
5319 * migration, @child is visible in the source of migration or 5540 * migration, @child is visible in the source of migration or
5320 * already in the destination cgroup. This guarantee is necessary 5541 * already in the destination cgroup. This guarantee is necessary
@@ -5329,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child,
5329 if (use_task_css_set_links) { 5550 if (use_task_css_set_links) {
5330 struct css_set *cset; 5551 struct css_set *cset;
5331 5552
5332 down_write(&css_set_rwsem); 5553 spin_lock_bh(&css_set_lock);
5333 cset = task_css_set(current); 5554 cset = task_css_set(current);
5334 if (list_empty(&child->cg_list)) { 5555 if (list_empty(&child->cg_list)) {
5335 rcu_assign_pointer(child->cgroups, cset);
5336 list_add(&child->cg_list, &cset->tasks);
5337 get_css_set(cset); 5556 get_css_set(cset);
5557 css_set_move_task(child, NULL, cset, false);
5338 } 5558 }
5339 up_write(&css_set_rwsem); 5559 spin_unlock_bh(&css_set_lock);
5340 } 5560 }
5341 5561
5342 /* 5562 /*
@@ -5371,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk)
5371{ 5591{
5372 struct cgroup_subsys *ss; 5592 struct cgroup_subsys *ss;
5373 struct css_set *cset; 5593 struct css_set *cset;
5374 bool put_cset = false;
5375 int i; 5594 int i;
5376 5595
5377 /* 5596 /*
5378 * Unlink @tsk from its css_set. As migration path can't race 5597 * Unlink @tsk from its css_set. As migration path can't race
5379 * with us, we can check cg_list without grabbing css_set_rwsem. 5598 * with us, we can check css_set and cg_list without synchronization.
5380 */ 5599 */
5600 cset = task_css_set(tsk);
5601
5381 if (!list_empty(&tsk->cg_list)) { 5602 if (!list_empty(&tsk->cg_list)) {
5382 down_write(&css_set_rwsem); 5603 spin_lock_bh(&css_set_lock);
5383 list_del_init(&tsk->cg_list); 5604 css_set_move_task(tsk, cset, NULL, false);
5384 up_write(&css_set_rwsem); 5605 spin_unlock_bh(&css_set_lock);
5385 put_cset = true; 5606 } else {
5607 get_css_set(cset);
5386 } 5608 }
5387 5609
5388 /* Reassign the task to the init_css_set. */
5389 cset = task_css_set(tsk);
5390 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
5391
5392 /* see cgroup_post_fork() for details */ 5610 /* see cgroup_post_fork() for details */
5393 for_each_subsys_which(ss, i, &have_exit_callback) { 5611 for_each_subsys_which(ss, i, &have_exit_callback)
5394 struct cgroup_subsys_state *old_css = cset->subsys[i]; 5612 ss->exit(tsk);
5395 struct cgroup_subsys_state *css = task_css(tsk, i); 5613}
5396 5614
5397 ss->exit(css, old_css, tsk); 5615void cgroup_free(struct task_struct *task)
5398 } 5616{
5617 struct css_set *cset = task_css_set(task);
5618 struct cgroup_subsys *ss;
5619 int ssid;
5399 5620
5400 if (put_cset) 5621 for_each_subsys_which(ss, ssid, &have_free_callback)
5401 put_css_set(cset); 5622 ss->free(task);
5623
5624 put_css_set(cset);
5402} 5625}
5403 5626
5404static void check_for_release(struct cgroup *cgrp) 5627static void check_for_release(struct cgroup *cgrp)
5405{ 5628{
5406 if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) && 5629 if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
5407 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) 5630 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5408 schedule_work(&cgrp->release_agent_work); 5631 schedule_work(&cgrp->release_agent_work);
5409} 5632}
@@ -5482,25 +5705,13 @@ static int __init cgroup_disable(char *str)
5482 if (strcmp(token, ss->name) && 5705 if (strcmp(token, ss->name) &&
5483 strcmp(token, ss->legacy_name)) 5706 strcmp(token, ss->legacy_name))
5484 continue; 5707 continue;
5485 5708 cgroup_disable_mask |= 1 << i;
5486 ss->disabled = 1;
5487 printk(KERN_INFO "Disabling %s control group subsystem\n",
5488 ss->name);
5489 break;
5490 } 5709 }
5491 } 5710 }
5492 return 1; 5711 return 1;
5493} 5712}
5494__setup("cgroup_disable=", cgroup_disable); 5713__setup("cgroup_disable=", cgroup_disable);
5495 5714
5496static int __init cgroup_set_legacy_files_on_dfl(char *str)
5497{
5498 printk("cgroup: using legacy files on the default hierarchy\n");
5499 cgroup_legacy_files_on_dfl = true;
5500 return 0;
5501}
5502__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
5503
5504/** 5715/**
5505 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5716 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
5506 * @dentry: directory dentry of interest 5717 * @dentry: directory dentry of interest
@@ -5604,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5604 if (!name_buf) 5815 if (!name_buf)
5605 return -ENOMEM; 5816 return -ENOMEM;
5606 5817
5607 down_read(&css_set_rwsem); 5818 spin_lock_bh(&css_set_lock);
5608 rcu_read_lock(); 5819 rcu_read_lock();
5609 cset = rcu_dereference(current->cgroups); 5820 cset = rcu_dereference(current->cgroups);
5610 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 5821 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5615,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
5615 c->root->hierarchy_id, name_buf); 5826 c->root->hierarchy_id, name_buf);
5616 } 5827 }
5617 rcu_read_unlock(); 5828 rcu_read_unlock();
5618 up_read(&css_set_rwsem); 5829 spin_unlock_bh(&css_set_lock);
5619 kfree(name_buf); 5830 kfree(name_buf);
5620 return 0; 5831 return 0;
5621} 5832}
@@ -5626,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5626 struct cgroup_subsys_state *css = seq_css(seq); 5837 struct cgroup_subsys_state *css = seq_css(seq);
5627 struct cgrp_cset_link *link; 5838 struct cgrp_cset_link *link;
5628 5839
5629 down_read(&css_set_rwsem); 5840 spin_lock_bh(&css_set_lock);
5630 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 5841 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
5631 struct css_set *cset = link->cset; 5842 struct css_set *cset = link->cset;
5632 struct task_struct *task; 5843 struct task_struct *task;
@@ -5649,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5649 overflow: 5860 overflow:
5650 seq_puts(seq, " ...\n"); 5861 seq_puts(seq, " ...\n");
5651 } 5862 }
5652 up_read(&css_set_rwsem); 5863 spin_unlock_bh(&css_set_lock);
5653 return 0; 5864 return 0;
5654} 5865}
5655 5866
5656static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 5867static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5657{ 5868{
5658 return (!cgroup_has_tasks(css->cgroup) && 5869 return (!cgroup_is_populated(css->cgroup) &&
5659 !css_has_online_children(&css->cgroup->self)); 5870 !css_has_online_children(&css->cgroup->self));
5660} 5871}
5661 5872
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd7693ac8..cdd8df4e991c 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
266 css_put(old_css); 266 css_put(old_css);
267} 267}
268 268
269static void pids_exit(struct cgroup_subsys_state *css, 269static void pids_free(struct task_struct *task)
270 struct cgroup_subsys_state *old_css,
271 struct task_struct *task)
272{ 270{
273 struct pids_cgroup *pids = css_pids(old_css); 271 struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
274 272
275 pids_uncharge(pids, 1); 273 pids_uncharge(pids, 1);
276} 274}
@@ -349,7 +347,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
349 .can_fork = pids_can_fork, 347 .can_fork = pids_can_fork,
350 .cancel_fork = pids_cancel_fork, 348 .cancel_fork = pids_cancel_fork,
351 .fork = pids_fork, 349 .fork = pids_fork,
352 .exit = pids_exit, 350 .free = pids_free,
353 .legacy_cftypes = pids_files, 351 .legacy_cftypes = pids_files,
354 .dfl_cftypes = pids_files, 352 .dfl_cftypes = pids_files,
355}; 353};
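pids is converted from the old three-argument ->exit() to the new ->free() callback, which runs from cgroup_free() (shown in the cgroup.c hunk earlier) before the task's css_set reference is dropped, so task_css() is still valid inside it. A hedged sketch of the callback shape for some other controller; everything except the signature and task_css() is hypothetical:

/* Hedged sketch: per-task cleanup moved to the new ->free() callback. */
static void example_free(struct task_struct *task)
{
	struct cgroup_subsys_state *css = task_css(task, example_cgrp_id);

	/* css is still valid: cgroup_free() puts the css_set only afterwards */
	example_uncharge(css, 1);
}

struct cgroup_subsys example_cgrp_subsys = {
	.free	= example_free,
};
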
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 0a495ab35bc7..d8560ee3bab7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -58,36 +58,13 @@ static void context_tracking_recursion_exit(void)
58 * instructions to execute won't use any RCU read side critical section 58 * instructions to execute won't use any RCU read side critical section
59 * because this function sets RCU in extended quiescent state. 59 * because this function sets RCU in extended quiescent state.
60 */ 60 */
61void context_tracking_enter(enum ctx_state state) 61void __context_tracking_enter(enum ctx_state state)
62{ 62{
63 unsigned long flags;
64
65 /*
66 * Repeat the user_enter() check here because some archs may be calling
67 * this from asm and if no CPU needs context tracking, they shouldn't
68 * go further. Repeat the check here until they support the inline static
69 * key check.
70 */
71 if (!context_tracking_is_enabled())
72 return;
73
74 /*
75 * Some contexts may involve an exception occurring in an irq,
76 * leading to that nesting:
77 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
78 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
79 * helpers are enough to protect RCU uses inside the exception. So
80 * just return immediately if we detect we are in an IRQ.
81 */
82 if (in_interrupt())
83 return;
84
85 /* Kernel threads aren't supposed to go to userspace */ 63 /* Kernel threads aren't supposed to go to userspace */
86 WARN_ON_ONCE(!current->mm); 64 WARN_ON_ONCE(!current->mm);
87 65
88 local_irq_save(flags);
89 if (!context_tracking_recursion_enter()) 66 if (!context_tracking_recursion_enter())
90 goto out_irq_restore; 67 return;
91 68
92 if ( __this_cpu_read(context_tracking.state) != state) { 69 if ( __this_cpu_read(context_tracking.state) != state) {
93 if (__this_cpu_read(context_tracking.active)) { 70 if (__this_cpu_read(context_tracking.active)) {
@@ -120,7 +97,27 @@ void context_tracking_enter(enum ctx_state state)
120 __this_cpu_write(context_tracking.state, state); 97 __this_cpu_write(context_tracking.state, state);
121 } 98 }
122 context_tracking_recursion_exit(); 99 context_tracking_recursion_exit();
123out_irq_restore: 100}
101NOKPROBE_SYMBOL(__context_tracking_enter);
102EXPORT_SYMBOL_GPL(__context_tracking_enter);
103
104void context_tracking_enter(enum ctx_state state)
105{
106 unsigned long flags;
107
108 /*
109 * Some contexts may involve an exception occurring in an irq,
110 * leading to that nesting:
111 * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
112 * This would mess up the dyntick_nesting count though. And rcu_irq_*()
113 * helpers are enough to protect RCU uses inside the exception. So
114 * just return immediately if we detect we are in an IRQ.
115 */
116 if (in_interrupt())
117 return;
118
119 local_irq_save(flags);
120 __context_tracking_enter(state);
124 local_irq_restore(flags); 121 local_irq_restore(flags);
125} 122}
126NOKPROBE_SYMBOL(context_tracking_enter); 123NOKPROBE_SYMBOL(context_tracking_enter);
@@ -128,7 +125,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter);
128 125
129void context_tracking_user_enter(void) 126void context_tracking_user_enter(void)
130{ 127{
131 context_tracking_enter(CONTEXT_USER); 128 user_enter();
132} 129}
133NOKPROBE_SYMBOL(context_tracking_user_enter); 130NOKPROBE_SYMBOL(context_tracking_user_enter);
134 131
@@ -144,19 +141,10 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
144 * This call supports re-entrancy. This way it can be called from any exception 141 * This call supports re-entrancy. This way it can be called from any exception
145 * handler without needing to know if we came from userspace or not. 142 * handler without needing to know if we came from userspace or not.
146 */ 143 */
147void context_tracking_exit(enum ctx_state state) 144void __context_tracking_exit(enum ctx_state state)
148{ 145{
149 unsigned long flags;
150
151 if (!context_tracking_is_enabled())
152 return;
153
154 if (in_interrupt())
155 return;
156
157 local_irq_save(flags);
158 if (!context_tracking_recursion_enter()) 146 if (!context_tracking_recursion_enter())
159 goto out_irq_restore; 147 return;
160 148
161 if (__this_cpu_read(context_tracking.state) == state) { 149 if (__this_cpu_read(context_tracking.state) == state) {
162 if (__this_cpu_read(context_tracking.active)) { 150 if (__this_cpu_read(context_tracking.active)) {
@@ -173,7 +161,19 @@ void context_tracking_exit(enum ctx_state state)
173 __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); 161 __this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
174 } 162 }
175 context_tracking_recursion_exit(); 163 context_tracking_recursion_exit();
176out_irq_restore: 164}
165NOKPROBE_SYMBOL(__context_tracking_exit);
166EXPORT_SYMBOL_GPL(__context_tracking_exit);
167
168void context_tracking_exit(enum ctx_state state)
169{
170 unsigned long flags;
171
172 if (in_interrupt())
173 return;
174
175 local_irq_save(flags);
176 __context_tracking_exit(state);
177 local_irq_restore(flags); 177 local_irq_restore(flags);
178} 178}
179NOKPROBE_SYMBOL(context_tracking_exit); 179NOKPROBE_SYMBOL(context_tracking_exit);
@@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(context_tracking_exit);
181 181
182void context_tracking_user_exit(void) 182void context_tracking_user_exit(void)
183{ 183{
184 context_tracking_exit(CONTEXT_USER); 184 user_exit();
185} 185}
186NOKPROBE_SYMBOL(context_tracking_user_exit); 186NOKPROBE_SYMBOL(context_tracking_user_exit);
187 187
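The split above leaves context_tracking_enter()/exit() as wrappers that keep the in_interrupt() check and the irq save/restore, while the newly exported __context_tracking_enter()/__context_tracking_exit() assume the caller already runs with interrupts disabled and has already performed the enabled check. A hedged sketch of such a caller (assumed, not part of this diff):

/*
 * Hedged sketch: an arch return-to-user path that already has IRQs off
 * can call the inner helper directly and skip the wrapper's overhead.
 */
static void example_prepare_exit_to_user(void)
{
	WARN_ON_ONCE(!irqs_disabled());
	__context_tracking_enter(CONTEXT_USER);
}
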
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 82cf9dff4295..85ff5e26e23b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -102,19 +102,6 @@ void get_online_cpus(void)
102} 102}
103EXPORT_SYMBOL_GPL(get_online_cpus); 103EXPORT_SYMBOL_GPL(get_online_cpus);
104 104
105bool try_get_online_cpus(void)
106{
107 if (cpu_hotplug.active_writer == current)
108 return true;
109 if (!mutex_trylock(&cpu_hotplug.lock))
110 return false;
111 cpuhp_lock_acquire_tryread();
112 atomic_inc(&cpu_hotplug.refcount);
113 mutex_unlock(&cpu_hotplug.lock);
114 return true;
115}
116EXPORT_SYMBOL_GPL(try_get_online_cpus);
117
118void put_online_cpus(void) 105void put_online_cpus(void)
119{ 106{
120 int refcount; 107 int refcount;
@@ -304,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
304{ 291{
305 struct task_struct *g, *p; 292 struct task_struct *g, *p;
306 293
307 read_lock_irq(&tasklist_lock); 294 read_lock(&tasklist_lock);
308 do_each_thread(g, p) { 295 for_each_process_thread(g, p) {
309 if (!p->on_rq) 296 if (!p->on_rq)
310 continue; 297 continue;
311 /* 298 /*
@@ -320,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
320 307
321 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", 308 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
322 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); 309 p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
323 } while_each_thread(g, p); 310 }
324 read_unlock_irq(&tasklist_lock); 311 read_unlock(&tasklist_lock);
325} 312}
326 313
327struct take_cpu_down_param { 314struct take_cpu_down_param {
@@ -344,7 +331,7 @@ static int take_cpu_down(void *_param)
344 /* Give up timekeeping duties */ 331 /* Give up timekeeping duties */
345 tick_handover_do_timer(); 332 tick_handover_do_timer();
346 /* Park the stopper thread */ 333 /* Park the stopper thread */
347 kthread_park(current); 334 stop_machine_park((long)param->hcpu);
348 return 0; 335 return 0;
349} 336}
350 337
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f0acff0f66c9..10ae73611d80 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
473 473
474 /* On legacy hierarchy, we must be a subset of our parent cpuset. */ 474 /* On legacy hierarchy, we must be a subset of our parent cpuset. */
475 ret = -EACCES; 475 ret = -EACCES;
476 if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) 476 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
477 !is_cpuset_subset(trial, par))
477 goto out; 478 goto out;
478 479
479 /* 480 /*
@@ -497,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
497 * be changed to have empty cpus_allowed or mems_allowed. 498 * be changed to have empty cpus_allowed or mems_allowed.
498 */ 499 */
499 ret = -ENOSPC; 500 ret = -ENOSPC;
500 if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { 501 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
501 if (!cpumask_empty(cur->cpus_allowed) && 502 if (!cpumask_empty(cur->cpus_allowed) &&
502 cpumask_empty(trial->cpus_allowed)) 503 cpumask_empty(trial->cpus_allowed))
503 goto out; 504 goto out;
@@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
879 * If it becomes empty, inherit the effective mask of the 880 * If it becomes empty, inherit the effective mask of the
880 * parent, which is guaranteed to have some CPUs. 881 * parent, which is guaranteed to have some CPUs.
881 */ 882 */
882 if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) 883 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
884 cpumask_empty(new_cpus))
883 cpumask_copy(new_cpus, parent->effective_cpus); 885 cpumask_copy(new_cpus, parent->effective_cpus);
884 886
885 /* Skip the whole subtree if the cpumask remains the same. */ 887 /* Skip the whole subtree if the cpumask remains the same. */
@@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
896 cpumask_copy(cp->effective_cpus, new_cpus); 898 cpumask_copy(cp->effective_cpus, new_cpus);
897 spin_unlock_irq(&callback_lock); 899 spin_unlock_irq(&callback_lock);
898 900
899 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 901 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
900 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 902 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
901 903
902 update_tasks_cpumask(cp); 904 update_tasks_cpumask(cp);
@@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1135 * If it becomes empty, inherit the effective mask of the 1137 * If it becomes empty, inherit the effective mask of the
1136 * parent, which is guaranteed to have some MEMs. 1138 * parent, which is guaranteed to have some MEMs.
1137 */ 1139 */
1138 if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) 1140 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1141 nodes_empty(*new_mems))
1139 *new_mems = parent->effective_mems; 1142 *new_mems = parent->effective_mems;
1140 1143
1141 /* Skip the whole subtree if the nodemask remains the same. */ 1144 /* Skip the whole subtree if the nodemask remains the same. */
@@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1152 cp->effective_mems = *new_mems; 1155 cp->effective_mems = *new_mems;
1153 spin_unlock_irq(&callback_lock); 1156 spin_unlock_irq(&callback_lock);
1154 1157
1155 WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1158 WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1156 !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1159 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1157 1160
1158 update_tasks_nodemask(cp); 1161 update_tasks_nodemask(cp);
@@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
1440 1443
1441 /* allow moving tasks into an empty cpuset if on default hierarchy */ 1444 /* allow moving tasks into an empty cpuset if on default hierarchy */
1442 ret = -ENOSPC; 1445 ret = -ENOSPC;
1443 if (!cgroup_on_dfl(css->cgroup) && 1446 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
1444 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1447 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1445 goto out_unlock; 1448 goto out_unlock;
1446 1449
@@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1484{ 1487{
1485 /* static buf protected by cpuset_mutex */ 1488 /* static buf protected by cpuset_mutex */
1486 static nodemask_t cpuset_attach_nodemask_to; 1489 static nodemask_t cpuset_attach_nodemask_to;
1487 struct mm_struct *mm;
1488 struct task_struct *task; 1490 struct task_struct *task;
1489 struct task_struct *leader = cgroup_taskset_first(tset); 1491 struct task_struct *leader;
1490 struct cpuset *cs = css_cs(css); 1492 struct cpuset *cs = css_cs(css);
1491 struct cpuset *oldcs = cpuset_attach_old_cs; 1493 struct cpuset *oldcs = cpuset_attach_old_cs;
1492 1494
@@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
1512 } 1514 }
1513 1515
1514 /* 1516 /*
1515 * Change mm, possibly for multiple threads in a threadgroup. This is 1517 * Change mm for all threadgroup leaders. This is expensive and may
1516 * expensive and may sleep. 1518 * sleep and should be moved outside migration path proper.
1517 */ 1519 */
1518 cpuset_attach_nodemask_to = cs->effective_mems; 1520 cpuset_attach_nodemask_to = cs->effective_mems;
1519 mm = get_task_mm(leader); 1521 cgroup_taskset_for_each_leader(leader, tset) {
1520 if (mm) { 1522 struct mm_struct *mm = get_task_mm(leader);
1521 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1523
1522 1524 if (mm) {
1523 /* 1525 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1524 * old_mems_allowed is the same with mems_allowed here, except 1526
1525 * if this task is being moved automatically due to hotplug. 1527 /*
1526 * In that case @mems_allowed has been updated and is empty, 1528 * old_mems_allowed is the same with mems_allowed
1527 * so @old_mems_allowed is the right nodesets that we migrate 1529 * here, except if this task is being moved
1528 * mm from. 1530 * automatically due to hotplug. In that case
1529 */ 1531 * @mems_allowed has been updated and is empty, so
1530 if (is_memory_migrate(cs)) { 1532 * @old_mems_allowed is the right nodesets that we
1531 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 1533 * migrate mm from.
1532 &cpuset_attach_nodemask_to); 1534 */
1535 if (is_memory_migrate(cs)) {
1536 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
1537 &cpuset_attach_nodemask_to);
1538 }
1539 mmput(mm);
1533 } 1540 }
1534 mmput(mm);
1535 } 1541 }
1536 1542
1537 cs->old_mems_allowed = cpuset_attach_nodemask_to; 1543 cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
1594 case FILE_MEMORY_PRESSURE_ENABLED: 1600 case FILE_MEMORY_PRESSURE_ENABLED:
1595 cpuset_memory_pressure_enabled = !!val; 1601 cpuset_memory_pressure_enabled = !!val;
1596 break; 1602 break;
1597 case FILE_MEMORY_PRESSURE:
1598 retval = -EACCES;
1599 break;
1600 case FILE_SPREAD_PAGE: 1603 case FILE_SPREAD_PAGE:
1601 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1604 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1602 break; 1605 break;
@@ -1863,9 +1866,6 @@ static struct cftype files[] = {
1863 { 1866 {
1864 .name = "memory_pressure", 1867 .name = "memory_pressure",
1865 .read_u64 = cpuset_read_u64, 1868 .read_u64 = cpuset_read_u64,
1866 .write_u64 = cpuset_write_u64,
1867 .private = FILE_MEMORY_PRESSURE,
1868 .mode = S_IRUGO,
1869 }, 1869 },
1870 1870
1871 { 1871 {
@@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1952 cpuset_inc(); 1952 cpuset_inc();
1953 1953
1954 spin_lock_irq(&callback_lock); 1954 spin_lock_irq(&callback_lock);
1955 if (cgroup_on_dfl(cs->css.cgroup)) { 1955 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
1956 cpumask_copy(cs->effective_cpus, parent->effective_cpus); 1956 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1957 cs->effective_mems = parent->effective_mems; 1957 cs->effective_mems = parent->effective_mems;
1958 } 1958 }
@@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2029 mutex_lock(&cpuset_mutex); 2029 mutex_lock(&cpuset_mutex);
2030 spin_lock_irq(&callback_lock); 2030 spin_lock_irq(&callback_lock);
2031 2031
2032 if (cgroup_on_dfl(root_css->cgroup)) { 2032 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2033 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); 2033 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2034 top_cpuset.mems_allowed = node_possible_map; 2034 top_cpuset.mems_allowed = node_possible_map;
2035 } else { 2035 } else {
@@ -2210,7 +2210,7 @@ retry:
2210 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); 2210 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
2211 mems_updated = !nodes_equal(new_mems, cs->effective_mems); 2211 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
2212 2212
2213 if (cgroup_on_dfl(cs->css.cgroup)) 2213 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
2214 hotplug_update_tasks(cs, &new_cpus, &new_mems, 2214 hotplug_update_tasks(cs, &new_cpus, &new_mems,
2215 cpus_updated, mems_updated); 2215 cpus_updated, mems_updated);
2216 else 2216 else
@@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2241 static cpumask_t new_cpus; 2241 static cpumask_t new_cpus;
2242 static nodemask_t new_mems; 2242 static nodemask_t new_mems;
2243 bool cpus_updated, mems_updated; 2243 bool cpus_updated, mems_updated;
2244 bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup); 2244 bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
2245 2245
2246 mutex_lock(&cpuset_mutex); 2246 mutex_lock(&cpuset_mutex);
2247 2247
@@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2598} 2598}
2599 2599
2600/** 2600/**
2601 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2601 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
2602 * @tsk: pointer to task_struct of some task.
2603 * 2602 *
2604 * Description: Prints @task's name, cpuset name, and cached copy of its 2603 * Description: Prints current's name, cpuset name, and cached copy of its
2605 * mems_allowed to the kernel log. 2604 * mems_allowed to the kernel log.
2606 */ 2605 */
2607void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2606void cpuset_print_current_mems_allowed(void)
2608{ 2607{
2609 struct cgroup *cgrp; 2608 struct cgroup *cgrp;
2610 2609
2611 rcu_read_lock(); 2610 rcu_read_lock();
2612 2611
2613 cgrp = task_cs(tsk)->css.cgroup; 2612 cgrp = task_cs(current)->css.cgroup;
2614 pr_info("%s cpuset=", tsk->comm); 2613 pr_info("%s cpuset=", current->comm);
2615 pr_cont_cgroup_name(cgrp); 2614 pr_cont_cgroup_name(cgrp);
2616 pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed)); 2615 pr_cont(" mems_allowed=%*pbl\n",
2616 nodemask_pr_args(&current->mems_allowed));
2617 2617
2618 rcu_read_unlock(); 2618 rcu_read_unlock();
2619} 2619}
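cpuset_attach() above switches from cgroup_taskset_first() to the new cgroup_taskset_for_each_leader() iterator so the expensive mm rebinding runs once per thread-group leader in the set rather than once overall or once per task. A hedged illustration of the same pattern in some other ->attach() implementation; names other than the iterator macro and the mm helpers are hypothetical:

/* Hedged sketch: per-process work in ->attach() via the leader iterator. */
static void example_attach(struct cgroup_subsys_state *css,
			   struct cgroup_taskset *tset)
{
	struct task_struct *leader;

	cgroup_taskset_for_each_leader(leader, tset) {
		struct mm_struct *mm = get_task_mm(leader);

		if (!mm)
			continue;	/* kernel thread or exiting task */
		/* expensive per-process work goes here */
		mmput(mm);
	}
}
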
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..1a734e0adfa7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
196static int perf_sample_allowed_ns __read_mostly = 196static int perf_sample_allowed_ns __read_mostly =
197 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; 197 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
198 198
199void update_perf_cpu_limits(void) 199static void update_perf_cpu_limits(void)
200{ 200{
201 u64 tmp = perf_sample_period_ns; 201 u64 tmp = perf_sample_period_ns;
202 202
@@ -472,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
472 * mode SWOUT : schedule out everything 472 * mode SWOUT : schedule out everything
473 * mode SWIN : schedule in based on cgroup for next 473 * mode SWIN : schedule in based on cgroup for next
474 */ 474 */
475void perf_cgroup_switch(struct task_struct *task, int mode) 475static void perf_cgroup_switch(struct task_struct *task, int mode)
476{ 476{
477 struct perf_cpu_context *cpuctx; 477 struct perf_cpu_context *cpuctx;
478 struct pmu *pmu; 478 struct pmu *pmu;
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
1243 PERF_EVENT_STATE_INACTIVE; 1243 PERF_EVENT_STATE_INACTIVE;
1244} 1244}
1245 1245
1246/* 1246static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1247 * Called at perf_event creation and when events are attached/detached from a
1248 * group.
1249 */
1250static void perf_event__read_size(struct perf_event *event)
1251{ 1247{
1252 int entry = sizeof(u64); /* value */ 1248 int entry = sizeof(u64); /* value */
1253 int size = 0; 1249 int size = 0;
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
1263 entry += sizeof(u64); 1259 entry += sizeof(u64);
1264 1260
1265 if (event->attr.read_format & PERF_FORMAT_GROUP) { 1261 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1266 nr += event->group_leader->nr_siblings; 1262 nr += nr_siblings;
1267 size += sizeof(u64); 1263 size += sizeof(u64);
1268 } 1264 }
1269 1265
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
1271 event->read_size = size; 1267 event->read_size = size;
1272} 1268}
1273 1269
1274static void perf_event__header_size(struct perf_event *event) 1270static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1275{ 1271{
1276 struct perf_sample_data *data; 1272 struct perf_sample_data *data;
1277 u64 sample_type = event->attr.sample_type;
1278 u16 size = 0; 1273 u16 size = 0;
1279 1274
1280 perf_event__read_size(event);
1281
1282 if (sample_type & PERF_SAMPLE_IP) 1275 if (sample_type & PERF_SAMPLE_IP)
1283 size += sizeof(data->ip); 1276 size += sizeof(data->ip);
1284 1277
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
1303 event->header_size = size; 1296 event->header_size = size;
1304} 1297}
1305 1298
1299/*
1300 * Called at perf_event creation and when events are attached/detached from a
1301 * group.
1302 */
1303static void perf_event__header_size(struct perf_event *event)
1304{
1305 __perf_event_read_size(event,
1306 event->group_leader->nr_siblings);
1307 __perf_event_header_size(event, event->attr.sample_type);
1308}
1309
1306static void perf_event__id_header_size(struct perf_event *event) 1310static void perf_event__id_header_size(struct perf_event *event)
1307{ 1311{
1308 struct perf_sample_data *data; 1312 struct perf_sample_data *data;
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
1330 event->id_header_size = size; 1334 event->id_header_size = size;
1331} 1335}
1332 1336
1337static bool perf_event_validate_size(struct perf_event *event)
1338{
1339 /*
1340 * The values computed here will be over-written when we actually
1341 * attach the event.
1342 */
1343 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1344 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1345 perf_event__id_header_size(event);
1346
1347 /*
1348 * Sum the lot; should not exceed the 64k limit we have on records.
1349 * Conservative limit to allow for callchains and other variable fields.
1350 */
1351 if (event->read_size + event->header_size +
1352 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1353 return false;
1354
1355 return true;
1356}
1357
1333static void perf_group_attach(struct perf_event *event) 1358static void perf_group_attach(struct perf_event *event)
1334{ 1359{
1335 struct perf_event *group_leader = event->group_leader, *pos; 1360 struct perf_event *group_leader = event->group_leader, *pos;
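To put the new perf_event_validate_size() check in concrete terms: with read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID, every group member contributes a 16-byte {value, id} pair, so read_size grows as roughly 8 + 16 * nr_members bytes. A group on the order of a thousand events therefore trips the conservative 16 KiB cap on read_size alone, and perf_event_open() for the extra member fails with -E2BIG before anything is installed (see the err_locked path added further down).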
@@ -1914,7 +1939,7 @@ group_sched_in(struct perf_event *group_event,
1914 if (group_event->state == PERF_EVENT_STATE_OFF) 1939 if (group_event->state == PERF_EVENT_STATE_OFF)
1915 return 0; 1940 return 0;
1916 1941
1917 pmu->start_txn(pmu); 1942 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
1918 1943
1919 if (event_sched_in(group_event, cpuctx, ctx)) { 1944 if (event_sched_in(group_event, cpuctx, ctx)) {
1920 pmu->cancel_txn(pmu); 1945 pmu->cancel_txn(pmu);
@@ -3184,14 +3209,22 @@ void perf_event_exec(void)
3184 rcu_read_unlock(); 3209 rcu_read_unlock();
3185} 3210}
3186 3211
3212struct perf_read_data {
3213 struct perf_event *event;
3214 bool group;
3215 int ret;
3216};
3217
3187/* 3218/*
3188 * Cross CPU call to read the hardware event 3219 * Cross CPU call to read the hardware event
3189 */ 3220 */
3190static void __perf_event_read(void *info) 3221static void __perf_event_read(void *info)
3191{ 3222{
3192 struct perf_event *event = info; 3223 struct perf_read_data *data = info;
3224 struct perf_event *sub, *event = data->event;
3193 struct perf_event_context *ctx = event->ctx; 3225 struct perf_event_context *ctx = event->ctx;
3194 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 3226 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3227 struct pmu *pmu = event->pmu;
3195 3228
3196 /* 3229 /*
3197 * If this is a task context, we need to check whether it is 3230 * If this is a task context, we need to check whether it is
@@ -3208,9 +3241,35 @@ static void __perf_event_read(void *info)
3208 update_context_time(ctx); 3241 update_context_time(ctx);
3209 update_cgrp_time_from_event(event); 3242 update_cgrp_time_from_event(event);
3210 } 3243 }
3244
3211 update_event_times(event); 3245 update_event_times(event);
3212 if (event->state == PERF_EVENT_STATE_ACTIVE) 3246 if (event->state != PERF_EVENT_STATE_ACTIVE)
3213 event->pmu->read(event); 3247 goto unlock;
3248
3249 if (!data->group) {
3250 pmu->read(event);
3251 data->ret = 0;
3252 goto unlock;
3253 }
3254
3255 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3256
3257 pmu->read(event);
3258
3259 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3260 update_event_times(sub);
3261 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3262 /*
3263 * Use sibling's PMU rather than @event's since
3264 * sibling could be on different (eg: software) PMU.
3265 */
3266 sub->pmu->read(sub);
3267 }
3268 }
3269
3270 data->ret = pmu->commit_txn(pmu);
3271
3272unlock:
3214 raw_spin_unlock(&ctx->lock); 3273 raw_spin_unlock(&ctx->lock);
3215} 3274}
3216 3275
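The group branch above wraps the sibling reads in a PERF_PMU_TXN_READ transaction, which lets a PMU driver turn N per-event reads into one batched hardware or hypervisor access. A driver opting in would follow a pattern along these lines; struct foo_pmu, the foo_* names and the batching helpers are placeholders, not an API defined by this patch:

struct foo_pmu {
	struct pmu	pmu;
	unsigned int	txn_flags;
	/* device-specific state for queued counter reads would live here */
};

#define to_foo_pmu(p)	container_of((p), struct foo_pmu, pmu)

static void foo_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
	struct foo_pmu *fp = to_foo_pmu(pmu);

	fp->txn_flags = flags;
	if (flags & PERF_PMU_TXN_READ)
		foo_begin_read_batch(fp);	/* placeholder: start queueing reads */
}

static void foo_pmu_read(struct perf_event *event)
{
	struct foo_pmu *fp = to_foo_pmu(event->pmu);

	if (fp->txn_flags & PERF_PMU_TXN_READ)
		foo_queue_read(fp, event);	/* deferred; counts updated in commit_txn */
	else
		foo_update_count(event);	/* immediate single-counter read */
}

static int foo_pmu_commit_txn(struct pmu *pmu)
{
	struct foo_pmu *fp = to_foo_pmu(pmu);

	if (fp->txn_flags & PERF_PMU_TXN_READ)
		return foo_flush_read_batch(fp);	/* one access for all queued events */

	return 0;	/* TXN_ADD handling elided */
}

Drivers that never look at the flags keep working: the perf_pmu_nop_txn()/perf_pmu_start_txn() fallbacks changed later in this file only disable and re-enable the PMU for PERF_PMU_TXN_ADD and treat every other transaction type as a no-op.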
@@ -3275,15 +3334,23 @@ u64 perf_event_read_local(struct perf_event *event)
3275 return val; 3334 return val;
3276} 3335}
3277 3336
3278static u64 perf_event_read(struct perf_event *event) 3337static int perf_event_read(struct perf_event *event, bool group)
3279{ 3338{
3339 int ret = 0;
3340
3280 /* 3341 /*
3281 * If event is enabled and currently active on a CPU, update the 3342 * If event is enabled and currently active on a CPU, update the
3282 * value in the event structure: 3343 * value in the event structure:
3283 */ 3344 */
3284 if (event->state == PERF_EVENT_STATE_ACTIVE) { 3345 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3346 struct perf_read_data data = {
3347 .event = event,
3348 .group = group,
3349 .ret = 0,
3350 };
3285 smp_call_function_single(event->oncpu, 3351 smp_call_function_single(event->oncpu,
3286 __perf_event_read, event, 1); 3352 __perf_event_read, &data, 1);
3353 ret = data.ret;
3287 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 3354 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3288 struct perf_event_context *ctx = event->ctx; 3355 struct perf_event_context *ctx = event->ctx;
3289 unsigned long flags; 3356 unsigned long flags;
@@ -3298,11 +3365,14 @@ static u64 perf_event_read(struct perf_event *event)
3298 update_context_time(ctx); 3365 update_context_time(ctx);
3299 update_cgrp_time_from_event(event); 3366 update_cgrp_time_from_event(event);
3300 } 3367 }
3301 update_event_times(event); 3368 if (group)
3369 update_group_times(event);
3370 else
3371 update_event_times(event);
3302 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3372 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3303 } 3373 }
3304 3374
3305 return perf_event_count(event); 3375 return ret;
3306} 3376}
3307 3377
3308/* 3378/*
@@ -3744,7 +3814,7 @@ static void put_event(struct perf_event *event)
3744 * see the comment there. 3814 * see the comment there.
3745 * 3815 *
3746 * 2) there is a lock-inversion with mmap_sem through 3816 * 2) there is a lock-inversion with mmap_sem through
3747 * perf_event_read_group(), which takes faults while 3817 * perf_read_group(), which takes faults while
3748 * holding ctx->mutex, however this is called after 3818 * holding ctx->mutex, however this is called after
3749 * the last filedesc died, so there is no possibility 3819 * the last filedesc died, so there is no possibility
3750 * to trigger the AB-BA case. 3820 * to trigger the AB-BA case.
@@ -3818,14 +3888,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3818 *running = 0; 3888 *running = 0;
3819 3889
3820 mutex_lock(&event->child_mutex); 3890 mutex_lock(&event->child_mutex);
3821 total += perf_event_read(event); 3891
3892 (void)perf_event_read(event, false);
3893 total += perf_event_count(event);
3894
3822 *enabled += event->total_time_enabled + 3895 *enabled += event->total_time_enabled +
3823 atomic64_read(&event->child_total_time_enabled); 3896 atomic64_read(&event->child_total_time_enabled);
3824 *running += event->total_time_running + 3897 *running += event->total_time_running +
3825 atomic64_read(&event->child_total_time_running); 3898 atomic64_read(&event->child_total_time_running);
3826 3899
3827 list_for_each_entry(child, &event->child_list, child_list) { 3900 list_for_each_entry(child, &event->child_list, child_list) {
3828 total += perf_event_read(child); 3901 (void)perf_event_read(child, false);
3902 total += perf_event_count(child);
3829 *enabled += child->total_time_enabled; 3903 *enabled += child->total_time_enabled;
3830 *running += child->total_time_running; 3904 *running += child->total_time_running;
3831 } 3905 }
@@ -3835,55 +3909,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3835} 3909}
3836EXPORT_SYMBOL_GPL(perf_event_read_value); 3910EXPORT_SYMBOL_GPL(perf_event_read_value);
3837 3911
3838static int perf_event_read_group(struct perf_event *event, 3912static int __perf_read_group_add(struct perf_event *leader,
3839 u64 read_format, char __user *buf) 3913 u64 read_format, u64 *values)
3840{ 3914{
3841 struct perf_event *leader = event->group_leader, *sub; 3915 struct perf_event *sub;
3842 struct perf_event_context *ctx = leader->ctx; 3916 int n = 1; /* skip @nr */
3843 int n = 0, size = 0, ret; 3917 int ret;
3844 u64 count, enabled, running;
3845 u64 values[5];
3846 3918
3847 lockdep_assert_held(&ctx->mutex); 3919 ret = perf_event_read(leader, true);
3920 if (ret)
3921 return ret;
3922
3923 /*
3924 * Since we co-schedule groups, {enabled,running} times of siblings
3925 * will be identical to those of the leader, so we only publish one
3926 * set.
3927 */
3928 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3929 values[n++] += leader->total_time_enabled +
3930 atomic64_read(&leader->child_total_time_enabled);
3931 }
3848 3932
3849 count = perf_event_read_value(leader, &enabled, &running); 3933 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3934 values[n++] += leader->total_time_running +
3935 atomic64_read(&leader->child_total_time_running);
3936 }
3850 3937
3851 values[n++] = 1 + leader->nr_siblings; 3938 /*
3852 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3939 * Write {count,id} tuples for every sibling.
3853 values[n++] = enabled; 3940 */
3854 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3941 values[n++] += perf_event_count(leader);
3855 values[n++] = running;
3856 values[n++] = count;
3857 if (read_format & PERF_FORMAT_ID) 3942 if (read_format & PERF_FORMAT_ID)
3858 values[n++] = primary_event_id(leader); 3943 values[n++] = primary_event_id(leader);
3859 3944
3860 size = n * sizeof(u64); 3945 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3946 values[n++] += perf_event_count(sub);
3947 if (read_format & PERF_FORMAT_ID)
3948 values[n++] = primary_event_id(sub);
3949 }
3861 3950
3862 if (copy_to_user(buf, values, size)) 3951 return 0;
3863 return -EFAULT; 3952}
3864 3953
3865 ret = size; 3954static int perf_read_group(struct perf_event *event,
3955 u64 read_format, char __user *buf)
3956{
3957 struct perf_event *leader = event->group_leader, *child;
3958 struct perf_event_context *ctx = leader->ctx;
3959 int ret;
3960 u64 *values;
3866 3961
3867 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 3962 lockdep_assert_held(&ctx->mutex);
3868 n = 0;
3869 3963
3870 values[n++] = perf_event_read_value(sub, &enabled, &running); 3964 values = kzalloc(event->read_size, GFP_KERNEL);
3871 if (read_format & PERF_FORMAT_ID) 3965 if (!values)
3872 values[n++] = primary_event_id(sub); 3966 return -ENOMEM;
3873 3967
3874 size = n * sizeof(u64); 3968 values[0] = 1 + leader->nr_siblings;
3875 3969
3876 if (copy_to_user(buf + ret, values, size)) { 3970 /*
3877 return -EFAULT; 3971 * By locking the child_mutex of the leader we effectively
3878 } 3972 * lock the child list of all siblings.. XXX explain how.
3973 */
3974 mutex_lock(&leader->child_mutex);
3879 3975
3880 ret += size; 3976 ret = __perf_read_group_add(leader, read_format, values);
3977 if (ret)
3978 goto unlock;
3979
3980 list_for_each_entry(child, &leader->child_list, child_list) {
3981 ret = __perf_read_group_add(child, read_format, values);
3982 if (ret)
3983 goto unlock;
3881 } 3984 }
3882 3985
3986 mutex_unlock(&leader->child_mutex);
3987
3988 ret = event->read_size;
3989 if (copy_to_user(buf, values, event->read_size))
3990 ret = -EFAULT;
3991 goto out;
3992
3993unlock:
3994 mutex_unlock(&leader->child_mutex);
3995out:
3996 kfree(values);
3883 return ret; 3997 return ret;
3884} 3998}
3885 3999
3886static int perf_event_read_one(struct perf_event *event, 4000static int perf_read_one(struct perf_event *event,
3887 u64 read_format, char __user *buf) 4001 u64 read_format, char __user *buf)
3888{ 4002{
3889 u64 enabled, running; 4003 u64 enabled, running;
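perf_read_group() now assembles the whole reply in one kzalloc'ed buffer of event->read_size bytes and folds child counts into the same slots, but the byte layout seen by user space is the documented PERF_FORMAT_GROUP one: nr, the optional time_enabled/time_running pair (published once per group, as the comment above notes), then a {value[, id]} tuple per member. A minimal reader, assuming all four format bits are set and with error handling trimmed:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/*
 * group_fd: perf_event_open() fd of the group leader, opened with
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 *               PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING
 */
static void print_group(int group_fd)
{
	uint64_t buf[512];	/* large enough for this example's group sizes */
	ssize_t n = read(group_fd, buf, sizeof(buf));
	uint64_t i, nr;

	if (n <= 0)
		return;

	nr = buf[0];		/* number of members, leader included */
	/* buf[1] = time_enabled, buf[2] = time_running, shared by the group */
	for (i = 0; i < nr; i++)
		printf("id %llu = %llu\n",
		       (unsigned long long)buf[4 + 2 * i],	/* id    */
		       (unsigned long long)buf[3 + 2 * i]);	/* value */
}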
@@ -3921,7 +4035,7 @@ static bool is_event_hup(struct perf_event *event)
3921 * Read the performance event - simple non blocking version for now 4035 * Read the performance event - simple non blocking version for now
3922 */ 4036 */
3923static ssize_t 4037static ssize_t
3924perf_read_hw(struct perf_event *event, char __user *buf, size_t count) 4038__perf_read(struct perf_event *event, char __user *buf, size_t count)
3925{ 4039{
3926 u64 read_format = event->attr.read_format; 4040 u64 read_format = event->attr.read_format;
3927 int ret; 4041 int ret;
@@ -3939,9 +4053,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3939 4053
3940 WARN_ON_ONCE(event->ctx->parent_ctx); 4054 WARN_ON_ONCE(event->ctx->parent_ctx);
3941 if (read_format & PERF_FORMAT_GROUP) 4055 if (read_format & PERF_FORMAT_GROUP)
3942 ret = perf_event_read_group(event, read_format, buf); 4056 ret = perf_read_group(event, read_format, buf);
3943 else 4057 else
3944 ret = perf_event_read_one(event, read_format, buf); 4058 ret = perf_read_one(event, read_format, buf);
3945 4059
3946 return ret; 4060 return ret;
3947} 4061}
@@ -3954,7 +4068,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3954 int ret; 4068 int ret;
3955 4069
3956 ctx = perf_event_ctx_lock(event); 4070 ctx = perf_event_ctx_lock(event);
3957 ret = perf_read_hw(event, buf, count); 4071 ret = __perf_read(event, buf, count);
3958 perf_event_ctx_unlock(event, ctx); 4072 perf_event_ctx_unlock(event, ctx);
3959 4073
3960 return ret; 4074 return ret;
@@ -3985,7 +4099,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3985 4099
3986static void _perf_event_reset(struct perf_event *event) 4100static void _perf_event_reset(struct perf_event *event)
3987{ 4101{
3988 (void)perf_event_read(event); 4102 (void)perf_event_read(event, false);
3989 local64_set(&event->count, 0); 4103 local64_set(&event->count, 0);
3990 perf_event_update_userpage(event); 4104 perf_event_update_userpage(event);
3991} 4105}
@@ -5261,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle,
5261 5375
5262 if (sample_type & PERF_SAMPLE_RAW) { 5376 if (sample_type & PERF_SAMPLE_RAW) {
5263 if (data->raw) { 5377 if (data->raw) {
5264 perf_output_put(handle, data->raw->size); 5378 u32 raw_size = data->raw->size;
5265 __output_copy(handle, data->raw->data, 5379 u32 real_size = round_up(raw_size + sizeof(u32),
5266 data->raw->size); 5380 sizeof(u64)) - sizeof(u32);
5381 u64 zero = 0;
5382
5383 perf_output_put(handle, real_size);
5384 __output_copy(handle, data->raw->data, raw_size);
5385 if (real_size - raw_size)
5386 __output_copy(handle, &zero, real_size - raw_size);
5267 } else { 5387 } else {
5268 struct { 5388 struct {
5269 u32 size; 5389 u32 size;
@@ -5395,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header,
5395 else 5515 else
5396 size += sizeof(u32); 5516 size += sizeof(u32);
5397 5517
5398 WARN_ON_ONCE(size & (sizeof(u64)-1)); 5518 header->size += round_up(size, sizeof(u64));
5399 header->size += size;
5400 } 5519 }
5401 5520
5402 if (sample_type & PERF_SAMPLE_BRANCH_STACK) { 5521 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
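Both alignment hunks enforce the same invariant: every chunk of a sample record stays a multiple of 8 bytes. A worked example for PERF_SAMPLE_RAW with raw_size = 6: real_size = round_up(6 + 4, 8) - 4 = 12, so the u32 size field announces 12 bytes, the 6 raw bytes are copied and 6 bytes of zero padding follow, giving 16 bytes on the wire including the size field. perf_prepare_sample() now reserves the matching round_up(6 + 4, 8) = 16 bytes instead of warning when the caller's raw blob is not already u64-aligned.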
@@ -7267,24 +7386,49 @@ static void perf_pmu_nop_void(struct pmu *pmu)
7267{ 7386{
7268} 7387}
7269 7388
7389static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
7390{
7391}
7392
7270static int perf_pmu_nop_int(struct pmu *pmu) 7393static int perf_pmu_nop_int(struct pmu *pmu)
7271{ 7394{
7272 return 0; 7395 return 0;
7273} 7396}
7274 7397
7275static void perf_pmu_start_txn(struct pmu *pmu) 7398static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
7399
7400static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
7276{ 7401{
7402 __this_cpu_write(nop_txn_flags, flags);
7403
7404 if (flags & ~PERF_PMU_TXN_ADD)
7405 return;
7406
7277 perf_pmu_disable(pmu); 7407 perf_pmu_disable(pmu);
7278} 7408}
7279 7409
7280static int perf_pmu_commit_txn(struct pmu *pmu) 7410static int perf_pmu_commit_txn(struct pmu *pmu)
7281{ 7411{
7412 unsigned int flags = __this_cpu_read(nop_txn_flags);
7413
7414 __this_cpu_write(nop_txn_flags, 0);
7415
7416 if (flags & ~PERF_PMU_TXN_ADD)
7417 return 0;
7418
7282 perf_pmu_enable(pmu); 7419 perf_pmu_enable(pmu);
7283 return 0; 7420 return 0;
7284} 7421}
7285 7422
7286static void perf_pmu_cancel_txn(struct pmu *pmu) 7423static void perf_pmu_cancel_txn(struct pmu *pmu)
7287{ 7424{
7425 unsigned int flags = __this_cpu_read(nop_txn_flags);
7426
7427 __this_cpu_write(nop_txn_flags, 0);
7428
7429 if (flags & ~PERF_PMU_TXN_ADD)
7430 return;
7431
7288 perf_pmu_enable(pmu); 7432 perf_pmu_enable(pmu);
7289} 7433}
7290 7434
@@ -7523,7 +7667,7 @@ got_cpu_context:
7523 pmu->commit_txn = perf_pmu_commit_txn; 7667 pmu->commit_txn = perf_pmu_commit_txn;
7524 pmu->cancel_txn = perf_pmu_cancel_txn; 7668 pmu->cancel_txn = perf_pmu_cancel_txn;
7525 } else { 7669 } else {
7526 pmu->start_txn = perf_pmu_nop_void; 7670 pmu->start_txn = perf_pmu_nop_txn;
7527 pmu->commit_txn = perf_pmu_nop_int; 7671 pmu->commit_txn = perf_pmu_nop_int;
7528 pmu->cancel_txn = perf_pmu_nop_void; 7672 pmu->cancel_txn = perf_pmu_nop_void;
7529 } 7673 }
@@ -7611,7 +7755,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
7611 return ret; 7755 return ret;
7612} 7756}
7613 7757
7614struct pmu *perf_init_event(struct perf_event *event) 7758static struct pmu *perf_init_event(struct perf_event *event)
7615{ 7759{
7616 struct pmu *pmu = NULL; 7760 struct pmu *pmu = NULL;
7617 int idx; 7761 int idx;
@@ -8297,13 +8441,35 @@ SYSCALL_DEFINE5(perf_event_open,
8297 8441
8298 if (move_group) { 8442 if (move_group) {
8299 gctx = group_leader->ctx; 8443 gctx = group_leader->ctx;
8444 mutex_lock_double(&gctx->mutex, &ctx->mutex);
8445 } else {
8446 mutex_lock(&ctx->mutex);
8447 }
8448
8449 if (!perf_event_validate_size(event)) {
8450 err = -E2BIG;
8451 goto err_locked;
8452 }
8453
8454 /*
8455 * Must be under the same ctx::mutex as perf_install_in_context(),
8456 * because we need to serialize with concurrent event creation.
8457 */
8458 if (!exclusive_event_installable(event, ctx)) {
8459 /* exclusive and group stuff are assumed mutually exclusive */
8460 WARN_ON_ONCE(move_group);
8300 8461
8462 err = -EBUSY;
8463 goto err_locked;
8464 }
8465
8466 WARN_ON_ONCE(ctx->parent_ctx);
8467
8468 if (move_group) {
8301 /* 8469 /*
8302 * See perf_event_ctx_lock() for comments on the details 8470 * See perf_event_ctx_lock() for comments on the details
8303 * of swizzling perf_event::ctx. 8471 * of swizzling perf_event::ctx.
8304 */ 8472 */
8305 mutex_lock_double(&gctx->mutex, &ctx->mutex);
8306
8307 perf_remove_from_context(group_leader, false); 8473 perf_remove_from_context(group_leader, false);
8308 8474
8309 list_for_each_entry(sibling, &group_leader->sibling_list, 8475 list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8311,13 +8477,7 @@ SYSCALL_DEFINE5(perf_event_open,
8311 perf_remove_from_context(sibling, false); 8477 perf_remove_from_context(sibling, false);
8312 put_ctx(gctx); 8478 put_ctx(gctx);
8313 } 8479 }
8314 } else {
8315 mutex_lock(&ctx->mutex);
8316 }
8317 8480
8318 WARN_ON_ONCE(ctx->parent_ctx);
8319
8320 if (move_group) {
8321 /* 8481 /*
8322 * Wait for everybody to stop referencing the events through 8482 * Wait for everybody to stop referencing the events through
8323 * the old lists, before installing it on new lists. 8483 * the old lists, before installing it on new lists.
@@ -8349,22 +8509,29 @@ SYSCALL_DEFINE5(perf_event_open,
8349 perf_event__state_init(group_leader); 8509 perf_event__state_init(group_leader);
8350 perf_install_in_context(ctx, group_leader, group_leader->cpu); 8510 perf_install_in_context(ctx, group_leader, group_leader->cpu);
8351 get_ctx(ctx); 8511 get_ctx(ctx);
8352 }
8353 8512
8354 if (!exclusive_event_installable(event, ctx)) { 8513 /*
8355 err = -EBUSY; 8514 * Now that all events are installed in @ctx, nothing
8356 mutex_unlock(&ctx->mutex); 8515 * references @gctx anymore, so drop the last reference we have
8357 fput(event_file); 8516 * on it.
8358 goto err_context; 8517 */
8518 put_ctx(gctx);
8359 } 8519 }
8360 8520
8521 /*
8522 * Precalculate sample_data sizes; do while holding ctx::mutex such
8523 * that we're serialized against further additions and before
8524 * perf_install_in_context() which is the point the event is active and
8525 * can use these values.
8526 */
8527 perf_event__header_size(event);
8528 perf_event__id_header_size(event);
8529
8361 perf_install_in_context(ctx, event, event->cpu); 8530 perf_install_in_context(ctx, event, event->cpu);
8362 perf_unpin_context(ctx); 8531 perf_unpin_context(ctx);
8363 8532
8364 if (move_group) { 8533 if (move_group)
8365 mutex_unlock(&gctx->mutex); 8534 mutex_unlock(&gctx->mutex);
8366 put_ctx(gctx);
8367 }
8368 mutex_unlock(&ctx->mutex); 8535 mutex_unlock(&ctx->mutex);
8369 8536
8370 put_online_cpus(); 8537 put_online_cpus();
@@ -8376,12 +8543,6 @@ SYSCALL_DEFINE5(perf_event_open,
8376 mutex_unlock(&current->perf_event_mutex); 8543 mutex_unlock(&current->perf_event_mutex);
8377 8544
8378 /* 8545 /*
8379 * Precalculate sample_data sizes
8380 */
8381 perf_event__header_size(event);
8382 perf_event__id_header_size(event);
8383
8384 /*
8385 * Drop the reference on the group_event after placing the 8546 * Drop the reference on the group_event after placing the
8386 * new event on the sibling_list. This ensures destruction 8547 * new event on the sibling_list. This ensures destruction
8387 * of the group leader will find the pointer to itself in 8548 * of the group leader will find the pointer to itself in
@@ -8391,6 +8552,12 @@ SYSCALL_DEFINE5(perf_event_open,
8391 fd_install(event_fd, event_file); 8552 fd_install(event_fd, event_file);
8392 return event_fd; 8553 return event_fd;
8393 8554
8555err_locked:
8556 if (move_group)
8557 mutex_unlock(&gctx->mutex);
8558 mutex_unlock(&ctx->mutex);
8559/* err_file: */
8560 fput(event_file);
8394err_context: 8561err_context:
8395 perf_unpin_context(ctx); 8562 perf_unpin_context(ctx);
8396 put_ctx(ctx); 8563 put_ctx(ctx);
@@ -9293,25 +9460,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
9293 task_function_call(task, __perf_cgroup_move, task); 9460 task_function_call(task, __perf_cgroup_move, task);
9294} 9461}
9295 9462
9296static void perf_cgroup_exit(struct cgroup_subsys_state *css,
9297 struct cgroup_subsys_state *old_css,
9298 struct task_struct *task)
9299{
9300 /*
9301 * cgroup_exit() is called in the copy_process() failure path.
9302 * Ignore this case since the task hasn't ran yet, this avoids
9303 * trying to poke a half freed task state from generic code.
9304 */
9305 if (!(task->flags & PF_EXITING))
9306 return;
9307
9308 task_function_call(task, __perf_cgroup_move, task);
9309}
9310
9311struct cgroup_subsys perf_event_cgrp_subsys = { 9463struct cgroup_subsys perf_event_cgrp_subsys = {
9312 .css_alloc = perf_cgroup_css_alloc, 9464 .css_alloc = perf_cgroup_css_alloc,
9313 .css_free = perf_cgroup_css_free, 9465 .css_free = perf_cgroup_css_free,
9314 .exit = perf_cgroup_exit,
9315 .attach = perf_cgroup_attach, 9466 .attach = perf_cgroup_attach,
9316}; 9467};
9317#endif /* CONFIG_CGROUP_PERF */ 9468#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 182bc30899d5..b5d1ea79c595 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
141 perf_output_get_handle(handle); 141 perf_output_get_handle(handle);
142 142
143 do { 143 do {
144 tail = READ_ONCE_CTRL(rb->user_page->data_tail); 144 tail = READ_ONCE(rb->user_page->data_tail);
145 offset = head = local_read(&rb->head); 145 offset = head = local_read(&rb->head);
146 if (!rb->overwrite && 146 if (!rb->overwrite &&
147 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size)) 147 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
diff --git a/kernel/exit.c b/kernel/exit.c
index ea95ee1b5ef7..07110c6020a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
706 smp_mb(); 706 smp_mb();
707 raw_spin_unlock_wait(&tsk->pi_lock); 707 raw_spin_unlock_wait(&tsk->pi_lock);
708 708
709 if (unlikely(in_atomic())) 709 if (unlikely(in_atomic())) {
710 pr_info("note: %s[%d] exited with preempt_count %d\n", 710 pr_info("note: %s[%d] exited with preempt_count %d\n",
711 current->comm, task_pid_nr(current), 711 current->comm, task_pid_nr(current),
712 preempt_count()); 712 preempt_count());
713 preempt_count_set(PREEMPT_ENABLED);
714 }
713 715
714 /* sync mm's RSS info before statistics gathering */ 716 /* sync mm's RSS info before statistics gathering */
715 if (tsk->mm) 717 if (tsk->mm)
@@ -761,7 +763,9 @@ void do_exit(long code)
761 */ 763 */
762 flush_ptrace_hw_breakpoint(tsk); 764 flush_ptrace_hw_breakpoint(tsk);
763 765
766 TASKS_RCU(preempt_disable());
764 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); 767 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
768 TASKS_RCU(preempt_enable());
765 exit_notify(tsk, group_dead); 769 exit_notify(tsk, group_dead);
766 proc_exit_connector(tsk); 770 proc_exit_connector(tsk);
767#ifdef CONFIG_NUMA 771#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..f97f2c449f5c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
251 WARN_ON(atomic_read(&tsk->usage)); 251 WARN_ON(atomic_read(&tsk->usage));
252 WARN_ON(tsk == current); 252 WARN_ON(tsk == current);
253 253
254 cgroup_free(tsk);
254 task_numa_free(tsk); 255 task_numa_free(tsk);
255 security_task_free(tsk); 256 security_task_free(tsk);
256 exit_creds(tsk); 257 exit_creds(tsk);
@@ -454,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
454 tmp->vm_mm = mm; 455 tmp->vm_mm = mm;
455 if (anon_vma_fork(tmp, mpnt)) 456 if (anon_vma_fork(tmp, mpnt))
456 goto fail_nomem_anon_vma_fork; 457 goto fail_nomem_anon_vma_fork;
457 tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); 458 tmp->vm_flags &=
459 ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
458 tmp->vm_next = tmp->vm_prev = NULL; 460 tmp->vm_next = tmp->vm_prev = NULL;
459 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 461 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
460 file = tmp->vm_file; 462 file = tmp->vm_file;
@@ -1101,7 +1103,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
1101 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); 1103 cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
1102 if (cpu_limit != RLIM_INFINITY) { 1104 if (cpu_limit != RLIM_INFINITY) {
1103 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); 1105 sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
1104 sig->cputimer.running = 1; 1106 sig->cputimer.running = true;
1105 } 1107 }
1106 1108
1107 /* The timer lists. */ 1109 /* The timer lists. */
diff --git a/kernel/futex.c b/kernel/futex.c
index 6e443efc65f4..684d7549825a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -255,9 +255,18 @@ struct futex_hash_bucket {
255 struct plist_head chain; 255 struct plist_head chain;
256} ____cacheline_aligned_in_smp; 256} ____cacheline_aligned_in_smp;
257 257
258static unsigned long __read_mostly futex_hashsize; 258/*
259 * The base of the bucket array and its size are always used together
260 * (after initialization only in hash_futex()), so ensure that they
261 * reside in the same cacheline.
262 */
263static struct {
264 struct futex_hash_bucket *queues;
265 unsigned long hashsize;
266} __futex_data __read_mostly __aligned(2*sizeof(long));
267#define futex_queues (__futex_data.queues)
268#define futex_hashsize (__futex_data.hashsize)
259 269
260static struct futex_hash_bucket *futex_queues;
261 270
262/* 271/*
263 * Fault injections for futexes. 272 * Fault injections for futexes.
@@ -267,10 +276,10 @@ static struct futex_hash_bucket *futex_queues;
267static struct { 276static struct {
268 struct fault_attr attr; 277 struct fault_attr attr;
269 278
270 u32 ignore_private; 279 bool ignore_private;
271} fail_futex = { 280} fail_futex = {
272 .attr = FAULT_ATTR_INITIALIZER, 281 .attr = FAULT_ATTR_INITIALIZER,
273 .ignore_private = 0, 282 .ignore_private = false,
274}; 283};
275 284
276static int __init setup_fail_futex(char *str) 285static int __init setup_fail_futex(char *str)
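The futex change is purely a data-layout optimisation: the bucket array pointer and the hash size are only ever used together in the hash-to-bucket lookup, so packing them into one __aligned(2*sizeof(long)) struct keeps that lookup within a single cache line, and the #define aliases keep every existing reference compiling unchanged. A sketch of the consumer, assuming the hash value is already computed (the real hash_futex() derives it from the futex key):

static inline struct futex_hash_bucket *futex_bucket(u32 hash)
{
	/*
	 * futex_queues and futex_hashsize are now macros over __futex_data,
	 * so both loads below hit the same cache line.
	 */
	return &futex_queues[hash & (futex_hashsize - 1)];
}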
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9a76e3beda54..3b48dab80164 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -30,6 +30,10 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
30config GENERIC_PENDING_IRQ 30config GENERIC_PENDING_IRQ
31 bool 31 bool
32 32
33# Support for generic irq migrating off cpu before the cpu is offline.
34config GENERIC_IRQ_MIGRATION
35 bool
36
33# Alpha specific irq affinity mechanism 37# Alpha specific irq affinity mechanism
34config AUTO_IRQ_AFFINITY 38config AUTO_IRQ_AFFINITY
35 bool 39 bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index d12123526e2b..2fc9cbdf35b6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -5,5 +5,6 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o 5obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
6obj-$(CONFIG_PROC_FS) += proc.o 6obj-$(CONFIG_PROC_FS) += proc.o
7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 7obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
8obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
8obj-$(CONFIG_PM_SLEEP) += pm.o 9obj-$(CONFIG_PM_SLEEP) += pm.o
9obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o 10obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index e28169dd1c36..15206453b12a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -21,6 +21,20 @@
21 21
22#include "internals.h" 22#include "internals.h"
23 23
24static irqreturn_t bad_chained_irq(int irq, void *dev_id)
25{
26 WARN_ONCE(1, "Chained irq %d should not call an action\n", irq);
27 return IRQ_NONE;
28}
29
30/*
31 * Chained handlers should never call action on their IRQ. This default
32 * action will emit a warning if such a thing happens.
33 */
34struct irqaction chained_action = {
35 .handler = bad_chained_irq,
36};
37
24/** 38/**
25 * irq_set_chip - set the irq chip for an irq 39 * irq_set_chip - set the irq chip for an irq
26 * @irq: irq number 40 * @irq: irq number
@@ -227,6 +241,13 @@ void irq_enable(struct irq_desc *desc)
227 * disabled. If an interrupt happens, then the interrupt flow 241 * disabled. If an interrupt happens, then the interrupt flow
228 * handler masks the line at the hardware level and marks it 242 * handler masks the line at the hardware level and marks it
229 * pending. 243 * pending.
244 *
245 * If the interrupt chip does not implement the irq_disable callback,
246 * a driver can disable the lazy approach for a particular irq line by
247 * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can
248 * be used for devices which cannot disable the interrupt at the
249 * device level under certain circumstances and have to use
250 * disable_irq[_nosync] instead.
230 */ 251 */
231void irq_disable(struct irq_desc *desc) 252void irq_disable(struct irq_desc *desc)
232{ 253{
@@ -234,6 +255,8 @@ void irq_disable(struct irq_desc *desc)
234 if (desc->irq_data.chip->irq_disable) { 255 if (desc->irq_data.chip->irq_disable) {
235 desc->irq_data.chip->irq_disable(&desc->irq_data); 256 desc->irq_data.chip->irq_disable(&desc->irq_data);
236 irq_state_set_masked(desc); 257 irq_state_set_masked(desc);
258 } else if (irq_settings_disable_unlazy(desc)) {
259 mask_irq(desc);
237 } 260 }
238} 261}
239 262
@@ -669,7 +692,7 @@ void handle_percpu_irq(struct irq_desc *desc)
669 if (chip->irq_ack) 692 if (chip->irq_ack)
670 chip->irq_ack(&desc->irq_data); 693 chip->irq_ack(&desc->irq_data);
671 694
672 handle_irq_event_percpu(desc, desc->action); 695 handle_irq_event_percpu(desc);
673 696
674 if (chip->irq_eoi) 697 if (chip->irq_eoi)
675 chip->irq_eoi(&desc->irq_data); 698 chip->irq_eoi(&desc->irq_data);
@@ -746,6 +769,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
746 if (desc->irq_data.chip != &no_irq_chip) 769 if (desc->irq_data.chip != &no_irq_chip)
747 mask_ack_irq(desc); 770 mask_ack_irq(desc);
748 irq_state_set_disabled(desc); 771 irq_state_set_disabled(desc);
772 if (is_chained)
773 desc->action = NULL;
749 desc->depth = 1; 774 desc->depth = 1;
750 } 775 }
751 desc->handle_irq = handle; 776 desc->handle_irq = handle;
@@ -755,6 +780,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
755 irq_settings_set_noprobe(desc); 780 irq_settings_set_noprobe(desc);
756 irq_settings_set_norequest(desc); 781 irq_settings_set_norequest(desc);
757 irq_settings_set_nothread(desc); 782 irq_settings_set_nothread(desc);
783 desc->action = &chained_action;
758 irq_startup(desc, true); 784 irq_startup(desc, true);
759 } 785 }
760} 786}
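The expanded irq_disable() comment and the new irq_settings_disable_unlazy() branch give drivers an opt-out from lazy disabling. A driver whose device cannot gate the interrupt at the device level sets the flag once, after which disable_irq()/disable_irq_nosync() mask at the irq chip immediately. A minimal sketch; foo_probe(), foo_handler() and the irq index are placeholders:

static irqreturn_t foo_handler(int irq, void *dev_id)
{
	/* device-specific handling elided */
	return IRQ_HANDLED;
}

static int foo_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);

	if (irq < 0)
		return irq;

	/*
	 * The device cannot gate this interrupt itself, so opt out of the
	 * lazy scheme: disable_irq[_nosync]() will mask at the irq chip
	 * right away instead of waiting for the next interrupt to arrive.
	 */
	irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY);

	return devm_request_irq(&pdev->dev, irq, foo_handler, 0,
				dev_name(&pdev->dev), pdev);
}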
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
new file mode 100644
index 000000000000..011f8c4c63da
--- /dev/null
+++ b/kernel/irq/cpuhotplug.c
@@ -0,0 +1,82 @@
1/*
2 * Generic cpu hotunplug interrupt migration code copied from the
3 * arch/arm implementation
4 *
5 * Copyright (C) Russell King
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/interrupt.h>
12#include <linux/ratelimit.h>
13#include <linux/irq.h>
14
15#include "internals.h"
16
17static bool migrate_one_irq(struct irq_desc *desc)
18{
19 struct irq_data *d = irq_desc_get_irq_data(desc);
20 const struct cpumask *affinity = d->common->affinity;
21 struct irq_chip *c;
22 bool ret = false;
23
24 /*
25 * If this is a per-CPU interrupt, or the affinity does not
26 * include this CPU, then we have nothing to do.
27 */
28 if (irqd_is_per_cpu(d) ||
29 !cpumask_test_cpu(smp_processor_id(), affinity))
30 return false;
31
32 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
33 affinity = cpu_online_mask;
34 ret = true;
35 }
36
37 c = irq_data_get_irq_chip(d);
38 if (!c->irq_set_affinity) {
39 pr_debug("IRQ%u: unable to set affinity\n", d->irq);
40 } else {
41 int r = irq_do_set_affinity(d, affinity, false);
42 if (r)
43 pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
44 d->irq, r);
45 }
46
47 return ret;
48}
49
50/**
51 * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu
52 *
53 * The current CPU has been marked offline. Migrate IRQs off this CPU.
54 * If the affinity settings do not allow other CPUs, force them onto any
55 * available CPU.
56 *
57 * Note: we must iterate over all IRQs, whether they have an attached
58 * action structure or not, as we need to get chained interrupts too.
59 */
60void irq_migrate_all_off_this_cpu(void)
61{
62 unsigned int irq;
63 struct irq_desc *desc;
64 unsigned long flags;
65
66 local_irq_save(flags);
67
68 for_each_active_irq(irq) {
69 bool affinity_broken;
70
71 desc = irq_to_desc(irq);
72 raw_spin_lock(&desc->lock);
73 affinity_broken = migrate_one_irq(desc);
74 raw_spin_unlock(&desc->lock);
75
76 if (affinity_broken)
77 pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n",
78 irq, smp_processor_id());
79 }
80
81 local_irq_restore(flags);
82}
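The new file is built only when an architecture selects GENERIC_IRQ_MIGRATION (see the Kconfig and Makefile hunks above); the intended caller is the architecture's CPU-offline hook, run on the CPU that is going down. Schematically, with arch_cpu_disable() standing in for the per-arch hook (__cpu_disable() on ARM, where this code originates):

int arch_cpu_disable(void)
{
	unsigned int cpu = smp_processor_id();

	/* Take the CPU out of the online mask so nothing new is routed here. */
	set_cpu_online(cpu, false);

	/*
	 * Push every IRQ whose affinity includes this CPU onto the remaining
	 * online CPUs, forcing a new affinity where the old mask allowed only
	 * CPUs that are now offline.
	 */
	irq_migrate_all_off_this_cpu();

	return 0;
}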
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index de41a68fc038..a302cf9a2126 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
22 22
23/** 23/**
24 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
25 * @irq: the interrupt number
26 * @desc: description of the interrupt 25 * @desc: description of the interrupt
27 * 26 *
28 * Handles spurious and unhandled IRQs. It also prints a debug message. 27 * Handles spurious and unhandled IRQs. It also prints a debug message.
@@ -35,6 +34,7 @@ void handle_bad_irq(struct irq_desc *desc)
35 kstat_incr_irqs_this_cpu(desc); 34 kstat_incr_irqs_this_cpu(desc);
36 ack_bad_irq(irq); 35 ack_bad_irq(irq);
37} 36}
37EXPORT_SYMBOL_GPL(handle_bad_irq);
38 38
39/* 39/*
40 * Special, empty irq handler: 40 * Special, empty irq handler:
@@ -132,11 +132,11 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
132 wake_up_process(action->thread); 132 wake_up_process(action->thread);
133} 133}
134 134
135irqreturn_t 135irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
136handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
137{ 136{
138 irqreturn_t retval = IRQ_NONE; 137 irqreturn_t retval = IRQ_NONE;
139 unsigned int flags = 0, irq = desc->irq_data.irq; 138 unsigned int flags = 0, irq = desc->irq_data.irq;
139 struct irqaction *action = desc->action;
140 140
141 do { 141 do {
142 irqreturn_t res; 142 irqreturn_t res;
@@ -184,14 +184,13 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
184 184
185irqreturn_t handle_irq_event(struct irq_desc *desc) 185irqreturn_t handle_irq_event(struct irq_desc *desc)
186{ 186{
187 struct irqaction *action = desc->action;
188 irqreturn_t ret; 187 irqreturn_t ret;
189 188
190 desc->istate &= ~IRQS_PENDING; 189 desc->istate &= ~IRQS_PENDING;
191 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); 190 irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
192 raw_spin_unlock(&desc->lock); 191 raw_spin_unlock(&desc->lock);
193 192
194 ret = handle_irq_event_percpu(desc, action); 193 ret = handle_irq_event_percpu(desc);
195 194
196 raw_spin_lock(&desc->lock); 195 raw_spin_lock(&desc->lock);
197 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); 196 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 5ef0c2dbe930..05c2188271b8 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,6 +18,8 @@
18 18
19extern bool noirqdebug; 19extern bool noirqdebug;
20 20
21extern struct irqaction chained_action;
22
21/* 23/*
22 * Bits used by threaded handlers: 24 * Bits used by threaded handlers:
23 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 25 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
@@ -81,7 +83,7 @@ extern void irq_mark_irq(unsigned int irq);
81 83
82extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 84extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
83 85
84irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); 86irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
85irqreturn_t handle_irq_event(struct irq_desc *desc); 87irqreturn_t handle_irq_event(struct irq_desc *desc);
86 88
87/* Resending of interrupts :*/ 89/* Resending of interrupts :*/
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index dc9d27c0c158..22aa9612ef7c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,6 +27,57 @@ static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
27 irq_hw_number_t hwirq, int node); 27 irq_hw_number_t hwirq, int node);
28static void irq_domain_check_hierarchy(struct irq_domain *domain); 28static void irq_domain_check_hierarchy(struct irq_domain *domain);
29 29
30struct irqchip_fwid {
31 struct fwnode_handle fwnode;
32 char *name;
33 void *data;
34};
35
36/**
37 * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
38 * identifying an irq domain
39 * @data: optional user-provided data
40 *
41 * Allocate a struct irqchip_fwid, and return a pointer to the embedded
42 * fwnode_handle (or NULL on failure).
43 */
44struct fwnode_handle *irq_domain_alloc_fwnode(void *data)
45{
46 struct irqchip_fwid *fwid;
47 char *name;
48
49 fwid = kzalloc(sizeof(*fwid), GFP_KERNEL);
50 name = kasprintf(GFP_KERNEL, "irqchip@%p", data);
51
52 if (!fwid || !name) {
53 kfree(fwid);
54 kfree(name);
55 return NULL;
56 }
57
58 fwid->name = name;
59 fwid->data = data;
60 fwid->fwnode.type = FWNODE_IRQCHIP;
61 return &fwid->fwnode;
62}
63
64/**
65 * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
66 *
67 * Free a fwnode_handle allocated with irq_domain_alloc_fwnode.
68 */
69void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
70{
71 struct irqchip_fwid *fwid;
72
73 if (WARN_ON(fwnode->type != FWNODE_IRQCHIP))
74 return;
75
76 fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
77 kfree(fwid->name);
78 kfree(fwid);
79}
80
30/** 81/**
31 * __irq_domain_add() - Allocate a new irq_domain data structure 82 * __irq_domain_add() - Allocate a new irq_domain data structure
32 * @of_node: optional device-tree node of the interrupt controller 83 * @of_node: optional device-tree node of the interrupt controller
@@ -40,23 +91,28 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain);
40 * Allocates and initializes an irq_domain structure. 91 * Allocates and initializes an irq_domain structure.
41 * Returns pointer to IRQ domain, or NULL on failure. 92 * Returns pointer to IRQ domain, or NULL on failure.
42 */ 93 */
43struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, 94struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
44 irq_hw_number_t hwirq_max, int direct_max, 95 irq_hw_number_t hwirq_max, int direct_max,
45 const struct irq_domain_ops *ops, 96 const struct irq_domain_ops *ops,
46 void *host_data) 97 void *host_data)
47{ 98{
48 struct irq_domain *domain; 99 struct irq_domain *domain;
100 struct device_node *of_node;
101
102 of_node = to_of_node(fwnode);
49 103
50 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), 104 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
51 GFP_KERNEL, of_node_to_nid(of_node)); 105 GFP_KERNEL, of_node_to_nid(of_node));
52 if (WARN_ON(!domain)) 106 if (WARN_ON(!domain))
53 return NULL; 107 return NULL;
54 108
109 of_node_get(of_node);
110
55 /* Fill structure */ 111 /* Fill structure */
56 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL); 112 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
57 domain->ops = ops; 113 domain->ops = ops;
58 domain->host_data = host_data; 114 domain->host_data = host_data;
59 domain->of_node = of_node_get(of_node); 115 domain->fwnode = fwnode;
60 domain->hwirq_max = hwirq_max; 116 domain->hwirq_max = hwirq_max;
61 domain->revmap_size = size; 117 domain->revmap_size = size;
62 domain->revmap_direct_max_irq = direct_max; 118 domain->revmap_direct_max_irq = direct_max;
@@ -102,7 +158,7 @@ void irq_domain_remove(struct irq_domain *domain)
102 158
103 pr_debug("Removed domain %s\n", domain->name); 159 pr_debug("Removed domain %s\n", domain->name);
104 160
105 of_node_put(domain->of_node); 161 of_node_put(irq_domain_get_of_node(domain));
106 kfree(domain); 162 kfree(domain);
107} 163}
108EXPORT_SYMBOL_GPL(irq_domain_remove); 164EXPORT_SYMBOL_GPL(irq_domain_remove);
@@ -133,7 +189,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
133{ 189{
134 struct irq_domain *domain; 190 struct irq_domain *domain;
135 191
136 domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); 192 domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
137 if (!domain) 193 if (!domain)
138 return NULL; 194 return NULL;
139 195
@@ -177,7 +233,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
177{ 233{
178 struct irq_domain *domain; 234 struct irq_domain *domain;
179 235
180 domain = __irq_domain_add(of_node, first_hwirq + size, 236 domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size,
181 first_hwirq + size, 0, ops, host_data); 237 first_hwirq + size, 0, ops, host_data);
182 if (domain) 238 if (domain)
183 irq_domain_associate_many(domain, first_irq, first_hwirq, size); 239 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
@@ -187,12 +243,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
187EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 243EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
188 244
189/** 245/**
190 * irq_find_matching_host() - Locates a domain for a given device node 246 * irq_find_matching_fwnode() - Locates a domain for a given fwnode
191 * @node: device-tree node of the interrupt controller 247 * @fwnode: FW descriptor of the interrupt controller
192 * @bus_token: domain-specific data 248 * @bus_token: domain-specific data
193 */ 249 */
194struct irq_domain *irq_find_matching_host(struct device_node *node, 250struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
195 enum irq_domain_bus_token bus_token) 251 enum irq_domain_bus_token bus_token)
196{ 252{
197 struct irq_domain *h, *found = NULL; 253 struct irq_domain *h, *found = NULL;
198 int rc; 254 int rc;
@@ -209,9 +265,9 @@ struct irq_domain *irq_find_matching_host(struct device_node *node,
209 mutex_lock(&irq_domain_mutex); 265 mutex_lock(&irq_domain_mutex);
210 list_for_each_entry(h, &irq_domain_list, link) { 266 list_for_each_entry(h, &irq_domain_list, link) {
211 if (h->ops->match) 267 if (h->ops->match)
212 rc = h->ops->match(h, node, bus_token); 268 rc = h->ops->match(h, to_of_node(fwnode), bus_token);
213 else 269 else
214 rc = ((h->of_node != NULL) && (h->of_node == node) && 270 rc = ((fwnode != NULL) && (h->fwnode == fwnode) &&
215 ((bus_token == DOMAIN_BUS_ANY) || 271 ((bus_token == DOMAIN_BUS_ANY) ||
216 (h->bus_token == bus_token))); 272 (h->bus_token == bus_token)));
217 273
@@ -223,7 +279,7 @@ struct irq_domain *irq_find_matching_host(struct device_node *node,
223 mutex_unlock(&irq_domain_mutex); 279 mutex_unlock(&irq_domain_mutex);
224 return found; 280 return found;
225} 281}
226EXPORT_SYMBOL_GPL(irq_find_matching_host); 282EXPORT_SYMBOL_GPL(irq_find_matching_fwnode);
227 283
228/** 284/**
229 * irq_set_default_host() - Set a "default" irq domain 285 * irq_set_default_host() - Set a "default" irq domain
@@ -336,10 +392,12 @@ EXPORT_SYMBOL_GPL(irq_domain_associate);
336void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, 392void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
337 irq_hw_number_t hwirq_base, int count) 393 irq_hw_number_t hwirq_base, int count)
338{ 394{
395 struct device_node *of_node;
339 int i; 396 int i;
340 397
398 of_node = irq_domain_get_of_node(domain);
341 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, 399 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
342 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); 400 of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
343 401
344 for (i = 0; i < count; i++) { 402 for (i = 0; i < count; i++) {
345 irq_domain_associate(domain, irq_base + i, hwirq_base + i); 403 irq_domain_associate(domain, irq_base + i, hwirq_base + i);
@@ -359,12 +417,14 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
359 */ 417 */
360unsigned int irq_create_direct_mapping(struct irq_domain *domain) 418unsigned int irq_create_direct_mapping(struct irq_domain *domain)
361{ 419{
420 struct device_node *of_node;
362 unsigned int virq; 421 unsigned int virq;
363 422
364 if (domain == NULL) 423 if (domain == NULL)
365 domain = irq_default_domain; 424 domain = irq_default_domain;
366 425
367 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); 426 of_node = irq_domain_get_of_node(domain);
427 virq = irq_alloc_desc_from(1, of_node_to_nid(of_node));
368 if (!virq) { 428 if (!virq) {
369 pr_debug("create_direct virq allocation failed\n"); 429 pr_debug("create_direct virq allocation failed\n");
370 return 0; 430 return 0;
@@ -399,6 +459,7 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
399unsigned int irq_create_mapping(struct irq_domain *domain, 459unsigned int irq_create_mapping(struct irq_domain *domain,
400 irq_hw_number_t hwirq) 460 irq_hw_number_t hwirq)
401{ 461{
462 struct device_node *of_node;
402 int virq; 463 int virq;
403 464
404 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); 465 pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -412,6 +473,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
412 } 473 }
413 pr_debug("-> using domain @%p\n", domain); 474 pr_debug("-> using domain @%p\n", domain);
414 475
476 of_node = irq_domain_get_of_node(domain);
477
415 /* Check if mapping already exists */ 478 /* Check if mapping already exists */
416 virq = irq_find_mapping(domain, hwirq); 479 virq = irq_find_mapping(domain, hwirq);
417 if (virq) { 480 if (virq) {
@@ -420,8 +483,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
420 } 483 }
421 484
422 /* Allocate a virtual interrupt number */ 485 /* Allocate a virtual interrupt number */
423 virq = irq_domain_alloc_descs(-1, 1, hwirq, 486 virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node));
424 of_node_to_nid(domain->of_node));
425 if (virq <= 0) { 487 if (virq <= 0) {
426 pr_debug("-> virq allocation failed\n"); 488 pr_debug("-> virq allocation failed\n");
427 return 0; 489 return 0;
@@ -433,7 +495,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
433 } 495 }
434 496
435 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n", 497 pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
436 hwirq, of_node_full_name(domain->of_node), virq); 498 hwirq, of_node_full_name(of_node), virq);
437 499
438 return virq; 500 return virq;
439} 501}
@@ -460,10 +522,12 @@ EXPORT_SYMBOL_GPL(irq_create_mapping);
460int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base, 522int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
461 irq_hw_number_t hwirq_base, int count) 523 irq_hw_number_t hwirq_base, int count)
462{ 524{
525 struct device_node *of_node;
463 int ret; 526 int ret;
464 527
528 of_node = irq_domain_get_of_node(domain);
465 ret = irq_alloc_descs(irq_base, irq_base, count, 529 ret = irq_alloc_descs(irq_base, irq_base, count,
466 of_node_to_nid(domain->of_node)); 530 of_node_to_nid(of_node));
467 if (unlikely(ret < 0)) 531 if (unlikely(ret < 0))
468 return ret; 532 return ret;
469 533
@@ -472,28 +536,56 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
472} 536}
473EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 537EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
474 538
475unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) 539static int irq_domain_translate(struct irq_domain *d,
540 struct irq_fwspec *fwspec,
541 irq_hw_number_t *hwirq, unsigned int *type)
542{
543#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
544 if (d->ops->translate)
545 return d->ops->translate(d, fwspec, hwirq, type);
546#endif
547 if (d->ops->xlate)
548 return d->ops->xlate(d, to_of_node(fwspec->fwnode),
549 fwspec->param, fwspec->param_count,
550 hwirq, type);
551
552 /* If domain has no translation, then we assume interrupt line */
553 *hwirq = fwspec->param[0];
554 return 0;
555}
556
557static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
558 struct irq_fwspec *fwspec)
559{
560 int i;
561
562 fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL;
563 fwspec->param_count = irq_data->args_count;
564
565 for (i = 0; i < irq_data->args_count; i++)
566 fwspec->param[i] = irq_data->args[i];
567}
568
569unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
476{ 570{
477 struct irq_domain *domain; 571 struct irq_domain *domain;
478 irq_hw_number_t hwirq; 572 irq_hw_number_t hwirq;
479 unsigned int type = IRQ_TYPE_NONE; 573 unsigned int type = IRQ_TYPE_NONE;
480 int virq; 574 int virq;
481 575
482 domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; 576 if (fwspec->fwnode)
577 domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY);
578 else
579 domain = irq_default_domain;
580
483 if (!domain) { 581 if (!domain) {
484 pr_warn("no irq domain found for %s !\n", 582 pr_warn("no irq domain found for %s !\n",
485 of_node_full_name(irq_data->np)); 583 of_node_full_name(to_of_node(fwspec->fwnode)));
486 return 0; 584 return 0;
487 } 585 }
488 586
489 /* If domain has no translation, then we assume interrupt line */ 587 if (irq_domain_translate(domain, fwspec, &hwirq, &type))
490 if (domain->ops->xlate == NULL) 588 return 0;
491 hwirq = irq_data->args[0];
492 else {
493 if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
494 irq_data->args_count, &hwirq, &type))
495 return 0;
496 }
497 589
498 if (irq_domain_is_hierarchy(domain)) { 590 if (irq_domain_is_hierarchy(domain)) {
499 /* 591 /*
@@ -504,7 +596,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
504 if (virq) 596 if (virq)
505 return virq; 597 return virq;
506 598
507 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); 599 virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
508 if (virq <= 0) 600 if (virq <= 0)
509 return 0; 601 return 0;
510 } else { 602 } else {
@@ -520,6 +612,15 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
520 irq_set_irq_type(virq, type); 612 irq_set_irq_type(virq, type);
521 return virq; 613 return virq;
522} 614}
615EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
616
617unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
618{
619 struct irq_fwspec fwspec;
620
621 of_phandle_args_to_fwspec(irq_data, &fwspec);
622 return irq_create_fwspec_mapping(&fwspec);
623}
523EXPORT_SYMBOL_GPL(irq_create_of_mapping); 624EXPORT_SYMBOL_GPL(irq_create_of_mapping);
524 625
525/** 626/**
@@ -590,14 +691,16 @@ static int virq_debug_show(struct seq_file *m, void *private)
590 "name", "mapped", "linear-max", "direct-max", "devtree-node"); 691 "name", "mapped", "linear-max", "direct-max", "devtree-node");
591 mutex_lock(&irq_domain_mutex); 692 mutex_lock(&irq_domain_mutex);
592 list_for_each_entry(domain, &irq_domain_list, link) { 693 list_for_each_entry(domain, &irq_domain_list, link) {
694 struct device_node *of_node;
593 int count = 0; 695 int count = 0;
696 of_node = irq_domain_get_of_node(domain);
594 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) 697 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
595 count++; 698 count++;
596 seq_printf(m, "%c%-16s %6u %10u %10u %s\n", 699 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
597 domain == irq_default_domain ? '*' : ' ', domain->name, 700 domain == irq_default_domain ? '*' : ' ', domain->name,
598 domain->revmap_size + count, domain->revmap_size, 701 domain->revmap_size + count, domain->revmap_size,
599 domain->revmap_direct_max_irq, 702 domain->revmap_direct_max_irq,
600 domain->of_node ? of_node_full_name(domain->of_node) : ""); 703 of_node ? of_node_full_name(of_node) : "");
601 } 704 }
602 mutex_unlock(&irq_domain_mutex); 705 mutex_unlock(&irq_domain_mutex);
603 706
@@ -751,11 +854,11 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt,
751 854
752#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY 855#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
753/** 856/**
754 * irq_domain_add_hierarchy - Add an irqdomain into the hierarchy 857 * irq_domain_create_hierarchy - Add an irqdomain into the hierarchy
755 * @parent: Parent irq domain to associate with the new domain 858 * @parent: Parent irq domain to associate with the new domain
756 * @flags: Irq domain flags associated to the domain 859 * @flags: Irq domain flags associated to the domain
757 * @size: Size of the domain. See below 860 * @size: Size of the domain. See below
758 * @node: Optional device-tree node of the interrupt controller 861 * @fwnode: Optional fwnode of the interrupt controller
759 * @ops: Pointer to the interrupt domain callbacks 862 * @ops: Pointer to the interrupt domain callbacks
760 * @host_data: Controller private data pointer 863 * @host_data: Controller private data pointer
761 * 864 *
@@ -765,19 +868,19 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt,
765 * domain flags are set. 868 * domain flags are set.
766 * Returns pointer to IRQ domain, or NULL on failure. 869 * Returns pointer to IRQ domain, or NULL on failure.
767 */ 870 */
768struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, 871struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
769 unsigned int flags, 872 unsigned int flags,
770 unsigned int size, 873 unsigned int size,
771 struct device_node *node, 874 struct fwnode_handle *fwnode,
772 const struct irq_domain_ops *ops, 875 const struct irq_domain_ops *ops,
773 void *host_data) 876 void *host_data)
774{ 877{
775 struct irq_domain *domain; 878 struct irq_domain *domain;
776 879
777 if (size) 880 if (size)
778 domain = irq_domain_add_linear(node, size, ops, host_data); 881 domain = irq_domain_create_linear(fwnode, size, ops, host_data);
779 else 882 else
780 domain = irq_domain_add_tree(node, ops, host_data); 883 domain = irq_domain_create_tree(fwnode, ops, host_data);
781 if (domain) { 884 if (domain) {
782 domain->parent = parent; 885 domain->parent = parent;
783 domain->flags |= flags; 886 domain->flags |= flags;
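
[Editor's sketch, not part of the patch: with the fwnode-based variant, a stacked irqchip driver passes its fwnode_handle instead of a device_node when creating the child domain. The helper name and parameters below are placeholders.]

    #include <linux/irqdomain.h>

    /* Create a 32-entry child domain on top of an existing parent domain. */
    static struct irq_domain *example_create_child(struct irq_domain *parent,
                                                   struct fwnode_handle *fwnode,
                                                   const struct irq_domain_ops *ops,
                                                   void *priv)
    {
            /* size = 32 selects a linear revmap; size = 0 would pick a tree. */
            return irq_domain_create_hierarchy(parent, 0, 32, fwnode, ops, priv);
    }
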
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9a59f6cabd2..0eebaeef317b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -258,37 +258,6 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
258} 258}
259EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 259EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
260 260
261/**
262 * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
263 * @irq: interrupt number to set affinity
264 * @vcpu_info: vCPU specific data
265 *
266 * This function uses the vCPU specific data to set the vCPU
267 * affinity for an irq. The vCPU specific data is passed from
268 * outside, such as KVM. One example code path is as below:
269 * KVM -> IOMMU -> irq_set_vcpu_affinity().
270 */
271int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
272{
273 unsigned long flags;
274 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
275 struct irq_data *data;
276 struct irq_chip *chip;
277 int ret = -ENOSYS;
278
279 if (!desc)
280 return -EINVAL;
281
282 data = irq_desc_get_irq_data(desc);
283 chip = irq_data_get_irq_chip(data);
284 if (chip && chip->irq_set_vcpu_affinity)
285 ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
286 irq_put_desc_unlock(desc, flags);
287
288 return ret;
289}
290EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
291
292static void irq_affinity_notify(struct work_struct *work) 261static void irq_affinity_notify(struct work_struct *work)
293{ 262{
294 struct irq_affinity_notify *notify = 263 struct irq_affinity_notify *notify =
@@ -424,6 +393,37 @@ setup_affinity(struct irq_desc *desc, struct cpumask *mask)
424} 393}
425#endif 394#endif
426 395
396/**
397 * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
398 * @irq: interrupt number to set affinity
399 * @vcpu_info: vCPU specific data
400 *
401 * This function uses the vCPU specific data to set the vCPU
402 * affinity for an irq. The vCPU specific data is passed from
403 * outside, such as KVM. One example code path is as below:
404 * KVM -> IOMMU -> irq_set_vcpu_affinity().
405 */
406int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
407{
408 unsigned long flags;
409 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
410 struct irq_data *data;
411 struct irq_chip *chip;
412 int ret = -ENOSYS;
413
414 if (!desc)
415 return -EINVAL;
416
417 data = irq_desc_get_irq_data(desc);
418 chip = irq_data_get_irq_chip(data);
419 if (chip && chip->irq_set_vcpu_affinity)
420 ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
421 irq_put_desc_unlock(desc, flags);
422
423 return ret;
424}
425EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
426
427void __disable_irq(struct irq_desc *desc) 427void __disable_irq(struct irq_desc *desc)
428{ 428{
429 if (!desc->depth++) 429 if (!desc->depth++)
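
[Editor's sketch, not part of the patch, since the hunk only moves irq_set_vcpu_affinity(): both ends of the interface. An irqchip advertises ->irq_set_vcpu_affinity and a hypervisor-side caller (the KVM -> IOMMU path named in the comment) hands it an opaque per-vCPU cookie. The chip callback body is hypothetical.]

    #include <linux/irq.h>
    #include <linux/interrupt.h>

    /* Chip side: accept the opaque vCPU cookie. */
    static int example_chip_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
    {
            /* program posted-interrupt / vAPIC routing from vcpu_info (hypothetical) */
            return 0;
    }

    static struct irq_chip example_chip = {
            .name                  = "example",
            .irq_set_vcpu_affinity = example_chip_set_vcpu_affinity,
    };

    /* Caller side: route the cookie through the generic helper. */
    static int example_post_to_vcpu(unsigned int irq, void *vcpu_cookie)
    {
            return irq_set_vcpu_affinity(irq, vcpu_cookie); /* -ENOSYS if unsupported */
    }
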
@@ -730,6 +730,12 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
730 return IRQ_NONE; 730 return IRQ_NONE;
731} 731}
732 732
733static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
734{
735 WARN(1, "Secondary action handler called for irq %d\n", irq);
736 return IRQ_NONE;
737}
738
733static int irq_wait_for_interrupt(struct irqaction *action) 739static int irq_wait_for_interrupt(struct irqaction *action)
734{ 740{
735 set_current_state(TASK_INTERRUPTIBLE); 741 set_current_state(TASK_INTERRUPTIBLE);
@@ -756,7 +762,8 @@ static int irq_wait_for_interrupt(struct irqaction *action)
756static void irq_finalize_oneshot(struct irq_desc *desc, 762static void irq_finalize_oneshot(struct irq_desc *desc,
757 struct irqaction *action) 763 struct irqaction *action)
758{ 764{
759 if (!(desc->istate & IRQS_ONESHOT)) 765 if (!(desc->istate & IRQS_ONESHOT) ||
766 action->handler == irq_forced_secondary_handler)
760 return; 767 return;
761again: 768again:
762 chip_bus_lock(desc); 769 chip_bus_lock(desc);
@@ -910,6 +917,18 @@ static void irq_thread_dtor(struct callback_head *unused)
910 irq_finalize_oneshot(desc, action); 917 irq_finalize_oneshot(desc, action);
911} 918}
912 919
920static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
921{
922 struct irqaction *secondary = action->secondary;
923
924 if (WARN_ON_ONCE(!secondary))
925 return;
926
927 raw_spin_lock_irq(&desc->lock);
928 __irq_wake_thread(desc, secondary);
929 raw_spin_unlock_irq(&desc->lock);
930}
931
913/* 932/*
914 * Interrupt handler thread 933 * Interrupt handler thread
915 */ 934 */
@@ -940,6 +959,8 @@ static int irq_thread(void *data)
940 action_ret = handler_fn(desc, action); 959 action_ret = handler_fn(desc, action);
941 if (action_ret == IRQ_HANDLED) 960 if (action_ret == IRQ_HANDLED)
942 atomic_inc(&desc->threads_handled); 961 atomic_inc(&desc->threads_handled);
962 if (action_ret == IRQ_WAKE_THREAD)
963 irq_wake_secondary(desc, action);
943 964
944 wake_threads_waitq(desc); 965 wake_threads_waitq(desc);
945 } 966 }
@@ -984,20 +1005,36 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
984} 1005}
985EXPORT_SYMBOL_GPL(irq_wake_thread); 1006EXPORT_SYMBOL_GPL(irq_wake_thread);
986 1007
987static void irq_setup_forced_threading(struct irqaction *new) 1008static int irq_setup_forced_threading(struct irqaction *new)
988{ 1009{
989 if (!force_irqthreads) 1010 if (!force_irqthreads)
990 return; 1011 return 0;
991 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) 1012 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
992 return; 1013 return 0;
993 1014
994 new->flags |= IRQF_ONESHOT; 1015 new->flags |= IRQF_ONESHOT;
995 1016
996 if (!new->thread_fn) { 1017 /*
997 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); 1018 * Handle the case where we have a real primary handler and a
 998 new->thread_fn = new->handler; 1019 * thread handler. We force-thread them as well by creating a
999 new->handler = irq_default_primary_handler; 1020 * secondary action.
1021 */
1022 if (new->handler != irq_default_primary_handler && new->thread_fn) {
1023 /* Allocate the secondary action */
1024 new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1025 if (!new->secondary)
1026 return -ENOMEM;
1027 new->secondary->handler = irq_forced_secondary_handler;
1028 new->secondary->thread_fn = new->thread_fn;
1029 new->secondary->dev_id = new->dev_id;
1030 new->secondary->irq = new->irq;
1031 new->secondary->name = new->name;
1000 } 1032 }
1033 /* Deal with the primary handler */
1034 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
1035 new->thread_fn = new->handler;
1036 new->handler = irq_default_primary_handler;
1037 return 0;
1001} 1038}
1002 1039
1003static int irq_request_resources(struct irq_desc *desc) 1040static int irq_request_resources(struct irq_desc *desc)
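
[Editor's sketch, not part of the patch, to make the secondary-action path concrete from the request side: a driver registering both a real primary handler and a thread function. Without forced threading the primary runs in hard-irq context; with threadirqs on the command line, __setup_irq() now threads the primary as well and the thread_fn lands on the secondary action created above. Names and handlers are placeholders.]

    #include <linux/interrupt.h>

    static irqreturn_t example_quick_check(int irq, void *dev_id)
    {
            /* ack the device; decide whether the heavy work is needed */
            return IRQ_WAKE_THREAD;
    }

    static irqreturn_t example_slow_work(int irq, void *dev_id)
    {
            /* long-running processing, runs in the irq thread */
            return IRQ_HANDLED;
    }

    static int example_request(unsigned int irq, void *dev)
    {
            /* Both handlers supplied; under force_irqthreads they become the
             * primary and secondary irqactions behind the scenes. */
            return request_threaded_irq(irq, example_quick_check, example_slow_work,
                                        IRQF_ONESHOT, "example-dev", dev);
    }
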
@@ -1017,6 +1054,48 @@ static void irq_release_resources(struct irq_desc *desc)
1017 c->irq_release_resources(d); 1054 c->irq_release_resources(d);
1018} 1055}
1019 1056
1057static int
1058setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
1059{
1060 struct task_struct *t;
1061 struct sched_param param = {
1062 .sched_priority = MAX_USER_RT_PRIO/2,
1063 };
1064
1065 if (!secondary) {
1066 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
1067 new->name);
1068 } else {
1069 t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
1070 new->name);
1071 param.sched_priority -= 1;
1072 }
1073
1074 if (IS_ERR(t))
1075 return PTR_ERR(t);
1076
1077 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
1078
1079 /*
1080 * We keep the reference to the task struct even if
1081 * the thread dies to avoid that the interrupt code
1082 * references an already freed task_struct.
1083 */
1084 get_task_struct(t);
1085 new->thread = t;
1086 /*
1087 * Tell the thread to set its affinity. This is
1088 * important for shared interrupt handlers as we do
1089 * not invoke setup_affinity() for the secondary
1090 * handlers as everything is already set up. Even for
1091 * interrupts marked with IRQF_NO_BALANCE this is
1092 * correct as we want the thread to move to the cpu(s)
1093 * on which the requesting code placed the interrupt.
1094 */
1095 set_bit(IRQTF_AFFINITY, &new->thread_flags);
1096 return 0;
1097}
1098
1020/* 1099/*
1021 * Internal function to register an irqaction - typically used to 1100 * Internal function to register an irqaction - typically used to
1022 * allocate special interrupts that are part of the architecture. 1101 * allocate special interrupts that are part of the architecture.
@@ -1037,6 +1116,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1037 if (!try_module_get(desc->owner)) 1116 if (!try_module_get(desc->owner))
1038 return -ENODEV; 1117 return -ENODEV;
1039 1118
1119 new->irq = irq;
1120
1040 /* 1121 /*
1041 * Check whether the interrupt nests into another interrupt 1122 * Check whether the interrupt nests into another interrupt
1042 * thread. 1123 * thread.
@@ -1054,8 +1135,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1054 */ 1135 */
1055 new->handler = irq_nested_primary_handler; 1136 new->handler = irq_nested_primary_handler;
1056 } else { 1137 } else {
1057 if (irq_settings_can_thread(desc)) 1138 if (irq_settings_can_thread(desc)) {
1058 irq_setup_forced_threading(new); 1139 ret = irq_setup_forced_threading(new);
1140 if (ret)
1141 goto out_mput;
1142 }
1059 } 1143 }
1060 1144
1061 /* 1145 /*
@@ -1064,37 +1148,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1064 * thread. 1148 * thread.
1065 */ 1149 */
1066 if (new->thread_fn && !nested) { 1150 if (new->thread_fn && !nested) {
1067 struct task_struct *t; 1151 ret = setup_irq_thread(new, irq, false);
1068 static const struct sched_param param = { 1152 if (ret)
1069 .sched_priority = MAX_USER_RT_PRIO/2,
1070 };
1071
1072 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
1073 new->name);
1074 if (IS_ERR(t)) {
1075 ret = PTR_ERR(t);
1076 goto out_mput; 1153 goto out_mput;
1154 if (new->secondary) {
1155 ret = setup_irq_thread(new->secondary, irq, true);
1156 if (ret)
1157 goto out_thread;
1077 } 1158 }
1078
1079 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
1080
1081 /*
1082 * We keep the reference to the task struct even if
1083 * the thread dies to avoid that the interrupt code
1084 * references an already freed task_struct.
1085 */
1086 get_task_struct(t);
1087 new->thread = t;
1088 /*
1089 * Tell the thread to set its affinity. This is
1090 * important for shared interrupt handlers as we do
1091 * not invoke setup_affinity() for the secondary
1092 * handlers as everything is already set up. Even for
1093 * interrupts marked with IRQF_NO_BALANCE this is
1094 * correct as we want the thread to move to the cpu(s)
1095 * on which the requesting code placed the interrupt.
1096 */
1097 set_bit(IRQTF_AFFINITY, &new->thread_flags);
1098 } 1159 }
1099 1160
1100 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { 1161 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1267,7 +1328,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1267 irq, nmsk, omsk); 1328 irq, nmsk, omsk);
1268 } 1329 }
1269 1330
1270 new->irq = irq;
1271 *old_ptr = new; 1331 *old_ptr = new;
1272 1332
1273 irq_pm_install_action(desc, new); 1333 irq_pm_install_action(desc, new);
@@ -1293,6 +1353,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1293 */ 1353 */
1294 if (new->thread) 1354 if (new->thread)
1295 wake_up_process(new->thread); 1355 wake_up_process(new->thread);
1356 if (new->secondary)
1357 wake_up_process(new->secondary->thread);
1296 1358
1297 register_irq_proc(irq, desc); 1359 register_irq_proc(irq, desc);
1298 new->dir = NULL; 1360 new->dir = NULL;
@@ -1323,6 +1385,13 @@ out_thread:
1323 kthread_stop(t); 1385 kthread_stop(t);
1324 put_task_struct(t); 1386 put_task_struct(t);
1325 } 1387 }
1388 if (new->secondary && new->secondary->thread) {
1389 struct task_struct *t = new->secondary->thread;
1390
1391 new->secondary->thread = NULL;
1392 kthread_stop(t);
1393 put_task_struct(t);
1394 }
1326out_mput: 1395out_mput:
1327 module_put(desc->owner); 1396 module_put(desc->owner);
1328 return ret; 1397 return ret;
@@ -1394,6 +1463,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1394 1463
1395 /* If this was the last handler, shut down the IRQ line: */ 1464 /* If this was the last handler, shut down the IRQ line: */
1396 if (!desc->action) { 1465 if (!desc->action) {
1466 irq_settings_clr_disable_unlazy(desc);
1397 irq_shutdown(desc); 1467 irq_shutdown(desc);
1398 irq_release_resources(desc); 1468 irq_release_resources(desc);
1399 } 1469 }
@@ -1430,9 +1500,14 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1430 if (action->thread) { 1500 if (action->thread) {
1431 kthread_stop(action->thread); 1501 kthread_stop(action->thread);
1432 put_task_struct(action->thread); 1502 put_task_struct(action->thread);
1503 if (action->secondary && action->secondary->thread) {
1504 kthread_stop(action->secondary->thread);
1505 put_task_struct(action->secondary->thread);
1506 }
1433 } 1507 }
1434 1508
1435 module_put(desc->owner); 1509 module_put(desc->owner);
1510 kfree(action->secondary);
1436 return action; 1511 return action;
1437} 1512}
1438 1513
@@ -1576,8 +1651,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1576 retval = __setup_irq(irq, desc, action); 1651 retval = __setup_irq(irq, desc, action);
1577 chip_bus_sync_unlock(desc); 1652 chip_bus_sync_unlock(desc);
1578 1653
1579 if (retval) 1654 if (retval) {
1655 kfree(action->secondary);
1580 kfree(action); 1656 kfree(action);
1657 }
1581 1658
1582#ifdef CONFIG_DEBUG_SHIRQ_FIXME 1659#ifdef CONFIG_DEBUG_SHIRQ_FIXME
1583 if (!retval && (irqflags & IRQF_SHARED)) { 1660 if (!retval && (irqflags & IRQF_SHARED)) {
@@ -1761,6 +1838,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1761 kfree(__free_percpu_irq(irq, dev_id)); 1838 kfree(__free_percpu_irq(irq, dev_id));
1762 chip_bus_sync_unlock(desc); 1839 chip_bus_sync_unlock(desc);
1763} 1840}
1841EXPORT_SYMBOL_GPL(free_percpu_irq);
1764 1842
1765/** 1843/**
1766 * setup_percpu_irq - setup a per-cpu interrupt 1844 * setup_percpu_irq - setup a per-cpu interrupt
@@ -1790,9 +1868,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1790 * @devname: An ascii name for the claiming device 1868 * @devname: An ascii name for the claiming device
1791 * @dev_id: A percpu cookie passed back to the handler function 1869 * @dev_id: A percpu cookie passed back to the handler function
1792 * 1870 *
1793 * This call allocates interrupt resources, but doesn't 1871 * This call allocates interrupt resources and enables the
1794 * automatically enable the interrupt. It has to be done on each 1872 * interrupt on the local CPU. If the interrupt is supposed to be
1795 * CPU using enable_percpu_irq(). 1873 * enabled on other CPUs, it has to be done on each CPU using
1874 * enable_percpu_irq().
1796 * 1875 *
1797 * Dev_id must be globally unique. It is a per-cpu variable, and 1876 * Dev_id must be globally unique. It is a per-cpu variable, and
1798 * the handler gets called with the interrupted CPU's instance of 1877 * the handler gets called with the interrupted CPU's instance of
@@ -1831,6 +1910,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1831 1910
1832 return retval; 1911 return retval;
1833} 1912}
1913EXPORT_SYMBOL_GPL(request_percpu_irq);
1834 1914
1835/** 1915/**
1836 * irq_get_irqchip_state - returns the irqchip state of a interrupt. 1916 * irq_get_irqchip_state - returns the irqchip state of a interrupt.
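
[Editor's sketch, not part of the patch, matching the updated kernel-doc for request_percpu_irq(): the requesting CPU gets the interrupt enabled implicitly, every other CPU still opts in with enable_percpu_irq(). The per-cpu cookie and names are illustrative.]

    #include <linux/interrupt.h>
    #include <linux/irq.h>
    #include <linux/percpu.h>

    struct example_percpu_state { int count; };
    static DEFINE_PER_CPU(struct example_percpu_state, example_state);

    static irqreturn_t example_percpu_handler(int irq, void *dev_id)
    {
            struct example_percpu_state *st = dev_id; /* this CPU's instance */

            st->count++;
            return IRQ_HANDLED;
    }

    /* Request once: this also enables the interrupt on the local CPU. */
    static int example_setup(unsigned int irq)
    {
            return request_percpu_irq(irq, example_percpu_handler,
                                      "example-percpu", &example_state);
    }

    /* Run on each additional CPU, e.g. from its hotplug/online callback. */
    static void example_enable_this_cpu(unsigned int irq)
    {
            enable_percpu_irq(irq, IRQ_TYPE_NONE);
    }
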
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7e6512b9dc1f..6b0c0b74a2a1 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -228,22 +228,18 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
228{ 228{
229 struct irq_chip *chip = info->chip; 229 struct irq_chip *chip = info->chip;
230 230
231 BUG_ON(!chip); 231 BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
232 if (!chip->irq_mask)
233 chip->irq_mask = pci_msi_mask_irq;
234 if (!chip->irq_unmask)
235 chip->irq_unmask = pci_msi_unmask_irq;
236 if (!chip->irq_set_affinity) 232 if (!chip->irq_set_affinity)
237 chip->irq_set_affinity = msi_domain_set_affinity; 233 chip->irq_set_affinity = msi_domain_set_affinity;
238} 234}
239 235
240/** 236/**
241 * msi_create_irq_domain - Create a MSI interrupt domain 237 * msi_create_irq_domain - Create a MSI interrupt domain
242 * @of_node: Optional device-tree node of the interrupt controller 238 * @fwnode: Optional fwnode of the interrupt controller
243 * @info: MSI domain info 239 * @info: MSI domain info
244 * @parent: Parent irq domain 240 * @parent: Parent irq domain
245 */ 241 */
246struct irq_domain *msi_create_irq_domain(struct device_node *node, 242struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
247 struct msi_domain_info *info, 243 struct msi_domain_info *info,
248 struct irq_domain *parent) 244 struct irq_domain *parent)
249{ 245{
@@ -252,8 +248,8 @@ struct irq_domain *msi_create_irq_domain(struct device_node *node,
252 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) 248 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
253 msi_domain_update_chip_ops(info); 249 msi_domain_update_chip_ops(info);
254 250
255 return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, 251 return irq_domain_create_hierarchy(parent, 0, 0, fwnode,
256 info); 252 &msi_domain_ops, info);
257} 253}
258 254
259/** 255/**
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 21c62617a35a..e80c4400118a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc)
21 desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; 21 desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
22 desc->depth++; 22 desc->depth++;
23 irq_disable(desc); 23 irq_disable(desc);
24 pm_system_wakeup(); 24 pm_system_irq_wakeup(irq_desc_get_irq(desc));
25 return true; 25 return true;
26 } 26 }
27 return false; 27 return false;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index e3a8c9577ba6..a916cf144b65 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/mutex.h>
15 16
16#include "internals.h" 17#include "internals.h"
17 18
@@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
323 324
324void register_irq_proc(unsigned int irq, struct irq_desc *desc) 325void register_irq_proc(unsigned int irq, struct irq_desc *desc)
325{ 326{
327 static DEFINE_MUTEX(register_lock);
326 char name [MAX_NAMELEN]; 328 char name [MAX_NAMELEN];
327 329
328 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) 330 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
329 return; 331 return;
330 332
333 /*
334 * irq directories are registered only when a handler is
335 * added, not when the descriptor is created, so multiple
336 * tasks might try to register at the same time.
337 */
338 mutex_lock(&register_lock);
339
340 if (desc->dir)
341 goto out_unlock;
342
331 memset(name, 0, MAX_NAMELEN); 343 memset(name, 0, MAX_NAMELEN);
332 sprintf(name, "%d", irq); 344 sprintf(name, "%d", irq);
333 345
334 /* create /proc/irq/1234 */ 346 /* create /proc/irq/1234 */
335 desc->dir = proc_mkdir(name, root_irq_dir); 347 desc->dir = proc_mkdir(name, root_irq_dir);
336 if (!desc->dir) 348 if (!desc->dir)
337 return; 349 goto out_unlock;
338 350
339#ifdef CONFIG_SMP 351#ifdef CONFIG_SMP
340 /* create /proc/irq/<irq>/smp_affinity */ 352 /* create /proc/irq/<irq>/smp_affinity */
@@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
355 367
356 proc_create_data("spurious", 0444, desc->dir, 368 proc_create_data("spurious", 0444, desc->dir,
357 &irq_spurious_proc_fops, (void *)(long)irq); 369 &irq_spurious_proc_fops, (void *)(long)irq);
370
371out_unlock:
372 mutex_unlock(&register_lock);
358} 373}
359 374
360void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) 375void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -460,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v)
460 for_each_online_cpu(j) 475 for_each_online_cpu(j)
461 any_count |= kstat_irqs_cpu(i, j); 476 any_count |= kstat_irqs_cpu(i, j);
462 action = desc->action; 477 action = desc->action;
463 if (!action && !any_count) 478 if ((!action || action == &chained_action) && !any_count)
464 goto out; 479 goto out;
465 480
466 seq_printf(p, "%*d: ", prec, i); 481 seq_printf(p, "%*d: ", prec, i);
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 3320b84cc60f..320579d89091 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -15,6 +15,7 @@ enum {
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, 16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
17 _IRQ_IS_POLLED = IRQ_IS_POLLED, 17 _IRQ_IS_POLLED = IRQ_IS_POLLED,
18 _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
18 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 19 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
19}; 20};
20 21
@@ -28,6 +29,7 @@ enum {
28#define IRQ_NESTED_THREAD GOT_YOU_MORON 29#define IRQ_NESTED_THREAD GOT_YOU_MORON
29#define IRQ_PER_CPU_DEVID GOT_YOU_MORON 30#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
30#define IRQ_IS_POLLED GOT_YOU_MORON 31#define IRQ_IS_POLLED GOT_YOU_MORON
32#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
31#undef IRQF_MODIFY_MASK 33#undef IRQF_MODIFY_MASK
32#define IRQF_MODIFY_MASK GOT_YOU_MORON 34#define IRQF_MODIFY_MASK GOT_YOU_MORON
33 35
@@ -154,3 +156,13 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc)
154{ 156{
155 return desc->status_use_accessors & _IRQ_IS_POLLED; 157 return desc->status_use_accessors & _IRQ_IS_POLLED;
156} 158}
159
160static inline bool irq_settings_disable_unlazy(struct irq_desc *desc)
161{
162 return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY;
163}
164
165static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
166{
167 desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
168}
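
[Editor's sketch, not part of the patch, for context on the new _IRQ_DISABLE_UNLAZY accessors: a driver is expected to set the flag with irq_set_status_flags() when it wants the line masked at the hardware immediately instead of via the lazy-disable optimisation; __free_irq() above clears it again when the last handler goes away.]

    #include <linux/irq.h>

    static void example_mark_unlazy(unsigned int irq)
    {
            /* Disable this line in hardware right away rather than waiting
             * for the next (possibly spurious) interrupt to mask it. */
            irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY);
    }
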
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4c5edc357923..d873b64fbddc 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
9#include <linux/capability.h> 11#include <linux/capability.h>
10#include <linux/mm.h> 12#include <linux/mm.h>
11#include <linux/file.h> 13#include <linux/file.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 201b45327804..11b64a63c0f8 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,7 +6,7 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt 9#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 10
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
@@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void)
1027 1027
1028 crash_notes = __alloc_percpu(size, align); 1028 crash_notes = __alloc_percpu(size, align);
1029 if (!crash_notes) { 1029 if (!crash_notes) {
1030 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); 1030 pr_warn("Memory allocation for saving cpu register states failed\n");
1031 return -ENOMEM; 1031 return -ENOMEM;
1032 } 1032 }
1033 return 0; 1033 return 0;
@@ -1149,7 +1149,7 @@ static int __init parse_crashkernel_simple(char *cmdline,
1149 if (*cur == '@') 1149 if (*cur == '@')
1150 *crash_base = memparse(cur+1, &cur); 1150 *crash_base = memparse(cur+1, &cur);
1151 else if (*cur != ' ' && *cur != '\0') { 1151 else if (*cur != ' ' && *cur != '\0') {
1152 pr_warn("crashkernel: unrecognized char\n"); 1152 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1153 return -EINVAL; 1153 return -EINVAL;
1154 } 1154 }
1155 1155
@@ -1186,12 +1186,12 @@ static int __init parse_crashkernel_suffix(char *cmdline,
1186 1186
1187 /* check with suffix */ 1187 /* check with suffix */
1188 if (strncmp(cur, suffix, strlen(suffix))) { 1188 if (strncmp(cur, suffix, strlen(suffix))) {
1189 pr_warn("crashkernel: unrecognized char\n"); 1189 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1190 return -EINVAL; 1190 return -EINVAL;
1191 } 1191 }
1192 cur += strlen(suffix); 1192 cur += strlen(suffix);
1193 if (*cur != ' ' && *cur != '\0') { 1193 if (*cur != ' ' && *cur != '\0') {
1194 pr_warn("crashkernel: unrecognized char\n"); 1194 pr_warn("crashkernel: unrecognized char: %c\n", *cur);
1195 return -EINVAL; 1195 return -EINVAL;
1196 } 1196 }
1197 1197
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 6a9a3f2a0e8e..b70ada0028d2 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -9,6 +9,8 @@
9 * Version 2. See the file COPYING for more details. 9 * Version 2. See the file COPYING for more details.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/capability.h> 14#include <linux/capability.h>
13#include <linux/mm.h> 15#include <linux/mm.h>
14#include <linux/file.h> 16#include <linux/file.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index da98d0593de2..0277d1216f80 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -327,9 +327,13 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
327 call_usermodehelper_exec_sync(sub_info); 327 call_usermodehelper_exec_sync(sub_info);
328 } else { 328 } else {
329 pid_t pid; 329 pid_t pid;
330 330 /*
331 * Use CLONE_PARENT to reparent it to kthreadd; we do not
332 * want to pollute current->children, and we need a parent
333 * that always ignores SIGCHLD to ensure auto-reaping.
334 */
331 pid = kernel_thread(call_usermodehelper_exec_async, sub_info, 335 pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
332 SIGCHLD); 336 CLONE_PARENT | SIGCHLD);
333 if (pid < 0) { 337 if (pid < 0) {
334 sub_info->retval = pid; 338 sub_info->retval = pid;
335 umh_complete(sub_info); 339 umh_complete(sub_info);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8acfbf773e06..deae3907ac1e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2738 return; 2738 return;
2739 2739
2740 /* no reclaim without waiting on it */ 2740 /* no reclaim without waiting on it */
2741 if (!(gfp_mask & __GFP_WAIT)) 2741 if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
2742 return; 2742 return;
2743 2743
2744 /* this guy won't enter reclaim */ 2744 /* this guy won't enter reclaim */
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);
3068static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 3068static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3069 int trylock, int read, int check, int hardirqs_off, 3069 int trylock, int read, int check, int hardirqs_off,
3070 struct lockdep_map *nest_lock, unsigned long ip, 3070 struct lockdep_map *nest_lock, unsigned long ip,
3071 int references) 3071 int references, int pin_count)
3072{ 3072{
3073 struct task_struct *curr = current; 3073 struct task_struct *curr = current;
3074 struct lock_class *class = NULL; 3074 struct lock_class *class = NULL;
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3157 hlock->waittime_stamp = 0; 3157 hlock->waittime_stamp = 0;
3158 hlock->holdtime_stamp = lockstat_clock(); 3158 hlock->holdtime_stamp = lockstat_clock();
3159#endif 3159#endif
3160 hlock->pin_count = 0; 3160 hlock->pin_count = pin_count;
3161 3161
3162 if (check && !mark_irqflags(curr, hlock)) 3162 if (check && !mark_irqflags(curr, hlock))
3163 return 0; 3163 return 0;
@@ -3343,7 +3343,7 @@ found_it:
3343 hlock_class(hlock)->subclass, hlock->trylock, 3343 hlock_class(hlock)->subclass, hlock->trylock,
3344 hlock->read, hlock->check, hlock->hardirqs_off, 3344 hlock->read, hlock->check, hlock->hardirqs_off,
3345 hlock->nest_lock, hlock->acquire_ip, 3345 hlock->nest_lock, hlock->acquire_ip,
3346 hlock->references)) 3346 hlock->references, hlock->pin_count))
3347 return 0; 3347 return 0;
3348 } 3348 }
3349 3349
@@ -3433,7 +3433,7 @@ found_it:
3433 hlock_class(hlock)->subclass, hlock->trylock, 3433 hlock_class(hlock)->subclass, hlock->trylock,
3434 hlock->read, hlock->check, hlock->hardirqs_off, 3434 hlock->read, hlock->check, hlock->hardirqs_off,
3435 hlock->nest_lock, hlock->acquire_ip, 3435 hlock->nest_lock, hlock->acquire_ip,
3436 hlock->references)) 3436 hlock->references, hlock->pin_count))
3437 return 0; 3437 return 0;
3438 } 3438 }
3439 3439
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3583 current->lockdep_recursion = 1; 3583 current->lockdep_recursion = 1;
3584 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); 3584 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
3585 __lock_acquire(lock, subclass, trylock, read, check, 3585 __lock_acquire(lock, subclass, trylock, read, check,
3586 irqs_disabled_flags(flags), nest_lock, ip, 0); 3586 irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
3587 current->lockdep_recursion = 0; 3587 current->lockdep_recursion = 0;
3588 raw_local_irq_restore(flags); 3588 raw_local_irq_restore(flags);
3589} 3589}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 32244186f1f2..8ef1919d63b2 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -17,12 +17,14 @@
17 * 17 *
18 * Copyright (C) IBM Corporation, 2014 18 * Copyright (C) IBM Corporation, 2014
19 * 19 *
20 * Author: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Davidlohr Bueso <dave@stgolabs.net>
21 * Based on kernel/rcu/torture.c. 22 * Based on kernel/rcu/torture.c.
22 */ 23 */
23#include <linux/kernel.h> 24#include <linux/kernel.h>
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/kthread.h> 26#include <linux/kthread.h>
27#include <linux/sched/rt.h>
26#include <linux/spinlock.h> 28#include <linux/spinlock.h>
27#include <linux/rwlock.h> 29#include <linux/rwlock.h>
28#include <linux/mutex.h> 30#include <linux/mutex.h>
@@ -34,6 +36,7 @@
34#include <linux/moduleparam.h> 36#include <linux/moduleparam.h>
35#include <linux/delay.h> 37#include <linux/delay.h>
36#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/percpu-rwsem.h>
37#include <linux/torture.h> 40#include <linux/torture.h>
38 41
39MODULE_LICENSE("GPL"); 42MODULE_LICENSE("GPL");
@@ -91,11 +94,13 @@ struct lock_torture_ops {
91 void (*init)(void); 94 void (*init)(void);
92 int (*writelock)(void); 95 int (*writelock)(void);
93 void (*write_delay)(struct torture_random_state *trsp); 96 void (*write_delay)(struct torture_random_state *trsp);
97 void (*task_boost)(struct torture_random_state *trsp);
94 void (*writeunlock)(void); 98 void (*writeunlock)(void);
95 int (*readlock)(void); 99 int (*readlock)(void);
96 void (*read_delay)(struct torture_random_state *trsp); 100 void (*read_delay)(struct torture_random_state *trsp);
97 void (*readunlock)(void); 101 void (*readunlock)(void);
98 unsigned long flags; 102
103 unsigned long flags; /* for irq spinlocks */
99 const char *name; 104 const char *name;
100}; 105};
101 106
@@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void)
139 /* BUGGY, do not use in real life!!! */ 144 /* BUGGY, do not use in real life!!! */
140} 145}
141 146
147static void torture_boost_dummy(struct torture_random_state *trsp)
148{
149 /* Only rtmutexes care about priority */
150}
151
142static struct lock_torture_ops lock_busted_ops = { 152static struct lock_torture_ops lock_busted_ops = {
143 .writelock = torture_lock_busted_write_lock, 153 .writelock = torture_lock_busted_write_lock,
144 .write_delay = torture_lock_busted_write_delay, 154 .write_delay = torture_lock_busted_write_delay,
155 .task_boost = torture_boost_dummy,
145 .writeunlock = torture_lock_busted_write_unlock, 156 .writeunlock = torture_lock_busted_write_unlock,
146 .readlock = NULL, 157 .readlock = NULL,
147 .read_delay = NULL, 158 .read_delay = NULL,
@@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
185static struct lock_torture_ops spin_lock_ops = { 196static struct lock_torture_ops spin_lock_ops = {
186 .writelock = torture_spin_lock_write_lock, 197 .writelock = torture_spin_lock_write_lock,
187 .write_delay = torture_spin_lock_write_delay, 198 .write_delay = torture_spin_lock_write_delay,
199 .task_boost = torture_boost_dummy,
188 .writeunlock = torture_spin_lock_write_unlock, 200 .writeunlock = torture_spin_lock_write_unlock,
189 .readlock = NULL, 201 .readlock = NULL,
190 .read_delay = NULL, 202 .read_delay = NULL,
@@ -211,6 +223,7 @@ __releases(torture_spinlock)
211static struct lock_torture_ops spin_lock_irq_ops = { 223static struct lock_torture_ops spin_lock_irq_ops = {
212 .writelock = torture_spin_lock_write_lock_irq, 224 .writelock = torture_spin_lock_write_lock_irq,
213 .write_delay = torture_spin_lock_write_delay, 225 .write_delay = torture_spin_lock_write_delay,
226 .task_boost = torture_boost_dummy,
214 .writeunlock = torture_lock_spin_write_unlock_irq, 227 .writeunlock = torture_lock_spin_write_unlock_irq,
215 .readlock = NULL, 228 .readlock = NULL,
216 .read_delay = NULL, 229 .read_delay = NULL,
@@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
275static struct lock_torture_ops rw_lock_ops = { 288static struct lock_torture_ops rw_lock_ops = {
276 .writelock = torture_rwlock_write_lock, 289 .writelock = torture_rwlock_write_lock,
277 .write_delay = torture_rwlock_write_delay, 290 .write_delay = torture_rwlock_write_delay,
291 .task_boost = torture_boost_dummy,
278 .writeunlock = torture_rwlock_write_unlock, 292 .writeunlock = torture_rwlock_write_unlock,
279 .readlock = torture_rwlock_read_lock, 293 .readlock = torture_rwlock_read_lock,
280 .read_delay = torture_rwlock_read_delay, 294 .read_delay = torture_rwlock_read_delay,
@@ -315,6 +329,7 @@ __releases(torture_rwlock)
315static struct lock_torture_ops rw_lock_irq_ops = { 329static struct lock_torture_ops rw_lock_irq_ops = {
316 .writelock = torture_rwlock_write_lock_irq, 330 .writelock = torture_rwlock_write_lock_irq,
317 .write_delay = torture_rwlock_write_delay, 331 .write_delay = torture_rwlock_write_delay,
332 .task_boost = torture_boost_dummy,
318 .writeunlock = torture_rwlock_write_unlock_irq, 333 .writeunlock = torture_rwlock_write_unlock_irq,
319 .readlock = torture_rwlock_read_lock_irq, 334 .readlock = torture_rwlock_read_lock_irq,
320 .read_delay = torture_rwlock_read_delay, 335 .read_delay = torture_rwlock_read_delay,
@@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex)
354static struct lock_torture_ops mutex_lock_ops = { 369static struct lock_torture_ops mutex_lock_ops = {
355 .writelock = torture_mutex_lock, 370 .writelock = torture_mutex_lock,
356 .write_delay = torture_mutex_delay, 371 .write_delay = torture_mutex_delay,
372 .task_boost = torture_boost_dummy,
357 .writeunlock = torture_mutex_unlock, 373 .writeunlock = torture_mutex_unlock,
358 .readlock = NULL, 374 .readlock = NULL,
359 .read_delay = NULL, 375 .read_delay = NULL,
@@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = {
361 .name = "mutex_lock" 377 .name = "mutex_lock"
362}; 378};
363 379
380#ifdef CONFIG_RT_MUTEXES
381static DEFINE_RT_MUTEX(torture_rtmutex);
382
383static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
384{
385 rt_mutex_lock(&torture_rtmutex);
386 return 0;
387}
388
389static void torture_rtmutex_boost(struct torture_random_state *trsp)
390{
391 int policy;
392 struct sched_param param;
393 const unsigned int factor = 50000; /* yes, quite arbitrary */
394
395 if (!rt_task(current)) {
396 /*
397 * (1) Boost priority once every ~50k operations. When the
 398 * task tries to take the lock, the rtmutex will account
399 * for the new priority, and do any corresponding pi-dance.
400 */
401 if (!(torture_random(trsp) %
402 (cxt.nrealwriters_stress * factor))) {
403 policy = SCHED_FIFO;
404 param.sched_priority = MAX_RT_PRIO - 1;
405 } else /* common case, do nothing */
406 return;
407 } else {
408 /*
409 * The task will remain boosted for another ~500k operations,
410 * then restored back to its original prio, and so forth.
411 *
412 * When @trsp is nil, we want to force-reset the task for
413 * stopping the kthread.
414 */
415 if (!trsp || !(torture_random(trsp) %
416 (cxt.nrealwriters_stress * factor * 2))) {
417 policy = SCHED_NORMAL;
418 param.sched_priority = 0;
419 } else /* common case, do nothing */
420 return;
421 }
422
423 sched_setscheduler_nocheck(current, policy, &param);
424}
425
426static void torture_rtmutex_delay(struct torture_random_state *trsp)
427{
428 const unsigned long shortdelay_us = 2;
429 const unsigned long longdelay_ms = 100;
430
431 /*
432 * We want a short delay mostly to emulate likely code, and
433 * we want a long delay occasionally to force massive contention.
434 */
435 if (!(torture_random(trsp) %
436 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
437 mdelay(longdelay_ms);
438 if (!(torture_random(trsp) %
439 (cxt.nrealwriters_stress * 2 * shortdelay_us)))
440 udelay(shortdelay_us);
441#ifdef CONFIG_PREEMPT
442 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
443 preempt_schedule(); /* Allow test to be preempted. */
444#endif
445}
446
447static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
448{
449 rt_mutex_unlock(&torture_rtmutex);
450}
451
452static struct lock_torture_ops rtmutex_lock_ops = {
453 .writelock = torture_rtmutex_lock,
454 .write_delay = torture_rtmutex_delay,
455 .task_boost = torture_rtmutex_boost,
456 .writeunlock = torture_rtmutex_unlock,
457 .readlock = NULL,
458 .read_delay = NULL,
459 .readunlock = NULL,
460 .name = "rtmutex_lock"
461};
462#endif
463
364static DECLARE_RWSEM(torture_rwsem); 464static DECLARE_RWSEM(torture_rwsem);
365static int torture_rwsem_down_write(void) __acquires(torture_rwsem) 465static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
366{ 466{
@@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
419static struct lock_torture_ops rwsem_lock_ops = { 519static struct lock_torture_ops rwsem_lock_ops = {
420 .writelock = torture_rwsem_down_write, 520 .writelock = torture_rwsem_down_write,
421 .write_delay = torture_rwsem_write_delay, 521 .write_delay = torture_rwsem_write_delay,
522 .task_boost = torture_boost_dummy,
422 .writeunlock = torture_rwsem_up_write, 523 .writeunlock = torture_rwsem_up_write,
423 .readlock = torture_rwsem_down_read, 524 .readlock = torture_rwsem_down_read,
424 .read_delay = torture_rwsem_read_delay, 525 .read_delay = torture_rwsem_read_delay,
@@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = {
426 .name = "rwsem_lock" 527 .name = "rwsem_lock"
427}; 528};
428 529
530#include <linux/percpu-rwsem.h>
531static struct percpu_rw_semaphore pcpu_rwsem;
532
533void torture_percpu_rwsem_init(void)
534{
535 BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
536}
537
538static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
539{
540 percpu_down_write(&pcpu_rwsem);
541 return 0;
542}
543
544static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
545{
546 percpu_up_write(&pcpu_rwsem);
547}
548
549static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
550{
551 percpu_down_read(&pcpu_rwsem);
552 return 0;
553}
554
555static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
556{
557 percpu_up_read(&pcpu_rwsem);
558}
559
560static struct lock_torture_ops percpu_rwsem_lock_ops = {
561 .init = torture_percpu_rwsem_init,
562 .writelock = torture_percpu_rwsem_down_write,
563 .write_delay = torture_rwsem_write_delay,
564 .task_boost = torture_boost_dummy,
565 .writeunlock = torture_percpu_rwsem_up_write,
566 .readlock = torture_percpu_rwsem_down_read,
567 .read_delay = torture_rwsem_read_delay,
568 .readunlock = torture_percpu_rwsem_up_read,
569 .name = "percpu_rwsem_lock"
570};
571
429/* 572/*
430 * Lock torture writer kthread. Repeatedly acquires and releases 573 * Lock torture writer kthread. Repeatedly acquires and releases
431 * the lock, checking for duplicate acquisitions. 574 * the lock, checking for duplicate acquisitions.
@@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg)
442 if ((torture_random(&rand) & 0xfffff) == 0) 585 if ((torture_random(&rand) & 0xfffff) == 0)
443 schedule_timeout_uninterruptible(1); 586 schedule_timeout_uninterruptible(1);
444 587
588 cxt.cur_ops->task_boost(&rand);
445 cxt.cur_ops->writelock(); 589 cxt.cur_ops->writelock();
446 if (WARN_ON_ONCE(lock_is_write_held)) 590 if (WARN_ON_ONCE(lock_is_write_held))
447 lwsp->n_lock_fail++; 591 lwsp->n_lock_fail++;
@@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg)
456 600
457 stutter_wait("lock_torture_writer"); 601 stutter_wait("lock_torture_writer");
458 } while (!torture_must_stop()); 602 } while (!torture_must_stop());
603
604 cxt.cur_ops->task_boost(NULL); /* reset prio */
459 torture_kthread_stopping("lock_torture_writer"); 605 torture_kthread_stopping("lock_torture_writer");
460 return 0; 606 return 0;
461} 607}
@@ -642,7 +788,11 @@ static int __init lock_torture_init(void)
642 &spin_lock_ops, &spin_lock_irq_ops, 788 &spin_lock_ops, &spin_lock_irq_ops,
643 &rw_lock_ops, &rw_lock_irq_ops, 789 &rw_lock_ops, &rw_lock_irq_ops,
644 &mutex_lock_ops, 790 &mutex_lock_ops,
791#ifdef CONFIG_RT_MUTEXES
792 &rtmutex_lock_ops,
793#endif
645 &rwsem_lock_ops, 794 &rwsem_lock_ops,
795 &percpu_rwsem_lock_ops,
646 }; 796 };
647 797
648 if (!torture_init_begin(torture_type, verbose, &torture_runnable)) 798 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -661,11 +811,11 @@ static int __init lock_torture_init(void)
661 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 811 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
662 pr_alert(" %s", torture_ops[i]->name); 812 pr_alert(" %s", torture_ops[i]->name);
663 pr_alert("\n"); 813 pr_alert("\n");
664 torture_init_end(); 814 firsterr = -EINVAL;
665 return -EINVAL; 815 goto unwind;
666 } 816 }
667 if (cxt.cur_ops->init) 817 if (cxt.cur_ops->init)
668 cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 818 cxt.cur_ops->init();
669 819
670 if (nwriters_stress >= 0) 820 if (nwriters_stress >= 0)
671 cxt.nrealwriters_stress = nwriters_stress; 821 cxt.nrealwriters_stress = nwriters_stress;
@@ -676,6 +826,10 @@ static int __init lock_torture_init(void)
676 if (strncmp(torture_type, "mutex", 5) == 0) 826 if (strncmp(torture_type, "mutex", 5) == 0)
677 cxt.debug_lock = true; 827 cxt.debug_lock = true;
678#endif 828#endif
829#ifdef CONFIG_DEBUG_RT_MUTEXES
830 if (strncmp(torture_type, "rtmutex", 7) == 0)
831 cxt.debug_lock = true;
832#endif
679#ifdef CONFIG_DEBUG_SPINLOCK 833#ifdef CONFIG_DEBUG_SPINLOCK
680 if ((strncmp(torture_type, "spin", 4) == 0) || 834 if ((strncmp(torture_type, "spin", 4) == 0) ||
681 (strncmp(torture_type, "rw_lock", 7) == 0)) 835 (strncmp(torture_type, "rw_lock", 7) == 0))
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index fd91aaa4554c..5b9102a47ea5 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
67 node->locked = 0; 67 node->locked = 0;
68 node->next = NULL; 68 node->next = NULL;
69 69
70 prev = xchg(lock, node); 70 prev = xchg_acquire(lock, node);
71 if (likely(prev == NULL)) { 71 if (likely(prev == NULL)) {
72 /* 72 /*
73 * Lock acquired, don't need to set node->locked to 1. Threads 73 * Lock acquired, don't need to set node->locked to 1. Threads
@@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
98 /* 98 /*
99 * Release the lock by setting it to NULL 99 * Release the lock by setting it to NULL
100 */ 100 */
101 if (likely(cmpxchg(lock, node, NULL) == node)) 101 if (likely(cmpxchg_release(lock, node, NULL) == node))
102 return; 102 return;
103 /* Wait until the next pointer is set */ 103 /* Wait until the next pointer is set */
104 while (!(next = READ_ONCE(node->next))) 104 while (!(next = READ_ONCE(node->next)))
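
[Editor's sketch, not part of the patch: the mcs_spinlock change is purely about memory ordering, since lock acquisition only needs ACQUIRE semantics and unlock only needs RELEASE, so the full-barrier xchg()/cmpxchg() can be relaxed. A self-contained illustration of the same pairing on a trivial test-and-set lock (not the MCS algorithm itself), using the acquire/release atomic variants the series relies on.]

    #include <linux/atomic.h>
    #include <linux/processor.h>

    struct tas_lock {
            atomic_t locked;        /* 0 = free, 1 = held */
    };

    static inline void tas_lock(struct tas_lock *l)
    {
            /* ACQUIRE: accesses in the critical section cannot move before this. */
            while (atomic_xchg_acquire(&l->locked, 1))
                    cpu_relax();
    }

    static inline void tas_unlock(struct tas_lock *l)
    {
            /* RELEASE: work done under the lock is visible to the next acquirer;
             * mirrors the cmpxchg_release() in mcs_spin_unlock(). */
            atomic_cmpxchg_release(&l->locked, 1, 0);
    }
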
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6b8934..0551c219c40e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
277static inline bool mutex_try_to_acquire(struct mutex *lock) 277static inline bool mutex_try_to_acquire(struct mutex *lock)
278{ 278{
279 return !mutex_is_locked(lock) && 279 return !mutex_is_locked(lock) &&
280 (atomic_cmpxchg(&lock->count, 1, 0) == 1); 280 (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
281} 281}
282 282
283/* 283/*
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
529 * Once more, try to acquire the lock. Only try-lock the mutex if 529 * Once more, try to acquire the lock. Only try-lock the mutex if
530 * it is unlocked to reduce unnecessary xchg() operations. 530 * it is unlocked to reduce unnecessary xchg() operations.
531 */ 531 */
532 if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1)) 532 if (!mutex_is_locked(lock) &&
533 (atomic_xchg_acquire(&lock->count, 0) == 1))
533 goto skip_wait; 534 goto skip_wait;
534 535
535 debug_mutex_lock_common(lock, &waiter); 536 debug_mutex_lock_common(lock, &waiter);
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
553 * non-negative in order to avoid unnecessary xchg operations: 554 * non-negative in order to avoid unnecessary xchg operations:
554 */ 555 */
555 if (atomic_read(&lock->count) >= 0 && 556 if (atomic_read(&lock->count) >= 0 &&
556 (atomic_xchg(&lock->count, -1) == 1)) 557 (atomic_xchg_acquire(&lock->count, -1) == 1))
557 break; 558 break;
558 559
559 /* 560 /*
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
867 868
868 spin_lock_mutex(&lock->wait_lock, flags); 869 spin_lock_mutex(&lock->wait_lock, flags);
869 870
870 prev = atomic_xchg(&lock->count, -1); 871 prev = atomic_xchg_acquire(&lock->count, -1);
871 if (likely(prev == 1)) { 872 if (likely(prev == 1)) {
872 mutex_set_owner(lock); 873 mutex_set_owner(lock);
873 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); 874 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index dc85ee23a26f..d092a0c9c2d4 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
50 50
51 for (;;) { 51 for (;;) {
52 if (atomic_read(&lock->tail) == curr && 52 if (atomic_read(&lock->tail) == curr &&
53 atomic_cmpxchg(&lock->tail, curr, old) == curr) { 53 atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
54 /* 54 /*
55 * We were the last queued, we moved @lock back. @prev 55 * We were the last queued, we moved @lock back. @prev
56 * will now observe @lock and will complete its 56 * will now observe @lock and will complete its
@@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
92 node->next = NULL; 92 node->next = NULL;
93 node->cpu = curr; 93 node->cpu = curr;
94 94
95 old = atomic_xchg(&lock->tail, curr); 95 /*
96 * ACQUIRE semantics, pairs with corresponding RELEASE
97 * in unlock() uncontended, or fastpath.
98 */
99 old = atomic_xchg_acquire(&lock->tail, curr);
96 if (old == OSQ_UNLOCKED_VAL) 100 if (old == OSQ_UNLOCKED_VAL)
97 return true; 101 return true;
98 102
@@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock)
184 /* 188 /*
185 * Fast path for the uncontended case. 189 * Fast path for the uncontended case.
186 */ 190 */
187 if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) 191 if (likely(atomic_cmpxchg_release(&lock->tail, curr,
192 OSQ_UNLOCKED_VAL) == curr))
188 return; 193 return;
189 194
190 /* 195 /*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f32567254867..f231e0bb311c 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
17 17
18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ 18 /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
19 __init_rwsem(&brw->rw_sem, name, rwsem_key); 19 __init_rwsem(&brw->rw_sem, name, rwsem_key);
20 atomic_set(&brw->write_ctr, 0); 20 rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
21 atomic_set(&brw->slow_read_ctr, 0); 21 atomic_set(&brw->slow_read_ctr, 0);
22 init_waitqueue_head(&brw->write_waitq); 22 init_waitqueue_head(&brw->write_waitq);
23 return 0; 23 return 0;
24} 24}
25EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
25 26
26void percpu_free_rwsem(struct percpu_rw_semaphore *brw) 27void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
27{ 28{
29 /*
30 * XXX: temporary kludge. The error path in alloc_super()
31 * assumes that percpu_free_rwsem() is safe after kzalloc().
32 */
33 if (!brw->fast_read_ctr)
34 return;
35
36 rcu_sync_dtor(&brw->rss);
28 free_percpu(brw->fast_read_ctr); 37 free_percpu(brw->fast_read_ctr);
29 brw->fast_read_ctr = NULL; /* catch use after free bugs */ 38 brw->fast_read_ctr = NULL; /* catch use after free bugs */
30} 39}
31 40
32/* 41/*
33 * This is the fast-path for down_read/up_read, it only needs to ensure 42 * This is the fast-path for down_read/up_read. If it succeeds we rely
34 * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the 43 * on the barriers provided by rcu_sync_enter/exit; see the comments in
35 * fast per-cpu counter. The writer uses synchronize_sched_expedited() to 44 * percpu_down_write() and percpu_up_write().
36 * serialize with the preempt-disabled section below.
37 *
38 * The nontrivial part is that we should guarantee acquire/release semantics
39 * in case when
40 *
41 * R_W: down_write() comes after up_read(), the writer should see all
42 * changes done by the reader
43 * or
44 * W_R: down_read() comes after up_write(), the reader should see all
45 * changes done by the writer
46 * 45 *
47 * If this helper fails the callers rely on the normal rw_semaphore and 46 * If this helper fails the callers rely on the normal rw_semaphore and
48 * atomic_dec_and_test(), so in this case we have the necessary barriers. 47 * atomic_dec_and_test(), so in this case we have the necessary barriers.
49 *
50 * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
51 * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
52 * reader inside the critical section. See the comments in down_write and
53 * up_write below.
54 */ 48 */
55static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) 49static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
56{ 50{
57 bool success = false; 51 bool success;
58 52
59 preempt_disable(); 53 preempt_disable();
60 if (likely(!atomic_read(&brw->write_ctr))) { 54 success = rcu_sync_is_idle(&brw->rss);
55 if (likely(success))
61 __this_cpu_add(*brw->fast_read_ctr, val); 56 __this_cpu_add(*brw->fast_read_ctr, val);
62 success = true;
63 }
64 preempt_enable(); 57 preempt_enable();
65 58
66 return success; 59 return success;
@@ -77,16 +70,17 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
77void percpu_down_read(struct percpu_rw_semaphore *brw) 70void percpu_down_read(struct percpu_rw_semaphore *brw)
78{ 71{
79 might_sleep(); 72 might_sleep();
80 if (likely(update_fast_ctr(brw, +1))) { 73 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
81 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); 74
75 if (likely(update_fast_ctr(brw, +1)))
82 return; 76 return;
83 }
84 77
85 down_read(&brw->rw_sem); 78 /* Avoid rwsem_acquire_read() and rwsem_release() */
79 __down_read(&brw->rw_sem);
86 atomic_inc(&brw->slow_read_ctr); 80 atomic_inc(&brw->slow_read_ctr);
87 /* avoid up_read()->rwsem_release() */
88 __up_read(&brw->rw_sem); 81 __up_read(&brw->rw_sem);
89} 82}
83EXPORT_SYMBOL_GPL(percpu_down_read);
90 84
91int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) 85int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
92{ 86{
@@ -112,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw)
112 if (atomic_dec_and_test(&brw->slow_read_ctr)) 106 if (atomic_dec_and_test(&brw->slow_read_ctr))
113 wake_up_all(&brw->write_waitq); 107 wake_up_all(&brw->write_waitq);
114} 108}
109EXPORT_SYMBOL_GPL(percpu_up_read);
115 110
116static int clear_fast_ctr(struct percpu_rw_semaphore *brw) 111static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
117{ 112{
@@ -126,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
126 return sum; 121 return sum;
127} 122}
128 123
129/*
130 * A writer increments ->write_ctr to force the readers to switch to the
131 * slow mode, note the atomic_read() check in update_fast_ctr().
132 *
133 * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
134 * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
135 * counter it represents the number of active readers.
136 *
137 * Finally the writer takes ->rw_sem for writing and blocks the new readers,
138 * then waits until the slow counter becomes zero.
139 */
140void percpu_down_write(struct percpu_rw_semaphore *brw) 124void percpu_down_write(struct percpu_rw_semaphore *brw)
141{ 125{
142 /* tell update_fast_ctr() there is a pending writer */
143 atomic_inc(&brw->write_ctr);
144 /* 126 /*
145 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read 127 * Make rcu_sync_is_idle() == F and thus disable the fast-path in
146 * so that update_fast_ctr() can't succeed. 128 * percpu_down_read() and percpu_up_read(), and wait for gp pass.
147 *
148 * 2. Ensures we see the result of every previous this_cpu_add() in
149 * update_fast_ctr().
150 * 129 *
151 * 3. Ensures that if any reader has exited its critical section via 130 * The latter synchronises us with the preceding readers which used
 152 * fast-path, it executes a full memory barrier before we return. 131 * the fast-path, so we cannot miss the result of __this_cpu_add()
 153 * See R_W case in the comment above update_fast_ctr(). 132 * or anything else inside their critical sections.
154 */ 133 */
155 synchronize_sched_expedited(); 134 rcu_sync_enter(&brw->rss);
156 135
157 /* exclude other writers, and block the new readers completely */ 136 /* exclude other writers, and block the new readers completely */
158 down_write(&brw->rw_sem); 137 down_write(&brw->rw_sem);
@@ -163,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw)
163 /* wait for all readers to complete their percpu_up_read() */ 142 /* wait for all readers to complete their percpu_up_read() */
164 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); 143 wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
165} 144}
145EXPORT_SYMBOL_GPL(percpu_down_write);
166 146
167void percpu_up_write(struct percpu_rw_semaphore *brw) 147void percpu_up_write(struct percpu_rw_semaphore *brw)
168{ 148{
169 /* release the lock, but the readers can't use the fast-path */ 149 /* release the lock, but the readers can't use the fast-path */
170 up_write(&brw->rw_sem); 150 up_write(&brw->rw_sem);
171 /* 151 /*
172 * Insert the barrier before the next fast-path in down_read, 152 * Enable the fast-path in percpu_down_read() and percpu_up_read()
173 * see W_R case in the comment above update_fast_ctr(). 153 * but only after another gp pass; this adds the necessary barrier
154 * to ensure the reader can't miss the changes done by us.
174 */ 155 */
175 synchronize_sched_expedited(); 156 rcu_sync_exit(&brw->rss);
176 /* the last writer unblocks update_fast_ctr() */
177 atomic_dec(&brw->write_ctr);
178} 157}
158EXPORT_SYMBOL_GPL(percpu_up_write);
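
The rewritten reader side above hinges on a single question: has a writer forced readers off the per-CPU fast path? In the kernel that question is answered by rcu_sync_is_idle() plus a grace period; the userspace sketch below replaces that machinery with one acquire-loaded flag and a per-thread counter, purely to show the fast/slow dispatch shape. All names here are illustrative, and the writer's folding of the fast counters into slow_read_ctr is omitted.

/* Toy model of the percpu_down_read()/percpu_up_read() split above.
 * "writer_pending" stands in for !rcu_sync_is_idle(); the real code
 * relies on RCU grace periods rather than this acquire/release flag,
 * and the writer additionally folds the per-CPU counters into
 * slow_read_ctr, which this sketch leaves out. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool writer_pending;            /* models the rcu_sync state  */
static _Thread_local long fast_read_ctr;      /* models *brw->fast_read_ctr */
static atomic_long slow_read_ctr;             /* models brw->slow_read_ctr  */

static bool update_fast_ctr_model(int val)
{
	if (atomic_load_explicit(&writer_pending, memory_order_acquire))
		return false;                 /* writer active: use slow path */
	fast_read_ctr += val;
	return true;
}

static void down_read_model(void)
{
	if (update_fast_ctr_model(+1))
		return;
	atomic_fetch_add(&slow_read_ctr, 1);  /* slow path the writer waits on */
}

static void up_read_model(void)
{
	if (update_fast_ctr_model(-1))
		return;
	atomic_fetch_sub(&slow_read_ctr, 1);
}

int main(void)
{
	down_read_model();                    /* fast path                   */
	up_read_model();
	atomic_store(&writer_pending, true);  /* rcu_sync_enter() analogue   */
	down_read_model();                    /* slow path                   */
	up_read_model();
	atomic_store(&writer_pending, false); /* rcu_sync_exit() analogue    */
	printf("slow_read_ctr = %ld\n", atomic_load(&slow_read_ctr));
	return 0;
}
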
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index f17a3e3b3550..fec082338668 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -86,7 +86,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
86 /* 86 /*
87 * Put the reader into the wait queue 87 * Put the reader into the wait queue
88 */ 88 */
89 arch_spin_lock(&lock->lock); 89 arch_spin_lock(&lock->wait_lock);
90 90
91 /* 91 /*
92 * The ACQUIRE semantics of the following spinning code ensure 92 * The ACQUIRE semantics of the following spinning code ensure
@@ -99,7 +99,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
99 /* 99 /*
100 * Signal the next one in queue to become queue head 100 * Signal the next one in queue to become queue head
101 */ 101 */
102 arch_spin_unlock(&lock->lock); 102 arch_spin_unlock(&lock->wait_lock);
103} 103}
104EXPORT_SYMBOL(queued_read_lock_slowpath); 104EXPORT_SYMBOL(queued_read_lock_slowpath);
105 105
@@ -112,7 +112,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
112 u32 cnts; 112 u32 cnts;
113 113
114 /* Put the writer into the wait queue */ 114 /* Put the writer into the wait queue */
115 arch_spin_lock(&lock->lock); 115 arch_spin_lock(&lock->wait_lock);
116 116
117 /* Try to acquire the lock directly if no reader is present */ 117 /* Try to acquire the lock directly if no reader is present */
118 if (!atomic_read(&lock->cnts) && 118 if (!atomic_read(&lock->cnts) &&
@@ -144,6 +144,6 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
144 cpu_relax_lowlatency(); 144 cpu_relax_lowlatency();
145 } 145 }
146unlock: 146unlock:
147 arch_spin_unlock(&lock->lock); 147 arch_spin_unlock(&lock->wait_lock);
148} 148}
149EXPORT_SYMBOL(queued_write_lock_slowpath); 149EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index c8e6e9a596f5..f0450ff4829b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -267,7 +267,6 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
267 } 267 }
268 268
269 if (!lp) { /* ONCE */ 269 if (!lp) { /* ONCE */
270 WRITE_ONCE(pn->state, vcpu_hashed);
271 lp = pv_hash(lock, pn); 270 lp = pv_hash(lock, pn);
272 271
273 /* 272 /*
@@ -275,11 +274,9 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
275 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() 274 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
276 * we'll be sure to be able to observe our hash entry. 275 * we'll be sure to be able to observe our hash entry.
277 * 276 *
278 * [S] pn->state
279 * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL 277 * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
280 * MB RMB 278 * MB RMB
281 * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash> 279 * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
282 * [L] pn->state
283 * 280 *
284 * Matches the smp_rmb() in __pv_queued_spin_unlock(). 281 * Matches the smp_rmb() in __pv_queued_spin_unlock().
285 */ 282 */
@@ -364,8 +361,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
364 * vCPU is harmless other than the additional latency in completing 361 * vCPU is harmless other than the additional latency in completing
365 * the unlock. 362 * the unlock.
366 */ 363 */
367 if (READ_ONCE(node->state) == vcpu_hashed) 364 pv_kick(node->cpu);
368 pv_kick(node->cpu);
369} 365}
370/* 366/*
371 * Include the architecture specific callee-save thunk of the 367 * Include the architecture specific callee-save thunk of the
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 7781d801212f..8251e75dd9c0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
74 * set up. 74 * set up.
75 */ 75 */
76#ifndef CONFIG_DEBUG_RT_MUTEXES 76#ifndef CONFIG_DEBUG_RT_MUTEXES
77# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) 77# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
78# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
79# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
80
81/*
82 * Callers must hold the ->wait_lock -- which is the whole purpose as we force
83 * all future threads that attempt to [Rmw] the lock to the slowpath. As such
84 * relaxed semantics suffice.
85 */
78static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 86static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
79{ 87{
80 unsigned long owner, *p = (unsigned long *) &lock->owner; 88 unsigned long owner, *p = (unsigned long *) &lock->owner;
81 89
82 do { 90 do {
83 owner = *p; 91 owner = *p;
84 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); 92 } while (cmpxchg_relaxed(p, owner,
93 owner | RT_MUTEX_HAS_WAITERS) != owner);
85} 94}
86 95
87/* 96/*
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
121 * lock(wait_lock); 130 * lock(wait_lock);
122 * acquire(lock); 131 * acquire(lock);
123 */ 132 */
124 return rt_mutex_cmpxchg(lock, owner, NULL); 133 return rt_mutex_cmpxchg_release(lock, owner, NULL);
125} 134}
126 135
127#else 136#else
128# define rt_mutex_cmpxchg(l,c,n) (0) 137# define rt_mutex_cmpxchg_relaxed(l,c,n) (0)
138# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
139# define rt_mutex_cmpxchg_release(l,c,n) (0)
140
129static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) 141static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
130{ 142{
131 lock->owner = (struct task_struct *) 143 lock->owner = (struct task_struct *)
@@ -158,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
158 * then right waiter has a dl_prio() too. 170 * then right waiter has a dl_prio() too.
159 */ 171 */
160 if (dl_prio(left->prio)) 172 if (dl_prio(left->prio))
161 return (left->task->dl.deadline < right->task->dl.deadline); 173 return dl_time_before(left->task->dl.deadline,
174 right->task->dl.deadline);
162 175
163 return 0; 176 return 0;
164} 177}
@@ -1321,7 +1334,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
1321 struct hrtimer_sleeper *timeout, 1334 struct hrtimer_sleeper *timeout,
1322 enum rtmutex_chainwalk chwalk)) 1335 enum rtmutex_chainwalk chwalk))
1323{ 1336{
1324 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1337 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
1325 rt_mutex_deadlock_account_lock(lock, current); 1338 rt_mutex_deadlock_account_lock(lock, current);
1326 return 0; 1339 return 0;
1327 } else 1340 } else
@@ -1337,7 +1350,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
1337 enum rtmutex_chainwalk chwalk)) 1350 enum rtmutex_chainwalk chwalk))
1338{ 1351{
1339 if (chwalk == RT_MUTEX_MIN_CHAINWALK && 1352 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
1340 likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1353 likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
1341 rt_mutex_deadlock_account_lock(lock, current); 1354 rt_mutex_deadlock_account_lock(lock, current);
1342 return 0; 1355 return 0;
1343 } else 1356 } else
@@ -1348,7 +1361,7 @@ static inline int
1348rt_mutex_fasttrylock(struct rt_mutex *lock, 1361rt_mutex_fasttrylock(struct rt_mutex *lock,
1349 int (*slowfn)(struct rt_mutex *lock)) 1362 int (*slowfn)(struct rt_mutex *lock))
1350{ 1363{
1351 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { 1364 if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
1352 rt_mutex_deadlock_account_lock(lock, current); 1365 rt_mutex_deadlock_account_lock(lock, current);
1353 return 1; 1366 return 1;
1354 } 1367 }
@@ -1362,7 +1375,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
1362{ 1375{
1363 WAKE_Q(wake_q); 1376 WAKE_Q(wake_q);
1364 1377
1365 if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { 1378 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
1366 rt_mutex_deadlock_account_unlock(current); 1379 rt_mutex_deadlock_account_unlock(current);
1367 1380
1368 } else { 1381 } else {
@@ -1484,7 +1497,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
1484bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, 1497bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
1485 struct wake_q_head *wqh) 1498 struct wake_q_head *wqh)
1486{ 1499{
1487 if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { 1500 if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
1488 rt_mutex_deadlock_account_unlock(current); 1501 rt_mutex_deadlock_account_unlock(current);
1489 return false; 1502 return false;
1490 } 1503 }
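
The rtmutex hunks above replace the bare cmpxchg() with explicitly ordered variants: acquire when taking the lock, release when dropping it, and relaxed where ->wait_lock already provides the ordering. The C11 sketch below shows the same idiom on a bare owner pointer; it illustrates the ordering choice and is not the kernel's rt_mutex.

/* Acquire-on-lock / release-on-unlock cmpxchg idiom, mirroring
 * rt_mutex_cmpxchg_acquire()/rt_mutex_cmpxchg_release() above. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_lock {
	_Atomic(void *) owner;	/* NULL means "unlocked" */
};

static bool toy_trylock(struct toy_lock *l, void *me)
{
	void *expected = NULL;

	/* Acquire: the critical section cannot be reordered before this. */
	return atomic_compare_exchange_strong_explicit(&l->owner, &expected, me,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static bool toy_tryunlock(struct toy_lock *l, void *me)
{
	void *expected = me;

	/* Release: everything done under the lock is visible to the next
	 * thread whose acquire-cmpxchg succeeds. */
	return atomic_compare_exchange_strong_explicit(&l->owner, &expected, NULL,
						       memory_order_release,
						       memory_order_relaxed);
}

int main(void)
{
	struct toy_lock l = { .owner = NULL };
	int me;

	printf("lock: %d, unlock: %d\n", toy_trylock(&l, &me), toy_tryunlock(&l, &me));
	return 0;
}
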
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0f189714e457..a4d4de05b2d1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
262 * to reduce unnecessary expensive cmpxchg() operations. 262 * to reduce unnecessary expensive cmpxchg() operations.
263 */ 263 */
264 if (count == RWSEM_WAITING_BIAS && 264 if (count == RWSEM_WAITING_BIAS &&
265 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, 265 cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 266 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
267 if (!list_is_singular(&sem->wait_list)) 267 if (!list_is_singular(&sem->wait_list))
268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 268 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
285 if (!(count == 0 || count == RWSEM_WAITING_BIAS)) 285 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
286 return false; 286 return false;
287 287
288 old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); 288 old = cmpxchg_acquire(&sem->count, count,
289 count + RWSEM_ACTIVE_WRITE_BIAS);
289 if (old == count) { 290 if (old == count) {
290 rwsem_set_owner(sem); 291 rwsem_set_owner(sem);
291 return true; 292 return true;
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 72b0c66628b6..7658d32c5c78 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -24,6 +24,16 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
24} 24}
25#endif 25#endif
26 26
27static void *try_ram_remap(resource_size_t offset, size_t size)
28{
29 struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
30
31 /* In the simple case just return the existing linear address */
32 if (!PageHighMem(page))
33 return __va(offset);
34 return NULL; /* fallback to ioremap_cache */
35}
36
27/** 37/**
28 * memremap() - remap an iomem_resource as cacheable memory 38 * memremap() - remap an iomem_resource as cacheable memory
29 * @offset: iomem resource start address 39 * @offset: iomem resource start address
@@ -66,8 +76,8 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
66 * the requested range is potentially in "System RAM" 76 * the requested range is potentially in "System RAM"
67 */ 77 */
68 if (is_ram == REGION_INTERSECTS) 78 if (is_ram == REGION_INTERSECTS)
69 addr = __va(offset); 79 addr = try_ram_remap(offset, size);
70 else 80 if (!addr)
71 addr = ioremap_cache(offset, size); 81 addr = ioremap_cache(offset, size);
72 } 82 }
73 83
@@ -114,9 +124,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
114{ 124{
115 void **ptr, *addr; 125 void **ptr, *addr;
116 126
117 ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); 127 ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
128 dev_to_node(dev));
118 if (!ptr) 129 if (!ptr)
119 return NULL; 130 return ERR_PTR(-ENOMEM);
120 131
121 addr = memremap(offset, size, flags); 132 addr = memremap(offset, size, flags);
122 if (addr) { 133 if (addr) {
@@ -131,9 +142,8 @@ EXPORT_SYMBOL(devm_memremap);
131 142
132void devm_memunmap(struct device *dev, void *addr) 143void devm_memunmap(struct device *dev, void *addr)
133{ 144{
134 WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, 145 WARN_ON(devres_release(dev, devm_memremap_release,
135 addr)); 146 devm_memremap_match, addr));
136 memunmap(addr);
137} 147}
138EXPORT_SYMBOL(devm_memunmap); 148EXPORT_SYMBOL(devm_memunmap);
139 149
@@ -166,8 +176,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
166 if (is_ram == REGION_INTERSECTS) 176 if (is_ram == REGION_INTERSECTS)
167 return __va(res->start); 177 return __va(res->start);
168 178
169 page_map = devres_alloc(devm_memremap_pages_release, 179 page_map = devres_alloc_node(devm_memremap_pages_release,
170 sizeof(*page_map), GFP_KERNEL); 180 sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
171 if (!page_map) 181 if (!page_map)
172 return ERR_PTR(-ENOMEM); 182 return ERR_PTR(-ENOMEM);
173 183
@@ -175,7 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
175 185
176 nid = dev_to_node(dev); 186 nid = dev_to_node(dev);
177 if (nid < 0) 187 if (nid < 0)
178 nid = 0; 188 nid = numa_mem_id();
179 189
180 error = arch_add_memory(nid, res->start, resource_size(res), true); 190 error = arch_add_memory(nid, res->start, resource_size(res), true);
181 if (error) { 191 if (error) {
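
The try_ram_remap() helper above encodes a simple preference: if the range is ordinary lowmem "System RAM", reuse the kernel's existing linear mapping; otherwise fall back to ioremap_cache(). Below is a compact userspace sketch of that decide-then-fallback shape, with trivial stand-ins for the kernel predicates and mapping helpers (the addresses are fake and only serve the control flow).

/* Stand-ins for region_intersects()/PageHighMem()/__va()/ioremap_cache(). */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static bool is_system_ram(uint64_t off)  { return off < (256u << 20); }
static bool is_highmem(uint64_t off)     { return off >= (128u << 20); }
static void *linear_va(uint64_t off)     { return (void *)(uintptr_t)off; }
static void *remap_cache(uint64_t off, size_t sz) { (void)sz; return (void *)(uintptr_t)(off | 1); }

/* Same shape as the patched memremap(): prefer the linear map, fall back
 * to an explicit cacheable remap when the linear map cannot be used. */
static void *memremap_model(uint64_t off, size_t sz)
{
	void *addr = NULL;

	if (is_system_ram(off) && !is_highmem(off))
		addr = linear_va(off);
	if (!addr)
		addr = remap_cache(off, sz);
	return addr;
}

int main(void)
{
	printf("%p %p\n", memremap_model(4096, 4096),
	       memremap_model((uint64_t)200 << 20, 4096));
	return 0;
}
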
diff --git a/kernel/module.c b/kernel/module.c
index b86b7bf1be38..8f051a106676 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1063,11 +1063,15 @@ void symbol_put_addr(void *addr)
1063 if (core_kernel_text(a)) 1063 if (core_kernel_text(a))
1064 return; 1064 return;
1065 1065
1066 /* module_text_address is safe here: we're supposed to have reference 1066 /*
 1067 * to module from symbol_get, so it can't go away. */ 1067 * Even though we hold a reference on the module, we still need to
1068 * disable preemption in order to safely traverse the data structure.
1069 */
1070 preempt_disable();
1068 modaddr = __module_text_address(a); 1071 modaddr = __module_text_address(a);
1069 BUG_ON(!modaddr); 1072 BUG_ON(!modaddr);
1070 module_put(modaddr); 1073 module_put(modaddr);
1074 preempt_enable();
1071} 1075}
1072EXPORT_SYMBOL_GPL(symbol_put_addr); 1076EXPORT_SYMBOL_GPL(symbol_put_addr);
1073 1077
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index bd62f5cda746..6528a79d998d 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,6 +10,7 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/errno.h>
13#include <keys/system_keyring.h> 14#include <keys/system_keyring.h>
14#include <crypto/public_key.h> 15#include <crypto/public_key.h>
15#include "module-internal.h" 16#include "module-internal.h"
diff --git a/kernel/panic.c b/kernel/panic.c
index 04e91ff7560b..4579dbb7ed87 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,7 @@
23#include <linux/sysrq.h> 23#include <linux/sysrq.h>
24#include <linux/init.h> 24#include <linux/init.h>
25#include <linux/nmi.h> 25#include <linux/nmi.h>
26#include <linux/console.h>
26 27
27#define PANIC_TIMER_STEP 100 28#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 29#define PANIC_BLINK_SPD 18
@@ -147,6 +148,15 @@ void panic(const char *fmt, ...)
147 148
148 bust_spinlocks(0); 149 bust_spinlocks(0);
149 150
151 /*
152 * We may have ended up stopping the CPU holding the lock (in
153 * smp_send_stop()) while still having some valuable data in the console
154 * buffer. Try to acquire the lock then release it regardless of the
155 * result. The release will also print the buffers out.
156 */
157 console_trylock();
158 console_unlock();
159
150 if (!panic_blink) 160 if (!panic_blink)
151 panic_blink = no_blink; 161 panic_blink = no_blink;
152 162
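
The console_trylock()/console_unlock() pair added above is there only for its side effect: console_unlock() drains the log buffer, and the trylock result is deliberately ignored because the CPU that owned the lock may have been stopped and panic() never returns. A rough userspace analogue using a POSIX semaphore follows; the kernel's console lock is semaphore-based, which is what makes the unconditional release tolerable in this one-way code path.

/* "Try to acquire the lock then release it regardless of the result;
 *  the release will also print the buffers out." */
#include <semaphore.h>
#include <stdio.h>

static sem_t console_sem;
static char log_buf[64] = "buffered message that would otherwise be lost\n";

/* Stand-in for console_unlock(): releasing the console drains the buffer. */
static void console_unlock_model(void)
{
	if (log_buf[0]) {
		fputs(log_buf, stdout);
		log_buf[0] = '\0';
	}
	sem_post(&console_sem);
}

static void panic_flush_model(void)
{
	sem_trywait(&console_sem);	/* result deliberately ignored */
	console_unlock_model();
}

int main(void)
{
	sem_init(&console_sem, 0, 0);	/* 0: pretend a stopped CPU holds it */
	panic_flush_model();
	return 0;
}
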
diff --git a/kernel/params.c b/kernel/params.c
index b6554aa71094..a6d6149c0fe6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -223,7 +223,7 @@ char *parse_args(const char *doing,
223 int (*unknown)(char *param, char *val, 223 int (*unknown)(char *param, char *val,
224 const char *doing, void *arg)) 224 const char *doing, void *arg))
225{ 225{
226 char *param, *val; 226 char *param, *val, *err = NULL;
227 227
228 /* Chew leading spaces */ 228 /* Chew leading spaces */
229 args = skip_spaces(args); 229 args = skip_spaces(args);
@@ -238,7 +238,7 @@ char *parse_args(const char *doing,
238 args = next_arg(args, &param, &val); 238 args = next_arg(args, &param, &val);
239 /* Stop at -- */ 239 /* Stop at -- */
240 if (!val && strcmp(param, "--") == 0) 240 if (!val && strcmp(param, "--") == 0)
241 return args; 241 return err ?: args;
242 irq_was_disabled = irqs_disabled(); 242 irq_was_disabled = irqs_disabled();
243 ret = parse_one(param, val, doing, params, num, 243 ret = parse_one(param, val, doing, params, num,
244 min_level, max_level, arg, unknown); 244 min_level, max_level, arg, unknown);
@@ -247,24 +247,25 @@ char *parse_args(const char *doing,
247 doing, param); 247 doing, param);
248 248
249 switch (ret) { 249 switch (ret) {
250 case 0:
251 continue;
250 case -ENOENT: 252 case -ENOENT:
251 pr_err("%s: Unknown parameter `%s'\n", doing, param); 253 pr_err("%s: Unknown parameter `%s'\n", doing, param);
252 return ERR_PTR(ret); 254 break;
253 case -ENOSPC: 255 case -ENOSPC:
254 pr_err("%s: `%s' too large for parameter `%s'\n", 256 pr_err("%s: `%s' too large for parameter `%s'\n",
255 doing, val ?: "", param); 257 doing, val ?: "", param);
256 return ERR_PTR(ret);
257 case 0:
258 break; 258 break;
259 default: 259 default:
260 pr_err("%s: `%s' invalid for parameter `%s'\n", 260 pr_err("%s: `%s' invalid for parameter `%s'\n",
261 doing, val ?: "", param); 261 doing, val ?: "", param);
262 return ERR_PTR(ret); 262 break;
263 } 263 }
264
265 err = ERR_PTR(ret);
264 } 266 }
265 267
266 /* All parsed OK. */ 268 return err;
267 return NULL;
268} 269}
269 270
270/* Lazy bastard, eh? */ 271/* Lazy bastard, eh? */
@@ -325,10 +326,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp)
325} 326}
326EXPORT_SYMBOL(param_get_charp); 327EXPORT_SYMBOL(param_get_charp);
327 328
328static void param_free_charp(void *arg) 329void param_free_charp(void *arg)
329{ 330{
330 maybe_kfree_parameter(*((char **)arg)); 331 maybe_kfree_parameter(*((char **)arg));
331} 332}
333EXPORT_SYMBOL(param_free_charp);
332 334
333const struct kernel_param_ops param_ops_charp = { 335const struct kernel_param_ops param_ops_charp = {
334 .set = param_set_charp, 336 .set = param_set_charp,
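
The parse_args() rework above changes the error policy: instead of returning on the first bad parameter, it prints a message for each failure, records the error, and keeps parsing so later parameters are still applied; only at the end (or at "--") does the caller see the saved ERR_PTR. A minimal sketch of that report-all, fail-once control flow:

/* Report every bad argument but keep parsing; hand the caller a single
 * error code at the end, mirroring the reworked parse_args() above. */
#include <errno.h>
#include <stdio.h>

/* Stand-in for parse_one(): pretend negative values are invalid. */
static int parse_one_model(int v)
{
	return v < 0 ? -EINVAL : 0;
}

static int parse_args_model(const int *args, int n)
{
	int i, err = 0;

	for (i = 0; i < n; i++) {
		int ret = parse_one_model(args[i]);

		if (ret == 0)
			continue;
		fprintf(stderr, "argument %d invalid\n", i);
		err = ret;	/* remember the failure, keep going */
	}
	return err;		/* 0 only if everything parsed */
}

int main(void)
{
	int args[] = { 1, -2, 3, -4 };

	return parse_args_model(args, 4) ? 1 : 0;
}
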
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 690f78f210f2..b7342a24f559 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -733,7 +733,7 @@ int hibernate(void)
733 * contents of memory is restored from the saved image. 733 * contents of memory is restored from the saved image.
734 * 734 *
735 * If this is successful, control reappears in the restored target kernel in 735 * If this is successful, control reappears in the restored target kernel in
736 * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine 736 * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
737 * attempts to recover gracefully and make the kernel return to the normal mode 737 * attempts to recover gracefully and make the kernel return to the normal mode
738 * of operation. 738 * of operation.
739 */ 739 */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 63d395b5df93..b2dd4d999900 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,6 +272,22 @@ static inline void pm_print_times_init(void)
272{ 272{
273 pm_print_times_enabled = !!initcall_debug; 273 pm_print_times_enabled = !!initcall_debug;
274} 274}
275
276static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
277 struct kobj_attribute *attr,
278 char *buf)
279{
280 return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
281}
282
283static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
284 struct kobj_attribute *attr,
285 const char *buf, size_t n)
286{
287 return -EINVAL;
288}
289power_attr(pm_wakeup_irq);
290
275#else /* !CONFIG_PM_SLEEP_DEBUG */ 291#else /* !CONFIG_PM_SLEEP_DEBUG */
276static inline void pm_print_times_init(void) {} 292static inline void pm_print_times_init(void) {}
277#endif /* CONFIG_PM_SLEEP_DEBUG */ 293#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -604,6 +620,7 @@ static struct attribute * g[] = {
604#endif 620#endif
605#ifdef CONFIG_PM_SLEEP_DEBUG 621#ifdef CONFIG_PM_SLEEP_DEBUG
606 &pm_print_times_attr.attr, 622 &pm_print_times_attr.attr,
623 &pm_wakeup_irq_attr.attr,
607#endif 624#endif
608#endif 625#endif
609#ifdef CONFIG_FREEZER 626#ifdef CONFIG_FREEZER
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5235dd4e1e2f..3a970604308f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1779 while (to_alloc-- > 0) { 1779 while (to_alloc-- > 0) {
1780 struct page *page; 1780 struct page *page;
1781 1781
1782 page = alloc_image_page(__GFP_HIGHMEM); 1782 page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
1783 memory_bm_set_bit(bm, page_to_pfn(page)); 1783 memory_bm_set_bit(bm, page_to_pfn(page));
1784 } 1784 }
1785 return nr_highmem; 1785 return nr_highmem;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7e4cda4a8dd9..f9fe133c13e2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -35,6 +35,9 @@
35const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; 35const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
36const char *pm_states[PM_SUSPEND_MAX]; 36const char *pm_states[PM_SUSPEND_MAX];
37 37
38unsigned int pm_suspend_global_flags;
39EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
40
38static const struct platform_suspend_ops *suspend_ops; 41static const struct platform_suspend_ops *suspend_ops;
39static const struct platform_freeze_ops *freeze_ops; 42static const struct platform_freeze_ops *freeze_ops;
40static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); 43static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
@@ -493,6 +496,7 @@ static int enter_state(suspend_state_t state)
493#endif 496#endif
494 497
495 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); 498 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
499 pm_suspend_clear_flags();
496 error = suspend_prepare(state); 500 error = suspend_prepare(state);
497 if (error) 501 if (error)
498 goto Unlock; 502 goto Unlock;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b2066fb5b10f..12cd989dadf6 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
257 struct bio *bio; 257 struct bio *bio;
258 int error = 0; 258 int error = 0;
259 259
260 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 260 bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
261 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); 261 bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
262 bio->bi_bdev = hib_resume_bdev; 262 bio->bi_bdev = hib_resume_bdev;
263 263
@@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
356 return -ENOSPC; 356 return -ENOSPC;
357 357
358 if (hb) { 358 if (hb) {
359 src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | 359 src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
360 __GFP_NORETRY); 360 __GFP_NORETRY);
361 if (src) { 361 if (src) {
362 copy_page(src, buf); 362 copy_page(src, buf);
@@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
364 ret = hib_wait_io(hb); /* Free pages */ 364 ret = hib_wait_io(hb); /* Free pages */
365 if (ret) 365 if (ret)
366 return ret; 366 return ret;
367 src = (void *)__get_free_page(__GFP_WAIT | 367 src = (void *)__get_free_page(__GFP_RECLAIM |
368 __GFP_NOWARN | 368 __GFP_NOWARN |
369 __GFP_NORETRY); 369 __GFP_NORETRY);
370 if (src) { 370 if (src) {
@@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
672 nr_threads = num_online_cpus() - 1; 672 nr_threads = num_online_cpus() - 1;
673 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); 673 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
674 674
675 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 675 page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
676 if (!page) { 676 if (!page) {
677 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 677 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
678 ret = -ENOMEM; 678 ret = -ENOMEM;
@@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
975 last = tmp; 975 last = tmp;
976 976
977 tmp->map = (struct swap_map_page *) 977 tmp->map = (struct swap_map_page *)
978 __get_free_page(__GFP_WAIT | __GFP_HIGH); 978 __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
979 if (!tmp->map) { 979 if (!tmp->map) {
980 release_swap_reader(handle); 980 release_swap_reader(handle);
981 return -ENOMEM; 981 return -ENOMEM;
@@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
1242 1242
1243 for (i = 0; i < read_pages; i++) { 1243 for (i = 0; i < read_pages; i++) {
1244 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? 1244 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1245 __GFP_WAIT | __GFP_HIGH : 1245 __GFP_RECLAIM | __GFP_HIGH :
1246 __GFP_WAIT | __GFP_NOWARN | 1246 __GFP_RECLAIM | __GFP_NOWARN |
1247 __GFP_NORETRY); 1247 __GFP_NORETRY);
1248 1248
1249 if (!page[i]) { 1249 if (!page[i]) {
1250 if (i < LZO_CMP_PAGES) { 1250 if (i < LZO_CMP_PAGES) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8f0324ef72ab..2ce8826f1053 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -269,6 +269,9 @@ static u32 clear_idx;
269#define PREFIX_MAX 32 269#define PREFIX_MAX 32
270#define LOG_LINE_MAX (1024 - PREFIX_MAX) 270#define LOG_LINE_MAX (1024 - PREFIX_MAX)
271 271
272#define LOG_LEVEL(v) ((v) & 0x07)
273#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
274
272/* record buffer */ 275/* record buffer */
273#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 276#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
274#define LOG_ALIGN 4 277#define LOG_ALIGN 4
@@ -517,6 +520,7 @@ int check_syslog_permissions(int type, int source)
517ok: 520ok:
518 return security_syslog(type); 521 return security_syslog(type);
519} 522}
523EXPORT_SYMBOL_GPL(check_syslog_permissions);
520 524
521static void append_char(char **pp, char *e, char c) 525static void append_char(char **pp, char *e, char c)
522{ 526{
@@ -611,7 +615,6 @@ struct devkmsg_user {
611static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) 615static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
612{ 616{
613 char *buf, *line; 617 char *buf, *line;
614 int i;
615 int level = default_message_loglevel; 618 int level = default_message_loglevel;
616 int facility = 1; /* LOG_USER */ 619 int facility = 1; /* LOG_USER */
617 size_t len = iov_iter_count(from); 620 size_t len = iov_iter_count(from);
@@ -641,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
641 line = buf; 644 line = buf;
642 if (line[0] == '<') { 645 if (line[0] == '<') {
643 char *endp = NULL; 646 char *endp = NULL;
647 unsigned int u;
644 648
645 i = simple_strtoul(line+1, &endp, 10); 649 u = simple_strtoul(line + 1, &endp, 10);
646 if (endp && endp[0] == '>') { 650 if (endp && endp[0] == '>') {
647 level = i & 7; 651 level = LOG_LEVEL(u);
648 if (i >> 3) 652 if (LOG_FACILITY(u) != 0)
649 facility = i >> 3; 653 facility = LOG_FACILITY(u);
650 endp++; 654 endp++;
651 len -= endp - line; 655 len -= endp - line;
652 line = endp; 656 line = endp;
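
The new LOG_LEVEL()/LOG_FACILITY() macros and the switch from int to unsigned make the "<prival>" prefix handling in devkmsg_write() explicit: the low three bits are the level, the next eight bits the facility, and a facility of zero keeps the LOG_USER default. A userspace re-implementation of that decoding (the defaults here are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define LOG_LEVEL(v)	((v) & 0x07)
#define LOG_FACILITY(v)	((v) >> 3 & 0xff)

static void parse_prefix(const char *line, int *level, int *facility)
{
	*level = 4;		/* stand-in for default_message_loglevel */
	*facility = 1;		/* LOG_USER */

	if (line[0] == '<') {
		char *endp = NULL;
		unsigned int u = (unsigned int)strtoul(line + 1, &endp, 10);

		if (endp && endp[0] == '>') {
			*level = LOG_LEVEL(u);
			if (LOG_FACILITY(u) != 0)
				*facility = LOG_FACILITY(u);
		}
	}
}

int main(void)
{
	int level, facility;

	parse_prefix("<30>hello", &level, &facility);
	printf("level=%d facility=%d\n", level, facility);	/* 6 and 3 */
	return 0;
}
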
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 787320de68e0..b760bae64cf1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1016,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request,
1016 break; 1016 break;
1017 } 1017 }
1018#endif 1018#endif
1019
1020 case PTRACE_SECCOMP_GET_FILTER:
1021 ret = seccomp_get_filter(child, addr, datavp);
1022 break;
1023
1019 default: 1024 default:
1020 break; 1025 break;
1021 } 1026 }
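
The new PTRACE_SECCOMP_GET_FILTER request above lets a tracer dump a seccomp-attached classic BPF program from a stopped tracee: addr selects the filter (0 being the most recently installed one) and data points at a buffer of struct sock_filter; with a NULL data pointer the call returns only the instruction count. The sketch below assumes those 4.4-era semantics and a tracee that is already ptrace-stopped with a filter installed; the request constant is defined locally in case the libc headers predate it.

/* Hypothetical tracer-side usage of PTRACE_SECCOMP_GET_FILTER. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <linux/filter.h>		/* struct sock_filter */

#ifndef PTRACE_SECCOMP_GET_FILTER
#define PTRACE_SECCOMP_GET_FILTER 0x420c	/* value from <linux/ptrace.h> */
#endif

static int dump_filter(pid_t pid)
{
	long cnt, i;
	struct sock_filter *insns;

	/* addr = NULL selects filter index 0; data = NULL asks for the count. */
	cnt = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, NULL, NULL);
	if (cnt < 0)
		return -1;

	insns = calloc((size_t)cnt, sizeof(*insns));
	if (!insns)
		return -1;

	if (ptrace(PTRACE_SECCOMP_GET_FILTER, pid, NULL, insns) < 0) {
		free(insns);
		return -1;
	}

	for (i = 0; i < cnt; i++)
		printf("%ld: code=%#x k=%#x\n", i, insns[i].code, insns[i].k);
	free(insns);
	return 0;
}
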
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 50a808424b06..61a16569ffbf 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
1obj-y += update.o 1obj-y += update.o sync.o
2obj-$(CONFIG_SRCU) += srcu.o 2obj-$(CONFIG_SRCU) += srcu.o
3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 3obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
4obj-$(CONFIG_TREE_RCU) += tree.o 4obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 77192953dee5..d89328e260df 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -252,7 +252,7 @@ struct rcu_torture_ops {
252 void (*exp_sync)(void); 252 void (*exp_sync)(void);
253 unsigned long (*get_state)(void); 253 unsigned long (*get_state)(void);
254 void (*cond_sync)(unsigned long oldstate); 254 void (*cond_sync)(unsigned long oldstate);
255 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 255 call_rcu_func_t call;
256 void (*cb_barrier)(void); 256 void (*cb_barrier)(void);
257 void (*fqs)(void); 257 void (*fqs)(void);
258 void (*stats)(void); 258 void (*stats)(void);
@@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void)
448} 448}
449 449
450static void 450static void
451call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 451call_rcu_busted(struct rcu_head *head, rcu_callback_t func)
452{ 452{
453 /* This is a deliberate bug for testing purposes only! */ 453 /* This is a deliberate bug for testing purposes only! */
454 func(head); 454 func(head);
@@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void)
523} 523}
524 524
525static void srcu_torture_call(struct rcu_head *head, 525static void srcu_torture_call(struct rcu_head *head,
526 void (*func)(struct rcu_head *head)) 526 rcu_callback_t func)
527{ 527{
528 call_srcu(srcu_ctlp, head, func); 528 call_srcu(srcu_ctlp, head, func);
529} 529}
@@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void)
695 695
696#define RCUTORTURE_TASKS_OPS 696#define RCUTORTURE_TASKS_OPS
697 697
698static bool torturing_tasks(void) 698static bool __maybe_unused torturing_tasks(void)
699{ 699{
700 return false; 700 return false;
701} 701}
@@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg)
768 } 768 }
769 call_rcu_time = jiffies; 769 call_rcu_time = jiffies;
770 } 770 }
771 cond_resched_rcu_qs();
772 stutter_wait("rcu_torture_boost"); 771 stutter_wait("rcu_torture_boost");
773 if (torture_must_stop()) 772 if (torture_must_stop())
774 goto checkwait; 773 goto checkwait;
@@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg)
1208 __this_cpu_inc(rcu_torture_batch[completed]); 1207 __this_cpu_inc(rcu_torture_batch[completed]);
1209 preempt_enable(); 1208 preempt_enable();
1210 cur_ops->readunlock(idx); 1209 cur_ops->readunlock(idx);
1211 cond_resched_rcu_qs();
1212 stutter_wait("rcu_torture_reader"); 1210 stutter_wait("rcu_torture_reader");
1213 } while (!torture_must_stop()); 1211 } while (!torture_must_stop());
1214 if (irqreader && cur_ops->irq_capable) { 1212 if (irqreader && cur_ops->irq_capable) {
@@ -1742,15 +1740,15 @@ rcu_torture_init(void)
1742 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) 1740 for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
1743 pr_alert(" %s", torture_ops[i]->name); 1741 pr_alert(" %s", torture_ops[i]->name);
1744 pr_alert("\n"); 1742 pr_alert("\n");
1745 torture_init_end(); 1743 firsterr = -EINVAL;
1746 return -EINVAL; 1744 goto unwind;
1747 } 1745 }
1748 if (cur_ops->fqs == NULL && fqs_duration != 0) { 1746 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1749 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); 1747 pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
1750 fqs_duration = 0; 1748 fqs_duration = 0;
1751 } 1749 }
1752 if (cur_ops->init) 1750 if (cur_ops->init)
1753 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1751 cur_ops->init();
1754 1752
1755 if (nreaders >= 0) { 1753 if (nreaders >= 0) {
1756 nrealreaders = nreaders; 1754 nrealreaders = nreaders;
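
Several of the signature changes in these RCU hunks simply substitute the rcu_callback_t and call_rcu_func_t typedefs for the spelled-out function-pointer types. For reference, the equivalent declarations, re-stated here outside the kernel headers, are:

/* Equivalent to the typedefs from <linux/types.h> used in the hunks above. */
struct rcu_head;

typedef void (*rcu_callback_t)(struct rcu_head *head);
typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
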
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index d3fcb2ec8536..a63a1ea5a41b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
298 int idx; 298 int idx;
299 299
300 idx = READ_ONCE(sp->completed) & 0x1; 300 idx = READ_ONCE(sp->completed) & 0x1;
301 preempt_disable();
302 __this_cpu_inc(sp->per_cpu_ref->c[idx]); 301 __this_cpu_inc(sp->per_cpu_ref->c[idx]);
303 smp_mb(); /* B */ /* Avoid leaking the critical section. */ 302 smp_mb(); /* B */ /* Avoid leaking the critical section. */
304 __this_cpu_inc(sp->per_cpu_ref->seq[idx]); 303 __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
305 preempt_enable();
306 return idx; 304 return idx;
307} 305}
308EXPORT_SYMBOL_GPL(__srcu_read_lock); 306EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -387,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp)
387 * srcu_struct structure. 385 * srcu_struct structure.
388 */ 386 */
389void call_srcu(struct srcu_struct *sp, struct rcu_head *head, 387void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
390 void (*func)(struct rcu_head *head)) 388 rcu_callback_t func)
391{ 389{
392 unsigned long flags; 390 unsigned long flags;
393 391
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
new file mode 100644
index 000000000000..be922c9f3d37
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,223 @@
1/*
2 * RCU-based infrastructure for lightweight reader-writer locking
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, you can access it online at
16 * http://www.gnu.org/licenses/gpl-2.0.html.
17 *
18 * Copyright (c) 2015, Red Hat, Inc.
19 *
20 * Author: Oleg Nesterov <oleg@redhat.com>
21 */
22
23#include <linux/rcu_sync.h>
24#include <linux/sched.h>
25
26#ifdef CONFIG_PROVE_RCU
27#define __INIT_HELD(func) .held = func,
28#else
29#define __INIT_HELD(func)
30#endif
31
32static const struct {
33 void (*sync)(void);
34 void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
35 void (*wait)(void);
36#ifdef CONFIG_PROVE_RCU
37 int (*held)(void);
38#endif
39} gp_ops[] = {
40 [RCU_SYNC] = {
41 .sync = synchronize_rcu,
42 .call = call_rcu,
43 .wait = rcu_barrier,
44 __INIT_HELD(rcu_read_lock_held)
45 },
46 [RCU_SCHED_SYNC] = {
47 .sync = synchronize_sched,
48 .call = call_rcu_sched,
49 .wait = rcu_barrier_sched,
50 __INIT_HELD(rcu_read_lock_sched_held)
51 },
52 [RCU_BH_SYNC] = {
53 .sync = synchronize_rcu_bh,
54 .call = call_rcu_bh,
55 .wait = rcu_barrier_bh,
56 __INIT_HELD(rcu_read_lock_bh_held)
57 },
58};
59
60enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
61enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
62
63#define rss_lock gp_wait.lock
64
65#ifdef CONFIG_PROVE_RCU
66void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
67{
68 RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
69 "suspicious rcu_sync_is_idle() usage");
70}
71#endif
72
73/**
74 * rcu_sync_init() - Initialize an rcu_sync structure
75 * @rsp: Pointer to rcu_sync structure to be initialized
76 * @type: Flavor of RCU with which to synchronize rcu_sync structure
77 */
78void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
79{
80 memset(rsp, 0, sizeof(*rsp));
81 init_waitqueue_head(&rsp->gp_wait);
82 rsp->gp_type = type;
83}
84
85/**
86 * rcu_sync_enter() - Force readers onto slowpath
87 * @rsp: Pointer to rcu_sync structure to use for synchronization
88 *
89 * This function is used by updaters who need readers to make use of
90 * a slowpath during the update. After this function returns, all
91 * subsequent calls to rcu_sync_is_idle() will return false, which
92 * tells readers to stay off their fastpaths. A later call to
 93 * rcu_sync_exit() re-enables reader fastpaths.
94 *
95 * When called in isolation, rcu_sync_enter() must wait for a grace
96 * period, however, closely spaced calls to rcu_sync_enter() can
97 * optimize away the grace-period wait via a state machine implemented
98 * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
99 */
100void rcu_sync_enter(struct rcu_sync *rsp)
101{
102 bool need_wait, need_sync;
103
104 spin_lock_irq(&rsp->rss_lock);
105 need_wait = rsp->gp_count++;
106 need_sync = rsp->gp_state == GP_IDLE;
107 if (need_sync)
108 rsp->gp_state = GP_PENDING;
109 spin_unlock_irq(&rsp->rss_lock);
110
111 BUG_ON(need_wait && need_sync);
112
113 if (need_sync) {
114 gp_ops[rsp->gp_type].sync();
115 rsp->gp_state = GP_PASSED;
116 wake_up_all(&rsp->gp_wait);
117 } else if (need_wait) {
118 wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
119 } else {
120 /*
121 * Possible when there's a pending CB from a rcu_sync_exit().
122 * Nobody has yet been allowed the 'fast' path and thus we can
123 * avoid doing any sync(). The callback will get 'dropped'.
124 */
125 BUG_ON(rsp->gp_state != GP_PASSED);
126 }
127}
128
129/**
130 * rcu_sync_func() - Callback function managing reader access to fastpath
131 * @rsp: Pointer to rcu_sync structure to use for synchronization
132 *
133 * This function is passed to one of the call_rcu() functions by
 134 * rcu_sync_exit(), so that it is invoked after a grace period following
 135 * that invocation of rcu_sync_exit(). It takes action based on events that
136 * have taken place in the meantime, so that closely spaced rcu_sync_enter()
137 * and rcu_sync_exit() pairs need not wait for a grace period.
138 *
139 * If another rcu_sync_enter() is invoked before the grace period
140 * ended, reset state to allow the next rcu_sync_exit() to let the
141 * readers back onto their fastpaths (after a grace period). If both
142 * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked
143 * before the grace period ended, re-invoke call_rcu() on behalf of that
144 * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
145 * can again use their fastpaths.
146 */
147static void rcu_sync_func(struct rcu_head *rcu)
148{
149 struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
150 unsigned long flags;
151
152 BUG_ON(rsp->gp_state != GP_PASSED);
153 BUG_ON(rsp->cb_state == CB_IDLE);
154
155 spin_lock_irqsave(&rsp->rss_lock, flags);
156 if (rsp->gp_count) {
157 /*
 158 * A new rcu_sync_enter() has happened; drop the callback.
159 */
160 rsp->cb_state = CB_IDLE;
161 } else if (rsp->cb_state == CB_REPLAY) {
162 /*
163 * A new rcu_sync_exit() has happened; requeue the callback
164 * to catch a later GP.
165 */
166 rsp->cb_state = CB_PENDING;
167 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
168 } else {
169 /*
 170 * We're at least a GP after rcu_sync_exit(); everybody will now
 171 * have observed the write-side critical section. Let 'em rip!
172 */
173 rsp->cb_state = CB_IDLE;
174 rsp->gp_state = GP_IDLE;
175 }
176 spin_unlock_irqrestore(&rsp->rss_lock, flags);
177}
178
179/**
 180 * rcu_sync_exit() - Allow readers back onto fast path after grace period
181 * @rsp: Pointer to rcu_sync structure to use for synchronization
182 *
183 * This function is used by updaters who have completed, and can therefore
184 * now allow readers to make use of their fastpaths after a grace period
185 * has elapsed. After this grace period has completed, all subsequent
186 * calls to rcu_sync_is_idle() will return true, which tells readers that
187 * they can once again use their fastpaths.
188 */
189void rcu_sync_exit(struct rcu_sync *rsp)
190{
191 spin_lock_irq(&rsp->rss_lock);
192 if (!--rsp->gp_count) {
193 if (rsp->cb_state == CB_IDLE) {
194 rsp->cb_state = CB_PENDING;
195 gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
196 } else if (rsp->cb_state == CB_PENDING) {
197 rsp->cb_state = CB_REPLAY;
198 }
199 }
200 spin_unlock_irq(&rsp->rss_lock);
201}
202
203/**
204 * rcu_sync_dtor() - Clean up an rcu_sync structure
205 * @rsp: Pointer to rcu_sync structure to be cleaned up
206 */
207void rcu_sync_dtor(struct rcu_sync *rsp)
208{
209 int cb_state;
210
211 BUG_ON(rsp->gp_count);
212
213 spin_lock_irq(&rsp->rss_lock);
214 if (rsp->cb_state == CB_REPLAY)
215 rsp->cb_state = CB_PENDING;
216 cb_state = rsp->cb_state;
217 spin_unlock_irq(&rsp->rss_lock);
218
219 if (cb_state != CB_IDLE) {
220 gp_ops[rsp->gp_type].wait();
221 BUG_ON(rsp->cb_state != CB_IDLE);
222 }
223}
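
The new kernel/rcu/sync.c above is the machinery percpu-rwsem now builds on: a writer brackets its update with rcu_sync_enter()/rcu_sync_exit(), and readers poll rcu_sync_is_idle() to choose between fast and slow paths. The toy model below only demonstrates that enter/exit/is_idle contract with a mutex-protected counter; it deliberately omits the grace-period waits and the call_rcu() state machine that make the real readers lock-free, so "idle" flips back immediately on exit rather than one grace period later.

/* Toy model of the rcu_sync_enter()/rcu_sync_exit()/rcu_sync_is_idle()
 * contract; the grace-period handling of the real code is left out. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t rss_lock = PTHREAD_MUTEX_INITIALIZER;
static int gp_count;			/* writers currently inside enter/exit */

static bool toy_rcu_sync_is_idle(void)
{
	bool idle;

	pthread_mutex_lock(&rss_lock);
	idle = (gp_count == 0);
	pthread_mutex_unlock(&rss_lock);
	return idle;
}

static void toy_rcu_sync_enter(void)
{
	pthread_mutex_lock(&rss_lock);
	gp_count++;			/* the real code also waits for a GP */
	pthread_mutex_unlock(&rss_lock);
}

static void toy_rcu_sync_exit(void)
{
	pthread_mutex_lock(&rss_lock);
	gp_count--;			/* the real code defers this via call_rcu() */
	pthread_mutex_unlock(&rss_lock);
}

int main(void)
{
	printf("idle before enter: %d\n", toy_rcu_sync_is_idle());	/* 1 */
	toy_rcu_sync_enter();		/* readers must now take the slow path */
	printf("idle during update: %d\n", toy_rcu_sync_is_idle());	/* 0 */
	toy_rcu_sync_exit();		/* fast path allowed again */
	printf("idle after exit: %d\n", toy_rcu_sync_is_idle());	/* 1 */
	return 0;
}
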
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d0471056d0af..944b1b491ed8 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -44,7 +44,7 @@ struct rcu_ctrlblk;
44static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 44static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
45static void rcu_process_callbacks(struct softirq_action *unused); 45static void rcu_process_callbacks(struct softirq_action *unused);
46static void __call_rcu(struct rcu_head *head, 46static void __call_rcu(struct rcu_head *head,
47 void (*func)(struct rcu_head *rcu), 47 rcu_callback_t func,
48 struct rcu_ctrlblk *rcp); 48 struct rcu_ctrlblk *rcp);
49 49
50#include "tiny_plugin.h" 50#include "tiny_plugin.h"
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
203 * Helper function for call_rcu() and call_rcu_bh(). 203 * Helper function for call_rcu() and call_rcu_bh().
204 */ 204 */
205static void __call_rcu(struct rcu_head *head, 205static void __call_rcu(struct rcu_head *head,
206 void (*func)(struct rcu_head *rcu), 206 rcu_callback_t func,
207 struct rcu_ctrlblk *rcp) 207 struct rcu_ctrlblk *rcp)
208{ 208{
209 unsigned long flags; 209 unsigned long flags;
@@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head,
229 * period. But since we have but one CPU, that would be after any 229 * period. But since we have but one CPU, that would be after any
230 * quiescent state. 230 * quiescent state.
231 */ 231 */
232void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 232void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
233{ 233{
234 __call_rcu(head, func, &rcu_sched_ctrlblk); 234 __call_rcu(head, func, &rcu_sched_ctrlblk);
235} 235}
@@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
239 * Post an RCU bottom-half callback to be invoked after any subsequent 239 * Post an RCU bottom-half callback to be invoked after any subsequent
240 * quiescent state. 240 * quiescent state.
241 */ 241 */
242void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 242void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
243{ 243{
244 __call_rcu(head, func, &rcu_bh_ctrlblk); 244 __call_rcu(head, func, &rcu_bh_ctrlblk);
245} 245}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9f75f25cc5d9..f07343b54fe5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree");
71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
73static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; 73static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
74static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
75 74
76/* 75/*
77 * In order to export the rcu_state name to the tracing tools, it 76 * In order to export the rcu_state name to the tracing tools, it
@@ -98,7 +97,7 @@ struct rcu_state sname##_state = { \
98 .level = { &sname##_state.node[0] }, \ 97 .level = { &sname##_state.node[0] }, \
99 .rda = &sname##_data, \ 98 .rda = &sname##_data, \
100 .call = cr, \ 99 .call = cr, \
101 .fqs_state = RCU_GP_IDLE, \ 100 .gp_state = RCU_GP_IDLE, \
102 .gpnum = 0UL - 300UL, \ 101 .gpnum = 0UL - 300UL, \
103 .completed = 0UL - 300UL, \ 102 .completed = 0UL - 300UL, \
104 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 103 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
@@ -161,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
161static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 160static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
162static void invoke_rcu_core(void); 161static void invoke_rcu_core(void);
163static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 162static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
163static void rcu_report_exp_rdp(struct rcu_state *rsp,
164 struct rcu_data *rdp, bool wake);
164 165
165/* rcuc/rcub kthread realtime priority */ 166/* rcuc/rcub kthread realtime priority */
166#ifdef CONFIG_RCU_KTHREAD_PRIO 167#ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -245,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
245 */ 246 */
246void rcu_sched_qs(void) 247void rcu_sched_qs(void)
247{ 248{
248 if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { 249 unsigned long flags;
250
251 if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
249 trace_rcu_grace_period(TPS("rcu_sched"), 252 trace_rcu_grace_period(TPS("rcu_sched"),
250 __this_cpu_read(rcu_sched_data.gpnum), 253 __this_cpu_read(rcu_sched_data.gpnum),
251 TPS("cpuqs")); 254 TPS("cpuqs"));
252 __this_cpu_write(rcu_sched_data.passed_quiesce, 1); 255 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
256 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
257 return;
258 local_irq_save(flags);
259 if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
260 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
261 rcu_report_exp_rdp(&rcu_sched_state,
262 this_cpu_ptr(&rcu_sched_data),
263 true);
264 }
265 local_irq_restore(flags);
253 } 266 }
254} 267}
255 268
256void rcu_bh_qs(void) 269void rcu_bh_qs(void)
257{ 270{
258 if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { 271 if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
259 trace_rcu_grace_period(TPS("rcu_bh"), 272 trace_rcu_grace_period(TPS("rcu_bh"),
260 __this_cpu_read(rcu_bh_data.gpnum), 273 __this_cpu_read(rcu_bh_data.gpnum),
261 TPS("cpuqs")); 274 TPS("cpuqs"));
262 __this_cpu_write(rcu_bh_data.passed_quiesce, 1); 275 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
263 } 276 }
264} 277}
265 278
@@ -337,12 +350,14 @@ static void rcu_momentary_dyntick_idle(void)
337 */ 350 */
338void rcu_note_context_switch(void) 351void rcu_note_context_switch(void)
339{ 352{
353 barrier(); /* Avoid RCU read-side critical sections leaking down. */
340 trace_rcu_utilization(TPS("Start context switch")); 354 trace_rcu_utilization(TPS("Start context switch"));
341 rcu_sched_qs(); 355 rcu_sched_qs();
342 rcu_preempt_note_context_switch(); 356 rcu_preempt_note_context_switch();
343 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 357 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
344 rcu_momentary_dyntick_idle(); 358 rcu_momentary_dyntick_idle();
345 trace_rcu_utilization(TPS("End context switch")); 359 trace_rcu_utilization(TPS("End context switch"));
360 barrier(); /* Avoid RCU read-side critical sections leaking up. */
346} 361}
347EXPORT_SYMBOL_GPL(rcu_note_context_switch); 362EXPORT_SYMBOL_GPL(rcu_note_context_switch);
348 363
@@ -353,12 +368,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
353 * RCU flavors in desperate need of a quiescent state, which will normally 368 * RCU flavors in desperate need of a quiescent state, which will normally
354 * be none of them). Either way, do a lightweight quiescent state for 369 * be none of them). Either way, do a lightweight quiescent state for
355 * all RCU flavors. 370 * all RCU flavors.
371 *
372 * The barrier() calls are redundant in the common case when this is
373 * called externally, but just in case this is called from within this
374 * file.
375 *
356 */ 376 */
357void rcu_all_qs(void) 377void rcu_all_qs(void)
358{ 378{
379 barrier(); /* Avoid RCU read-side critical sections leaking down. */
359 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 380 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
360 rcu_momentary_dyntick_idle(); 381 rcu_momentary_dyntick_idle();
361 this_cpu_inc(rcu_qs_ctr); 382 this_cpu_inc(rcu_qs_ctr);
383 barrier(); /* Avoid RCU read-side critical sections leaking up. */
362} 384}
363EXPORT_SYMBOL_GPL(rcu_all_qs); 385EXPORT_SYMBOL_GPL(rcu_all_qs);
364 386
@@ -1744,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
1744 */ 1766 */
1745 rdp->gpnum = rnp->gpnum; 1767 rdp->gpnum = rnp->gpnum;
1746 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1768 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
1747 rdp->passed_quiesce = 0; 1769 rdp->cpu_no_qs.b.norm = true;
1748 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1770 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
1749 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1771 rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
1750 zero_cpu_stall_ticks(rdp); 1772 zero_cpu_stall_ticks(rdp);
1751 WRITE_ONCE(rdp->gpwrap, false); 1773 WRITE_ONCE(rdp->gpwrap, false);
1752 } 1774 }
@@ -1927,16 +1949,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
1927/* 1949/*
1928 * Do one round of quiescent-state forcing. 1950 * Do one round of quiescent-state forcing.
1929 */ 1951 */
1930static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1952static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
1931{ 1953{
1932 int fqs_state = fqs_state_in;
1933 bool isidle = false; 1954 bool isidle = false;
1934 unsigned long maxj; 1955 unsigned long maxj;
1935 struct rcu_node *rnp = rcu_get_root(rsp); 1956 struct rcu_node *rnp = rcu_get_root(rsp);
1936 1957
1937 WRITE_ONCE(rsp->gp_activity, jiffies); 1958 WRITE_ONCE(rsp->gp_activity, jiffies);
1938 rsp->n_force_qs++; 1959 rsp->n_force_qs++;
1939 if (fqs_state == RCU_SAVE_DYNTICK) { 1960 if (first_time) {
1940 /* Collect dyntick-idle snapshots. */ 1961 /* Collect dyntick-idle snapshots. */
1941 if (is_sysidle_rcu_state(rsp)) { 1962 if (is_sysidle_rcu_state(rsp)) {
1942 isidle = true; 1963 isidle = true;
@@ -1945,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1945 force_qs_rnp(rsp, dyntick_save_progress_counter, 1966 force_qs_rnp(rsp, dyntick_save_progress_counter,
1946 &isidle, &maxj); 1967 &isidle, &maxj);
1947 rcu_sysidle_report_gp(rsp, isidle, maxj); 1968 rcu_sysidle_report_gp(rsp, isidle, maxj);
1948 fqs_state = RCU_FORCE_QS;
1949 } else { 1969 } else {
1950 /* Handle dyntick-idle and offline CPUs. */ 1970 /* Handle dyntick-idle and offline CPUs. */
1951 isidle = true; 1971 isidle = true;
@@ -1959,7 +1979,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1959 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); 1979 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
1960 raw_spin_unlock_irq(&rnp->lock); 1980 raw_spin_unlock_irq(&rnp->lock);
1961 } 1981 }
1962 return fqs_state;
1963} 1982}
1964 1983
1965/* 1984/*
@@ -2023,7 +2042,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2023 /* Declare grace period done. */ 2042 /* Declare grace period done. */
2024 WRITE_ONCE(rsp->completed, rsp->gpnum); 2043 WRITE_ONCE(rsp->completed, rsp->gpnum);
2025 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 2044 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
2026 rsp->fqs_state = RCU_GP_IDLE; 2045 rsp->gp_state = RCU_GP_IDLE;
2027 rdp = this_cpu_ptr(rsp->rda); 2046 rdp = this_cpu_ptr(rsp->rda);
2028 /* Advance CBs to reduce false positives below. */ 2047 /* Advance CBs to reduce false positives below. */
2029 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; 2048 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
@@ -2041,7 +2060,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
2041 */ 2060 */
2042static int __noreturn rcu_gp_kthread(void *arg) 2061static int __noreturn rcu_gp_kthread(void *arg)
2043{ 2062{
2044 int fqs_state; 2063 bool first_gp_fqs;
2045 int gf; 2064 int gf;
2046 unsigned long j; 2065 unsigned long j;
2047 int ret; 2066 int ret;
@@ -2073,7 +2092,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2073 } 2092 }
2074 2093
2075 /* Handle quiescent-state forcing. */ 2094 /* Handle quiescent-state forcing. */
2076 fqs_state = RCU_SAVE_DYNTICK; 2095 first_gp_fqs = true;
2077 j = jiffies_till_first_fqs; 2096 j = jiffies_till_first_fqs;
2078 if (j > HZ) { 2097 if (j > HZ) {
2079 j = HZ; 2098 j = HZ;
@@ -2101,7 +2120,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
2101 trace_rcu_grace_period(rsp->name, 2120 trace_rcu_grace_period(rsp->name,
2102 READ_ONCE(rsp->gpnum), 2121 READ_ONCE(rsp->gpnum),
2103 TPS("fqsstart")); 2122 TPS("fqsstart"));
2104 fqs_state = rcu_gp_fqs(rsp, fqs_state); 2123 rcu_gp_fqs(rsp, first_gp_fqs);
2124 first_gp_fqs = false;
2105 trace_rcu_grace_period(rsp->name, 2125 trace_rcu_grace_period(rsp->name,
2106 READ_ONCE(rsp->gpnum), 2126 READ_ONCE(rsp->gpnum),
2107 TPS("fqsend")); 2127 TPS("fqsend"));
@@ -2337,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2337 rnp = rdp->mynode; 2357 rnp = rdp->mynode;
2338 raw_spin_lock_irqsave(&rnp->lock, flags); 2358 raw_spin_lock_irqsave(&rnp->lock, flags);
2339 smp_mb__after_unlock_lock(); 2359 smp_mb__after_unlock_lock();
2340 if ((rdp->passed_quiesce == 0 && 2360 if ((rdp->cpu_no_qs.b.norm &&
2341 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || 2361 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
2342 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || 2362 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
2343 rdp->gpwrap) { 2363 rdp->gpwrap) {
@@ -2348,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2348 * We will instead need a new quiescent state that lies 2368 * We will instead need a new quiescent state that lies
2349 * within the current grace period. 2369 * within the current grace period.
2350 */ 2370 */
2351 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2371 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
2352 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2372 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
2353 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2373 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2354 return; 2374 return;
@@ -2357,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2357 if ((rnp->qsmask & mask) == 0) { 2377 if ((rnp->qsmask & mask) == 0) {
2358 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2378 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2359 } else { 2379 } else {
2360 rdp->qs_pending = 0; 2380 rdp->core_needs_qs = 0;
2361 2381
2362 /* 2382 /*
2363 * This GP can't end until cpu checks in, so all of our 2383 * This GP can't end until cpu checks in, so all of our
@@ -2388,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
2388 * Does this CPU still need to do its part for current grace period? 2408 * Does this CPU still need to do its part for current grace period?
2389 * If no, return and let the other CPUs do their part as well. 2409 * If no, return and let the other CPUs do their part as well.
2390 */ 2410 */
2391 if (!rdp->qs_pending) 2411 if (!rdp->core_needs_qs)
2392 return; 2412 return;
2393 2413
2394 /* 2414 /*
2395 * Was there a quiescent state since the beginning of the grace 2415 * Was there a quiescent state since the beginning of the grace
2396 * period? If no, then exit and wait for the next call. 2416 * period? If no, then exit and wait for the next call.
2397 */ 2417 */
2398 if (!rdp->passed_quiesce && 2418 if (rdp->cpu_no_qs.b.norm &&
2399 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) 2419 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
2400 return; 2420 return;
2401 2421
@@ -3017,7 +3037,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
3017 * is expected to specify a CPU. 3037 * is expected to specify a CPU.
3018 */ 3038 */
3019static void 3039static void
3020__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 3040__call_rcu(struct rcu_head *head, rcu_callback_t func,
3021 struct rcu_state *rsp, int cpu, bool lazy) 3041 struct rcu_state *rsp, int cpu, bool lazy)
3022{ 3042{
3023 unsigned long flags; 3043 unsigned long flags;
@@ -3088,7 +3108,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
3088/* 3108/*
3089 * Queue an RCU-sched callback for invocation after a grace period. 3109 * Queue an RCU-sched callback for invocation after a grace period.
3090 */ 3110 */
3091void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 3111void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
3092{ 3112{
3093 __call_rcu(head, func, &rcu_sched_state, -1, 0); 3113 __call_rcu(head, func, &rcu_sched_state, -1, 0);
3094} 3114}
@@ -3097,7 +3117,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
3097/* 3117/*
3098 * Queue an RCU callback for invocation after a quicker grace period. 3118 * Queue an RCU callback for invocation after a quicker grace period.
3099 */ 3119 */
3100void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 3120void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
3101{ 3121{
3102 __call_rcu(head, func, &rcu_bh_state, -1, 0); 3122 __call_rcu(head, func, &rcu_bh_state, -1, 0);
3103} 3123}
@@ -3111,7 +3131,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
3111 * function may only be called from __kfree_rcu(). 3131 * function may only be called from __kfree_rcu().
3112 */ 3132 */
3113void kfree_call_rcu(struct rcu_head *head, 3133void kfree_call_rcu(struct rcu_head *head,
3114 void (*func)(struct rcu_head *rcu)) 3134 rcu_callback_t func)
3115{ 3135{
3116 __call_rcu(head, func, rcu_state_p, -1, 1); 3136 __call_rcu(head, func, rcu_state_p, -1, 1);
3117} 3137}
@@ -3379,6 +3399,191 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
3379 return rcu_seq_done(&rsp->expedited_sequence, s); 3399 return rcu_seq_done(&rsp->expedited_sequence, s);
3380} 3400}
3381 3401
3402/*
3403 * Reset the ->expmaskinit values in the rcu_node tree to reflect any
3404 * recent CPU-online activity. Note that these masks are not cleared
3405 * when CPUs go offline, so they reflect the union of all CPUs that have
3406 * ever been online. This means that this function normally takes its
3407 * no-work-to-do fastpath.
3408 */
3409static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
3410{
3411 bool done;
3412 unsigned long flags;
3413 unsigned long mask;
3414 unsigned long oldmask;
3415 int ncpus = READ_ONCE(rsp->ncpus);
3416 struct rcu_node *rnp;
3417 struct rcu_node *rnp_up;
3418
3419 /* If no new CPUs onlined since last time, nothing to do. */
3420 if (likely(ncpus == rsp->ncpus_snap))
3421 return;
3422 rsp->ncpus_snap = ncpus;
3423
3424 /*
3425 * Each pass through the following loop propagates newly onlined
3426 * CPUs for the current rcu_node structure up the rcu_node tree.
3427 */
3428 rcu_for_each_leaf_node(rsp, rnp) {
3429 raw_spin_lock_irqsave(&rnp->lock, flags);
3430 smp_mb__after_unlock_lock();
3431 if (rnp->expmaskinit == rnp->expmaskinitnext) {
3432 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3433 continue; /* No new CPUs, nothing to do. */
3434 }
3435
3436 /* Update this node's mask, track old value for propagation. */
3437 oldmask = rnp->expmaskinit;
3438 rnp->expmaskinit = rnp->expmaskinitnext;
3439 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3440
 3441		/* If it was already nonzero, nothing to propagate. */
3442 if (oldmask)
3443 continue;
3444
3445 /* Propagate the new CPU up the tree. */
3446 mask = rnp->grpmask;
3447 rnp_up = rnp->parent;
3448 done = false;
3449 while (rnp_up) {
3450 raw_spin_lock_irqsave(&rnp_up->lock, flags);
3451 smp_mb__after_unlock_lock();
3452 if (rnp_up->expmaskinit)
3453 done = true;
3454 rnp_up->expmaskinit |= mask;
3455 raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
3456 if (done)
3457 break;
3458 mask = rnp_up->grpmask;
3459 rnp_up = rnp_up->parent;
3460 }
3461 }
3462}
3463
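
The hot-plug reset above only climbs the tree until it finds an ancestor whose ->expmaskinit was already nonzero, since everything above that point is necessarily set as well. A minimal userspace sketch of that early-stopping walk, not part of the patch and using an invented toy_node type rather than the kernel's rcu_node, might look like this:

/*
 * Toy model of propagating a leaf's "has CPUs" bit up a tree, stopping
 * at the first ancestor that was already marked (its subtree is already
 * accounted for at all higher levels).  Illustrative only; the names
 * and layout are made up and do not match kernel/rcu/tree.c.
 */
#include <stdio.h>

struct toy_node {
	unsigned long initmask;  /* which children have ever had CPUs */
	unsigned long grpmask;   /* this node's bit in its parent     */
	struct toy_node *parent;
};

static void propagate_online(struct toy_node *leaf)
{
	unsigned long mask = leaf->grpmask;
	struct toy_node *up = leaf->parent;

	while (up) {
		int done = up->initmask != 0;  /* ancestor already marked? */

		up->initmask |= mask;
		if (done)
			break;                 /* higher levels already set */
		mask = up->grpmask;
		up = up->parent;
	}
}

int main(void)
{
	struct toy_node root  = { 0, 0, NULL };
	struct toy_node inner = { 0, 0x2, &root };
	struct toy_node leaf  = { 0x1, 0x4, &inner };

	propagate_online(&leaf);
	printf("inner=%#lx root=%#lx\n", inner.initmask, root.initmask);
	return 0;
}
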
3464/*
3465 * Reset the ->expmask values in the rcu_node tree in preparation for
3466 * a new expedited grace period.
3467 */
3468static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
3469{
3470 unsigned long flags;
3471 struct rcu_node *rnp;
3472
3473 sync_exp_reset_tree_hotplug(rsp);
3474 rcu_for_each_node_breadth_first(rsp, rnp) {
3475 raw_spin_lock_irqsave(&rnp->lock, flags);
3476 smp_mb__after_unlock_lock();
3477 WARN_ON_ONCE(rnp->expmask);
3478 rnp->expmask = rnp->expmaskinit;
3479 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3480 }
3481}
3482
3483/*
3484 * Return non-zero if there is no RCU expedited grace period in progress
3485 * for the specified rcu_node structure, in other words, if all CPUs and
3486 * tasks covered by the specified rcu_node structure have done their bit
3487 * for the current expedited grace period. Works only for preemptible
3488 * RCU -- other RCU implementation use other means.
3489 *
3490 * Caller must hold the root rcu_node's exp_funnel_mutex.
3491 */
3492static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
3493{
3494 return rnp->exp_tasks == NULL &&
3495 READ_ONCE(rnp->expmask) == 0;
3496}
3497
3498/*
3499 * Report the exit from RCU read-side critical section for the last task
3500 * that queued itself during or before the current expedited preemptible-RCU
3501 * grace period. This event is reported either to the rcu_node structure on
3502 * which the task was queued or to one of that rcu_node structure's ancestors,
3503 * recursively up the tree. (Calm down, calm down, we do the recursion
3504 * iteratively!)
3505 *
3506 * Caller must hold the root rcu_node's exp_funnel_mutex and the
3507 * specified rcu_node structure's ->lock.
3508 */
3509static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
3510 bool wake, unsigned long flags)
3511 __releases(rnp->lock)
3512{
3513 unsigned long mask;
3514
3515 for (;;) {
3516 if (!sync_rcu_preempt_exp_done(rnp)) {
3517 if (!rnp->expmask)
3518 rcu_initiate_boost(rnp, flags);
3519 else
3520 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3521 break;
3522 }
3523 if (rnp->parent == NULL) {
3524 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3525 if (wake) {
3526 smp_mb(); /* EGP done before wake_up(). */
3527 wake_up(&rsp->expedited_wq);
3528 }
3529 break;
3530 }
3531 mask = rnp->grpmask;
3532 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
3533 rnp = rnp->parent;
3534 raw_spin_lock(&rnp->lock); /* irqs already disabled */
3535 smp_mb__after_unlock_lock();
3536 WARN_ON_ONCE(!(rnp->expmask & mask));
3537 rnp->expmask &= ~mask;
3538 }
3539}
3540
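
__rcu_report_exp_rnp() above replaces recursion with an explicit climb: clear this subtree's bit in the parent, and keep going only while the node just left has nothing outstanding. A toy version of that walk, with locking, the blocked-task list, and the waitqueue omitted and all names invented, illustrates the shape of the loop:

/*
 * Toy sketch of the iterative "report completion up the tree" walk:
 * check this subtree in with its parent and keep climbing only while
 * the node just examined has nothing left outstanding.
 */
#include <stdio.h>

struct toy_node {
	unsigned long pending;   /* children/CPUs still to check in */
	unsigned long grpmask;   /* this node's bit in its parent   */
	struct toy_node *parent;
};

static void report_done(struct toy_node *np)
{
	for (;;) {
		if (np->pending)               /* someone still outstanding */
			return;
		if (!np->parent) {
			printf("expedited GP complete\n");  /* would wake waiter */
			return;
		}
		np->parent->pending &= ~np->grpmask;  /* check this subtree in */
		np = np->parent;
	}
}

int main(void)
{
	struct toy_node root  = { 0x3, 0,   NULL };
	struct toy_node left  = { 0x0, 0x1, &root };
	struct toy_node right = { 0x4, 0x2, &root };

	report_done(&left);    /* root still waits on the right subtree */
	right.pending = 0;
	report_done(&right);   /* now the whole tree has checked in */
	return 0;
}
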
3541/*
3542 * Report expedited quiescent state for specified node. This is a
3543 * lock-acquisition wrapper function for __rcu_report_exp_rnp().
3544 *
3545 * Caller must hold the root rcu_node's exp_funnel_mutex.
3546 */
3547static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
3548 struct rcu_node *rnp, bool wake)
3549{
3550 unsigned long flags;
3551
3552 raw_spin_lock_irqsave(&rnp->lock, flags);
3553 smp_mb__after_unlock_lock();
3554 __rcu_report_exp_rnp(rsp, rnp, wake, flags);
3555}
3556
3557/*
3558 * Report expedited quiescent state for multiple CPUs, all covered by the
3559 * specified leaf rcu_node structure. Caller must hold the root
3560 * rcu_node's exp_funnel_mutex.
3561 */
3562static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
3563 unsigned long mask, bool wake)
3564{
3565 unsigned long flags;
3566
3567 raw_spin_lock_irqsave(&rnp->lock, flags);
3568 smp_mb__after_unlock_lock();
3569 if (!(rnp->expmask & mask)) {
3570 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3571 return;
3572 }
3573 rnp->expmask &= ~mask;
3574 __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
3575}
3576
3577/*
3578 * Report expedited quiescent state for specified rcu_data (CPU).
3579 * Caller must hold the root rcu_node's exp_funnel_mutex.
3580 */
3581static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
3582 bool wake)
3583{
3584 rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
3585}
3586
3382/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ 3587/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
3383static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, 3588static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
3384 struct rcu_data *rdp, 3589 struct rcu_data *rdp,
@@ -3455,16 +3660,111 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3455} 3660}
3456 3661
3457/* Invoked on each online non-idle CPU for expedited quiescent state. */ 3662/* Invoked on each online non-idle CPU for expedited quiescent state. */
3458static int synchronize_sched_expedited_cpu_stop(void *data) 3663static void sync_sched_exp_handler(void *data)
3459{ 3664{
3460 struct rcu_data *rdp = data; 3665 struct rcu_data *rdp;
3461 struct rcu_state *rsp = rdp->rsp; 3666 struct rcu_node *rnp;
3667 struct rcu_state *rsp = data;
3462 3668
3463 /* We are here: If we are last, do the wakeup. */ 3669 rdp = this_cpu_ptr(rsp->rda);
3464 rdp->exp_done = true; 3670 rnp = rdp->mynode;
3465 if (atomic_dec_and_test(&rsp->expedited_need_qs)) 3671 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
3466 wake_up(&rsp->expedited_wq); 3672 __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
3467 return 0; 3673 return;
3674 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
3675 resched_cpu(smp_processor_id());
3676}
3677
3678/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
3679static void sync_sched_exp_online_cleanup(int cpu)
3680{
3681 struct rcu_data *rdp;
3682 int ret;
3683 struct rcu_node *rnp;
3684 struct rcu_state *rsp = &rcu_sched_state;
3685
3686 rdp = per_cpu_ptr(rsp->rda, cpu);
3687 rnp = rdp->mynode;
3688 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
3689 return;
3690 ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
3691 WARN_ON_ONCE(ret);
3692}
3693
3694/*
3695 * Select the nodes that the upcoming expedited grace period needs
3696 * to wait for.
3697 */
3698static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
3699 smp_call_func_t func)
3700{
3701 int cpu;
3702 unsigned long flags;
3703 unsigned long mask;
3704 unsigned long mask_ofl_test;
3705 unsigned long mask_ofl_ipi;
3706 int ret;
3707 struct rcu_node *rnp;
3708
3709 sync_exp_reset_tree(rsp);
3710 rcu_for_each_leaf_node(rsp, rnp) {
3711 raw_spin_lock_irqsave(&rnp->lock, flags);
3712 smp_mb__after_unlock_lock();
3713
3714 /* Each pass checks a CPU for identity, offline, and idle. */
3715 mask_ofl_test = 0;
3716 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
3717 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3718 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3719
3720 if (raw_smp_processor_id() == cpu ||
3721 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3722 mask_ofl_test |= rdp->grpmask;
3723 }
3724 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
3725
3726 /*
3727 * Need to wait for any blocked tasks as well. Note that
3728 * additional blocking tasks will also block the expedited
3729 * GP until such time as the ->expmask bits are cleared.
3730 */
3731 if (rcu_preempt_has_tasks(rnp))
3732 rnp->exp_tasks = rnp->blkd_tasks.next;
3733 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3734
3735 /* IPI the remaining CPUs for expedited quiescent state. */
3736 mask = 1;
3737 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3738 if (!(mask_ofl_ipi & mask))
3739 continue;
3740retry_ipi:
3741 ret = smp_call_function_single(cpu, func, rsp, 0);
3742 if (!ret) {
3743 mask_ofl_ipi &= ~mask;
3744 } else {
3745 /* Failed, raced with offline. */
3746 raw_spin_lock_irqsave(&rnp->lock, flags);
3747 if (cpu_online(cpu) &&
3748 (rnp->expmask & mask)) {
3749 raw_spin_unlock_irqrestore(&rnp->lock,
3750 flags);
3751 schedule_timeout_uninterruptible(1);
3752 if (cpu_online(cpu) &&
3753 (rnp->expmask & mask))
3754 goto retry_ipi;
3755 raw_spin_lock_irqsave(&rnp->lock,
3756 flags);
3757 }
3758 if (!(rnp->expmask & mask))
3759 mask_ofl_ipi &= ~mask;
3760 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3761 }
3762 }
3763 /* Report quiescent states for those that went offline. */
3764 mask_ofl_test |= mask_ofl_ipi;
3765 if (mask_ofl_test)
3766 rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
3767 }
3468} 3768}
3469 3769
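
The idle test inside sync_rcu_exp_select_cpus() relies on the dynticks counter being odd while a CPU is non-idle and even while it is idle, with atomic_add_return(0, ...) serving as a fully ordered read. A rough userspace model of that convention, using C11 atomics purely for illustration and not the kernel's implementation, follows:

/*
 * Minimal model of the even/odd dyntick-idle convention: the counter is
 * incremented on every idle entry and exit, so an odd value means
 * "currently non-idle" (must be asked for a quiescent state) and an
 * even value means the CPU is idle and already counts as quiesced.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long dynticks = 1;  /* odd: pretend the CPU starts non-idle */

static void idle_enter(void) { atomic_fetch_add(&dynticks, 1); }  /* -> even */
static void idle_exit(void)  { atomic_fetch_add(&dynticks, 1); }  /* -> odd  */

/* Fully ordered read, analogous to atomic_add_return(0, &rdtp->dynticks). */
static bool cpu_is_nonidle(void)
{
	return atomic_fetch_add(&dynticks, 0) & 0x1;
}

int main(void)
{
	printf("non-idle? %d\n", cpu_is_nonidle());  /* 1: would be IPIed  */
	idle_enter();
	printf("non-idle? %d\n", cpu_is_nonidle());  /* 0: skipped, no IPI */
	idle_exit();
	printf("non-idle? %d\n", cpu_is_nonidle());  /* 1 again            */
	return 0;
}
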
3470static void synchronize_sched_expedited_wait(struct rcu_state *rsp) 3770static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -3472,7 +3772,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3472 int cpu; 3772 int cpu;
3473 unsigned long jiffies_stall; 3773 unsigned long jiffies_stall;
3474 unsigned long jiffies_start; 3774 unsigned long jiffies_start;
3475 struct rcu_data *rdp; 3775 unsigned long mask;
3776 struct rcu_node *rnp;
3777 struct rcu_node *rnp_root = rcu_get_root(rsp);
3476 int ret; 3778 int ret;
3477 3779
3478 jiffies_stall = rcu_jiffies_till_stall_check(); 3780 jiffies_stall = rcu_jiffies_till_stall_check();
@@ -3481,33 +3783,43 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3481 for (;;) { 3783 for (;;) {
3482 ret = wait_event_interruptible_timeout( 3784 ret = wait_event_interruptible_timeout(
3483 rsp->expedited_wq, 3785 rsp->expedited_wq,
3484 !atomic_read(&rsp->expedited_need_qs), 3786 sync_rcu_preempt_exp_done(rnp_root),
3485 jiffies_stall); 3787 jiffies_stall);
3486 if (ret > 0) 3788 if (ret > 0)
3487 return; 3789 return;
3488 if (ret < 0) { 3790 if (ret < 0) {
3489 /* Hit a signal, disable CPU stall warnings. */ 3791 /* Hit a signal, disable CPU stall warnings. */
3490 wait_event(rsp->expedited_wq, 3792 wait_event(rsp->expedited_wq,
3491 !atomic_read(&rsp->expedited_need_qs)); 3793 sync_rcu_preempt_exp_done(rnp_root));
3492 return; 3794 return;
3493 } 3795 }
3494 pr_err("INFO: %s detected expedited stalls on CPUs: {", 3796 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
3495 rsp->name); 3797 rsp->name);
3496 for_each_online_cpu(cpu) { 3798 rcu_for_each_leaf_node(rsp, rnp) {
3497 rdp = per_cpu_ptr(rsp->rda, cpu); 3799 (void)rcu_print_task_exp_stall(rnp);
3498 3800 mask = 1;
3499 if (rdp->exp_done) 3801 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3500 continue; 3802 struct rcu_data *rdp;
3501 pr_cont(" %d", cpu); 3803
3804 if (!(rnp->expmask & mask))
3805 continue;
3806 rdp = per_cpu_ptr(rsp->rda, cpu);
3807 pr_cont(" %d-%c%c%c", cpu,
3808 "O."[cpu_online(cpu)],
3809 "o."[!!(rdp->grpmask & rnp->expmaskinit)],
3810 "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
3811 }
3812 mask <<= 1;
3502 } 3813 }
3503 pr_cont(" } %lu jiffies s: %lu\n", 3814 pr_cont(" } %lu jiffies s: %lu\n",
3504 jiffies - jiffies_start, rsp->expedited_sequence); 3815 jiffies - jiffies_start, rsp->expedited_sequence);
3505 for_each_online_cpu(cpu) { 3816 rcu_for_each_leaf_node(rsp, rnp) {
3506 rdp = per_cpu_ptr(rsp->rda, cpu); 3817 mask = 1;
3507 3818 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
3508 if (rdp->exp_done) 3819 if (!(rnp->expmask & mask))
3509 continue; 3820 continue;
3510 dump_cpu_task(cpu); 3821 dump_cpu_task(cpu);
3822 }
3511 } 3823 }
3512 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; 3824 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
3513 } 3825 }
@@ -3531,7 +3843,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3531 */ 3843 */
3532void synchronize_sched_expedited(void) 3844void synchronize_sched_expedited(void)
3533{ 3845{
3534 int cpu;
3535 unsigned long s; 3846 unsigned long s;
3536 struct rcu_node *rnp; 3847 struct rcu_node *rnp;
3537 struct rcu_state *rsp = &rcu_sched_state; 3848 struct rcu_state *rsp = &rcu_sched_state;
@@ -3539,48 +3850,16 @@ void synchronize_sched_expedited(void)
3539 /* Take a snapshot of the sequence number. */ 3850 /* Take a snapshot of the sequence number. */
3540 s = rcu_exp_gp_seq_snap(rsp); 3851 s = rcu_exp_gp_seq_snap(rsp);
3541 3852
3542 if (!try_get_online_cpus()) {
3543 /* CPU hotplug operation in flight, fall back to normal GP. */
3544 wait_rcu_gp(call_rcu_sched);
3545 atomic_long_inc(&rsp->expedited_normal);
3546 return;
3547 }
3548 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
3549
3550 rnp = exp_funnel_lock(rsp, s); 3853 rnp = exp_funnel_lock(rsp, s);
3551 if (rnp == NULL) { 3854 if (rnp == NULL)
3552 put_online_cpus();
3553 return; /* Someone else did our work for us. */ 3855 return; /* Someone else did our work for us. */
3554 }
3555 3856
3556 rcu_exp_gp_seq_start(rsp); 3857 rcu_exp_gp_seq_start(rsp);
3557 3858 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
3558 /* Stop each CPU that is online, non-idle, and not us. */ 3859 synchronize_sched_expedited_wait(rsp);
3559 init_waitqueue_head(&rsp->expedited_wq);
3560 atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
3561 for_each_online_cpu(cpu) {
3562 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3563 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3564
3565 rdp->exp_done = false;
3566
3567 /* Skip our CPU and any idle CPUs. */
3568 if (raw_smp_processor_id() == cpu ||
3569 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3570 continue;
3571 atomic_inc(&rsp->expedited_need_qs);
3572 stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
3573 rdp, &rdp->exp_stop_work);
3574 }
3575
3576 /* Remove extra count and, if necessary, wait for CPUs to stop. */
3577 if (!atomic_dec_and_test(&rsp->expedited_need_qs))
3578 synchronize_sched_expedited_wait(rsp);
3579 3860
3580 rcu_exp_gp_seq_end(rsp); 3861 rcu_exp_gp_seq_end(rsp);
3581 mutex_unlock(&rnp->exp_funnel_mutex); 3862 mutex_unlock(&rnp->exp_funnel_mutex);
3582
3583 put_online_cpus();
3584} 3863}
3585EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 3864EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
3586 3865
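
With the stop-machine path gone, synchronize_sched_expedited() leans entirely on the sequence-number funnel: a caller snapshots the value the counter must reach, and if another caller drives a full expedited grace period past that value first, the work is already done. A much simplified model of that check follows; the kernel's rcu_seq helpers add memory barriers and other care omitted here, and the names are invented:

/*
 * Simplified model of the "someone else did our work" test: the counter
 * is bumped once when an expedited GP starts (odd) and once when it
 * ends (even).  A waiter snapshots the value its request must reach,
 * so concurrent callers can piggy-back on a single grace period.
 */
#include <stdio.h>

static unsigned long exp_seq;  /* even: idle, odd: GP in progress */

static void exp_gp_start(void) { exp_seq++; }  /* now odd  */
static void exp_gp_end(void)   { exp_seq++; }  /* now even */

/* Value exp_seq must reach before this caller's request is satisfied. */
static unsigned long exp_gp_snap(void)
{
	return (exp_seq + 3) & ~0x1UL;
}

static int exp_gp_done(unsigned long snap)
{
	return (long)(exp_seq - snap) >= 0;  /* wrap-safe comparison */
}

int main(void)
{
	unsigned long snap = exp_gp_snap();  /* needs exp_seq >= 2 */

	printf("done before any GP? %d\n", exp_gp_done(snap));  /* 0 */
	exp_gp_start();
	exp_gp_end();
	printf("done after one GP?  %d\n", exp_gp_done(snap));  /* 1 */
	return 0;
}
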
@@ -3606,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
3606 3885
3607 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3886 /* Is the RCU core waiting for a quiescent state from this CPU? */
3608 if (rcu_scheduler_fully_active && 3887 if (rcu_scheduler_fully_active &&
3609 rdp->qs_pending && !rdp->passed_quiesce && 3888 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
3610 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3889 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
3611 rdp->n_rp_qs_pending++; 3890 rdp->n_rp_core_needs_qs++;
3612 } else if (rdp->qs_pending && 3891 } else if (rdp->core_needs_qs &&
3613 (rdp->passed_quiesce || 3892 (!rdp->cpu_no_qs.b.norm ||
3614 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { 3893 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
3615 rdp->n_rp_report_qs++; 3894 rdp->n_rp_report_qs++;
3616 return 1; 3895 return 1;
@@ -3901,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3901 4180
3902 /* Set up local state, ensuring consistent view of global state. */ 4181 /* Set up local state, ensuring consistent view of global state. */
3903 raw_spin_lock_irqsave(&rnp->lock, flags); 4182 raw_spin_lock_irqsave(&rnp->lock, flags);
3904 rdp->beenonline = 1; /* We have now been online. */
3905 rdp->qlen_last_fqs_check = 0; 4183 rdp->qlen_last_fqs_check = 0;
3906 rdp->n_force_qs_snap = rsp->n_force_qs; 4184 rdp->n_force_qs_snap = rsp->n_force_qs;
3907 rdp->blimit = blimit; 4185 rdp->blimit = blimit;
@@ -3923,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3923 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 4201 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
3924 smp_mb__after_unlock_lock(); 4202 smp_mb__after_unlock_lock();
3925 rnp->qsmaskinitnext |= mask; 4203 rnp->qsmaskinitnext |= mask;
4204 rnp->expmaskinitnext |= mask;
4205 if (!rdp->beenonline)
4206 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
4207 rdp->beenonline = true; /* We have now been online. */
3926 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 4208 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3927 rdp->completed = rnp->completed; 4209 rdp->completed = rnp->completed;
3928 rdp->passed_quiesce = false; 4210 rdp->cpu_no_qs.b.norm = true;
3929 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 4211 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
3930 rdp->qs_pending = false; 4212 rdp->core_needs_qs = false;
3931 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 4213 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3932 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4214 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3933} 4215}
@@ -3960,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self,
3960 break; 4242 break;
3961 case CPU_ONLINE: 4243 case CPU_ONLINE:
3962 case CPU_DOWN_FAILED: 4244 case CPU_DOWN_FAILED:
4245 sync_sched_exp_online_cleanup(cpu);
3963 rcu_boost_kthread_setaffinity(rnp, -1); 4246 rcu_boost_kthread_setaffinity(rnp, -1);
3964 break; 4247 break;
3965 case CPU_DOWN_PREPARE: 4248 case CPU_DOWN_PREPARE:
@@ -3971,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self,
3971 rcu_cleanup_dying_cpu(rsp); 4254 rcu_cleanup_dying_cpu(rsp);
3972 break; 4255 break;
3973 case CPU_DYING_IDLE: 4256 case CPU_DYING_IDLE:
4257 /* QS for any half-done expedited RCU-sched GP. */
4258 preempt_disable();
4259 rcu_report_exp_rdp(&rcu_sched_state,
4260 this_cpu_ptr(rcu_sched_state.rda), true);
4261 preempt_enable();
4262
3974 for_each_rcu_flavor(rsp) { 4263 for_each_rcu_flavor(rsp) {
3975 rcu_cleanup_dying_idle_cpu(cpu, rsp); 4264 rcu_cleanup_dying_idle_cpu(cpu, rsp);
3976 } 4265 }
@@ -4102,7 +4391,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4102 static const char * const buf[] = RCU_NODE_NAME_INIT; 4391 static const char * const buf[] = RCU_NODE_NAME_INIT;
4103 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4392 static const char * const fqs[] = RCU_FQS_NAME_INIT;
4104 static const char * const exp[] = RCU_EXP_NAME_INIT; 4393 static const char * const exp[] = RCU_EXP_NAME_INIT;
4105 static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
4106 static u8 fl_mask = 0x1; 4394 static u8 fl_mask = 0x1;
4107 4395
4108 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ 4396 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4162,18 +4450,13 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4162 INIT_LIST_HEAD(&rnp->blkd_tasks); 4450 INIT_LIST_HEAD(&rnp->blkd_tasks);
4163 rcu_init_one_nocb(rnp); 4451 rcu_init_one_nocb(rnp);
4164 mutex_init(&rnp->exp_funnel_mutex); 4452 mutex_init(&rnp->exp_funnel_mutex);
4165 if (rsp == &rcu_sched_state) 4453 lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
4166 lockdep_set_class_and_name( 4454 &rcu_exp_class[i], exp[i]);
4167 &rnp->exp_funnel_mutex,
4168 &rcu_exp_sched_class[i], exp_sched[i]);
4169 else
4170 lockdep_set_class_and_name(
4171 &rnp->exp_funnel_mutex,
4172 &rcu_exp_class[i], exp[i]);
4173 } 4455 }
4174 } 4456 }
4175 4457
4176 init_waitqueue_head(&rsp->gp_wq); 4458 init_waitqueue_head(&rsp->gp_wq);
4459 init_waitqueue_head(&rsp->expedited_wq);
4177 rnp = rsp->level[rcu_num_lvls - 1]; 4460 rnp = rsp->level[rcu_num_lvls - 1];
4178 for_each_possible_cpu(i) { 4461 for_each_possible_cpu(i) {
4179 while (i > rnp->grphi) 4462 while (i > rnp->grphi)
@@ -4216,13 +4499,12 @@ static void __init rcu_init_geometry(void)
4216 rcu_fanout_leaf, nr_cpu_ids); 4499 rcu_fanout_leaf, nr_cpu_ids);
4217 4500
4218 /* 4501 /*
4219 * The boot-time rcu_fanout_leaf parameter is only permitted 4502 * The boot-time rcu_fanout_leaf parameter must be at least two
4220 * to increase the leaf-level fanout, not decrease it. Of course, 4503 * and cannot exceed the number of bits in the rcu_node masks.
4221 * the leaf-level fanout cannot exceed the number of bits in 4504 * Complain and fall back to the compile-time values if this
4222 * the rcu_node masks. Complain and fall back to the compile- 4505 * limit is exceeded.
4223 * time values if these limits are exceeded.
4224 */ 4506 */
4225 if (rcu_fanout_leaf < RCU_FANOUT_LEAF || 4507 if (rcu_fanout_leaf < 2 ||
4226 rcu_fanout_leaf > sizeof(unsigned long) * 8) { 4508 rcu_fanout_leaf > sizeof(unsigned long) * 8) {
4227 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4509 rcu_fanout_leaf = RCU_FANOUT_LEAF;
4228 WARN_ON(1); 4510 WARN_ON(1);
@@ -4239,10 +4521,13 @@ static void __init rcu_init_geometry(void)
4239 4521
4240 /* 4522 /*
4241 * The tree must be able to accommodate the configured number of CPUs. 4523 * The tree must be able to accommodate the configured number of CPUs.
4242 * If this limit is exceeded than we have a serious problem elsewhere. 4524 * If this limit is exceeded, fall back to the compile-time values.
4243 */ 4525 */
4244 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) 4526 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
4245 panic("rcu_init_geometry: rcu_capacity[] is too small"); 4527 rcu_fanout_leaf = RCU_FANOUT_LEAF;
4528 WARN_ON(1);
4529 return;
4530 }
4246 4531
4247 /* Calculate the number of levels in the tree. */ 4532 /* Calculate the number of levels in the tree. */
4248 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { 4533 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
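
The geometry code sizes the rcu_node tree from the leaf fanout and the interior fanout: each additional level multiplies capacity by the fanout, and the number of levels is the smallest one whose capacity covers nr_cpu_ids. A rough standalone sketch of that calculation, with illustrative constants that are not the kernel's CONFIG values:

/*
 * Sketch of the tree-geometry calculation: capacity[i] is the number of
 * CPUs a tree with i+1 levels can cover, and the required level count
 * is the first index whose capacity reaches nr_cpu_ids.
 */
#include <stdio.h>

#define MAX_LVLS 4

int main(void)
{
	int fanout_leaf = 16, fanout = 64, nr_cpu_ids = 4096;
	unsigned long capacity[MAX_LVLS];
	int i, levels;

	capacity[0] = fanout_leaf;
	for (i = 1; i < MAX_LVLS; i++)
		capacity[i] = capacity[i - 1] * fanout;

	if (nr_cpu_ids > capacity[MAX_LVLS - 1]) {
		printf("too many CPUs for this geometry\n");
		return 1;
	}
	for (i = 0; nr_cpu_ids > capacity[i]; i++)
		;
	levels = i + 1;
	printf("%d CPUs -> %d level(s), leaf capacity %lu\n",
	       nr_cpu_ids, levels, capacity[0]);
	return 0;
}
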
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 2e991f8361e4..9fb4e238d4dc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -70,8 +70,6 @@
70# define RCU_NODE_NAME_INIT { "rcu_node_0" } 70# define RCU_NODE_NAME_INIT { "rcu_node_0" }
71# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } 71# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
72# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } 72# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
73# define RCU_EXP_SCHED_NAME_INIT \
74 { "rcu_node_exp_sched_0" }
75#elif NR_CPUS <= RCU_FANOUT_2 73#elif NR_CPUS <= RCU_FANOUT_2
76# define RCU_NUM_LVLS 2 74# define RCU_NUM_LVLS 2
77# define NUM_RCU_LVL_0 1 75# define NUM_RCU_LVL_0 1
@@ -81,8 +79,6 @@
81# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } 79# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
82# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } 80# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
83# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } 81# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
84# define RCU_EXP_SCHED_NAME_INIT \
85 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
86#elif NR_CPUS <= RCU_FANOUT_3 82#elif NR_CPUS <= RCU_FANOUT_3
87# define RCU_NUM_LVLS 3 83# define RCU_NUM_LVLS 3
88# define NUM_RCU_LVL_0 1 84# define NUM_RCU_LVL_0 1
@@ -93,8 +89,6 @@
93# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } 89# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
94# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } 90# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
95# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } 91# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
96# define RCU_EXP_SCHED_NAME_INIT \
97 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
98#elif NR_CPUS <= RCU_FANOUT_4 92#elif NR_CPUS <= RCU_FANOUT_4
99# define RCU_NUM_LVLS 4 93# define RCU_NUM_LVLS 4
100# define NUM_RCU_LVL_0 1 94# define NUM_RCU_LVL_0 1
@@ -106,8 +100,6 @@
106# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } 100# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
107# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } 101# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
108# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } 102# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
109# define RCU_EXP_SCHED_NAME_INIT \
110 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
111#else 103#else
112# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 104# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
113#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ 105#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -171,16 +163,21 @@ struct rcu_node {
171 /* an rcu_data structure, otherwise, each */ 163 /* an rcu_data structure, otherwise, each */
172 /* bit corresponds to a child rcu_node */ 164 /* bit corresponds to a child rcu_node */
173 /* structure. */ 165 /* structure. */
174 unsigned long expmask; /* Groups that have ->blkd_tasks */
175 /* elements that need to drain to allow the */
176 /* current expedited grace period to */
177 /* complete (only for PREEMPT_RCU). */
178 unsigned long qsmaskinit; 166 unsigned long qsmaskinit;
179 /* Per-GP initial value for qsmask & expmask. */ 167 /* Per-GP initial value for qsmask. */
180 /* Initialized from ->qsmaskinitnext at the */ 168 /* Initialized from ->qsmaskinitnext at the */
181 /* beginning of each grace period. */ 169 /* beginning of each grace period. */
182 unsigned long qsmaskinitnext; 170 unsigned long qsmaskinitnext;
183 /* Online CPUs for next grace period. */ 171 /* Online CPUs for next grace period. */
172 unsigned long expmask; /* CPUs or groups that need to check in */
173 /* to allow the current expedited GP */
174 /* to complete. */
175 unsigned long expmaskinit;
176 /* Per-GP initial values for expmask. */
177 /* Initialized from ->expmaskinitnext at the */
178 /* beginning of each expedited GP. */
179 unsigned long expmaskinitnext;
180 /* Online CPUs for next expedited GP. */
184 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 181 unsigned long grpmask; /* Mask to apply to parent qsmask. */
185 /* Only one bit will be set in this mask. */ 182 /* Only one bit will be set in this mask. */
186 int grplo; /* lowest-numbered CPU or group here. */ 183 int grplo; /* lowest-numbered CPU or group here. */
@@ -281,6 +278,18 @@ struct rcu_node {
281 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ 278 for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
282 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) 279 (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
283 280
281/*
282 * Union to allow "aggregate OR" operation on the need for a quiescent
283 * state by the normal and expedited grace periods.
284 */
285union rcu_noqs {
286 struct {
287 u8 norm;
288 u8 exp;
289 } b; /* Bits. */
290 u16 s; /* Set of bits, aggregate OR here. */
291};
292
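
The new rcu_noqs union lets the per-CPU "still need a quiescent state" flags for the normal and expedited grace periods be tested together: a single load of .s answers "is anything outstanding?". A standalone illustration that mirrors the layout, with toy names and fixed-width types standing in for the kernel's u8/u16:

/*
 * "Aggregate OR" trick: two one-byte flags share storage with a single
 * 16-bit field, so one load of .s tests both at once.
 */
#include <stdint.h>
#include <stdio.h>

union toy_noqs {
	struct {
		uint8_t norm;  /* normal GP still needs a QS from this CPU */
		uint8_t exp;   /* expedited GP still needs a QS            */
	} b;
	uint16_t s;            /* nonzero iff either flag is nonzero       */
};

int main(void)
{
	union toy_noqs nq = { .s = 0 };

	nq.b.exp = 1;                                /* expedited GP waiting */
	printf("any QS needed? %d\n", nq.s != 0);    /* 1 */
	nq.b.exp = 0;
	nq.b.norm = 0;
	printf("any QS needed? %d\n", nq.s != 0);    /* 0 */
	return 0;
}

Reading .s after writing the .b members is well-defined union type punning in C, which is what makes the one-load test safe here.
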
284/* Index values for nxttail array in struct rcu_data. */ 293/* Index values for nxttail array in struct rcu_data. */
285#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 294#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
286#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 295#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@@ -297,8 +306,8 @@ struct rcu_data {
297 /* is aware of having started. */ 306 /* is aware of having started. */
298 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ 307 unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
299 /* for rcu_all_qs() invocations. */ 308 /* for rcu_all_qs() invocations. */
300 bool passed_quiesce; /* User-mode/idle loop etc. */ 309 union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
301 bool qs_pending; /* Core waits for quiesc state. */ 310 bool core_needs_qs; /* Core waits for quiesc state. */
302 bool beenonline; /* CPU online at least once. */ 311 bool beenonline; /* CPU online at least once. */
303 bool gpwrap; /* Possible gpnum/completed wrap. */ 312 bool gpwrap; /* Possible gpnum/completed wrap. */
304 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 313 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
@@ -307,9 +316,6 @@ struct rcu_data {
307 /* ticks this CPU has handled */ 316 /* ticks this CPU has handled */
308 /* during and after the last grace */ 317 /* during and after the last grace */
309 /* period it is aware of. */ 318 /* period it is aware of. */
310 struct cpu_stop_work exp_stop_work;
311 /* Expedited grace-period control */
312 /* for CPU stopping. */
313 319
314 /* 2) batch handling */ 320 /* 2) batch handling */
315 /* 321 /*
@@ -363,7 +369,7 @@ struct rcu_data {
363 369
364 /* 5) __rcu_pending() statistics. */ 370 /* 5) __rcu_pending() statistics. */
365 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 371 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
366 unsigned long n_rp_qs_pending; 372 unsigned long n_rp_core_needs_qs;
367 unsigned long n_rp_report_qs; 373 unsigned long n_rp_report_qs;
368 unsigned long n_rp_cb_ready; 374 unsigned long n_rp_cb_ready;
369 unsigned long n_rp_cpu_needs_gp; 375 unsigned long n_rp_cpu_needs_gp;
@@ -378,7 +384,6 @@ struct rcu_data {
378 struct rcu_head oom_head; 384 struct rcu_head oom_head;
379#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 385#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
380 struct mutex exp_funnel_mutex; 386 struct mutex exp_funnel_mutex;
381 bool exp_done; /* Expedited QS for this CPU? */
382 387
383 /* 7) Callback offloading. */ 388 /* 7) Callback offloading. */
384#ifdef CONFIG_RCU_NOCB_CPU 389#ifdef CONFIG_RCU_NOCB_CPU
@@ -412,13 +417,6 @@ struct rcu_data {
412 struct rcu_state *rsp; 417 struct rcu_state *rsp;
413}; 418};
414 419
415/* Values for fqs_state field in struct rcu_state. */
416#define RCU_GP_IDLE 0 /* No grace period in progress. */
417#define RCU_GP_INIT 1 /* Grace period being initialized. */
418#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
419#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
420#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
421
422/* Values for nocb_defer_wakeup field in struct rcu_data. */ 420/* Values for nocb_defer_wakeup field in struct rcu_data. */
423#define RCU_NOGP_WAKE_NOT 0 421#define RCU_NOGP_WAKE_NOT 0
424#define RCU_NOGP_WAKE 1 422#define RCU_NOGP_WAKE 1
@@ -464,14 +462,13 @@ struct rcu_state {
464 /* shut bogus gcc warning) */ 462 /* shut bogus gcc warning) */
465 u8 flavor_mask; /* bit in flavor mask. */ 463 u8 flavor_mask; /* bit in flavor mask. */
466 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 464 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
467 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 465 call_rcu_func_t call; /* call_rcu() flavor. */
468 void (*func)(struct rcu_head *head)); 466 int ncpus; /* # CPUs seen so far. */
469 467
470 /* The following fields are guarded by the root rcu_node's lock. */ 468 /* The following fields are guarded by the root rcu_node's lock. */
471 469
472 u8 fqs_state ____cacheline_internodealigned_in_smp; 470 u8 boost ____cacheline_internodealigned_in_smp;
473 /* Force QS state. */ 471 /* Subject to priority boost. */
474 u8 boost; /* Subject to priority boost. */
475 unsigned long gpnum; /* Current gp number. */ 472 unsigned long gpnum; /* Current gp number. */
476 unsigned long completed; /* # of last completed gp. */ 473 unsigned long completed; /* # of last completed gp. */
477 struct task_struct *gp_kthread; /* Task for grace periods. */ 474 struct task_struct *gp_kthread; /* Task for grace periods. */
@@ -508,6 +505,7 @@ struct rcu_state {
508 atomic_long_t expedited_normal; /* # fallbacks to normal. */ 505 atomic_long_t expedited_normal; /* # fallbacks to normal. */
509 atomic_t expedited_need_qs; /* # CPUs left to check in. */ 506 atomic_t expedited_need_qs; /* # CPUs left to check in. */
510 wait_queue_head_t expedited_wq; /* Wait for check-ins. */ 507 wait_queue_head_t expedited_wq; /* Wait for check-ins. */
508 int ncpus_snap; /* # CPUs seen last time. */
511 509
512 unsigned long jiffies_force_qs; /* Time at which to invoke */ 510 unsigned long jiffies_force_qs; /* Time at which to invoke */
513 /* force_quiescent_state(). */ 511 /* force_quiescent_state(). */
@@ -538,8 +536,8 @@ struct rcu_state {
538#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ 536#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
539#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ 537#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
540 538
541/* Values for rcu_state structure's gp_flags field. */ 539/* Values for rcu_state structure's gp_state field. */
542#define RCU_GP_WAIT_INIT 0 /* Initial state. */ 540#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */
543#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ 541#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
544#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ 542#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
545#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ 543#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
@@ -582,9 +580,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
582#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 580#endif /* #ifdef CONFIG_HOTPLUG_CPU */
583static void rcu_print_detail_task_stall(struct rcu_state *rsp); 581static void rcu_print_detail_task_stall(struct rcu_state *rsp);
584static int rcu_print_task_stall(struct rcu_node *rnp); 582static int rcu_print_task_stall(struct rcu_node *rnp);
583static int rcu_print_task_exp_stall(struct rcu_node *rnp);
585static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 584static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
586static void rcu_preempt_check_callbacks(void); 585static void rcu_preempt_check_callbacks(void);
587void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 586void call_rcu(struct rcu_head *head, rcu_callback_t func);
588static void __init __rcu_init_preempt(void); 587static void __init __rcu_init_preempt(void);
589static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 588static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
590static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 589static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b2bf3963a0ae..630c19772630 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -101,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
101static struct rcu_state *const rcu_state_p = &rcu_preempt_state; 101static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
102static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; 102static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
103 103
104static int rcu_preempted_readers_exp(struct rcu_node *rnp);
105static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 104static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
106 bool wake); 105 bool wake);
107 106
@@ -114,6 +113,147 @@ static void __init rcu_bootup_announce(void)
114 rcu_bootup_announce_oddness(); 113 rcu_bootup_announce_oddness();
115} 114}
116 115
116/* Flags for rcu_preempt_ctxt_queue() decision table. */
117#define RCU_GP_TASKS 0x8
118#define RCU_EXP_TASKS 0x4
119#define RCU_GP_BLKD 0x2
120#define RCU_EXP_BLKD 0x1
121
122/*
123 * Queues a task preempted within an RCU-preempt read-side critical
124 * section into the appropriate location within the ->blkd_tasks list,
125 * depending on the states of any ongoing normal and expedited grace
126 * periods. The ->gp_tasks pointer indicates which element the normal
127 * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
128 * indicates which element the expedited grace period is waiting on (again,
129 * NULL if none). If a grace period is waiting on a given element in the
130 * ->blkd_tasks list, it also waits on all subsequent elements. Thus,
131 * adding a task to the tail of the list blocks any grace period that is
132 * already waiting on one of the elements. In contrast, adding a task
133 * to the head of the list won't block any grace period that is already
134 * waiting on one of the elements.
135 *
136 * This queuing is imprecise, and can sometimes make an ongoing grace
137 * period wait for a task that is not strictly speaking blocking it.
138 * Given the choice, we needlessly block a normal grace period rather than
139 * blocking an expedited grace period.
140 *
141 * Note that an endless sequence of expedited grace periods still cannot
142 * indefinitely postpone a normal grace period. Eventually, all of the
143 * fixed number of preempted tasks blocking the normal grace period that are
144 * not also blocking the expedited grace period will resume and complete
145 * their RCU read-side critical sections. At that point, the ->gp_tasks
146 * pointer will equal the ->exp_tasks pointer, at which point the end of
147 * the corresponding expedited grace period will also be the end of the
148 * normal grace period.
149 */
150static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
151 unsigned long flags) __releases(rnp->lock)
152{
153 int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
154 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
155 (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
156 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
157 struct task_struct *t = current;
158
159 /*
160 * Decide where to queue the newly blocked task. In theory,
161 * this could be an if-statement. In practice, when I tried
162 * that, it was quite messy.
163 */
164 switch (blkd_state) {
165 case 0:
166 case RCU_EXP_TASKS:
167 case RCU_EXP_TASKS + RCU_GP_BLKD:
168 case RCU_GP_TASKS:
169 case RCU_GP_TASKS + RCU_EXP_TASKS:
170
171 /*
172 * Blocking neither GP, or first task blocking the normal
173 * GP but not blocking the already-waiting expedited GP.
174 * Queue at the head of the list to avoid unnecessarily
175 * blocking the already-waiting GPs.
176 */
177 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
178 break;
179
180 case RCU_EXP_BLKD:
181 case RCU_GP_BLKD:
182 case RCU_GP_BLKD + RCU_EXP_BLKD:
183 case RCU_GP_TASKS + RCU_EXP_BLKD:
184 case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
185 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
186
187 /*
188 * First task arriving that blocks either GP, or first task
189 * arriving that blocks the expedited GP (with the normal
190 * GP already waiting), or a task arriving that blocks
191 * both GPs with both GPs already waiting. Queue at the
192 * tail of the list to avoid any GP waiting on any of the
193 * already queued tasks that are not blocking it.
194 */
195 list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
196 break;
197
198 case RCU_EXP_TASKS + RCU_EXP_BLKD:
199 case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
200 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
201
202 /*
203 * Second or subsequent task blocking the expedited GP.
204 * The task either does not block the normal GP, or is the
205 * first task blocking the normal GP. Queue just after
206 * the first task blocking the expedited GP.
207 */
208 list_add(&t->rcu_node_entry, rnp->exp_tasks);
209 break;
210
211 case RCU_GP_TASKS + RCU_GP_BLKD:
212 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
213
214 /*
215 * Second or subsequent task blocking the normal GP.
216 * The task does not block the expedited GP. Queue just
217 * after the first task blocking the normal GP.
218 */
219 list_add(&t->rcu_node_entry, rnp->gp_tasks);
220 break;
221
222 default:
223
224 /* Yet another exercise in excessive paranoia. */
225 WARN_ON_ONCE(1);
226 break;
227 }
228
229 /*
230 * We have now queued the task. If it was the first one to
231 * block either grace period, update the ->gp_tasks and/or
232 * ->exp_tasks pointers, respectively, to reference the newly
233 * blocked tasks.
234 */
235 if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD))
236 rnp->gp_tasks = &t->rcu_node_entry;
237 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
238 rnp->exp_tasks = &t->rcu_node_entry;
239 raw_spin_unlock(&rnp->lock);
240
241 /*
242 * Report the quiescent state for the expedited GP. This expedited
243 * GP should not be able to end until we report, so there should be
244 * no need to check for a subsequent expedited GP. (Though we are
245 * still in a quiescent state in any case.)
246 */
247 if (blkd_state & RCU_EXP_BLKD &&
248 t->rcu_read_unlock_special.b.exp_need_qs) {
249 t->rcu_read_unlock_special.b.exp_need_qs = false;
250 rcu_report_exp_rdp(rdp->rsp, rdp, true);
251 } else {
252 WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
253 }
254 local_irq_restore(flags);
255}
256
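
rcu_preempt_ctxt_queue() above packs four independent conditions into one small integer so that a single switch can cover every combination. A condensed sketch of that encoding follows; it is illustrative only, and the two "queue just after an existing blocker" cases plus the impossible combinations are collapsed into one branch for brevity:

/*
 * Decision-table-as-bitmask: four booleans become a 4-bit state, and
 * the switch groups the states by where the task should be queued.
 */
#include <stdio.h>

#define GP_TASKS  0x8  /* normal GP already waiting on queued tasks    */
#define EXP_TASKS 0x4  /* expedited GP already waiting on queued tasks */
#define GP_BLKD   0x2  /* this task blocks the normal GP               */
#define EXP_BLKD  0x1  /* this task blocks the expedited GP            */

static const char *queue_position(int state)
{
	switch (state) {
	case 0:
	case EXP_TASKS:
	case EXP_TASKS + GP_BLKD:
	case GP_TASKS:
	case GP_TASKS + EXP_TASKS:
		return "head";   /* blocks no already-waiting GP     */
	case EXP_BLKD:
	case GP_BLKD:
	case GP_BLKD + EXP_BLKD:
	case GP_TASKS + EXP_BLKD:
	case GP_TASKS + GP_BLKD + EXP_BLKD:
	case GP_TASKS + EXP_TASKS + GP_BLKD + EXP_BLKD:
		return "tail";   /* first blocker of some waiting GP */
	default:
		return "middle"; /* just after an existing blocker   */
	}
}

int main(void)
{
	int gp_tasks = 1, exp_tasks = 0, gp_blkd = 1, exp_blkd = 0;
	int state = (gp_tasks ? GP_TASKS : 0) | (exp_tasks ? EXP_TASKS : 0) |
		    (gp_blkd ? GP_BLKD : 0) | (exp_blkd ? EXP_BLKD : 0);

	printf("state %#x -> queue at %s\n", state, queue_position(state));
	return 0;
}
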
117/* 257/*
118 * Record a preemptible-RCU quiescent state for the specified CPU. Note 258 * Record a preemptible-RCU quiescent state for the specified CPU. Note
119 * that this just means that the task currently running on the CPU is 259 * that this just means that the task currently running on the CPU is
@@ -125,11 +265,11 @@ static void __init rcu_bootup_announce(void)
125 */ 265 */
126static void rcu_preempt_qs(void) 266static void rcu_preempt_qs(void)
127{ 267{
128 if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { 268 if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
129 trace_rcu_grace_period(TPS("rcu_preempt"), 269 trace_rcu_grace_period(TPS("rcu_preempt"),
130 __this_cpu_read(rcu_data_p->gpnum), 270 __this_cpu_read(rcu_data_p->gpnum),
131 TPS("cpuqs")); 271 TPS("cpuqs"));
132 __this_cpu_write(rcu_data_p->passed_quiesce, 1); 272 __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false);
133 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ 273 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
134 current->rcu_read_unlock_special.b.need_qs = false; 274 current->rcu_read_unlock_special.b.need_qs = false;
135 } 275 }
@@ -167,42 +307,18 @@ static void rcu_preempt_note_context_switch(void)
167 t->rcu_blocked_node = rnp; 307 t->rcu_blocked_node = rnp;
168 308
169 /* 309 /*
170 * If this CPU has already checked in, then this task 310 * Verify the CPU's sanity, trace the preemption, and
171 * will hold up the next grace period rather than the 311 * then queue the task as required based on the states
172 * current grace period. Queue the task accordingly. 312 * of any ongoing and expedited grace periods.
173 * If the task is queued for the current grace period
174 * (i.e., this CPU has not yet passed through a quiescent
175 * state for the current grace period), then as long
176 * as that task remains queued, the current grace period
177 * cannot end. Note that there is some uncertainty as
178 * to exactly when the current grace period started.
179 * We take a conservative approach, which can result
180 * in unnecessarily waiting on tasks that started very
181 * slightly after the current grace period began. C'est
182 * la vie!!!
183 *
184 * But first, note that the current CPU must still be
185 * on line!
186 */ 313 */
187 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); 314 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
188 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 315 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
189 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
190 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
191 rnp->gp_tasks = &t->rcu_node_entry;
192 if (IS_ENABLED(CONFIG_RCU_BOOST) &&
193 rnp->boost_tasks != NULL)
194 rnp->boost_tasks = rnp->gp_tasks;
195 } else {
196 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
197 if (rnp->qsmask & rdp->grpmask)
198 rnp->gp_tasks = &t->rcu_node_entry;
199 }
200 trace_rcu_preempt_task(rdp->rsp->name, 316 trace_rcu_preempt_task(rdp->rsp->name,
201 t->pid, 317 t->pid,
202 (rnp->qsmask & rdp->grpmask) 318 (rnp->qsmask & rdp->grpmask)
203 ? rnp->gpnum 319 ? rnp->gpnum
204 : rnp->gpnum + 1); 320 : rnp->gpnum + 1);
205 raw_spin_unlock_irqrestore(&rnp->lock, flags); 321 rcu_preempt_ctxt_queue(rnp, rdp, flags);
206 } else if (t->rcu_read_lock_nesting < 0 && 322 } else if (t->rcu_read_lock_nesting < 0 &&
207 t->rcu_read_unlock_special.s) { 323 t->rcu_read_unlock_special.s) {
208 324
@@ -272,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t)
272 unsigned long flags; 388 unsigned long flags;
273 struct list_head *np; 389 struct list_head *np;
274 bool drop_boost_mutex = false; 390 bool drop_boost_mutex = false;
391 struct rcu_data *rdp;
275 struct rcu_node *rnp; 392 struct rcu_node *rnp;
276 union rcu_special special; 393 union rcu_special special;
277 394
@@ -282,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
282 local_irq_save(flags); 399 local_irq_save(flags);
283 400
284 /* 401 /*
285 * If RCU core is waiting for this CPU to exit critical section, 402 * If RCU core is waiting for this CPU to exit its critical section,
286 * let it know that we have done so. Because irqs are disabled, 403 * report the fact that it has exited. Because irqs are disabled,
287 * t->rcu_read_unlock_special cannot change. 404 * t->rcu_read_unlock_special cannot change.
288 */ 405 */
289 special = t->rcu_read_unlock_special; 406 special = t->rcu_read_unlock_special;
@@ -296,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t)
296 } 413 }
297 } 414 }
298 415
416 /*
417 * Respond to a request for an expedited grace period, but only if
418 * we were not preempted, meaning that we were running on the same
419 * CPU throughout. If we were preempted, the exp_need_qs flag
420 * would have been cleared at the time of the first preemption,
421 * and the quiescent state would be reported when we were dequeued.
422 */
423 if (special.b.exp_need_qs) {
424 WARN_ON_ONCE(special.b.blocked);
425 t->rcu_read_unlock_special.b.exp_need_qs = false;
426 rdp = this_cpu_ptr(rcu_state_p->rda);
427 rcu_report_exp_rdp(rcu_state_p, rdp, true);
428 if (!t->rcu_read_unlock_special.s) {
429 local_irq_restore(flags);
430 return;
431 }
432 }
433
299 /* Hardware IRQ handlers cannot block, complain if they get here. */ 434 /* Hardware IRQ handlers cannot block, complain if they get here. */
300 if (in_irq() || in_serving_softirq()) { 435 if (in_irq() || in_serving_softirq()) {
301 lockdep_rcu_suspicious(__FILE__, __LINE__, 436 lockdep_rcu_suspicious(__FILE__, __LINE__,
302 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); 437 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
303 pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", 438 pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
304 t->rcu_read_unlock_special.s, 439 t->rcu_read_unlock_special.s,
305 t->rcu_read_unlock_special.b.blocked, 440 t->rcu_read_unlock_special.b.blocked,
441 t->rcu_read_unlock_special.b.exp_need_qs,
306 t->rcu_read_unlock_special.b.need_qs); 442 t->rcu_read_unlock_special.b.need_qs);
307 local_irq_restore(flags); 443 local_irq_restore(flags);
308 return; 444 return;
@@ -329,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t)
329 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 465 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
330 } 466 }
331 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 467 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
332 empty_exp = !rcu_preempted_readers_exp(rnp); 468 empty_exp = sync_rcu_preempt_exp_done(rnp);
333 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 469 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
334 np = rcu_next_node_entry(t, rnp); 470 np = rcu_next_node_entry(t, rnp);
335 list_del_init(&t->rcu_node_entry); 471 list_del_init(&t->rcu_node_entry);
@@ -353,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
353 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 489 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
354 * so we must take a snapshot of the expedited state. 490 * so we must take a snapshot of the expedited state.
355 */ 491 */
356 empty_exp_now = !rcu_preempted_readers_exp(rnp); 492 empty_exp_now = sync_rcu_preempt_exp_done(rnp);
357 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { 493 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
358 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 494 trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
359 rnp->gpnum, 495 rnp->gpnum,
@@ -450,6 +586,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
450} 586}
451 587
452/* 588/*
589 * Scan the current list of tasks blocked within RCU read-side critical
590 * sections, printing out the tid of each that is blocking the current
591 * expedited grace period.
592 */
593static int rcu_print_task_exp_stall(struct rcu_node *rnp)
594{
595 struct task_struct *t;
596 int ndetected = 0;
597
598 if (!rnp->exp_tasks)
599 return 0;
600 t = list_entry(rnp->exp_tasks->prev,
601 struct task_struct, rcu_node_entry);
602 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
603 pr_cont(" P%d", t->pid);
604 ndetected++;
605 }
606 return ndetected;
607}
608
609/*
453 * Check that the list of blocked tasks for the newly completed grace 610 * Check that the list of blocked tasks for the newly completed grace
454 * period is in fact empty. It is a serious bug to complete a grace 611 * period is in fact empty. It is a serious bug to complete a grace
455 * period that still has RCU readers blocked! This function must be 612 * period that still has RCU readers blocked! This function must be
@@ -483,8 +640,8 @@ static void rcu_preempt_check_callbacks(void)
483 return; 640 return;
484 } 641 }
485 if (t->rcu_read_lock_nesting > 0 && 642 if (t->rcu_read_lock_nesting > 0 &&
486 __this_cpu_read(rcu_data_p->qs_pending) && 643 __this_cpu_read(rcu_data_p->core_needs_qs) &&
487 !__this_cpu_read(rcu_data_p->passed_quiesce)) 644 __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm))
488 t->rcu_read_unlock_special.b.need_qs = true; 645 t->rcu_read_unlock_special.b.need_qs = true;
489} 646}
490 647
@@ -500,7 +657,7 @@ static void rcu_preempt_do_callbacks(void)
500/* 657/*
501 * Queue a preemptible-RCU callback for invocation after a grace period. 658 * Queue a preemptible-RCU callback for invocation after a grace period.
502 */ 659 */
503void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 660void call_rcu(struct rcu_head *head, rcu_callback_t func)
504{ 661{
505 __call_rcu(head, func, rcu_state_p, -1, 0); 662 __call_rcu(head, func, rcu_state_p, -1, 0);
506} 663}
@@ -535,155 +692,41 @@ void synchronize_rcu(void)
535} 692}
536EXPORT_SYMBOL_GPL(synchronize_rcu); 693EXPORT_SYMBOL_GPL(synchronize_rcu);
537 694
538static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
539
540/*
541 * Return non-zero if there are any tasks in RCU read-side critical
542 * sections blocking the current preemptible-RCU expedited grace period.
543 * If there is no preemptible-RCU expedited grace period currently in
544 * progress, returns zero unconditionally.
545 */
546static int rcu_preempted_readers_exp(struct rcu_node *rnp)
547{
548 return rnp->exp_tasks != NULL;
549}
550
551/*
552 * return non-zero if there is no RCU expedited grace period in progress
553 * for the specified rcu_node structure, in other words, if all CPUs and
554 * tasks covered by the specified rcu_node structure have done their bit
555 * for the current expedited grace period. Works only for preemptible
556 * RCU -- other RCU implementation use other means.
557 *
558 * Caller must hold the root rcu_node's exp_funnel_mutex.
559 */
560static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
561{
562 return !rcu_preempted_readers_exp(rnp) &&
563 READ_ONCE(rnp->expmask) == 0;
564}
565
566/*
567 * Report the exit from RCU read-side critical section for the last task
568 * that queued itself during or before the current expedited preemptible-RCU
569 * grace period. This event is reported either to the rcu_node structure on
570 * which the task was queued or to one of that rcu_node structure's ancestors,
571 * recursively up the tree. (Calm down, calm down, we do the recursion
572 * iteratively!)
573 *
574 * Caller must hold the root rcu_node's exp_funnel_mutex.
575 */
576static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
577 bool wake)
578{
579 unsigned long flags;
580 unsigned long mask;
581
582 raw_spin_lock_irqsave(&rnp->lock, flags);
583 smp_mb__after_unlock_lock();
584 for (;;) {
585 if (!sync_rcu_preempt_exp_done(rnp)) {
586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
587 break;
588 }
589 if (rnp->parent == NULL) {
590 raw_spin_unlock_irqrestore(&rnp->lock, flags);
591 if (wake) {
592 smp_mb(); /* EGP done before wake_up(). */
593 wake_up(&sync_rcu_preempt_exp_wq);
594 }
595 break;
596 }
597 mask = rnp->grpmask;
598 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
599 rnp = rnp->parent;
600 raw_spin_lock(&rnp->lock); /* irqs already disabled */
601 smp_mb__after_unlock_lock();
602 rnp->expmask &= ~mask;
603 }
604}
605
606/* 695/*
607 * Snapshot the tasks blocking the newly started preemptible-RCU expedited 696 * Remote handler for smp_call_function_single(). If there is an
608 * grace period for the specified rcu_node structure, phase 1. If there 697 * RCU read-side critical section in effect, request that the
609 * are such tasks, set the ->expmask bits up the rcu_node tree and also 698 * next rcu_read_unlock() record the quiescent state up the
610 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 699 * ->expmask fields in the rcu_node tree. Otherwise, immediately
611 * that work is needed here. 700 * report the quiescent state.
612 *
613 * Caller must hold the root rcu_node's exp_funnel_mutex.
614 */ 701 */
615static void 702static void sync_rcu_exp_handler(void *info)
616sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
617{ 703{
618 unsigned long flags; 704 struct rcu_data *rdp;
619 unsigned long mask; 705 struct rcu_state *rsp = info;
620 struct rcu_node *rnp_up; 706 struct task_struct *t = current;
621
622 raw_spin_lock_irqsave(&rnp->lock, flags);
623 smp_mb__after_unlock_lock();
624 WARN_ON_ONCE(rnp->expmask);
625 WARN_ON_ONCE(rnp->exp_tasks);
626 if (!rcu_preempt_has_tasks(rnp)) {
627 /* No blocked tasks, nothing to do. */
628 raw_spin_unlock_irqrestore(&rnp->lock, flags);
629 return;
630 }
631 /* Call for Phase 2 and propagate ->expmask bits up the tree. */
632 rnp->expmask = 1;
633 rnp_up = rnp;
634 while (rnp_up->parent) {
635 mask = rnp_up->grpmask;
636 rnp_up = rnp_up->parent;
637 if (rnp_up->expmask & mask)
638 break;
639 raw_spin_lock(&rnp_up->lock); /* irqs already off */
640 smp_mb__after_unlock_lock();
641 rnp_up->expmask |= mask;
642 raw_spin_unlock(&rnp_up->lock); /* irqs still off */
643 }
644 raw_spin_unlock_irqrestore(&rnp->lock, flags);
645}
646
647/*
648 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
649 * grace period for the specified rcu_node structure, phase 2. If the
650 * leaf rcu_node structure has its ->expmask field set, check for tasks.
651 * If there are some, clear ->expmask and set ->exp_tasks accordingly,
652 * then initiate RCU priority boosting. Otherwise, clear ->expmask and
653 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
654 * enabling rcu_read_unlock_special() to do the bit-clearing.
655 *
656 * Caller must hold the root rcu_node's exp_funnel_mutex.
657 */
658static void
659sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
660{
661 unsigned long flags;
662
663 raw_spin_lock_irqsave(&rnp->lock, flags);
664 smp_mb__after_unlock_lock();
665 if (!rnp->expmask) {
666 /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
667 raw_spin_unlock_irqrestore(&rnp->lock, flags);
668 return;
669 }
670
671 /* Phase 1 is over. */
672 rnp->expmask = 0;
673 707
674 /* 708 /*
675 * If there are still blocked tasks, set up ->exp_tasks so that 709 * Within an RCU read-side critical section, request that the next
 676 * rcu_read_unlock_special() will wake us and then boost them. 710 * rcu_read_unlock() report the quiescent state. Unless this RCU read-side critical
711 * section has already blocked, in which case it is already set
712 * up for the expedited grace period to wait on it.
677 */ 713 */
678 if (rcu_preempt_has_tasks(rnp)) { 714 if (t->rcu_read_lock_nesting > 0 &&
679 rnp->exp_tasks = rnp->blkd_tasks.next; 715 !t->rcu_read_unlock_special.b.blocked) {
680 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 716 t->rcu_read_unlock_special.b.exp_need_qs = true;
681 return; 717 return;
682 } 718 }
683 719
684 /* No longer any blocked tasks, so undo bit setting. */ 720 /*
685 raw_spin_unlock_irqrestore(&rnp->lock, flags); 721 * We are either exiting an RCU read-side critical section (negative
686 rcu_report_exp_rnp(rsp, rnp, false); 722 * values of t->rcu_read_lock_nesting) or are not in one at all
723 * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
724 * read-side critical section that blocked before this expedited
725 * grace period started. Either way, we can immediately report
726 * the quiescent state.
727 */
728 rdp = this_cpu_ptr(rsp->rda);
729 rcu_report_exp_rdp(rsp, rdp, true);
687} 730}
688 731
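A hypothetical truth table for the handler above: only a CPU that is inside an RCU read-side critical section which has not yet blocked defers its report to the eventual rcu_read_unlock(); every other state reports the expedited quiescent state immediately.

#include <stdio.h>

/* demo stand-ins for t->rcu_read_lock_nesting and
 * t->rcu_read_unlock_special.b.blocked */
static const char *exp_handler_action(int nesting, int blocked)
{
        if (nesting > 0 && !blocked)
                return "set exp_need_qs, report at rcu_read_unlock()";
        return "report quiescent state immediately";
}

int main(void)
{
        printf("no reader:                %s\n", exp_handler_action(0, 0));
        printf("active reader:            %s\n", exp_handler_action(1, 0));
        printf("reader already blocked:   %s\n", exp_handler_action(1, 1));
        printf("reader currently exiting: %s\n", exp_handler_action(-1, 0));
        return 0;
}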
689/** 732/**
@@ -713,24 +756,12 @@ void synchronize_rcu_expedited(void)
713 756
714 rcu_exp_gp_seq_start(rsp); 757 rcu_exp_gp_seq_start(rsp);
715 758
716 /* force all RCU readers onto ->blkd_tasks lists. */ 759 /* Initialize the rcu_node tree in preparation for the wait. */
717 synchronize_sched_expedited(); 760 sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
718
719 /*
720 * Snapshot current state of ->blkd_tasks lists into ->expmask.
721 * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
722 * to start clearing them. Doing this in one phase leads to
723 * strange races between setting and clearing bits, so just say "no"!
724 */
725 rcu_for_each_leaf_node(rsp, rnp)
726 sync_rcu_preempt_exp_init1(rsp, rnp);
727 rcu_for_each_leaf_node(rsp, rnp)
728 sync_rcu_preempt_exp_init2(rsp, rnp);
729 761
730 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 762 /* Wait for snapshotted ->blkd_tasks lists to drain. */
731 rnp = rcu_get_root(rsp); 763 rnp = rcu_get_root(rsp);
732 wait_event(sync_rcu_preempt_exp_wq, 764 synchronize_sched_expedited_wait(rsp);
733 sync_rcu_preempt_exp_done(rnp));
734 765
735 /* Clean up and exit. */ 766 /* Clean up and exit. */
736 rcu_exp_gp_seq_end(rsp); 767 rcu_exp_gp_seq_end(rsp);
@@ -835,6 +866,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
835} 866}
836 867
837/* 868/*
869 * Because preemptible RCU does not exist, we never have to check for
870 * tasks blocked within RCU read-side critical sections that are
871 * blocking the current expedited grace period.
872 */
873static int rcu_print_task_exp_stall(struct rcu_node *rnp)
874{
875 return 0;
876}
877
878/*
838 * Because there is no preemptible RCU, there can be no readers blocked, 879 * Because there is no preemptible RCU, there can be no readers blocked,
839 * so there is no need to check for blocked tasks. So check only for 880 * so there is no need to check for blocked tasks. So check only for
840 * bogus qsmask values. 881 * bogus qsmask values.
@@ -1702,8 +1743,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1702 ticks_value = rsp->gpnum - rdp->gpnum; 1743 ticks_value = rsp->gpnum - rdp->gpnum;
1703 } 1744 }
1704 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1745 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1705 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", 1746 pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
1706 cpu, ticks_value, ticks_title, 1747 cpu,
1748 "O."[!!cpu_online(cpu)],
1749 "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
1750 "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
1751 ticks_value, ticks_title,
1707 atomic_read(&rdtp->dynticks) & 0xfff, 1752 atomic_read(&rdtp->dynticks) & 0xfff,
1708 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1753 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1709 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1754 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
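The reworked stall line picks each status character by indexing a two-character string with a boolean, e.g. "O."[!!cpu_online(cpu)]: index 0 (condition false) yields the flag letter, index 1 (condition true) yields a quiet '.'. A tiny standalone illustration with made-up per-CPU state:

#include <stdio.h>

int main(void)
{
        int online = 1, in_qsmaskinit = 0, in_qsmaskinitnext = 1;

        /* "X."[!!cond] is '.' when the condition holds, the letter when not */
        printf("3-%c%c%c:\n",
               "O."[!!online],             /* '.'  - CPU is online          */
               "o."[!!in_qsmaskinit],      /* 'o'  - not in ->qsmaskinit    */
               "N."[!!in_qsmaskinitnext]); /* '.'  - in ->qsmaskinitnext    */
        return 0;
}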
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 6fc4c5ff3bb5..ef7093cc9b5c 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
117 117
118 if (!rdp->beenonline) 118 if (!rdp->beenonline)
119 return; 119 return;
120 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", 120 seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
121 rdp->cpu, 121 rdp->cpu,
122 cpu_is_offline(rdp->cpu) ? '!' : ' ', 122 cpu_is_offline(rdp->cpu) ? '!' : ' ',
123 ulong2long(rdp->completed), ulong2long(rdp->gpnum), 123 ulong2long(rdp->completed), ulong2long(rdp->gpnum),
124 rdp->passed_quiesce, 124 rdp->cpu_no_qs.b.norm,
125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), 125 rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
126 rdp->qs_pending); 126 rdp->core_needs_qs);
127 seq_printf(m, " dt=%d/%llx/%d df=%lu", 127 seq_printf(m, " dt=%d/%llx/%d df=%lu",
128 atomic_read(&rdp->dynticks->dynticks), 128 atomic_read(&rdp->dynticks->dynticks),
129 rdp->dynticks->dynticks_nesting, 129 rdp->dynticks->dynticks_nesting,
@@ -268,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
268 gpnum = rsp->gpnum; 268 gpnum = rsp->gpnum;
269 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", 269 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
270 ulong2long(rsp->completed), ulong2long(gpnum), 270 ulong2long(rsp->completed), ulong2long(gpnum),
271 rsp->fqs_state, 271 rsp->gp_state,
272 (long)(rsp->jiffies_force_qs - jiffies), 272 (long)(rsp->jiffies_force_qs - jiffies),
273 (int)(jiffies & 0xffff)); 273 (int)(jiffies & 0xffff));
274 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", 274 seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -361,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
361 cpu_is_offline(rdp->cpu) ? '!' : ' ', 361 cpu_is_offline(rdp->cpu) ? '!' : ' ',
362 rdp->n_rcu_pending); 362 rdp->n_rcu_pending);
363 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", 363 seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
364 rdp->n_rp_qs_pending, 364 rdp->n_rp_core_needs_qs,
365 rdp->n_rp_report_qs, 365 rdp->n_rp_report_qs,
366 rdp->n_rp_cb_ready, 366 rdp->n_rp_cb_ready,
367 rdp->n_rp_cpu_needs_gp); 367 rdp->n_rp_cpu_needs_gp);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 7a0b3bc7c5ed..5f748c5a40f0 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -534,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void);
534 * Post an RCU-tasks callback. First call must be from process context 534 * Post an RCU-tasks callback. First call must be from process context
 535 * after the scheduler is fully operational. 535 * after the scheduler is fully operational.
536 */ 536 */
537void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) 537void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
538{ 538{
539 unsigned long flags; 539 unsigned long flags;
540 bool needwake; 540 bool needwake;
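call_rcu() and call_rcu_tasks() now take the rcu_callback_t typedef rather than spelling out the function-pointer type in every prototype. A hypothetical userspace sketch of the same kind of cleanup (demo names, not the kernel's):

#include <stdio.h>

struct rcu_head_demo { struct rcu_head_demo *next; };

/* analog of: typedef void (*rcu_callback_t)(struct rcu_head *head); */
typedef void (*rcu_callback_demo_t)(struct rcu_head_demo *head);

/* every call_rcu()-style prototype can now share one spelling of the type */
static void call_rcu_demo(struct rcu_head_demo *head, rcu_callback_demo_t func)
{
        func(head);     /* a real implementation would defer the invocation */
}

static void my_cb(struct rcu_head_demo *head)
{
        (void)head;
        printf("callback invoked after a (pretend) grace period\n");
}

int main(void)
{
        struct rcu_head_demo h = { NULL };

        call_rcu_demo(&h, my_cb);
        return 0;
}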
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2f9c92884817..4d568ac9319e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
817 /* 817 /*
818 * SCHED_IDLE tasks get minimal weight: 818 * SCHED_IDLE tasks get minimal weight:
819 */ 819 */
820 if (p->policy == SCHED_IDLE) { 820 if (idle_policy(p->policy)) {
821 load->weight = scale_load(WEIGHT_IDLEPRIO); 821 load->weight = scale_load(WEIGHT_IDLEPRIO);
822 load->inv_weight = WMULT_IDLEPRIO; 822 load->inv_weight = WMULT_IDLEPRIO;
823 return; 823 return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
827 load->inv_weight = prio_to_wmult[prio]; 827 load->inv_weight = prio_to_wmult[prio];
828} 828}
829 829
830static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 830static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
831{ 831{
832 update_rq_clock(rq); 832 update_rq_clock(rq);
833 sched_info_queued(rq, p); 833 if (!(flags & ENQUEUE_RESTORE))
834 sched_info_queued(rq, p);
834 p->sched_class->enqueue_task(rq, p, flags); 835 p->sched_class->enqueue_task(rq, p, flags);
835} 836}
836 837
837static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 838static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
838{ 839{
839 update_rq_clock(rq); 840 update_rq_clock(rq);
840 sched_info_dequeued(rq, p); 841 if (!(flags & DEQUEUE_SAVE))
842 sched_info_dequeued(rq, p);
841 p->sched_class->dequeue_task(rq, p, flags); 843 p->sched_class->dequeue_task(rq, p, flags);
842} 844}
843 845
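enqueue_task() and dequeue_task() now skip the schedstats hooks when the new ENQUEUE_RESTORE/DEQUEUE_SAVE flags mark a task that is only leaving the runqueue briefly while one of its attributes is changed. A hypothetical userspace sketch of that convention (flag values invented for the demo):

#include <stdio.h>

#define DEQUEUE_SAVE_DEMO       0x02    /* made-up flag values */
#define ENQUEUE_RESTORE_DEMO    0x02

static int queued_stats, dequeued_stats;        /* sched_info_*() stand-ins */

static void enqueue_task_demo(int flags)
{
        if (!(flags & ENQUEUE_RESTORE_DEMO))
                queued_stats++;         /* only "real" enqueues are accounted */
        /* p->sched_class->enqueue_task() would run here */
}

static void dequeue_task_demo(int flags)
{
        if (!(flags & DEQUEUE_SAVE_DEMO))
                dequeued_stats++;       /* only "real" dequeues are accounted */
        /* p->sched_class->dequeue_task() would run here */
}

int main(void)
{
        enqueue_task_demo(0);                           /* wakeup: counted        */
        dequeue_task_demo(DEQUEUE_SAVE_DEMO);           /* attribute change...    */
        enqueue_task_demo(ENQUEUE_RESTORE_DEMO);        /* ...neither leg counted */
        printf("queued=%d dequeued=%d\n", queued_stats, dequeued_stats);  /* 1 0 */
        return 0;
}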
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1178 * holding rq->lock. 1180 * holding rq->lock.
1179 */ 1181 */
1180 lockdep_assert_held(&rq->lock); 1182 lockdep_assert_held(&rq->lock);
1181 dequeue_task(rq, p, 0); 1183 dequeue_task(rq, p, DEQUEUE_SAVE);
1182 } 1184 }
1183 if (running) 1185 if (running)
1184 put_prev_task(rq, p); 1186 put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1188 if (running) 1190 if (running)
1189 p->sched_class->set_curr_task(rq); 1191 p->sched_class->set_curr_task(rq);
1190 if (queued) 1192 if (queued)
1191 enqueue_task(rq, p, 0); 1193 enqueue_task(rq, p, ENQUEUE_RESTORE);
1192} 1194}
1193 1195
1194/* 1196/*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1292 1294
1293 if (task_cpu(p) != new_cpu) { 1295 if (task_cpu(p) != new_cpu) {
1294 if (p->sched_class->migrate_task_rq) 1296 if (p->sched_class->migrate_task_rq)
1295 p->sched_class->migrate_task_rq(p, new_cpu); 1297 p->sched_class->migrate_task_rq(p);
1296 p->se.nr_migrations++; 1298 p->se.nr_migrations++;
1297 perf_event_task_migrate(p); 1299 perf_event_task_migrate(p);
1298 } 1300 }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
1333 struct rq *src_rq, *dst_rq; 1335 struct rq *src_rq, *dst_rq;
1334 int ret = -EAGAIN; 1336 int ret = -EAGAIN;
1335 1337
1338 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1339 return -EAGAIN;
1340
1336 src_rq = cpu_rq(arg->src_cpu); 1341 src_rq = cpu_rq(arg->src_cpu);
1337 dst_rq = cpu_rq(arg->dst_cpu); 1342 dst_rq = cpu_rq(arg->dst_cpu);
1338 1343
1339 double_raw_lock(&arg->src_task->pi_lock, 1344 double_raw_lock(&arg->src_task->pi_lock,
1340 &arg->dst_task->pi_lock); 1345 &arg->dst_task->pi_lock);
1341 double_rq_lock(src_rq, dst_rq); 1346 double_rq_lock(src_rq, dst_rq);
1347
1342 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1348 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1343 goto unlock; 1349 goto unlock;
1344 1350
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1574 goto out; 1580 goto out;
1575 } 1581 }
1576 1582
1583 /* No more Mr. Nice Guy. */
1577 switch (state) { 1584 switch (state) {
1578 case cpuset: 1585 case cpuset:
1579 /* No more Mr. Nice Guy. */ 1586 if (IS_ENABLED(CONFIG_CPUSETS)) {
1580 cpuset_cpus_allowed_fallback(p); 1587 cpuset_cpus_allowed_fallback(p);
1581 state = possible; 1588 state = possible;
1582 break; 1589 break;
1583 1590 }
1591 /* fall-through */
1584 case possible: 1592 case possible:
1585 do_set_cpus_allowed(p, cpu_possible_mask); 1593 do_set_cpus_allowed(p, cpu_possible_mask);
1586 state = fail; 1594 state = fail;
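Using if (IS_ENABLED(CONFIG_CPUSETS)) instead of an #ifdef keeps both branches visible to the compiler while letting the cpuset-less build fall through to the 'possible' state. A simplified stand-in for the idiom (the real macro in <linux/kconfig.h> is more elaborate and also understands =m options):

#include <stdio.h>

#define CONFIG_CPUSETS_DEMO     1       /* pretend the option is =y */
#define IS_ENABLED_DEMO(opt)    (opt)   /* simplified stand-in */

enum fallback_state { state_cpuset, state_possible, state_fail };

int main(void)
{
        enum fallback_state state = state_cpuset;

        switch (state) {
        case state_cpuset:
                if (IS_ENABLED_DEMO(CONFIG_CPUSETS_DEMO)) {
                        printf("fall back to the cpuset-allowed mask\n");
                        state = state_possible;
                        break;
                }
                /* fall through when cpusets are compiled out */
        case state_possible:
                printf("fall back to cpu_possible_mask\n");
                state = state_fail;
                break;
        default:
                break;
        }
        return 0;
}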
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1692#endif /* CONFIG_SCHEDSTATS */ 1700#endif /* CONFIG_SCHEDSTATS */
1693} 1701}
1694 1702
1695static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1703static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1696{ 1704{
1697 activate_task(rq, p, en_flags); 1705 activate_task(rq, p, en_flags);
1698 p->on_rq = TASK_ON_RQ_QUEUED; 1706 p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2114#endif /* CONFIG_NUMA_BALANCING */ 2122#endif /* CONFIG_NUMA_BALANCING */
2115} 2123}
2116 2124
2125DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2126
2117#ifdef CONFIG_NUMA_BALANCING 2127#ifdef CONFIG_NUMA_BALANCING
2118#ifdef CONFIG_SCHED_DEBUG 2128
2119void set_numabalancing_state(bool enabled) 2129void set_numabalancing_state(bool enabled)
2120{ 2130{
2121 if (enabled) 2131 if (enabled)
2122 sched_feat_set("NUMA"); 2132 static_branch_enable(&sched_numa_balancing);
2123 else 2133 else
2124 sched_feat_set("NO_NUMA"); 2134 static_branch_disable(&sched_numa_balancing);
2125} 2135}
2126#else
2127__read_mostly bool numabalancing_enabled;
2128
2129void set_numabalancing_state(bool enabled)
2130{
2131 numabalancing_enabled = enabled;
2132}
2133#endif /* CONFIG_SCHED_DEBUG */
2134 2136
2135#ifdef CONFIG_PROC_SYSCTL 2137#ifdef CONFIG_PROC_SYSCTL
2136int sysctl_numa_balancing(struct ctl_table *table, int write, 2138int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
2138{ 2140{
2139 struct ctl_table t; 2141 struct ctl_table t;
2140 int err; 2142 int err;
2141 int state = numabalancing_enabled; 2143 int state = static_branch_likely(&sched_numa_balancing);
2142 2144
2143 if (write && !capable(CAP_SYS_ADMIN)) 2145 if (write && !capable(CAP_SYS_ADMIN))
2144 return -EPERM; 2146 return -EPERM;
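The hunk above replaces the numabalancing_enabled bool (and the sched_feat()-based toggle) with a static key that is cheap to test on the fault path. The hypothetical userspace analog below only mimics the shape of the API; the real DEFINE_STATIC_KEY_FALSE()/static_branch_likely() machinery patches the branch in the instruction stream instead of loading a variable:

#include <stdbool.h>
#include <stdio.h>

static bool sched_numa_balancing_demo;          /* defaults to false */

static void set_numabalancing_state_demo(bool enabled)
{
        sched_numa_balancing_demo = enabled;    /* static_branch_{en,dis}able() */
}

static void task_numa_fault_demo(void)
{
        if (!sched_numa_balancing_demo)         /* static_branch_likely() test */
                return;                         /* feature off: bail out early */
        printf("NUMA balancing work would run here\n");
}

int main(void)
{
        task_numa_fault_demo();                 /* off by default: no output */
        set_numabalancing_state_demo(true);
        task_numa_fault_demo();                 /* now the slow path runs */
        return 0;
}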
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
2349 struct rq *rq; 2351 struct rq *rq;
2350 2352
2351 raw_spin_lock_irqsave(&p->pi_lock, flags); 2353 raw_spin_lock_irqsave(&p->pi_lock, flags);
2354 /* Initialize new task's runnable average */
2355 init_entity_runnable_average(&p->se);
2352#ifdef CONFIG_SMP 2356#ifdef CONFIG_SMP
2353 /* 2357 /*
2354 * Fork balancing, do it here and not earlier because: 2358 * Fork balancing, do it here and not earlier because:
@@ -2358,16 +2362,21 @@ void wake_up_new_task(struct task_struct *p)
2358 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2362 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2359#endif 2363#endif
2360 2364
2361 /* Initialize new task's runnable average */
2362 init_entity_runnable_average(&p->se);
2363 rq = __task_rq_lock(p); 2365 rq = __task_rq_lock(p);
2364 activate_task(rq, p, 0); 2366 activate_task(rq, p, 0);
2365 p->on_rq = TASK_ON_RQ_QUEUED; 2367 p->on_rq = TASK_ON_RQ_QUEUED;
2366 trace_sched_wakeup_new(p); 2368 trace_sched_wakeup_new(p);
2367 check_preempt_curr(rq, p, WF_FORK); 2369 check_preempt_curr(rq, p, WF_FORK);
2368#ifdef CONFIG_SMP 2370#ifdef CONFIG_SMP
2369 if (p->sched_class->task_woken) 2371 if (p->sched_class->task_woken) {
2372 /*
 2373 * Nothing relies on rq->lock after this, so it's fine to
2374 * drop it.
2375 */
2376 lockdep_unpin_lock(&rq->lock);
2370 p->sched_class->task_woken(rq, p); 2377 p->sched_class->task_woken(rq, p);
2378 lockdep_pin_lock(&rq->lock);
2379 }
2371#endif 2380#endif
2372 task_rq_unlock(rq, p, &flags); 2381 task_rq_unlock(rq, p, &flags);
2373} 2382}
@@ -2476,7 +2485,6 @@ static inline void
2476prepare_task_switch(struct rq *rq, struct task_struct *prev, 2485prepare_task_switch(struct rq *rq, struct task_struct *prev,
2477 struct task_struct *next) 2486 struct task_struct *next)
2478{ 2487{
2479 trace_sched_switch(prev, next);
2480 sched_info_switch(rq, prev, next); 2488 sched_info_switch(rq, prev, next);
2481 perf_event_task_sched_out(prev, next); 2489 perf_event_task_sched_out(prev, next);
2482 fire_sched_out_preempt_notifiers(prev, next); 2490 fire_sched_out_preempt_notifiers(prev, next);
@@ -2510,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2510 struct mm_struct *mm = rq->prev_mm; 2518 struct mm_struct *mm = rq->prev_mm;
2511 long prev_state; 2519 long prev_state;
2512 2520
2521 /*
2522 * The previous task will have left us with a preempt_count of 2
2523 * because it left us after:
2524 *
2525 * schedule()
2526 * preempt_disable(); // 1
2527 * __schedule()
2528 * raw_spin_lock_irq(&rq->lock) // 2
2529 *
2530 * Also, see FORK_PREEMPT_COUNT.
2531 */
2532 if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2533 "corrupted preempt_count: %s/%d/0x%x\n",
2534 current->comm, current->pid, preempt_count()))
2535 preempt_count_set(FORK_PREEMPT_COUNT);
2536
2513 rq->prev_mm = NULL; 2537 rq->prev_mm = NULL;
2514 2538
2515 /* 2539 /*
@@ -2517,11 +2541,11 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2517 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2541 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2518 * schedule one last time. The schedule call will never return, and 2542 * schedule one last time. The schedule call will never return, and
2519 * the scheduled task must drop that reference. 2543 * the scheduled task must drop that reference.
2520 * The test for TASK_DEAD must occur while the runqueue locks are 2544 *
2521 * still held, otherwise prev could be scheduled on another cpu, die 2545 * We must observe prev->state before clearing prev->on_cpu (in
 2522 * there before we look at prev->state, and then the reference would 2546 * finish_lock_switch), otherwise a concurrent wakeup can get prev
 2523 * be dropped twice. 2547 * running on another CPU and we could race with its RUNNING -> DEAD
2524 * Manfred Spraul <manfred@colorfullife.com> 2548 * transition, resulting in a double drop.
2525 */ 2549 */
2526 prev_state = prev->state; 2550 prev_state = prev->state;
2527 vtime_task_switch(prev); 2551 vtime_task_switch(prev);
@@ -2594,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2594{ 2618{
2595 struct rq *rq; 2619 struct rq *rq;
2596 2620
 2597 /* finish_task_switch() drops rq->lock and enables preemption */ 2621 /*
2598 preempt_disable(); 2622 * New tasks start with FORK_PREEMPT_COUNT, see there and
2623 * finish_task_switch() for details.
2624 *
2625 * finish_task_switch() will drop rq->lock() and lower preempt_count
2626 * and the preempt_enable() will end up enabling preemption (on
2627 * PREEMPT_COUNT kernels).
2628 */
2629
2599 rq = finish_task_switch(prev); 2630 rq = finish_task_switch(prev);
2600 balance_callback(rq); 2631 balance_callback(rq);
2601 preempt_enable(); 2632 preempt_enable();
@@ -2953,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
2953static inline void schedule_debug(struct task_struct *prev) 2984static inline void schedule_debug(struct task_struct *prev)
2954{ 2985{
2955#ifdef CONFIG_SCHED_STACK_END_CHECK 2986#ifdef CONFIG_SCHED_STACK_END_CHECK
2956 BUG_ON(unlikely(task_stack_end_corrupted(prev))); 2987 BUG_ON(task_stack_end_corrupted(prev));
2957#endif 2988#endif
2958 /* 2989
2959 * Test if we are atomic. Since do_exit() needs to call into 2990 if (unlikely(in_atomic_preempt_off())) {
2960 * schedule() atomically, we ignore that path. Otherwise whine
2961 * if we are scheduling when we should not.
2962 */
2963 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2964 __schedule_bug(prev); 2991 __schedule_bug(prev);
2992 preempt_count_set(PREEMPT_DISABLED);
2993 }
2965 rcu_sleep_check(); 2994 rcu_sleep_check();
2966 2995
2967 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2996 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3047,7 +3076,7 @@ again:
3047 * 3076 *
3048 * WARNING: must be called with preemption disabled! 3077 * WARNING: must be called with preemption disabled!
3049 */ 3078 */
3050static void __sched __schedule(void) 3079static void __sched notrace __schedule(bool preempt)
3051{ 3080{
3052 struct task_struct *prev, *next; 3081 struct task_struct *prev, *next;
3053 unsigned long *switch_count; 3082 unsigned long *switch_count;
@@ -3059,6 +3088,17 @@ static void __sched __schedule(void)
3059 rcu_note_context_switch(); 3088 rcu_note_context_switch();
3060 prev = rq->curr; 3089 prev = rq->curr;
3061 3090
3091 /*
3092 * do_exit() calls schedule() with preemption disabled as an exception;
3093 * however we must fix that up, otherwise the next task will see an
3094 * inconsistent (higher) preempt count.
3095 *
3096 * It also avoids the below schedule_debug() test from complaining
3097 * about this.
3098 */
3099 if (unlikely(prev->state == TASK_DEAD))
3100 preempt_enable_no_resched_notrace();
3101
3062 schedule_debug(prev); 3102 schedule_debug(prev);
3063 3103
3064 if (sched_feat(HRTICK)) 3104 if (sched_feat(HRTICK))
@@ -3076,7 +3116,7 @@ static void __sched __schedule(void)
3076 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 3116 rq->clock_skip_update <<= 1; /* promote REQ to ACT */
3077 3117
3078 switch_count = &prev->nivcsw; 3118 switch_count = &prev->nivcsw;
3079 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3119 if (!preempt && prev->state) {
3080 if (unlikely(signal_pending_state(prev->state, prev))) { 3120 if (unlikely(signal_pending_state(prev->state, prev))) {
3081 prev->state = TASK_RUNNING; 3121 prev->state = TASK_RUNNING;
3082 } else { 3122 } else {
@@ -3112,6 +3152,7 @@ static void __sched __schedule(void)
3112 rq->curr = next; 3152 rq->curr = next;
3113 ++*switch_count; 3153 ++*switch_count;
3114 3154
3155 trace_sched_switch(preempt, prev, next);
3115 rq = context_switch(rq, prev, next); /* unlocks the rq */ 3156 rq = context_switch(rq, prev, next); /* unlocks the rq */
3116 cpu = cpu_of(rq); 3157 cpu = cpu_of(rq);
3117 } else { 3158 } else {
@@ -3141,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
3141 sched_submit_work(tsk); 3182 sched_submit_work(tsk);
3142 do { 3183 do {
3143 preempt_disable(); 3184 preempt_disable();
3144 __schedule(); 3185 __schedule(false);
3145 sched_preempt_enable_no_resched(); 3186 sched_preempt_enable_no_resched();
3146 } while (need_resched()); 3187 } while (need_resched());
3147} 3188}
@@ -3181,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
3181static void __sched notrace preempt_schedule_common(void) 3222static void __sched notrace preempt_schedule_common(void)
3182{ 3223{
3183 do { 3224 do {
3184 preempt_active_enter(); 3225 preempt_disable_notrace();
3185 __schedule(); 3226 __schedule(true);
3186 preempt_active_exit(); 3227 preempt_enable_no_resched_notrace();
3187 3228
3188 /* 3229 /*
3189 * Check again in case we missed a preemption opportunity 3230 * Check again in case we missed a preemption opportunity
@@ -3234,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3234 return; 3275 return;
3235 3276
3236 do { 3277 do {
3237 /* 3278 preempt_disable_notrace();
3238 * Use raw __prempt_count() ops that don't call function.
3239 * We can't call functions before disabling preemption which
3240 * disarm preemption tracing recursions.
3241 */
3242 __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
3243 barrier();
3244 /* 3279 /*
3245 * Needs preempt disabled in case user_exit() is traced 3280 * Needs preempt disabled in case user_exit() is traced
3246 * and the tracer calls preempt_enable_notrace() causing 3281 * and the tracer calls preempt_enable_notrace() causing
3247 * an infinite recursion. 3282 * an infinite recursion.
3248 */ 3283 */
3249 prev_ctx = exception_enter(); 3284 prev_ctx = exception_enter();
3250 __schedule(); 3285 __schedule(true);
3251 exception_exit(prev_ctx); 3286 exception_exit(prev_ctx);
3252 3287
3253 barrier(); 3288 preempt_enable_no_resched_notrace();
3254 __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
3255 } while (need_resched()); 3289 } while (need_resched());
3256} 3290}
3257EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 3291EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3274,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
3274 prev_state = exception_enter(); 3308 prev_state = exception_enter();
3275 3309
3276 do { 3310 do {
3277 preempt_active_enter(); 3311 preempt_disable();
3278 local_irq_enable(); 3312 local_irq_enable();
3279 __schedule(); 3313 __schedule(true);
3280 local_irq_disable(); 3314 local_irq_disable();
3281 preempt_active_exit(); 3315 sched_preempt_enable_no_resched();
3282 } while (need_resched()); 3316 } while (need_resched());
3283 3317
3284 exception_exit(prev_state); 3318 exception_exit(prev_state);
@@ -3306,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
3306 */ 3340 */
3307void rt_mutex_setprio(struct task_struct *p, int prio) 3341void rt_mutex_setprio(struct task_struct *p, int prio)
3308{ 3342{
3309 int oldprio, queued, running, enqueue_flag = 0; 3343 int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
3310 struct rq *rq; 3344 struct rq *rq;
3311 const struct sched_class *prev_class; 3345 const struct sched_class *prev_class;
3312 3346
@@ -3338,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3338 queued = task_on_rq_queued(p); 3372 queued = task_on_rq_queued(p);
3339 running = task_current(rq, p); 3373 running = task_current(rq, p);
3340 if (queued) 3374 if (queued)
3341 dequeue_task(rq, p, 0); 3375 dequeue_task(rq, p, DEQUEUE_SAVE);
3342 if (running) 3376 if (running)
3343 put_prev_task(rq, p); 3377 put_prev_task(rq, p);
3344 3378
@@ -3356,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3356 if (!dl_prio(p->normal_prio) || 3390 if (!dl_prio(p->normal_prio) ||
3357 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3391 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3358 p->dl.dl_boosted = 1; 3392 p->dl.dl_boosted = 1;
3359 enqueue_flag = ENQUEUE_REPLENISH; 3393 enqueue_flag |= ENQUEUE_REPLENISH;
3360 } else 3394 } else
3361 p->dl.dl_boosted = 0; 3395 p->dl.dl_boosted = 0;
3362 p->sched_class = &dl_sched_class; 3396 p->sched_class = &dl_sched_class;
@@ -3364,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3364 if (dl_prio(oldprio)) 3398 if (dl_prio(oldprio))
3365 p->dl.dl_boosted = 0; 3399 p->dl.dl_boosted = 0;
3366 if (oldprio < prio) 3400 if (oldprio < prio)
3367 enqueue_flag = ENQUEUE_HEAD; 3401 enqueue_flag |= ENQUEUE_HEAD;
3368 p->sched_class = &rt_sched_class; 3402 p->sched_class = &rt_sched_class;
3369 } else { 3403 } else {
3370 if (dl_prio(oldprio)) 3404 if (dl_prio(oldprio))
@@ -3416,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
3416 } 3450 }
3417 queued = task_on_rq_queued(p); 3451 queued = task_on_rq_queued(p);
3418 if (queued) 3452 if (queued)
3419 dequeue_task(rq, p, 0); 3453 dequeue_task(rq, p, DEQUEUE_SAVE);
3420 3454
3421 p->static_prio = NICE_TO_PRIO(nice); 3455 p->static_prio = NICE_TO_PRIO(nice);
3422 set_load_weight(p); 3456 set_load_weight(p);
@@ -3425,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
3425 delta = p->prio - old_prio; 3459 delta = p->prio - old_prio;
3426 3460
3427 if (queued) { 3461 if (queued) {
3428 enqueue_task(rq, p, 0); 3462 enqueue_task(rq, p, ENQUEUE_RESTORE);
3429 /* 3463 /*
3430 * If the task increased its priority or is running and 3464 * If the task increased its priority or is running and
3431 * lowered its priority, then reschedule its CPU: 3465 * lowered its priority, then reschedule its CPU:
@@ -3746,10 +3780,7 @@ recheck:
3746 } else { 3780 } else {
3747 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3781 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3748 3782
3749 if (policy != SCHED_DEADLINE && 3783 if (!valid_policy(policy))
3750 policy != SCHED_FIFO && policy != SCHED_RR &&
3751 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3752 policy != SCHED_IDLE)
3753 return -EINVAL; 3784 return -EINVAL;
3754 } 3785 }
3755 3786
@@ -3805,7 +3836,7 @@ recheck:
3805 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3836 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3806 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3837 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3807 */ 3838 */
3808 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3839 if (idle_policy(p->policy) && !idle_policy(policy)) {
3809 if (!can_nice(p, task_nice(p))) 3840 if (!can_nice(p, task_nice(p)))
3810 return -EPERM; 3841 return -EPERM;
3811 } 3842 }
@@ -3930,7 +3961,7 @@ change:
3930 queued = task_on_rq_queued(p); 3961 queued = task_on_rq_queued(p);
3931 running = task_current(rq, p); 3962 running = task_current(rq, p);
3932 if (queued) 3963 if (queued)
3933 dequeue_task(rq, p, 0); 3964 dequeue_task(rq, p, DEQUEUE_SAVE);
3934 if (running) 3965 if (running)
3935 put_prev_task(rq, p); 3966 put_prev_task(rq, p);
3936 3967
@@ -3940,11 +3971,15 @@ change:
3940 if (running) 3971 if (running)
3941 p->sched_class->set_curr_task(rq); 3972 p->sched_class->set_curr_task(rq);
3942 if (queued) { 3973 if (queued) {
3974 int enqueue_flags = ENQUEUE_RESTORE;
3943 /* 3975 /*
3944 * We enqueue to tail when the priority of a task is 3976 * We enqueue to tail when the priority of a task is
3945 * increased (user space view). 3977 * increased (user space view).
3946 */ 3978 */
3947 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3979 if (oldprio <= p->prio)
3980 enqueue_flags |= ENQUEUE_HEAD;
3981
3982 enqueue_task(rq, p, enqueue_flags);
3948 } 3983 }
3949 3984
3950 check_class_changed(rq, p, prev_class, oldprio); 3985 check_class_changed(rq, p, prev_class, oldprio);
@@ -4022,6 +4057,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4022{ 4057{
4023 return _sched_setscheduler(p, policy, param, false); 4058 return _sched_setscheduler(p, policy, param, false);
4024} 4059}
4060EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4025 4061
4026static int 4062static int
4027do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4063do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4934,7 +4970,15 @@ void init_idle(struct task_struct *idle, int cpu)
4934 idle->state = TASK_RUNNING; 4970 idle->state = TASK_RUNNING;
4935 idle->se.exec_start = sched_clock(); 4971 idle->se.exec_start = sched_clock();
4936 4972
4937 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4973#ifdef CONFIG_SMP
4974 /*
 4975 * It's possible that init_idle() gets called multiple times on a task,
4976 * in that case do_set_cpus_allowed() will not do the right thing.
4977 *
4978 * And since this is boot we can forgo the serialization.
4979 */
4980 set_cpus_allowed_common(idle, cpumask_of(cpu));
4981#endif
4938 /* 4982 /*
4939 * We're having a chicken and egg problem, even though we are 4983 * We're having a chicken and egg problem, even though we are
4940 * holding rq->lock, the cpu isn't yet set to this cpu so the 4984 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4951,7 +4995,7 @@ void init_idle(struct task_struct *idle, int cpu)
4951 4995
4952 rq->curr = rq->idle = idle; 4996 rq->curr = rq->idle = idle;
4953 idle->on_rq = TASK_ON_RQ_QUEUED; 4997 idle->on_rq = TASK_ON_RQ_QUEUED;
4954#if defined(CONFIG_SMP) 4998#ifdef CONFIG_SMP
4955 idle->on_cpu = 1; 4999 idle->on_cpu = 1;
4956#endif 5000#endif
4957 raw_spin_unlock(&rq->lock); 5001 raw_spin_unlock(&rq->lock);
@@ -4966,7 +5010,7 @@ void init_idle(struct task_struct *idle, int cpu)
4966 idle->sched_class = &idle_sched_class; 5010 idle->sched_class = &idle_sched_class;
4967 ftrace_graph_init_idle_task(idle, cpu); 5011 ftrace_graph_init_idle_task(idle, cpu);
4968 vtime_init_idle(idle, cpu); 5012 vtime_init_idle(idle, cpu);
4969#if defined(CONFIG_SMP) 5013#ifdef CONFIG_SMP
4970 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 5014 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4971#endif 5015#endif
4972} 5016}
@@ -5085,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5085 running = task_current(rq, p); 5129 running = task_current(rq, p);
5086 5130
5087 if (queued) 5131 if (queued)
5088 dequeue_task(rq, p, 0); 5132 dequeue_task(rq, p, DEQUEUE_SAVE);
5089 if (running) 5133 if (running)
5090 put_prev_task(rq, p); 5134 put_prev_task(rq, p);
5091 5135
@@ -5094,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
5094 if (running) 5138 if (running)
5095 p->sched_class->set_curr_task(rq); 5139 p->sched_class->set_curr_task(rq);
5096 if (queued) 5140 if (queued)
5097 enqueue_task(rq, p, 0); 5141 enqueue_task(rq, p, ENQUEUE_RESTORE);
5098 task_rq_unlock(rq, p, &flags); 5142 task_rq_unlock(rq, p, &flags);
5099} 5143}
5100#endif /* CONFIG_NUMA_BALANCING */ 5144#endif /* CONFIG_NUMA_BALANCING */
@@ -5515,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
5515static int sched_cpu_active(struct notifier_block *nfb, 5559static int sched_cpu_active(struct notifier_block *nfb,
5516 unsigned long action, void *hcpu) 5560 unsigned long action, void *hcpu)
5517{ 5561{
5562 int cpu = (long)hcpu;
5563
5518 switch (action & ~CPU_TASKS_FROZEN) { 5564 switch (action & ~CPU_TASKS_FROZEN) {
5519 case CPU_STARTING: 5565 case CPU_STARTING:
5520 set_cpu_rq_start_time(); 5566 set_cpu_rq_start_time();
5521 return NOTIFY_OK; 5567 return NOTIFY_OK;
5568
5522 case CPU_ONLINE: 5569 case CPU_ONLINE:
5523 /* 5570 /*
5524 * At this point a starting CPU has marked itself as online via 5571 * At this point a starting CPU has marked itself as online via
5525 * set_cpu_online(). But it might not yet have marked itself 5572 * set_cpu_online(). But it might not yet have marked itself
5526 * as active, which is essential from here on. 5573 * as active, which is essential from here on.
5527 *
5528 * Thus, fall-through and help the starting CPU along.
5529 */ 5574 */
5575 set_cpu_active(cpu, true);
5576 stop_machine_unpark(cpu);
5577 return NOTIFY_OK;
5578
5530 case CPU_DOWN_FAILED: 5579 case CPU_DOWN_FAILED:
5531 set_cpu_active((long)hcpu, true); 5580 set_cpu_active(cpu, true);
5532 return NOTIFY_OK; 5581 return NOTIFY_OK;
5582
5533 default: 5583 default:
5534 return NOTIFY_DONE; 5584 return NOTIFY_DONE;
5535 } 5585 }
@@ -6461,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
6461 { NULL, }, 6511 { NULL, },
6462}; 6512};
6463 6513
6464struct sched_domain_topology_level *sched_domain_topology = default_topology; 6514static struct sched_domain_topology_level *sched_domain_topology =
6515 default_topology;
6465 6516
6466#define for_each_sd_topology(tl) \ 6517#define for_each_sd_topology(tl) \
6467 for (tl = sched_domain_topology; tl->mask; tl++) 6518 for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7230,9 +7281,6 @@ void __init sched_init_smp(void)
7230 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7281 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7231 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7282 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7232 7283
7233 /* nohz_full won't take effect without isolating the cpus. */
7234 tick_nohz_full_add_cpus_to(cpu_isolated_map);
7235
7236 sched_init_numa(); 7284 sched_init_numa();
7237 7285
7238 /* 7286 /*
@@ -7465,7 +7513,7 @@ void __init sched_init(void)
7465#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7513#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
7466static inline int preempt_count_equals(int preempt_offset) 7514static inline int preempt_count_equals(int preempt_offset)
7467{ 7515{
7468 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7516 int nested = preempt_count() + rcu_preempt_depth();
7469 7517
7470 return (nested == preempt_offset); 7518 return (nested == preempt_offset);
7471} 7519}
@@ -7712,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
7712 queued = task_on_rq_queued(tsk); 7760 queued = task_on_rq_queued(tsk);
7713 7761
7714 if (queued) 7762 if (queued)
7715 dequeue_task(rq, tsk, 0); 7763 dequeue_task(rq, tsk, DEQUEUE_SAVE);
7716 if (unlikely(running)) 7764 if (unlikely(running))
7717 put_prev_task(rq, tsk); 7765 put_prev_task(rq, tsk);
7718 7766
@@ -7728,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
7728 7776
7729#ifdef CONFIG_FAIR_GROUP_SCHED 7777#ifdef CONFIG_FAIR_GROUP_SCHED
7730 if (tsk->sched_class->task_move_group) 7778 if (tsk->sched_class->task_move_group)
7731 tsk->sched_class->task_move_group(tsk, queued); 7779 tsk->sched_class->task_move_group(tsk);
7732 else 7780 else
7733#endif 7781#endif
7734 set_task_rq(tsk, task_cpu(tsk)); 7782 set_task_rq(tsk, task_cpu(tsk));
@@ -7736,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
7736 if (unlikely(running)) 7784 if (unlikely(running))
7737 tsk->sched_class->set_curr_task(rq); 7785 tsk->sched_class->set_curr_task(rq);
7738 if (queued) 7786 if (queued)
7739 enqueue_task(rq, tsk, 0); 7787 enqueue_task(rq, tsk, ENQUEUE_RESTORE);
7740 7788
7741 task_rq_unlock(rq, tsk, &flags); 7789 task_rq_unlock(rq, tsk, &flags);
7742} 7790}
@@ -8196,21 +8244,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
8196 sched_move_task(task); 8244 sched_move_task(task);
8197} 8245}
8198 8246
8199static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
8200 struct cgroup_subsys_state *old_css,
8201 struct task_struct *task)
8202{
8203 /*
8204 * cgroup_exit() is called in the copy_process() failure path.
8205 * Ignore this case since the task hasn't ran yet, this avoids
8206 * trying to poke a half freed task state from generic code.
8207 */
8208 if (!(task->flags & PF_EXITING))
8209 return;
8210
8211 sched_move_task(task);
8212}
8213
8214#ifdef CONFIG_FAIR_GROUP_SCHED 8247#ifdef CONFIG_FAIR_GROUP_SCHED
8215static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 8248static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8216 struct cftype *cftype, u64 shareval) 8249 struct cftype *cftype, u64 shareval)
@@ -8542,7 +8575,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8542 .fork = cpu_cgroup_fork, 8575 .fork = cpu_cgroup_fork,
8543 .can_attach = cpu_cgroup_can_attach, 8576 .can_attach = cpu_cgroup_can_attach,
8544 .attach = cpu_cgroup_attach, 8577 .attach = cpu_cgroup_attach,
8545 .exit = cpu_cgroup_exit,
8546 .legacy_cftypes = cpu_files, 8578 .legacy_cftypes = cpu_files,
8547 .early_init = 1, 8579 .early_init = 1,
8548}; 8580};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index c6acb07466bb..5a75b08cfd85 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,11 +31,6 @@ static inline int right_child(int i)
31 return (i << 1) + 2; 31 return (i << 1) + 2;
32} 32}
33 33
34static inline int dl_time_before(u64 a, u64 b)
35{
36 return (s64)(a - b) < 0;
37}
38
39static void cpudl_exchange(struct cpudl *cp, int a, int b) 34static void cpudl_exchange(struct cpudl *cp, int a, int b)
40{ 35{
41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 36 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
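dl_time_before() moves into <linux/sched/deadline.h> so all deadline comparisons share one helper; its (s64)(a - b) < 0 form is the usual wrap-safe ordering test for free-running 64-bit clocks. A hypothetical standalone check of the wrap behaviour:

#include <stdio.h>
#include <stdint.h>

/* wrap-safe "a is earlier than b", as in dl_time_before() */
static int time_before64(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;    /* clock about to wrap */
        uint64_t wrapped   = 10;                /* clock just after wrapping */

        printf("%d\n", time_before64(near_wrap, wrapped));      /* 1: earlier */
        printf("%d\n", time_before64(wrapped, near_wrap));      /* 0: later   */
        return 0;
}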
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 1a0a6ef2fbe1..fcbdf83fed7e 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -2,6 +2,7 @@
2#define _LINUX_CPUDL_H 2#define _LINUX_CPUDL_H
3 3
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/sched/deadline.h>
5 6
6#define IDX_INVALID -1 7#define IDX_INVALID -1
7 8
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8cbc3db671df..26a54461bf59 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444 *ut = p->utime; 444 *ut = p->utime;
445 *st = p->stime; 445 *st = p->stime;
446} 446}
447EXPORT_SYMBOL_GPL(task_cputime_adjusted);
447 448
448void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 449void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
449{ 450{
@@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
652 task_cputime(p, &cputime.utime, &cputime.stime); 653 task_cputime(p, &cputime.utime, &cputime.stime);
653 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 654 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
654} 655}
656EXPORT_SYMBOL_GPL(task_cputime_adjusted);
655 657
656void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 658void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
657{ 659{
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc8f01083527..8b0a15e285f9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
668 * Queueing this task back might have overloaded rq, check if we need 668 * Queueing this task back might have overloaded rq, check if we need
669 * to kick someone away. 669 * to kick someone away.
670 */ 670 */
671 if (has_pushable_dl_tasks(rq)) 671 if (has_pushable_dl_tasks(rq)) {
672 /*
 673 * Nothing relies on rq->lock after this, so it's safe to drop
674 * rq->lock.
675 */
676 lockdep_unpin_lock(&rq->lock);
672 push_dl_task(rq); 677 push_dl_task(rq);
678 lockdep_pin_lock(&rq->lock);
679 }
673#endif 680#endif
674 681
675unlock: 682unlock:
@@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
1066 int target = find_later_rq(p); 1073 int target = find_later_rq(p);
1067 1074
1068 if (target != -1 && 1075 if (target != -1 &&
1069 dl_time_before(p->dl.deadline, 1076 (dl_time_before(p->dl.deadline,
1070 cpu_rq(target)->dl.earliest_dl.curr)) 1077 cpu_rq(target)->dl.earliest_dl.curr) ||
1078 (cpu_rq(target)->dl.dl_nr_running == 0)))
1071 cpu = target; 1079 cpu = target;
1072 } 1080 }
1073 rcu_read_unlock(); 1081 rcu_read_unlock();
@@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1417 1425
1418 later_rq = cpu_rq(cpu); 1426 later_rq = cpu_rq(cpu);
1419 1427
1420 if (!dl_time_before(task->dl.deadline, 1428 if (later_rq->dl.dl_nr_running &&
1429 !dl_time_before(task->dl.deadline,
1421 later_rq->dl.earliest_dl.curr)) { 1430 later_rq->dl.earliest_dl.curr)) {
1422 /* 1431 /*
1423 * Target rq has tasks of equal or earlier deadline, 1432 * Target rq has tasks of equal or earlier deadline,
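The deadline changes above also treat a later_rq with dl.dl_nr_running == 0 as a valid target, because its ->earliest_dl.curr is stale when no deadline task is queued. A hypothetical sketch of that selection test:

#include <stdio.h>
#include <stdint.h>

struct dl_rq_demo {
        unsigned int dl_nr_running;
        uint64_t earliest_dl_curr;      /* only meaningful when dl_nr_running > 0 */
};

static int dl_time_before_demo(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

/* accept the candidate if it has no deadline work at all, or if our
 * deadline beats the earliest one it is currently serving */
static int later_rq_suitable(const struct dl_rq_demo *later, uint64_t deadline)
{
        return later->dl_nr_running == 0 ||
               dl_time_before_demo(deadline, later->earliest_dl_curr);
}

int main(void)
{
        struct dl_rq_demo idle_rq = { 0, 0 };           /* stale earliest value */
        struct dl_rq_demo busy_rq = { 2, 1000 };

        printf("%d %d\n",
               later_rq_suitable(&idle_rq, 5000),       /* 1: no DL work there  */
               later_rq_suitable(&busy_rq, 5000));      /* 0: its deadline wins */
        return 0;
}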
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e2e3483b1ec..824aa9f501a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p);
661 661
662/* 662/*
663 * We choose a half-life close to 1 scheduling period. 663 * We choose a half-life close to 1 scheduling period.
664 * Note: The tables below are dependent on this value. 664 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
665 * dependent on this value.
665 */ 666 */
666#define LOAD_AVG_PERIOD 32 667#define LOAD_AVG_PERIOD 32
667#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ 668#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
668#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */ 669#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
669 670
670/* Give new sched_entity start runnable values to heavy its load in infant time */ 671/* Give new sched_entity start runnable values to heavy its load in infant time */
671void init_entity_runnable_average(struct sched_entity *se) 672void init_entity_runnable_average(struct sched_entity *se)
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
682 sa->load_avg = scale_load_down(se->load.weight); 683 sa->load_avg = scale_load_down(se->load.weight);
683 sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 684 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
684 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); 685 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
685 sa->util_sum = LOAD_AVG_MAX; 686 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
686 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 687 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
687} 688}
688 689
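As a worked check on the constants referenced above (standalone arithmetic, not kernel code): with a half-life of LOAD_AVG_PERIOD = 32 periods the per-period decay is y = 0.5^(1/32), and the geometric series 1024 * (1 + y + y^2 + ...) converges near the LOAD_AVG_MAX value of 47742; the kernel constant is slightly lower because it is derived from the truncated integer tables. Link with -lm:

#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* y^32 == 0.5: the half-life */
        double limit = 1024.0 / (1.0 - y);      /* sum of 1024 * y^n, n >= 0  */

        printf("y           = %.6f\n", y);      /* ~0.978572 */
        printf("limit       = %.1f\n", limit);  /* ~47788, cf. LOAD_AVG_MAX   */
        printf("47742/limit = %.3f\n", 47742.0 / limit);
        return 0;
}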
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2069 int local = !!(flags & TNF_FAULT_LOCAL); 2070 int local = !!(flags & TNF_FAULT_LOCAL);
2070 int priv; 2071 int priv;
2071 2072
2072 if (!numabalancing_enabled) 2073 if (!static_branch_likely(&sched_numa_balancing))
2073 return; 2074 return;
2074 2075
2075 /* for example, ksmd faulting in a user's mm */ 2076 /* for example, ksmd faulting in a user's mm */
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work)
2157 struct vm_area_struct *vma; 2158 struct vm_area_struct *vma;
2158 unsigned long start, end; 2159 unsigned long start, end;
2159 unsigned long nr_pte_updates = 0; 2160 unsigned long nr_pte_updates = 0;
2160 long pages; 2161 long pages, virtpages;
2161 2162
2162 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 2163 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2163 2164
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work)
2203 start = mm->numa_scan_offset; 2204 start = mm->numa_scan_offset;
2204 pages = sysctl_numa_balancing_scan_size; 2205 pages = sysctl_numa_balancing_scan_size;
2205 pages <<= 20 - PAGE_SHIFT; /* MB in pages */ 2206 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2207 virtpages = pages * 8; /* Scan up to this much virtual space */
2206 if (!pages) 2208 if (!pages)
2207 return; 2209 return;
2208 2210
2211
2209 down_read(&mm->mmap_sem); 2212 down_read(&mm->mmap_sem);
2210 vma = find_vma(mm, start); 2213 vma = find_vma(mm, start);
2211 if (!vma) { 2214 if (!vma) {
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work)
2240 start = max(start, vma->vm_start); 2243 start = max(start, vma->vm_start);
2241 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 2244 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2242 end = min(end, vma->vm_end); 2245 end = min(end, vma->vm_end);
2243 nr_pte_updates += change_prot_numa(vma, start, end); 2246 nr_pte_updates = change_prot_numa(vma, start, end);
2244 2247
2245 /* 2248 /*
 2246 * Scan sysctl_numa_balancing_scan_size but ensure that 2249 * Try to scan sysctl_numa_balancing_scan_size worth of
2247 * at least one PTE is updated so that unused virtual 2250 * hpages that have at least one present PTE that
2248 * address space is quickly skipped. 2251 * is not already pte-numa. If the VMA contains
2252 * areas that are unused or already full of prot_numa
2253 * PTEs, scan up to virtpages, to skip through those
2254 * areas faster.
2249 */ 2255 */
2250 if (nr_pte_updates) 2256 if (nr_pte_updates)
2251 pages -= (end - start) >> PAGE_SHIFT; 2257 pages -= (end - start) >> PAGE_SHIFT;
2258 virtpages -= (end - start) >> PAGE_SHIFT;
2252 2259
2253 start = end; 2260 start = end;
2254 if (pages <= 0) 2261 if (pages <= 0 || virtpages <= 0)
2255 goto out; 2262 goto out;
2256 2263
2257 cond_resched(); 2264 cond_resched();
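task_numa_work() now runs with two budgets: pages, the amount it actually wants to mark for NUMA hinting faults, and virtpages (8x larger), a cap on how much virtual address space it will walk when the PTE updates are not landing. A hypothetical sketch of the dual-budget loop:

#include <stdio.h>

int main(void)
{
        long pages = 64;                /* useful-work budget (pages marked) */
        long virtpages = pages * 8;     /* virtual-address-space budget      */
        long chunk = 16;                /* pages covered per iteration       */

        for (int i = 0; pages > 0 && virtpages > 0; i++) {
                int updated = (i % 2 == 0);     /* pretend every other range
                                                 * actually had PTEs to mark */
                if (updated)
                        pages -= chunk;         /* charge only useful work   */
                virtpages -= chunk;             /* always charge the walk    */
        }
        printf("remaining: pages=%ld virtpages=%ld\n", pages, virtpages);
        return 0;
}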
@@ -2363,7 +2370,7 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2363 */ 2370 */
2364 tg_weight = atomic_long_read(&tg->load_avg); 2371 tg_weight = atomic_long_read(&tg->load_avg);
2365 tg_weight -= cfs_rq->tg_load_avg_contrib; 2372 tg_weight -= cfs_rq->tg_load_avg_contrib;
2366 tg_weight += cfs_rq_load_avg(cfs_rq); 2373 tg_weight += cfs_rq->load.weight;
2367 2374
2368 return tg_weight; 2375 return tg_weight;
2369} 2376}
@@ -2373,7 +2380,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2373 long tg_weight, load, shares; 2380 long tg_weight, load, shares;
2374 2381
2375 tg_weight = calc_tg_weight(tg, cfs_rq); 2382 tg_weight = calc_tg_weight(tg, cfs_rq);
2376 load = cfs_rq_load_avg(cfs_rq); 2383 load = cfs_rq->load.weight;
2377 2384
2378 shares = (tg->shares * load); 2385 shares = (tg->shares * load);
2379 if (tg_weight) 2386 if (tg_weight)
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n)
2515 return contrib + runnable_avg_yN_sum[n]; 2522 return contrib + runnable_avg_yN_sum[n];
2516} 2523}
2517 2524
2525#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2526#error "load tracking assumes 2^10 as unit"
2527#endif
2528
2529#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2530
2518/* 2531/*
2519 * We can represent the historical contribution to runnable average as the 2532 * We can represent the historical contribution to runnable average as the
2520 * coefficients of a geometric series. To do this we sub-divide our runnable 2533 * coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2547,10 +2560,10 @@ static __always_inline int
2547__update_load_avg(u64 now, int cpu, struct sched_avg *sa, 2560__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2548 unsigned long weight, int running, struct cfs_rq *cfs_rq) 2561 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2549{ 2562{
2550 u64 delta, periods; 2563 u64 delta, scaled_delta, periods;
2551 u32 contrib; 2564 u32 contrib;
2552 int delta_w, decayed = 0; 2565 unsigned int delta_w, scaled_delta_w, decayed = 0;
2553 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); 2566 unsigned long scale_freq, scale_cpu;
2554 2567
2555 delta = now - sa->last_update_time; 2568 delta = now - sa->last_update_time;
2556 /* 2569 /*
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2571 return 0; 2584 return 0;
2572 sa->last_update_time = now; 2585 sa->last_update_time = now;
2573 2586
2587 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2588 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2589
2574 /* delta_w is the amount already accumulated against our next period */ 2590 /* delta_w is the amount already accumulated against our next period */
2575 delta_w = sa->period_contrib; 2591 delta_w = sa->period_contrib;
2576 if (delta + delta_w >= 1024) { 2592 if (delta + delta_w >= 1024) {
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2585 * period and accrue it. 2601 * period and accrue it.
2586 */ 2602 */
2587 delta_w = 1024 - delta_w; 2603 delta_w = 1024 - delta_w;
2604 scaled_delta_w = cap_scale(delta_w, scale_freq);
2588 if (weight) { 2605 if (weight) {
2589 sa->load_sum += weight * delta_w; 2606 sa->load_sum += weight * scaled_delta_w;
2590 if (cfs_rq) 2607 if (cfs_rq) {
2591 cfs_rq->runnable_load_sum += weight * delta_w; 2608 cfs_rq->runnable_load_sum +=
2609 weight * scaled_delta_w;
2610 }
2592 } 2611 }
2593 if (running) 2612 if (running)
2594 sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; 2613 sa->util_sum += scaled_delta_w * scale_cpu;
2595 2614
2596 delta -= delta_w; 2615 delta -= delta_w;
2597 2616
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2608 2627
2609 /* Efficiently calculate \sum (1..n_period) 1024*y^i */ 2628 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2610 contrib = __compute_runnable_contrib(periods); 2629 contrib = __compute_runnable_contrib(periods);
2630 contrib = cap_scale(contrib, scale_freq);
2611 if (weight) { 2631 if (weight) {
2612 sa->load_sum += weight * contrib; 2632 sa->load_sum += weight * contrib;
2613 if (cfs_rq) 2633 if (cfs_rq)
2614 cfs_rq->runnable_load_sum += weight * contrib; 2634 cfs_rq->runnable_load_sum += weight * contrib;
2615 } 2635 }
2616 if (running) 2636 if (running)
2617 sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; 2637 sa->util_sum += contrib * scale_cpu;
2618 } 2638 }
2619 2639
2620 /* Remainder of delta accrued against u_0` */ 2640 /* Remainder of delta accrued against u_0` */
2641 scaled_delta = cap_scale(delta, scale_freq);
2621 if (weight) { 2642 if (weight) {
2622 sa->load_sum += weight * delta; 2643 sa->load_sum += weight * scaled_delta;
2623 if (cfs_rq) 2644 if (cfs_rq)
2624 cfs_rq->runnable_load_sum += weight * delta; 2645 cfs_rq->runnable_load_sum += weight * scaled_delta;
2625 } 2646 }
2626 if (running) 2647 if (running)
2627 sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; 2648 sa->util_sum += scaled_delta * scale_cpu;
2628 2649
2629 sa->period_contrib += delta; 2650 sa->period_contrib += delta;
2630 2651
@@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2634 cfs_rq->runnable_load_avg = 2655 cfs_rq->runnable_load_avg =
2635 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); 2656 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2636 } 2657 }
2637 sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX; 2658 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2638 } 2659 }
2639 2660
2640 return decayed; 2661 return decayed;
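The rewritten __update_load_avg() above scales every accrued delta by the CPU's current frequency capacity, and the utilization sum additionally by its microarchitectural capacity, via cap_scale(), i.e. (v * s) >> SCHED_CAPACITY_SHIFT with 1024 meaning full capacity. A small standalone illustration with made-up capacity values:

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT    10              /* 1024 == full capacity */
#define cap_scale(v, s)         ((v) * (s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
        uint64_t delta = 1000;                  /* time accrued this step    */
        unsigned long scale_freq = 512;         /* CPU running at half clock */
        unsigned long scale_cpu  = 768;         /* little core, 75% capacity */

        printf("freq-scaled delta: %llu\n",
               (unsigned long long)cap_scale(delta, scale_freq));       /* 500 */
        printf("cpu-scaled delta : %llu\n",
               (unsigned long long)cap_scale(delta, scale_cpu));        /* 750 */
        return 0;
}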
@@ -2664,20 +2685,20 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2664/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ 2685/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2665static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) 2686static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2666{ 2687{
2667 int decayed;
2668 struct sched_avg *sa = &cfs_rq->avg; 2688 struct sched_avg *sa = &cfs_rq->avg;
2689 int decayed, removed = 0;
2669 2690
2670 if (atomic_long_read(&cfs_rq->removed_load_avg)) { 2691 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2671 long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); 2692 long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2672 sa->load_avg = max_t(long, sa->load_avg - r, 0); 2693 sa->load_avg = max_t(long, sa->load_avg - r, 0);
2673 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); 2694 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2695 removed = 1;
2674 } 2696 }
2675 2697
2676 if (atomic_long_read(&cfs_rq->removed_util_avg)) { 2698 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2677 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); 2699 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2678 sa->util_avg = max_t(long, sa->util_avg - r, 0); 2700 sa->util_avg = max_t(long, sa->util_avg - r, 0);
2679 sa->util_sum = max_t(s32, sa->util_sum - 2701 sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2680 ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
2681 } 2702 }
2682 2703
2683 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 2704 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2688,40 +2709,77 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2688 cfs_rq->load_last_update_time_copy = sa->last_update_time; 2709 cfs_rq->load_last_update_time_copy = sa->last_update_time;
2689#endif 2710#endif
2690 2711
2691 return decayed; 2712 return decayed || removed;
2692} 2713}
2693 2714
2694/* Update task and its cfs_rq load average */ 2715/* Update task and its cfs_rq load average */
2695static inline void update_load_avg(struct sched_entity *se, int update_tg) 2716static inline void update_load_avg(struct sched_entity *se, int update_tg)
2696{ 2717{
2697 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2718 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2698 int cpu = cpu_of(rq_of(cfs_rq));
2699 u64 now = cfs_rq_clock_task(cfs_rq); 2719 u64 now = cfs_rq_clock_task(cfs_rq);
2720 int cpu = cpu_of(rq_of(cfs_rq));
2700 2721
2701 /* 2722 /*
2702 * Track task load average for carrying it to new CPU after migrated, and 2723 * Track task load average for carrying it to new CPU after migrated, and
2703 * track group sched_entity load average for task_h_load calc in migration 2724 * track group sched_entity load average for task_h_load calc in migration
2704 */ 2725 */
2705 __update_load_avg(now, cpu, &se->avg, 2726 __update_load_avg(now, cpu, &se->avg,
2706 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); 2727 se->on_rq * scale_load_down(se->load.weight),
2728 cfs_rq->curr == se, NULL);
2707 2729
2708 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) 2730 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2709 update_tg_load_avg(cfs_rq, 0); 2731 update_tg_load_avg(cfs_rq, 0);
2710} 2732}
2711 2733
2734static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2735{
2736 if (!sched_feat(ATTACH_AGE_LOAD))
2737 goto skip_aging;
2738
2739 /*
2740 * If we got migrated (either between CPUs or between cgroups) we'll
2741 * have aged the average right before clearing @last_update_time.
2742 */
2743 if (se->avg.last_update_time) {
2744 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2745 &se->avg, 0, 0, NULL);
2746
2747 /*
2748 * XXX: we could have just aged the entire load away if we've been
2749 * absent from the fair class for too long.
2750 */
2751 }
2752
2753skip_aging:
2754 se->avg.last_update_time = cfs_rq->avg.last_update_time;
2755 cfs_rq->avg.load_avg += se->avg.load_avg;
2756 cfs_rq->avg.load_sum += se->avg.load_sum;
2757 cfs_rq->avg.util_avg += se->avg.util_avg;
2758 cfs_rq->avg.util_sum += se->avg.util_sum;
2759}
2760
2761static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2762{
2763 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2764 &se->avg, se->on_rq * scale_load_down(se->load.weight),
2765 cfs_rq->curr == se, NULL);
2766
2767 cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
2768 cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
2769 cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
2770 cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2771}
2772
2712/* Add the load generated by se into cfs_rq's load average */ 2773/* Add the load generated by se into cfs_rq's load average */
2713static inline void 2774static inline void
2714enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2775enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2715{ 2776{
2716 struct sched_avg *sa = &se->avg; 2777 struct sched_avg *sa = &se->avg;
2717 u64 now = cfs_rq_clock_task(cfs_rq); 2778 u64 now = cfs_rq_clock_task(cfs_rq);
2718 int migrated = 0, decayed; 2779 int migrated, decayed;
2719 2780
2720 if (sa->last_update_time == 0) { 2781 migrated = !sa->last_update_time;
2721 sa->last_update_time = now; 2782 if (!migrated) {
2722 migrated = 1;
2723 }
2724 else {
2725 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 2783 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2726 se->on_rq * scale_load_down(se->load.weight), 2784 se->on_rq * scale_load_down(se->load.weight),
2727 cfs_rq->curr == se, NULL); 2785 cfs_rq->curr == se, NULL);
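
attach_entity_load_avg() and detach_entity_load_avg() above add and remove an entity's contribution to the cfs_rq sums, clamping at zero with max_t() because the per-entity and per-runqueue averages decay independently and a plain subtraction could underflow. A minimal userspace sketch of that saturating update pattern; the struct and values are illustrative only.

    #include <stdio.h>

    struct avg { long load_avg; long util_avg; };

    static long sub_clamped(long a, long b)
    {
        /* Same idea as max_t(long, a - b, 0): never go negative even if
         * the two averages have decayed out of sync. */
        return (a - b) > 0 ? (a - b) : 0;
    }

    static void attach(struct avg *rq, const struct avg *se)
    {
        rq->load_avg += se->load_avg;
        rq->util_avg += se->util_avg;
    }

    static void detach(struct avg *rq, const struct avg *se)
    {
        rq->load_avg = sub_clamped(rq->load_avg, se->load_avg);
        rq->util_avg = sub_clamped(rq->util_avg, se->util_avg);
    }

    int main(void)
    {
        struct avg rq = { .load_avg = 300, .util_avg = 150 };
        struct avg se = { .load_avg = 400, .util_avg = 100 };

        /* The entity decayed more slowly than the runqueue, so an unclamped
         * detach would go negative. */
        detach(&rq, &se);
        printf("after detach: load %ld util %ld\n", rq.load_avg, rq.util_avg);

        attach(&rq, &se);
        printf("after attach: load %ld util %ld\n", rq.load_avg, rq.util_avg);
        return 0;
    }
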
@@ -2732,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2732 cfs_rq->runnable_load_avg += sa->load_avg; 2790 cfs_rq->runnable_load_avg += sa->load_avg;
2733 cfs_rq->runnable_load_sum += sa->load_sum; 2791 cfs_rq->runnable_load_sum += sa->load_sum;
2734 2792
2735 if (migrated) { 2793 if (migrated)
2736 cfs_rq->avg.load_avg += sa->load_avg; 2794 attach_entity_load_avg(cfs_rq, se);
2737 cfs_rq->avg.load_sum += sa->load_sum;
2738 cfs_rq->avg.util_avg += sa->util_avg;
2739 cfs_rq->avg.util_sum += sa->util_sum;
2740 }
2741 2795
2742 if (decayed || migrated) 2796 if (decayed || migrated)
2743 update_tg_load_avg(cfs_rq, 0); 2797 update_tg_load_avg(cfs_rq, 0);
@@ -2752,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2752 cfs_rq->runnable_load_avg = 2806 cfs_rq->runnable_load_avg =
2753 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); 2807 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2754 cfs_rq->runnable_load_sum = 2808 cfs_rq->runnable_load_sum =
2755 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); 2809 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2756} 2810}
2757 2811
2758/* 2812/*
@@ -2820,6 +2874,11 @@ static inline void
2820dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} 2874dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2821static inline void remove_entity_load_avg(struct sched_entity *se) {} 2875static inline void remove_entity_load_avg(struct sched_entity *se) {}
2822 2876
2877static inline void
2878attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2879static inline void
2880detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2881
2823static inline int idle_balance(struct rq *rq) 2882static inline int idle_balance(struct rq *rq)
2824{ 2883{
2825 return 0; 2884 return 0;
@@ -4816,32 +4875,39 @@ next:
4816done: 4875done:
4817 return target; 4876 return target;
4818} 4877}
4878
4819/* 4879/*
4820 * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS 4880 * cpu_util returns the amount of capacity of a CPU that is used by CFS
4821 * tasks. The unit of the return value must be the one of capacity so we can 4881 * tasks. The unit of the return value must be the one of capacity so we can
4822 * compare the usage with the capacity of the CPU that is available for CFS 4882 * compare the utilization with the capacity of the CPU that is available for
4823 * task (ie cpu_capacity). 4883 * CFS task (ie cpu_capacity).
4824 * cfs.avg.util_avg is the sum of running time of runnable tasks on a 4884 *
4825 * CPU. It represents the amount of utilization of a CPU in the range 4885 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
4826 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full 4886 * recent utilization of currently non-runnable tasks on a CPU. It represents
4827 * capacity of the CPU because it's about the running time on this CPU. 4887 * the amount of utilization of a CPU in the range [0..capacity_orig] where
4828 * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE 4888 * capacity_orig is the cpu_capacity available at the highest frequency
4829 * because of unfortunate rounding in util_avg or just 4889 * (arch_scale_freq_capacity()).
4830 * after migrating tasks until the average stabilizes with the new running 4890 * The utilization of a CPU converges towards a sum equal to or less than the
4831 * time. So we need to check that the usage stays into the range 4891 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
4832 * [0..cpu_capacity_orig] and cap if necessary. 4892 * the running time on this CPU scaled by capacity_curr.
4833 * Without capping the usage, a group could be seen as overloaded (CPU0 usage 4893 *
4834 * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity 4894 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
4895 * higher than capacity_orig because of unfortunate rounding in
4896 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
4897 * the average stabilizes with the new running time. We need to check that the
4898 * utilization stays within the range of [0..capacity_orig] and cap it if
4899 * necessary. Without utilization capping, a group could be seen as overloaded
4900 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
4901 * available capacity. We allow utilization to overshoot capacity_curr (but not
 4902 * capacity_orig) as it is useful for predicting the capacity required after task
4903 * migrations (scheduler-driven DVFS).
4835 */ 4904 */
4836static int get_cpu_usage(int cpu) 4905static int cpu_util(int cpu)
4837{ 4906{
4838 unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; 4907 unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
4839 unsigned long capacity = capacity_orig_of(cpu); 4908 unsigned long capacity = capacity_orig_of(cpu);
4840 4909
4841 if (usage >= SCHED_LOAD_SCALE) 4910 return (util >= capacity) ? capacity : util;
4842 return capacity;
4843
4844 return (usage * capacity) >> SCHED_LOAD_SHIFT;
4845} 4911}
4846 4912
4847/* 4913/*
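
With utilization and capacity now expressed in the same unit, cpu_util() above reduces the old rescaling to a simple clamp at capacity_orig. A small standalone sketch of the capping and of the 121% + 80% case mentioned in the comment; the capacity of 1024 and the sample utilizations are illustrative.

    #include <stdio.h>

    #define CAPACITY_ORIG 1024UL   /* assumed capacity at the highest frequency */

    static unsigned long cpu_util_sketch(unsigned long util_avg)
    {
        return util_avg >= CAPACITY_ORIG ? CAPACITY_ORIG : util_avg;
    }

    int main(void)
    {
        unsigned long cpu0 = 1239;  /* ~121% of capacity, transient overshoot */
        unsigned long cpu1 = 819;   /* ~80% of capacity */

        /* Without capping, the pair would look overloaded (121% + 80%)
         * even though CPU1 still has ~20% of its capacity free. */
        printf("uncapped group util: %lu\n", cpu0 + cpu1);
        printf("capped group util  : %lu\n",
               cpu_util_sketch(cpu0) + cpu_util_sketch(cpu1));
        return 0;
    }
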
@@ -4944,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4944 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 5010 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
4945 * other assumptions, including the state of rq->lock, should be made. 5011 * other assumptions, including the state of rq->lock, should be made.
4946 */ 5012 */
4947static void migrate_task_rq_fair(struct task_struct *p, int next_cpu) 5013static void migrate_task_rq_fair(struct task_struct *p)
4948{ 5014{
4949 /* 5015 /*
4950 * We are supposed to update the task to "current" time, then its up to date 5016 * We are supposed to update the task to "current" time, then its up to date
@@ -5524,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5524 unsigned long src_faults, dst_faults; 5590 unsigned long src_faults, dst_faults;
5525 int src_nid, dst_nid; 5591 int src_nid, dst_nid;
5526 5592
5527 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5593 if (!static_branch_likely(&sched_numa_balancing))
5528 return -1; 5594 return -1;
5529 5595
5530 if (!sched_feat(NUMA)) 5596 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5531 return -1; 5597 return -1;
5532 5598
5533 src_nid = cpu_to_node(env->src_cpu); 5599 src_nid = cpu_to_node(env->src_cpu);
@@ -5933,7 +5999,7 @@ struct sg_lb_stats {
5933 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 5999 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5934 unsigned long load_per_task; 6000 unsigned long load_per_task;
5935 unsigned long group_capacity; 6001 unsigned long group_capacity;
5936 unsigned long group_usage; /* Total usage of the group */ 6002 unsigned long group_util; /* Total utilization of the group */
5937 unsigned int sum_nr_running; /* Nr tasks running in the group */ 6003 unsigned int sum_nr_running; /* Nr tasks running in the group */
5938 unsigned int idle_cpus; 6004 unsigned int idle_cpus;
5939 unsigned int group_weight; 6005 unsigned int group_weight;
@@ -6009,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
6009 return load_idx; 6075 return load_idx;
6010} 6076}
6011 6077
6012static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6013{
6014 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
6015 return sd->smt_gain / sd->span_weight;
6016
6017 return SCHED_CAPACITY_SCALE;
6018}
6019
6020unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
6021{
6022 return default_scale_cpu_capacity(sd, cpu);
6023}
6024
6025static unsigned long scale_rt_capacity(int cpu) 6078static unsigned long scale_rt_capacity(int cpu)
6026{ 6079{
6027 struct rq *rq = cpu_rq(cpu); 6080 struct rq *rq = cpu_rq(cpu);
@@ -6051,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu)
6051 6104
6052static void update_cpu_capacity(struct sched_domain *sd, int cpu) 6105static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6053{ 6106{
6054 unsigned long capacity = SCHED_CAPACITY_SCALE; 6107 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6055 struct sched_group *sdg = sd->groups; 6108 struct sched_group *sdg = sd->groups;
6056 6109
6057 if (sched_feat(ARCH_CAPACITY))
6058 capacity *= arch_scale_cpu_capacity(sd, cpu);
6059 else
6060 capacity *= default_scale_cpu_capacity(sd, cpu);
6061
6062 capacity >>= SCHED_CAPACITY_SHIFT;
6063
6064 cpu_rq(cpu)->cpu_capacity_orig = capacity; 6110 cpu_rq(cpu)->cpu_capacity_orig = capacity;
6065 6111
6066 capacity *= scale_rt_capacity(cpu); 6112 capacity *= scale_rt_capacity(cpu);
@@ -6186,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group)
6186 * group_has_capacity returns true if the group has spare capacity that could 6232 * group_has_capacity returns true if the group has spare capacity that could
6187 * be used by some tasks. 6233 * be used by some tasks.
6188 * We consider that a group has spare capacity if the * number of task is 6234 * We consider that a group has spare capacity if the * number of task is
6189 * smaller than the number of CPUs or if the usage is lower than the available 6235 * smaller than the number of CPUs or if the utilization is lower than the
6190 * capacity for CFS tasks. 6236 * available capacity for CFS tasks.
6191 * For the latter, we use a threshold to stabilize the state, to take into 6237 * For the latter, we use a threshold to stabilize the state, to take into
6192 * account the variance of the tasks' load and to return true if the available 6238 * account the variance of the tasks' load and to return true if the available
6193 * capacity in meaningful for the load balancer. 6239 * capacity in meaningful for the load balancer.
@@ -6201,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6201 return true; 6247 return true;
6202 6248
6203 if ((sgs->group_capacity * 100) > 6249 if ((sgs->group_capacity * 100) >
6204 (sgs->group_usage * env->sd->imbalance_pct)) 6250 (sgs->group_util * env->sd->imbalance_pct))
6205 return true; 6251 return true;
6206 6252
6207 return false; 6253 return false;
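
group_has_capacity() and group_is_overloaded() above compare group_capacity against group_util through the domain's imbalance_pct, which acts as a stabilizing margin. A worked userspace sketch of that comparison; imbalance_pct of 125 and a capacity of 1024 are illustrative, the kernel picks imbalance_pct per sched_domain.

    #include <stdio.h>

    static int has_spare_capacity(unsigned long capacity, unsigned long util,
                                  unsigned int imbalance_pct)
    {
        return capacity * 100 > util * imbalance_pct;
    }

    static int is_overloaded(unsigned long capacity, unsigned long util,
                             unsigned int imbalance_pct)
    {
        return capacity * 100 < util * imbalance_pct;
    }

    int main(void)
    {
        unsigned long capacity = 1024;
        unsigned int pct = 125;  /* a 25% margin */

        /* With a 25% margin the group only counts as having spare capacity
         * while utilization stays below 1024 * 100 / 125 = 819. */
        printf("util 700: spare=%d overloaded=%d\n",
               has_spare_capacity(capacity, 700, pct),
               is_overloaded(capacity, 700, pct));
        printf("util 900: spare=%d overloaded=%d\n",
               has_spare_capacity(capacity, 900, pct),
               is_overloaded(capacity, 900, pct));
        return 0;
    }
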
@@ -6222,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6222 return false; 6268 return false;
6223 6269
6224 if ((sgs->group_capacity * 100) < 6270 if ((sgs->group_capacity * 100) <
6225 (sgs->group_usage * env->sd->imbalance_pct)) 6271 (sgs->group_util * env->sd->imbalance_pct))
6226 return true; 6272 return true;
6227 6273
6228 return false; 6274 return false;
6229} 6275}
6230 6276
6231static enum group_type group_classify(struct lb_env *env, 6277static inline enum
6232 struct sched_group *group, 6278group_type group_classify(struct sched_group *group,
6233 struct sg_lb_stats *sgs) 6279 struct sg_lb_stats *sgs)
6234{ 6280{
6235 if (sgs->group_no_capacity) 6281 if (sgs->group_no_capacity)
6236 return group_overloaded; 6282 return group_overloaded;
@@ -6270,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6270 load = source_load(i, load_idx); 6316 load = source_load(i, load_idx);
6271 6317
6272 sgs->group_load += load; 6318 sgs->group_load += load;
6273 sgs->group_usage += get_cpu_usage(i); 6319 sgs->group_util += cpu_util(i);
6274 sgs->sum_nr_running += rq->cfs.h_nr_running; 6320 sgs->sum_nr_running += rq->cfs.h_nr_running;
6275 6321
6276 if (rq->nr_running > 1) 6322 if (rq->nr_running > 1)
@@ -6295,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
6295 sgs->group_weight = group->group_weight; 6341 sgs->group_weight = group->group_weight;
6296 6342
6297 sgs->group_no_capacity = group_is_overloaded(env, sgs); 6343 sgs->group_no_capacity = group_is_overloaded(env, sgs);
6298 sgs->group_type = group_classify(env, group, sgs); 6344 sgs->group_type = group_classify(group, sgs);
6299} 6345}
6300 6346
6301/** 6347/**
@@ -6429,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
6429 group_has_capacity(env, &sds->local_stat) && 6475 group_has_capacity(env, &sds->local_stat) &&
6430 (sgs->sum_nr_running > 1)) { 6476 (sgs->sum_nr_running > 1)) {
6431 sgs->group_no_capacity = 1; 6477 sgs->group_no_capacity = 1;
6432 sgs->group_type = group_overloaded; 6478 sgs->group_type = group_classify(sg, sgs);
6433 } 6479 }
6434 6480
6435 if (update_sd_pick_busiest(env, sds, sg, sgs)) { 6481 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -7609,8 +7655,22 @@ out:
7609 * When the cpu is attached to null domain for ex, it will not be 7655 * When the cpu is attached to null domain for ex, it will not be
7610 * updated. 7656 * updated.
7611 */ 7657 */
7612 if (likely(update_next_balance)) 7658 if (likely(update_next_balance)) {
7613 rq->next_balance = next_balance; 7659 rq->next_balance = next_balance;
7660
7661#ifdef CONFIG_NO_HZ_COMMON
7662 /*
 7663 * If this CPU has been elected to perform the nohz idle
 7664 * balance, the other idle CPUs have already rebalanced with
 7665 * nohz_idle_balance() and nohz.next_balance has been
 7666 * updated accordingly. This CPU is now running the idle load
 7667 * balance for itself and needs to update
 7668 * nohz.next_balance accordingly.
7669 */
7670 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
7671 nohz.next_balance = rq->next_balance;
7672#endif
7673 }
7614} 7674}
7615 7675
7616#ifdef CONFIG_NO_HZ_COMMON 7676#ifdef CONFIG_NO_HZ_COMMON
@@ -7623,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7623 int this_cpu = this_rq->cpu; 7683 int this_cpu = this_rq->cpu;
7624 struct rq *rq; 7684 struct rq *rq;
7625 int balance_cpu; 7685 int balance_cpu;
7686 /* Earliest time when we have to do rebalance again */
7687 unsigned long next_balance = jiffies + 60*HZ;
7688 int update_next_balance = 0;
7626 7689
7627 if (idle != CPU_IDLE || 7690 if (idle != CPU_IDLE ||
7628 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) 7691 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
@@ -7654,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7654 rebalance_domains(rq, CPU_IDLE); 7717 rebalance_domains(rq, CPU_IDLE);
7655 } 7718 }
7656 7719
7657 if (time_after(this_rq->next_balance, rq->next_balance)) 7720 if (time_after(next_balance, rq->next_balance)) {
7658 this_rq->next_balance = rq->next_balance; 7721 next_balance = rq->next_balance;
7722 update_next_balance = 1;
7723 }
7659 } 7724 }
7660 nohz.next_balance = this_rq->next_balance; 7725
7726 /*
7727 * next_balance will be updated only when there is a need.
 7728 * When the CPU is attached to a null domain, for example, it will not be
7729 * updated.
7730 */
7731 if (likely(update_next_balance))
7732 nohz.next_balance = next_balance;
7661end: 7733end:
7662 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); 7734 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7663} 7735}
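
Both next_balance updates above rely on time_after(), which stays correct when jiffies wraps because it compares via a signed difference rather than a plain '>'. A userspace sketch of that property; the helper mirrors the kernel's formulation and the sample values are arbitrary.

    #include <stdio.h>

    /* Same formulation as the kernel's time_after(a, b): true when a is
     * later than b, even across a counter wrap. */
    static int time_after(unsigned long a, unsigned long b)
    {
        return (long)(b - a) < 0;
    }

    int main(void)
    {
        unsigned long before_wrap = (unsigned long)-10;  /* just before wrapping */
        unsigned long after_wrap  = 5;                   /* shortly after wrapping */

        /* A plain comparison gets the wrapped case wrong ... */
        printf("plain '>'  : %d\n", after_wrap > before_wrap);
        /* ... while the signed-difference form still says "later". */
        printf("time_after : %d\n", time_after(after_wrap, before_wrap));
        return 0;
    }
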
@@ -7810,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7810 entity_tick(cfs_rq, se, queued); 7882 entity_tick(cfs_rq, se, queued);
7811 } 7883 }
7812 7884
7813 if (numabalancing_enabled) 7885 if (static_branch_unlikely(&sched_numa_balancing))
7814 task_tick_numa(rq, curr); 7886 task_tick_numa(rq, curr);
7815} 7887}
7816 7888
@@ -7886,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7886 check_preempt_curr(rq, p, 0); 7958 check_preempt_curr(rq, p, 0);
7887} 7959}
7888 7960
7889static void switched_from_fair(struct rq *rq, struct task_struct *p) 7961static inline bool vruntime_normalized(struct task_struct *p)
7890{ 7962{
7891 struct sched_entity *se = &p->se; 7963 struct sched_entity *se = &p->se;
7892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7893 7964
7894 /* 7965 /*
7895 * Ensure the task's vruntime is normalized, so that when it's 7966 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
7896 * switched back to the fair class the enqueue_entity(.flags=0) will 7967 * the dequeue_entity(.flags=0) will already have normalized the
7897 * do the right thing. 7968 * vruntime.
7969 */
7970 if (p->on_rq)
7971 return true;
7972
7973 /*
7974 * When !on_rq, vruntime of the task has usually NOT been normalized.
7975 * But there are some cases where it has already been normalized:
7898 * 7976 *
7899 * If it's queued, then the dequeue_entity(.flags=0) will already 7977 * - A forked child which is waiting for being woken up by
7900 * have normalized the vruntime, if it's !queued, then only when 7978 * wake_up_new_task().
7901 * the task is sleeping will it still have non-normalized vruntime. 7979 * - A task which has been woken up by try_to_wake_up() and
7980 * waiting for actually being woken up by sched_ttwu_pending().
7902 */ 7981 */
7903 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { 7982 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
7983 return true;
7984
7985 return false;
7986}
7987
7988static void detach_task_cfs_rq(struct task_struct *p)
7989{
7990 struct sched_entity *se = &p->se;
7991 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7992
7993 if (!vruntime_normalized(p)) {
7904 /* 7994 /*
7905 * Fix up our vruntime so that the current sleep doesn't 7995 * Fix up our vruntime so that the current sleep doesn't
7906 * cause 'unlimited' sleep bonus. 7996 * cause 'unlimited' sleep bonus.
@@ -7909,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7909 se->vruntime -= cfs_rq->min_vruntime; 7999 se->vruntime -= cfs_rq->min_vruntime;
7910 } 8000 }
7911 8001
7912#ifdef CONFIG_SMP
7913 /* Catch up with the cfs_rq and remove our load when we leave */ 8002 /* Catch up with the cfs_rq and remove our load when we leave */
7914 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, 8003 detach_entity_load_avg(cfs_rq, se);
7915 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
7916
7917 cfs_rq->avg.load_avg =
7918 max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
7919 cfs_rq->avg.load_sum =
7920 max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
7921 cfs_rq->avg.util_avg =
7922 max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
7923 cfs_rq->avg.util_sum =
7924 max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
7925#endif
7926} 8004}
7927 8005
7928/* 8006static void attach_task_cfs_rq(struct task_struct *p)
7929 * We switched to the sched_fair class.
7930 */
7931static void switched_to_fair(struct rq *rq, struct task_struct *p)
7932{ 8007{
7933 struct sched_entity *se = &p->se; 8008 struct sched_entity *se = &p->se;
8009 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7934 8010
7935#ifdef CONFIG_FAIR_GROUP_SCHED 8011#ifdef CONFIG_FAIR_GROUP_SCHED
7936 /* 8012 /*
@@ -7940,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
7940 se->depth = se->parent ? se->parent->depth + 1 : 0; 8016 se->depth = se->parent ? se->parent->depth + 1 : 0;
7941#endif 8017#endif
7942 8018
7943 if (!task_on_rq_queued(p)) { 8019 /* Synchronize task with its cfs_rq */
8020 attach_entity_load_avg(cfs_rq, se);
8021
8022 if (!vruntime_normalized(p))
8023 se->vruntime += cfs_rq->min_vruntime;
8024}
8025
8026static void switched_from_fair(struct rq *rq, struct task_struct *p)
8027{
8028 detach_task_cfs_rq(p);
8029}
8030
8031static void switched_to_fair(struct rq *rq, struct task_struct *p)
8032{
8033 attach_task_cfs_rq(p);
7944 8034
8035 if (task_on_rq_queued(p)) {
7945 /* 8036 /*
7946 * Ensure the task has a non-normalized vruntime when it is switched 8037 * We were most likely switched from sched_rt, so
7947 * back to the fair class with !queued, so that enqueue_entity() at 8038 * kick off the schedule if running, otherwise just see
7948 * wake-up time will do the right thing. 8039 * if we can still preempt the current task.
7949 *
7950 * If it's queued, then the enqueue_entity(.flags=0) makes the task
7951 * has non-normalized vruntime, if it's !queued, then it still has
7952 * normalized vruntime.
7953 */ 8040 */
7954 if (p->state != TASK_RUNNING) 8041 if (rq->curr == p)
7955 se->vruntime += cfs_rq_of(se)->min_vruntime; 8042 resched_curr(rq);
7956 return; 8043 else
8044 check_preempt_curr(rq, p, 0);
7957 } 8045 }
7958
7959 /*
7960 * We were most likely switched from sched_rt, so
7961 * kick off the schedule if running, otherwise just see
7962 * if we can still preempt the current task.
7963 */
7964 if (rq->curr == p)
7965 resched_curr(rq);
7966 else
7967 check_preempt_curr(rq, p, 0);
7968} 8046}
7969 8047
7970/* Account for a task changing its policy or group. 8048/* Account for a task changing its policy or group.
@@ -7999,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7999} 8077}
8000 8078
8001#ifdef CONFIG_FAIR_GROUP_SCHED 8079#ifdef CONFIG_FAIR_GROUP_SCHED
8002static void task_move_group_fair(struct task_struct *p, int queued) 8080static void task_move_group_fair(struct task_struct *p)
8003{ 8081{
8004 struct sched_entity *se = &p->se; 8082 detach_task_cfs_rq(p);
8005 struct cfs_rq *cfs_rq;
8006
8007 /*
8008 * If the task was not on the rq at the time of this cgroup movement
8009 * it must have been asleep, sleeping tasks keep their ->vruntime
8010 * absolute on their old rq until wakeup (needed for the fair sleeper
8011 * bonus in place_entity()).
8012 *
8013 * If it was on the rq, we've just 'preempted' it, which does convert
8014 * ->vruntime to a relative base.
8015 *
8016 * Make sure both cases convert their relative position when migrating
8017 * to another cgroup's rq. This does somewhat interfere with the
8018 * fair sleeper stuff for the first placement, but who cares.
8019 */
8020 /*
8021 * When !queued, vruntime of the task has usually NOT been normalized.
8022 * But there are some cases where it has already been normalized:
8023 *
8024 * - Moving a forked child which is waiting for being woken up by
8025 * wake_up_new_task().
8026 * - Moving a task which has been woken up by try_to_wake_up() and
8027 * waiting for actually being woken up by sched_ttwu_pending().
8028 *
8029 * To prevent boost or penalty in the new cfs_rq caused by delta
8030 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
8031 */
8032 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
8033 queued = 1;
8034
8035 if (!queued)
8036 se->vruntime -= cfs_rq_of(se)->min_vruntime;
8037 set_task_rq(p, task_cpu(p)); 8083 set_task_rq(p, task_cpu(p));
8038 se->depth = se->parent ? se->parent->depth + 1 : 0;
8039 if (!queued) {
8040 cfs_rq = cfs_rq_of(se);
8041 se->vruntime += cfs_rq->min_vruntime;
8042 8084
8043#ifdef CONFIG_SMP 8085#ifdef CONFIG_SMP
8044 /* Virtually synchronize task with its new cfs_rq */ 8086 /* Tell se's cfs_rq has been changed -- migrated */
8045 p->se.avg.last_update_time = cfs_rq->avg.last_update_time; 8087 p->se.avg.last_update_time = 0;
8046 cfs_rq->avg.load_avg += p->se.avg.load_avg;
8047 cfs_rq->avg.load_sum += p->se.avg.load_sum;
8048 cfs_rq->avg.util_avg += p->se.avg.util_avg;
8049 cfs_rq->avg.util_sum += p->se.avg.util_sum;
8050#endif 8088#endif
8051 } 8089 attach_task_cfs_rq(p);
8052} 8090}
8053 8091
8054void free_fair_sched_group(struct task_group *tg) 8092void free_fair_sched_group(struct task_group *tg)
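
The detach_task_cfs_rq()/attach_task_cfs_rq() pair introduced above keeps a task's vruntime relative while it is off a runqueue: detaching subtracts the old cfs_rq's min_vruntime and attaching adds the new one, so only the task's lag is carried across queues, not an absolute timestamp. A toy sketch of that re-basing with plain numbers; no scheduler structures are involved.

    #include <stdio.h>

    int main(void)
    {
        unsigned long long src_min_vruntime = 1000000;  /* old cfs_rq */
        unsigned long long dst_min_vruntime = 5000000;  /* new cfs_rq */
        unsigned long long vruntime = 1000200;          /* task, 200 ahead of the old queue */

        /* Detach: keep only the offset against the old queue. */
        unsigned long long lag = vruntime - src_min_vruntime;

        /* Attach: re-base the offset on the new queue's min_vruntime, so the
         * task is neither boosted nor penalized by the queues' different
         * absolute clocks. */
        unsigned long long new_vruntime = dst_min_vruntime + lag;

        printf("lag carried over: %llu, vruntime on new queue: %llu\n",
               lag, new_vruntime);
        return 0;
    }
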
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 83a50e7ca533..69631fa46c2f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
36 */ 36 */
37SCHED_FEAT(WAKEUP_PREEMPTION, true) 37SCHED_FEAT(WAKEUP_PREEMPTION, true)
38 38
39/*
40 * Use arch dependent cpu capacity functions
41 */
42SCHED_FEAT(ARCH_CAPACITY, true)
43
44SCHED_FEAT(HRTICK, false) 39SCHED_FEAT(HRTICK, false)
45SCHED_FEAT(DOUBLE_TICK, false) 40SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 41SCHED_FEAT(LB_BIAS, true)
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
72SCHED_FEAT(FORCE_SD_OVERLAP, false) 67SCHED_FEAT(FORCE_SD_OVERLAP, false)
73SCHED_FEAT(RT_RUNTIME_SHARE, true) 68SCHED_FEAT(RT_RUNTIME_SHARE, true)
74SCHED_FEAT(LB_MIN, false) 69SCHED_FEAT(LB_MIN, false)
70SCHED_FEAT(ATTACH_AGE_LOAD, true)
75 71
76/*
77 * Apply the automatic NUMA scheduling policy. Enabled automatically
78 * at runtime if running on a NUMA machine. Can be controlled via
79 * numa_balancing=
80 */
81#ifdef CONFIG_NUMA_BALANCING
82
83/*
84 * NUMA will favor moving tasks towards nodes where a higher number of
85 * hinting faults are recorded during active load balancing. It will
86 * resist moving tasks towards nodes where a lower number of hinting
87 * faults have been recorded.
88 */
89SCHED_FEAT(NUMA, true)
90#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f177c73ae19..4a2ef5a02fd3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void)
57 rcu_idle_enter(); 57 rcu_idle_enter();
58 trace_cpu_idle_rcuidle(0, smp_processor_id()); 58 trace_cpu_idle_rcuidle(0, smp_processor_id());
59 local_irq_enable(); 59 local_irq_enable();
60 stop_critical_timings();
60 while (!tif_need_resched() && 61 while (!tif_need_resched() &&
61 (cpu_idle_force_poll || tick_check_broadcast_expired())) 62 (cpu_idle_force_poll || tick_check_broadcast_expired()))
62 cpu_relax(); 63 cpu_relax();
64 start_critical_timings();
63 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 65 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
64 rcu_idle_exit(); 66 rcu_idle_exit();
65 return 1; 67 return 1;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d2ea59364a1c..e3cc16312046 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
635/* 635/*
636 * We ran out of runtime, see if we can borrow some from our neighbours. 636 * We ran out of runtime, see if we can borrow some from our neighbours.
637 */ 637 */
638static int do_balance_runtime(struct rt_rq *rt_rq) 638static void do_balance_runtime(struct rt_rq *rt_rq)
639{ 639{
640 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 640 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
641 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd; 641 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
642 int i, weight, more = 0; 642 int i, weight;
643 u64 rt_period; 643 u64 rt_period;
644 644
645 weight = cpumask_weight(rd->span); 645 weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
673 diff = rt_period - rt_rq->rt_runtime; 673 diff = rt_period - rt_rq->rt_runtime;
674 iter->rt_runtime -= diff; 674 iter->rt_runtime -= diff;
675 rt_rq->rt_runtime += diff; 675 rt_rq->rt_runtime += diff;
676 more = 1;
677 if (rt_rq->rt_runtime == rt_period) { 676 if (rt_rq->rt_runtime == rt_period) {
678 raw_spin_unlock(&iter->rt_runtime_lock); 677 raw_spin_unlock(&iter->rt_runtime_lock);
679 break; 678 break;
@@ -683,8 +682,6 @@ next:
683 raw_spin_unlock(&iter->rt_runtime_lock); 682 raw_spin_unlock(&iter->rt_runtime_lock);
684 } 683 }
685 raw_spin_unlock(&rt_b->rt_runtime_lock); 684 raw_spin_unlock(&rt_b->rt_runtime_lock);
686
687 return more;
688} 685}
689 686
690/* 687/*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
796 } 793 }
797} 794}
798 795
799static int balance_runtime(struct rt_rq *rt_rq) 796static void balance_runtime(struct rt_rq *rt_rq)
800{ 797{
801 int more = 0;
802
803 if (!sched_feat(RT_RUNTIME_SHARE)) 798 if (!sched_feat(RT_RUNTIME_SHARE))
804 return more; 799 return;
805 800
806 if (rt_rq->rt_time > rt_rq->rt_runtime) { 801 if (rt_rq->rt_time > rt_rq->rt_runtime) {
807 raw_spin_unlock(&rt_rq->rt_runtime_lock); 802 raw_spin_unlock(&rt_rq->rt_runtime_lock);
808 more = do_balance_runtime(rt_rq); 803 do_balance_runtime(rt_rq);
809 raw_spin_lock(&rt_rq->rt_runtime_lock); 804 raw_spin_lock(&rt_rq->rt_runtime_lock);
810 } 805 }
811
812 return more;
813} 806}
814#else /* !CONFIG_SMP */ 807#else /* !CONFIG_SMP */
815static inline int balance_runtime(struct rt_rq *rt_rq) 808static inline void balance_runtime(struct rt_rq *rt_rq) {}
816{
817 return 0;
818}
819#endif /* CONFIG_SMP */ 809#endif /* CONFIG_SMP */
820 810
821static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 811static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 68cda117574c..efd3bfc7e347 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
84 */ 84 */
85#define RUNTIME_INF ((u64)~0ULL) 85#define RUNTIME_INF ((u64)~0ULL)
86 86
87static inline int idle_policy(int policy)
88{
89 return policy == SCHED_IDLE;
90}
87static inline int fair_policy(int policy) 91static inline int fair_policy(int policy)
88{ 92{
89 return policy == SCHED_NORMAL || policy == SCHED_BATCH; 93 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
98{ 102{
99 return policy == SCHED_DEADLINE; 103 return policy == SCHED_DEADLINE;
100} 104}
105static inline bool valid_policy(int policy)
106{
107 return idle_policy(policy) || fair_policy(policy) ||
108 rt_policy(policy) || dl_policy(policy);
109}
101 110
102static inline int task_has_rt_policy(struct task_struct *p) 111static inline int task_has_rt_policy(struct task_struct *p)
103{ 112{
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
109 return dl_policy(p->policy); 118 return dl_policy(p->policy);
110} 119}
111 120
112static inline bool dl_time_before(u64 a, u64 b)
113{
114 return (s64)(a - b) < 0;
115}
116
117/* 121/*
118 * Tells if entity @a should preempt entity @b. 122 * Tells if entity @a should preempt entity @b.
119 */ 123 */
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
1003#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 1007#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
1004#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ 1008#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
1005 1009
1006#ifdef CONFIG_NUMA_BALANCING 1010extern struct static_key_false sched_numa_balancing;
1007#define sched_feat_numa(x) sched_feat(x)
1008#ifdef CONFIG_SCHED_DEBUG
1009#define numabalancing_enabled sched_feat_numa(NUMA)
1010#else
1011extern bool numabalancing_enabled;
1012#endif /* CONFIG_SCHED_DEBUG */
1013#else
1014#define sched_feat_numa(x) (0)
1015#define numabalancing_enabled (0)
1016#endif /* CONFIG_NUMA_BALANCING */
1017 1011
1018static inline u64 global_rt_period(void) 1012static inline u64 global_rt_period(void)
1019{ 1013{
@@ -1078,9 +1072,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1078 * After ->on_cpu is cleared, the task can be moved to a different CPU. 1072 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1079 * We must ensure this doesn't happen until the switch is completely 1073 * We must ensure this doesn't happen until the switch is completely
1080 * finished. 1074 * finished.
1075 *
1076 * Pairs with the control dependency and rmb in try_to_wake_up().
1081 */ 1077 */
1082 smp_wmb(); 1078 smp_store_release(&prev->on_cpu, 0);
1083 prev->on_cpu = 0;
1084#endif 1079#endif
1085#ifdef CONFIG_DEBUG_SPINLOCK 1080#ifdef CONFIG_DEBUG_SPINLOCK
1086 /* this is a valid case when another task releases the spinlock */ 1081 /* this is a valid case when another task releases the spinlock */
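
The finish_lock_switch() hunk above replaces an smp_wmb() plus plain store with smp_store_release(), which pairs with the acquire/control dependency on the reader side so that everything written before clearing ->on_cpu is visible once the flag is seen clear. Below is a userspace illustration of the same release/acquire pairing with C11 atomics; it is the ordering idea only, not the kernel primitive.

    #include <stdatomic.h>
    #include <pthread.h>
    #include <stdio.h>

    static int payload;          /* "state written while still on the CPU" */
    static atomic_int on_cpu = 1;

    static void *writer(void *arg)
    {
        (void)arg;
        payload = 42;            /* work done before the hand-off */
        /* Release: all earlier stores are visible to whoever observes 0. */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
    }

    static void *reader(void *arg)
    {
        (void)arg;
        /* Acquire: once we see 0, the payload store is guaranteed visible. */
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
            ;
        printf("payload = %d\n", payload);  /* always prints 42 */
        return NULL;
    }

    int main(void)                           /* build with: cc -pthread ... */
    {
        pthread_t w, r;

        pthread_create(&r, NULL, reader, NULL);
        pthread_create(&w, NULL, writer, NULL);
        pthread_join(w, NULL);
        pthread_join(r, NULL);
        return 0;
    }
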
@@ -1156,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
1156 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1151 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1157}; 1152};
1158 1153
1159#define ENQUEUE_WAKEUP 1 1154#define ENQUEUE_WAKEUP 0x01
1160#define ENQUEUE_HEAD 2 1155#define ENQUEUE_HEAD 0x02
1161#ifdef CONFIG_SMP 1156#ifdef CONFIG_SMP
1162#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ 1157#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
1163#else 1158#else
1164#define ENQUEUE_WAKING 0 1159#define ENQUEUE_WAKING 0x00
1165#endif 1160#endif
1166#define ENQUEUE_REPLENISH 8 1161#define ENQUEUE_REPLENISH 0x08
1162#define ENQUEUE_RESTORE 0x10
1167 1163
1168#define DEQUEUE_SLEEP 1 1164#define DEQUEUE_SLEEP 0x01
1165#define DEQUEUE_SAVE 0x02
1169 1166
1170#define RETRY_TASK ((void *)-1UL) 1167#define RETRY_TASK ((void *)-1UL)
1171 1168
@@ -1193,7 +1190,7 @@ struct sched_class {
1193 1190
1194#ifdef CONFIG_SMP 1191#ifdef CONFIG_SMP
1195 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1192 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1196 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1193 void (*migrate_task_rq)(struct task_struct *p);
1197 1194
1198 void (*task_waking) (struct task_struct *task); 1195 void (*task_waking) (struct task_struct *task);
1199 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1196 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1226,7 +1223,7 @@ struct sched_class {
1226 void (*update_curr) (struct rq *rq); 1223 void (*update_curr) (struct rq *rq);
1227 1224
1228#ifdef CONFIG_FAIR_GROUP_SCHED 1225#ifdef CONFIG_FAIR_GROUP_SCHED
1229 void (*task_move_group) (struct task_struct *p, int on_rq); 1226 void (*task_move_group) (struct task_struct *p);
1230#endif 1227#endif
1231}; 1228};
1232 1229
@@ -1404,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
1404} 1401}
1405#endif 1402#endif
1406 1403
1404#ifndef arch_scale_cpu_capacity
1405static __always_inline
1406unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
1407{
1408 if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
1409 return sd->smt_gain / sd->span_weight;
1410
1411 return SCHED_CAPACITY_SCALE;
1412}
1413#endif
1414
1407static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1415static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1408{ 1416{
1409 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); 1417 rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
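
The new default arch_scale_cpu_capacity() above folds in what default_scale_cpu_capacity() in fair.c used to do: at an SMT level the per-thread capacity is the core's smt_gain split across the siblings, otherwise it is the full SCHED_CAPACITY_SCALE. A quick standalone sketch of that arithmetic; 1178 is the usual smt_gain default but treat the numbers as illustrative.

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE 1024UL

    /* Mirrors the default above: split the SMT gain among hardware threads
     * sharing a core, otherwise report full capacity. */
    static unsigned long cpu_capacity(int share_cpucapacity,
                                      unsigned long smt_gain,
                                      unsigned int span_weight)
    {
        if (share_cpucapacity && span_weight > 1)
            return smt_gain / span_weight;

        return SCHED_CAPACITY_SCALE;
    }

    int main(void)
    {
        /* Two SMT siblings sharing a core each get 1178 / 2 = 589 ... */
        printf("SMT sibling : %lu\n", cpu_capacity(1, 1178, 2));
        /* ... while a level that does not share capacity reports 1024. */
        printf("whole core  : %lu\n", cpu_capacity(0, 1178, 1));
        return 0;
    }
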
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 272d9322bc5d..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
106} 106}
107EXPORT_SYMBOL_GPL(__wake_up_locked); 107EXPORT_SYMBOL_GPL(__wake_up_locked);
108 108
109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, 109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
110 void *key)
111{ 110{
112 __wake_up_common(q, mode, nr, 0, key); 111 __wake_up_common(q, mode, 1, 0, key);
113} 112}
114EXPORT_SYMBOL_GPL(__wake_up_locked_key); 113EXPORT_SYMBOL_GPL(__wake_up_locked_key);
115 114
@@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
284 if (!list_empty(&wait->task_list)) 283 if (!list_empty(&wait->task_list))
285 list_del_init(&wait->task_list); 284 list_del_init(&wait->task_list);
286 else if (waitqueue_active(q)) 285 else if (waitqueue_active(q))
287 __wake_up_locked_key(q, mode, 1, key); 286 __wake_up_locked_key(q, mode, key);
288 spin_unlock_irqrestore(&q->lock, flags); 287 spin_unlock_irqrestore(&q->lock, flags);
289} 288}
290EXPORT_SYMBOL(abort_exclusive_wait); 289EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5bd4779282df..580ac2d4024f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
347{ 347{
348 struct seccomp_filter *sfilter; 348 struct seccomp_filter *sfilter;
349 int ret; 349 int ret;
350 const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
350 351
351 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) 352 if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
352 return ERR_PTR(-EINVAL); 353 return ERR_PTR(-EINVAL);
@@ -370,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
370 return ERR_PTR(-ENOMEM); 371 return ERR_PTR(-ENOMEM);
371 372
372 ret = bpf_prog_create_from_user(&sfilter->prog, fprog, 373 ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
373 seccomp_check_filter); 374 seccomp_check_filter, save_orig);
374 if (ret < 0) { 375 if (ret < 0) {
375 kfree(sfilter); 376 kfree(sfilter);
376 return ERR_PTR(ret); 377 return ERR_PTR(ret);
@@ -469,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk)
469static inline void seccomp_filter_free(struct seccomp_filter *filter) 470static inline void seccomp_filter_free(struct seccomp_filter *filter)
470{ 471{
471 if (filter) { 472 if (filter) {
472 bpf_prog_free(filter->prog); 473 bpf_prog_destroy(filter->prog);
473 kfree(filter); 474 kfree(filter);
474 } 475 }
475} 476}
@@ -867,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
867 /* prctl interface doesn't have flags, so they are always zero. */ 868 /* prctl interface doesn't have flags, so they are always zero. */
868 return do_seccomp(op, 0, uargs); 869 return do_seccomp(op, 0, uargs);
869} 870}
871
872#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
873long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
874 void __user *data)
875{
876 struct seccomp_filter *filter;
877 struct sock_fprog_kern *fprog;
878 long ret;
879 unsigned long count = 0;
880
881 if (!capable(CAP_SYS_ADMIN) ||
882 current->seccomp.mode != SECCOMP_MODE_DISABLED) {
883 return -EACCES;
884 }
885
886 spin_lock_irq(&task->sighand->siglock);
887 if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
888 ret = -EINVAL;
889 goto out;
890 }
891
892 filter = task->seccomp.filter;
893 while (filter) {
894 filter = filter->prev;
895 count++;
896 }
897
898 if (filter_off >= count) {
899 ret = -ENOENT;
900 goto out;
901 }
902 count -= filter_off;
903
904 filter = task->seccomp.filter;
905 while (filter && count > 1) {
906 filter = filter->prev;
907 count--;
908 }
909
910 if (WARN_ON(count != 1 || !filter)) {
911 /* The filter tree shouldn't shrink while we're using it. */
912 ret = -ENOENT;
913 goto out;
914 }
915
916 fprog = filter->prog->orig_prog;
917 if (!fprog) {
 918 /* This must be a new non-cBPF filter, since we save
 919 * every cBPF filter's orig_prog above when
920 * CONFIG_CHECKPOINT_RESTORE is enabled.
921 */
922 ret = -EMEDIUMTYPE;
923 goto out;
924 }
925
926 ret = fprog->len;
927 if (!data)
928 goto out;
929
930 get_seccomp_filter(task);
931 spin_unlock_irq(&task->sighand->siglock);
932
933 if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
934 ret = -EFAULT;
935
936 put_seccomp_filter(task);
937 return ret;
938
939out:
940 spin_unlock_irq(&task->sighand->siglock);
941 return ret;
942}
943#endif
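
seccomp_get_filter() above first counts the stacked filters (newest first via ->prev) and then walks back down so that filter_off selects one of them; in the sketch below, which mirrors that walk with a made-up struct in place of struct seccomp_filter, offset 0 reaches the oldest entry and the highest valid offset the most recently installed one.

    #include <stdio.h>
    #include <stddef.h>

    /* Stand-in for struct seccomp_filter: a newest-first stack linked
     * through ->prev. */
    struct filter {
        const char *name;
        struct filter *prev;
    };

    static struct filter *nth_filter(struct filter *newest, unsigned long off)
    {
        unsigned long count = 0;
        struct filter *f;

        for (f = newest; f; f = f->prev)
            count++;                 /* total number of stacked filters */

        if (off >= count)
            return NULL;             /* the kernel returns -ENOENT here */

        count -= off;
        for (f = newest; f && count > 1; f = f->prev)
            count--;                 /* walk down to the requested one */

        return f;
    }

    int main(void)
    {
        struct filter first  = { "first installed", NULL };
        struct filter second = { "second installed", &first };
        struct filter third  = { "third installed", &second };

        printf("off 0 -> %s\n", nth_filter(&third, 0)->name);
        printf("off 2 -> %s\n", nth_filter(&third, 2)->name);
        printf("off 3 -> %s\n", nth_filter(&third, 3) ? "found" : "out of range");
        return 0;
    }
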
diff --git a/kernel/signal.c b/kernel/signal.c
index 0f6bbbe77b46..c0b01fe24bbd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig)
503 return !tsk->ptrace; 503 return !tsk->ptrace;
504} 504}
505 505
506/*
507 * Notify the system that a driver wants to block all signals for this
508 * process, and wants to be notified if any signals at all were to be
509 * sent/acted upon. If the notifier routine returns non-zero, then the
510 * signal will be acted upon after all. If the notifier routine returns 0,
511 * then then signal will be blocked. Only one block per process is
512 * allowed. priv is a pointer to private data that the notifier routine
513 * can use to determine if the signal should be blocked or not.
514 */
515void
516block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
517{
518 unsigned long flags;
519
520 spin_lock_irqsave(&current->sighand->siglock, flags);
521 current->notifier_mask = mask;
522 current->notifier_data = priv;
523 current->notifier = notifier;
524 spin_unlock_irqrestore(&current->sighand->siglock, flags);
525}
526
527/* Notify the system that blocking has ended. */
528
529void
530unblock_all_signals(void)
531{
532 unsigned long flags;
533
534 spin_lock_irqsave(&current->sighand->siglock, flags);
535 current->notifier = NULL;
536 current->notifier_data = NULL;
537 recalc_sigpending();
538 spin_unlock_irqrestore(&current->sighand->siglock, flags);
539}
540
541static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) 506static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
542{ 507{
543 struct sigqueue *q, *first = NULL; 508 struct sigqueue *q, *first = NULL;
@@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
580{ 545{
581 int sig = next_signal(pending, mask); 546 int sig = next_signal(pending, mask);
582 547
583 if (sig) { 548 if (sig)
584 if (current->notifier) {
585 if (sigismember(current->notifier_mask, sig)) {
586 if (!(current->notifier)(current->notifier_data)) {
587 clear_thread_flag(TIF_SIGPENDING);
588 return 0;
589 }
590 }
591 }
592
593 collect_signal(sig, pending, info); 549 collect_signal(sig, pending, info);
594 }
595
596 return sig; 550 return sig;
597} 551}
598 552
@@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
834 sigset_t flush; 788 sigset_t flush;
835 789
836 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { 790 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
837 if (signal->flags & SIGNAL_GROUP_COREDUMP) 791 if (!(signal->flags & SIGNAL_GROUP_EXIT))
838 return sig == SIGKILL; 792 return sig == SIGKILL;
839 /* 793 /*
840 * The process is in the middle of dying, nothing to do. 794 * The process is in the middle of dying, nothing to do.
@@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig);
2483EXPORT_SYMBOL(send_sig); 2437EXPORT_SYMBOL(send_sig);
2484EXPORT_SYMBOL(send_sig_info); 2438EXPORT_SYMBOL(send_sig_info);
2485EXPORT_SYMBOL(sigprocmask); 2439EXPORT_SYMBOL(sigprocmask);
2486EXPORT_SYMBOL(block_all_signals);
2487EXPORT_SYMBOL(unblock_all_signals);
2488
2489 2440
2490/* 2441/*
2491 * System call entry points. 2442 * System call entry points.
diff --git a/kernel/smp.c b/kernel/smp.c
index 07854477c164..d903c02223af 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
669 cpumask_var_t cpus; 669 cpumask_var_t cpus;
670 int cpu, ret; 670 int cpu, ret;
671 671
672 might_sleep_if(gfp_flags & __GFP_WAIT); 672 might_sleep_if(gfpflags_allow_blocking(gfp_flags));
673 673
674 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { 674 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
675 preempt_disable(); 675 preempt_disable();
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index a818cbc73e14..d264f59bff56 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
222{ 222{
223 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 223 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
224 224
225 if (ht->pre_unpark) 225 if (!ht->selfparking)
226 ht->pre_unpark(cpu); 226 kthread_unpark(tsk);
227 kthread_unpark(tsk);
228} 227}
229 228
230void smpboot_unpark_threads(unsigned int cpu) 229void smpboot_unpark_threads(unsigned int cpu)
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12484e5d5c88..867bc20e1ef1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
73 } 73 }
74} 74}
75 75
76static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
77 struct cpu_stop_work *work)
78{
79 list_add_tail(&work->list, &stopper->works);
80 wake_up_process(stopper->thread);
81}
82
76/* queue @work to @stopper. if offline, @work is completed immediately */ 83/* queue @work to @stopper. if offline, @work is completed immediately */
77static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) 84static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
78{ 85{
79 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 86 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
80
81 unsigned long flags; 87 unsigned long flags;
82 88
83 spin_lock_irqsave(&stopper->lock, flags); 89 spin_lock_irqsave(&stopper->lock, flags);
84 90 if (stopper->enabled)
85 if (stopper->enabled) { 91 __cpu_stop_queue_work(stopper, work);
86 list_add_tail(&work->list, &stopper->works); 92 else
87 wake_up_process(stopper->thread);
88 } else
89 cpu_stop_signal_done(work->done, false); 93 cpu_stop_signal_done(work->done, false);
90
91 spin_unlock_irqrestore(&stopper->lock, flags); 94 spin_unlock_irqrestore(&stopper->lock, flags);
92} 95}
93 96
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data)
213 return err; 216 return err;
214} 217}
215 218
219static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
220 int cpu2, struct cpu_stop_work *work2)
221{
222 struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
223 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
224 int err;
225
226 lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
227 spin_lock_irq(&stopper1->lock);
228 spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
229
230 err = -ENOENT;
231 if (!stopper1->enabled || !stopper2->enabled)
232 goto unlock;
233
234 err = 0;
235 __cpu_stop_queue_work(stopper1, work1);
236 __cpu_stop_queue_work(stopper2, work2);
237unlock:
238 spin_unlock(&stopper2->lock);
239 spin_unlock_irq(&stopper1->lock);
240 lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
241
242 return err;
243}
216/** 244/**
217 * stop_two_cpus - stops two cpus 245 * stop_two_cpus - stops two cpus
218 * @cpu1: the cpu to stop 246 * @cpu1: the cpu to stop
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
247 cpu_stop_init_done(&done, 2); 275 cpu_stop_init_done(&done, 2);
248 set_state(&msdata, MULTI_STOP_PREPARE); 276 set_state(&msdata, MULTI_STOP_PREPARE);
249 277
250 /* 278 if (cpu1 > cpu2)
251 * If we observe both CPUs active we know _cpu_down() cannot yet have 279 swap(cpu1, cpu2);
252 * queued its stop_machine works and therefore ours will get executed 280 if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
253 * first. Or its not either one of our CPUs that's getting unplugged,
254 * in which case we don't care.
255 *
256 * This relies on the stopper workqueues to be FIFO.
257 */
258 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
259 preempt_enable(); 281 preempt_enable();
260 return -ENOENT; 282 return -ENOENT;
261 } 283 }
262 284
263 lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
264 cpu_stop_queue_work(cpu1, &work1);
265 cpu_stop_queue_work(cpu2, &work2);
266 lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
267
268 preempt_enable(); 285 preempt_enable();
269 286
270 wait_for_completion(&done.completion); 287 wait_for_completion(&done.completion);
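
cpu_stop_queue_two_works() together with the swap(cpu1, cpu2) in stop_two_cpus() above enforces one global order for taking the two per-CPU stopper locks, which is what makes the nested spin_lock safe against ABBA deadlock. A userspace sketch of the same rule with pthread mutexes; the CPU count and naming are arbitrary.

    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS 4

    static pthread_mutex_t stopper_lock[NR_CPUS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    };

    static void queue_two(int cpu1, int cpu2)
    {
        /* Fixed ordering: always lock the lower-numbered CPU first,
         * whatever order the caller passed them in. */
        if (cpu1 > cpu2) {
            int tmp = cpu1; cpu1 = cpu2; cpu2 = tmp;
        }
        pthread_mutex_lock(&stopper_lock[cpu1]);
        pthread_mutex_lock(&stopper_lock[cpu2]);   /* the "nested" acquisition */

        printf("queued work on CPUs %d and %d\n", cpu1, cpu2);

        pthread_mutex_unlock(&stopper_lock[cpu2]);
        pthread_mutex_unlock(&stopper_lock[cpu1]);
    }

    int main(void)
    {
        /* Both call orders take the locks in the same global order, so two
         * concurrent callers can never deadlock on each other. */
        queue_two(1, 3);
        queue_two(3, 1);
        return 0;
    }
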
@@ -452,6 +469,18 @@ repeat:
452 } 469 }
453} 470}
454 471
472void stop_machine_park(int cpu)
473{
474 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
475 /*
476 * Lockless. cpu_stopper_thread() will take stopper->lock and flush
 477 * the pending works before it parks; until then it is fine to queue
478 * the new works.
479 */
480 stopper->enabled = false;
481 kthread_park(stopper->thread);
482}
483
455extern void sched_set_stop_task(int cpu, struct task_struct *stop); 484extern void sched_set_stop_task(int cpu, struct task_struct *stop);
456 485
457static void cpu_stop_create(unsigned int cpu) 486static void cpu_stop_create(unsigned int cpu)
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu)
462static void cpu_stop_park(unsigned int cpu) 491static void cpu_stop_park(unsigned int cpu)
463{ 492{
464 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 493 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
465 struct cpu_stop_work *work, *tmp;
466 unsigned long flags;
467 494
468 /* drain remaining works */ 495 WARN_ON(!list_empty(&stopper->works));
469 spin_lock_irqsave(&stopper->lock, flags);
470 list_for_each_entry_safe(work, tmp, &stopper->works, list) {
471 list_del_init(&work->list);
472 cpu_stop_signal_done(work->done, false);
473 }
474 stopper->enabled = false;
475 spin_unlock_irqrestore(&stopper->lock, flags);
476} 496}
477 497
478static void cpu_stop_unpark(unsigned int cpu) 498void stop_machine_unpark(int cpu)
479{ 499{
480 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 500 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
481 501
482 spin_lock_irq(&stopper->lock);
483 stopper->enabled = true; 502 stopper->enabled = true;
484 spin_unlock_irq(&stopper->lock); 503 kthread_unpark(stopper->thread);
485} 504}
486 505
487static struct smp_hotplug_thread cpu_stop_threads = { 506static struct smp_hotplug_thread cpu_stop_threads = {
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
490 .thread_fn = cpu_stopper_thread, 509 .thread_fn = cpu_stopper_thread,
491 .thread_comm = "migration/%u", 510 .thread_comm = "migration/%u",
492 .create = cpu_stop_create, 511 .create = cpu_stop_create,
493 .setup = cpu_stop_unpark,
494 .park = cpu_stop_park, 512 .park = cpu_stop_park,
495 .pre_unpark = cpu_stop_unpark,
496 .selfparking = true, 513 .selfparking = true,
497}; 514};
498 515
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
508 } 525 }
509 526
510 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); 527 BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
528 stop_machine_unpark(raw_smp_processor_id());
511 stop_machine_initialized = true; 529 stop_machine_initialized = true;
512 return 0; 530 return 0;
513} 531}
diff --git a/kernel/sys.c b/kernel/sys.c
index fa2f2f671a5c..6af9212ab5aa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
222 goto out_unlock; /* No processes for this user */ 222 goto out_unlock; /* No processes for this user */
223 } 223 }
224 do_each_thread(g, p) { 224 do_each_thread(g, p) {
225 if (uid_eq(task_uid(p), uid)) 225 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
226 error = set_one_prio(p, niceval, error); 226 error = set_one_prio(p, niceval, error);
227 } while_each_thread(g, p); 227 } while_each_thread(g, p);
228 if (!uid_eq(uid, cred->uid)) 228 if (!uid_eq(uid, cred->uid))
@@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
290 goto out_unlock; /* No processes for this user */ 290 goto out_unlock; /* No processes for this user */
291 } 291 }
292 do_each_thread(g, p) { 292 do_each_thread(g, p) {
293 if (uid_eq(task_uid(p), uid)) { 293 if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
294 niceval = nice_to_rlimit(task_nice(p)); 294 niceval = nice_to_rlimit(task_nice(p));
295 if (niceval > retval) 295 if (niceval > retval)
296 retval = niceval; 296 retval = niceval;
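
The sys.c hunks restrict the PRIO_USER walk in setpriority()/getpriority() to tasks that are visible in the caller's pid namespace (task_pid_vnr() != 0). For reference, a small userspace sketch of the syscall pair those hunks touch; the uid and the nice adjustment are arbitrary example values:

#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	uid_t uid = getuid();
	int prio;

	errno = 0;
	prio = getpriority(PRIO_USER, uid);   /* -1 is a valid return, so check errno */
	if (prio == -1 && errno) {
		perror("getpriority");
		return 1;
	}
	printf("highest priority (lowest nice) among uid %u's tasks: %d\n",
	       (unsigned)uid, prio);

	/* raise the nice value (lower the priority) of all of this user's tasks */
	if (setpriority(PRIO_USER, uid, prio + 1) == -1)
		perror("setpriority");
	return 0;
}
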
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a02decf15583..0623787ec67a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock);
194cond_syscall(sys_munlock); 194cond_syscall(sys_munlock);
195cond_syscall(sys_mlockall); 195cond_syscall(sys_mlockall);
196cond_syscall(sys_munlockall); 196cond_syscall(sys_munlockall);
197cond_syscall(sys_mlock2);
197cond_syscall(sys_mincore); 198cond_syscall(sys_mincore);
198cond_syscall(sys_madvise); 199cond_syscall(sys_madvise);
199cond_syscall(sys_mremap); 200cond_syscall(sys_mremap);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..dc6858d6639e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
64#include <linux/binfmts.h> 64#include <linux/binfmts.h>
65#include <linux/sched/sysctl.h> 65#include <linux/sched/sysctl.h>
66#include <linux/kexec.h> 66#include <linux/kexec.h>
67#include <linux/bpf.h>
67 68
68#include <asm/uaccess.h> 69#include <asm/uaccess.h>
69#include <asm/processor.h> 70#include <asm/processor.h>
@@ -887,6 +888,17 @@ static struct ctl_table kern_table[] = {
887 .extra1 = &zero, 888 .extra1 = &zero,
888 .extra2 = &one, 889 .extra2 = &one,
889 }, 890 },
891#ifdef CONFIG_HARDLOCKUP_DETECTOR
892 {
893 .procname = "hardlockup_panic",
894 .data = &hardlockup_panic,
895 .maxlen = sizeof(int),
896 .mode = 0644,
897 .proc_handler = proc_dointvec_minmax,
898 .extra1 = &zero,
899 .extra2 = &one,
900 },
901#endif
890#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
891 { 903 {
892 .procname = "softlockup_all_cpu_backtrace", 904 .procname = "softlockup_all_cpu_backtrace",
@@ -897,6 +909,15 @@ static struct ctl_table kern_table[] = {
897 .extra1 = &zero, 909 .extra1 = &zero,
898 .extra2 = &one, 910 .extra2 = &one,
899 }, 911 },
912 {
913 .procname = "hardlockup_all_cpu_backtrace",
914 .data = &sysctl_hardlockup_all_cpu_backtrace,
915 .maxlen = sizeof(int),
916 .mode = 0644,
917 .proc_handler = proc_dointvec_minmax,
918 .extra1 = &zero,
919 .extra2 = &one,
920 },
900#endif /* CONFIG_SMP */ 921#endif /* CONFIG_SMP */
901#endif 922#endif
902#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 923#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -1139,6 +1160,18 @@ static struct ctl_table kern_table[] = {
1139 .proc_handler = timer_migration_handler, 1160 .proc_handler = timer_migration_handler,
1140 }, 1161 },
1141#endif 1162#endif
1163#ifdef CONFIG_BPF_SYSCALL
1164 {
1165 .procname = "unprivileged_bpf_disabled",
1166 .data = &sysctl_unprivileged_bpf_disabled,
1167 .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
1168 .mode = 0644,
1169 /* only handle a transition from default "0" to "1" */
1170 .proc_handler = proc_dointvec_minmax,
1171 .extra1 = &one,
1172 .extra2 = &one,
1173 },
1174#endif
1142 { } 1175 { }
1143}; 1176};
1144 1177
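
The new kernel.unprivileged_bpf_disabled entry uses proc_dointvec_minmax with extra1 == extra2 == &one, so the only accepted write is "1": once flipped, unprivileged bpf() stays disabled until reboot. A trivial sketch of flipping it from userspace (needs root; the proc path only exists with CONFIG_BPF_SYSCALL):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/unprivileged_bpf_disabled", "w");

	if (!f) {
		perror("unprivileged_bpf_disabled");
		return 1;
	}
	if (fputs("1\n", f) == EOF)     /* any value other than 1 is rejected */
		perror("write");
	fclose(f);
	return 0;
}
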
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 841b72f720e8..1347882d131e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)
217 continue; 217 continue;
218 218
219 /* Check the deviation from the watchdog clocksource. */ 219 /* Check the deviation from the watchdog clocksource. */
220 if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { 220 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
221 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", 221 pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
222 cs->name); 222 cs->name);
223 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", 223 pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
@@ -479,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
479 * return half the number of nanoseconds the hardware counter can technically 479 * return half the number of nanoseconds the hardware counter can technically
480 * cover. This is done so that we can potentially detect problems caused by 480 * cover. This is done so that we can potentially detect problems caused by
481 * delayed timers or bad hardware, which might result in time intervals that 481 * delayed timers or bad hardware, which might result in time intervals that
482 * are larger then what the math used can handle without overflows. 482 * are larger than what the math used can handle without overflows.
483 */ 483 */
484u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) 484u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
485{ 485{
@@ -595,16 +595,15 @@ static void __clocksource_select(bool skipcur)
595 */ 595 */
596static void clocksource_select(void) 596static void clocksource_select(void)
597{ 597{
598 return __clocksource_select(false); 598 __clocksource_select(false);
599} 599}
600 600
601static void clocksource_select_fallback(void) 601static void clocksource_select_fallback(void)
602{ 602{
603 return __clocksource_select(true); 603 __clocksource_select(true);
604} 604}
605 605
606#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ 606#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
607
608static inline void clocksource_select(void) { } 607static inline void clocksource_select(void) { }
609static inline void clocksource_select_fallback(void) { } 608static inline void clocksource_select_fallback(void) { }
610 609
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 457a373e2181..435b8850dd80 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -59,7 +59,7 @@
59/* 59/*
60 * The timer bases: 60 * The timer bases:
61 * 61 *
62 * There are more clockids then hrtimer bases. Thus, we index 62 * There are more clockids than hrtimer bases. Thus, we index
63 * into the timer bases by the hrtimer_base_type enum. When trying 63 * into the timer bases by the hrtimer_base_type enum. When trying
64 * to reach a base using a clockid, hrtimer_clockid_to_base() 64 * to reach a base using a clockid, hrtimer_clockid_to_base()
65 * is used to convert from clockid to the proper hrtimer_base_type. 65 * is used to convert from clockid to the proper hrtimer_base_type.
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index df68cb875248..149cc8086aea 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -99,7 +99,7 @@ static time64_t ntp_next_leap_sec = TIME64_MAX;
99static int pps_valid; /* signal watchdog counter */ 99static int pps_valid; /* signal watchdog counter */
100static long pps_tf[3]; /* phase median filter */ 100static long pps_tf[3]; /* phase median filter */
101static long pps_jitter; /* current jitter (ns) */ 101static long pps_jitter; /* current jitter (ns) */
102static struct timespec pps_fbase; /* beginning of the last freq interval */ 102static struct timespec64 pps_fbase; /* beginning of the last freq interval */
103static int pps_shift; /* current interval duration (s) (shift) */ 103static int pps_shift; /* current interval duration (s) (shift) */
104static int pps_intcnt; /* interval counter */ 104static int pps_intcnt; /* interval counter */
105static s64 pps_freq; /* frequency offset (scaled ns/s) */ 105static s64 pps_freq; /* frequency offset (scaled ns/s) */
@@ -509,7 +509,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
509static void sync_cmos_clock(struct work_struct *work) 509static void sync_cmos_clock(struct work_struct *work)
510{ 510{
511 struct timespec64 now; 511 struct timespec64 now;
512 struct timespec next; 512 struct timespec64 next;
513 int fail = 1; 513 int fail = 1;
514 514
515 /* 515 /*
@@ -559,7 +559,7 @@ static void sync_cmos_clock(struct work_struct *work)
559 next.tv_nsec -= NSEC_PER_SEC; 559 next.tv_nsec -= NSEC_PER_SEC;
560 } 560 }
561 queue_delayed_work(system_power_efficient_wq, 561 queue_delayed_work(system_power_efficient_wq,
562 &sync_cmos_work, timespec_to_jiffies(&next)); 562 &sync_cmos_work, timespec64_to_jiffies(&next));
563} 563}
564 564
565void ntp_notify_cmos_timer(void) 565void ntp_notify_cmos_timer(void)
@@ -773,13 +773,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
773 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] 773 * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
774 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ 774 * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
775struct pps_normtime { 775struct pps_normtime {
776 __kernel_time_t sec; /* seconds */ 776 s64 sec; /* seconds */
777 long nsec; /* nanoseconds */ 777 long nsec; /* nanoseconds */
778}; 778};
779 779
780/* normalize the timestamp so that nsec is in the 780/* normalize the timestamp so that nsec is in the
781 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ 781 ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
782static inline struct pps_normtime pps_normalize_ts(struct timespec ts) 782static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts)
783{ 783{
784 struct pps_normtime norm = { 784 struct pps_normtime norm = {
785 .sec = ts.tv_sec, 785 .sec = ts.tv_sec,
@@ -861,7 +861,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
861 pps_errcnt++; 861 pps_errcnt++;
862 pps_dec_freq_interval(); 862 pps_dec_freq_interval();
863 printk_deferred(KERN_ERR 863 printk_deferred(KERN_ERR
864 "hardpps: PPSERROR: interval too long - %ld s\n", 864 "hardpps: PPSERROR: interval too long - %lld s\n",
865 freq_norm.sec); 865 freq_norm.sec);
866 return 0; 866 return 0;
867 } 867 }
@@ -948,7 +948,7 @@ static void hardpps_update_phase(long error)
948 * This code is based on David Mills's reference nanokernel 948 * This code is based on David Mills's reference nanokernel
949 * implementation. It was mostly rewritten but keeps the same idea. 949 * implementation. It was mostly rewritten but keeps the same idea.
950 */ 950 */
951void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 951void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
952{ 952{
953 struct pps_normtime pts_norm, freq_norm; 953 struct pps_normtime pts_norm, freq_norm;
954 954
@@ -969,7 +969,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
969 } 969 }
970 970
971 /* ok, now we have a base for frequency calculation */ 971 /* ok, now we have a base for frequency calculation */
972 freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); 972 freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase));
973 973
974 /* check that the signal is in the range 974 /* check that the signal is in the range
975 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ 975 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
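
The ntp.c hunks move the PPS bookkeeping from struct timespec to struct timespec64 and widen pps_normtime.sec to s64, so the arithmetic survives 2038 on 32-bit. A standalone sketch of the normalization pps_normalize_ts() performs on the widened type (names here are illustrative):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct normtime {
	int64_t sec;            /* 64-bit seconds, as in the patched struct */
	long    nsec;           /* kept in (-NSEC_PER_SEC/2, NSEC_PER_SEC/2] */
};

static struct normtime normalize(int64_t sec, long nsec)
{
	struct normtime n = { .sec = sec, .nsec = nsec };

	if (n.nsec > NSEC_PER_SEC / 2) {        /* fold the upper half second forward */
		n.nsec -= NSEC_PER_SEC;
		n.sec++;
	}
	return n;
}

int main(void)
{
	struct normtime n = normalize(10, 900000000L);

	printf("%lld s %ld ns\n", (long long)n.sec, n.nsec);  /* 11 s -100000000 ns */
	return 0;
}
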
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 65430504ca26..af924470eac0 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -9,5 +9,5 @@ extern ktime_t ntp_get_next_leap(void);
9extern int second_overflow(unsigned long secs); 9extern int second_overflow(unsigned long secs);
10extern int ntp_validate_timex(struct timex *); 10extern int ntp_validate_timex(struct timex *);
11extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); 11extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
12extern void __hardpps(const struct timespec *, const struct timespec *); 12extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
13#endif /* _LINUX_NTP_INTERNAL_H */ 13#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 892e3dae0aac..f5e86d282d52 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -249,7 +249,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
249 * but barriers are not required because update_gt_cputime() 249 * but barriers are not required because update_gt_cputime()
250 * can handle concurrent updates. 250 * can handle concurrent updates.
251 */ 251 */
252 WRITE_ONCE(cputimer->running, 1); 252 WRITE_ONCE(cputimer->running, true);
253 } 253 }
254 sample_cputime_atomic(times, &cputimer->cputime_atomic); 254 sample_cputime_atomic(times, &cputimer->cputime_atomic);
255} 255}
@@ -864,6 +864,13 @@ static void check_thread_timers(struct task_struct *tsk,
864 unsigned long long expires; 864 unsigned long long expires;
865 unsigned long soft; 865 unsigned long soft;
866 866
867 /*
868 * If cputime_expires is zero, then there are no active
869 * per thread CPU timers.
870 */
871 if (task_cputime_zero(&tsk->cputime_expires))
872 return;
873
867 expires = check_timers_list(timers, firing, prof_ticks(tsk)); 874 expires = check_timers_list(timers, firing, prof_ticks(tsk));
868 tsk_expires->prof_exp = expires_to_cputime(expires); 875 tsk_expires->prof_exp = expires_to_cputime(expires);
869 876
@@ -911,7 +918,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
911 struct thread_group_cputimer *cputimer = &sig->cputimer; 918 struct thread_group_cputimer *cputimer = &sig->cputimer;
912 919
913 /* Turn off cputimer->running. This is done without locking. */ 920 /* Turn off cputimer->running. This is done without locking. */
914 WRITE_ONCE(cputimer->running, 0); 921 WRITE_ONCE(cputimer->running, false);
915} 922}
916 923
917static u32 onecputick; 924static u32 onecputick;
@@ -962,6 +969,19 @@ static void check_process_timers(struct task_struct *tsk,
962 unsigned long soft; 969 unsigned long soft;
963 970
964 /* 971 /*
972 * If cputimer is not running, then there are no active
973 * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
974 */
975 if (!READ_ONCE(tsk->signal->cputimer.running))
976 return;
977
978 /*
979 * Signify that a thread is checking for process timers.
980 * Write access to this field is protected by the sighand lock.
981 */
982 sig->cputimer.checking_timer = true;
983
984 /*
965 * Collect the current process totals. 985 * Collect the current process totals.
966 */ 986 */
967 thread_group_cputimer(tsk, &cputime); 987 thread_group_cputimer(tsk, &cputime);
@@ -1015,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk,
1015 sig->cputime_expires.sched_exp = sched_expires; 1035 sig->cputime_expires.sched_exp = sched_expires;
1016 if (task_cputime_zero(&sig->cputime_expires)) 1036 if (task_cputime_zero(&sig->cputime_expires))
1017 stop_process_timers(sig); 1037 stop_process_timers(sig);
1038
1039 sig->cputimer.checking_timer = false;
1018} 1040}
1019 1041
1020/* 1042/*
@@ -1117,24 +1139,33 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
1117static inline int fastpath_timer_check(struct task_struct *tsk) 1139static inline int fastpath_timer_check(struct task_struct *tsk)
1118{ 1140{
1119 struct signal_struct *sig; 1141 struct signal_struct *sig;
1120 cputime_t utime, stime;
1121
1122 task_cputime(tsk, &utime, &stime);
1123 1142
1124 if (!task_cputime_zero(&tsk->cputime_expires)) { 1143 if (!task_cputime_zero(&tsk->cputime_expires)) {
1125 struct task_cputime task_sample = { 1144 struct task_cputime task_sample;
1126 .utime = utime,
1127 .stime = stime,
1128 .sum_exec_runtime = tsk->se.sum_exec_runtime
1129 };
1130 1145
1146 task_cputime(tsk, &task_sample.utime, &task_sample.stime);
1147 task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
1131 if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) 1148 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1132 return 1; 1149 return 1;
1133 } 1150 }
1134 1151
1135 sig = tsk->signal; 1152 sig = tsk->signal;
1136 /* Check if cputimer is running. This is accessed without locking. */ 1153 /*
1137 if (READ_ONCE(sig->cputimer.running)) { 1154 * Check if thread group timers expired when the cputimer is
1155 * running and no other thread in the group is already checking
1156 * for thread group cputimers. These fields are read without the
1157 * sighand lock. However, this is fine because this is meant to
1158 * be a fastpath heuristic to determine whether we should try to
1159 * acquire the sighand lock to check/handle timers.
1160 *
1161 * In the worst case scenario, if 'running' or 'checking_timer' gets
1162 * set but the current thread doesn't see the change yet, we'll wait
1163 * until the next thread in the group gets a scheduler interrupt to
1164 * handle the timer. This isn't an issue in practice because these
1165 * types of delays with signals actually getting sent are expected.
1166 */
1167 if (READ_ONCE(sig->cputimer.running) &&
1168 !READ_ONCE(sig->cputimer.checking_timer)) {
1138 struct task_cputime group_sample; 1169 struct task_cputime group_sample;
1139 1170
1140 sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic); 1171 sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
@@ -1174,12 +1205,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1174 * put them on the firing list. 1205 * put them on the firing list.
1175 */ 1206 */
1176 check_thread_timers(tsk, &firing); 1207 check_thread_timers(tsk, &firing);
1177 /* 1208
1178 * If there are any active process wide timers (POSIX 1.b, itimers, 1209 check_process_timers(tsk, &firing);
1179 * RLIMIT_CPU) cputimer must be running.
1180 */
1181 if (READ_ONCE(tsk->signal->cputimer.running))
1182 check_process_timers(tsk, &firing);
1183 1210
1184 /* 1211 /*
1185 * We must release these locks before taking any timer's lock. 1212 * We must release these locks before taking any timer's lock.
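
The fastpath above now keys off two lockless flags: cputimer.running says process-wide timers exist, and the new checking_timer says another thread is already handling them, so the rest of the group can skip taking sighand. A userspace analogue of that heuristic, with C11 atomics standing in for READ_ONCE()/WRITE_ONCE(); as the comment in the hunk explains, a stale read only delays the slow path by one tick:

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>
#include <stdio.h>

static atomic_bool running;
static atomic_bool checking_timer;
static pthread_mutex_t sighand_lock = PTHREAD_MUTEX_INITIALIZER;

static void check_timers(void)
{
	pthread_mutex_lock(&sighand_lock);      /* slow path, under the lock */
	atomic_store(&checking_timer, true);
	/* ... walk and expire group timers here ... */
	atomic_store(&checking_timer, false);
	pthread_mutex_unlock(&sighand_lock);
}

static void tick(void)
{
	/* fast path: lockless reads; skip if nothing is armed or someone
	 * else in the group is already checking */
	if (atomic_load(&running) && !atomic_load(&checking_timer))
		check_timers();
}

int main(void)
{
	atomic_store(&running, true);
	tick();
	return 0;
}
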
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index c7388dee8635..c48688904f9f 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -39,7 +39,7 @@ define fmuls(b,n,d) {
39} 39}
40 40
41define timeconst(hz) { 41define timeconst(hz) {
42 print "/* Automatically generated by kernel/timeconst.bc */\n" 42 print "/* Automatically generated by kernel/time/timeconst.bc */\n"
43 print "/* Time conversion constants for HZ == ", hz, " */\n" 43 print "/* Time conversion constants for HZ == ", hz, " */\n"
44 print "\n" 44 print "\n"
45 45
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3739ac6aa473..d563c1960302 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -849,7 +849,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
849#ifdef CONFIG_NTP_PPS 849#ifdef CONFIG_NTP_PPS
850 850
851/** 851/**
852 * getnstime_raw_and_real - get day and raw monotonic time in timespec format 852 * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format
853 * @ts_raw: pointer to the timespec to be set to raw monotonic time 853 * @ts_raw: pointer to the timespec to be set to raw monotonic time
854 * @ts_real: pointer to the timespec to be set to the time of day 854 * @ts_real: pointer to the timespec to be set to the time of day
855 * 855 *
@@ -857,7 +857,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
857 * same time atomically and stores the resulting timestamps in timespec 857 * same time atomically and stores the resulting timestamps in timespec
858 * format. 858 * format.
859 */ 859 */
860void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) 860void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
861{ 861{
862 struct timekeeper *tk = &tk_core.timekeeper; 862 struct timekeeper *tk = &tk_core.timekeeper;
863 unsigned long seq; 863 unsigned long seq;
@@ -868,7 +868,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
868 do { 868 do {
869 seq = read_seqcount_begin(&tk_core.seq); 869 seq = read_seqcount_begin(&tk_core.seq);
870 870
871 *ts_raw = timespec64_to_timespec(tk->raw_time); 871 *ts_raw = tk->raw_time;
872 ts_real->tv_sec = tk->xtime_sec; 872 ts_real->tv_sec = tk->xtime_sec;
873 ts_real->tv_nsec = 0; 873 ts_real->tv_nsec = 0;
874 874
@@ -877,10 +877,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
877 877
878 } while (read_seqcount_retry(&tk_core.seq, seq)); 878 } while (read_seqcount_retry(&tk_core.seq, seq));
879 879
880 timespec_add_ns(ts_raw, nsecs_raw); 880 timespec64_add_ns(ts_raw, nsecs_raw);
881 timespec_add_ns(ts_real, nsecs_real); 881 timespec64_add_ns(ts_real, nsecs_real);
882} 882}
883EXPORT_SYMBOL(getnstime_raw_and_real); 883EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
884 884
885#endif /* CONFIG_NTP_PPS */ 885#endif /* CONFIG_NTP_PPS */
886 886
@@ -1251,7 +1251,7 @@ void __init timekeeping_init(void)
1251 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec); 1251 set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
1252 tk_set_wall_to_mono(tk, tmp); 1252 tk_set_wall_to_mono(tk, tmp);
1253 1253
1254 timekeeping_update(tk, TK_MIRROR); 1254 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
1255 1255
1256 write_seqcount_end(&tk_core.seq); 1256 write_seqcount_end(&tk_core.seq);
1257 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1257 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
1614 negative = (tick_error < 0); 1614 negative = (tick_error < 0);
1615 1615
1616 /* Sort out the magnitude of the correction */ 1616 /* Sort out the magnitude of the correction */
1617 tick_error = abs64(tick_error); 1617 tick_error = abs(tick_error);
1618 for (adj = 0; tick_error > interval; adj++) 1618 for (adj = 0; tick_error > interval; adj++)
1619 tick_error >>= 1; 1619 tick_error >>= 1;
1620 1620
@@ -1674,7 +1674,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
1674/** 1674/**
1675 * accumulate_nsecs_to_secs - Accumulates nsecs into secs 1675 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
1676 * 1676 *
1677 * Helper function that accumulates a the nsecs greater then a second 1677 * Helper function that accumulates the nsecs greater than a second
1678 * from the xtime_nsec field to the xtime_secs field. 1678 * from the xtime_nsec field to the xtime_secs field.
1679 * It also calls into the NTP code to handle leapsecond processing. 1679 * It also calls into the NTP code to handle leapsecond processing.
1680 * 1680 *
@@ -1726,7 +1726,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1726 cycle_t interval = tk->cycle_interval << shift; 1726 cycle_t interval = tk->cycle_interval << shift;
1727 u64 raw_nsecs; 1727 u64 raw_nsecs;
1728 1728
1729 /* If the offset is smaller then a shifted interval, do nothing */ 1729 /* If the offset is smaller than a shifted interval, do nothing */
1730 if (offset < interval) 1730 if (offset < interval)
1731 return offset; 1731 return offset;
1732 1732
@@ -2025,7 +2025,7 @@ int do_adjtimex(struct timex *txc)
2025/** 2025/**
2026 * hardpps() - Accessor function to NTP __hardpps function 2026 * hardpps() - Accessor function to NTP __hardpps function
2027 */ 2027 */
2028void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 2028void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
2029{ 2029{
2030 unsigned long flags; 2030 unsigned long flags;
2031 2031
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 84190f02b521..74591ba9474f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -461,10 +461,17 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
461 461
462static void timer_stats_account_timer(struct timer_list *timer) 462static void timer_stats_account_timer(struct timer_list *timer)
463{ 463{
464 if (likely(!timer->start_site)) 464 void *site;
465
466 /*
467 * start_site can be concurrently reset by
468 * timer_stats_timer_clear_start_info()
469 */
470 site = READ_ONCE(timer->start_site);
471 if (likely(!site))
465 return; 472 return;
466 473
467 timer_stats_update_stats(timer, timer->start_pid, timer->start_site, 474 timer_stats_update_stats(timer, timer->start_pid, site,
468 timer->function, timer->start_comm, 475 timer->function, timer->start_comm,
469 timer->flags); 476 timer->flags);
470} 477}
@@ -867,7 +874,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
867 if (mask == 0) 874 if (mask == 0)
868 return expires; 875 return expires;
869 876
870 bit = find_last_bit(&mask, BITS_PER_LONG); 877 bit = __fls(mask);
871 878
872 mask = (1UL << bit) - 1; 879 mask = (1UL << bit) - 1;
873 880
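
timer_stats_account_timer() above now snapshots timer->start_site once with READ_ONCE(), so the NULL check and the value handed to timer_stats_update_stats() agree even if timer_stats_timer_clear_start_info() clears the field concurrently. A small sketch of that read-once pattern (userspace atomics, illustrative names):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

static _Atomic(const char *) start_site;   /* may be reset to NULL concurrently */

static void account(void)
{
	const char *site = atomic_load(&start_site);  /* one load, like READ_ONCE() */

	if (!site)
		return;                 /* already cleared: nothing to account */
	printf("accounting for %s\n", site);          /* uses the same snapshot */
}

int main(void)
{
	atomic_store(&start_site, "timer_setup_site");
	account();
	return 0;
}
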
diff --git a/kernel/torture.c b/kernel/torture.c
index 3e4840633d3e..44aa462d033f 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -523,6 +523,7 @@ static int stutter;
523 */ 523 */
524void stutter_wait(const char *title) 524void stutter_wait(const char *title)
525{ 525{
526 cond_resched_rcu_qs();
526 while (READ_ONCE(stutter_pause_test) || 527 while (READ_ONCE(stutter_pause_test) ||
527 (torture_runnable && !READ_ONCE(*torture_runnable))) { 528 (torture_runnable && !READ_ONCE(*torture_runnable))) {
528 if (stutter_pause_test) 529 if (stutter_pause_test)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8d6363f42169..e45db6b0d878 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
434 434
435config BPF_EVENTS 435config BPF_EVENTS
436 depends on BPF_SYSCALL 436 depends on BPF_SYSCALL
437 depends on KPROBE_EVENT || UPROBE_EVENT 437 depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS
438 bool 438 bool
439 default y 439 default y
440 help 440 help
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2fcf472774e..a990824c8604 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
437 struct block_device *bdev, 437 struct block_device *bdev,
438 struct blk_user_trace_setup *buts) 438 struct blk_user_trace_setup *buts)
439{ 439{
440 struct blk_trace *old_bt, *bt = NULL; 440 struct blk_trace *bt = NULL;
441 struct dentry *dir = NULL; 441 struct dentry *dir = NULL;
442 int ret; 442 int ret;
443 443
@@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
519 bt->trace_state = Blktrace_setup; 519 bt->trace_state = Blktrace_setup;
520 520
521 ret = -EBUSY; 521 ret = -EBUSY;
522 old_bt = xchg(&q->blk_trace, bt); 522 if (cmpxchg(&q->blk_trace, NULL, bt))
523 if (old_bt) {
524 (void) xchg(&q->blk_trace, old_bt);
525 goto err; 523 goto err;
526 }
527 524
528 if (atomic_inc_return(&blk_probes_ref) == 1) 525 if (atomic_inc_return(&blk_probes_ref) == 1)
529 blk_register_tracepoints(); 526 blk_register_tracepoints();
@@ -1482,7 +1479,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
1482static int blk_trace_setup_queue(struct request_queue *q, 1479static int blk_trace_setup_queue(struct request_queue *q,
1483 struct block_device *bdev) 1480 struct block_device *bdev)
1484{ 1481{
1485 struct blk_trace *old_bt, *bt = NULL; 1482 struct blk_trace *bt = NULL;
1486 int ret = -ENOMEM; 1483 int ret = -ENOMEM;
1487 1484
1488 bt = kzalloc(sizeof(*bt), GFP_KERNEL); 1485 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
@@ -1498,12 +1495,9 @@ static int blk_trace_setup_queue(struct request_queue *q,
1498 1495
1499 blk_trace_setup_lba(bt, bdev); 1496 blk_trace_setup_lba(bt, bdev);
1500 1497
1501 old_bt = xchg(&q->blk_trace, bt); 1498 ret = -EBUSY;
1502 if (old_bt != NULL) { 1499 if (cmpxchg(&q->blk_trace, NULL, bt))
1503 (void)xchg(&q->blk_trace, old_bt);
1504 ret = -EBUSY;
1505 goto free_bt; 1500 goto free_bt;
1506 }
1507 1501
1508 if (atomic_inc_return(&blk_probes_ref) == 1) 1502 if (atomic_inc_return(&blk_probes_ref) == 1)
1509 blk_register_tracepoints(); 1503 blk_register_tracepoints();
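
Both blktrace hunks replace the xchg()-then-undo dance with a single cmpxchg(&q->blk_trace, NULL, bt): the trace is installed only if the slot is still empty, and a concurrent setup can no longer see its pointer briefly swapped out and back. A userspace sketch of that install-if-unset pattern using C11 compare-and-swap (the stub type is illustrative):

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct blk_trace_stub { int state; };      /* stand-in for struct blk_trace */

static _Atomic(struct blk_trace_stub *) trace_slot;

static int install(struct blk_trace_stub *bt)
{
	struct blk_trace_stub *expected = NULL;

	/* succeeds only if the slot is still empty, like cmpxchg(..., NULL, bt) */
	if (!atomic_compare_exchange_strong(&trace_slot, &expected, bt))
		return -1;              /* already set up: caller returns -EBUSY */
	return 0;
}

int main(void)
{
	struct blk_trace_stub *bt = calloc(1, sizeof(*bt));

	if (install(bt) == 0)
		printf("installed\n");
	else
		free(bt);               /* somebody else won the race */
	return 0;
}
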
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7c8803..4228fd3682c3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
199 if (!event) 199 if (!event)
200 return -ENOENT; 200 return -ENOENT;
201 201
202 /* make sure event is local and doesn't have pmu::count */
203 if (event->oncpu != smp_processor_id() ||
204 event->pmu->count)
205 return -EINVAL;
206
202 /* 207 /*
203 * we don't know if the function is run successfully by the 208 * we don't know if the function is run successfully by the
204 * return value. It can be judged in other places, such as 209 * return value. It can be judged in other places, such as
@@ -207,14 +212,58 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
207 return perf_event_read_local(event); 212 return perf_event_read_local(event);
208} 213}
209 214
210const struct bpf_func_proto bpf_perf_event_read_proto = { 215static const struct bpf_func_proto bpf_perf_event_read_proto = {
211 .func = bpf_perf_event_read, 216 .func = bpf_perf_event_read,
212 .gpl_only = false, 217 .gpl_only = true,
213 .ret_type = RET_INTEGER, 218 .ret_type = RET_INTEGER,
214 .arg1_type = ARG_CONST_MAP_PTR, 219 .arg1_type = ARG_CONST_MAP_PTR,
215 .arg2_type = ARG_ANYTHING, 220 .arg2_type = ARG_ANYTHING,
216}; 221};
217 222
223static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
224{
225 struct pt_regs *regs = (struct pt_regs *) (long) r1;
226 struct bpf_map *map = (struct bpf_map *) (long) r2;
227 struct bpf_array *array = container_of(map, struct bpf_array, map);
228 void *data = (void *) (long) r4;
229 struct perf_sample_data sample_data;
230 struct perf_event *event;
231 struct perf_raw_record raw = {
232 .size = size,
233 .data = data,
234 };
235
236 if (unlikely(index >= array->map.max_entries))
237 return -E2BIG;
238
239 event = (struct perf_event *)array->ptrs[index];
240 if (unlikely(!event))
241 return -ENOENT;
242
243 if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
244 event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
245 return -EINVAL;
246
247 if (unlikely(event->oncpu != smp_processor_id()))
248 return -EOPNOTSUPP;
249
250 perf_sample_data_init(&sample_data, 0, 0);
251 sample_data.raw = &raw;
252 perf_event_output(event, &sample_data, regs);
253 return 0;
254}
255
256static const struct bpf_func_proto bpf_perf_event_output_proto = {
257 .func = bpf_perf_event_output,
258 .gpl_only = true,
259 .ret_type = RET_INTEGER,
260 .arg1_type = ARG_PTR_TO_CTX,
261 .arg2_type = ARG_CONST_MAP_PTR,
262 .arg3_type = ARG_ANYTHING,
263 .arg4_type = ARG_PTR_TO_STACK,
264 .arg5_type = ARG_CONST_STACK_SIZE,
265};
266
218static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) 267static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
219{ 268{
220 switch (func_id) { 269 switch (func_id) {
@@ -242,6 +291,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
242 return &bpf_get_smp_processor_id_proto; 291 return &bpf_get_smp_processor_id_proto;
243 case BPF_FUNC_perf_event_read: 292 case BPF_FUNC_perf_event_read:
244 return &bpf_perf_event_read_proto; 293 return &bpf_perf_event_read_proto;
294 case BPF_FUNC_perf_event_output:
295 return &bpf_perf_event_output_proto;
245 default: 296 default:
246 return NULL; 297 return NULL;
247 } 298 }
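
As a usage sketch (not part of this diff), a tracing program could feed data to user space through the new bpf_perf_event_output() helper roughly the way the samples/bpf tree does. SEC(), struct bpf_map_def and the helper wrappers are assumed to come from samples/bpf/bpf_helpers.h, and the loader plus the per-slot perf fd plumbing are omitted:

#include <linux/ptrace.h>
#include <linux/types.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"          /* SEC(), bpf_map_def, helper wrappers */

struct bpf_map_def SEC("maps") events = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(__u32),
	.max_entries = 2,             /* user space binds a perf fd per slot */
};

SEC("kprobe/sys_write")
int trace_write(struct pt_regs *ctx)
{
	struct {
		__u64 pid;
		__u64 cookie;
	} data = {
		.pid = bpf_get_current_pid_tgid(),
		.cookie = 0x12345678,
	};

	/* emit 'data' as a raw sample on the perf event stored at index 0;
	 * the helper rejects events that are not local software events */
	bpf_perf_event_output(ctx, &events, 0, &data, sizeof(data));
	return 0;
}

char _license[] SEC("license") = "GPL";   /* both new helpers are gpl_only */
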
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ea2725053771..3f743b147247 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5708,7 +5708,7 @@ free:
5708} 5708}
5709 5709
5710static void 5710static void
5711ftrace_graph_probe_sched_switch(void *ignore, 5711ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
5712 struct task_struct *prev, struct task_struct *next) 5712 struct task_struct *prev, struct task_struct *next)
5713{ 5713{
5714 unsigned long long timestamp; 5714 unsigned long long timestamp;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bee1e1530052..6bbc5f652355 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -506,7 +506,7 @@ check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
506} 506}
507 507
508static void 508static void
509event_filter_pid_sched_switch_probe_pre(void *data, 509event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
510 struct task_struct *prev, struct task_struct *next) 510 struct task_struct *prev, struct task_struct *next)
511{ 511{
512 struct trace_array *tr = data; 512 struct trace_array *tr = data;
@@ -520,7 +520,7 @@ event_filter_pid_sched_switch_probe_pre(void *data,
520} 520}
521 521
522static void 522static void
523event_filter_pid_sched_switch_probe_post(void *data, 523event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
524 struct task_struct *prev, struct task_struct *next) 524 struct task_struct *prev, struct task_struct *next)
525{ 525{
526 struct trace_array *tr = data; 526 struct trace_array *tr = data;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index f270088e9929..4c896a0101bd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int sched_ref;
16static DEFINE_MUTEX(sched_register_mutex); 16static DEFINE_MUTEX(sched_register_mutex);
17 17
18static void 18static void
19probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) 19probe_sched_switch(void *ignore, bool preempt,
20 struct task_struct *prev, struct task_struct *next)
20{ 21{
21 if (unlikely(!sched_ref)) 22 if (unlikely(!sched_ref))
22 return; 23 return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 855c2c7612e8..9d4399b553a3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -424,7 +424,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
424} 424}
425 425
426static void notrace 426static void notrace
427probe_wakeup_sched_switch(void *ignore, 427probe_wakeup_sched_switch(void *ignore, bool preempt,
428 struct task_struct *prev, struct task_struct *next) 428 struct task_struct *prev, struct task_struct *next)
429{ 429{
430 struct trace_array_cpu *data; 430 struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0bd212af406c..dda9e6742950 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -91,9 +91,19 @@ check_stack(unsigned long ip, unsigned long *stack)
91 if (!object_is_on_stack(stack)) 91 if (!object_is_on_stack(stack))
92 return; 92 return;
93 93
94 /* Can't do this from NMI context (can cause deadlocks) */
95 if (in_nmi())
96 return;
97
94 local_irq_save(flags); 98 local_irq_save(flags);
95 arch_spin_lock(&stack_trace_max_lock); 99 arch_spin_lock(&stack_trace_max_lock);
96 100
101 /*
102 * RCU may not be watching, make it see us.
103 * The stack trace code uses rcu_sched.
104 */
105 rcu_irq_enter();
106
97 /* In case another CPU set the tracer_frame on us */ 107 /* In case another CPU set the tracer_frame on us */
98 if (unlikely(!frame_size)) 108 if (unlikely(!frame_size))
99 this_size -= tracer_frame; 109 this_size -= tracer_frame;
@@ -175,6 +185,7 @@ check_stack(unsigned long ip, unsigned long *stack)
175 } 185 }
176 186
177 out: 187 out:
188 rcu_irq_exit();
178 arch_spin_unlock(&stack_trace_max_lock); 189 arch_spin_unlock(&stack_trace_max_lock);
179 local_irq_restore(flags); 190 local_irq_restore(flags);
180} 191}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 64ed1c37bd1f..18f34cf75f74 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
57 57
58#ifdef CONFIG_SMP 58#ifdef CONFIG_SMP
59int __read_mostly sysctl_softlockup_all_cpu_backtrace; 59int __read_mostly sysctl_softlockup_all_cpu_backtrace;
60int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
60#else 61#else
61#define sysctl_softlockup_all_cpu_backtrace 0 62#define sysctl_softlockup_all_cpu_backtrace 0
63#define sysctl_hardlockup_all_cpu_backtrace 0
62#endif 64#endif
63static struct cpumask watchdog_cpumask __read_mostly; 65static struct cpumask watchdog_cpumask __read_mostly;
64unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 66unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn;
110 * Should we panic when a soft-lockup or hard-lockup occurs: 112 * Should we panic when a soft-lockup or hard-lockup occurs:
111 */ 113 */
112#ifdef CONFIG_HARDLOCKUP_DETECTOR 114#ifdef CONFIG_HARDLOCKUP_DETECTOR
113static int hardlockup_panic = 115unsigned int __read_mostly hardlockup_panic =
114 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 116 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
117static unsigned long hardlockup_allcpu_dumped;
115/* 118/*
116 * We may not want to enable hard lockup detection by default in all cases, 119 * We may not want to enable hard lockup detection by default in all cases,
117 * for example when running the kernel as a guest on a hypervisor. In these 120 * for example when running the kernel as a guest on a hypervisor. In these
@@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
173 return 1; 176 return 1;
174} 177}
175__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); 178__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
179static int __init hardlockup_all_cpu_backtrace_setup(char *str)
180{
181 sysctl_hardlockup_all_cpu_backtrace =
182 !!simple_strtol(str, NULL, 0);
183 return 1;
184}
185__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
176#endif 186#endif
177 187
178/* 188/*
@@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void)
263 273
264#ifdef CONFIG_HARDLOCKUP_DETECTOR 274#ifdef CONFIG_HARDLOCKUP_DETECTOR
265/* watchdog detector functions */ 275/* watchdog detector functions */
266static int is_hardlockup(void) 276static bool is_hardlockup(void)
267{ 277{
268 unsigned long hrint = __this_cpu_read(hrtimer_interrupts); 278 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
269 279
270 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) 280 if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
271 return 1; 281 return true;
272 282
273 __this_cpu_write(hrtimer_interrupts_saved, hrint); 283 __this_cpu_write(hrtimer_interrupts_saved, hrint);
274 return 0; 284 return false;
275} 285}
276#endif 286#endif
277 287
@@ -279,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts)
279{ 289{
280 unsigned long now = get_timestamp(); 290 unsigned long now = get_timestamp();
281 291
282 if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { 292 if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
283 /* Warn about unreasonable delays. */ 293 /* Warn about unreasonable delays. */
284 if (time_after(now, touch_ts + get_softlockup_thresh())) 294 if (time_after(now, touch_ts + get_softlockup_thresh()))
285 return now - touch_ts; 295 return now - touch_ts;
@@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
318 */ 328 */
319 if (is_hardlockup()) { 329 if (is_hardlockup()) {
320 int this_cpu = smp_processor_id(); 330 int this_cpu = smp_processor_id();
331 struct pt_regs *regs = get_irq_regs();
321 332
322 /* only print hardlockups once */ 333 /* only print hardlockups once */
323 if (__this_cpu_read(hard_watchdog_warn) == true) 334 if (__this_cpu_read(hard_watchdog_warn) == true)
324 return; 335 return;
325 336
326 if (hardlockup_panic) 337 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
327 panic("Watchdog detected hard LOCKUP on cpu %d", 338 print_modules();
328 this_cpu); 339 print_irqtrace_events(current);
340 if (regs)
341 show_regs(regs);
329 else 342 else
330 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", 343 dump_stack();
331 this_cpu); 344
345 /*
346 * Perform all-CPU dump only once to avoid multiple hardlockups
347 * generating interleaving traces
348 */
349 if (sysctl_hardlockup_all_cpu_backtrace &&
350 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
351 trigger_allbutself_cpu_backtrace();
352
353 if (hardlockup_panic)
354 panic("Hard LOCKUP");
332 355
333 __this_cpu_write(hard_watchdog_warn, true); 356 __this_cpu_write(hard_watchdog_warn, true);
334 return; 357 return;
@@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void)
347static int watchdog_nmi_enable(unsigned int cpu); 370static int watchdog_nmi_enable(unsigned int cpu);
348static void watchdog_nmi_disable(unsigned int cpu); 371static void watchdog_nmi_disable(unsigned int cpu);
349 372
373static int watchdog_enable_all_cpus(void);
374static void watchdog_disable_all_cpus(void);
375
350/* watchdog kicker functions */ 376/* watchdog kicker functions */
351static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 377static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
352{ 378{
@@ -651,37 +677,41 @@ static struct smp_hotplug_thread watchdog_threads = {
651 677
652/* 678/*
653 * park all watchdog threads that are specified in 'watchdog_cpumask' 679 * park all watchdog threads that are specified in 'watchdog_cpumask'
680 *
681 * This function returns an error if kthread_park() of a watchdog thread
682 * fails. In this situation, the watchdog threads of some CPUs can already
683 * be parked and the watchdog threads of other CPUs can still be runnable.
684 * Callers are expected to handle this special condition as appropriate in
685 * their context.
686 *
687 * This function may only be called in a context that is protected against
688 * races with CPU hotplug - for example, via get_online_cpus().
654 */ 689 */
655static int watchdog_park_threads(void) 690static int watchdog_park_threads(void)
656{ 691{
657 int cpu, ret = 0; 692 int cpu, ret = 0;
658 693
659 get_online_cpus();
660 for_each_watchdog_cpu(cpu) { 694 for_each_watchdog_cpu(cpu) {
661 ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); 695 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
662 if (ret) 696 if (ret)
663 break; 697 break;
664 } 698 }
665 if (ret) {
666 for_each_watchdog_cpu(cpu)
667 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
668 }
669 put_online_cpus();
670 699
671 return ret; 700 return ret;
672} 701}
673 702
674/* 703/*
675 * unpark all watchdog threads that are specified in 'watchdog_cpumask' 704 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
705 *
706 * This function may only be called in a context that is protected against
707 * races with CPU hotplug - for example, via get_online_cpus().
676 */ 708 */
677static void watchdog_unpark_threads(void) 709static void watchdog_unpark_threads(void)
678{ 710{
679 int cpu; 711 int cpu;
680 712
681 get_online_cpus();
682 for_each_watchdog_cpu(cpu) 713 for_each_watchdog_cpu(cpu)
683 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 714 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
684 put_online_cpus();
685} 715}
686 716
687/* 717/*
@@ -691,6 +721,7 @@ int lockup_detector_suspend(void)
691{ 721{
692 int ret = 0; 722 int ret = 0;
693 723
724 get_online_cpus();
694 mutex_lock(&watchdog_proc_mutex); 725 mutex_lock(&watchdog_proc_mutex);
695 /* 726 /*
696 * Multiple suspend requests can be active in parallel (counted by 727 * Multiple suspend requests can be active in parallel (counted by
@@ -704,6 +735,11 @@ int lockup_detector_suspend(void)
704 735
705 if (ret == 0) 736 if (ret == 0)
706 watchdog_suspended++; 737 watchdog_suspended++;
738 else {
739 watchdog_disable_all_cpus();
740 pr_err("Failed to suspend lockup detectors, disabled\n");
741 watchdog_enabled = 0;
742 }
707 743
708 mutex_unlock(&watchdog_proc_mutex); 744 mutex_unlock(&watchdog_proc_mutex);
709 745
@@ -726,12 +762,20 @@ void lockup_detector_resume(void)
726 watchdog_unpark_threads(); 762 watchdog_unpark_threads();
727 763
728 mutex_unlock(&watchdog_proc_mutex); 764 mutex_unlock(&watchdog_proc_mutex);
765 put_online_cpus();
729} 766}
730 767
731static void update_watchdog_all_cpus(void) 768static int update_watchdog_all_cpus(void)
732{ 769{
733 watchdog_park_threads(); 770 int ret;
771
772 ret = watchdog_park_threads();
773 if (ret)
774 return ret;
775
734 watchdog_unpark_threads(); 776 watchdog_unpark_threads();
777
778 return 0;
735} 779}
736 780
737static int watchdog_enable_all_cpus(void) 781static int watchdog_enable_all_cpus(void)
@@ -750,15 +794,20 @@ static int watchdog_enable_all_cpus(void)
750 * Enable/disable the lockup detectors or 794 * Enable/disable the lockup detectors or
751 * change the sample period 'on the fly'. 795 * change the sample period 'on the fly'.
752 */ 796 */
753 update_watchdog_all_cpus(); 797 err = update_watchdog_all_cpus();
798
799 if (err) {
800 watchdog_disable_all_cpus();
801 pr_err("Failed to update lockup detectors, disabled\n");
802 }
754 } 803 }
755 804
805 if (err)
806 watchdog_enabled = 0;
807
756 return err; 808 return err;
757} 809}
758 810
759/* prepare/enable/disable routines */
760/* sysctl functions */
761#ifdef CONFIG_SYSCTL
762static void watchdog_disable_all_cpus(void) 811static void watchdog_disable_all_cpus(void)
763{ 812{
764 if (watchdog_running) { 813 if (watchdog_running) {
@@ -767,6 +816,8 @@ static void watchdog_disable_all_cpus(void)
767 } 816 }
768} 817}
769 818
819#ifdef CONFIG_SYSCTL
820
770/* 821/*
771 * Update the run state of the lockup detectors. 822 * Update the run state of the lockup detectors.
772 */ 823 */
@@ -808,6 +859,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
808 int err, old, new; 859 int err, old, new;
809 int *watchdog_param = (int *)table->data; 860 int *watchdog_param = (int *)table->data;
810 861
862 get_online_cpus();
811 mutex_lock(&watchdog_proc_mutex); 863 mutex_lock(&watchdog_proc_mutex);
812 864
813 if (watchdog_suspended) { 865 if (watchdog_suspended) {
@@ -849,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
849 } while (cmpxchg(&watchdog_enabled, old, new) != old); 901 } while (cmpxchg(&watchdog_enabled, old, new) != old);
850 902
851 /* 903 /*
852 * Update the run state of the lockup detectors. 904 * Update the run state of the lockup detectors. There is _no_
853 * Restore 'watchdog_enabled' on failure. 905 * need to check the value returned by proc_watchdog_update()
906 * and to restore the previous value of 'watchdog_enabled' as
907 * both lockup detectors are disabled if proc_watchdog_update()
908 * returns an error.
854 */ 909 */
855 err = proc_watchdog_update(); 910 err = proc_watchdog_update();
856 if (err)
857 watchdog_enabled = old;
858 } 911 }
859out: 912out:
860 mutex_unlock(&watchdog_proc_mutex); 913 mutex_unlock(&watchdog_proc_mutex);
914 put_online_cpus();
861 return err; 915 return err;
862} 916}
863 917
@@ -899,6 +953,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
899{ 953{
900 int err, old; 954 int err, old;
901 955
956 get_online_cpus();
902 mutex_lock(&watchdog_proc_mutex); 957 mutex_lock(&watchdog_proc_mutex);
903 958
904 if (watchdog_suspended) { 959 if (watchdog_suspended) {
@@ -914,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
914 goto out; 969 goto out;
915 970
916 /* 971 /*
917 * Update the sample period. 972 * Update the sample period. Restore on failure.
918 * Restore 'watchdog_thresh' on failure.
919 */ 973 */
920 set_sample_period(); 974 set_sample_period();
921 err = proc_watchdog_update(); 975 err = proc_watchdog_update();
922 if (err) 976 if (err) {
923 watchdog_thresh = old; 977 watchdog_thresh = old;
978 set_sample_period();
979 }
924out: 980out:
925 mutex_unlock(&watchdog_proc_mutex); 981 mutex_unlock(&watchdog_proc_mutex);
982 put_online_cpus();
926 return err; 983 return err;
927} 984}
928 985
@@ -937,6 +994,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
937{ 994{
938 int err; 995 int err;
939 996
997 get_online_cpus();
940 mutex_lock(&watchdog_proc_mutex); 998 mutex_lock(&watchdog_proc_mutex);
941 999
942 if (watchdog_suspended) { 1000 if (watchdog_suspended) {
@@ -964,6 +1022,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
964 } 1022 }
965out: 1023out:
966 mutex_unlock(&watchdog_proc_mutex); 1024 mutex_unlock(&watchdog_proc_mutex);
1025 put_online_cpus();
967 return err; 1026 return err;
968} 1027}
969 1028
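
In the watchdog hunks, the first CPU that detects a hard lockup now prints the full report and, if hardlockup_all_cpu_backtrace is set, wins a test_and_set_bit() race so the all-CPU backtrace is dumped exactly once instead of interleaving with later detections. A compact userspace sketch of that dump-once gate:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag allcpu_dumped = ATOMIC_FLAG_INIT;

static void on_hard_lockup(int cpu)
{
	fprintf(stderr, "Watchdog detected hard LOCKUP on cpu %d\n", cpu);

	/* only the first detector triggers the expensive all-CPU dump */
	if (!atomic_flag_test_and_set(&allcpu_dumped))
		fprintf(stderr, "dumping backtraces on all other CPUs\n");
}

int main(void)
{
	on_hard_lockup(1);
	on_hard_lockup(3);      /* second lockup: no duplicate dump */
	return 0;
}
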
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ca71582fcfab..c579dbab2e36 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
1458 timer_stats_timer_set_start_info(&dwork->timer); 1458 timer_stats_timer_set_start_info(&dwork->timer);
1459 1459
1460 dwork->wq = wq; 1460 dwork->wq = wq;
1461 /* timer isn't guaranteed to run in this cpu, record earlier */
1462 if (cpu == WORK_CPU_UNBOUND)
1463 cpu = raw_smp_processor_id();
1461 dwork->cpu = cpu; 1464 dwork->cpu = cpu;
1462 timer->expires = jiffies + delay; 1465 timer->expires = jiffies + delay;
1463 1466
1464 if (unlikely(cpu != WORK_CPU_UNBOUND)) 1467 add_timer_on(timer, cpu);
1465 add_timer_on(timer, cpu);
1466 else
1467 add_timer(timer);
1468} 1468}
1469 1469
1470/** 1470/**
@@ -3199,6 +3199,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3199 u32 hash = wqattrs_hash(attrs); 3199 u32 hash = wqattrs_hash(attrs);
3200 struct worker_pool *pool; 3200 struct worker_pool *pool;
3201 int node; 3201 int node;
3202 int target_node = NUMA_NO_NODE;
3202 3203
3203 lockdep_assert_held(&wq_pool_mutex); 3204 lockdep_assert_held(&wq_pool_mutex);
3204 3205
@@ -3210,13 +3211,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3210 } 3211 }
3211 } 3212 }
3212 3213
3214 /* if cpumask is contained inside a NUMA node, we belong to that node */
3215 if (wq_numa_enabled) {
3216 for_each_node(node) {
3217 if (cpumask_subset(attrs->cpumask,
3218 wq_numa_possible_cpumask[node])) {
3219 target_node = node;
3220 break;
3221 }
3222 }
3223 }
3224
3213 /* nope, create a new one */ 3225 /* nope, create a new one */
3214 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 3226 pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
3215 if (!pool || init_worker_pool(pool) < 0) 3227 if (!pool || init_worker_pool(pool) < 0)
3216 goto fail; 3228 goto fail;
3217 3229
3218 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ 3230 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3219 copy_workqueue_attrs(pool->attrs, attrs); 3231 copy_workqueue_attrs(pool->attrs, attrs);
3232 pool->node = target_node;
3220 3233
3221 /* 3234 /*
3222 * no_numa isn't a worker_pool attribute, always clear it. See 3235 * no_numa isn't a worker_pool attribute, always clear it. See
@@ -3224,17 +3237,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3224 */ 3237 */
3225 pool->attrs->no_numa = false; 3238 pool->attrs->no_numa = false;
3226 3239
3227 /* if cpumask is contained inside a NUMA node, we belong to that node */
3228 if (wq_numa_enabled) {
3229 for_each_node(node) {
3230 if (cpumask_subset(pool->attrs->cpumask,
3231 wq_numa_possible_cpumask[node])) {
3232 pool->node = node;
3233 break;
3234 }
3235 }
3236 }
3237
3238 if (worker_pool_assign_id(pool) < 0) 3240 if (worker_pool_assign_id(pool) < 0)
3239 goto fail; 3241 goto fail;
3240 3242
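
get_unbound_pool() above now works out which NUMA node the attrs' cpumask maps to before allocating, so the worker_pool itself is placed on that node via kzalloc_node(). A rough userspace analogue of "pick the home node, then allocate there", assuming libnuma is available (numa_node_of_cpu(), numa_alloc_onnode(); link with -lnuma):

#include <numa.h>
#include <stdio.h>

struct pool { int node; /* ... bookkeeping ... */ };

int main(void)
{
	int node;
	struct pool *p;

	if (numa_available() < 0) {
		fprintf(stderr, "no NUMA support\n");
		return 1;
	}

	node = numa_node_of_cpu(0);             /* node serving CPU 0 */
	p = numa_alloc_onnode(sizeof(*p), node);
	if (!p)
		return 1;

	p->node = node;                         /* remember the home node */
	printf("pool allocated on node %d\n", node);
	numa_free(p, sizeof(*p));
	return 0;
}
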