Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 23
-rw-r--r--  kernel/acct.c | 46
-rw-r--r--  kernel/async.c | 2
-rw-r--r--  kernel/audit.c | 13
-rw-r--r--  kernel/audit.h | 6
-rw-r--r--  kernel/auditfilter.c | 17
-rw-r--r--  kernel/auditsc.c | 754
-rw-r--r--  kernel/capability.c | 80
-rw-r--r--  kernel/cgroup.c | 423
-rw-r--r--  kernel/cgroup_freezer.c | 77
-rw-r--r--  kernel/cpu.c | 8
-rw-r--r--  kernel/cpuset.c | 105
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 2
-rw-r--r--  kernel/events/Makefile | 2
-rw-r--r--  kernel/events/callchain.c | 189
-rw-r--r--  kernel/events/core.c | 315
-rw-r--r--  kernel/events/internal.h | 39
-rw-r--r--  kernel/events/ring_buffer.c | 2
-rw-r--r--  kernel/exit.c | 46
-rw-r--r--  kernel/fork.c | 40
-rw-r--r--  kernel/freezer.c | 203
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/irqdomain.c | 15
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/irq/spurious.c | 2
-rw-r--r--  kernel/itimer.c | 15
-rw-r--r--  kernel/jump_label.c | 51
-rw-r--r--  kernel/kexec.c | 29
-rw-r--r--  kernel/kmod.c | 27
-rw-r--r--  kernel/kprobes.c | 4
-rw-r--r--  kernel/kthread.c | 27
-rw-r--r--  kernel/lockdep.c | 83
-rw-r--r--  kernel/module.c | 205
-rw-r--r--  kernel/panic.c | 43
-rw-r--r--  kernel/params.c | 38
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/pid_namespace.c | 31
-rw-r--r--  kernel/posix-cpu-timers.c | 132
-rw-r--r--  kernel/power/hibernate.c | 92
-rw-r--r--  kernel/power/main.c | 10
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/process.c | 90
-rw-r--r--  kernel/power/snapshot.c | 9
-rw-r--r--  kernel/power/suspend.c | 12
-rw-r--r--  kernel/power/swap.c | 14
-rw-r--r--  kernel/power/user.c | 193
-rw-r--r--  kernel/printk.c | 21
-rw-r--r--  kernel/ptrace.c | 27
-rw-r--r--  kernel/rcu.h | 7
-rw-r--r--  kernel/rcupdate.c | 12
-rw-r--r--  kernel/rcutiny.c | 149
-rw-r--r--  kernel/rcutiny_plugin.h | 29
-rw-r--r--  kernel/rcutorture.c | 229
-rw-r--r--  kernel/rcutree.c | 290
-rw-r--r--  kernel/rcutree.h | 26
-rw-r--r--  kernel/rcutree_plugin.h | 289
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/res_counter.c | 28
-rw-r--r--  kernel/rtmutex-debug.c | 1
-rw-r--r--  kernel/rtmutex-tester.c | 37
-rw-r--r--  kernel/rtmutex.c | 8
-rw-r--r--  kernel/sched/Makefile | 20
-rw-r--r--  kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c) | 33
-rw-r--r--  kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h) | 26
-rw-r--r--  kernel/sched/clock.c (renamed from kernel/sched_clock.c) | 0
-rw-r--r--  kernel/sched/core.c (renamed from kernel/sched.c) | 2269
-rw-r--r--  kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c) | 7
-rw-r--r--  kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h) | 0
-rw-r--r--  kernel/sched/debug.c (renamed from kernel/sched_debug.c) | 6
-rw-r--r--  kernel/sched/fair.c (renamed from kernel/sched_fair.c) | 1004
-rw-r--r--  kernel/sched/features.h (renamed from kernel/sched_features.h) | 30
-rw-r--r--  kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c) | 4
-rw-r--r--  kernel/sched/rt.c (renamed from kernel/sched_rt.c) | 218
-rw-r--r--  kernel/sched/sched.h | 1166
-rw-r--r--  kernel/sched/stats.c | 111
-rw-r--r--  kernel/sched/stats.h (renamed from kernel/sched_stats.h) | 109
-rw-r--r--  kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c) | 4
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/signal.c | 82
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/sys.c | 127
-rw-r--r--  kernel/sysctl.c | 9
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/clockevents.c | 1
-rw-r--r--  kernel/time/clocksource.c | 37
-rw-r--r--  kernel/time/tick-sched.c | 105
-rw-r--r--  kernel/time/timekeeping.c | 10
-rw-r--r--  kernel/timer.c | 62
-rw-r--r--  kernel/trace/blktrace.c | 2
-rw-r--r--  kernel/trace/ftrace.c | 715
-rw-r--r--  kernel/trace/trace.c | 108
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_events_filter.c | 283
-rw-r--r--  kernel/trace/trace_irqsoff.c | 13
-rw-r--r--  kernel/trace/trace_output.c | 16
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 13
-rw-r--r--  kernel/trace/trace_stack.c | 30
-rw-r--r--  kernel/tracepoint.c | 7
-rw-r--r--  kernel/tsacct.c | 2
-rw-r--r--  kernel/wait.c | 4
-rw-r--r--  kernel/workqueue.c | 32
103 files changed, 6730 insertions, 4640 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..2d9de86b7e76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,12 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26obj-y += power/
27
27obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 29obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 30obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
52obj-$(CONFIG_UID16) += uid16.o 53obj-$(CONFIG_UID16) += uid16.o
53obj-$(CONFIG_MODULES) += module.o 54obj-$(CONFIG_MODULES) += module.o
54obj-$(CONFIG_KALLSYMS) += kallsyms.o 55obj-$(CONFIG_KALLSYMS) += kallsyms.o
55obj-$(CONFIG_PM) += power/
56obj-$(CONFIG_FREEZER) += power/
57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 56obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
58obj-$(CONFIG_KEXEC) += kexec.o 57obj-$(CONFIG_KEXEC) += kexec.o
59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -99,7 +98,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 98obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 99obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 100obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 101obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o 102obj-$(CONFIG_CPU_PM) += cpu_pm.o
105 103
@@ -110,15 +108,6 @@ obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 108obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o 109obj-$(CONFIG_JUMP_LABEL) += jump_label.o
112 110
113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
115# needed for x86 only. Why this used to be enabled for all architectures is beyond
116# me. I suspect most platforms don't need this, but until we know that for sure
117# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
118# to get a correct value for the wait-channel (WCHAN in ps). --davidm
119CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
120endif
121
122$(obj)/configs.o: $(obj)/config_data.h 111$(obj)/configs.o: $(obj)/config_data.h
123 112
124# config_data.h contains the same information as ikconfig.h but gzipped. 113# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..02e6167a53b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 volatile int active; 87 int active;
88 volatile int needcheck; 88 unsigned long needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
92 struct list_head list; 91 struct list_head list;
93}; 92};
94 93
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
96static LIST_HEAD(acct_list); 95static LIST_HEAD(acct_list);
97 96
98/* 97/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
108 * Check the amount of free space and suspend/resume accordingly. 98 * Check the amount of free space and suspend/resume accordingly.
109 */ 99 */
110static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 100static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
112 struct kstatfs sbuf; 102 struct kstatfs sbuf;
113 int res; 103 int res;
114 int act; 104 int act;
115 sector_t resume; 105 u64 resume;
116 sector_t suspend; 106 u64 suspend;
117 107
118 spin_lock(&acct_lock); 108 spin_lock(&acct_lock);
119 res = acct->active; 109 res = acct->active;
120 if (!file || !acct->needcheck) 110 if (!file || time_is_before_jiffies(acct->needcheck))
121 goto out; 111 goto out;
122 spin_unlock(&acct_lock); 112 spin_unlock(&acct_lock);
123 113
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
127 suspend = sbuf.f_blocks * SUSPEND; 117 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 118 resume = sbuf.f_blocks * RESUME;
129 119
130 sector_div(suspend, 100); 120 do_div(suspend, 100);
131 sector_div(resume, 100); 121 do_div(resume, 100);
132 122
133 if (sbuf.f_bavail <= suspend) 123 if (sbuf.f_bavail <= suspend)
134 act = -1; 124 act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
160 } 150 }
161 } 151 }
162 152
163 del_timer(&acct->timer); 153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
167 res = acct->active; 154 res = acct->active;
168out: 155out:
169 spin_unlock(&acct_lock); 156 spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
185 if (acct->file) { 172 if (acct->file) {
186 old_acct = acct->file; 173 old_acct = acct->file;
187 old_ns = acct->ns; 174 old_ns = acct->ns;
188 del_timer(&acct->timer);
189 acct->active = 0; 175 acct->active = 0;
190 acct->needcheck = 0;
191 acct->file = NULL; 176 acct->file = NULL;
192 acct->ns = NULL; 177 acct->ns = NULL;
193 list_del(&acct->list); 178 list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
195 if (file) { 180 if (file) {
196 acct->file = file; 181 acct->file = file;
197 acct->ns = ns; 182 acct->ns = ns;
198 acct->needcheck = 0; 183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
199 acct->active = 1; 184 acct->active = 1;
200 list_add(&acct->list, &acct_list); 185 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
205 } 186 }
206 if (old_acct) { 187 if (old_acct) {
207 mnt_unpin(old_acct->f_path.mnt); 188 mnt_unpin(old_acct->f_path.mnt);
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
334 spin_lock(&acct_lock); 315 spin_lock(&acct_lock);
335restart: 316restart:
336 list_for_each_entry(acct, &acct_list, list) 317 list_for_each_entry(acct, &acct_list, list)
337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { 318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
338 acct_file_reopen(acct, NULL, NULL); 319 acct_file_reopen(acct, NULL, NULL);
339 goto restart; 320 goto restart;
340 } 321 }
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
348 if (acct == NULL) 329 if (acct == NULL)
349 return; 330 return;
350 331
351 del_timer_sync(&acct->timer);
352 spin_lock(&acct_lock); 332 spin_lock(&acct_lock);
353 if (acct->file != NULL) 333 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL); 334 acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
498 * Fill the accounting struct with the needed info as recorded 478 * Fill the accounting struct with the needed info as recorded
499 * by the different kernel functions. 479 * by the different kernel functions.
500 */ 480 */
501 memset((caddr_t)&ac, 0, sizeof(acct_t)); 481 memset(&ac, 0, sizeof(acct_t));
502 482
503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 593 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 594 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 595 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 596 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 597 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 598 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 599 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 600 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 80b74b88fefe..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
78 78
79static atomic_t entry_count; 79static atomic_t entry_count;
80 80
81extern int initcall_debug;
82
83 81
84/* 82/*
85 * MUST be called with the lock held! 83 * MUST be called with the lock held!
diff --git a/kernel/audit.c b/kernel/audit.c
index 09fae2677a45..bb0eb5bb9a0a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
601 case AUDIT_TTY_SET: 601 case AUDIT_TTY_SET:
602 case AUDIT_TRIM: 602 case AUDIT_TRIM:
603 case AUDIT_MAKE_EQUIV: 603 case AUDIT_MAKE_EQUIV:
604 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) 604 if (!capable(CAP_AUDIT_CONTROL))
605 err = -EPERM; 605 err = -EPERM;
606 break; 606 break;
607 case AUDIT_USER: 607 case AUDIT_USER:
608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
610 if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) 610 if (!capable(CAP_AUDIT_WRITE))
611 err = -EPERM; 611 err = -EPERM;
612 break; 612 break;
613 default: /* bad msg */ 613 default: /* bad msg */
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
631 } 631 }
632 632
633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
634 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", 634 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
635 pid, uid, auid, ses); 635 pid, uid, auid, ses);
636 if (sid) { 636 if (sid) {
637 rc = security_secid_to_secctx(sid, &ctx, &len); 637 rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1260 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1262 if (!avail) 1262 if (!avail)
1263 goto out; 1263 goto out_va_end;
1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1265 } 1265 }
1266 va_end(args2);
1267 if (len > 0) 1266 if (len > 0)
1268 skb_put(skb, len); 1267 skb_put(skb, len);
1268out_va_end:
1269 va_end(args2);
1269out: 1270out:
1270 return; 1271 return;
1271} 1272}
@@ -1422,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1422 char *p, *pathname; 1423 char *p, *pathname;
1423 1424
1424 if (prefix) 1425 if (prefix)
1425 audit_log_format(ab, " %s", prefix); 1426 audit_log_format(ab, "%s", prefix);
1426 1427
1427 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1428 /* We will allow 11 spaces for ' (deleted)' to be appended */
1428 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1429 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
diff --git a/kernel/audit.h b/kernel/audit.h
index 91e7071c4d2c..816766803371 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,12 +36,8 @@ enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context. 36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can 37 * No syscall-specific audit records can
38 * be generated. */ 38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, 39 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
44 * and always fill it in at syscall 40 * and fill it in at syscall
45 * entry time. This makes a full 41 * entry time. This makes a full
46 * syscall record available if some 42 * syscall record available if some
47 * other part of the kernel decides it 43 * other part of the kernel decides it
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f8277c80d678..a6c3f1abd206 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
235 switch(listnr) { 235 switch(listnr) {
236 default: 236 default:
237 goto exit_err; 237 goto exit_err;
238 case AUDIT_FILTER_USER:
239 case AUDIT_FILTER_TYPE:
240#ifdef CONFIG_AUDITSYSCALL 238#ifdef CONFIG_AUDITSYSCALL
241 case AUDIT_FILTER_ENTRY: 239 case AUDIT_FILTER_ENTRY:
240 if (rule->action == AUDIT_ALWAYS)
241 goto exit_err;
242 case AUDIT_FILTER_EXIT: 242 case AUDIT_FILTER_EXIT:
243 case AUDIT_FILTER_TASK: 243 case AUDIT_FILTER_TASK:
244#endif 244#endif
245 case AUDIT_FILTER_USER:
246 case AUDIT_FILTER_TYPE:
245 ; 247 ;
246 } 248 }
247 if (unlikely(rule->action == AUDIT_POSSIBLE)) { 249 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
385 goto exit_free; 387 goto exit_free;
386 break; 388 break;
387 case AUDIT_FILETYPE: 389 case AUDIT_FILETYPE:
388 if ((f->val & ~S_IFMT) > S_IFMT) 390 if (f->val & ~S_IFMT)
389 goto exit_free; 391 goto exit_free;
390 break; 392 break;
391 case AUDIT_INODE: 393 case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
459 case AUDIT_ARG1: 461 case AUDIT_ARG1:
460 case AUDIT_ARG2: 462 case AUDIT_ARG2:
461 case AUDIT_ARG3: 463 case AUDIT_ARG3:
464 case AUDIT_OBJ_UID:
465 case AUDIT_OBJ_GID:
462 break; 466 break;
463 case AUDIT_ARCH: 467 case AUDIT_ARCH:
464 entry->rule.arch_f = f; 468 entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
522 goto exit_free; 526 goto exit_free;
523 break; 527 break;
524 case AUDIT_FILTERKEY: 528 case AUDIT_FILTERKEY:
525 err = -EINVAL;
526 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) 529 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
527 goto exit_free; 530 goto exit_free;
528 str = audit_unpack_string(&bufp, &remain, f->val); 531 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
536 goto exit_free; 539 goto exit_free;
537 break; 540 break;
538 case AUDIT_FILETYPE: 541 case AUDIT_FILETYPE:
539 if ((f->val & ~S_IFMT) > S_IFMT) 542 if (f->val & ~S_IFMT)
543 goto exit_free;
544 break;
545 case AUDIT_FIELD_COMPARE:
546 if (f->val > AUDIT_MAX_FIELD_COMPARE)
540 goto exit_free; 547 goto exit_free;
541 break; 548 break;
542 default: 549 default:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47b7fc1ea893..af1de0f34eae 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,9 +70,15 @@
70 70
71#include "audit.h" 71#include "audit.h"
72 72
73/* flags stating the success for a syscall */
74#define AUDITSC_INVALID 0
75#define AUDITSC_SUCCESS 1
76#define AUDITSC_FAILURE 2
77
73/* AUDIT_NAMES is the number of slots we reserve in the audit_context 78/* AUDIT_NAMES is the number of slots we reserve in the audit_context
74 * for saving names from getname(). */ 79 * for saving names from getname(). If we get more names we will allocate
75#define AUDIT_NAMES 20 80 * a name dynamically and also add those to the list anchored by names_list. */
81#define AUDIT_NAMES 5
76 82
77/* Indicates that audit should log the full pathname. */ 83/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1 84#define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
101 * 107 *
102 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 108 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
103struct audit_names { 109struct audit_names {
110 struct list_head list; /* audit_context->names_list */
104 const char *name; 111 const char *name;
105 int name_len; /* number of name's characters to log */
106 unsigned name_put; /* call __putname() for this name */
107 unsigned long ino; 112 unsigned long ino;
108 dev_t dev; 113 dev_t dev;
109 umode_t mode; 114 umode_t mode;
@@ -113,6 +118,14 @@ struct audit_names {
113 u32 osid; 118 u32 osid;
114 struct audit_cap_data fcap; 119 struct audit_cap_data fcap;
115 unsigned int fcap_ver; 120 unsigned int fcap_ver;
121 int name_len; /* number of name's characters to log */
122 bool name_put; /* call __putname() for this name */
123 /*
124 * This was an allocated audit_names and not from the array of
125 * names allocated in the task audit context. Thus this name
126 * should be freed on syscall exit
127 */
128 bool should_free;
116}; 129};
117 130
118struct audit_aux_data { 131struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
174 long return_code;/* syscall return code */ 187 long return_code;/* syscall return code */
175 u64 prio; 188 u64 prio;
176 int return_valid; /* return code is valid */ 189 int return_valid; /* return code is valid */
177 int name_count; 190 /*
178 struct audit_names names[AUDIT_NAMES]; 191 * The names_list is the list of all audit_names collected during this
192 * syscall. The first AUDIT_NAMES entries in the names_list will
193 * actually be from the preallocated_names array for performance
194 * reasons. Except during allocation they should never be referenced
195 * through the preallocated_names array and should only be found/used
196 * by running the names_list.
197 */
198 struct audit_names preallocated_names[AUDIT_NAMES];
199 int name_count; /* total records in names_list */
200 struct list_head names_list; /* anchor for struct audit_names->list */
179 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
180 struct path pwd; 202 struct path pwd;
181 struct audit_context *previous; /* For nested syscalls */ 203 struct audit_context *previous; /* For nested syscalls */
@@ -210,12 +232,12 @@ struct audit_context {
210 struct { 232 struct {
211 uid_t uid; 233 uid_t uid;
212 gid_t gid; 234 gid_t gid;
213 mode_t mode; 235 umode_t mode;
214 u32 osid; 236 u32 osid;
215 int has_perm; 237 int has_perm;
216 uid_t perm_uid; 238 uid_t perm_uid;
217 gid_t perm_gid; 239 gid_t perm_gid;
218 mode_t perm_mode; 240 umode_t perm_mode;
219 unsigned long qbytes; 241 unsigned long qbytes;
220 } ipc; 242 } ipc;
221 struct { 243 struct {
@@ -234,7 +256,7 @@ struct audit_context {
234 } mq_sendrecv; 256 } mq_sendrecv;
235 struct { 257 struct {
236 int oflag; 258 int oflag;
237 mode_t mode; 259 umode_t mode;
238 struct mq_attr attr; 260 struct mq_attr attr;
239 } mq_open; 261 } mq_open;
240 struct { 262 struct {
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
305 } 327 }
306} 328}
307 329
308static int audit_match_filetype(struct audit_context *ctx, int which) 330static int audit_match_filetype(struct audit_context *ctx, int val)
309{ 331{
310 unsigned index = which & ~S_IFMT; 332 struct audit_names *n;
311 mode_t mode = which & S_IFMT; 333 umode_t mode = (umode_t)val;
312 334
313 if (unlikely(!ctx)) 335 if (unlikely(!ctx))
314 return 0; 336 return 0;
315 337
316 if (index >= ctx->name_count) 338 list_for_each_entry(n, &ctx->names_list, list) {
317 return 0; 339 if ((n->ino != -1) &&
318 if (ctx->names[index].ino == -1) 340 ((n->mode & S_IFMT) == mode))
319 return 0; 341 return 1;
320 if ((ctx->names[index].mode ^ mode) & S_IFMT) 342 }
321 return 0; 343
322 return 1; 344 return 0;
323} 345}
324 346
325/* 347/*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
441 return 0; 463 return 0;
442} 464}
443 465
466static int audit_compare_id(uid_t uid1,
467 struct audit_names *name,
468 unsigned long name_offset,
469 struct audit_field *f,
470 struct audit_context *ctx)
471{
472 struct audit_names *n;
473 unsigned long addr;
474 uid_t uid2;
475 int rc;
476
477 BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
478
479 if (name) {
480 addr = (unsigned long)name;
481 addr += name_offset;
482
483 uid2 = *(uid_t *)addr;
484 rc = audit_comparator(uid1, f->op, uid2);
485 if (rc)
486 return rc;
487 }
488
489 if (ctx) {
490 list_for_each_entry(n, &ctx->names_list, list) {
491 addr = (unsigned long)n;
492 addr += name_offset;
493
494 uid2 = *(uid_t *)addr;
495
496 rc = audit_comparator(uid1, f->op, uid2);
497 if (rc)
498 return rc;
499 }
500 }
501 return 0;
502}
503
504static int audit_field_compare(struct task_struct *tsk,
505 const struct cred *cred,
506 struct audit_field *f,
507 struct audit_context *ctx,
508 struct audit_names *name)
509{
510 switch (f->val) {
511 /* process to file object comparisons */
512 case AUDIT_COMPARE_UID_TO_OBJ_UID:
513 return audit_compare_id(cred->uid,
514 name, offsetof(struct audit_names, uid),
515 f, ctx);
516 case AUDIT_COMPARE_GID_TO_OBJ_GID:
517 return audit_compare_id(cred->gid,
518 name, offsetof(struct audit_names, gid),
519 f, ctx);
520 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
521 return audit_compare_id(cred->euid,
522 name, offsetof(struct audit_names, uid),
523 f, ctx);
524 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
525 return audit_compare_id(cred->egid,
526 name, offsetof(struct audit_names, gid),
527 f, ctx);
528 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
529 return audit_compare_id(tsk->loginuid,
530 name, offsetof(struct audit_names, uid),
531 f, ctx);
532 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
533 return audit_compare_id(cred->suid,
534 name, offsetof(struct audit_names, uid),
535 f, ctx);
536 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
537 return audit_compare_id(cred->sgid,
538 name, offsetof(struct audit_names, gid),
539 f, ctx);
540 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
541 return audit_compare_id(cred->fsuid,
542 name, offsetof(struct audit_names, uid),
543 f, ctx);
544 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
545 return audit_compare_id(cred->fsgid,
546 name, offsetof(struct audit_names, gid),
547 f, ctx);
548 /* uid comparisons */
549 case AUDIT_COMPARE_UID_TO_AUID:
550 return audit_comparator(cred->uid, f->op, tsk->loginuid);
551 case AUDIT_COMPARE_UID_TO_EUID:
552 return audit_comparator(cred->uid, f->op, cred->euid);
553 case AUDIT_COMPARE_UID_TO_SUID:
554 return audit_comparator(cred->uid, f->op, cred->suid);
555 case AUDIT_COMPARE_UID_TO_FSUID:
556 return audit_comparator(cred->uid, f->op, cred->fsuid);
557 /* auid comparisons */
558 case AUDIT_COMPARE_AUID_TO_EUID:
559 return audit_comparator(tsk->loginuid, f->op, cred->euid);
560 case AUDIT_COMPARE_AUID_TO_SUID:
561 return audit_comparator(tsk->loginuid, f->op, cred->suid);
562 case AUDIT_COMPARE_AUID_TO_FSUID:
563 return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
564 /* euid comparisons */
565 case AUDIT_COMPARE_EUID_TO_SUID:
566 return audit_comparator(cred->euid, f->op, cred->suid);
567 case AUDIT_COMPARE_EUID_TO_FSUID:
568 return audit_comparator(cred->euid, f->op, cred->fsuid);
569 /* suid comparisons */
570 case AUDIT_COMPARE_SUID_TO_FSUID:
571 return audit_comparator(cred->suid, f->op, cred->fsuid);
572 /* gid comparisons */
573 case AUDIT_COMPARE_GID_TO_EGID:
574 return audit_comparator(cred->gid, f->op, cred->egid);
575 case AUDIT_COMPARE_GID_TO_SGID:
576 return audit_comparator(cred->gid, f->op, cred->sgid);
577 case AUDIT_COMPARE_GID_TO_FSGID:
578 return audit_comparator(cred->gid, f->op, cred->fsgid);
579 /* egid comparisons */
580 case AUDIT_COMPARE_EGID_TO_SGID:
581 return audit_comparator(cred->egid, f->op, cred->sgid);
582 case AUDIT_COMPARE_EGID_TO_FSGID:
583 return audit_comparator(cred->egid, f->op, cred->fsgid);
584 /* sgid comparison */
585 case AUDIT_COMPARE_SGID_TO_FSGID:
586 return audit_comparator(cred->sgid, f->op, cred->fsgid);
587 default:
588 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
589 return 0;
590 }
591 return 0;
592}
593
444/* Determine if any context name data matches a rule's watch data */ 594/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 595/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. 596 * otherwise.
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
457 bool task_creation) 607 bool task_creation)
458{ 608{
459 const struct cred *cred; 609 const struct cred *cred;
460 int i, j, need_sid = 1; 610 int i, need_sid = 1;
461 u32 sid; 611 u32 sid;
462 612
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); 613 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464 614
465 for (i = 0; i < rule->field_count; i++) { 615 for (i = 0; i < rule->field_count; i++) {
466 struct audit_field *f = &rule->fields[i]; 616 struct audit_field *f = &rule->fields[i];
617 struct audit_names *n;
467 int result = 0; 618 int result = 0;
468 619
469 switch (f->type) { 620 switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
522 } 673 }
523 break; 674 break;
524 case AUDIT_DEVMAJOR: 675 case AUDIT_DEVMAJOR:
525 if (name) 676 if (name) {
526 result = audit_comparator(MAJOR(name->dev), 677 if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
527 f->op, f->val); 678 audit_comparator(MAJOR(name->rdev), f->op, f->val))
528 else if (ctx) { 679 ++result;
529 for (j = 0; j < ctx->name_count; j++) { 680 } else if (ctx) {
530 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 681 list_for_each_entry(n, &ctx->names_list, list) {
682 if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
683 audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
531 ++result; 684 ++result;
532 break; 685 break;
533 } 686 }
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
535 } 688 }
536 break; 689 break;
537 case AUDIT_DEVMINOR: 690 case AUDIT_DEVMINOR:
538 if (name) 691 if (name) {
539 result = audit_comparator(MINOR(name->dev), 692 if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
540 f->op, f->val); 693 audit_comparator(MINOR(name->rdev), f->op, f->val))
541 else if (ctx) { 694 ++result;
542 for (j = 0; j < ctx->name_count; j++) { 695 } else if (ctx) {
543 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 696 list_for_each_entry(n, &ctx->names_list, list) {
697 if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
698 audit_comparator(MINOR(n->rdev), f->op, f->val)) {
544 ++result; 699 ++result;
545 break; 700 break;
546 } 701 }
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
551 if (name) 706 if (name)
552 result = (name->ino == f->val); 707 result = (name->ino == f->val);
553 else if (ctx) { 708 else if (ctx) {
554 for (j = 0; j < ctx->name_count; j++) { 709 list_for_each_entry(n, &ctx->names_list, list) {
555 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { 710 if (audit_comparator(n->ino, f->op, f->val)) {
711 ++result;
712 break;
713 }
714 }
715 }
716 break;
717 case AUDIT_OBJ_UID:
718 if (name) {
719 result = audit_comparator(name->uid, f->op, f->val);
720 } else if (ctx) {
721 list_for_each_entry(n, &ctx->names_list, list) {
722 if (audit_comparator(n->uid, f->op, f->val)) {
723 ++result;
724 break;
725 }
726 }
727 }
728 break;
729 case AUDIT_OBJ_GID:
730 if (name) {
731 result = audit_comparator(name->gid, f->op, f->val);
732 } else if (ctx) {
733 list_for_each_entry(n, &ctx->names_list, list) {
734 if (audit_comparator(n->gid, f->op, f->val)) {
556 ++result; 735 ++result;
557 break; 736 break;
558 } 737 }
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
607 name->osid, f->type, f->op, 786 name->osid, f->type, f->op,
608 f->lsm_rule, ctx); 787 f->lsm_rule, ctx);
609 } else if (ctx) { 788 } else if (ctx) {
610 for (j = 0; j < ctx->name_count; j++) { 789 list_for_each_entry(n, &ctx->names_list, list) {
611 if (security_audit_rule_match( 790 if (security_audit_rule_match(n->osid, f->type,
612 ctx->names[j].osid, 791 f->op, f->lsm_rule,
613 f->type, f->op, 792 ctx)) {
614 f->lsm_rule, ctx)) {
615 ++result; 793 ++result;
616 break; 794 break;
617 } 795 }
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
643 case AUDIT_FILETYPE: 821 case AUDIT_FILETYPE:
644 result = audit_match_filetype(ctx, f->val); 822 result = audit_match_filetype(ctx, f->val);
645 break; 823 break;
824 case AUDIT_FIELD_COMPARE:
825 result = audit_field_compare(tsk, cred, f, ctx, name);
826 break;
646 } 827 }
647
648 if (!result) 828 if (!result)
649 return 0; 829 return 0;
650 } 830 }
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
722 return AUDIT_BUILD_CONTEXT; 902 return AUDIT_BUILD_CONTEXT;
723} 903}
724 904
725/* At syscall exit time, this filter is called if any audit_names[] have been 905/*
906 * Given an audit_name check the inode hash table to see if they match.
907 * Called holding the rcu read lock to protect the use of audit_inode_hash
908 */
909static int audit_filter_inode_name(struct task_struct *tsk,
910 struct audit_names *n,
911 struct audit_context *ctx) {
912 int word, bit;
913 int h = audit_hash_ino((u32)n->ino);
914 struct list_head *list = &audit_inode_hash[h];
915 struct audit_entry *e;
916 enum audit_state state;
917
918 word = AUDIT_WORD(ctx->major);
919 bit = AUDIT_BIT(ctx->major);
920
921 if (list_empty(list))
922 return 0;
923
924 list_for_each_entry_rcu(e, list, list) {
925 if ((e->rule.mask[word] & bit) == bit &&
926 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
927 ctx->current_state = state;
928 return 1;
929 }
930 }
931
932 return 0;
933}
934
935/* At syscall exit time, this filter is called if any audit_names have been
726 * collected during syscall processing. We only check rules in sublists at hash 936 * collected during syscall processing. We only check rules in sublists at hash
727 * buckets applicable to the inode numbers in audit_names[]. 937 * buckets applicable to the inode numbers in audit_names.
728 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 938 * Regarding audit_state, same rules apply as for audit_filter_syscall().
729 */ 939 */
730void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) 940void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
731{ 941{
732 int i; 942 struct audit_names *n;
733 struct audit_entry *e;
734 enum audit_state state;
735 943
736 if (audit_pid && tsk->tgid == audit_pid) 944 if (audit_pid && tsk->tgid == audit_pid)
737 return; 945 return;
738 946
739 rcu_read_lock(); 947 rcu_read_lock();
740 for (i = 0; i < ctx->name_count; i++) {
741 int word = AUDIT_WORD(ctx->major);
742 int bit = AUDIT_BIT(ctx->major);
743 struct audit_names *n = &ctx->names[i];
744 int h = audit_hash_ino((u32)n->ino);
745 struct list_head *list = &audit_inode_hash[h];
746
747 if (list_empty(list))
748 continue;
749 948
750 list_for_each_entry_rcu(e, list, list) { 949 list_for_each_entry(n, &ctx->names_list, list) {
751 if ((e->rule.mask[word] & bit) == bit && 950 if (audit_filter_inode_name(tsk, n, ctx))
752 audit_filter_rules(tsk, &e->rule, ctx, n, 951 break;
753 &state, false)) {
754 rcu_read_unlock();
755 ctx->current_state = state;
756 return;
757 }
758 }
759 } 952 }
760 rcu_read_unlock(); 953 rcu_read_unlock();
761} 954}
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
766{ 959{
767 struct audit_context *context = tsk->audit_context; 960 struct audit_context *context = tsk->audit_context;
768 961
769 if (likely(!context)) 962 if (!context)
770 return NULL; 963 return NULL;
771 context->return_valid = return_valid; 964 context->return_valid = return_valid;
772 965
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
799 992
800static inline void audit_free_names(struct audit_context *context) 993static inline void audit_free_names(struct audit_context *context)
801{ 994{
802 int i; 995 struct audit_names *n, *next;
803 996
804#if AUDIT_DEBUG == 2 997#if AUDIT_DEBUG == 2
805 if (context->put_count + context->ino_count != context->name_count) { 998 if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
810 context->serial, context->major, context->in_syscall, 1003 context->serial, context->major, context->in_syscall,
811 context->name_count, context->put_count, 1004 context->name_count, context->put_count,
812 context->ino_count); 1005 context->ino_count);
813 for (i = 0; i < context->name_count; i++) { 1006 list_for_each_entry(n, &context->names_list, list) {
814 printk(KERN_ERR "names[%d] = %p = %s\n", i, 1007 printk(KERN_ERR "names[%d] = %p = %s\n", i,
815 context->names[i].name, 1008 n->name, n->name ?: "(null)");
816 context->names[i].name ?: "(null)");
817 } 1009 }
818 dump_stack(); 1010 dump_stack();
819 return; 1011 return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
824 context->ino_count = 0; 1016 context->ino_count = 0;
825#endif 1017#endif
826 1018
827 for (i = 0; i < context->name_count; i++) { 1019 list_for_each_entry_safe(n, next, &context->names_list, list) {
828 if (context->names[i].name && context->names[i].name_put) 1020 list_del(&n->list);
829 __putname(context->names[i].name); 1021 if (n->name && n->name_put)
1022 __putname(n->name);
1023 if (n->should_free)
1024 kfree(n);
830 } 1025 }
831 context->name_count = 0; 1026 context->name_count = 0;
832 path_put(&context->pwd); 1027 path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
864 return NULL; 1059 return NULL;
865 audit_zero_context(context, state); 1060 audit_zero_context(context, state);
866 INIT_LIST_HEAD(&context->killed_trees); 1061 INIT_LIST_HEAD(&context->killed_trees);
1062 INIT_LIST_HEAD(&context->names_list);
867 return context; 1063 return context;
868} 1064}
869 1065
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
886 return 0; /* Return if not auditing. */ 1082 return 0; /* Return if not auditing. */
887 1083
888 state = audit_filter_task(tsk, &key); 1084 state = audit_filter_task(tsk, &key);
889 if (likely(state == AUDIT_DISABLED)) 1085 if (state == AUDIT_DISABLED)
890 return 0; 1086 return 0;
891 1087
892 if (!(context = audit_alloc_context(state))) { 1088 if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
975 while (vma) { 1171 while (vma) {
976 if ((vma->vm_flags & VM_EXECUTABLE) && 1172 if ((vma->vm_flags & VM_EXECUTABLE) &&
977 vma->vm_file) { 1173 vma->vm_file) {
978 audit_log_d_path(ab, "exe=", 1174 audit_log_d_path(ab, " exe=",
979 &vma->vm_file->f_path); 1175 &vma->vm_file->f_path);
980 break; 1176 break;
981 } 1177 }
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
1166 struct audit_buffer **ab, 1362 struct audit_buffer **ab,
1167 struct audit_aux_data_execve *axi) 1363 struct audit_aux_data_execve *axi)
1168{ 1364{
1169 int i; 1365 int i, len;
1170 size_t len, len_sent = 0; 1366 size_t len_sent = 0;
1171 const char __user *p; 1367 const char __user *p;
1172 char *buf; 1368 char *buf;
1173 1369
@@ -1249,7 +1445,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1249 case AUDIT_IPC: { 1445 case AUDIT_IPC: {
1250 u32 osid = context->ipc.osid; 1446 u32 osid = context->ipc.osid;
1251 1447
1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", 1448 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1253 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1449 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1254 if (osid) { 1450 if (osid) {
1255 char *ctx = NULL; 1451 char *ctx = NULL;
@@ -1267,7 +1463,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1267 ab = audit_log_start(context, GFP_KERNEL, 1463 ab = audit_log_start(context, GFP_KERNEL,
1268 AUDIT_IPC_SET_PERM); 1464 AUDIT_IPC_SET_PERM);
1269 audit_log_format(ab, 1465 audit_log_format(ab,
1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o", 1466 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1271 context->ipc.qbytes, 1467 context->ipc.qbytes,
1272 context->ipc.perm_uid, 1468 context->ipc.perm_uid,
1273 context->ipc.perm_gid, 1469 context->ipc.perm_gid,
@@ -1278,7 +1474,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1278 break; } 1474 break; }
1279 case AUDIT_MQ_OPEN: { 1475 case AUDIT_MQ_OPEN: {
1280 audit_log_format(ab, 1476 audit_log_format(ab,
1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " 1477 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
1282 "mq_msgsize=%ld mq_curmsgs=%ld", 1478 "mq_msgsize=%ld mq_curmsgs=%ld",
1283 context->mq_open.oflag, context->mq_open.mode, 1479 context->mq_open.oflag, context->mq_open.mode,
1284 context->mq_open.attr.mq_flags, 1480 context->mq_open.attr.mq_flags,
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
1324 audit_log_end(ab); 1520 audit_log_end(ab);
1325} 1521}
1326 1522
1523static void audit_log_name(struct audit_context *context, struct audit_names *n,
1524 int record_num, int *call_panic)
1525{
1526 struct audit_buffer *ab;
1527 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1528 if (!ab)
1529 return; /* audit_panic has been called */
1530
1531 audit_log_format(ab, "item=%d", record_num);
1532
1533 if (n->name) {
1534 switch (n->name_len) {
1535 case AUDIT_NAME_FULL:
1536 /* log the full path */
1537 audit_log_format(ab, " name=");
1538 audit_log_untrustedstring(ab, n->name);
1539 break;
1540 case 0:
1541 /* name was specified as a relative path and the
1542 * directory component is the cwd */
1543 audit_log_d_path(ab, " name=", &context->pwd);
1544 break;
1545 default:
1546 /* log the name's directory component */
1547 audit_log_format(ab, " name=");
1548 audit_log_n_untrustedstring(ab, n->name,
1549 n->name_len);
1550 }
1551 } else
1552 audit_log_format(ab, " name=(null)");
1553
1554 if (n->ino != (unsigned long)-1) {
1555 audit_log_format(ab, " inode=%lu"
1556 " dev=%02x:%02x mode=%#ho"
1557 " ouid=%u ogid=%u rdev=%02x:%02x",
1558 n->ino,
1559 MAJOR(n->dev),
1560 MINOR(n->dev),
1561 n->mode,
1562 n->uid,
1563 n->gid,
1564 MAJOR(n->rdev),
1565 MINOR(n->rdev));
1566 }
1567 if (n->osid != 0) {
1568 char *ctx = NULL;
1569 u32 len;
1570 if (security_secid_to_secctx(
1571 n->osid, &ctx, &len)) {
1572 audit_log_format(ab, " osid=%u", n->osid);
1573 *call_panic = 2;
1574 } else {
1575 audit_log_format(ab, " obj=%s", ctx);
1576 security_release_secctx(ctx, len);
1577 }
1578 }
1579
1580 audit_log_fcaps(ab, n);
1581
1582 audit_log_end(ab);
1583}
1584
1327static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1585static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1328{ 1586{
1329 const struct cred *cred; 1587 const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1331 struct audit_buffer *ab; 1589 struct audit_buffer *ab;
1332 struct audit_aux_data *aux; 1590 struct audit_aux_data *aux;
1333 const char *tty; 1591 const char *tty;
1592 struct audit_names *n;
1334 1593
1335 /* tsk == current */ 1594 /* tsk == current */
1336 context->pid = tsk->pid; 1595 context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1466 if (context->pwd.dentry && context->pwd.mnt) { 1725 if (context->pwd.dentry && context->pwd.mnt) {
1467 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); 1726 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
1468 if (ab) { 1727 if (ab) {
1469 audit_log_d_path(ab, "cwd=", &context->pwd); 1728 audit_log_d_path(ab, " cwd=", &context->pwd);
1470 audit_log_end(ab); 1729 audit_log_end(ab);
1471 } 1730 }
1472 } 1731 }
1473 for (i = 0; i < context->name_count; i++) {
1474 struct audit_names *n = &context->names[i];
1475 1732
1476 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 1733 i = 0;
1477 if (!ab) 1734 list_for_each_entry(n, &context->names_list, list)
1478 continue; /* audit_panic has been called */ 1735 audit_log_name(context, n, i++, &call_panic);
1479
1480 audit_log_format(ab, "item=%d", i);
1481
1482 if (n->name) {
1483 switch(n->name_len) {
1484 case AUDIT_NAME_FULL:
1485 /* log the full path */
1486 audit_log_format(ab, " name=");
1487 audit_log_untrustedstring(ab, n->name);
1488 break;
1489 case 0:
1490 /* name was specified as a relative path and the
1491 * directory component is the cwd */
1492 audit_log_d_path(ab, "name=", &context->pwd);
1493 break;
1494 default:
1495 /* log the name's directory component */
1496 audit_log_format(ab, " name=");
1497 audit_log_n_untrustedstring(ab, n->name,
1498 n->name_len);
1499 }
1500 } else
1501 audit_log_format(ab, " name=(null)");
1502
1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o"
1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino,
1508 MAJOR(n->dev),
1509 MINOR(n->dev),
1510 n->mode,
1511 n->uid,
1512 n->gid,
1513 MAJOR(n->rdev),
1514 MINOR(n->rdev));
1515 }
1516 if (n->osid != 0) {
1517 char *ctx = NULL;
1518 u32 len;
1519 if (security_secid_to_secctx(
1520 n->osid, &ctx, &len)) {
1521 audit_log_format(ab, " osid=%u", n->osid);
1522 call_panic = 2;
1523 } else {
1524 audit_log_format(ab, " obj=%s", ctx);
1525 security_release_secctx(ctx, len);
1526 }
1527 }
1528
1529 audit_log_fcaps(ab, n);
1530
1531 audit_log_end(ab);
1532 }
1533 1736
1534 /* Send end of event record to help user space know we are finished */ 1737 /* Send end of event record to help user space know we are finished */
1535 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1738 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1545 * 1748 *
1546 * Called from copy_process and do_exit 1749 * Called from copy_process and do_exit
1547 */ 1750 */
1548void audit_free(struct task_struct *tsk) 1751void __audit_free(struct task_struct *tsk)
1549{ 1752{
1550 struct audit_context *context; 1753 struct audit_context *context;
1551 1754
1552 context = audit_get_context(tsk, 0, 0); 1755 context = audit_get_context(tsk, 0, 0);
1553 if (likely(!context)) 1756 if (!context)
1554 return; 1757 return;
1555 1758
1556 /* Check for system calls that do not go through the exit 1759 /* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
1583 * will only be written if another part of the kernel requests that it 1786 * will only be written if another part of the kernel requests that it
1584 * be written). 1787 * be written).
1585 */ 1788 */
1586void audit_syscall_entry(int arch, int major, 1789void __audit_syscall_entry(int arch, int major,
1587 unsigned long a1, unsigned long a2, 1790 unsigned long a1, unsigned long a2,
1588 unsigned long a3, unsigned long a4) 1791 unsigned long a3, unsigned long a4)
1589{ 1792{
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
1591 struct audit_context *context = tsk->audit_context; 1794 struct audit_context *context = tsk->audit_context;
1592 enum audit_state state; 1795 enum audit_state state;
1593 1796
1594 if (unlikely(!context)) 1797 if (!context)
1595 return; 1798 return;
1596 1799
1597 /* 1800 /*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
1648 context->prio = 0; 1851 context->prio = 0;
1649 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1852 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1650 } 1853 }
1651 if (likely(state == AUDIT_DISABLED)) 1854 if (state == AUDIT_DISABLED)
1652 return; 1855 return;
1653 1856
1654 context->serial = 0; 1857 context->serial = 0;
@@ -1658,45 +1861,29 @@ void audit_syscall_entry(int arch, int major,
1658 context->ppid = 0; 1861 context->ppid = 0;
1659} 1862}
1660 1863
1661void audit_finish_fork(struct task_struct *child)
1662{
1663 struct audit_context *ctx = current->audit_context;
1664 struct audit_context *p = child->audit_context;
1665 if (!p || !ctx)
1666 return;
1667 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1668 return;
1669 p->arch = ctx->arch;
1670 p->major = ctx->major;
1671 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1672 p->ctime = ctx->ctime;
1673 p->dummy = ctx->dummy;
1674 p->in_syscall = ctx->in_syscall;
1675 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1676 p->ppid = current->pid;
1677 p->prio = ctx->prio;
1678 p->current_state = ctx->current_state;
1679}
1680
1681/** 1864/**
1682 * audit_syscall_exit - deallocate audit context after a system call 1865 * audit_syscall_exit - deallocate audit context after a system call
1683 * @valid: success/failure flag 1866 * @success: success value of the syscall
1684 * @return_code: syscall return value 1867 * @return_code: return value of the syscall
1685 * 1868 *
1686 * Tear down after system call. If the audit context has been marked as 1869 * Tear down after system call. If the audit context has been marked as
1687 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 1870 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1688 * filtering, or because some other part of the kernel write an audit 1871 * filtering, or because some other part of the kernel wrote an audit
1689 * message), then write out the syscall information. In call cases, 1872 * message), then write out the syscall information. In call cases,
1690 * free the names stored from getname(). 1873 * free the names stored from getname().
1691 */ 1874 */
1692void audit_syscall_exit(int valid, long return_code) 1875void __audit_syscall_exit(int success, long return_code)
1693{ 1876{
1694 struct task_struct *tsk = current; 1877 struct task_struct *tsk = current;
1695 struct audit_context *context; 1878 struct audit_context *context;
1696 1879
1697 context = audit_get_context(tsk, valid, return_code); 1880 if (success)
1881 success = AUDITSC_SUCCESS;
1882 else
1883 success = AUDITSC_FAILURE;
1698 1884
1699 if (likely(!context)) 1885 context = audit_get_context(tsk, success, return_code);
1886 if (!context)
1700 return; 1887 return;
1701 1888
1702 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1889 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2008,30 @@ retry:
1821#endif 2008#endif
1822} 2009}
1823 2010
2011static struct audit_names *audit_alloc_name(struct audit_context *context)
2012{
2013 struct audit_names *aname;
2014
2015 if (context->name_count < AUDIT_NAMES) {
2016 aname = &context->preallocated_names[context->name_count];
2017 memset(aname, 0, sizeof(*aname));
2018 } else {
2019 aname = kzalloc(sizeof(*aname), GFP_NOFS);
2020 if (!aname)
2021 return NULL;
2022 aname->should_free = true;
2023 }
2024
2025 aname->ino = (unsigned long)-1;
2026 list_add_tail(&aname->list, &context->names_list);
2027
2028 context->name_count++;
2029#if AUDIT_DEBUG
2030 context->ino_count++;
2031#endif
2032 return aname;
2033}
2034
1824/** 2035/**
1825 * audit_getname - add a name to the list 2036 * audit_getname - add a name to the list
1826 * @name: name to add 2037 * @name: name to add
@@ -1831,9 +2042,7 @@ retry:
1831void __audit_getname(const char *name) 2042void __audit_getname(const char *name)
1832{ 2043{
1833 struct audit_context *context = current->audit_context; 2044 struct audit_context *context = current->audit_context;
1834 2045 struct audit_names *n;
1835 if (IS_ERR(name) || !name)
1836 return;
1837 2046
1838 if (!context->in_syscall) { 2047 if (!context->in_syscall) {
1839#if AUDIT_DEBUG == 2 2048#if AUDIT_DEBUG == 2
@@ -1843,13 +2052,15 @@ void __audit_getname(const char *name)
1843#endif 2052#endif
1844 return; 2053 return;
1845 } 2054 }
1846 BUG_ON(context->name_count >= AUDIT_NAMES); 2055
1847 context->names[context->name_count].name = name; 2056 n = audit_alloc_name(context);
1848 context->names[context->name_count].name_len = AUDIT_NAME_FULL; 2057 if (!n)
1849 context->names[context->name_count].name_put = 1; 2058 return;
1850 context->names[context->name_count].ino = (unsigned long)-1; 2059
1851 context->names[context->name_count].osid = 0; 2060 n->name = name;
1852 ++context->name_count; 2061 n->name_len = AUDIT_NAME_FULL;
2062 n->name_put = true;
2063
1853 if (!context->pwd.dentry) 2064 if (!context->pwd.dentry)
1854 get_fs_pwd(current->fs, &context->pwd); 2065 get_fs_pwd(current->fs, &context->pwd);
1855} 2066}
@@ -1871,12 +2082,13 @@ void audit_putname(const char *name)
1871 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", 2082 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
1872 __FILE__, __LINE__, context->serial, name); 2083 __FILE__, __LINE__, context->serial, name);
1873 if (context->name_count) { 2084 if (context->name_count) {
2085 struct audit_names *n;
1874 int i; 2086 int i;
1875 for (i = 0; i < context->name_count; i++) 2087
2088 list_for_each_entry(n, &context->names_list, list)
1876 printk(KERN_ERR "name[%d] = %p = %s\n", i, 2089 printk(KERN_ERR "name[%d] = %p = %s\n", i,
1877 context->names[i].name, 2090 n->name, n->name ?: "(null)");
1878 context->names[i].name ?: "(null)"); 2091 }
1879 }
1880#endif 2092#endif
1881 __putname(name); 2093 __putname(name);
1882 } 2094 }
@@ -1897,39 +2109,11 @@ void audit_putname(const char *name)
1897#endif 2109#endif
1898} 2110}
1899 2111
1900static int audit_inc_name_count(struct audit_context *context,
1901 const struct inode *inode)
1902{
1903 if (context->name_count >= AUDIT_NAMES) {
1904 if (inode)
1905 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1906 "dev=%02x:%02x, inode=%lu\n",
1907 MAJOR(inode->i_sb->s_dev),
1908 MINOR(inode->i_sb->s_dev),
1909 inode->i_ino);
1910
1911 else
1912 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1913 return 1;
1914 }
1915 context->name_count++;
1916#if AUDIT_DEBUG
1917 context->ino_count++;
1918#endif
1919 return 0;
1920}
1921
1922
1923static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) 2112static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1924{ 2113{
1925 struct cpu_vfs_cap_data caps; 2114 struct cpu_vfs_cap_data caps;
1926 int rc; 2115 int rc;
1927 2116
1928 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1929 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1930 name->fcap.fE = 0;
1931 name->fcap_ver = 0;
1932
1933 if (!dentry) 2117 if (!dentry)
1934 return 0; 2118 return 0;
1935 2119
@@ -1969,30 +2153,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
1969 */ 2153 */
1970void __audit_inode(const char *name, const struct dentry *dentry) 2154void __audit_inode(const char *name, const struct dentry *dentry)
1971{ 2155{
1972 int idx;
1973 struct audit_context *context = current->audit_context; 2156 struct audit_context *context = current->audit_context;
1974 const struct inode *inode = dentry->d_inode; 2157 const struct inode *inode = dentry->d_inode;
2158 struct audit_names *n;
1975 2159
1976 if (!context->in_syscall) 2160 if (!context->in_syscall)
1977 return; 2161 return;
1978 if (context->name_count 2162
1979 && context->names[context->name_count-1].name 2163 list_for_each_entry_reverse(n, &context->names_list, list) {
1980 && context->names[context->name_count-1].name == name) 2164 if (n->name && (n->name == name))
1981 idx = context->name_count - 1; 2165 goto out;
1982 else if (context->name_count > 1
1983 && context->names[context->name_count-2].name
1984 && context->names[context->name_count-2].name == name)
1985 idx = context->name_count - 2;
1986 else {
1987 /* FIXME: how much do we care about inodes that have no
1988 * associated name? */
1989 if (audit_inc_name_count(context, inode))
1990 return;
1991 idx = context->name_count - 1;
1992 context->names[idx].name = NULL;
1993 } 2166 }
2167
2168 /* unable to find the name from a previous getname() */
2169 n = audit_alloc_name(context);
2170 if (!n)
2171 return;
2172out:
1994 handle_path(dentry); 2173 handle_path(dentry);
1995 audit_copy_inode(&context->names[idx], dentry, inode); 2174 audit_copy_inode(n, dentry, inode);
1996} 2175}
1997 2176
1998/** 2177/**
@@ -2011,11 +2190,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2011void __audit_inode_child(const struct dentry *dentry, 2190void __audit_inode_child(const struct dentry *dentry,
2012 const struct inode *parent) 2191 const struct inode *parent)
2013{ 2192{
2014 int idx;
2015 struct audit_context *context = current->audit_context; 2193 struct audit_context *context = current->audit_context;
2016 const char *found_parent = NULL, *found_child = NULL; 2194 const char *found_parent = NULL, *found_child = NULL;
2017 const struct inode *inode = dentry->d_inode; 2195 const struct inode *inode = dentry->d_inode;
2018 const char *dname = dentry->d_name.name; 2196 const char *dname = dentry->d_name.name;
2197 struct audit_names *n;
2019 int dirlen = 0; 2198 int dirlen = 0;
2020 2199
2021 if (!context->in_syscall) 2200 if (!context->in_syscall)
@@ -2025,9 +2204,7 @@ void __audit_inode_child(const struct dentry *dentry,
2025 handle_one(inode); 2204 handle_one(inode);
2026 2205
2027 /* parent is more likely, look for it first */ 2206 /* parent is more likely, look for it first */
2028 for (idx = 0; idx < context->name_count; idx++) { 2207 list_for_each_entry(n, &context->names_list, list) {
2029 struct audit_names *n = &context->names[idx];
2030
2031 if (!n->name) 2208 if (!n->name)
2032 continue; 2209 continue;
2033 2210
@@ -2040,9 +2217,7 @@ void __audit_inode_child(const struct dentry *dentry,
2040 } 2217 }
2041 2218
2042 /* no matching parent, look for matching child */ 2219 /* no matching parent, look for matching child */
2043 for (idx = 0; idx < context->name_count; idx++) { 2220 list_for_each_entry(n, &context->names_list, list) {
2044 struct audit_names *n = &context->names[idx];
2045
2046 if (!n->name) 2221 if (!n->name)
2047 continue; 2222 continue;
2048 2223
@@ -2060,34 +2235,29 @@ void __audit_inode_child(const struct dentry *dentry,
2060 2235
2061add_names: 2236add_names:
2062 if (!found_parent) { 2237 if (!found_parent) {
2063 if (audit_inc_name_count(context, parent)) 2238 n = audit_alloc_name(context);
2239 if (!n)
2064 return; 2240 return;
2065 idx = context->name_count - 1; 2241 audit_copy_inode(n, NULL, parent);
2066 context->names[idx].name = NULL;
2067 audit_copy_inode(&context->names[idx], NULL, parent);
2068 } 2242 }
2069 2243
2070 if (!found_child) { 2244 if (!found_child) {
2071 if (audit_inc_name_count(context, inode)) 2245 n = audit_alloc_name(context);
2246 if (!n)
2072 return; 2247 return;
2073 idx = context->name_count - 1;
2074 2248
2075 /* Re-use the name belonging to the slot for a matching parent 2249 /* Re-use the name belonging to the slot for a matching parent
2076 * directory. All names for this context are relinquished in 2250 * directory. All names for this context are relinquished in
2077 * audit_free_names() */ 2251 * audit_free_names() */
2078 if (found_parent) { 2252 if (found_parent) {
2079 context->names[idx].name = found_parent; 2253 n->name = found_parent;
2080 context->names[idx].name_len = AUDIT_NAME_FULL; 2254 n->name_len = AUDIT_NAME_FULL;
2081 /* don't call __putname() */ 2255 /* don't call __putname() */
2082 context->names[idx].name_put = 0; 2256 n->name_put = false;
2083 } else {
2084 context->names[idx].name = NULL;
2085 } 2257 }
2086 2258
2087 if (inode) 2259 if (inode)
2088 audit_copy_inode(&context->names[idx], NULL, inode); 2260 audit_copy_inode(n, NULL, inode);
2089 else
2090 context->names[idx].ino = (unsigned long)-1;
2091 } 2261 }
2092} 2262}
2093EXPORT_SYMBOL_GPL(__audit_inode_child); 2263EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2291,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
2121static atomic_t session_id = ATOMIC_INIT(0); 2291static atomic_t session_id = ATOMIC_INIT(0);
2122 2292
2123/** 2293/**
2124 * audit_set_loginuid - set a task's audit_context loginuid 2294 * audit_set_loginuid - set current task's audit_context loginuid
2125 * @task: task whose audit context is being modified
2126 * @loginuid: loginuid value 2295 * @loginuid: loginuid value
2127 * 2296 *
2128 * Returns 0. 2297 * Returns 0.
2129 * 2298 *
2130 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2299 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2131 */ 2300 */
2132int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 2301int audit_set_loginuid(uid_t loginuid)
2133{ 2302{
2134 unsigned int sessionid = atomic_inc_return(&session_id); 2303 struct task_struct *task = current;
2135 struct audit_context *context = task->audit_context; 2304 struct audit_context *context = task->audit_context;
2305 unsigned int sessionid;
2306
2307#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2308 if (task->loginuid != -1)
2309 return -EPERM;
2310#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2311 if (!capable(CAP_AUDIT_CONTROL))
2312 return -EPERM;
2313#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2136 2314
2315 sessionid = atomic_inc_return(&session_id);
2137 if (context && context->in_syscall) { 2316 if (context && context->in_syscall) {
2138 struct audit_buffer *ab; 2317 struct audit_buffer *ab;
2139 2318
@@ -2160,7 +2339,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2160 * @attr: queue attributes 2339 * @attr: queue attributes
2161 * 2340 *
2162 */ 2341 */
2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2342void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
2164{ 2343{
2165 struct audit_context *context = current->audit_context; 2344 struct audit_context *context = current->audit_context;
2166 2345
@@ -2260,7 +2439,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2260 * 2439 *
2261 * Called only after audit_ipc_obj(). 2440 * Called only after audit_ipc_obj().
2262 */ 2441 */
2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2442void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
2264{ 2443{
2265 struct audit_context *context = current->audit_context; 2444 struct audit_context *context = current->audit_context;
2266 2445
@@ -2271,14 +2450,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod
2271 context->ipc.has_perm = 1; 2450 context->ipc.has_perm = 1;
2272} 2451}
2273 2452
2274int audit_bprm(struct linux_binprm *bprm) 2453int __audit_bprm(struct linux_binprm *bprm)
2275{ 2454{
2276 struct audit_aux_data_execve *ax; 2455 struct audit_aux_data_execve *ax;
2277 struct audit_context *context = current->audit_context; 2456 struct audit_context *context = current->audit_context;
2278 2457
2279 if (likely(!audit_enabled || !context || context->dummy))
2280 return 0;
2281
2282 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2458 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2283 if (!ax) 2459 if (!ax)
2284 return -ENOMEM; 2460 return -ENOMEM;
@@ -2299,13 +2475,10 @@ int audit_bprm(struct linux_binprm *bprm)
2299 * @args: args array 2475 * @args: args array
2300 * 2476 *
2301 */ 2477 */
2302void audit_socketcall(int nargs, unsigned long *args) 2478void __audit_socketcall(int nargs, unsigned long *args)
2303{ 2479{
2304 struct audit_context *context = current->audit_context; 2480 struct audit_context *context = current->audit_context;
2305 2481
2306 if (likely(!context || context->dummy))
2307 return;
2308
2309 context->type = AUDIT_SOCKETCALL; 2482 context->type = AUDIT_SOCKETCALL;
2310 context->socketcall.nargs = nargs; 2483 context->socketcall.nargs = nargs;
2311 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); 2484 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2504,10 @@ void __audit_fd_pair(int fd1, int fd2)
2331 * 2504 *
2332 * Returns 0 for success or NULL context or < 0 on error. 2505 * Returns 0 for success or NULL context or < 0 on error.
2333 */ 2506 */
2334int audit_sockaddr(int len, void *a) 2507int __audit_sockaddr(int len, void *a)
2335{ 2508{
2336 struct audit_context *context = current->audit_context; 2509 struct audit_context *context = current->audit_context;
2337 2510
2338 if (likely(!context || context->dummy))
2339 return 0;
2340
2341 if (!context->sockaddr) { 2511 if (!context->sockaddr) {
2342 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); 2512 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2343 if (!p) 2513 if (!p)
@@ -2499,6 +2669,25 @@ void __audit_mmap_fd(int fd, int flags)
2499 context->type = AUDIT_MMAP; 2669 context->type = AUDIT_MMAP;
2500} 2670}
2501 2671
2672static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2673{
2674 uid_t auid, uid;
2675 gid_t gid;
2676 unsigned int sessionid;
2677
2678 auid = audit_get_loginuid(current);
2679 sessionid = audit_get_sessionid(current);
2680 current_uid_gid(&uid, &gid);
2681
2682 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2683 auid, uid, gid, sessionid);
2684 audit_log_task_context(ab);
2685 audit_log_format(ab, " pid=%d comm=", current->pid);
2686 audit_log_untrustedstring(ab, current->comm);
2687 audit_log_format(ab, " reason=");
2688 audit_log_string(ab, reason);
2689 audit_log_format(ab, " sig=%ld", signr);
2690}
2502/** 2691/**
2503 * audit_core_dumps - record information about processes that end abnormally 2692 * audit_core_dumps - record information about processes that end abnormally
2504 * @signr: signal value 2693 * @signr: signal value
@@ -2509,10 +2698,6 @@ void __audit_mmap_fd(int fd, int flags)
2509void audit_core_dumps(long signr) 2698void audit_core_dumps(long signr)
2510{ 2699{
2511 struct audit_buffer *ab; 2700 struct audit_buffer *ab;
2512 u32 sid;
2513 uid_t auid = audit_get_loginuid(current), uid;
2514 gid_t gid;
2515 unsigned int sessionid = audit_get_sessionid(current);
2516 2701
2517 if (!audit_enabled) 2702 if (!audit_enabled)
2518 return; 2703 return;
@@ -2521,24 +2706,17 @@ void audit_core_dumps(long signr)
2521 return; 2706 return;
2522 2707
2523 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2708 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2524 current_uid_gid(&uid, &gid); 2709 audit_log_abend(ab, "memory violation", signr);
2525 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2710 audit_log_end(ab);
2526 auid, uid, gid, sessionid); 2711}
2527 security_task_getsecid(current, &sid);
2528 if (sid) {
2529 char *ctx = NULL;
2530 u32 len;
2531 2712
2532 if (security_secid_to_secctx(sid, &ctx, &len)) 2713void __audit_seccomp(unsigned long syscall)
2533 audit_log_format(ab, " ssid=%u", sid); 2714{
2534 else { 2715 struct audit_buffer *ab;
2535 audit_log_format(ab, " subj=%s", ctx); 2716
2536 security_release_secctx(ctx, len); 2717 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2537 } 2718 audit_log_abend(ab, "seccomp", SIGKILL);
2538 } 2719 audit_log_format(ab, " syscall=%ld", syscall);
2539 audit_log_format(ab, " pid=%d comm=", current->pid);
2540 audit_log_untrustedstring(ab, current->comm);
2541 audit_log_format(ab, " sig=%ld", signr);
2542 audit_log_end(ab); 2720 audit_log_end(ab);
2543} 2721}
2544 2722
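The hunks above rename audit_bprm(), audit_socketcall() and audit_sockaddr() to __audit_bprm(), __audit_socketcall() and __audit_sockaddr() and drop their inline dummy-context checks, which implies the fast-path test now lives in header wrappers. A minimal sketch of what such a wrapper could look like, assuming an audit_dummy_context() helper in include/linux/audit.h (not shown in this diff):

/* Sketch only: fast-path wrapper assumed to live in the audit header. */
static inline void audit_socketcall(int nargs, unsigned long *args)
{
	/* Only enter the slow path when a real (non-dummy) context exists. */
	if (unlikely(!audit_dummy_context()))
		__audit_socketcall(nargs, args);
}

The same shape would apply to the other renamed entry points, keeping the common no-audit case branch-free in the callers.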
diff --git a/kernel/capability.c b/kernel/capability.c
index b463871a4e69..3f1adb6c6470 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -287,74 +287,84 @@ error:
287} 287}
288 288
289/** 289/**
290 * has_capability - Does a task have a capability in init_user_ns 290 * has_ns_capability - Does a task have a capability in a specific user ns
291 * @t: The task in question 291 * @t: The task in question
292 * @ns: target user namespace
292 * @cap: The capability to be tested for 293 * @cap: The capability to be tested for
293 * 294 *
294 * Return true if the specified task has the given superior capability 295 * Return true if the specified task has the given superior capability
295 * currently in effect to the initial user namespace, false if not. 296 * currently in effect to the specified user namespace, false if not.
296 * 297 *
297 * Note that this does not set PF_SUPERPRIV on the task. 298 * Note that this does not set PF_SUPERPRIV on the task.
298 */ 299 */
299bool has_capability(struct task_struct *t, int cap) 300bool has_ns_capability(struct task_struct *t,
301 struct user_namespace *ns, int cap)
300{ 302{
301 int ret = security_real_capable(t, &init_user_ns, cap); 303 int ret;
304
305 rcu_read_lock();
306 ret = security_capable(__task_cred(t), ns, cap);
307 rcu_read_unlock();
302 308
303 return (ret == 0); 309 return (ret == 0);
304} 310}
305 311
306/** 312/**
307 * has_capability - Does a task have a capability in a specific user ns 313 * has_capability - Does a task have a capability in init_user_ns
308 * @t: The task in question 314 * @t: The task in question
309 * @ns: target user namespace
310 * @cap: The capability to be tested for 315 * @cap: The capability to be tested for
311 * 316 *
312 * Return true if the specified task has the given superior capability 317 * Return true if the specified task has the given superior capability
313 * currently in effect to the specified user namespace, false if not. 318 * currently in effect to the initial user namespace, false if not.
314 * 319 *
315 * Note that this does not set PF_SUPERPRIV on the task. 320 * Note that this does not set PF_SUPERPRIV on the task.
316 */ 321 */
317bool has_ns_capability(struct task_struct *t, 322bool has_capability(struct task_struct *t, int cap)
318 struct user_namespace *ns, int cap)
319{ 323{
320 int ret = security_real_capable(t, ns, cap); 324 return has_ns_capability(t, &init_user_ns, cap);
321
322 return (ret == 0);
323} 325}
324 326
325/** 327/**
326 * has_capability_noaudit - Does a task have a capability (unaudited) 328 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
329 * in a specific user ns.
327 * @t: The task in question 330 * @t: The task in question
331 * @ns: target user namespace
328 * @cap: The capability to be tested for 332 * @cap: The capability to be tested for
329 * 333 *
330 * Return true if the specified task has the given superior capability 334 * Return true if the specified task has the given superior capability
331 * currently in effect to init_user_ns, false if not. Don't write an 335 * currently in effect to the specified user namespace, false if not.
332 * audit message for the check. 336 * Do not write an audit message for the check.
333 * 337 *
334 * Note that this does not set PF_SUPERPRIV on the task. 338 * Note that this does not set PF_SUPERPRIV on the task.
335 */ 339 */
336bool has_capability_noaudit(struct task_struct *t, int cap) 340bool has_ns_capability_noaudit(struct task_struct *t,
341 struct user_namespace *ns, int cap)
337{ 342{
338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap); 343 int ret;
344
345 rcu_read_lock();
346 ret = security_capable_noaudit(__task_cred(t), ns, cap);
347 rcu_read_unlock();
339 348
340 return (ret == 0); 349 return (ret == 0);
341} 350}
342 351
343/** 352/**
344 * capable - Determine if the current task has a superior capability in effect 353 * has_capability_noaudit - Does a task have a capability (unaudited) in the
354 * initial user ns
355 * @t: The task in question
345 * @cap: The capability to be tested for 356 * @cap: The capability to be tested for
346 * 357 *
347 * Return true if the current task has the given superior capability currently 358 * Return true if the specified task has the given superior capability
348 * available for use, false if not. 359 * currently in effect to init_user_ns, false if not. Don't write an
360 * audit message for the check.
349 * 361 *
350 * This sets PF_SUPERPRIV on the task if the capability is available on the 362 * Note that this does not set PF_SUPERPRIV on the task.
351 * assumption that it's about to be used.
352 */ 363 */
353bool capable(int cap) 364bool has_capability_noaudit(struct task_struct *t, int cap)
354{ 365{
355 return ns_capable(&init_user_ns, cap); 366 return has_ns_capability_noaudit(t, &init_user_ns, cap);
356} 367}
357EXPORT_SYMBOL(capable);
358 368
359/** 369/**
360 * ns_capable - Determine if the current task has a superior capability in effect 370 * ns_capable - Determine if the current task has a superior capability in effect
@@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
374 BUG(); 384 BUG();
375 } 385 }
376 386
377 if (security_capable(ns, current_cred(), cap) == 0) { 387 if (security_capable(current_cred(), ns, cap) == 0) {
378 current->flags |= PF_SUPERPRIV; 388 current->flags |= PF_SUPERPRIV;
379 return true; 389 return true;
380 } 390 }
@@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap)
383EXPORT_SYMBOL(ns_capable); 393EXPORT_SYMBOL(ns_capable);
384 394
385/** 395/**
386 * task_ns_capable - Determine whether current task has a superior 396 * capable - Determine if the current task has a superior capability in effect
387 * capability targeted at a specific task's user namespace. 397 * @cap: The capability to be tested for
388 * @t: The task whose user namespace is targeted. 398 *
389 * @cap: The capability in question. 399 * Return true if the current task has the given superior capability currently
400 * available for use, false if not.
390 * 401 *
391 * Return true if it does, false otherwise. 402 * This sets PF_SUPERPRIV on the task if the capability is available on the
403 * assumption that it's about to be used.
392 */ 404 */
393bool task_ns_capable(struct task_struct *t, int cap) 405bool capable(int cap)
394{ 406{
395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 407 return ns_capable(&init_user_ns, cap);
396} 408}
397EXPORT_SYMBOL(task_ns_capable); 409EXPORT_SYMBOL(capable);
398 410
399/** 411/**
400 * nsown_capable - Check superior capability to one's own user_ns 412 * nsown_capable - Check superior capability to one's own user_ns
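With the reordering above, has_capability() and has_capability_noaudit() become thin wrappers over their *_ns_* counterparts against &init_user_ns. A short usage sketch; the demo_* function is hypothetical, only the has_*capability() calls come from this file:

/* Hypothetical caller: check CAP_SYS_ADMIN in a specific user namespace,
 * then fall back to the initial namespace. Does not set PF_SUPERPRIV. */
static bool demo_task_is_admin(struct task_struct *t, struct user_namespace *ns)
{
	if (has_ns_capability(t, ns, CAP_SYS_ADMIN))
		return true;
	/* Equivalent to has_ns_capability(t, &init_user_ns, CAP_SYS_ADMIN). */
	return has_capability(t, CAP_SYS_ADMIN);
}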
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a184470cf9b5..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it.
69 *
70 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
71 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
72 * release_agent_path and so on. Modifying requires both cgroup_mutex and
73 * cgroup_root_mutex. Readers can acquire either of the two. This is to
74 * break the following locking order cycle.
75 *
76 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
77 * B. namespace_sem -> cgroup_mutex
78 *
79 * B happens only through cgroup_show_options() and using cgroup_root_mutex
80 * breaks it.
81 */
66static DEFINE_MUTEX(cgroup_mutex); 82static DEFINE_MUTEX(cgroup_mutex);
83static DEFINE_MUTEX(cgroup_root_mutex);
67 84
68/* 85/*
69 * Generate an array of cgroup subsystem pointers. At boot time, this is 86 * Generate an array of cgroup subsystem pointers. At boot time, this is
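The new locking comment is applied consistently in the hunks that follow (cgroup_remount(), cgroup_mount(), cgroup_kill_sb(), cgroup_release_agent_write()): writers nest cgroup_root_mutex inside cgroup_mutex and release in reverse order. A minimal sketch of that pattern; the demo_* function is illustrative, not part of the patch:

/* Illustrative writer: modifying cgroupfs_root state needs both mutexes,
 * taken in the documented order and dropped in reverse. */
static void demo_update_root(struct cgroupfs_root *root, unsigned long flags)
{
	mutex_lock(&cgroup_mutex);
	mutex_lock(&cgroup_root_mutex);

	root->flags = flags;	/* any cgroupfs_root field change goes here */

	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
}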
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 * -> cgroup_mkdir. 777 * -> cgroup_mkdir.
761 */ 778 */
762 779
763static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 780static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
764static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 781static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
765static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 782static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
766static int cgroup_populate_dir(struct cgroup *cgrp); 783static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
775static int alloc_css_id(struct cgroup_subsys *ss, 792static int alloc_css_id(struct cgroup_subsys *ss,
776 struct cgroup *parent, struct cgroup *child); 793 struct cgroup *parent, struct cgroup *child);
777 794
778static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 795static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
779{ 796{
780 struct inode *inode = new_inode(sb); 797 struct inode *inode = new_inode(sb);
781 798
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
921 * 938 *
922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 939 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
923 */ 940 */
924DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 941static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
925 942
926static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 943static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
927{ 944{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
953 int i; 970 int i;
954 971
955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 972 BUG_ON(!mutex_is_locked(&cgroup_mutex));
973 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
956 974
957 removed_bits = root->actual_subsys_bits & ~final_bits; 975 removed_bits = root->actual_subsys_bits & ~final_bits;
958 added_bits = final_bits & ~root->actual_subsys_bits; 976 added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1038 return 0; 1056 return 0;
1039} 1057}
1040 1058
1041static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1059static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1042{ 1060{
1043 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1061 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1044 struct cgroup_subsys *ss; 1062 struct cgroup_subsys *ss;
1045 1063
1046 mutex_lock(&cgroup_mutex); 1064 mutex_lock(&cgroup_root_mutex);
1047 for_each_subsys(root, ss) 1065 for_each_subsys(root, ss)
1048 seq_printf(seq, ",%s", ss->name); 1066 seq_printf(seq, ",%s", ss->name);
1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1067 if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1054 seq_puts(seq, ",clone_children"); 1072 seq_puts(seq, ",clone_children");
1055 if (strlen(root->name)) 1073 if (strlen(root->name))
1056 seq_printf(seq, ",name=%s", root->name); 1074 seq_printf(seq, ",name=%s", root->name);
1057 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_root_mutex);
1058 return 0; 1076 return 0;
1059} 1077}
1060 1078
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1175 1193
1176 /* 1194 /*
1177 * If the 'all' option was specified select all the subsystems, 1195 * If the 'all' option was specified select all the subsystems,
1178 * otherwise 'all, 'none' and a subsystem name options were not 1196 * otherwise if 'none', 'name=' and a subsystem name options
1179 * specified, let's default to 'all' 1197 * were not specified, let's default to 'all'
1180 */ 1198 */
1181 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1199 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1200 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1183 struct cgroup_subsys *ss = subsys[i]; 1201 struct cgroup_subsys *ss = subsys[i];
1184 if (ss == NULL) 1202 if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1269 1287
1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1288 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1271 mutex_lock(&cgroup_mutex); 1289 mutex_lock(&cgroup_mutex);
1290 mutex_lock(&cgroup_root_mutex);
1272 1291
1273 /* See what subsystems are wanted */ 1292 /* See what subsystems are wanted */
1274 ret = parse_cgroupfs_options(data, &opts); 1293 ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1297 out_unlock: 1316 out_unlock:
1298 kfree(opts.release_agent); 1317 kfree(opts.release_agent);
1299 kfree(opts.name); 1318 kfree(opts.name);
1319 mutex_unlock(&cgroup_root_mutex);
1300 mutex_unlock(&cgroup_mutex); 1320 mutex_unlock(&cgroup_mutex);
1301 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1302 return ret; 1322 return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1481 int ret = 0; 1501 int ret = 0;
1482 struct super_block *sb; 1502 struct super_block *sb;
1483 struct cgroupfs_root *new_root; 1503 struct cgroupfs_root *new_root;
1504 struct inode *inode;
1484 1505
1485 /* First find the desired set of subsystems */ 1506 /* First find the desired set of subsystems */
1486 mutex_lock(&cgroup_mutex); 1507 mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 /* We used the new root structure, so this is a new hierarchy */ 1535 /* We used the new root structure, so this is a new hierarchy */
1515 struct list_head tmp_cg_links; 1536 struct list_head tmp_cg_links;
1516 struct cgroup *root_cgrp = &root->top_cgroup; 1537 struct cgroup *root_cgrp = &root->top_cgroup;
1517 struct inode *inode;
1518 struct cgroupfs_root *existing_root; 1538 struct cgroupfs_root *existing_root;
1519 const struct cred *cred; 1539 const struct cred *cred;
1520 int i; 1540 int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1528 1548
1529 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
1530 mutex_lock(&cgroup_mutex); 1550 mutex_lock(&cgroup_mutex);
1551 mutex_lock(&cgroup_root_mutex);
1531 1552
1532 if (strlen(root->name)) { 1553 /* Check for name clashes with existing mounts */
1533 /* Check for name clashes with existing mounts */ 1554 ret = -EBUSY;
1534 for_each_active_root(existing_root) { 1555 if (strlen(root->name))
1535 if (!strcmp(existing_root->name, root->name)) { 1556 for_each_active_root(existing_root)
1536 ret = -EBUSY; 1557 if (!strcmp(existing_root->name, root->name))
1537 mutex_unlock(&cgroup_mutex); 1558 goto unlock_drop;
1538 mutex_unlock(&inode->i_mutex);
1539 goto drop_new_super;
1540 }
1541 }
1542 }
1543 1559
1544 /* 1560 /*
1545 * We're accessing css_set_count without locking 1561 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1549 * have some link structures left over 1565 * have some link structures left over
1550 */ 1566 */
1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1567 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1552 if (ret) { 1568 if (ret)
1553 mutex_unlock(&cgroup_mutex); 1569 goto unlock_drop;
1554 mutex_unlock(&inode->i_mutex);
1555 goto drop_new_super;
1556 }
1557 1570
1558 ret = rebind_subsystems(root, root->subsys_bits); 1571 ret = rebind_subsystems(root, root->subsys_bits);
1559 if (ret == -EBUSY) { 1572 if (ret == -EBUSY) {
1560 mutex_unlock(&cgroup_mutex);
1561 mutex_unlock(&inode->i_mutex);
1562 free_cg_links(&tmp_cg_links); 1573 free_cg_links(&tmp_cg_links);
1563 goto drop_new_super; 1574 goto unlock_drop;
1564 } 1575 }
1565 /* 1576 /*
1566 * There must be no failure case after here, since rebinding 1577 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1599 cred = override_creds(&init_cred); 1610 cred = override_creds(&init_cred);
1600 cgroup_populate_dir(root_cgrp); 1611 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred); 1612 revert_creds(cred);
1613 mutex_unlock(&cgroup_root_mutex);
1602 mutex_unlock(&cgroup_mutex); 1614 mutex_unlock(&cgroup_mutex);
1603 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1604 } else { 1616 } else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1615 kfree(opts.name); 1627 kfree(opts.name);
1616 return dget(sb->s_root); 1628 return dget(sb->s_root);
1617 1629
1630 unlock_drop:
1631 mutex_unlock(&cgroup_root_mutex);
1632 mutex_unlock(&cgroup_mutex);
1633 mutex_unlock(&inode->i_mutex);
1618 drop_new_super: 1634 drop_new_super:
1619 deactivate_locked_super(sb); 1635 deactivate_locked_super(sb);
1620 drop_modules: 1636 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1639 BUG_ON(!list_empty(&cgrp->sibling)); 1655 BUG_ON(!list_empty(&cgrp->sibling));
1640 1656
1641 mutex_lock(&cgroup_mutex); 1657 mutex_lock(&cgroup_mutex);
1658 mutex_lock(&cgroup_root_mutex);
1642 1659
1643 /* Rebind all subsystems back to the default hierarchy */ 1660 /* Rebind all subsystems back to the default hierarchy */
1644 ret = rebind_subsystems(root, 0); 1661 ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1664 root_count--; 1681 root_count--;
1665 } 1682 }
1666 1683
1684 mutex_unlock(&cgroup_root_mutex);
1667 mutex_unlock(&cgroup_mutex); 1685 mutex_unlock(&cgroup_mutex);
1668 1686
1669 kill_litter_super(sb); 1687 kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1740EXPORT_SYMBOL_GPL(cgroup_path); 1758EXPORT_SYMBOL_GPL(cgroup_path);
1741 1759
1742/* 1760/*
1761 * Control Group taskset
1762 */
1763struct task_and_cgroup {
1764 struct task_struct *task;
1765 struct cgroup *cgrp;
1766};
1767
1768struct cgroup_taskset {
1769 struct task_and_cgroup single;
1770 struct flex_array *tc_array;
1771 int tc_array_len;
1772 int idx;
1773 struct cgroup *cur_cgrp;
1774};
1775
1776/**
1777 * cgroup_taskset_first - reset taskset and return the first task
1778 * @tset: taskset of interest
1779 *
1780 * @tset iteration is initialized and the first task is returned.
1781 */
1782struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1783{
1784 if (tset->tc_array) {
1785 tset->idx = 0;
1786 return cgroup_taskset_next(tset);
1787 } else {
1788 tset->cur_cgrp = tset->single.cgrp;
1789 return tset->single.task;
1790 }
1791}
1792EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1793
1794/**
1795 * cgroup_taskset_next - iterate to the next task in taskset
1796 * @tset: taskset of interest
1797 *
1798 * Return the next task in @tset. Iteration must have been initialized
1799 * with cgroup_taskset_first().
1800 */
1801struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1802{
1803 struct task_and_cgroup *tc;
1804
1805 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1806 return NULL;
1807
1808 tc = flex_array_get(tset->tc_array, tset->idx++);
1809 tset->cur_cgrp = tc->cgrp;
1810 return tc->task;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1813
1814/**
1815 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1816 * @tset: taskset of interest
1817 *
1818 * Return the cgroup for the current (last returned) task of @tset. This
1819 * function must be preceded by either cgroup_taskset_first() or
1820 * cgroup_taskset_next().
1821 */
1822struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1823{
1824 return tset->cur_cgrp;
1825}
1826EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1827
1828/**
1829 * cgroup_taskset_size - return the number of tasks in taskset
1830 * @tset: taskset of interest
1831 */
1832int cgroup_taskset_size(struct cgroup_taskset *tset)
1833{
1834 return tset->tc_array ? tset->tc_array_len : 1;
1835}
1836EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1837
1838
1839/*
1743 * cgroup_task_migrate - move a task from one cgroup to another. 1840 * cgroup_task_migrate - move a task from one cgroup to another.
1744 * 1841 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task 1842 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with 1843 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1748 */ 1845 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee) 1847 struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1753 struct css_set *newcg; 1850 struct css_set *newcg;
1754 1851
1755 /* 1852 /*
1756 * get old css_set. we need to take task_lock and refcount it, because 1853 * We are synchronized through threadgroup_lock() against PF_EXITING
1757 * an exiting task can change its css_set to init_css_set and drop its 1854 * setting such that we can't race against cgroup_exit() changing the
1758 * old one without taking cgroup_mutex. 1855 * css_set to init_css_set and dropping the old one.
1759 */ 1856 */
1760 task_lock(tsk); 1857 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1761 oldcg = tsk->cgroups; 1858 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764 1859
1765 /* locate or allocate a new css_set for this task. */ 1860 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) { 1861 if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1775 might_sleep(); 1870 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */ 1871 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp); 1872 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) { 1873 if (!newcg)
1779 put_css_set(oldcg);
1780 return -ENOMEM; 1874 return -ENOMEM;
1781 }
1782 } 1875 }
1783 put_css_set(oldcg);
1784 1876
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk); 1877 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg); 1878 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk); 1879 task_unlock(tsk);
1794 1880
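Since the ->can_attach()/->attach()/->cancel_attach() callbacks now receive a struct cgroup_taskset instead of a single task (see the hunks below), a subsystem is expected to walk the set with the iterators added above. A hedged sketch of such a callback; the policy check and function name are invented:

/* Sketch of a subsystem ->can_attach() using the new taskset iterators. */
static int demo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset)) {
		if (task->flags & PF_KTHREAD)	/* hypothetical veto */
			return -EINVAL;
	}
	return 0;
}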
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1814 * @cgrp: the cgroup the task is attaching to 1900 * @cgrp: the cgroup the task is attaching to
1815 * @tsk: the task to be attached 1901 * @tsk: the task to be attached
1816 * 1902 *
1817 * Call holding cgroup_mutex. May take task_lock of 1903 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1818 * the task 'tsk' during call. 1904 * @tsk during call.
1819 */ 1905 */
1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1821{ 1907{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1909 struct cgroup_subsys *ss, *failed_ss = NULL;
1824 struct cgroup *oldcgrp; 1910 struct cgroup *oldcgrp;
1825 struct cgroupfs_root *root = cgrp->root; 1911 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { };
1913
1914 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING)
1916 return -ESRCH;
1826 1917
1827 /* Nothing to do if the task is already in that cgroup */ 1918 /* Nothing to do if the task is already in that cgroup */
1828 oldcgrp = task_cgroup_from_root(tsk, root); 1919 oldcgrp = task_cgroup_from_root(tsk, root);
1829 if (cgrp == oldcgrp) 1920 if (cgrp == oldcgrp)
1830 return 0; 1921 return 0;
1831 1922
1923 tset.single.task = tsk;
1924 tset.single.cgrp = oldcgrp;
1925
1832 for_each_subsys(root, ss) { 1926 for_each_subsys(root, ss) {
1833 if (ss->can_attach) { 1927 if (ss->can_attach) {
1834 retval = ss->can_attach(ss, cgrp, tsk); 1928 retval = ss->can_attach(ss, cgrp, &tset);
1835 if (retval) { 1929 if (retval) {
1836 /* 1930 /*
1837 * Remember on which subsystem the can_attach() 1931 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1843 goto out; 1937 goto out;
1844 } 1938 }
1845 } 1939 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1853 } 1940 }
1854 1941
1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1857 goto out; 1944 goto out;
1858 1945
1859 for_each_subsys(root, ss) { 1946 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1864 if (ss->attach) 1947 if (ss->attach)
1865 ss->attach(ss, cgrp, oldcgrp, tsk); 1948 ss->attach(ss, cgrp, &tset);
1866 } 1949 }
1867 1950
1868 synchronize_rcu(); 1951 synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
1884 */ 1967 */
1885 break; 1968 break;
1886 if (ss->cancel_attach) 1969 if (ss->cancel_attach)
1887 ss->cancel_attach(ss, cgrp, tsk); 1970 ss->cancel_attach(ss, cgrp, &tset);
1888 } 1971 }
1889 } 1972 }
1890 return retval; 1973 return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
1935 2018
1936 read_lock(&css_set_lock); 2019 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template); 2020 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock); 2021 read_unlock(&css_set_lock);
1941 2022
1942 /* doesn't exist at all? */ 2023 /* doesn't exist at all? */
1943 if (!newcg) 2024 if (!newcg)
1944 return false; 2025 return false;
1945 /* see if it's already in the list */ 2026 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) { 2027 list_for_each_entry(cg_entry, newcg_list, links)
1947 if (cg_entry->cg == newcg) { 2028 if (cg_entry->cg == newcg)
1948 put_css_set(newcg);
1949 return true; 2029 return true;
1950 }
1951 }
1952 2030
1953 /* not found */ 2031 /* not found */
1954 put_css_set(newcg);
1955 return false; 2032 return false;
1956} 2033}
1957 2034
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1985 * @cgrp: the cgroup to attach to 2062 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached 2063 * @leader: the threadgroup leader task_struct of the group to be attached
1987 * 2064 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2065 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1989 * take task_lock of each thread in leader's threadgroup individually in turn. 2066 * task_lock of each thread in leader's threadgroup individually in turn.
1990 */ 2067 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2068static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{ 2069{
1993 int retval, i, group_size; 2070 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2071 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */ 2072 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg; 2073 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root; 2074 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */ 2075 /* threadgroup list cursor and array */
2001 struct task_struct *tsk; 2076 struct task_struct *tsk;
2077 struct task_and_cgroup *tc;
2002 struct flex_array *group; 2078 struct flex_array *group;
2079 struct cgroup_taskset tset = { };
2003 /* 2080 /*
2004 * we need to make sure we have css_sets for all the tasks we're 2081 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in 2082 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2012 * step 0: in order to do expensive, possibly blocking operations for 2089 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs 2090 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the 2091 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing, 2092 * group - group_rwsem prevents new threads from appearing, and if
2016 * and if threads exit, this will just be an over-estimate. 2093 * threads exit, this will just be an over-estimate.
2017 */ 2094 */
2018 group_size = get_nr_threads(leader); 2095 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2096 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2097 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2021 GFP_KERNEL);
2022 if (!group) 2098 if (!group)
2023 return -ENOMEM; 2099 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2100 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2040 retval = -EAGAIN; 2116 retval = -EAGAIN;
2041 goto out_free_group_list; 2117 goto out_free_group_list;
2042 } 2118 }
2043 /* take a reference on each task in the group to go in the array. */ 2119
2044 tsk = leader; 2120 tsk = leader;
2045 i = 0; 2121 i = 0;
2046 do { 2122 do {
2123 struct task_and_cgroup ent;
2124
2125 /* @tsk either already exited or can't exit until the end */
2126 if (tsk->flags & PF_EXITING)
2127 continue;
2128
2047 /* as per above, nr_threads may decrease, but not increase. */ 2129 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size); 2130 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /* 2131 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2133 * earlier, but it's good form to communicate our expectations.
2053 */ 2134 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2135 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp)
2139 continue;
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2141 BUG_ON(retval != 0);
2056 i++; 2142 i++;
2057 } while_each_thread(leader, tsk); 2143 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2144 /* remember the number of threads in the array for later. */
2059 group_size = i; 2145 group_size = i;
2146 tset.tc_array = group;
2147 tset.tc_array_len = group_size;
2060 read_unlock(&tasklist_lock); 2148 read_unlock(&tasklist_lock);
2061 2149
2150 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0;
2152 if (!group_size)
2153 goto out_free_group_list;
2154
2062 /* 2155 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2156 * step 1: check that we can legitimately attach to the cgroup.
2064 */ 2157 */
2065 for_each_subsys(root, ss) { 2158 for_each_subsys(root, ss) {
2066 if (ss->can_attach) { 2159 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader); 2160 retval = ss->can_attach(ss, cgrp, &tset);
2068 if (retval) { 2161 if (retval) {
2069 failed_ss = ss; 2162 failed_ss = ss;
2070 goto out_cancel_attach; 2163 goto out_cancel_attach;
2071 } 2164 }
2072 } 2165 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 } 2166 }
2087 2167
2088 /* 2168 /*
@@ -2091,67 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2091 */ 2171 */
2092 INIT_LIST_HEAD(&newcg_list); 2172 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) { 2173 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i); 2174 tc = flex_array_get(group, i);
2095 /* nothing to do if this task is already in the cgroup */ 2175 oldcg = tc->task->cgroups;
2096 oldcgrp = task_cgroup_from_root(tsk, root); 2176
2097 if (cgrp == oldcgrp) 2177 /* if we don't already have it in the list get a new one */
2098 continue; 2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg,
2099 /* get old css_set pointer */ 2179 &newcg_list)) {
2100 task_lock(tsk);
2101 oldcg = tsk->cgroups;
2102 get_css_set(oldcg);
2103 task_unlock(tsk);
2104 /* see if the new one for us is already in the list? */
2105 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2106 /* was already there, nothing to do. */
2107 put_css_set(oldcg);
2108 } else {
2109 /* we don't already have it. get new one. */
2110 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2111 put_css_set(oldcg);
2112 if (retval) 2181 if (retval)
2113 goto out_list_teardown; 2182 goto out_list_teardown;
2114 } 2183 }
2115 } 2184 }
2116 2185
2117 /* 2186 /*
2118 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2187 * step 3: now that we're guaranteed success wrt the css_sets,
2119 * to move all tasks to the new cgroup, calling ss->attach_task for each 2188 * proceed to move all tasks to the new cgroup. There are no
2120 * one along the way. there are no failure cases after here, so this is 2189 * failure cases after here, so this is the commit point.
2121 * the commit point.
2122 */ 2190 */
2123 for_each_subsys(root, ss) {
2124 if (ss->pre_attach)
2125 ss->pre_attach(cgrp);
2126 }
2127 for (i = 0; i < group_size; i++) { 2191 for (i = 0; i < group_size; i++) {
2128 tsk = flex_array_get_ptr(group, i); 2192 tc = flex_array_get(group, i);
2129 /* leave current thread as it is if it's already there */ 2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
2130 oldcgrp = task_cgroup_from_root(tsk, root); 2194 BUG_ON(retval);
2131 if (cgrp == oldcgrp)
2132 continue;
2133 /* if the thread is PF_EXITING, it can just get skipped. */
2134 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2135 if (retval == 0) {
2136 /* attach each task to each subsystem */
2137 for_each_subsys(root, ss) {
2138 if (ss->attach_task)
2139 ss->attach_task(cgrp, tsk);
2140 }
2141 } else {
2142 BUG_ON(retval != -ESRCH);
2143 }
2144 } 2195 }
2145 /* nothing is sensitive to fork() after this point. */ 2196 /* nothing is sensitive to fork() after this point. */
2146 2197
2147 /* 2198 /*
2148 * step 4: do expensive, non-thread-specific subsystem callbacks. 2199 * step 4: do subsystem attach callbacks.
2149 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2150 * being moved, this call will need to be reworked to communicate that.
2151 */ 2200 */
2152 for_each_subsys(root, ss) { 2201 for_each_subsys(root, ss) {
2153 if (ss->attach) 2202 if (ss->attach)
2154 ss->attach(ss, cgrp, oldcgrp, leader); 2203 ss->attach(ss, cgrp, &tset);
2155 } 2204 }
2156 2205
2157 /* 2206 /*
@@ -2171,20 +2220,12 @@ out_cancel_attach:
2171 /* same deal as in cgroup_attach_task */ 2220 /* same deal as in cgroup_attach_task */
2172 if (retval) { 2221 if (retval) {
2173 for_each_subsys(root, ss) { 2222 for_each_subsys(root, ss) {
2174 if (ss == failed_ss) { 2223 if (ss == failed_ss)
2175 if (cancel_failed_ss && ss->cancel_attach)
2176 ss->cancel_attach(ss, cgrp, leader);
2177 break; 2224 break;
2178 }
2179 if (ss->cancel_attach) 2225 if (ss->cancel_attach)
2180 ss->cancel_attach(ss, cgrp, leader); 2226 ss->cancel_attach(ss, cgrp, &tset);
2181 } 2227 }
2182 } 2228 }
2183 /* clean up the array of referenced threads in the group. */
2184 for (i = 0; i < group_size; i++) {
2185 tsk = flex_array_get_ptr(group, i);
2186 put_task_struct(tsk);
2187 }
2188out_free_group_list: 2229out_free_group_list:
2189 flex_array_free(group); 2230 flex_array_free(group);
2190 return retval; 2231 return retval;
@@ -2192,8 +2233,8 @@ out_free_group_list:
2192 2233
2193/* 2234/*
2194 * Find the task_struct of the task to attach by vpid and pass it along to the 2235 * Find the task_struct of the task to attach by vpid and pass it along to the
2195 * function to attach either it or all tasks in its threadgroup. Will take 2236 * function to attach either it or all tasks in its threadgroup. Will lock
2196 * cgroup_mutex; may take task_lock of task. 2237 * cgroup_mutex and threadgroup; may take task_lock of task.
2197 */ 2238 */
2198static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2239static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2199{ 2240{
@@ -2220,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2220 * detect it later. 2261 * detect it later.
2221 */ 2262 */
2222 tsk = tsk->group_leader; 2263 tsk = tsk->group_leader;
2223 } else if (tsk->flags & PF_EXITING) {
2224 /* optimization for the single-task-only case */
2225 rcu_read_unlock();
2226 cgroup_unlock();
2227 return -ESRCH;
2228 } 2264 }
2229
2230 /* 2265 /*
2231 * even if we're attaching all tasks in the thread group, we 2266 * even if we're attaching all tasks in the thread group, we
2232 * only need to check permissions on one of them. 2267 * only need to check permissions on one of them.
@@ -2249,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2249 get_task_struct(tsk); 2284 get_task_struct(tsk);
2250 } 2285 }
2251 2286
2252 if (threadgroup) { 2287 threadgroup_lock(tsk);
2253 threadgroup_fork_write_lock(tsk); 2288
2289 if (threadgroup)
2254 ret = cgroup_attach_proc(cgrp, tsk); 2290 ret = cgroup_attach_proc(cgrp, tsk);
2255 threadgroup_fork_write_unlock(tsk); 2291 else
2256 } else {
2257 ret = cgroup_attach_task(cgrp, tsk); 2292 ret = cgroup_attach_task(cgrp, tsk);
2258 } 2293
2294 threadgroup_unlock(tsk);
2295
2259 put_task_struct(tsk); 2296 put_task_struct(tsk);
2260 cgroup_unlock(); 2297 cgroup_unlock();
2261 return ret; 2298 return ret;
@@ -2306,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2306 return -EINVAL; 2343 return -EINVAL;
2307 if (!cgroup_lock_live_group(cgrp)) 2344 if (!cgroup_lock_live_group(cgrp))
2308 return -ENODEV; 2345 return -ENODEV;
2346 mutex_lock(&cgroup_root_mutex);
2309 strcpy(cgrp->root->release_agent_path, buffer); 2347 strcpy(cgrp->root->release_agent_path, buffer);
2348 mutex_unlock(&cgroup_root_mutex);
2310 cgroup_unlock(); 2349 cgroup_unlock();
2311 return 0; 2350 return 0;
2312} 2351}
@@ -2585,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)
2585 return __d_cft(file->f_dentry); 2624 return __d_cft(file->f_dentry);
2586} 2625}
2587 2626
2588static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2627static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2589 struct super_block *sb) 2628 struct super_block *sb)
2590{ 2629{
2591 struct inode *inode; 2630 struct inode *inode;
@@ -2626,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2626 * @mode: mode to set on new directory. 2665 * @mode: mode to set on new directory.
2627 */ 2666 */
2628static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2667static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2629 mode_t mode) 2668 umode_t mode)
2630{ 2669{
2631 struct dentry *parent; 2670 struct dentry *parent;
2632 int error = 0; 2671 int error = 0;
@@ -2653,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2653 * returns S_IRUGO if it has only a read handler 2692 * returns S_IRUGO if it has only a read handler
2654 * returns S_IWUSR if it has only a write handler 2693
2655 */ 2694 */
2656static mode_t cgroup_file_mode(const struct cftype *cft) 2695static umode_t cgroup_file_mode(const struct cftype *cft)
2657{ 2696{
2658 mode_t mode = 0; 2697 umode_t mode = 0;
2659 2698
2660 if (cft->mode) 2699 if (cft->mode)
2661 return cft->mode; 2700 return cft->mode;
@@ -2678,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2678 struct dentry *dir = cgrp->dentry; 2717 struct dentry *dir = cgrp->dentry;
2679 struct dentry *dentry; 2718 struct dentry *dentry;
2680 int error; 2719 int error;
2681 mode_t mode; 2720 umode_t mode;
2682 2721
2683 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2684 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -2789,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
2789} 2828}
2790 2829
2791void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2830void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2831 __acquires(css_set_lock)
2792{ 2832{
2793 /* 2833 /*
2794 * The first time anyone tries to iterate across a cgroup, 2834 * The first time anyone tries to iterate across a cgroup,
@@ -2828,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2828} 2868}
2829 2869
2830void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2870void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2871 __releases(css_set_lock)
2831{ 2872{
2832 read_unlock(&css_set_lock); 2873 read_unlock(&css_set_lock);
2833} 2874}
@@ -3752,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3752 * Must be called with the mutex on the parent inode held 3793 * Must be called with the mutex on the parent inode held
3753 */ 3794 */
3754static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3795static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3755 mode_t mode) 3796 umode_t mode)
3756{ 3797{
3757 struct cgroup *cgrp; 3798 struct cgroup *cgrp;
3758 struct cgroupfs_root *root = parent->root; 3799 struct cgroupfs_root *root = parent->root;
@@ -3846,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3846 return err; 3887 return err;
3847} 3888}
3848 3889
3849static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3890static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3850{ 3891{
3851 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3892 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3852 3893
@@ -4491,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
4491 * 4532 *
4492 * A pointer to the shared css_set was automatically copied in 4533 * A pointer to the shared css_set was automatically copied in
4493 * fork.c by dup_task_struct(). However, we ignore that copy, since 4534 * fork.c by dup_task_struct(). However, we ignore that copy, since
4494 * it was not made under the protection of RCU or cgroup_mutex, so 4535 * it was not made under the protection of RCU, cgroup_mutex or
4495 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4536 * threadgroup_change_begin(), so it might no longer be a valid
4496 * have already changed current->cgroups, allowing the previously 4537 * cgroup pointer. cgroup_attach_task() might have already changed
4497 * referenced cgroup group to be removed and freed. 4538 * current->cgroups, allowing the previously referenced cgroup
4539 * group to be removed and freed.
4540 *
4541 * Outside the pointer validity we also need to process the css_set
 4542 * inheritance between threadgroup_change_begin() and
 4543 * threadgroup_change_end(), this way there is no leak in any process
4544 * wide migration performed by cgroup_attach_proc() that could otherwise
4545 * miss a thread because it is too early or too late in the fork stage.
4498 * 4546 *
4499 * At the point that cgroup_fork() is called, 'current' is the parent 4547 * At the point that cgroup_fork() is called, 'current' is the parent
4500 * task, and the passed argument 'child' points to the child task. 4548 * task, and the passed argument 'child' points to the child task.
4501 */ 4549 */
4502void cgroup_fork(struct task_struct *child) 4550void cgroup_fork(struct task_struct *child)
4503{ 4551{
4504 task_lock(current); 4552 /*
4553 * We don't need to task_lock() current because current->cgroups
4554 * can't be changed concurrently here. The parent obviously hasn't
4555 * exited and called cgroup_exit(), and we are synchronized against
4556 * cgroup migration through threadgroup_change_begin().
4557 */
4505 child->cgroups = current->cgroups; 4558 child->cgroups = current->cgroups;
4506 get_css_set(child->cgroups); 4559 get_css_set(child->cgroups);
4507 task_unlock(current);
4508 INIT_LIST_HEAD(&child->cg_list); 4560 INIT_LIST_HEAD(&child->cg_list);
4509} 4561}
4510 4562
@@ -4546,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
4546{ 4598{
4547 if (use_task_css_set_links) { 4599 if (use_task_css_set_links) {
4548 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4549 task_lock(child); 4601 if (list_empty(&child->cg_list)) {
4550 if (list_empty(&child->cg_list)) 4602 /*
4603 * It's safe to use child->cgroups without task_lock()
4604 * here because we are protected through
4605 * threadgroup_change_begin() against concurrent
4606 * css_set change in cgroup_task_migrate(). Also
4607 * the task can't exit at that point until
4608 * wake_up_new_task() is called, so we are protected
 4609 * against cgroup_exit() setting child->cgroups to
4610 * init_css_set.
4611 */
4551 list_add(&child->cg_list, &child->cgroups->tasks); 4612 list_add(&child->cg_list, &child->cgroups->tasks);
4552 task_unlock(child); 4613 }
4553 write_unlock(&css_set_lock); 4614 write_unlock(&css_set_lock);
4554 } 4615 }
4555} 4616}
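
The __acquires(css_set_lock)/__releases(css_set_lock) annotations added above only document, for sparse, the locking that the iterator pair already performs. A minimal sketch of a caller (the function name is illustrative, not part of this patch):

static void count_cgroup_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *task;
	int nr = 0;

	cgroup_iter_start(cgrp, &it);		/* read-locks css_set_lock */
	while ((task = cgroup_iter_next(cgrp, &it)))
		nr++;
	cgroup_iter_end(cgrp, &it);		/* drops css_set_lock */

	pr_info("cgroup has %d tasks\n", nr);
}
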
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 213c0351dad8..fc0646b78a64 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51static inline int __cgroup_freezing_or_frozen(struct task_struct *task) 51bool cgroup_freezing(struct task_struct *task)
52{ 52{
53 enum freezer_state state = task_freezer(task)->state; 53 enum freezer_state state;
54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); 54 bool ret;
55}
56 55
57int cgroup_freezing_or_frozen(struct task_struct *task) 56 rcu_read_lock();
58{ 57 state = task_freezer(task)->state;
59 int result; 58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
60 task_lock(task); 59 rcu_read_unlock();
61 result = __cgroup_freezing_or_frozen(task); 60
62 task_unlock(task); 61 return ret;
63 return result;
64} 62}
65 63
66/* 64/*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
102 * freezer_can_attach(): 100 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach) 101 * cgroup_mutex (held by caller of can_attach)
104 * 102 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock 104 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing) 105 * sighand->siglock (if the cgroup is freezing)
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;
130 * write_lock css_set_lock (cgroup iterator start) 125 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock 126 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start) 127 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
134 * sighand->siglock 129 * sighand->siglock
135 */ 130 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -150,7 +145,11 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
150static void freezer_destroy(struct cgroup_subsys *ss, 145static void freezer_destroy(struct cgroup_subsys *ss,
151 struct cgroup *cgroup) 146 struct cgroup *cgroup)
152{ 147{
153 kfree(cgroup_freezer(cgroup)); 148 struct freezer *freezer = cgroup_freezer(cgroup);
149
150 if (freezer->state != CGROUP_THAWED)
151 atomic_dec(&system_freezing_cnt);
152 kfree(freezer);
154} 153}
155 154
156/* task is frozen or will freeze immediately when next it gets woken */ 155/* task is frozen or will freeze immediately when next it gets woken */
@@ -167,13 +166,17 @@ static bool is_task_frozen_enough(struct task_struct *task)
167 */ 166 */
168static int freezer_can_attach(struct cgroup_subsys *ss, 167static int freezer_can_attach(struct cgroup_subsys *ss,
169 struct cgroup *new_cgroup, 168 struct cgroup *new_cgroup,
170 struct task_struct *task) 169 struct cgroup_taskset *tset)
171{ 170{
172 struct freezer *freezer; 171 struct freezer *freezer;
172 struct task_struct *task;
173 173
174 /* 174 /*
175 * Anything frozen can't move or be moved to/from. 175 * Anything frozen can't move or be moved to/from.
176 */ 176 */
177 cgroup_taskset_for_each(task, new_cgroup, tset)
178 if (cgroup_freezing(task))
179 return -EBUSY;
177 180
178 freezer = cgroup_freezer(new_cgroup); 181 freezer = cgroup_freezer(new_cgroup);
179 if (freezer->state != CGROUP_THAWED) 182 if (freezer->state != CGROUP_THAWED)
@@ -182,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
182 return 0; 185 return 0;
183} 186}
184 187
185static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
186{
187 rcu_read_lock();
188 if (__cgroup_freezing_or_frozen(tsk)) {
189 rcu_read_unlock();
190 return -EBUSY;
191 }
192 rcu_read_unlock();
193 return 0;
194}
195
196static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
197{ 189{
198 struct freezer *freezer; 190 struct freezer *freezer;
@@ -220,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
220 212
221 /* Locking avoids race with FREEZING -> THAWED transitions. */ 213 /* Locking avoids race with FREEZING -> THAWED transitions. */
222 if (freezer->state == CGROUP_FREEZING) 214 if (freezer->state == CGROUP_FREEZING)
223 freeze_task(task, true); 215 freeze_task(task);
224 spin_unlock_irq(&freezer->lock); 216 spin_unlock_irq(&freezer->lock);
225} 217}
226 218
@@ -238,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,
238 cgroup_iter_start(cgroup, &it); 230 cgroup_iter_start(cgroup, &it);
239 while ((task = cgroup_iter_next(cgroup, &it))) { 231 while ((task = cgroup_iter_next(cgroup, &it))) {
240 ntotal++; 232 ntotal++;
241 if (is_task_frozen_enough(task)) 233 if (freezing(task) && is_task_frozen_enough(task))
242 nfrozen++; 234 nfrozen++;
243 } 235 }
244 236
@@ -286,10 +278,9 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
286 struct task_struct *task; 278 struct task_struct *task;
287 unsigned int num_cant_freeze_now = 0; 279 unsigned int num_cant_freeze_now = 0;
288 280
289 freezer->state = CGROUP_FREEZING;
290 cgroup_iter_start(cgroup, &it); 281 cgroup_iter_start(cgroup, &it);
291 while ((task = cgroup_iter_next(cgroup, &it))) { 282 while ((task = cgroup_iter_next(cgroup, &it))) {
292 if (!freeze_task(task, true)) 283 if (!freeze_task(task))
293 continue; 284 continue;
294 if (is_task_frozen_enough(task)) 285 if (is_task_frozen_enough(task))
295 continue; 286 continue;
@@ -307,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
307 struct task_struct *task; 298 struct task_struct *task;
308 299
309 cgroup_iter_start(cgroup, &it); 300 cgroup_iter_start(cgroup, &it);
310 while ((task = cgroup_iter_next(cgroup, &it))) { 301 while ((task = cgroup_iter_next(cgroup, &it)))
311 thaw_process(task); 302 __thaw_task(task);
312 }
313 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
314
315 freezer->state = CGROUP_THAWED;
316} 304}
317 305
318static int freezer_change_state(struct cgroup *cgroup, 306static int freezer_change_state(struct cgroup *cgroup,
@@ -326,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,
326 spin_lock_irq(&freezer->lock); 314 spin_lock_irq(&freezer->lock);
327 315
328 update_if_frozen(cgroup, freezer); 316 update_if_frozen(cgroup, freezer);
329 if (goal_state == freezer->state)
330 goto out;
331 317
332 switch (goal_state) { 318 switch (goal_state) {
333 case CGROUP_THAWED: 319 case CGROUP_THAWED:
320 if (freezer->state != CGROUP_THAWED)
321 atomic_dec(&system_freezing_cnt);
322 freezer->state = CGROUP_THAWED;
334 unfreeze_cgroup(cgroup, freezer); 323 unfreeze_cgroup(cgroup, freezer);
335 break; 324 break;
336 case CGROUP_FROZEN: 325 case CGROUP_FROZEN:
326 if (freezer->state == CGROUP_THAWED)
327 atomic_inc(&system_freezing_cnt);
328 freezer->state = CGROUP_FREEZING;
337 retval = try_to_freeze_cgroup(cgroup, freezer); 329 retval = try_to_freeze_cgroup(cgroup, freezer);
338 break; 330 break;
339 default: 331 default:
340 BUG(); 332 BUG();
341 } 333 }
342out: 334
343 spin_unlock_irq(&freezer->lock); 335 spin_unlock_irq(&freezer->lock);
344 336
345 return retval; 337 return retval;
@@ -388,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
388 .populate = freezer_populate, 380 .populate = freezer_populate,
389 .subsys_id = freezer_subsys_id, 381 .subsys_id = freezer_subsys_id,
390 .can_attach = freezer_can_attach, 382 .can_attach = freezer_can_attach,
391 .can_attach_task = freezer_can_attach_task,
392 .pre_attach = NULL,
393 .attach_task = NULL,
394 .attach = NULL,
395 .fork = freezer_fork, 383 .fork = freezer_fork,
396 .exit = NULL,
397}; 384};
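
cgroup_freezing() now needs only rcu_read_lock(), and the THAWED/FREEZING transitions above keep system_freezing_cnt in step, so code outside this file can test a task cheaply. A hedged sketch of a hypothetical caller-side check (not the in-tree freezing() helper):

static bool task_may_need_freezing(struct task_struct *p)
{
	/* fast path: nothing in the system is freezing */
	if (!atomic_read(&system_freezing_cnt))
		return false;
	if (p->flags & PF_NOFREEZE)
		return false;
	/* per-cgroup state; safe under RCU alone, no task_lock() needed */
	return cgroup_freezing(p);
}
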
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..2060c6e57027 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
178 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
179 for_each_process(p) { 179 for_each_process(p) {
180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
181 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
182 !cputime_eq(p->stime, cputime_zero)))
183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
184 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
185 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
@@ -380,6 +379,7 @@ out:
380 cpu_maps_update_done(); 379 cpu_maps_update_done();
381 return err; 380 return err;
382} 381}
382EXPORT_SYMBOL_GPL(cpu_up);
383 383
384#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
385static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
@@ -470,7 +470,7 @@ out:
470 cpu_maps_update_done(); 470 cpu_maps_update_done();
471} 471}
472 472
473static int alloc_frozen_cpus(void) 473static int __init alloc_frozen_cpus(void)
474{ 474{
475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
476 return -ENOMEM; 476 return -ENOMEM;
@@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
543} 543}
544 544
545 545
546int cpu_hotplug_pm_sync_init(void) 546static int __init cpu_hotplug_pm_sync_init(void)
547{ 547{
548 pm_notifier(cpu_hotplug_pm_callback, 0); 548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0; 549 return 0;
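
alloc_frozen_cpus() and cpu_hotplug_pm_sync_init() can be marked __init because they only run from boot-time initcalls that set up PM notifiers. A sketch of that registration pattern, with hypothetical names:

static int my_pm_callback(struct notifier_block *nb,
			  unsigned long action, void *ptr)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		/* quiesce before suspend or hibernation */
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		/* undo the above on resume */
		break;
	}
	return NOTIFY_OK;
}

static int __init my_pm_sync_init(void)
{
	pm_notifier(my_pm_callback, 0);
	return 0;
}
core_initcall(my_pm_sync_init);
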
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b1712dba587..a09ac2b9a661 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1389,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
1389 return val; 1389 return val;
1390} 1390}
1391 1391
1392/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1393static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1394 struct task_struct *tsk)
1395{
1396 struct cpuset *cs = cgroup_cs(cont);
1397
1398 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1399 return -ENOSPC;
1400
1401 /*
1402 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1403 * cannot change their cpu affinity and isolating such threads by their
1404 * set of allowed nodes is unnecessary. Thus, cpusets are not
1405 * applicable for such threads. This prevents checking for success of
1406 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1407 * be changed.
1408 */
1409 if (tsk->flags & PF_THREAD_BOUND)
1410 return -EINVAL;
1411
1412 return 0;
1413}
1414
1415static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1416{
1417 return security_task_setscheduler(task);
1418}
1419
1420/* 1392/*
1421 * Protected by cgroup_lock. The nodemasks must be stored globally because 1393 * Protected by cgroup_lock. The nodemasks must be stored globally because
1422 * dynamically allocating them is not allowed in pre_attach, and they must 1394 * dynamically allocating them is not allowed in can_attach, and they must
1423 * persist among pre_attach, attach_task, and attach. 1395 * persist until attach.
1424 */ 1396 */
1425static cpumask_var_t cpus_attach; 1397static cpumask_var_t cpus_attach;
1426static nodemask_t cpuset_attach_nodemask_from; 1398static nodemask_t cpuset_attach_nodemask_from;
1427static nodemask_t cpuset_attach_nodemask_to; 1399static nodemask_t cpuset_attach_nodemask_to;
1428 1400
1429/* Set-up work for before attaching each task. */ 1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1430static void cpuset_pre_attach(struct cgroup *cont) 1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1403 struct cgroup_taskset *tset)
1431{ 1404{
1432 struct cpuset *cs = cgroup_cs(cont); 1405 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task;
1407 int ret;
1408
1409 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1410 return -ENOSPC;
1411
1412 cgroup_taskset_for_each(task, cgrp, tset) {
1413 /*
1414 * Kthreads bound to specific cpus cannot be moved to a new
1415 * cpuset; we cannot change their cpu affinity and
1416 * isolating such threads by their set of allowed nodes is
1417 * unnecessary. Thus, cpusets are not applicable for such
1418 * threads. This prevents checking for success of
1419 * set_cpus_allowed_ptr() on all attached tasks before
1420 * cpus_allowed may be changed.
1421 */
1422 if (task->flags & PF_THREAD_BOUND)
1423 return -EINVAL;
1424 if ((ret = security_task_setscheduler(task)))
1425 return ret;
1426 }
1433 1427
1428 /* prepare for attach */
1434 if (cs == &top_cpuset) 1429 if (cs == &top_cpuset)
1435 cpumask_copy(cpus_attach, cpu_possible_mask); 1430 cpumask_copy(cpus_attach, cpu_possible_mask);
1436 else 1431 else
1437 guarantee_online_cpus(cs, cpus_attach); 1432 guarantee_online_cpus(cs, cpus_attach);
1438 1433
1439 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1440}
1441
1442/* Per-thread attachment work. */
1443static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1444{
1445 int err;
1446 struct cpuset *cs = cgroup_cs(cont);
1447 1435
1448 /* 1436 return 0;
1449 * can_attach beforehand should guarantee that this doesn't fail.
1450 * TODO: have a better way to handle failure here
1451 */
1452 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1453 WARN_ON_ONCE(err);
1454
1455 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1456 cpuset_update_task_spread_flag(cs, tsk);
1457} 1437}
1458 1438
1459static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1460 struct cgroup *oldcont, struct task_struct *tsk) 1440 struct cgroup_taskset *tset)
1461{ 1441{
1462 struct mm_struct *mm; 1442 struct mm_struct *mm;
1463 struct cpuset *cs = cgroup_cs(cont); 1443 struct task_struct *task;
1464 struct cpuset *oldcs = cgroup_cs(oldcont); 1444 struct task_struct *leader = cgroup_taskset_first(tset);
1445 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1446 struct cpuset *cs = cgroup_cs(cgrp);
1447 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1448
1449 cgroup_taskset_for_each(task, cgrp, tset) {
1450 /*
1451 * can_attach beforehand should guarantee that this doesn't
1452 * fail. TODO: have a better way to handle failure here
1453 */
1454 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1455
1456 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1457 cpuset_update_task_spread_flag(cs, task);
1458 }
1465 1459
1466 /* 1460 /*
1467 * Change mm, possibly for multiple threads in a threadgroup. This is 1461 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1469,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1469 */ 1463 */
1470 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1464 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1471 cpuset_attach_nodemask_to = cs->mems_allowed; 1465 cpuset_attach_nodemask_to = cs->mems_allowed;
1472 mm = get_task_mm(tsk); 1466 mm = get_task_mm(leader);
1473 if (mm) { 1467 if (mm) {
1474 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1468 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1475 if (is_memory_migrate(cs)) 1469 if (is_memory_migrate(cs))
@@ -1925,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
1925 .create = cpuset_create, 1919 .create = cpuset_create,
1926 .destroy = cpuset_destroy, 1920 .destroy = cpuset_destroy,
1927 .can_attach = cpuset_can_attach, 1921 .can_attach = cpuset_can_attach,
1928 .can_attach_task = cpuset_can_attach_task,
1929 .pre_attach = cpuset_pre_attach,
1930 .attach_task = cpuset_attach_task,
1931 .attach = cpuset_attach, 1922 .attach = cpuset_attach,
1932 .populate = cpuset_populate, 1923 .populate = cpuset_populate,
1933 .post_clone = cpuset_post_clone, 1924 .post_clone = cpuset_post_clone,
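
With the cgroup_taskset API, the per-task callbacks (can_attach_task, pre_attach, attach_task) fold into can_attach/attach, and the old cgroup is recovered from the taskset instead of being passed in. A hedged skeleton of a subsystem using the new signatures (all names here are illustrative):

static int foo_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			  struct cgroup_taskset *tset)
{
	struct task_struct *task;

	cgroup_taskset_for_each(task, cgrp, tset)
		if (task->flags & PF_EXITING)
			return -ESRCH;	/* one bad task rejects the whole set */
	return 0;
}

static void foo_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		       struct cgroup_taskset *tset)
{
	struct task_struct *leader = cgroup_taskset_first(tset);
	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
	struct task_struct *task;

	cgroup_taskset_for_each(task, cgrp, tset) {
		/* per-task migration work goes here */
	}

	/* group-wide work can key off the leader and the old cgroup */
	pr_debug("moved group of pid %d from %p\n",
		 task_pid_nr(leader), oldcgrp);
}
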
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1982 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1983 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1984#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1985 kdb_printf("%4d ", module_refcount(mod)); 1985 kdb_printf("%4ld ", module_refcount(mod));
1986#endif 1986#endif
1987 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1988 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o 5obj-y := core.o ring_buffer.o callchain.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..6581a040f399
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,189 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118exit:
119 mutex_unlock(&callchain_mutex);
120
121 return err;
122}
123
124void put_callchain_buffers(void)
125{
126 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
127 release_callchain_buffers();
128 mutex_unlock(&callchain_mutex);
129 }
130}
131
132static struct perf_callchain_entry *get_callchain_entry(int *rctx)
133{
134 int cpu;
135 struct callchain_cpus_entries *entries;
136
137 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
138 if (*rctx == -1)
139 return NULL;
140
141 entries = rcu_dereference(callchain_cpus_entries);
142 if (!entries)
143 return NULL;
144
145 cpu = smp_processor_id();
146
147 return &entries->cpu_entries[cpu][*rctx];
148}
149
150static void
151put_callchain_entry(int rctx)
152{
153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
154}
155
156struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
157{
158 int rctx;
159 struct perf_callchain_entry *entry;
160
161
162 entry = get_callchain_entry(&rctx);
163 if (rctx == -1)
164 return NULL;
165
166 if (!entry)
167 goto exit_put;
168
169 entry->nr = 0;
170
171 if (!user_mode(regs)) {
172 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
173 perf_callchain_kernel(entry, regs);
174 if (current->mm)
175 regs = task_pt_regs(current);
176 else
177 regs = NULL;
178 }
179
180 if (regs) {
181 perf_callchain_store(entry, PERF_CONTEXT_USER);
182 perf_callchain_user(entry, regs);
183 }
184
185exit_put:
186 put_callchain_entry(rctx);
187
188 return entry;
189}
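
The split keeps the interface core.c already used: an event that samples callchains takes one reference on the shared per-cpu buffers at creation time and drops it in free_event(), while perf_callchain() runs from (possibly NMI) interrupt context. A sketch of the creation-time half, assuming the caller mirrors what core.c does:

static int my_event_wants_callchain(struct perf_event *event)
{
	/* one reference per live event that samples callchains */
	if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
		return get_callchain_buffers();
	return 0;
}
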
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58690af323e4..32b48c889711 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright Â© 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -128,7 +128,7 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 128 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 130 */
131struct jump_label_key perf_sched_events __read_mostly; 131struct jump_label_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
133 133
134static atomic_t nr_mmap_events __read_mostly; 134static atomic_t nr_mmap_events __read_mostly;
@@ -815,7 +815,7 @@ static void update_event_times(struct perf_event *event)
815 * here. 815 * here.
816 */ 816 */
817 if (is_cgroup_event(event)) 817 if (is_cgroup_event(event))
818 run_end = perf_event_time(event); 818 run_end = perf_cgroup_event_time(event);
819 else if (ctx->is_active) 819 else if (ctx->is_active)
820 run_end = ctx->time; 820 run_end = ctx->time;
821 else 821 else
@@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event,
1130 if (!is_software_event(event)) 1130 if (!is_software_event(event))
1131 cpuctx->active_oncpu--; 1131 cpuctx->active_oncpu--;
1132 ctx->nr_active--; 1132 ctx->nr_active--;
1133 if (event->attr.freq && event->attr.sample_freq)
1134 ctx->nr_freq--;
1133 if (event->attr.exclusive || !cpuctx->active_oncpu) 1135 if (event->attr.exclusive || !cpuctx->active_oncpu)
1134 cpuctx->exclusive = 0; 1136 cpuctx->exclusive = 0;
1135} 1137}
@@ -1325,6 +1327,7 @@ retry:
1325 } 1327 }
1326 raw_spin_unlock_irq(&ctx->lock); 1328 raw_spin_unlock_irq(&ctx->lock);
1327} 1329}
1330EXPORT_SYMBOL_GPL(perf_event_disable);
1328 1331
1329static void perf_set_shadow_time(struct perf_event *event, 1332static void perf_set_shadow_time(struct perf_event *event,
1330 struct perf_event_context *ctx, 1333 struct perf_event_context *ctx,
@@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event,
1406 if (!is_software_event(event)) 1409 if (!is_software_event(event))
1407 cpuctx->active_oncpu++; 1410 cpuctx->active_oncpu++;
1408 ctx->nr_active++; 1411 ctx->nr_active++;
1412 if (event->attr.freq && event->attr.sample_freq)
1413 ctx->nr_freq++;
1409 1414
1410 if (event->attr.exclusive) 1415 if (event->attr.exclusive)
1411 cpuctx->exclusive = 1; 1416 cpuctx->exclusive = 1;
@@ -1662,8 +1667,7 @@ retry:
1662 * Note: this works for group members as well as group leaders 1667 * Note: this works for group members as well as group leaders
1663 * since the non-leader members' sibling_lists will be empty. 1668 * since the non-leader members' sibling_lists will be empty.
1664 */ 1669 */
1665static void __perf_event_mark_enabled(struct perf_event *event, 1670static void __perf_event_mark_enabled(struct perf_event *event)
1666 struct perf_event_context *ctx)
1667{ 1671{
1668 struct perf_event *sub; 1672 struct perf_event *sub;
1669 u64 tstamp = perf_event_time(event); 1673 u64 tstamp = perf_event_time(event);
@@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info)
1701 */ 1705 */
1702 perf_cgroup_set_timestamp(current, ctx); 1706 perf_cgroup_set_timestamp(current, ctx);
1703 1707
1704 __perf_event_mark_enabled(event, ctx); 1708 __perf_event_mark_enabled(event);
1705 1709
1706 if (!event_filter_match(event)) { 1710 if (!event_filter_match(event)) {
1707 if (is_cgroup_event(event)) 1711 if (is_cgroup_event(event))
@@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
1782 1786
1783retry: 1787retry:
1784 if (!ctx->is_active) { 1788 if (!ctx->is_active) {
1785 __perf_event_mark_enabled(event, ctx); 1789 __perf_event_mark_enabled(event);
1786 goto out; 1790 goto out;
1787 } 1791 }
1788 1792
@@ -1809,6 +1813,7 @@ retry:
1809out: 1813out:
1810 raw_spin_unlock_irq(&ctx->lock); 1814 raw_spin_unlock_irq(&ctx->lock);
1811} 1815}
1816EXPORT_SYMBOL_GPL(perf_event_enable);
1812 1817
1813int perf_event_refresh(struct perf_event *event, int refresh) 1818int perf_event_refresh(struct perf_event *event, int refresh)
1814{ 1819{
@@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2327 u64 interrupts, now; 2332 u64 interrupts, now;
2328 s64 delta; 2333 s64 delta;
2329 2334
2335 if (!ctx->nr_freq)
2336 return;
2337
2330 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2338 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2331 if (event->state != PERF_EVENT_STATE_ACTIVE) 2339 if (event->state != PERF_EVENT_STATE_ACTIVE)
2332 continue; 2340 continue;
@@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2382{ 2390{
2383 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; 2391 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2384 struct perf_event_context *ctx = NULL; 2392 struct perf_event_context *ctx = NULL;
2385 int rotate = 0, remove = 1; 2393 int rotate = 0, remove = 1, freq = 0;
2386 2394
2387 if (cpuctx->ctx.nr_events) { 2395 if (cpuctx->ctx.nr_events) {
2388 remove = 0; 2396 remove = 0;
2389 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2397 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2390 rotate = 1; 2398 rotate = 1;
2399 if (cpuctx->ctx.nr_freq)
2400 freq = 1;
2391 } 2401 }
2392 2402
2393 ctx = cpuctx->task_ctx; 2403 ctx = cpuctx->task_ctx;
@@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2395 remove = 0; 2405 remove = 0;
2396 if (ctx->nr_events != ctx->nr_active) 2406 if (ctx->nr_events != ctx->nr_active)
2397 rotate = 1; 2407 rotate = 1;
2408 if (ctx->nr_freq)
2409 freq = 1;
2398 } 2410 }
2399 2411
2412 if (!rotate && !freq)
2413 goto done;
2414
2400 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2415 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2401 perf_pmu_disable(cpuctx->ctx.pmu); 2416 perf_pmu_disable(cpuctx->ctx.pmu);
2402 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2403 if (ctx)
2404 perf_ctx_adjust_freq(ctx, interval);
2405 2417
2406 if (!rotate) 2418 if (freq) {
2407 goto done; 2419 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2420 if (ctx)
2421 perf_ctx_adjust_freq(ctx, interval);
2422 }
2408 2423
2409 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2424 if (rotate) {
2410 if (ctx) 2425 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2411 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 2426 if (ctx)
2427 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2412 2428
2413 rotate_ctx(&cpuctx->ctx); 2429 rotate_ctx(&cpuctx->ctx);
2414 if (ctx) 2430 if (ctx)
2415 rotate_ctx(ctx); 2431 rotate_ctx(ctx);
2416 2432
2417 perf_event_sched_in(cpuctx, ctx, current); 2433 perf_event_sched_in(cpuctx, ctx, current);
2434 }
2435
2436 perf_pmu_enable(cpuctx->ctx.pmu);
2437 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2418 2438
2419done: 2439done:
2420 if (remove) 2440 if (remove)
2421 list_del_init(&cpuctx->rotation_list); 2441 list_del_init(&cpuctx->rotation_list);
2422
2423 perf_pmu_enable(cpuctx->ctx.pmu);
2424 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2425} 2442}
2426 2443
2427void perf_event_task_tick(void) 2444void perf_event_task_tick(void)
@@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
2448 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2465 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2449 return 0; 2466 return 0;
2450 2467
2451 __perf_event_mark_enabled(event, ctx); 2468 __perf_event_mark_enabled(event);
2452 2469
2453 return 1; 2470 return 1;
2454} 2471}
@@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2480 raw_spin_lock(&ctx->lock); 2497 raw_spin_lock(&ctx->lock);
2481 task_ctx_sched_out(ctx); 2498 task_ctx_sched_out(ctx);
2482 2499
2483 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2500 list_for_each_entry(event, &ctx->event_list, event_entry) {
2484 ret = event_enable_on_exec(event, ctx);
2485 if (ret)
2486 enabled = 1;
2487 }
2488
2489 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2490 ret = event_enable_on_exec(event, ctx); 2501 ret = event_enable_on_exec(event, ctx);
2491 if (ret) 2502 if (ret)
2492 enabled = 1; 2503 enabled = 1;
@@ -2574,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event)
2574} 2585}
2575 2586
2576/* 2587/*
2577 * Callchain support
2578 */
2579
2580struct callchain_cpus_entries {
2581 struct rcu_head rcu_head;
2582 struct perf_callchain_entry *cpu_entries[0];
2583};
2584
2585static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2586static atomic_t nr_callchain_events;
2587static DEFINE_MUTEX(callchain_mutex);
2588struct callchain_cpus_entries *callchain_cpus_entries;
2589
2590
2591__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2592 struct pt_regs *regs)
2593{
2594}
2595
2596__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2597 struct pt_regs *regs)
2598{
2599}
2600
2601static void release_callchain_buffers_rcu(struct rcu_head *head)
2602{
2603 struct callchain_cpus_entries *entries;
2604 int cpu;
2605
2606 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2607
2608 for_each_possible_cpu(cpu)
2609 kfree(entries->cpu_entries[cpu]);
2610
2611 kfree(entries);
2612}
2613
2614static void release_callchain_buffers(void)
2615{
2616 struct callchain_cpus_entries *entries;
2617
2618 entries = callchain_cpus_entries;
2619 rcu_assign_pointer(callchain_cpus_entries, NULL);
2620 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2621}
2622
2623static int alloc_callchain_buffers(void)
2624{
2625 int cpu;
2626 int size;
2627 struct callchain_cpus_entries *entries;
2628
2629 /*
2630 * We can't use the percpu allocation API for data that can be
2631 * accessed from NMI. Use a temporary manual per cpu allocation
2632 * until that gets sorted out.
2633 */
2634 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2635
2636 entries = kzalloc(size, GFP_KERNEL);
2637 if (!entries)
2638 return -ENOMEM;
2639
2640 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2641
2642 for_each_possible_cpu(cpu) {
2643 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2644 cpu_to_node(cpu));
2645 if (!entries->cpu_entries[cpu])
2646 goto fail;
2647 }
2648
2649 rcu_assign_pointer(callchain_cpus_entries, entries);
2650
2651 return 0;
2652
2653fail:
2654 for_each_possible_cpu(cpu)
2655 kfree(entries->cpu_entries[cpu]);
2656 kfree(entries);
2657
2658 return -ENOMEM;
2659}
2660
2661static int get_callchain_buffers(void)
2662{
2663 int err = 0;
2664 int count;
2665
2666 mutex_lock(&callchain_mutex);
2667
2668 count = atomic_inc_return(&nr_callchain_events);
2669 if (WARN_ON_ONCE(count < 1)) {
2670 err = -EINVAL;
2671 goto exit;
2672 }
2673
2674 if (count > 1) {
2675 /* If the allocation failed, give up */
2676 if (!callchain_cpus_entries)
2677 err = -ENOMEM;
2678 goto exit;
2679 }
2680
2681 err = alloc_callchain_buffers();
2682 if (err)
2683 release_callchain_buffers();
2684exit:
2685 mutex_unlock(&callchain_mutex);
2686
2687 return err;
2688}
2689
2690static void put_callchain_buffers(void)
2691{
2692 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2693 release_callchain_buffers();
2694 mutex_unlock(&callchain_mutex);
2695 }
2696}
2697
2698static int get_recursion_context(int *recursion)
2699{
2700 int rctx;
2701
2702 if (in_nmi())
2703 rctx = 3;
2704 else if (in_irq())
2705 rctx = 2;
2706 else if (in_softirq())
2707 rctx = 1;
2708 else
2709 rctx = 0;
2710
2711 if (recursion[rctx])
2712 return -1;
2713
2714 recursion[rctx]++;
2715 barrier();
2716
2717 return rctx;
2718}
2719
2720static inline void put_recursion_context(int *recursion, int rctx)
2721{
2722 barrier();
2723 recursion[rctx]--;
2724}
2725
2726static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2727{
2728 int cpu;
2729 struct callchain_cpus_entries *entries;
2730
2731 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2732 if (*rctx == -1)
2733 return NULL;
2734
2735 entries = rcu_dereference(callchain_cpus_entries);
2736 if (!entries)
2737 return NULL;
2738
2739 cpu = smp_processor_id();
2740
2741 return &entries->cpu_entries[cpu][*rctx];
2742}
2743
2744static void
2745put_callchain_entry(int rctx)
2746{
2747 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2748}
2749
2750static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2751{
2752 int rctx;
2753 struct perf_callchain_entry *entry;
2754
2755
2756 entry = get_callchain_entry(&rctx);
2757 if (rctx == -1)
2758 return NULL;
2759
2760 if (!entry)
2761 goto exit_put;
2762
2763 entry->nr = 0;
2764
2765 if (!user_mode(regs)) {
2766 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2767 perf_callchain_kernel(entry, regs);
2768 if (current->mm)
2769 regs = task_pt_regs(current);
2770 else
2771 regs = NULL;
2772 }
2773
2774 if (regs) {
2775 perf_callchain_store(entry, PERF_CONTEXT_USER);
2776 perf_callchain_user(entry, regs);
2777 }
2778
2779exit_put:
2780 put_callchain_entry(rctx);
2781
2782 return entry;
2783}
2784
2785/*
2786 * Initialize the perf_event context in a task_struct: 2588 * Initialize the perf_event context in a task_struct:
2787 */ 2589 */
2788static void __perf_event_init_context(struct perf_event_context *ctx) 2590static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event)
2946 2748
2947 if (!event->parent) { 2749 if (!event->parent) {
2948 if (event->attach_state & PERF_ATTACH_TASK) 2750 if (event->attach_state & PERF_ATTACH_TASK)
2949 jump_label_dec(&perf_sched_events); 2751 jump_label_dec_deferred(&perf_sched_events);
2950 if (event->attr.mmap || event->attr.mmap_data) 2752 if (event->attr.mmap || event->attr.mmap_data)
2951 atomic_dec(&nr_mmap_events); 2753 atomic_dec(&nr_mmap_events);
2952 if (event->attr.comm) 2754 if (event->attr.comm)
@@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event)
2957 put_callchain_buffers(); 2759 put_callchain_buffers();
2958 if (is_cgroup_event(event)) { 2760 if (is_cgroup_event(event)) {
2959 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2761 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2960 jump_label_dec(&perf_sched_events); 2762 jump_label_dec_deferred(&perf_sched_events);
2961 } 2763 }
2962 } 2764 }
2963 2765
@@ -4820,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4820 struct hw_perf_event *hwc = &event->hw; 4622 struct hw_perf_event *hwc = &event->hw;
4821 int throttle = 0; 4623 int throttle = 0;
4822 4624
4823 data->period = event->hw.last_period;
4824 if (!overflow) 4625 if (!overflow)
4825 overflow = perf_swevent_set_period(event); 4626 overflow = perf_swevent_set_period(event);
4826 4627
@@ -4854,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4854 if (!is_sampling_event(event)) 4655 if (!is_sampling_event(event))
4855 return; 4656 return;
4856 4657
4658 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4659 data->period = nr;
4660 return perf_swevent_overflow(event, 1, data, regs);
4661 } else
4662 data->period = event->hw.last_period;
4663
4857 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4664 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4858 return perf_swevent_overflow(event, 1, data, regs); 4665 return perf_swevent_overflow(event, 1, data, regs);
4859 4666
@@ -5366,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5366 regs = get_irq_regs(); 5173 regs = get_irq_regs();
5367 5174
5368 if (regs && !perf_exclude_event(event, regs)) { 5175 if (regs && !perf_exclude_event(event, regs)) {
5369 if (!(event->attr.exclude_idle && current->pid == 0)) 5176 if (!(event->attr.exclude_idle && is_idle_task(current)))
5370 if (perf_event_overflow(event, &data, regs)) 5177 if (perf_event_overflow(event, &data, regs))
5371 ret = HRTIMER_NORESTART; 5178 ret = HRTIMER_NORESTART;
5372 } 5179 }
@@ -5981,7 +5788,7 @@ done:
5981 5788
5982 if (!event->parent) { 5789 if (!event->parent) {
5983 if (event->attach_state & PERF_ATTACH_TASK) 5790 if (event->attach_state & PERF_ATTACH_TASK)
5984 jump_label_inc(&perf_sched_events); 5791 jump_label_inc(&perf_sched_events.key);
5985 if (event->attr.mmap || event->attr.mmap_data) 5792 if (event->attr.mmap || event->attr.mmap_data)
5986 atomic_inc(&nr_mmap_events); 5793 atomic_inc(&nr_mmap_events);
5987 if (event->attr.comm) 5794 if (event->attr.comm)
@@ -6219,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open,
6219 * - that may need work on context switch 6026 * - that may need work on context switch
6220 */ 6027 */
6221 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6028 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6222 jump_label_inc(&perf_sched_events); 6029 jump_label_inc(&perf_sched_events.key);
6223 } 6030 }
6224 6031
6225 /* 6032 /*
@@ -7065,6 +6872,9 @@ void __init perf_event_init(void)
7065 6872
7066 ret = init_hw_breakpoint(); 6873 ret = init_hw_breakpoint();
7067 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6874 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6875
6876 /* do not patch jump label more than once per second */
6877 jump_label_rate_limit(&perf_sched_events, HZ);
7068} 6878}
7069 6879
7070static int __init perf_event_sysfs_init(void) 6880static int __init perf_event_sysfs_init(void)
@@ -7131,10 +6941,13 @@ static int __perf_cgroup_move(void *info)
7131 return 0; 6941 return 0;
7132} 6942}
7133 6943
7134static void 6944static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7135perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) 6945 struct cgroup_taskset *tset)
7136{ 6946{
7137 task_function_call(task, __perf_cgroup_move, task); 6947 struct task_struct *task;
6948
6949 cgroup_taskset_for_each(task, cgrp, tset)
6950 task_function_call(task, __perf_cgroup_move, task);
7138} 6951}
7139 6952
7140static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 6953static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -7148,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7148 if (!(task->flags & PF_EXITING)) 6961 if (!(task->flags & PF_EXITING))
7149 return; 6962 return;
7150 6963
7151 perf_cgroup_attach_task(cgrp, task); 6964 task_function_call(task, __perf_cgroup_move, task);
7152} 6965}
7153 6966
7154struct cgroup_subsys perf_subsys = { 6967struct cgroup_subsys perf_subsys = {
@@ -7157,6 +6970,6 @@ struct cgroup_subsys perf_subsys = {
7157 .create = perf_cgroup_create, 6970 .create = perf_cgroup_create,
7158 .destroy = perf_cgroup_destroy, 6971 .destroy = perf_cgroup_destroy,
7159 .exit = perf_cgroup_exit, 6972 .exit = perf_cgroup_exit,
7160 .attach_task = perf_cgroup_attach_task, 6973 .attach = perf_cgroup_attach,
7161}; 6974};
7162#endif /* CONFIG_CGROUP_PERF */ 6975#endif /* CONFIG_CGROUP_PERF */
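
perf_sched_events becomes a deferred jump label so that destroying the last event does not patch kernel text immediately only to patch it back when the next event is created moments later. The pattern, sketched with an illustrative key (HZ is the rate limit this patch picks for perf):

static struct jump_label_key_deferred my_feature_key;

static void my_feature_init(void)
{
	/* batch the disabling side: at most one text patch per second */
	jump_label_rate_limit(&my_feature_key, HZ);
}

static void my_feature_get(void)
{
	jump_label_inc(&my_feature_key.key);		/* enable now */
}

static void my_feature_put(void)
{
	jump_label_dec_deferred(&my_feature_key);	/* disable later */
}
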
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 64568a699375..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5
6/* Buffer handling */
7
4#define RING_BUFFER_WRITABLE 0x01 8#define RING_BUFFER_WRITABLE 0x01
5 9
6struct ring_buffer { 10struct ring_buffer {
@@ -67,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
67} 71}
68#endif 72#endif
69 73
70static unsigned long perf_data_size(struct ring_buffer *rb) 74static inline unsigned long perf_data_size(struct ring_buffer *rb)
71{ 75{
72 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
73} 77}
@@ -96,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
96 } while (len); 100 } while (len);
97} 101}
98 102
103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
105extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void);
107
108static inline int get_recursion_context(int *recursion)
109{
110 int rctx;
111
112 if (in_nmi())
113 rctx = 3;
114 else if (in_irq())
115 rctx = 2;
116 else if (in_softirq())
117 rctx = 1;
118 else
119 rctx = 0;
120
121 if (recursion[rctx])
122 return -1;
123
124 recursion[rctx]++;
125 barrier();
126
127 return rctx;
128}
129
130static inline void put_recursion_context(int *recursion, int rctx)
131{
132 barrier();
133 recursion[rctx]--;
134}
135
99#endif /* _KERNEL_EVENTS_INTERNAL_H */ 136#endif /* _KERNEL_EVENTS_INTERNAL_H */
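
get_recursion_context()/put_recursion_context() move here as shared inlines: each CPU keeps one flag per context level (task, softirq, hardirq, NMI), and a caller that finds its level busy simply backs off. A sketch of the usual bracket, assuming preemption is already disabled as it is in the sampling paths:

static DEFINE_PER_CPU(int, my_recursion[PERF_NR_CONTEXTS]);

static void my_nmi_safe_work(void)
{
	int rctx;

	rctx = get_recursion_context(__get_cpu_var(my_recursion));
	if (rctx == -1)
		return;		/* already running at this context level */

	/* ... work that must not recurse within one context level ... */

	put_recursion_context(__get_cpu_var(my_recursion), rctx);
}
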
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 7f3011c6b57f..6ddaba43fb7a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright Â© 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
diff --git a/kernel/exit.c b/kernel/exit.c
index d0b7d988f873..294b1709170d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
121 * We won't ever get here for the group leader, since it 122 * We won't ever get here for the group leader, since it
122 * will have been the last reference on the signal_struct. 123 * will have been the last reference on the signal_struct.
123 */ 124 */
124 sig->utime = cputime_add(sig->utime, tsk->utime); 125 sig->utime += tsk->utime;
125 sig->stime = cputime_add(sig->stime, tsk->stime); 126 sig->stime += tsk->stime;
126 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 127 sig->gtime += tsk->gtime;
127 sig->min_flt += tsk->min_flt; 128 sig->min_flt += tsk->min_flt;
128 sig->maj_flt += tsk->maj_flt; 129 sig->maj_flt += tsk->maj_flt;
129 sig->nvcsw += tsk->nvcsw; 130 sig->nvcsw += tsk->nvcsw;
@@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk)
679 tsk->mm = NULL; 680 tsk->mm = NULL;
680 up_read(&mm->mmap_sem); 681 up_read(&mm->mmap_sem);
681 enter_lazy_tlb(mm, current); 682 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 task_unlock(tsk); 683 task_unlock(tsk);
685 mm_update_next_owner(mm); 684 mm_update_next_owner(mm);
686 mmput(mm); 685 mmput(mm);
@@ -888,7 +887,7 @@ static void check_stack_usage(void)
888static inline void check_stack_usage(void) {} 887static inline void check_stack_usage(void) {}
889#endif 888#endif
890 889
891NORET_TYPE void do_exit(long code) 890void do_exit(long code)
892{ 891{
893 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
894 int group_dead; 893 int group_dead;
@@ -965,8 +964,7 @@ NORET_TYPE void do_exit(long code)
965 acct_collect(code, group_dead); 964 acct_collect(code, group_dead);
966 if (group_dead) 965 if (group_dead)
967 tty_audit_exit(); 966 tty_audit_exit();
968 if (unlikely(tsk->audit_context)) 967 audit_free(tsk);
969 audit_free(tsk);
970 968
971 tsk->exit_code = code; 969 tsk->exit_code = code;
972 taskstats_exit(tsk, group_dead); 970 taskstats_exit(tsk, group_dead);
@@ -1037,9 +1035,12 @@ NORET_TYPE void do_exit(long code)
1037 validate_creds_for_do_exit(tsk); 1035 validate_creds_for_do_exit(tsk);
1038 1036
1039 preempt_disable(); 1037 preempt_disable();
1038 if (tsk->nr_dirtied)
1039 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1040 exit_rcu(); 1040 exit_rcu();
1041 /* causes final put_task_struct in finish_task_switch(). */ 1041 /* causes final put_task_struct in finish_task_switch(). */
1042 tsk->state = TASK_DEAD; 1042 tsk->state = TASK_DEAD;
1043 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
1043 schedule(); 1044 schedule();
1044 BUG(); 1045 BUG();
1045 /* Avoid "noreturn function does return". */ 1046 /* Avoid "noreturn function does return". */
@@ -1049,7 +1050,7 @@ NORET_TYPE void do_exit(long code)
1049 1050
1050EXPORT_SYMBOL_GPL(do_exit); 1051EXPORT_SYMBOL_GPL(do_exit);
1051 1052
1052NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1053void complete_and_exit(struct completion *comp, long code)
1053{ 1054{
1054 if (comp) 1055 if (comp)
1055 complete(comp); 1056 complete(comp);
@@ -1068,7 +1069,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
1068 * Take down every thread in the group. This is called by fatal signals 1069 * Take down every thread in the group. This is called by fatal signals
1069 * as well as by sys_exit_group (below). 1070 * as well as by sys_exit_group (below).
1070 */ 1071 */
1071NORET_TYPE void 1072void
1072do_group_exit(int exit_code) 1073do_group_exit(int exit_code)
1073{ 1074{
1074 struct signal_struct *sig = current->signal; 1075 struct signal_struct *sig = current->signal;
@@ -1255,19 +1256,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1255 spin_lock_irq(&p->real_parent->sighand->siglock); 1256 spin_lock_irq(&p->real_parent->sighand->siglock);
1256 psig = p->real_parent->signal; 1257 psig = p->real_parent->signal;
1257 sig = p->signal; 1258 sig = p->signal;
1258 psig->cutime = 1259 psig->cutime += tgutime + sig->cutime;
1259 cputime_add(psig->cutime, 1260 psig->cstime += tgstime + sig->cstime;
1260 cputime_add(tgutime, 1261 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1261 sig->cutime));
1262 psig->cstime =
1263 cputime_add(psig->cstime,
1264 cputime_add(tgstime,
1265 sig->cstime));
1266 psig->cgtime =
1267 cputime_add(psig->cgtime,
1268 cputime_add(p->gtime,
1269 cputime_add(sig->gtime,
1270 sig->cgtime)));
1271 psig->cmin_flt += 1262 psig->cmin_flt +=
1272 p->min_flt + sig->min_flt + sig->cmin_flt; 1263 p->min_flt + sig->min_flt + sig->cmin_flt;
1273 psig->cmaj_flt += 1264 psig->cmaj_flt +=
@@ -1540,8 +1531,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1540 } 1531 }
1541 1532
1542 /* dead body doesn't have much to contribute */ 1533 /* dead body doesn't have much to contribute */
1543 if (p->exit_state == EXIT_DEAD) 1534 if (unlikely(p->exit_state == EXIT_DEAD)) {
1535 /*
1536 * But do not ignore this task until the tracer does
1537 * wait_task_zombie()->do_notify_parent().
1538 */
1539 if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
1540 wo->notask_error = 0;
1544 return 0; 1541 return 0;
1542 }
1545 1543
1546 /* slay zombie? */ 1544 /* slay zombie? */
1547 if (p->exit_state == EXIT_ZOMBIE) { 1545 if (p->exit_state == EXIT_ZOMBIE) {
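
The cputime conversions in this file (and in fork.c below) rely on cputime_t now behaving as an ordinary scalar, so accumulation is plain integer arithmetic. A trivial sketch with a hypothetical helper:

static cputime_t total_cpu_time(struct signal_struct *sig,
				struct task_struct *t)
{
	/* cputime_t adds like an integer; no cputime_add()/cputime_zero */
	return sig->utime + sig->stime + t->utime + t->stime;
}
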
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..051f090d40c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
76 76
77#include <trace/events/sched.h> 77#include <trace/events/sched.h>
78 78
79#define CREATE_TRACE_POINTS
80#include <trace/events/task.h>
81
79/* 82/*
80 * Protected counters by write_lock_irq(&tasklist_lock) 83 * Protected counters by write_lock_irq(&tasklist_lock)
81 */ 84 */
@@ -870,6 +873,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
870{ 873{
871#ifdef CONFIG_BLOCK 874#ifdef CONFIG_BLOCK
872 struct io_context *ioc = current->io_context; 875 struct io_context *ioc = current->io_context;
876 struct io_context *new_ioc;
873 877
874 if (!ioc) 878 if (!ioc)
875 return 0; 879 return 0;
@@ -881,11 +885,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
881 if (unlikely(!tsk->io_context)) 885 if (unlikely(!tsk->io_context))
882 return -ENOMEM; 886 return -ENOMEM;
883 } else if (ioprio_valid(ioc->ioprio)) { 887 } else if (ioprio_valid(ioc->ioprio)) {
884 tsk->io_context = alloc_io_context(GFP_KERNEL, -1); 888 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
885 if (unlikely(!tsk->io_context)) 889 if (unlikely(!new_ioc))
886 return -ENOMEM; 890 return -ENOMEM;
887 891
888 tsk->io_context->ioprio = ioc->ioprio; 892 new_ioc->ioprio = ioc->ioprio;
893 put_io_context(new_ioc, NULL);
889 } 894 }
890#endif 895#endif
891 return 0; 896 return 0;
@@ -972,7 +977,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
972 sched_autogroup_fork(sig); 977 sched_autogroup_fork(sig);
973 978
974#ifdef CONFIG_CGROUPS 979#ifdef CONFIG_CGROUPS
975 init_rwsem(&sig->threadgroup_fork_lock); 980 init_rwsem(&sig->group_rwsem);
976#endif 981#endif
977 982
978 sig->oom_adj = current->signal->oom_adj; 983 sig->oom_adj = current->signal->oom_adj;
@@ -992,7 +997,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
992 new_flags |= PF_FORKNOEXEC; 997 new_flags |= PF_FORKNOEXEC;
993 new_flags |= PF_STARTING; 998 new_flags |= PF_STARTING;
994 p->flags = new_flags; 999 p->flags = new_flags;
995 clear_freeze_flag(p);
996} 1000}
997 1001
998SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1002SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1023,8 +1027,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1023 */ 1027 */
1024static void posix_cpu_timers_init(struct task_struct *tsk) 1028static void posix_cpu_timers_init(struct task_struct *tsk)
1025{ 1029{
1026 tsk->cputime_expires.prof_exp = cputime_zero; 1030 tsk->cputime_expires.prof_exp = 0;
1027 tsk->cputime_expires.virt_exp = cputime_zero; 1031 tsk->cputime_expires.virt_exp = 0;
1028 tsk->cputime_expires.sched_exp = 0; 1032 tsk->cputime_expires.sched_exp = 0;
1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1033 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1034 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1136,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1132 1136
1133 init_sigpending(&p->pending); 1137 init_sigpending(&p->pending);
1134 1138
1135 p->utime = cputime_zero; 1139 p->utime = p->stime = p->gtime = 0;
1136 p->stime = cputime_zero; 1140 p->utimescaled = p->stimescaled = 0;
1137 p->gtime = cputime_zero;
1138 p->utimescaled = cputime_zero;
1139 p->stimescaled = cputime_zero;
1140#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1141#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1141 p->prev_utime = cputime_zero; 1142 p->prev_utime = p->prev_stime = 0;
1142 p->prev_stime = cputime_zero;
1143#endif 1143#endif
1144#if defined(SPLIT_RSS_COUNTING) 1144#if defined(SPLIT_RSS_COUNTING)
1145 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1145 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1158,7 +1158,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1158 p->io_context = NULL; 1158 p->io_context = NULL;
1159 p->audit_context = NULL; 1159 p->audit_context = NULL;
1160 if (clone_flags & CLONE_THREAD) 1160 if (clone_flags & CLONE_THREAD)
1161 threadgroup_fork_read_lock(current); 1161 threadgroup_change_begin(current);
1162 cgroup_fork(p); 1162 cgroup_fork(p);
1163#ifdef CONFIG_NUMA 1163#ifdef CONFIG_NUMA
1164 p->mempolicy = mpol_dup(p->mempolicy); 1164 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1296,6 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1296 1296
1297 p->nr_dirtied = 0; 1297 p->nr_dirtied = 0;
1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1299 p->dirty_paused_when = 0;
1299 1300
1300 /* 1301 /*
1301 * Ok, make it visible to the rest of the system. 1302 * Ok, make it visible to the rest of the system.
@@ -1373,8 +1374,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1373 proc_fork_connector(p); 1374 proc_fork_connector(p);
1374 cgroup_post_fork(p); 1375 cgroup_post_fork(p);
1375 if (clone_flags & CLONE_THREAD) 1376 if (clone_flags & CLONE_THREAD)
1376 threadgroup_fork_read_unlock(current); 1377 threadgroup_change_end(current);
1377 perf_event_fork(p); 1378 perf_event_fork(p);
1379
1380 trace_task_newtask(p, clone_flags);
1381
1378 return p; 1382 return p;
1379 1383
1380bad_fork_free_pid: 1384bad_fork_free_pid:
@@ -1408,7 +1412,7 @@ bad_fork_cleanup_policy:
1408bad_fork_cleanup_cgroup: 1412bad_fork_cleanup_cgroup:
1409#endif 1413#endif
1410 if (clone_flags & CLONE_THREAD) 1414 if (clone_flags & CLONE_THREAD)
1411 threadgroup_fork_read_unlock(current); 1415 threadgroup_change_end(current);
1412 cgroup_exit(p, cgroup_callbacks_done); 1416 cgroup_exit(p, cgroup_callbacks_done);
1413 delayacct_tsk_free(p); 1417 delayacct_tsk_free(p);
1414 module_put(task_thread_info(p)->exec_domain->module); 1418 module_put(task_thread_info(p)->exec_domain->module);
@@ -1523,8 +1527,6 @@ long do_fork(unsigned long clone_flags,
1523 init_completion(&vfork); 1527 init_completion(&vfork);
1524 } 1528 }
1525 1529
1526 audit_finish_fork(p);
1527
1528 /* 1530 /*
1529 * We set PF_STARTING at creation in case tracing wants to 1531 * We set PF_STARTING at creation in case tracing wants to
1530 * use this to distinguish a fully live task from one that 1532 * use this to distinguish a fully live task from one that
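
fork.c now owns the task trace points (CREATE_TRACE_POINTS plus trace/events/task.h) and fires trace_task_newtask() once the child is fully constructed. A minimal sketch of a probe module for that tracepoint, assuming the usual TRACE_EVENT-generated register/unregister helpers; the probe body and names are illustrative only:

#include <linux/module.h>
#include <linux/sched.h>
#include <trace/events/task.h>

/* Tracepoint probes take a void *data argument followed by the
 * TP_PROTO arguments of the event. */
static void probe_task_newtask(void *ignore, struct task_struct *task,
                               unsigned long clone_flags)
{
        pr_info("task_newtask: %s (pid %d), clone_flags=%#lx\n",
                task->comm, task->pid, clone_flags);
}

static int __init newtask_probe_init(void)
{
        return register_trace_task_newtask(probe_task_newtask, NULL);
}

static void __exit newtask_probe_exit(void)
{
        unregister_trace_task_newtask(probe_task_newtask, NULL);
        tracepoint_synchronize_unregister();
}

module_init(newtask_probe_init);
module_exit(newtask_probe_exit);
MODULE_LICENSE("GPL");
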
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7be56c534397..9815b8d1eed5 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -9,101 +9,114 @@
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
12 13
13/* 14/* total number of freezing conditions in effect */
14 * freezing is complete, mark current process as frozen 15atomic_t system_freezing_cnt = ATOMIC_INIT(0);
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
15 */ 33 */
16static inline void frozen_process(void) 34bool freezing_slow_path(struct task_struct *p)
17{ 35{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 36 if (p->flags & PF_NOFREEZE)
19 current->flags |= PF_FROZEN; 37 return false;
20 smp_wmb(); 38
21 } 39 if (pm_nosig_freezing || cgroup_freezing(p))
22 clear_freeze_flag(current); 40 return true;
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
23} 46}
47EXPORT_SYMBOL(freezing_slow_path);
24 48
25/* Refrigerator is place where frozen processes are stored :-). */ 49/* Refrigerator is place where frozen processes are stored :-). */
26void refrigerator(void) 50bool __refrigerator(bool check_kthr_stop)
27{ 51{
28 /* Hmm, should we be allowed to suspend when there are realtime 52 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */ 53 processes around? */
30 long save; 54 bool was_frozen = false;
55 long save = current->state;
31 56
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 57 pr_debug("%s entered refrigerator\n", current->comm);
42 58
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
50 for (;;) { 59 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE); 60 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current)) 61
62 spin_lock_irq(&freezer_lock);
63 current->flags |= PF_FROZEN;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68
69 if (!(current->flags & PF_FROZEN))
53 break; 70 break;
71 was_frozen = true;
54 schedule(); 72 schedule();
55 } 73 }
56 74
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
60 pr_debug("%s left refrigerator\n", current->comm); 75 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save); 76
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83
84 return was_frozen;
62} 85}
63EXPORT_SYMBOL(refrigerator); 86EXPORT_SYMBOL(__refrigerator);
64 87
65static void fake_signal_wake_up(struct task_struct *p) 88static void fake_signal_wake_up(struct task_struct *p)
66{ 89{
67 unsigned long flags; 90 unsigned long flags;
68 91
69 spin_lock_irqsave(&p->sighand->siglock, flags); 92 if (lock_task_sighand(p, &flags)) {
70 signal_wake_up(p, 0); 93 signal_wake_up(p, 0);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags); 94 unlock_task_sighand(p, &flags);
95 }
72} 96}
73 97
74/** 98/**
75 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
76 * @p: task to send the request to 100 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the 101 *
78 * PF_FREEZER_NOSIG flag unset 102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
79 * Return value: 'false', if @sig_only is set and the task has 103 * flag and either sending a fake signal to it or waking it up, depending
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise 104 * on whether it has %PF_FREEZER_NOSIG set.
81 * 105 *
82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and 106 * RETURNS:
83 * either sending a fake signal to it or waking it up, depending on whether 107 * %false, if @p is not freezing or already frozen; %true, otherwise
84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
86 * TIF_FREEZE flag will not be set.
87 */ 108 */
88bool freeze_task(struct task_struct *p, bool sig_only) 109bool freeze_task(struct task_struct *p)
89{ 110{
90 /* 111 unsigned long flags;
91 * We first check if the task is freezing and next if it has already 112
92 * been frozen to avoid the race with frozen_process() which first marks 113 spin_lock_irqsave(&freezer_lock, flags);
93 * the task as frozen and next clears its TIF_FREEZE. 114 if (!freezing(p) || frozen(p)) {
94 */ 115 spin_unlock_irqrestore(&freezer_lock, flags);
95 if (!freezing(p)) { 116 return false;
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
104 } 117 }
105 118
106 if (should_send_signal(p)) { 119 if (!(p->flags & PF_KTHREAD)) {
107 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
108 /* 121 /*
109 * fake_signal_wake_up() goes through p's scheduler 122 * fake_signal_wake_up() goes through p's scheduler
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)
111 * TASK_RUNNING transition can't race with task state 124 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks(). 125 * testing in try_to_freeze_tasks().
113 */ 126 */
114 } else if (sig_only) {
115 return false;
116 } else { 127 } else {
117 wake_up_state(p, TASK_INTERRUPTIBLE); 128 wake_up_state(p, TASK_INTERRUPTIBLE);
118 } 129 }
119 130
131 spin_unlock_irqrestore(&freezer_lock, flags);
120 return true; 132 return true;
121} 133}
122 134
123void cancel_freezing(struct task_struct *p) 135void __thaw_task(struct task_struct *p)
124{ 136{
125 unsigned long flags; 137 unsigned long flags;
126 138
127 if (freezing(p)) { 139 /*
128 pr_debug(" clean up: %s\n", p->comm); 140 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
129 clear_freeze_flag(p); 141 * be visible to @p as waking up implies wmb. Waking up inside
130 spin_lock_irqsave(&p->sighand->siglock, flags); 142 * freezer_lock also prevents wakeups from leaking outside
131 recalc_sigpending_and_wake(p); 143 * refrigerator.
132 spin_unlock_irqrestore(&p->sighand->siglock, flags); 144 */
133 } 145 spin_lock_irqsave(&freezer_lock, flags);
134} 146 if (frozen(p))
135 147 wake_up_process(p);
136static int __thaw_process(struct task_struct *p) 148 spin_unlock_irqrestore(&freezer_lock, flags);
137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144} 149}
145 150
146/* 151/**
147 * Wake up a frozen process 152 * set_freezable - make %current freezable
148 * 153 *
149 * task_lock() is needed to prevent the race with refrigerator() which may 154 * Mark %current freezable and enter refrigerator if necessary.
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
154 */ 155 */
155int thaw_process(struct task_struct *p) 156bool set_freezable(void)
156{ 157{
157 task_lock(p); 158 might_sleep();
158 if (__thaw_process(p) == 1) { 159
159 task_unlock(p); 160 /*
160 wake_up_process(p); 161 * Modify flags while holding freezer_lock. This ensures the
161 return 1; 162 * freezer notices that we aren't frozen yet or the freezing
162 } 163 * condition is visible to try_to_freeze() below.
163 task_unlock(p); 164 */
164 return 0; 165 spin_lock_irq(&freezer_lock);
166 current->flags &= ~PF_NOFREEZE;
167 spin_unlock_irq(&freezer_lock);
168
169 return try_to_freeze();
165} 170}
166EXPORT_SYMBOL(thaw_process); 171EXPORT_SYMBOL(set_freezable);
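
The rewritten freezer drops the per-task TIF_FREEZE handshake: freezing() is now derived from system_freezing_cnt plus the pm_freezing/pm_nosig_freezing/cgroup conditions, and all transitions are serialized by freezer_lock. A rough sketch of the freezer side, loosely modelled on what try_to_freeze_tasks()/thaw_processes() are expected to do with these primitives (retry loops and error handling are elided, so treat it as illustration only):

#include <linux/freezer.h>
#include <linux/sched.h>

static void demo_freeze_userspace(void)
{
        struct task_struct *g, *p;

        pm_freezing = true;                 /* user-space freeze condition */
        atomic_inc(&system_freezing_cnt);   /* freezing() now takes the slow path */

        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
                if (p == current || (p->flags & PF_KTHREAD))
                        continue;
                freeze_task(p);             /* fake signal; task parks in __refrigerator() */
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
}

static void demo_thaw_userspace(void)
{
        struct task_struct *g, *p;

        atomic_dec(&system_freezing_cnt);
        pm_freezing = false;

        read_lock(&tasklist_lock);
        do_each_thread(g, p) {
                __thaw_task(p);             /* kick anything still sitting frozen */
        } while_each_thread(g, p);
        read_unlock(&tasklist_lock);
}
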
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a73dd6c7372d..b7952316016a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern int noirqdebug; 18extern bool noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..1f9e26526b69 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
135 return -EINVAL; 135 return -EINVAL;
136 if (intsize < 1) 136 if (intsize < 1)
137 return -EINVAL; 137 return -EINVAL;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
139 (intspec[0] >= d->hwirq_base + d->nr_irq)))
140 return -EINVAL;
138 141
139 *out_hwirq = intspec[0]; 142 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE; 143 *out_type = IRQ_TYPE_NONE;
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 146 return 0;
144} 147}
145 148
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 149/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 150 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 151 */
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 180}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 181EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 182#endif /* CONFIG_OF_IRQ */
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
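
The added check rejects device-tree interrupt specifiers whose first cell falls outside the domain's hwirq window, while nr_irq == 0 keeps the old accept-anything behaviour. A standalone restatement of that bounds test with hypothetical parameter names:

#include <linux/types.h>

/* true if hwirq lies in [hwirq_base, hwirq_base + nr_irq), or if the
 * domain declares no range at all (nr_irq == 0), mirroring the test
 * added to irq_domain_simple_dt_translate() above */
static bool hwirq_in_range(unsigned long hwirq,
                           unsigned int hwirq_base, unsigned int nr_irq)
{
        if (!nr_irq)
                return true;
        return hwirq >= hwirq_base && hwirq < hwirq_base + nr_irq;
}
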
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1da999f5e746..a9a9dbe49fea 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1292,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);
1292 * and to set up the interrupt handler in the right order. 1292 * and to set up the interrupt handler in the right order.
1293 * 1293 *
1294 * If you want to set up a threaded irq handler for your device 1294 * If you want to set up a threaded irq handler for your device
1295 * then you need to supply @handler and @thread_fn. @handler ist 1295 * then you need to supply @handler and @thread_fn. @handler is
1296 * still called in hard interrupt context and has to check 1296 * still called in hard interrupt context and has to check
1297 * whether the interrupt originates from the device. If yes it 1297 * whether the interrupt originates from the device. If yes it
1298 * needs to disable the interrupt on the device and return 1298 * needs to disable the interrupt on the device and return
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dc813a948be2..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -325,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
325 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
326} 326}
327 327
328int noirqdebug __read_mostly; 328bool noirqdebug __read_mostly;
329 329
330int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
331{ 331{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) { 55 if (cval) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime); 61 t = cputime.utime + cputime.stime;
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cputime_le(cval, t)) 66 if (cval < t)
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cputime_sub(cval, t); 70 cval = cval - t;
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (!cputime_eq(cval, cputime_zero) || 164 if (cval || nval) {
165 !cputime_eq(nval, cputime_zero)) { 165 if (nval > 0)
166 if (cputime_gt(nval, cputime_zero)) 166 nval += cputime_one_jiffy;
167 nval = cputime_add(nval, cputime_one_jiffy);
168 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
169 } 168 }
170 it->expires = nval; 169 it->expires = nval;
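
These kernel paths back the ITIMER_PROF/ITIMER_VIRTUAL interval timers; the conversion simply swaps the cputime_* comparison and arithmetic macros for plain operators. For context, a small user-space program that exercises this code path (ordinary POSIX C, not part of the patch):

#include <signal.h>
#include <stdio.h>
#include <sys/time.h>

static volatile sig_atomic_t ticks;

static void on_prof(int sig)
{
        (void)sig;
        ticks++;
}

int main(void)
{
        struct itimerval it = {
                .it_interval = { .tv_sec = 0, .tv_usec = 100000 },
                .it_value    = { .tv_sec = 0, .tv_usec = 100000 },
        };

        signal(SIGPROF, on_prof);
        setitimer(ITIMER_PROF, &it, NULL);   /* handled by set_cpu_itimer() above */

        while (ticks < 10)
                ;                            /* burn CPU time so SIGPROF fires */

        printf("received %d SIGPROF ticks\n", (int)ticks);
        return 0;
}
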
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 66ff7109f697..01d3b70fc98a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -71,15 +71,48 @@ void jump_label_inc(struct jump_label_key *key)
71 atomic_inc(&key->enabled); 71 atomic_inc(&key->enabled);
72 jump_label_unlock(); 72 jump_label_unlock();
73} 73}
74EXPORT_SYMBOL_GPL(jump_label_inc);
74 75
75void jump_label_dec(struct jump_label_key *key) 76static void __jump_label_dec(struct jump_label_key *key,
77 unsigned long rate_limit, struct delayed_work *work)
76{ 78{
77 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
78 return; 80 return;
79 81
80 jump_label_update(key, JUMP_LABEL_DISABLE); 82 if (rate_limit) {
83 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit);
85 } else
86 jump_label_update(key, JUMP_LABEL_DISABLE);
87
81 jump_label_unlock(); 88 jump_label_unlock();
82} 89}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91
92static void jump_label_update_timeout(struct work_struct *work)
93{
94 struct jump_label_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL);
97}
98
99void jump_label_dec(struct jump_label_key *key)
100{
101 __jump_label_dec(key, 0, NULL);
102}
103
104void jump_label_dec_deferred(struct jump_label_key_deferred *key)
105{
106 __jump_label_dec(&key->key, key->timeout, &key->work);
107}
108
109
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl)
112{
113 key->timeout = rl;
114 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
115}
83 116
84static int addr_conflict(struct jump_entry *entry, void *start, void *end) 117static int addr_conflict(struct jump_entry *entry, void *start, void *end)
85{ 118{
@@ -111,7 +144,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
111 * running code can override this to make the non-live update case 144 * running code can override this to make the non-live update case
112 * cheaper. 145 * cheaper.
113 */ 146 */
114void __weak arch_jump_label_transform_static(struct jump_entry *entry, 147void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
115 enum jump_label_type type) 148 enum jump_label_type type)
116{ 149{
117 arch_jump_label_transform(entry, type); 150 arch_jump_label_transform(entry, type);
@@ -217,8 +250,13 @@ void jump_label_apply_nops(struct module *mod)
217 if (iter_start == iter_stop) 250 if (iter_start == iter_stop)
218 return; 251 return;
219 252
220 for (iter = iter_start; iter < iter_stop; iter++) 253 for (iter = iter_start; iter < iter_stop; iter++) {
221 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 254 struct jump_label_key *iterk;
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 }
222} 260}
223 261
224static int jump_label_add_module(struct module *mod) 262static int jump_label_add_module(struct module *mod)
@@ -258,8 +296,7 @@ static int jump_label_add_module(struct module *mod)
258 key->next = jlm; 296 key->next = jlm;
259 297
260 if (jump_label_enabled(key)) 298 if (jump_label_enabled(key))
261 __jump_label_update(key, iter, iter_stop, 299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
262 JUMP_LABEL_ENABLE);
263 } 300 }
264 301
265 return 0; 302 return 0;
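
jump_label_dec_deferred()/jump_label_rate_limit() let a hot key batch its disable transitions so rapid inc/dec cycles do not thrash code patching (perf's scheduler-events key is the intended first user). A minimal usage sketch; the key name and call sites are hypothetical:

#include <linux/jump_label.h>
#include <linux/jiffies.h>

static struct jump_label_key_deferred my_event_key;

static void my_event_key_init(void)
{
        /* defer the final disable by up to one second */
        jump_label_rate_limit(&my_event_key, HZ);
}

static void my_event_enable(void)
{
        jump_label_inc(&my_event_key.key);
}

static void my_event_disable(void)
{
        /* deferred; if the key is bumped again before the delayed work
         * runs, no re-patching happens at all */
        jump_label_dec_deferred(&my_event_key);
}
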
diff --git a/kernel/kexec.c b/kernel/kexec.c
index dc7bc0829286..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h> 35#include <linux/syscore_ops.h>
37 36
38#include <asm/page.h> 37#include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
1094 if (kexec_crash_image) { 1093 if (kexec_crash_image) {
1095 struct pt_regs fixed_regs; 1094 struct pt_regs fixed_regs;
1096 1095
1097 kmsg_dump(KMSG_DUMP_KEXEC);
1098
1099 crash_setup_regs(&fixed_regs, regs); 1096 crash_setup_regs(&fixed_regs, regs);
1100 crash_save_vmcoreinfo(); 1097 crash_save_vmcoreinfo();
1101 machine_crash_shutdown(&fixed_regs); 1098 machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
1132{ 1129{
1133 int ret = 0; 1130 int ret = 0;
1134 unsigned long start, end; 1131 unsigned long start, end;
1132 unsigned long old_size;
1133 struct resource *ram_res;
1135 1134
1136 mutex_lock(&kexec_mutex); 1135 mutex_lock(&kexec_mutex);
1137 1136
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
1141 } 1140 }
1142 start = crashk_res.start; 1141 start = crashk_res.start;
1143 end = crashk_res.end; 1142 end = crashk_res.end;
1143 old_size = (end == 0) ? 0 : end - start + 1;
1144 if (new_size >= old_size) {
1145 ret = (new_size == old_size) ? 0 : -EINVAL;
1146 goto unlock;
1147 }
1144 1148
1145 if (new_size >= end - start + 1) { 1149 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1146 ret = -EINVAL; 1150 if (!ram_res) {
1147 if (new_size == end - start + 1) 1151 ret = -ENOMEM;
1148 ret = 0;
1149 goto unlock; 1152 goto unlock;
1150 } 1153 }
1151 1154
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
1157 1160
1158 if ((start == end) && (crashk_res.parent != NULL)) 1161 if ((start == end) && (crashk_res.parent != NULL))
1159 release_resource(&crashk_res); 1162 release_resource(&crashk_res);
1163
1164 ram_res->start = end;
1165 ram_res->end = crashk_res.end;
1166 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167 ram_res->name = "System RAM";
1168
1160 crashk_res.end = end - 1; 1169 crashk_res.end = end - 1;
1170
1171 insert_resource(&iomem_resource, ram_res);
1161 crash_unmap_reserved_pages(); 1172 crash_unmap_reserved_pages();
1162 1173
1163unlock: 1174unlock:
@@ -1523,7 +1534,7 @@ int kernel_kexec(void)
1523 1534
1524#ifdef CONFIG_KEXEC_JUMP 1535#ifdef CONFIG_KEXEC_JUMP
1525 if (kexec_image->preserve_context) { 1536 if (kexec_image->preserve_context) {
1526 mutex_lock(&pm_mutex); 1537 lock_system_sleep();
1527 pm_prepare_console(); 1538 pm_prepare_console();
1528 error = freeze_processes(); 1539 error = freeze_processes();
1529 if (error) { 1540 if (error) {
@@ -1576,7 +1587,7 @@ int kernel_kexec(void)
1576 thaw_processes(); 1587 thaw_processes();
1577 Restore_console: 1588 Restore_console:
1578 pm_restore_console(); 1589 pm_restore_console();
1579 mutex_unlock(&pm_mutex); 1590 unlock_system_sleep();
1580 } 1591 }
1581#endif 1592#endif
1582 1593
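
crash_shrink_memory() now computes the old reservation size up front (treating an unreserved region, end == 0, as size zero), refuses to grow, and pre-allocates the "System RAM" resource so the shrink cannot fail halfway through. A sketch of just that size decision, with hypothetical names:

/* Mirrors the early-exit logic of the reworked crash_shrink_memory():
 * 0 means nothing to do, -EINVAL means growth was requested,
 * 1 means the caller should go ahead and shrink. */
static int crash_resize_check(unsigned long start, unsigned long end,
                              unsigned long new_size)
{
        unsigned long old_size = (end == 0) ? 0 : end - start + 1;

        if (new_size == old_size)
                return 0;
        if (new_size > old_size)
                return -EINVAL;
        return 1;
}
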
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a4bea97c75b6..a0a88543934e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <trace/events/module.h> 42#include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 52static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock); 53static DEFINE_SPINLOCK(umh_sysctl_lock);
54static DECLARE_RWSEM(umhelper_sem);
53 55
54#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
55 57
@@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
275 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 277 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
276 * (used for preventing user land processes from being created after the user 278 * (used for preventing user land processes from being created after the user
277 * land has been frozen during a system-wide hibernation or suspend operation). 279 * land has been frozen during a system-wide hibernation or suspend operation).
280 * Should always be manipulated under umhelper_sem acquired for write.
278 */ 281 */
279static int usermodehelper_disabled = 1; 282static int usermodehelper_disabled = 1;
280 283
@@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1;
282static atomic_t running_helpers = ATOMIC_INIT(0); 285static atomic_t running_helpers = ATOMIC_INIT(0);
283 286
284/* 287/*
285 * Wait queue head used by usermodehelper_pm_callback() to wait for all running 288 * Wait queue head used by usermodehelper_disable() to wait for all running
286 * helpers to finish. 289 * helpers to finish.
287 */ 290 */
288static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 291static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
289 292
290/* 293/*
291 * Time to wait for running_helpers to become zero before the setting of 294 * Time to wait for running_helpers to become zero before the setting of
292 * usermodehelper_disabled in usermodehelper_pm_callback() fails 295 * usermodehelper_disabled in usermodehelper_disable() fails
293 */ 296 */
294#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 297#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
295 298
299void read_lock_usermodehelper(void)
300{
301 down_read(&umhelper_sem);
302}
303EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
304
305void read_unlock_usermodehelper(void)
306{
307 up_read(&umhelper_sem);
308}
309EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
310
296/** 311/**
297 * usermodehelper_disable - prevent new helpers from being started 312 * usermodehelper_disable - prevent new helpers from being started
298 */ 313 */
@@ -300,8 +315,10 @@ int usermodehelper_disable(void)
300{ 315{
301 long retval; 316 long retval;
302 317
318 down_write(&umhelper_sem);
303 usermodehelper_disabled = 1; 319 usermodehelper_disabled = 1;
304 smp_mb(); 320 up_write(&umhelper_sem);
321
305 /* 322 /*
306 * From now on call_usermodehelper_exec() won't start any new 323 * From now on call_usermodehelper_exec() won't start any new
307 * helpers, so it is sufficient if running_helpers turns out to 324 * helpers, so it is sufficient if running_helpers turns out to
@@ -314,7 +331,9 @@ int usermodehelper_disable(void)
314 if (retval) 331 if (retval)
315 return 0; 332 return 0;
316 333
334 down_write(&umhelper_sem);
317 usermodehelper_disabled = 0; 335 usermodehelper_disabled = 0;
336 up_write(&umhelper_sem);
318 return -EAGAIN; 337 return -EAGAIN;
319} 338}
320 339
@@ -323,7 +342,9 @@ int usermodehelper_disable(void)
323 */ 342 */
324void usermodehelper_enable(void) 343void usermodehelper_enable(void)
325{ 344{
345 down_write(&umhelper_sem);
326 usermodehelper_disabled = 0; 346 usermodehelper_disabled = 0;
347 up_write(&umhelper_sem);
327} 348}
328 349
329/** 350/**
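
The new umhelper_sem turns usermodehelper_disabled into properly published state: the writers (disable/enable) take it for writing, and callers that must not race with the freezer can hold it for reading across their check-and-call window. A hedged sketch of the intended caller pattern, assuming the existing usermodehelper_is_disabled() helper; the wrapper name is hypothetical:

#include <linux/kmod.h>

static int launch_helper(char *path, char **argv, char **envp)
{
        int ret;

        read_lock_usermodehelper();
        if (usermodehelper_is_disabled()) {
                /* e.g. tasks are frozen for suspend/hibernate */
                read_unlock_usermodehelper();
                return -EBUSY;
        }
        ret = call_usermodehelper(path, argv, envp, UMH_WAIT_EXEC);
        read_unlock_usermodehelper();

        return ret;
}
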
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823b..29f5b65bee29 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1077 /* Early boot. kretprobe_table_locks not yet initialized. */ 1077 /* Early boot. kretprobe_table_locks not yet initialized. */
1078 return; 1078 return;
1079 1079
1080 INIT_HLIST_HEAD(&empty_rp);
1080 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1081 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1081 head = &kretprobe_inst_table[hash]; 1082 head = &kretprobe_inst_table[hash];
1082 kretprobe_table_lock(hash, &flags); 1083 kretprobe_table_lock(hash, &flags);
@@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1085 recycle_rp_inst(ri, &empty_rp); 1086 recycle_rp_inst(ri, &empty_rp);
1086 } 1087 }
1087 kretprobe_table_unlock(hash, &flags); 1088 kretprobe_table_unlock(hash, &flags);
1088 INIT_HLIST_HEAD(&empty_rp);
1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
1090 hlist_del(&ri->hlist); 1090 hlist_del(&ri->hlist);
1091 kfree(ri); 1091 kfree(ri);
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2198 const char __user *user_buf, size_t count, loff_t *ppos) 2198 const char __user *user_buf, size_t count, loff_t *ppos)
2199{ 2199{
2200 char buf[32]; 2200 char buf[32];
2201 int buf_size; 2201 size_t buf_size;
2202 2202
2203 buf_size = min(count, (sizeof(buf)-1)); 2203 buf_size = min(count, (sizeof(buf)-1));
2204 if (copy_from_user(buf, user_buf, buf_size)) 2204 if (copy_from_user(buf, user_buf, buf_size))
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6d216a92639..3d3de633702e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -59,6 +59,31 @@ int kthread_should_stop(void)
59EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
60 60
61/** 61/**
62 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 *
65 * kthread_should_stop() for freezable kthreads, which will enter
66 * refrigerator if necessary. This function is safe from kthread_stop() /
67 * freezer deadlock and freezable kthreads should use this function instead
68 * of calling try_to_freeze() directly.
69 */
70bool kthread_freezable_should_stop(bool *was_frozen)
71{
72 bool frozen = false;
73
74 might_sleep();
75
76 if (unlikely(freezing(current)))
77 frozen = __refrigerator(true);
78
79 if (was_frozen)
80 *was_frozen = frozen;
81
82 return kthread_should_stop();
83}
84EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
85
86/**
62 * kthread_data - return data value specified on kthread creation 87 * kthread_data - return data value specified on kthread creation
63 * @task: kthread task in question 88 * @task: kthread task in question
64 * 89 *
@@ -257,7 +282,7 @@ int kthreadd(void *unused)
257 set_cpus_allowed_ptr(tsk, cpu_all_mask); 282 set_cpus_allowed_ptr(tsk, cpu_all_mask);
258 set_mems_allowed(node_states[N_HIGH_MEMORY]); 283 set_mems_allowed(node_states[N_HIGH_MEMORY]);
259 284
260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 285 current->flags |= PF_NOFREEZE;
261 286
262 for (;;) { 287 for (;;) {
263 set_current_state(TASK_INTERRUPTIBLE); 288 set_current_state(TASK_INTERRUPTIBLE);
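
kthread_freezable_should_stop() folds the freeze check into the stop check so a freezable kthread cannot deadlock against kthread_stop() while it sits in the refrigerator. The kernel-doc above implies a main loop along these lines (the thread body is hypothetical):

#include <linux/freezer.h>
#include <linux/kthread.h>

static int my_worker(void *data)
{
        set_freezable();        /* opt in to freezing; may freeze right away */

        while (!kthread_freezable_should_stop(NULL)) {
                /* ... one unit of work ... */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}
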
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b2e08c932d91..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,6 +431,7 @@ unsigned int max_lockdep_depth;
431 * about it later on, in lockdep_info(). 431 * about it later on, in lockdep_info().
432 */ 432 */
433static int lockdep_init_error; 433static int lockdep_init_error;
434static const char *lock_init_error;
434static unsigned long lockdep_init_trace_data[20]; 435static unsigned long lockdep_init_trace_data[20];
435static struct stack_trace lockdep_init_trace = { 436static struct stack_trace lockdep_init_trace = {
436 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -499,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
499 usage[i] = '\0'; 500 usage[i] = '\0';
500} 501}
501 502
502static int __print_lock_name(struct lock_class *class) 503static void __print_lock_name(struct lock_class *class)
503{ 504{
504 char str[KSYM_NAME_LEN]; 505 char str[KSYM_NAME_LEN];
505 const char *name; 506 const char *name;
506 507
507 name = class->name; 508 name = class->name;
508 if (!name)
509 name = __get_key_name(class->key, str);
510
511 return printk("%s", name);
512}
513
514static void print_lock_name(struct lock_class *class)
515{
516 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
517 const char *name;
518
519 get_usage_chars(class, usage);
520
521 name = class->name;
522 if (!name) { 509 if (!name) {
523 name = __get_key_name(class->key, str); 510 name = __get_key_name(class->key, str);
524 printk(" (%s", name); 511 printk("%s", name);
525 } else { 512 } else {
526 printk(" (%s", name); 513 printk("%s", name);
527 if (class->name_version > 1) 514 if (class->name_version > 1)
528 printk("#%d", class->name_version); 515 printk("#%d", class->name_version);
529 if (class->subclass) 516 if (class->subclass)
530 printk("/%d", class->subclass); 517 printk("/%d", class->subclass);
531 } 518 }
519}
520
521static void print_lock_name(struct lock_class *class)
522{
523 char usage[LOCK_USAGE_CHARS];
524
525 get_usage_chars(class, usage);
526
527 printk(" (");
528 __print_lock_name(class);
532 printk("){%s}", usage); 529 printk("){%s}", usage);
533} 530}
534 531
@@ -568,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
568 } 565 }
569} 566}
570 567
571static void print_kernel_version(void) 568static void print_kernel_ident(void)
572{ 569{
573 printk("%s %.*s\n", init_utsname()->release, 570 printk("%s %.*s %s\n", init_utsname()->release,
574 (int)strcspn(init_utsname()->version, " "), 571 (int)strcspn(init_utsname()->version, " "),
575 init_utsname()->version); 572 init_utsname()->version,
573 print_tainted());
576} 574}
577 575
578static int very_verbose(struct lock_class *class) 576static int very_verbose(struct lock_class *class)
@@ -656,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
656 if (unlikely(!lockdep_initialized)) { 654 if (unlikely(!lockdep_initialized)) {
657 lockdep_init(); 655 lockdep_init();
658 lockdep_init_error = 1; 656 lockdep_init_error = 1;
657 lock_init_error = lock->name;
659 save_stack_trace(&lockdep_init_trace); 658 save_stack_trace(&lockdep_init_trace);
660 } 659 }
661#endif 660#endif
@@ -723,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
723 722
724 class = look_up_lock_class(lock, subclass); 723 class = look_up_lock_class(lock, subclass);
725 if (likely(class)) 724 if (likely(class))
726 return class; 725 goto out_set_class_cache;
727 726
728 /* 727 /*
729 * Debug-check: all keys must be persistent! 728 * Debug-check: all keys must be persistent!
@@ -808,6 +807,7 @@ out_unlock_set:
808 graph_unlock(); 807 graph_unlock();
809 raw_local_irq_restore(flags); 808 raw_local_irq_restore(flags);
810 809
810out_set_class_cache:
811 if (!subclass || force) 811 if (!subclass || force)
812 lock->class_cache[0] = class; 812 lock->class_cache[0] = class;
813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
@@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1149 printk("\n"); 1149 printk("\n");
1150 printk("======================================================\n"); 1150 printk("======================================================\n");
1151 printk("[ INFO: possible circular locking dependency detected ]\n"); 1151 printk("[ INFO: possible circular locking dependency detected ]\n");
1152 print_kernel_version(); 1152 print_kernel_ident();
1153 printk("-------------------------------------------------------\n"); 1153 printk("-------------------------------------------------------\n");
1154 printk("%s/%d is trying to acquire lock:\n", 1154 printk("%s/%d is trying to acquire lock:\n",
1155 curr->comm, task_pid_nr(curr)); 1155 curr->comm, task_pid_nr(curr));
@@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1488 printk("======================================================\n"); 1488 printk("======================================================\n");
1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1490 irqclass, irqclass); 1490 irqclass, irqclass);
1491 print_kernel_version(); 1491 print_kernel_ident();
1492 printk("------------------------------------------------------\n"); 1492 printk("------------------------------------------------------\n");
1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1494 curr->comm, task_pid_nr(curr), 1494 curr->comm, task_pid_nr(curr),
@@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1717 printk("\n"); 1717 printk("\n");
1718 printk("=============================================\n"); 1718 printk("=============================================\n");
1719 printk("[ INFO: possible recursive locking detected ]\n"); 1719 printk("[ INFO: possible recursive locking detected ]\n");
1720 print_kernel_version(); 1720 print_kernel_ident();
1721 printk("---------------------------------------------\n"); 1721 printk("---------------------------------------------\n");
1722 printk("%s/%d is trying to acquire lock:\n", 1722 printk("%s/%d is trying to acquire lock:\n",
1723 curr->comm, task_pid_nr(curr)); 1723 curr->comm, task_pid_nr(curr));
@@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2224 printk("\n"); 2224 printk("\n");
2225 printk("=================================\n"); 2225 printk("=================================\n");
2226 printk("[ INFO: inconsistent lock state ]\n"); 2226 printk("[ INFO: inconsistent lock state ]\n");
2227 print_kernel_version(); 2227 print_kernel_ident();
2228 printk("---------------------------------\n"); 2228 printk("---------------------------------\n");
2229 2229
2230 printk("inconsistent {%s} -> {%s} usage.\n", 2230 printk("inconsistent {%s} -> {%s} usage.\n",
@@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr,
2289 printk("\n"); 2289 printk("\n");
2290 printk("=========================================================\n"); 2290 printk("=========================================================\n");
2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2292 print_kernel_version(); 2292 print_kernel_ident();
2293 printk("---------------------------------------------------------\n"); 2293 printk("---------------------------------------------------------\n");
2294 printk("%s/%d just changed the state of lock:\n", 2294 printk("%s/%d just changed the state of lock:\n",
2295 curr->comm, task_pid_nr(curr)); 2295 curr->comm, task_pid_nr(curr));
@@ -3175,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3175 printk("\n"); 3175 printk("\n");
3176 printk("=====================================\n"); 3176 printk("=====================================\n");
3177 printk("[ BUG: bad unlock balance detected! ]\n"); 3177 printk("[ BUG: bad unlock balance detected! ]\n");
3178 print_kernel_ident();
3178 printk("-------------------------------------\n"); 3179 printk("-------------------------------------\n");
3179 printk("%s/%d is trying to release lock (", 3180 printk("%s/%d is trying to release lock (",
3180 curr->comm, task_pid_nr(curr)); 3181 curr->comm, task_pid_nr(curr));
@@ -3619,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3619 printk("\n"); 3620 printk("\n");
3620 printk("=================================\n"); 3621 printk("=================================\n");
3621 printk("[ BUG: bad contention detected! ]\n"); 3622 printk("[ BUG: bad contention detected! ]\n");
3623 print_kernel_ident();
3622 printk("---------------------------------\n"); 3624 printk("---------------------------------\n");
3623 printk("%s/%d is trying to contend lock (", 3625 printk("%s/%d is trying to contend lock (",
3624 curr->comm, task_pid_nr(curr)); 3626 curr->comm, task_pid_nr(curr));
@@ -3974,7 +3976,8 @@ void __init lockdep_info(void)
3974 3976
3975#ifdef CONFIG_DEBUG_LOCKDEP 3977#ifdef CONFIG_DEBUG_LOCKDEP
3976 if (lockdep_init_error) { 3978 if (lockdep_init_error) {
3977 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); 3979 printk("WARNING: lockdep init error! lock-%s was acquired"
3980 "before lockdep_init\n", lock_init_error);
3978 printk("Call stack leading to lockdep invocation was:\n"); 3981 printk("Call stack leading to lockdep invocation was:\n");
3979 print_stack_trace(&lockdep_init_trace, 0); 3982 print_stack_trace(&lockdep_init_trace, 0);
3980 } 3983 }
@@ -3993,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3993 printk("\n"); 3996 printk("\n");
3994 printk("=========================\n"); 3997 printk("=========================\n");
3995 printk("[ BUG: held lock freed! ]\n"); 3998 printk("[ BUG: held lock freed! ]\n");
3999 print_kernel_ident();
3996 printk("-------------------------\n"); 4000 printk("-------------------------\n");
3997 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4001 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3998 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4002 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
@@ -4050,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr)
4050 printk("\n"); 4054 printk("\n");
4051 printk("=====================================\n"); 4055 printk("=====================================\n");
4052 printk("[ BUG: lock held at task exit time! ]\n"); 4056 printk("[ BUG: lock held at task exit time! ]\n");
4057 print_kernel_ident();
4053 printk("-------------------------------------\n"); 4058 printk("-------------------------------------\n");
4054 printk("%s/%d is exiting with locks still held!\n", 4059 printk("%s/%d is exiting with locks still held!\n",
4055 curr->comm, task_pid_nr(curr)); 4060 curr->comm, task_pid_nr(curr));
@@ -4147,6 +4152,7 @@ void lockdep_sys_exit(void)
4147 printk("\n"); 4152 printk("\n");
4148 printk("================================================\n"); 4153 printk("================================================\n");
4149 printk("[ BUG: lock held when returning to user space! ]\n"); 4154 printk("[ BUG: lock held when returning to user space! ]\n");
4155 print_kernel_ident();
4150 printk("------------------------------------------------\n"); 4156 printk("------------------------------------------------\n");
4151 printk("%s/%d is leaving the kernel with locks still held!\n", 4157 printk("%s/%d is leaving the kernel with locks still held!\n",
4152 curr->comm, curr->pid); 4158 curr->comm, curr->pid);
@@ -4166,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4166 printk("\n"); 4172 printk("\n");
4167 printk("===============================\n"); 4173 printk("===============================\n");
4168 printk("[ INFO: suspicious RCU usage. ]\n"); 4174 printk("[ INFO: suspicious RCU usage. ]\n");
4175 print_kernel_ident();
4169 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4170 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4171 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4172 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4180
4181 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section
4183 * between rcu_idle_enter() and rcu_idle_exit(), then RCU
4184 * considers that CPU to be in an "extended quiescent state",
4185 * which means that RCU will be completely ignoring that CPU.
4186 * Therefore, rcu_read_lock() and friends have absolutely no
4187 * effect on a CPU running in that state. In other words, even if
4188 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
4189 * delete data structures out from under it. RCU really has no
4190 * choice here: we need to keep an RCU-free window in idle where
4191 * the CPU may possibly enter into low power mode. This way we can
4192 * notice an extended quiescent state to other CPUs that started a grace
4193 * period. Otherwise we would delay any grace period as long as we run
4194 * in the idle task.
4195 *
4196 * So complain bitterly if someone does call rcu_read_lock(),
4197 * rcu_read_lock_bh() and so on from extended quiescent states.
4198 */
4199 if (rcu_is_cpu_idle())
4200 printk("RCU used illegally from extended quiescent state!\n");
4201
4173 lockdep_print_held_locks(curr); 4202 lockdep_print_held_locks(curr);
4174 printk("\nstack backtrace:\n"); 4203 printk("\nstack backtrace:\n");
4175 dump_stack(); 4204 dump_stack();
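
The rcu_is_cpu_idle() test makes lockdep_rcu_suspicious() name the specific failure mode spelled out in the new comment: RCU read-side critical sections give no protection inside the idle extended quiescent state. A hypothetical fragment that would now draw the extra warning line (the hook name is illustrative; real idle code should keep its RCU usage outside the rcu_idle_enter()/rcu_idle_exit() window):

#include <linux/rcupdate.h>

static void broken_idle_hook(void)
{
        rcu_idle_enter();               /* CPU enters the RCU-free window */

        rcu_read_lock();                /* no protection here ... */
        /* ... dereferencing RCU-protected data is unsafe ... */
        rcu_read_unlock();              /* ... and lockdep now says so */

        rcu_idle_exit();
}
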
diff --git a/kernel/module.c b/kernel/module.c
index 178333c48d1e..2c932760fd33 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,12 +62,6 @@
62#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 63#include <trace/events/module.h>
64 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
71#ifndef ARCH_SHF_SMALL 65#ifndef ARCH_SHF_SMALL
72#define ARCH_SHF_SMALL 0 66#define ARCH_SHF_SMALL 0
73#endif 67#endif
@@ -138,7 +132,6 @@ struct load_info {
138 unsigned long len; 132 unsigned long len;
139 Elf_Shdr *sechdrs; 133 Elf_Shdr *sechdrs;
140 char *secstrings, *strtab; 134 char *secstrings, *strtab;
141 unsigned long *strmap;
142 unsigned long symoffs, stroffs; 135 unsigned long symoffs, stroffs;
143 struct _ddebug *debug; 136 struct _ddebug *debug;
144 unsigned int num_debug; 137 unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
410 return fsa.sym; 403 return fsa.sym;
411 } 404 }
412 405
413 DEBUGP("Failed to find symbol %s\n", name); 406 pr_debug("Failed to find symbol %s\n", name);
414 return NULL; 407 return NULL;
415} 408}
416EXPORT_SYMBOL_GPL(find_symbol); 409EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
600 593
601 list_for_each_entry(use, &b->source_list, source_list) { 594 list_for_each_entry(use, &b->source_list, source_list) {
602 if (use->source == a) { 595 if (use->source == a) {
603 DEBUGP("%s uses %s!\n", a->name, b->name); 596 pr_debug("%s uses %s!\n", a->name, b->name);
604 return 1; 597 return 1;
605 } 598 }
606 } 599 }
607 DEBUGP("%s does not use %s!\n", a->name, b->name); 600 pr_debug("%s does not use %s!\n", a->name, b->name);
608 return 0; 601 return 0;
609} 602}
610 603
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
619{ 612{
620 struct module_use *use; 613 struct module_use *use;
621 614
622 DEBUGP("Allocating new usage for %s.\n", a->name); 615 pr_debug("Allocating new usage for %s.\n", a->name);
623 use = kmalloc(sizeof(*use), GFP_ATOMIC); 616 use = kmalloc(sizeof(*use), GFP_ATOMIC);
624 if (!use) { 617 if (!use) {
625 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 618 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
663 mutex_lock(&module_mutex); 656 mutex_lock(&module_mutex);
664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 657 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
665 struct module *i = use->target; 658 struct module *i = use->target;
666 DEBUGP("%s unusing %s\n", mod->name, i->name); 659 pr_debug("%s unusing %s\n", mod->name, i->name);
667 module_put(i); 660 module_put(i);
668 list_del(&use->source_list); 661 list_del(&use->source_list);
669 list_del(&use->target_list); 662 list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
726 } 719 }
727} 720}
728 721
729unsigned int module_refcount(struct module *mod) 722unsigned long module_refcount(struct module *mod)
730{ 723{
731 unsigned int incs = 0, decs = 0; 724 unsigned long incs = 0, decs = 0;
732 int cpu; 725 int cpu;
733 726
734 for_each_possible_cpu(cpu) 727 for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
761 /* Since we might sleep for some time, release the mutex first */ 754 /* Since we might sleep for some time, release the mutex first */
762 mutex_unlock(&module_mutex); 755 mutex_unlock(&module_mutex);
763 for (;;) { 756 for (;;) {
764 DEBUGP("Looking at refcount...\n"); 757 pr_debug("Looking at refcount...\n");
765 set_current_state(TASK_UNINTERRUPTIBLE); 758 set_current_state(TASK_UNINTERRUPTIBLE);
766 if (module_refcount(mod) == 0) 759 if (module_refcount(mod) == 0)
767 break; 760 break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
804 if (mod->state != MODULE_STATE_LIVE) { 797 if (mod->state != MODULE_STATE_LIVE) {
805 /* FIXME: if (force), slam module count and wake up 798 /* FIXME: if (force), slam module count and wake up
806 waiter --RR */ 799 waiter --RR */
807 DEBUGP("%s already dying\n", mod->name); 800 pr_debug("%s already dying\n", mod->name);
808 ret = -EBUSY; 801 ret = -EBUSY;
809 goto out; 802 goto out;
810 } 803 }
@@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
854 struct module_use *use; 847 struct module_use *use;
855 int printed_something = 0; 848 int printed_something = 0;
856 849
857 seq_printf(m, " %u ", module_refcount(mod)); 850 seq_printf(m, " %lu ", module_refcount(mod));
858 851
859 /* Always include a trailing , so userspace can differentiate 852 /* Always include a trailing , so userspace can differentiate
860 between this and the old multi-field proc format. */ 853 between this and the old multi-field proc format. */
@@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
904static ssize_t show_refcnt(struct module_attribute *mattr, 897static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module_kobject *mk, char *buffer) 898 struct module_kobject *mk, char *buffer)
906{ 899{
907 return sprintf(buffer, "%u\n", module_refcount(mk->mod)); 900 return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
908} 901}
909 902
910static struct module_attribute refcnt = { 903static struct module_attribute modinfo_refcnt =
911 .attr = { .name = "refcnt", .mode = 0444 }, 904 __ATTR(refcnt, 0444, show_refcnt, NULL);
912 .show = show_refcnt,
913};
914 905
915void module_put(struct module *module) 906void module_put(struct module *module)
916{ 907{
@@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod)
951} 942}
952#endif /* CONFIG_MODULE_UNLOAD */ 943#endif /* CONFIG_MODULE_UNLOAD */
953 944
945static size_t module_flags_taint(struct module *mod, char *buf)
946{
947 size_t l = 0;
948
949 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
950 buf[l++] = 'P';
951 if (mod->taints & (1 << TAINT_OOT_MODULE))
952 buf[l++] = 'O';
953 if (mod->taints & (1 << TAINT_FORCED_MODULE))
954 buf[l++] = 'F';
955 if (mod->taints & (1 << TAINT_CRAP))
956 buf[l++] = 'C';
957 /*
958 * TAINT_FORCED_RMMOD: could be added.
959 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
960 * apply to modules.
961 */
962 return l;
963}
964
954static ssize_t show_initstate(struct module_attribute *mattr, 965static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module_kobject *mk, char *buffer) 966 struct module_kobject *mk, char *buffer)
956{ 967{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
970 return sprintf(buffer, "%s\n", state); 981 return sprintf(buffer, "%s\n", state);
971} 982}
972 983
973static struct module_attribute initstate = { 984static struct module_attribute modinfo_initstate =
974 .attr = { .name = "initstate", .mode = 0444 }, 985 __ATTR(initstate, 0444, show_initstate, NULL);
975 .show = show_initstate,
976};
977 986
978static ssize_t store_uevent(struct module_attribute *mattr, 987static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk, 988 struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
986 return count; 995 return count;
987} 996}
988 997
989struct module_attribute module_uevent = { 998struct module_attribute module_uevent =
990 .attr = { .name = "uevent", .mode = 0200 }, 999 __ATTR(uevent, 0200, NULL, store_uevent);
991 .store = store_uevent, 1000
992}; 1001static ssize_t show_coresize(struct module_attribute *mattr,
1002 struct module_kobject *mk, char *buffer)
1003{
1004 return sprintf(buffer, "%u\n", mk->mod->core_size);
1005}
1006
1007static struct module_attribute modinfo_coresize =
1008 __ATTR(coresize, 0444, show_coresize, NULL);
1009
1010static ssize_t show_initsize(struct module_attribute *mattr,
1011 struct module_kobject *mk, char *buffer)
1012{
1013 return sprintf(buffer, "%u\n", mk->mod->init_size);
1014}
1015
1016static struct module_attribute modinfo_initsize =
1017 __ATTR(initsize, 0444, show_initsize, NULL);
1018
1019static ssize_t show_taint(struct module_attribute *mattr,
1020 struct module_kobject *mk, char *buffer)
1021{
1022 size_t l;
1023
1024 l = module_flags_taint(mk->mod, buffer);
1025 buffer[l++] = '\n';
1026 return l;
1027}
1028
1029static struct module_attribute modinfo_taint =
1030 __ATTR(taint, 0444, show_taint, NULL);
993 1031
994static struct module_attribute *modinfo_attrs[] = { 1032static struct module_attribute *modinfo_attrs[] = {
1033 &module_uevent,
995 &modinfo_version, 1034 &modinfo_version,
996 &modinfo_srcversion, 1035 &modinfo_srcversion,
997 &initstate, 1036 &modinfo_initstate,
998 &module_uevent, 1037 &modinfo_coresize,
1038 &modinfo_initsize,
1039 &modinfo_taint,
999#ifdef CONFIG_MODULE_UNLOAD 1040#ifdef CONFIG_MODULE_UNLOAD
1000 &refcnt, 1041 &modinfo_refcnt,
1001#endif 1042#endif
1002 NULL, 1043 NULL,
1003}; 1044};
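
The open-coded struct module_attribute initializers give way to __ATTR(), and the new coresize, initsize and taint attributes appear as read-only files under /sys/module/<name>/. A sketch of the equivalence, with the old form reproduced for comparison (show_refcnt stands in for any show method with the module_attribute ->show signature):

#include <linux/module.h>
#include <linux/sysfs.h>

/* assumes a show_refcnt() with the module_attribute ->show signature */

/* old, open-coded form (as removed above) */
static struct module_attribute refcnt_old = {
        .attr = { .name = "refcnt", .mode = 0444 },
        .show = show_refcnt,
};

/* new form -- __ATTR() expands to the same initializer */
static struct module_attribute modinfo_refcnt_new =
        __ATTR(refcnt, 0444, show_refcnt, NULL);
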
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
1057 1098
1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1099 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 1100 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 1101 pr_debug("Found checksum %lX vs module %lX\n",
1061 maybe_relocated(*crc, crc_owner), versions[i].crc); 1102 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 1103 goto bad_version;
1063 } 1104 }
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1834 case SHN_COMMON: 1875 case SHN_COMMON:
1835 /* We compiled with -fno-common. These are not 1876 /* We compiled with -fno-common. These are not
1836 supposed to happen. */ 1877 supposed to happen. */
1837 DEBUGP("Common symbol: %s\n", name); 1878 pr_debug("Common symbol: %s\n", name);
1838 printk("%s: please compile with -fno-common\n", 1879 printk("%s: please compile with -fno-common\n",
1839 mod->name); 1880 mod->name);
1840 ret = -ENOEXEC; 1881 ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1842 1883
1843 case SHN_ABS: 1884 case SHN_ABS:
1844 /* Don't need to do anything */ 1885 /* Don't need to do anything */
1845 DEBUGP("Absolute symbol: 0x%08lx\n", 1886 pr_debug("Absolute symbol: 0x%08lx\n",
1846 (long)sym[i].st_value); 1887 (long)sym[i].st_value);
1847 break; 1888 break;
1848 1889
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1966 for (i = 0; i < info->hdr->e_shnum; i++) 2007 for (i = 0; i < info->hdr->e_shnum; i++)
1967 info->sechdrs[i].sh_entsize = ~0UL; 2008 info->sechdrs[i].sh_entsize = ~0UL;
1968 2009
1969 DEBUGP("Core section allocation order:\n"); 2010 pr_debug("Core section allocation order:\n");
1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2011 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1971 for (i = 0; i < info->hdr->e_shnum; ++i) { 2012 for (i = 0; i < info->hdr->e_shnum; ++i) {
1972 Elf_Shdr *s = &info->sechdrs[i]; 2013 Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1978 || strstarts(sname, ".init")) 2019 || strstarts(sname, ".init"))
1979 continue; 2020 continue;
1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 2021 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1981 DEBUGP("\t%s\n", name); 2022 pr_debug("\t%s\n", sname);
1982 } 2023 }
1983 switch (m) { 2024 switch (m) {
1984 case 0: /* executable */ 2025 case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1995 } 2036 }
1996 } 2037 }
1997 2038
1998 DEBUGP("Init section allocation order:\n"); 2039 pr_debug("Init section allocation order:\n");
1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2040 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2000 for (i = 0; i < info->hdr->e_shnum; ++i) { 2041 for (i = 0; i < info->hdr->e_shnum; ++i) {
2001 Elf_Shdr *s = &info->sechdrs[i]; 2042 Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2008 continue; 2049 continue;
2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2050 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2010 | INIT_OFFSET_MASK); 2051 | INIT_OFFSET_MASK);
2011 DEBUGP("\t%s\n", sname); 2052 pr_debug("\t%s\n", sname);
2012 } 2053 }
2013 switch (m) { 2054 switch (m) {
2014 case 0: /* executable */ 2055 case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2178 return true; 2219 return true;
2179} 2220}
2180 2221
2222/*
2223 * We only allocate and copy the strings needed by the parts of symtab
2224 * we keep. This is simple, but has the effect of making multiple
2225 * copies of duplicates. We could be more sophisticated, see
2226 * linux-kernel thread starting with
2227 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2228 */
2181static void layout_symtab(struct module *mod, struct load_info *info) 2229static void layout_symtab(struct module *mod, struct load_info *info)
2182{ 2230{
2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2231 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2184 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2232 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2185 const Elf_Sym *src; 2233 const Elf_Sym *src;
2186 unsigned int i, nsrc, ndst; 2234 unsigned int i, nsrc, ndst, strtab_size;
2187 2235
2188 /* Put symbol section at end of init part of module. */ 2236 /* Put symbol section at end of init part of module. */
2189 symsect->sh_flags |= SHF_ALLOC; 2237 symsect->sh_flags |= SHF_ALLOC;
2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2238 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2191 info->index.sym) | INIT_OFFSET_MASK; 2239 info->index.sym) | INIT_OFFSET_MASK;
2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); 2240 pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
2193 2241
2194 src = (void *)info->hdr + symsect->sh_offset; 2242 src = (void *)info->hdr + symsect->sh_offset;
2195 nsrc = symsect->sh_size / sizeof(*src); 2243 nsrc = symsect->sh_size / sizeof(*src);
2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2198 unsigned int j = src->st_name;
2199 2244
2200 while (!__test_and_set_bit(j, info->strmap) 2245 /* Compute total space required for the core symbols' strtab. */
2201 && info->strtab[j]) 2246 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
2202 ++j; 2247 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2203 ++ndst; 2248 strtab_size += strlen(&info->strtab[src->st_name]) + 1;
2249 ndst++;
2204 } 2250 }
2205 2251
2206 /* Append room for core symbols at end of core part. */ 2252 /* Append room for core symbols at end of core part. */
2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2253 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2254 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2255 mod->core_size += strtab_size;
2209 2256
2210 /* Put string table section at end of init part of module. */ 2257 /* Put string table section at end of init part of module. */
2211 strsect->sh_flags |= SHF_ALLOC; 2258 strsect->sh_flags |= SHF_ALLOC;
2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2259 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2213 info->index.str) | INIT_OFFSET_MASK; 2260 info->index.str) | INIT_OFFSET_MASK;
2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); 2261 pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2220} 2262}
2221 2263
2222static void add_kallsyms(struct module *mod, const struct load_info *info) 2264static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2279 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2238 2280
2239 mod->core_symtab = dst = mod->module_core + info->symoffs; 2281 mod->core_symtab = dst = mod->module_core + info->symoffs;
2282 mod->core_strtab = s = mod->module_core + info->stroffs;
2240 src = mod->symtab; 2283 src = mod->symtab;
2241 *dst = *src; 2284 *dst = *src;
2285 *s++ = 0;
2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2286 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2287 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2244 continue; 2288 continue;
2289
2245 dst[ndst] = *src; 2290 dst[ndst] = *src;
2246 dst[ndst].st_name = bitmap_weight(info->strmap, 2291 dst[ndst++].st_name = s - mod->core_strtab;
2247 dst[ndst].st_name); 2292 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2248 ++ndst;
2249 } 2293 }
2250 mod->core_num_syms = ndst; 2294 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2256} 2295}
2257#else 2296#else
2258static inline void layout_symtab(struct module *mod, struct load_info *info) 2297static inline void layout_symtab(struct module *mod, struct load_info *info)
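
Taken together, the two hunks above drop the old strmap bitmap: layout_symtab() now sizes the core string table by summing strlen()+1 over the core symbols, and add_kallsyms() packs the kept names with strlcpy() while rewriting each st_name to its new offset. A stand-alone sketch of that size-then-pack idea, with generic names rather than the kernel structures:

    /* Sketch: pack a kept subset of strings into one table and record each
     * string's new offset; mirrors the strlen()+1 sizing pass and the copy
     * pass above.  Names are illustrative. */
    #include <stdlib.h>
    #include <string.h>

    static char *pack_strings(const char *const *names, const int *keep,
                              size_t n, size_t *offsets)
    {
            size_t size = 1, pos = 1, i;    /* slot 0 stays the empty string */
            char *tab;

            for (i = 0; i < n; i++)
                    if (keep[i])
                            size += strlen(names[i]) + 1;

            tab = calloc(1, size);
            if (!tab)
                    return NULL;

            for (i = 0; i < n; i++) {
                    if (!keep[i])
                            continue;
                    offsets[i] = pos;       /* new st_name-style offset */
                    strcpy(tab + pos, names[i]);
                    pos += strlen(names[i]) + 1;
            }
            return tab;
    }

The cost noted in the new comment is visible here too: duplicate names are stored once per occurrence rather than shared.
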
@@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
2621 mod->module_init = ptr; 2660 mod->module_init = ptr;
2622 2661
2623 /* Transfer each section which specifies SHF_ALLOC */ 2662 /* Transfer each section which specifies SHF_ALLOC */
2624 DEBUGP("final section addresses:\n"); 2663 pr_debug("final section addresses:\n");
2625 for (i = 0; i < info->hdr->e_shnum; i++) { 2664 for (i = 0; i < info->hdr->e_shnum; i++) {
2626 void *dest; 2665 void *dest;
2627 Elf_Shdr *shdr = &info->sechdrs[i]; 2666 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
2639 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2678 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2640 /* Update sh_addr to point to copy in image. */ 2679 /* Update sh_addr to point to copy in image. */
2641 shdr->sh_addr = (unsigned long)dest; 2680 shdr->sh_addr = (unsigned long)dest;
2642 DEBUGP("\t0x%lx %s\n", 2681 pr_debug("\t0x%lx %s\n",
2643 shdr->sh_addr, info->secstrings + shdr->sh_name); 2682 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
2644 } 2683 }
2645 2684
2646 return 0; 2685 return 0;
@@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
2742 this is done generically; there doesn't appear to be any 2781 this is done generically; there doesn't appear to be any
2743 special cases for the architectures. */ 2782 special cases for the architectures. */
2744 layout_sections(mod, info); 2783 layout_sections(mod, info);
2745
2746 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2747 * sizeof(long), GFP_KERNEL);
2748 if (!info->strmap) {
2749 err = -ENOMEM;
2750 goto free_percpu;
2751 }
2752 layout_symtab(mod, info); 2784 layout_symtab(mod, info);
2753 2785
2754 /* Allocate and move to the final place */ 2786 /* Allocate and move to the final place */
2755 err = move_module(mod, info); 2787 err = move_module(mod, info);
2756 if (err) 2788 if (err)
2757 goto free_strmap; 2789 goto free_percpu;
2758 2790
2759 /* Module has been copied to its final place now: return it. */ 2791 /* Module has been copied to its final place now: return it. */
2760 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2792 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2761 kmemleak_load_module(mod, info); 2793 kmemleak_load_module(mod, info);
2762 return mod; 2794 return mod;
2763 2795
2764free_strmap:
2765 kfree(info->strmap);
2766free_percpu: 2796free_percpu:
2767 percpu_modfree(mod); 2797 percpu_modfree(mod);
2768out: 2798out:
@@ -2772,7 +2802,6 @@ out:
2772/* mod is no longer valid after this! */ 2802/* mod is no longer valid after this! */
2773static void module_deallocate(struct module *mod, struct load_info *info) 2803static void module_deallocate(struct module *mod, struct load_info *info)
2774{ 2804{
2775 kfree(info->strmap);
2776 percpu_modfree(mod); 2805 percpu_modfree(mod);
2777 module_free(mod, mod->module_init); 2806 module_free(mod, mod->module_init);
2778 module_free(mod, mod->module_core); 2807 module_free(mod, mod->module_core);
@@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod,
2811 struct module *mod; 2840 struct module *mod;
2812 long err; 2841 long err;
2813 2842
2814 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2843 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
2815 umod, len, uargs); 2844 umod, len, uargs);
2816 2845
2817 /* Copy in the blobs from userspace, check they are vaguely sane. */ 2846 /* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod,
2902 if (err < 0) 2931 if (err < 0)
2903 goto unlink; 2932 goto unlink;
2904 2933
2905 /* Get rid of temporary copy and strmap. */ 2934 /* Get rid of temporary copy. */
2906 kfree(info.strmap);
2907 free_copy(&info); 2935 free_copy(&info);
2908 2936
2909 /* Done! */ 2937 /* Done! */
@@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
3256 mod->state == MODULE_STATE_GOING || 3284 mod->state == MODULE_STATE_GOING ||
3257 mod->state == MODULE_STATE_COMING) { 3285 mod->state == MODULE_STATE_COMING) {
3258 buf[bx++] = '('; 3286 buf[bx++] = '(';
3259 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3287 bx += module_flags_taint(mod, buf + bx);
3260 buf[bx++] = 'P';
3261 else if (mod->taints & (1 << TAINT_OOT_MODULE))
3262 buf[bx++] = 'O';
3263 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3264 buf[bx++] = 'F';
3265 if (mod->taints & (1 << TAINT_CRAP))
3266 buf[bx++] = 'C';
3267 /*
3268 * TAINT_FORCED_RMMOD: could be added.
3269 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3270 * apply to modules.
3271 */
3272
3273 /* Show a - for module-is-being-unloaded */ 3288 /* Show a - for module-is-being-unloaded */
3274 if (mod->state == MODULE_STATE_GOING) 3289 if (mod->state == MODULE_STATE_GOING)
3275 buf[bx++] = '-'; 3290 buf[bx++] = '-';
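
The open-coded taint letters in module_flags() are replaced by a call to module_flags_taint(), the same helper that backs the new /sys/module/<name>/taint attribute earlier in this diff. A sketch of such a helper, reconstructed from the removed lines (the in-tree body may differ in detail):

    /* Sketch: translate a module's taint bits into flag characters and
     * return how many were written; only the bits handled by the removed
     * block are shown. */
    #include <linux/module.h>

    static size_t module_flags_taint_sketch(struct module *mod, char *buf)
    {
            size_t l = 0;

            if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
                    buf[l++] = 'P';
            else if (mod->taints & (1 << TAINT_OOT_MODULE))
                    buf[l++] = 'O';
            if (mod->taints & (1 << TAINT_FORCED_MODULE))
                    buf[l++] = 'F';
            if (mod->taints & (1 << TAINT_CRAP))
                    buf[l++] = 'C';
            return l;
    }
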
diff --git a/kernel/panic.c b/kernel/panic.c
index b26593604214..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 49long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 50EXPORT_SYMBOL(panic_blink);
51 51
52/*
53 * Stop ourself in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
52/** 61/**
53 * panic - halt the system 62 * panic - halt the system
54 * @fmt: The text string to print 63 * @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
57 * 66 *
58 * This function never returns. 67 * This function never returns.
59 */ 68 */
60NORET_TYPE void panic(const char * fmt, ...) 69void panic(const char *fmt, ...)
61{ 70{
71 static DEFINE_SPINLOCK(panic_lock);
62 static char buf[1024]; 72 static char buf[1024];
63 va_list args; 73 va_list args;
64 long i, i_next = 0; 74 long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
68 * It's possible to come here directly from a panic-assertion and 78 * It's possible to come here directly from a panic-assertion and
69 * not have preempt disabled. Some functions called from here want 79 * not have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though... 80 * preempt to be disabled. No point enabling it later though...
81 *
82 * Only one CPU is allowed to execute the panic code from here. For
83 * multiple parallel invocations of panic, all other CPUs either
84 * stop themself or will wait until they are stopped by the 1st CPU
85 * with smp_send_stop().
71 */ 86 */
72 preempt_disable(); 87 if (!spin_trylock(&panic_lock))
88 panic_smp_self_stop();
73 89
74 console_verbose(); 90 console_verbose();
75 bust_spinlocks(1); 91 bust_spinlocks(1);
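
The hunk above serializes panic(): the first CPU to grab panic_lock continues, and any CPU that panics concurrently parks itself in panic_smp_self_stop(), whose __weak default simply spins with cpu_relax() until it is stopped. The same trylock-or-park idiom in isolation, as a hedged sketch (the function names here are made up):

    /* Sketch of the single-entry idiom used above: the first caller wins
     * the lock and never releases it; late callers park forever.
     * cpu_relax() comes in via the arch's <asm/processor.h>. */
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(once_lock);

    static void park_cpu(void)
    {
            while (1)
                    cpu_relax();
    }

    static void do_fatal_path(void)
    {
            if (!spin_trylock(&once_lock))
                    park_cpu();

            /* sole-owner shutdown work runs here; the lock is deliberately
             * never dropped */
    }
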
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
78 va_end(args); 94 va_end(args);
79 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 95 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
80#ifdef CONFIG_DEBUG_BUGVERBOSE 96#ifdef CONFIG_DEBUG_BUGVERBOSE
81 dump_stack(); 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */
100 if (!oops_in_progress)
101 dump_stack();
82#endif 102#endif
83 103
84 /* 104 /*
@@ -237,11 +257,20 @@ void add_taint(unsigned flag)
237 * Can't trust the integrity of the kernel anymore. 257 * Can't trust the integrity of the kernel anymore.
238 * We don't call directly debug_locks_off() because the issue 258 * We don't call directly debug_locks_off() because the issue
239 * is not necessarily serious enough to set oops_in_progress to 1 259 * is not necessarily serious enough to set oops_in_progress to 1
240 * Also we want to keep up lockdep for staging development and 260 * Also we want to keep up lockdep for staging/out-of-tree
241 * post-warning case. 261 * development and post-warning case.
242 */ 262 */
243 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) 263 switch (flag) {
244 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); 264 case TAINT_CRAP:
265 case TAINT_OOT_MODULE:
266 case TAINT_WARN:
267 case TAINT_FIRMWARE_WORKAROUND:
268 break;
269
270 default:
271 if (__debug_locks_off())
272 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
273 }
245 274
246 set_bit(flag, &tainted_mask); 275 set_bit(flag, &tainted_mask);
247} 276}
diff --git a/kernel/params.c b/kernel/params.c
index 65aae11eb93f..32ee04308285 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,6 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
34/* Protects all parameters, and incidentally kmalloced_param list. */ 28/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock); 29static DEFINE_MUTEX(param_lock);
36 30
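
Across module.c and params.c the file-private DEBUGP macro (compiled out behind #if 0) is dropped in favour of pr_debug(), which is a no-op unless DEBUG or CONFIG_DYNAMIC_DEBUG is enabled and can then be toggled at run time. A minimal sketch of the resulting pattern in an arbitrary source file (the pr_fmt prefix and function are illustrative):

    /* Sketch: pr_debug() with a per-file prefix.  pr_fmt() must be defined
     * before the printk headers are pulled in. */
    #define pr_fmt(fmt) "myfile: " fmt

    #include <linux/printk.h>

    static void example(int val)
    {
            /* compiled away unless DEBUG/dynamic debug is enabled */
            pr_debug("handling val=%d\n", val);
    }
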
@@ -105,7 +99,7 @@ static int parse_one(char *param,
105 /* No one handled NULL, so do it here. */ 99 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool) 100 if (!val && params[i].ops->set != param_set_bool)
107 return -EINVAL; 101 return -EINVAL;
108 DEBUGP("They are equal! Calling %p\n", 102 pr_debug("They are equal! Calling %p\n",
109 params[i].ops->set); 103 params[i].ops->set);
110 mutex_lock(&param_lock); 104 mutex_lock(&param_lock);
111 err = params[i].ops->set(val, &params[i]); 105 err = params[i].ops->set(val, &params[i]);
@@ -115,11 +109,11 @@ static int parse_one(char *param,
115 } 109 }
116 110
117 if (handle_unknown) { 111 if (handle_unknown) {
118 DEBUGP("Unknown argument: calling %p\n", handle_unknown); 112 pr_debug("Unknown argument: calling %p\n", handle_unknown);
119 return handle_unknown(param, val); 113 return handle_unknown(param, val);
120 } 114 }
121 115
122 DEBUGP("Unknown argument `%s'\n", param); 116 pr_debug("Unknown argument `%s'\n", param);
123 return -ENOENT; 117 return -ENOENT;
124} 118}
125 119
@@ -184,7 +178,7 @@ int parse_args(const char *name,
184{ 178{
185 char *param, *val; 179 char *param, *val;
186 180
187 DEBUGP("Parsing ARGS: %s\n", args); 181 pr_debug("Parsing ARGS: %s\n", args);
188 182
189 /* Chew leading spaces */ 183 /* Chew leading spaces */
190 args = skip_spaces(args); 184 args = skip_spaces(args);
@@ -369,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = {
369}; 363};
370EXPORT_SYMBOL(param_ops_invbool); 364EXPORT_SYMBOL(param_ops_invbool);
371 365
366int param_set_bint(const char *val, const struct kernel_param *kp)
367{
368 struct kernel_param boolkp;
369 bool v;
370 int ret;
371
372 /* Match bool exactly, by re-using it. */
373 boolkp = *kp;
374 boolkp.arg = &v;
375 boolkp.flags |= KPARAM_ISBOOL;
376
377 ret = param_set_bool(val, &boolkp);
378 if (ret == 0)
379 *(int *)kp->arg = v;
380 return ret;
381}
382EXPORT_SYMBOL(param_set_bint);
383
384struct kernel_param_ops param_ops_bint = {
385 .set = param_set_bint,
386 .get = param_get_int,
387};
388EXPORT_SYMBOL(param_ops_bint);
389
372/* We break the rule and mangle the string. */ 390/* We break the rule and mangle the string. */
373static int param_array(const char *name, 391static int param_array(const char *name,
374 const char *val, 392 const char *val,
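
The new param_set_bint()/param_ops_bint pair lets an int-typed variable be set with bool syntax (y/n/1/0) on the command line while the code keeps reading it as a plain int. A hedged usage sketch for a hypothetical module parameter, assuming the matching moduleparam.h plumbing (param_check_bint and friends) from the same series:

    /* Sketch: an int-backed parameter parsed with bool syntax.
     * "enable_feature" is a made-up name. */
    #include <linux/module.h>
    #include <linux/moduleparam.h>

    static int enable_feature = 1;
    module_param(enable_feature, bint, 0444);
    MODULE_PARM_DESC(enable_feature, "Enable the feature (bool syntax, int storage)");

Loading with enable_feature=N then stores 0, while values outside the usual bool spellings are rejected just as they are for a bool parameter.
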
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5f..ce8e00deaccb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
137} 137}
138 138
139/* 139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid
141 * at the pid allocation time (there's also a sysctl for this, but racing
142 * with this one is OK, see comment in kernel/pid_namespace.c about it).
141 * We want the winner to have the "later" value, because if the 143 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 144 * "earlier" value prevails, then a pid may get reused immediately.
143 * 145 *
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 191 return;
192} 192}
193 193
194static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos)
196{
197 struct ctl_table tmp = *table;
198
199 if (write && !capable(CAP_SYS_ADMIN))
200 return -EPERM;
201
202 /*
203 * Writing directly to ns' last_pid field is OK, since this field
204 * is volatile in a living namespace anyway and a code writing to
205 * it should synchronize its usage with external means.
206 */
207
208 tmp.data = &current->nsproxy->pid_ns->last_pid;
209 return proc_dointvec(&tmp, write, buffer, lenp, ppos);
210}
211
212static struct ctl_table pid_ns_ctl_table[] = {
213 {
214 .procname = "ns_last_pid",
215 .maxlen = sizeof(int),
216 .mode = 0666, /* permissions are checked in the handler */
217 .proc_handler = pid_ns_ctl_handler,
218 },
219 { }
220};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223
194static __init int pid_namespaces_init(void) 224static __init int pid_namespaces_init(void)
195{ 225{
196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
227 register_sysctl_paths(kern_path, pid_ns_ctl_table);
197 return 0; 228 return 0;
198} 229}
199 230
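
The new /proc/sys/kernel/ns_last_pid knob lets a privileged task steer which pid its pid namespace hands out next, which is mainly useful for checkpoint/restore. A hedged userspace sketch of the intended use, not taken from this commit: write the desired pid minus one, then fork, and the child gets that pid provided nothing else forked in between (the handler above still requires CAP_SYS_ADMIN despite the 0666 mode).

    /* Userspace sketch: request the next pid in the current pid namespace.
     * Inherently racy against concurrent forks. */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/types.h>

    static pid_t fork_with_pid_hint(pid_t want)
    {
            FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");

            if (!f)
                    return (pid_t)-1;
            fprintf(f, "%d", (int)(want - 1));  /* allocation resumes after this */
            fclose(f);
            return fork();                      /* child pid is 'want' if no race */
    }
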
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e7cb76dc18f5..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79 return now.sched < then.sched; 79 return now.sched < then.sched;
80 } else { 80 } else {
81 return cputime_lt(now.cpu, then.cpu); 81 return now.cpu < then.cpu;
82 } 82 }
83} 83}
84static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89 acc->sched += val.sched; 89 acc->sched += val.sched;
90 } else { 90 } else {
91 acc->cpu = cputime_add(acc->cpu, val.cpu); 91 acc->cpu += val.cpu;
92 } 92 }
93} 93}
94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99 a.sched -= b.sched; 99 a.sched -= b.sched;
100 } else { 100 } else {
101 a.cpu = cputime_sub(a.cpu, b.cpu); 101 a.cpu -= b.cpu;
102 } 102 }
103 return a; 103 return a;
104} 104}
105 105
106/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
120 * Update expiry time from increment, and increase overrun count, 107 * Update expiry time from increment, and increase overrun count,
121 * given the current clock sample. 108 * given the current clock sample.
122 */ 109 */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
148 } else { 135 } else {
149 cputime_t delta, incr; 136 cputime_t delta, incr;
150 137
151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) 138 if (now.cpu < timer->it.cpu.expires.cpu)
152 return; 139 return;
153 incr = timer->it.cpu.incr.cpu; 140 incr = timer->it.cpu.incr.cpu;
154 delta = cputime_sub(cputime_add(now.cpu, incr), 141 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
155 timer->it.cpu.expires.cpu);
156 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 142 /* Don't use (incr*2 < delta), incr*2 might overflow. */
157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 143 for (i = 0; incr < delta - incr; i++)
158 incr = cputime_add(incr, incr); 144 incr += incr;
159 for (; i >= 0; incr = cputime_halve(incr), i--) { 145 for (; i >= 0; incr = incr >> 1, i--) {
160 if (cputime_lt(delta, incr)) 146 if (delta < incr)
161 continue; 147 continue;
162 timer->it.cpu.expires.cpu = 148 timer->it.cpu.expires.cpu += incr;
163 cputime_add(timer->it.cpu.expires.cpu, incr);
164 timer->it_overrun += 1 << i; 149 timer->it_overrun += 1 << i;
165 delta = cputime_sub(delta, incr); 150 delta -= incr;
166 } 151 }
167 } 152 }
168} 153}
169 154
170static inline cputime_t prof_ticks(struct task_struct *p) 155static inline cputime_t prof_ticks(struct task_struct *p)
171{ 156{
172 return cputime_add(p->utime, p->stime); 157 return p->utime + p->stime;
173} 158}
174static inline cputime_t virt_ticks(struct task_struct *p) 159static inline cputime_t virt_ticks(struct task_struct *p)
175{ 160{
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
248 233
249 t = tsk; 234 t = tsk;
250 do { 235 do {
251 times->utime = cputime_add(times->utime, t->utime); 236 times->utime += t->utime;
252 times->stime = cputime_add(times->stime, t->stime); 237 times->stime += t->stime;
253 times->sum_exec_runtime += task_sched_runtime(t); 238 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 239 } while_each_thread(tsk, t);
255out: 240out:
@@ -258,10 +243,10 @@ out:
258 243
259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
260{ 245{
261 if (cputime_gt(b->utime, a->utime)) 246 if (b->utime > a->utime)
262 a->utime = b->utime; 247 a->utime = b->utime;
263 248
264 if (cputime_gt(b->stime, a->stime)) 249 if (b->stime > a->stime)
265 a->stime = b->stime; 250 a->stime = b->stime;
266 251
267 if (b->sum_exec_runtime > a->sum_exec_runtime) 252 if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
306 return -EINVAL; 291 return -EINVAL;
307 case CPUCLOCK_PROF: 292 case CPUCLOCK_PROF:
308 thread_group_cputime(p, &cputime); 293 thread_group_cputime(p, &cputime);
309 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 294 cpu->cpu = cputime.utime + cputime.stime;
310 break; 295 break;
311 case CPUCLOCK_VIRT: 296 case CPUCLOCK_VIRT:
312 thread_group_cputime(p, &cputime); 297 thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
470 unsigned long long sum_exec_runtime) 455 unsigned long long sum_exec_runtime)
471{ 456{
472 struct cpu_timer_list *timer, *next; 457 struct cpu_timer_list *timer, *next;
473 cputime_t ptime = cputime_add(utime, stime); 458 cputime_t ptime = utime + stime;
474 459
475 list_for_each_entry_safe(timer, next, head, entry) { 460 list_for_each_entry_safe(timer, next, head, entry) {
476 list_del_init(&timer->entry); 461 list_del_init(&timer->entry);
477 if (cputime_lt(timer->expires.cpu, ptime)) { 462 if (timer->expires.cpu < ptime) {
478 timer->expires.cpu = cputime_zero; 463 timer->expires.cpu = 0;
479 } else { 464 } else {
480 timer->expires.cpu = cputime_sub(timer->expires.cpu, 465 timer->expires.cpu -= ptime;
481 ptime);
482 } 466 }
483 } 467 }
484 468
485 ++head; 469 ++head;
486 list_for_each_entry_safe(timer, next, head, entry) { 470 list_for_each_entry_safe(timer, next, head, entry) {
487 list_del_init(&timer->entry); 471 list_del_init(&timer->entry);
488 if (cputime_lt(timer->expires.cpu, utime)) { 472 if (timer->expires.cpu < utime) {
489 timer->expires.cpu = cputime_zero; 473 timer->expires.cpu = 0;
490 } else { 474 } else {
491 timer->expires.cpu = cputime_sub(timer->expires.cpu, 475 timer->expires.cpu -= utime;
492 utime);
493 } 476 }
494 } 477 }
495 478
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
520 struct signal_struct *const sig = tsk->signal; 503 struct signal_struct *const sig = tsk->signal;
521 504
522 cleanup_timers(tsk->signal->cpu_timers, 505 cleanup_timers(tsk->signal->cpu_timers,
523 cputime_add(tsk->utime, sig->utime), 506 tsk->utime + sig->utime, tsk->stime + sig->stime,
524 cputime_add(tsk->stime, sig->stime),
525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
526} 508}
527 509
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
540 522
541static inline int expires_gt(cputime_t expires, cputime_t new_exp) 523static inline int expires_gt(cputime_t expires, cputime_t new_exp)
542{ 524{
543 return cputime_eq(expires, cputime_zero) || 525 return expires == 0 || expires > new_exp;
544 cputime_gt(expires, new_exp);
545} 526}
546 527
547/* 528/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
651 default: 632 default:
652 return -EINVAL; 633 return -EINVAL;
653 case CPUCLOCK_PROF: 634 case CPUCLOCK_PROF:
654 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 635 cpu->cpu = cputime.utime + cputime.stime;
655 break; 636 break;
656 case CPUCLOCK_VIRT: 637 case CPUCLOCK_VIRT:
657 cpu->cpu = cputime.utime; 638 cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
918 unsigned long soft; 899 unsigned long soft;
919 900
920 maxfire = 20; 901 maxfire = 20;
921 tsk->cputime_expires.prof_exp = cputime_zero; 902 tsk->cputime_expires.prof_exp = 0;
922 while (!list_empty(timers)) { 903 while (!list_empty(timers)) {
923 struct cpu_timer_list *t = list_first_entry(timers, 904 struct cpu_timer_list *t = list_first_entry(timers,
924 struct cpu_timer_list, 905 struct cpu_timer_list,
925 entry); 906 entry);
926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 907 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
927 tsk->cputime_expires.prof_exp = t->expires.cpu; 908 tsk->cputime_expires.prof_exp = t->expires.cpu;
928 break; 909 break;
929 } 910 }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
933 914
934 ++timers; 915 ++timers;
935 maxfire = 20; 916 maxfire = 20;
936 tsk->cputime_expires.virt_exp = cputime_zero; 917 tsk->cputime_expires.virt_exp = 0;
937 while (!list_empty(timers)) { 918 while (!list_empty(timers)) {
938 struct cpu_timer_list *t = list_first_entry(timers, 919 struct cpu_timer_list *t = list_first_entry(timers,
939 struct cpu_timer_list, 920 struct cpu_timer_list,
940 entry); 921 entry);
941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 922 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
942 tsk->cputime_expires.virt_exp = t->expires.cpu; 923 tsk->cputime_expires.virt_exp = t->expires.cpu;
943 break; 924 break;
944 } 925 }
@@ -1009,20 +990,19 @@ static u32 onecputick;
1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 990static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1010 cputime_t *expires, cputime_t cur_time, int signo) 991 cputime_t *expires, cputime_t cur_time, int signo)
1011{ 992{
1012 if (cputime_eq(it->expires, cputime_zero)) 993 if (!it->expires)
1013 return; 994 return;
1014 995
1015 if (cputime_ge(cur_time, it->expires)) { 996 if (cur_time >= it->expires) {
1016 if (!cputime_eq(it->incr, cputime_zero)) { 997 if (it->incr) {
1017 it->expires = cputime_add(it->expires, it->incr); 998 it->expires += it->incr;
1018 it->error += it->incr_error; 999 it->error += it->incr_error;
1019 if (it->error >= onecputick) { 1000 if (it->error >= onecputick) {
1020 it->expires = cputime_sub(it->expires, 1001 it->expires -= cputime_one_jiffy;
1021 cputime_one_jiffy);
1022 it->error -= onecputick; 1002 it->error -= onecputick;
1023 } 1003 }
1024 } else { 1004 } else {
1025 it->expires = cputime_zero; 1005 it->expires = 0;
1026 } 1006 }
1027 1007
1028 trace_itimer_expire(signo == SIGPROF ? 1008 trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1011 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1032 } 1012 }
1033 1013
1034 if (!cputime_eq(it->expires, cputime_zero) && 1014 if (it->expires && (!*expires || it->expires < *expires)) {
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
1037 *expires = it->expires; 1015 *expires = it->expires;
1038 } 1016 }
1039} 1017}
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1048 */ 1026 */
1049static inline int task_cputime_zero(const struct task_cputime *cputime) 1027static inline int task_cputime_zero(const struct task_cputime *cputime)
1050{ 1028{
1051 if (cputime_eq(cputime->utime, cputime_zero) && 1029 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1054 return 1; 1030 return 1;
1055 return 0; 1031 return 0;
1056} 1032}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
1076 */ 1052 */
1077 thread_group_cputimer(tsk, &cputime); 1053 thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
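
A recurring detail in the conversions above is that a cputime value of 0 now doubles as "not armed", so each comparison special-cases zero before using ordinary < or >=, as expires_gt() and check_cpu_itimer() do. The convention in isolation (a sketch, not a function from this commit):

    /* Sketch: with 0 reserved for "disarmed", remember the earliest armed
     * expiry seen so far. */
    typedef unsigned long cputime_sketch_t;

    static void track_earliest(cputime_sketch_t *earliest, cputime_sketch_t expires)
    {
            if (!expires)                           /* disarmed: ignore */
                    return;
            if (!*earliest || expires < *earliest)  /* 0 here means "none yet" */
                    *earliest = expires;
    }
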
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a6b0503574ee..6d6d28870335 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -43,8 +43,6 @@ int in_suspend __nosavedata;
43enum { 43enum {
44 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
45 HIBERNATION_PLATFORM, 45 HIBERNATION_PLATFORM,
46 HIBERNATION_TEST,
47 HIBERNATION_TESTPROC,
48 HIBERNATION_SHUTDOWN, 46 HIBERNATION_SHUTDOWN,
49 HIBERNATION_REBOOT, 47 HIBERNATION_REBOOT,
50 /* keep last */ 48 /* keep last */
@@ -55,7 +53,7 @@ enum {
55 53
56static int hibernation_mode = HIBERNATION_SHUTDOWN; 54static int hibernation_mode = HIBERNATION_SHUTDOWN;
57 55
58static bool freezer_test_done; 56bool freezer_test_done;
59 57
60static const struct platform_hibernation_ops *hibernation_ops; 58static const struct platform_hibernation_ops *hibernation_ops;
61 59
@@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
71 WARN_ON(1); 69 WARN_ON(1);
72 return; 70 return;
73 } 71 }
74 mutex_lock(&pm_mutex); 72 lock_system_sleep();
75 hibernation_ops = ops; 73 hibernation_ops = ops;
76 if (ops) 74 if (ops)
77 hibernation_mode = HIBERNATION_PLATFORM; 75 hibernation_mode = HIBERNATION_PLATFORM;
78 else if (hibernation_mode == HIBERNATION_PLATFORM) 76 else if (hibernation_mode == HIBERNATION_PLATFORM)
79 hibernation_mode = HIBERNATION_SHUTDOWN; 77 hibernation_mode = HIBERNATION_SHUTDOWN;
80 78
81 mutex_unlock(&pm_mutex); 79 unlock_system_sleep();
82} 80}
83 81
84static bool entering_platform_hibernation; 82static bool entering_platform_hibernation;
@@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void)
96 mdelay(5000); 94 mdelay(5000);
97} 95}
98 96
99static int hibernation_testmode(int mode)
100{
101 if (hibernation_mode == mode) {
102 hibernation_debug_sleep();
103 return 1;
104 }
105 return 0;
106}
107
108static int hibernation_test(int level) 97static int hibernation_test(int level)
109{ 98{
110 if (pm_test_level == level) { 99 if (pm_test_level == level) {
@@ -114,7 +103,6 @@ static int hibernation_test(int level)
114 return 0; 103 return 0;
115} 104}
116#else /* !CONFIG_PM_DEBUG */ 105#else /* !CONFIG_PM_DEBUG */
117static int hibernation_testmode(int mode) { return 0; }
118static int hibernation_test(int level) { return 0; } 106static int hibernation_test(int level) { return 0; }
119#endif /* !CONFIG_PM_DEBUG */ 107#endif /* !CONFIG_PM_DEBUG */
120 108
@@ -278,8 +266,7 @@ static int create_image(int platform_mode)
278 goto Platform_finish; 266 goto Platform_finish;
279 267
280 error = disable_nonboot_cpus(); 268 error = disable_nonboot_cpus();
281 if (error || hibernation_test(TEST_CPUS) 269 if (error || hibernation_test(TEST_CPUS))
282 || hibernation_testmode(HIBERNATION_TEST))
283 goto Enable_cpus; 270 goto Enable_cpus;
284 271
285 local_irq_disable(); 272 local_irq_disable();
@@ -333,7 +320,7 @@ static int create_image(int platform_mode)
333 */ 320 */
334int hibernation_snapshot(int platform_mode) 321int hibernation_snapshot(int platform_mode)
335{ 322{
336 pm_message_t msg = PMSG_RECOVER; 323 pm_message_t msg;
337 int error; 324 int error;
338 325
339 error = platform_begin(platform_mode); 326 error = platform_begin(platform_mode);
@@ -349,8 +336,7 @@ int hibernation_snapshot(int platform_mode)
349 if (error) 336 if (error)
350 goto Cleanup; 337 goto Cleanup;
351 338
352 if (hibernation_test(TEST_FREEZER) || 339 if (hibernation_test(TEST_FREEZER)) {
353 hibernation_testmode(HIBERNATION_TESTPROC)) {
354 340
355 /* 341 /*
356 * Indicate to the caller that we are returning due to a 342 * Indicate to the caller that we are returning due to a
@@ -362,26 +348,26 @@ int hibernation_snapshot(int platform_mode)
362 348
363 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
364 if (error) { 350 if (error) {
365 dpm_complete(msg); 351 dpm_complete(PMSG_RECOVER);
366 goto Cleanup; 352 goto Cleanup;
367 } 353 }
368 354
369 suspend_console(); 355 suspend_console();
370 pm_restrict_gfp_mask(); 356 pm_restrict_gfp_mask();
357
371 error = dpm_suspend(PMSG_FREEZE); 358 error = dpm_suspend(PMSG_FREEZE);
372 if (error)
373 goto Recover_platform;
374 359
375 if (hibernation_test(TEST_DEVICES)) 360 if (error || hibernation_test(TEST_DEVICES))
376 goto Recover_platform; 361 platform_recover(platform_mode);
362 else
363 error = create_image(platform_mode);
377 364
378 error = create_image(platform_mode);
379 /* 365 /*
380 * Control returns here (1) after the image has been created or the 366 * In the case that we call create_image() above, the control
367 * returns here (1) after the image has been created or the
381 * image creation has failed and (2) after a successful restore. 368 * image creation has failed and (2) after a successful restore.
382 */ 369 */
383 370
384 Resume_devices:
385 /* We may need to release the preallocated image pages here. */ 371 /* We may need to release the preallocated image pages here. */
386 if (error || !in_suspend) 372 if (error || !in_suspend)
387 swsusp_free(); 373 swsusp_free();
@@ -399,10 +385,6 @@ int hibernation_snapshot(int platform_mode)
399 platform_end(platform_mode); 385 platform_end(platform_mode);
400 return error; 386 return error;
401 387
402 Recover_platform:
403 platform_recover(platform_mode);
404 goto Resume_devices;
405
406 Cleanup: 388 Cleanup:
407 swsusp_free(); 389 swsusp_free();
408 goto Close; 390 goto Close;
@@ -590,9 +572,6 @@ int hibernation_platform_enter(void)
590static void power_down(void) 572static void power_down(void)
591{ 573{
592 switch (hibernation_mode) { 574 switch (hibernation_mode) {
593 case HIBERNATION_TEST:
594 case HIBERNATION_TESTPROC:
595 break;
596 case HIBERNATION_REBOOT: 575 case HIBERNATION_REBOOT:
597 kernel_restart(NULL); 576 kernel_restart(NULL);
598 break; 577 break;
@@ -611,17 +590,6 @@ static void power_down(void)
611 while(1); 590 while(1);
612} 591}
613 592
614static int prepare_processes(void)
615{
616 int error = 0;
617
618 if (freeze_processes()) {
619 error = -EBUSY;
620 thaw_processes();
621 }
622 return error;
623}
624
625/** 593/**
626 * hibernate - Carry out system hibernation, including saving the image. 594 * hibernate - Carry out system hibernation, including saving the image.
627 */ 595 */
@@ -629,7 +597,7 @@ int hibernate(void)
629{ 597{
630 int error; 598 int error;
631 599
632 mutex_lock(&pm_mutex); 600 lock_system_sleep();
633 /* The snapshot device should not be opened while we're running */ 601 /* The snapshot device should not be opened while we're running */
634 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 602 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
635 error = -EBUSY; 603 error = -EBUSY;
@@ -654,7 +622,7 @@ int hibernate(void)
654 sys_sync(); 622 sys_sync();
655 printk("done.\n"); 623 printk("done.\n");
656 624
657 error = prepare_processes(); 625 error = freeze_processes();
658 if (error) 626 if (error)
659 goto Finish; 627 goto Finish;
660 628
@@ -697,7 +665,7 @@ int hibernate(void)
697 pm_restore_console(); 665 pm_restore_console();
698 atomic_inc(&snapshot_device_available); 666 atomic_inc(&snapshot_device_available);
699 Unlock: 667 Unlock:
700 mutex_unlock(&pm_mutex); 668 unlock_system_sleep();
701 return error; 669 return error;
702} 670}
703 671
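
hibernate() and the sysfs store handlers now take lock_system_sleep()/unlock_system_sleep() instead of locking pm_mutex directly; as of this series the wrappers also flag the caller for the freezer (PF_FREEZER_SKIP) so a task blocked on the sleep lock cannot stall freezing, though that part lives outside this diff. A sketch of the calling pattern with a hypothetical handler:

    /* Sketch: serialize a sleep-related state change with the system
     * sleep lock.  "my_sleep_knob_store" and my_setting are made up. */
    #include <linux/suspend.h>

    static int my_setting;

    static ssize_t my_sleep_knob_store(const char *buf, size_t n)
    {
            lock_system_sleep();
            my_setting = (buf[0] == '1');   /* state read by the sleep paths */
            unlock_system_sleep();
            return n;
    }
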
@@ -811,11 +779,13 @@ static int software_resume(void)
811 goto close_finish; 779 goto close_finish;
812 780
813 error = create_basic_memory_bitmaps(); 781 error = create_basic_memory_bitmaps();
814 if (error) 782 if (error) {
783 usermodehelper_enable();
815 goto close_finish; 784 goto close_finish;
785 }
816 786
817 pr_debug("PM: Preparing processes for restore.\n"); 787 pr_debug("PM: Preparing processes for restore.\n");
818 error = prepare_processes(); 788 error = freeze_processes();
819 if (error) { 789 if (error) {
820 swsusp_close(FMODE_READ); 790 swsusp_close(FMODE_READ);
821 goto Done; 791 goto Done;
@@ -855,8 +825,6 @@ static const char * const hibernation_modes[] = {
855 [HIBERNATION_PLATFORM] = "platform", 825 [HIBERNATION_PLATFORM] = "platform",
856 [HIBERNATION_SHUTDOWN] = "shutdown", 826 [HIBERNATION_SHUTDOWN] = "shutdown",
857 [HIBERNATION_REBOOT] = "reboot", 827 [HIBERNATION_REBOOT] = "reboot",
858 [HIBERNATION_TEST] = "test",
859 [HIBERNATION_TESTPROC] = "testproc",
860}; 828};
861 829
862/* 830/*
@@ -865,17 +833,15 @@ static const char * const hibernation_modes[] = {
865 * Hibernation can be handled in several ways. There are a few different ways 833 * Hibernation can be handled in several ways. There are a few different ways
866 * to put the system into the sleep state: using the platform driver (e.g. ACPI 834 * to put the system into the sleep state: using the platform driver (e.g. ACPI
867 * or other hibernation_ops), powering it off or rebooting it (for testing 835 * or other hibernation_ops), powering it off or rebooting it (for testing
868 * mostly), or using one of the two available test modes. 836 * mostly).
869 * 837 *
870 * The sysfs file /sys/power/disk provides an interface for selecting the 838 * The sysfs file /sys/power/disk provides an interface for selecting the
871 * hibernation mode to use. Reading from this file causes the available modes 839 * hibernation mode to use. Reading from this file causes the available modes
872 * to be printed. There are 5 modes that can be supported: 840 * to be printed. There are 3 modes that can be supported:
873 * 841 *
874 * 'platform' 842 * 'platform'
875 * 'shutdown' 843 * 'shutdown'
876 * 'reboot' 844 * 'reboot'
877 * 'test'
878 * 'testproc'
879 * 845 *
880 * If a platform hibernation driver is in use, 'platform' will be supported 846 * If a platform hibernation driver is in use, 'platform' will be supported
881 * and will be used by default. Otherwise, 'shutdown' will be used by default. 847 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -899,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
899 switch (i) { 865 switch (i) {
900 case HIBERNATION_SHUTDOWN: 866 case HIBERNATION_SHUTDOWN:
901 case HIBERNATION_REBOOT: 867 case HIBERNATION_REBOOT:
902 case HIBERNATION_TEST:
903 case HIBERNATION_TESTPROC:
904 break; 868 break;
905 case HIBERNATION_PLATFORM: 869 case HIBERNATION_PLATFORM:
906 if (hibernation_ops) 870 if (hibernation_ops)
@@ -929,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
929 p = memchr(buf, '\n', n); 893 p = memchr(buf, '\n', n);
930 len = p ? p - buf : n; 894 len = p ? p - buf : n;
931 895
932 mutex_lock(&pm_mutex); 896 lock_system_sleep();
933 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 897 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
934 if (len == strlen(hibernation_modes[i]) 898 if (len == strlen(hibernation_modes[i])
935 && !strncmp(buf, hibernation_modes[i], len)) { 899 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -941,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
941 switch (mode) { 905 switch (mode) {
942 case HIBERNATION_SHUTDOWN: 906 case HIBERNATION_SHUTDOWN:
943 case HIBERNATION_REBOOT: 907 case HIBERNATION_REBOOT:
944 case HIBERNATION_TEST:
945 case HIBERNATION_TESTPROC:
946 hibernation_mode = mode; 908 hibernation_mode = mode;
947 break; 909 break;
948 case HIBERNATION_PLATFORM: 910 case HIBERNATION_PLATFORM:
@@ -957,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
957 if (!error) 919 if (!error)
958 pr_debug("PM: Hibernation mode set to '%s'\n", 920 pr_debug("PM: Hibernation mode set to '%s'\n",
959 hibernation_modes[mode]); 921 hibernation_modes[mode]);
960 mutex_unlock(&pm_mutex); 922 unlock_system_sleep();
961 return error ? error : n; 923 return error ? error : n;
962} 924}
963 925
@@ -984,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
984 if (maj != MAJOR(res) || min != MINOR(res)) 946 if (maj != MAJOR(res) || min != MINOR(res))
985 goto out; 947 goto out;
986 948
987 mutex_lock(&pm_mutex); 949 lock_system_sleep();
988 swsusp_resume_device = res; 950 swsusp_resume_device = res;
989 mutex_unlock(&pm_mutex); 951 unlock_system_sleep();
990 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 952 printk(KERN_INFO "PM: Starting manual resume from disk\n");
991 noresume = 0; 953 noresume = 0;
992 software_resume(); 954 software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 36e0f0903c32..9824b41e5a18 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
@@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
116 p = memchr(buf, '\n', n); 116 p = memchr(buf, '\n', n);
117 len = p ? p - buf : n; 117 len = p ? p - buf : n;
118 118
119 mutex_lock(&pm_mutex); 119 lock_system_sleep();
120 120
121 level = TEST_FIRST; 121 level = TEST_FIRST;
122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
126 break; 126 break;
127 } 127 }
128 128
129 mutex_unlock(&pm_mutex); 129 unlock_system_sleep();
130 130
131 return error ? error : n; 131 return error ? error : n;
132} 132}
@@ -240,7 +240,7 @@ struct kobject *power_kobj;
240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
241 * 'disk' (Suspend-to-Disk). 241 * 'disk' (Suspend-to-Disk).
242 * 242 *
243 * store() accepts one of those strings, translates it into the 243 * store() accepts one of those strings, translates it into the
244 * proper enumerated value, and initiates a suspend transition. 244 * proper enumerated value, and initiates a suspend transition.
245 */ 245 */
246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
282 /* First, check if we are requested to hibernate */ 282 /* First, check if we are requested to hibernate */
283 if (len == 4 && !strncmp(buf, "disk", len)) { 283 if (len == 4 && !strncmp(buf, "disk", len)) {
284 error = hibernate(); 284 error = hibernate();
285 goto Exit; 285 goto Exit;
286 } 286 }
287 287
288#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 23a2db1ec442..0c4defe6d3b8 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
53extern int hibernation_snapshot(int platform_mode); 55extern int hibernation_snapshot(int platform_mode);
54extern int hibernation_restore(int platform_mode); 56extern int hibernation_restore(int platform_mode);
55extern int hibernation_platform_enter(void); 57extern int hibernation_platform_enter(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index addbbe5531bc..eeca00311f39 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,16 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezable(struct task_struct * p) 25static int try_to_freeze_tasks(bool user_only)
26{
27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) ||
29 (p->exit_state != 0))
30 return 0;
31 return 1;
32}
33
34static int try_to_freeze_tasks(bool sig_only)
35{ 26{
36 struct task_struct *g, *p; 27 struct task_struct *g, *p;
37 unsigned long end_time; 28 unsigned long end_time;
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
46 37
47 end_time = jiffies + TIMEOUT; 38 end_time = jiffies + TIMEOUT;
48 39
49 if (!sig_only) 40 if (!user_only)
50 freeze_workqueues_begin(); 41 freeze_workqueues_begin();
51 42
52 while (true) { 43 while (true) {
53 todo = 0; 44 todo = 0;
54 read_lock(&tasklist_lock); 45 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 46 do_each_thread(g, p) {
56 if (frozen(p) || !freezable(p)) 47 if (p == current || !freeze_task(p))
57 continue;
58
59 if (!freeze_task(p, sig_only))
60 continue; 48 continue;
61 49
62 /* 50 /*
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
77 } while_each_thread(g, p); 65 } while_each_thread(g, p);
78 read_unlock(&tasklist_lock); 66 read_unlock(&tasklist_lock);
79 67
80 if (!sig_only) { 68 if (!user_only) {
81 wq_busy = freeze_workqueues_busy(); 69 wq_busy = freeze_workqueues_busy();
82 todo += wq_busy; 70 todo += wq_busy;
83 } 71 }
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
103 elapsed_csecs = elapsed_csecs64; 91 elapsed_csecs = elapsed_csecs64;
104 92
105 if (todo) { 93 if (todo) {
106 /* This does not unfreeze processes that are already frozen
107 * (we have slightly ugly calling convention in that respect,
108 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests.
110 */
111 printk("\n"); 94 printk("\n");
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 95 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 96 "(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
115 elapsed_csecs / 100, elapsed_csecs % 100, 98 elapsed_csecs / 100, elapsed_csecs % 100,
116 todo - wq_busy, wq_busy); 99 todo - wq_busy, wq_busy);
117 100
118 thaw_workqueues();
119
120 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 102 do_each_thread(g, p) {
122 task_lock(p); 103 if (!wakeup && !freezer_should_skip(p) &&
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 104 p != current && freezing(p) && !frozen(p))
124 sched_show_task(p); 105 sched_show_task(p);
125 cancel_freezing(p);
126 task_unlock(p);
127 } while_each_thread(g, p); 106 } while_each_thread(g, p);
128 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
129 } else { 108 } else {
@@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only)
136 115
137/** 116/**
138 * freeze_processes - Signal user space processes to enter the refrigerator. 117 * freeze_processes - Signal user space processes to enter the refrigerator.
118 *
119 * On success, returns 0. On failure, -errno and system is fully thawed.
139 */ 120 */
140int freeze_processes(void) 121int freeze_processes(void)
141{ 122{
142 int error; 123 int error;
143 124
125 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt);
127
144 printk("Freezing user space processes ... "); 128 printk("Freezing user space processes ... ");
129 pm_freezing = true;
145 error = try_to_freeze_tasks(true); 130 error = try_to_freeze_tasks(true);
146 if (!error) { 131 if (!error) {
147 printk("done."); 132 printk("done.");
@@ -150,17 +135,22 @@ int freeze_processes(void)
150 printk("\n"); 135 printk("\n");
151 BUG_ON(in_atomic()); 136 BUG_ON(in_atomic());
152 137
138 if (error)
139 thaw_processes();
153 return error; 140 return error;
154} 141}
155 142
156/** 143/**
157 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. 144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
145 *
146 * On success, returns 0. On failure, -errno and system is fully thawed.
158 */ 147 */
159int freeze_kernel_threads(void) 148int freeze_kernel_threads(void)
160{ 149{
161 int error; 150 int error;
162 151
163 printk("Freezing remaining freezable tasks ... "); 152 printk("Freezing remaining freezable tasks ... ");
153 pm_nosig_freezing = true;
164 error = try_to_freeze_tasks(false); 154 error = try_to_freeze_tasks(false);
165 if (!error) 155 if (!error)
166 printk("done."); 156 printk("done.");
@@ -168,38 +158,52 @@ int freeze_kernel_threads(void)
168 printk("\n"); 158 printk("\n");
169 BUG_ON(in_atomic()); 159 BUG_ON(in_atomic());
170 160
161 if (error)
162 thaw_processes();
171 return error; 163 return error;
172} 164}
173 165
174static void thaw_tasks(bool nosig_only) 166void thaw_processes(void)
175{ 167{
176 struct task_struct *g, *p; 168 struct task_struct *g, *p;
177 169
178 read_lock(&tasklist_lock); 170 if (pm_freezing)
179 do_each_thread(g, p) { 171 atomic_dec(&system_freezing_cnt);
180 if (!freezable(p)) 172 pm_freezing = false;
181 continue; 173 pm_nosig_freezing = false;
182 174
183 if (nosig_only && should_send_signal(p)) 175 oom_killer_enable();
184 continue; 176
177 printk("Restarting tasks ... ");
185 178
186 if (cgroup_freezing_or_frozen(p)) 179 thaw_workqueues();
187 continue;
188 180
189 thaw_process(p); 181 read_lock(&tasklist_lock);
182 do_each_thread(g, p) {
183 __thaw_task(p);
190 } while_each_thread(g, p); 184 } while_each_thread(g, p);
191 read_unlock(&tasklist_lock); 185 read_unlock(&tasklist_lock);
186
187 schedule();
188 printk("done.\n");
192} 189}
193 190
194void thaw_processes(void) 191void thaw_kernel_threads(void)
195{ 192{
196 oom_killer_enable(); 193 struct task_struct *g, *p;
194
195 pm_nosig_freezing = false;
196 printk("Restarting kernel threads ... ");
197 197
198 printk("Restarting tasks ... ");
199 thaw_workqueues(); 198 thaw_workqueues();
200 thaw_tasks(true); 199
201 thaw_tasks(false); 200 read_lock(&tasklist_lock);
201 do_each_thread(g, p) {
202 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
203 __thaw_task(p);
204 } while_each_thread(g, p);
205 read_unlock(&tasklist_lock);
206
202 schedule(); 207 schedule();
203 printk("done.\n"); 208 printk("done.\n");
204} 209}
205
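
For orientation, a minimal sketch (not taken from this patch; names are illustrative) of how a freezable kernel thread cooperates with the reworked freezer above: it opts in with set_freezable() and parks in try_to_freeze() whenever freezing is in progress.

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

/* Hypothetical worker thread, shown only to illustrate the API. */
static int example_freezable_thread(void *unused)
{
        set_freezable();                        /* opt in to the freezer */
        while (!kthread_should_stop()) {
                try_to_freeze();                /* sleep here while tasks are frozen */
                /* ... one unit of real work would go here ... */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}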
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cbe2c1441392..6a768e537001 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone)
812 unsigned int res; 812 unsigned int res;
813 813
814 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 814 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
815 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 815 res += DIV_ROUND_UP(res * sizeof(struct bm_block),
816 LINKED_PAGE_DATA_SIZE);
816 return 2 * res; 817 return 2 * res;
817} 818}
818 819
@@ -858,6 +859,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
858 PageReserved(page)) 859 PageReserved(page))
859 return NULL; 860 return NULL;
860 861
862 if (page_is_guard(page))
863 return NULL;
864
861 return page; 865 return page;
862} 866}
863 867
@@ -920,6 +924,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
920 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 924 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
921 return NULL; 925 return NULL;
922 926
927 if (page_is_guard(page))
928 return NULL;
929
923 return page; 930 return page;
924} 931}
925 932
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4953dc054c53..4fd51beed879 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
45 mutex_lock(&pm_mutex); 45 lock_system_sleep();
46 suspend_ops = ops; 46 suspend_ops = ops;
47 mutex_unlock(&pm_mutex); 47 unlock_system_sleep();
48} 48}
49EXPORT_SYMBOL_GPL(suspend_set_ops); 49EXPORT_SYMBOL_GPL(suspend_set_ops);
50 50
@@ -106,13 +106,11 @@ static int suspend_prepare(void)
106 goto Finish; 106 goto Finish;
107 107
108 error = suspend_freeze_processes(); 108 error = suspend_freeze_processes();
109 if (error) { 109 if (!error)
110 suspend_stats.failed_freeze++;
111 dpm_save_failed_step(SUSPEND_FREEZE);
112 } else
113 return 0; 110 return 0;
114 111
115 suspend_thaw_processes(); 112 suspend_stats.failed_freeze++;
113 dpm_save_failed_step(SUSPEND_FREEZE);
116 usermodehelper_enable(); 114 usermodehelper_enable();
117 Finish: 115 Finish:
118 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
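
The substitution in the hunk above (and repeated in kernel/power/user.c further down) simply routes callers through the system-sleep lock wrappers instead of touching pm_mutex directly; an illustrative caller, not taken from the patch:

#include <linux/suspend.h>

/* Hypothetical function: any code that previously took pm_mutex itself. */
static void example_update_pm_state(void)
{
        lock_system_sleep();            /* was: mutex_lock(&pm_mutex) */
        /* ... update suspend/hibernation related state here ... */
        unlock_system_sleep();          /* was: mutex_unlock(&pm_mutex) */
}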
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11a594c4ba25..8742fd013a94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
22#include <linux/bio.h> 21#include <linux/bio.h>
23#include <linux/blkdev.h> 22#include <linux/blkdev.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -774,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
774 773
775 pr_debug("PM: Free swap pages: %u\n", free_swap); 774 pr_debug("PM: Free swap pages: %u\n", free_swap);
776 775
777 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? 776 required = PAGES_FOR_IO + nr_pages;
778 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
779 return free_swap > required; 777 return free_swap > required;
780} 778}
781 779
@@ -803,10 +801,12 @@ int swsusp_write(unsigned int flags)
803 printk(KERN_ERR "PM: Cannot get swap writer\n"); 801 printk(KERN_ERR "PM: Cannot get swap writer\n");
804 return error; 802 return error;
805 } 803 }
806 if (!enough_swap(pages, flags)) { 804 if (flags & SF_NOCOMPRESS_MODE) {
807 printk(KERN_ERR "PM: Not enough free swap\n"); 805 if (!enough_swap(pages, flags)) {
808 error = -ENOSPC; 806 printk(KERN_ERR "PM: Not enough free swap\n");
809 goto out_finish; 807 error = -ENOSPC;
808 goto out_finish;
809 }
810 } 810 }
811 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 811 memset(&snapshot, 0, sizeof(struct snapshot_handle));
812 error = snapshot_read_next(&snapshot); 812 error = snapshot_read_next(&snapshot);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6d8f535c2b88..e5a21a857302 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -21,6 +21,7 @@
21#include <linux/swapops.h> 21#include <linux/swapops.h>
22#include <linux/pm.h> 22#include <linux/pm.h>
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/compat.h>
24#include <linux/console.h> 25#include <linux/console.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -30,28 +31,6 @@
30 31
31#include "power.h" 32#include "power.h"
32 33
33/*
34 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
35 * will be removed in the future. They are only preserved here for
36 * compatibility with existing userland utilities.
37 */
38#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
39#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
40
41#define PMOPS_PREPARE 1
42#define PMOPS_ENTER 2
43#define PMOPS_FINISH 3
44
45/*
46 * NOTE: The following ioctl definitions are wrong and have been replaced with
47 * correct ones. They are only preserved here for compatibility with existing
48 * userland utilities and will be removed in the future.
49 */
50#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
51#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
52#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
53#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
54
55 34
56#define SNAPSHOT_MINOR 231 35#define SNAPSHOT_MINOR 231
57 36
@@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
71 struct snapshot_data *data; 50 struct snapshot_data *data;
72 int error; 51 int error;
73 52
74 mutex_lock(&pm_mutex); 53 lock_system_sleep();
75 54
76 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 55 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
77 error = -EBUSY; 56 error = -EBUSY;
@@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
123 data->platform_support = 0; 102 data->platform_support = 0;
124 103
125 Unlock: 104 Unlock:
126 mutex_unlock(&pm_mutex); 105 unlock_system_sleep();
127 106
128 return error; 107 return error;
129} 108}
@@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
132{ 111{
133 struct snapshot_data *data; 112 struct snapshot_data *data;
134 113
135 mutex_lock(&pm_mutex); 114 lock_system_sleep();
136 115
137 swsusp_free(); 116 swsusp_free();
138 free_basic_memory_bitmaps(); 117 free_basic_memory_bitmaps();
@@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
146 PM_POST_HIBERNATION : PM_POST_RESTORE); 125 PM_POST_HIBERNATION : PM_POST_RESTORE);
147 atomic_inc(&snapshot_device_available); 126 atomic_inc(&snapshot_device_available);
148 127
149 mutex_unlock(&pm_mutex); 128 unlock_system_sleep();
150 129
151 return 0; 130 return 0;
152} 131}
@@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
158 ssize_t res; 137 ssize_t res;
159 loff_t pg_offp = *offp & ~PAGE_MASK; 138 loff_t pg_offp = *offp & ~PAGE_MASK;
160 139
161 mutex_lock(&pm_mutex); 140 lock_system_sleep();
162 141
163 data = filp->private_data; 142 data = filp->private_data;
164 if (!data->ready) { 143 if (!data->ready) {
@@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
179 *offp += res; 158 *offp += res;
180 159
181 Unlock: 160 Unlock:
182 mutex_unlock(&pm_mutex); 161 unlock_system_sleep();
183 162
184 return res; 163 return res;
185} 164}
@@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
191 ssize_t res; 170 ssize_t res;
192 loff_t pg_offp = *offp & ~PAGE_MASK; 171 loff_t pg_offp = *offp & ~PAGE_MASK;
193 172
194 mutex_lock(&pm_mutex); 173 lock_system_sleep();
195 174
196 data = filp->private_data; 175 data = filp->private_data;
197 176
@@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
208 if (res > 0) 187 if (res > 0)
209 *offp += res; 188 *offp += res;
210unlock: 189unlock:
211 mutex_unlock(&pm_mutex); 190 unlock_system_sleep();
212 191
213 return res; 192 return res;
214} 193}
215 194
216static void snapshot_deprecated_ioctl(unsigned int cmd)
217{
218 if (printk_ratelimit())
219 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
220 "be removed soon, update your suspend-to-disk "
221 "utilities\n",
222 __builtin_return_address(0), cmd);
223}
224
225static long snapshot_ioctl(struct file *filp, unsigned int cmd, 195static long snapshot_ioctl(struct file *filp, unsigned int cmd,
226 unsigned long arg) 196 unsigned long arg)
227{ 197{
@@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
257 break; 227 break;
258 228
259 error = freeze_processes(); 229 error = freeze_processes();
260 if (error) { 230 if (error)
261 thaw_processes();
262 usermodehelper_enable(); 231 usermodehelper_enable();
263 } 232 else
264 if (!error)
265 data->frozen = 1; 233 data->frozen = 1;
266 break; 234 break;
267 235
@@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
274 data->frozen = 0; 242 data->frozen = 0;
275 break; 243 break;
276 244
277 case SNAPSHOT_ATOMIC_SNAPSHOT:
278 snapshot_deprecated_ioctl(cmd);
279 case SNAPSHOT_CREATE_IMAGE: 245 case SNAPSHOT_CREATE_IMAGE:
280 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 246 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
281 error = -EPERM; 247 error = -EPERM;
@@ -283,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
283 } 249 }
284 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
285 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
286 if (!error) 252 if (!error) {
287 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
288 if (!error) 254 if (!error && !freezer_test_done)
289 data->ready = 1; 255 data->ready = 1;
256 if (freezer_test_done) {
257 freezer_test_done = false;
258 thaw_processes();
259 }
260 }
290 break; 261 break;
291 262
292 case SNAPSHOT_ATOMIC_RESTORE: 263 case SNAPSHOT_ATOMIC_RESTORE:
@@ -303,10 +274,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
303 swsusp_free(); 274 swsusp_free();
304 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 275 memset(&data->handle, 0, sizeof(struct snapshot_handle));
305 data->ready = 0; 276 data->ready = 0;
277 /*
278 * It is necessary to thaw kernel threads here, because
279 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
280 * SNAPSHOT_FREE. In that case, if kernel threads were not
281 * thawed, the preallocation of memory carried out by
282 * hibernation_snapshot() might run into problems (i.e. it
283 * might fail or even deadlock).
284 */
285 thaw_kernel_threads();
306 break; 286 break;
307 287
308 case SNAPSHOT_SET_IMAGE_SIZE:
309 snapshot_deprecated_ioctl(cmd);
310 case SNAPSHOT_PREF_IMAGE_SIZE: 288 case SNAPSHOT_PREF_IMAGE_SIZE:
311 image_size = arg; 289 image_size = arg;
312 break; 290 break;
@@ -321,16 +299,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 error = put_user(size, (loff_t __user *)arg); 299 error = put_user(size, (loff_t __user *)arg);
322 break; 300 break;
323 301
324 case SNAPSHOT_AVAIL_SWAP:
325 snapshot_deprecated_ioctl(cmd);
326 case SNAPSHOT_AVAIL_SWAP_SIZE: 302 case SNAPSHOT_AVAIL_SWAP_SIZE:
327 size = count_swap_pages(data->swap, 1); 303 size = count_swap_pages(data->swap, 1);
328 size <<= PAGE_SHIFT; 304 size <<= PAGE_SHIFT;
329 error = put_user(size, (loff_t __user *)arg); 305 error = put_user(size, (loff_t __user *)arg);
330 break; 306 break;
331 307
332 case SNAPSHOT_GET_SWAP_PAGE:
333 snapshot_deprecated_ioctl(cmd);
334 case SNAPSHOT_ALLOC_SWAP_PAGE: 308 case SNAPSHOT_ALLOC_SWAP_PAGE:
335 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 309 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
336 error = -ENODEV; 310 error = -ENODEV;
@@ -353,27 +327,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
353 free_all_swap_pages(data->swap); 327 free_all_swap_pages(data->swap);
354 break; 328 break;
355 329
356 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
357 snapshot_deprecated_ioctl(cmd);
358 if (!swsusp_swap_in_use()) {
359 /*
360 * User space encodes device types as two-byte values,
361 * so we need to recode them
362 */
363 if (old_decode_dev(arg)) {
364 data->swap = swap_type_of(old_decode_dev(arg),
365 0, NULL);
366 if (data->swap < 0)
367 error = -ENODEV;
368 } else {
369 data->swap = -1;
370 error = -EINVAL;
371 }
372 } else {
373 error = -EPERM;
374 }
375 break;
376
377 case SNAPSHOT_S2RAM: 330 case SNAPSHOT_S2RAM:
378 if (!data->frozen) { 331 if (!data->frozen) {
379 error = -EPERM; 332 error = -EPERM;
@@ -396,33 +349,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
396 error = hibernation_platform_enter(); 349 error = hibernation_platform_enter();
397 break; 350 break;
398 351
399 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
400 snapshot_deprecated_ioctl(cmd);
401 error = -EINVAL;
402
403 switch (arg) {
404
405 case PMOPS_PREPARE:
406 data->platform_support = 1;
407 error = 0;
408 break;
409
410 case PMOPS_ENTER:
411 if (data->platform_support)
412 error = hibernation_platform_enter();
413 break;
414
415 case PMOPS_FINISH:
416 if (data->platform_support)
417 error = 0;
418 break;
419
420 default:
421 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
422
423 }
424 break;
425
426 case SNAPSHOT_SET_SWAP_AREA: 352 case SNAPSHOT_SET_SWAP_AREA:
427 if (swsusp_swap_in_use()) { 353 if (swsusp_swap_in_use()) {
428 error = -EPERM; 354 error = -EPERM;
@@ -464,6 +390,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
464 return error; 390 return error;
465} 391}
466 392
393#ifdef CONFIG_COMPAT
394
395struct compat_resume_swap_area {
396 compat_loff_t offset;
397 u32 dev;
398} __packed;
399
400static long
401snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
402{
403 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
404
405 switch (cmd) {
406 case SNAPSHOT_GET_IMAGE_SIZE:
407 case SNAPSHOT_AVAIL_SWAP_SIZE:
408 case SNAPSHOT_ALLOC_SWAP_PAGE: {
409 compat_loff_t __user *uoffset = compat_ptr(arg);
410 loff_t offset;
411 mm_segment_t old_fs;
412 int err;
413
414 old_fs = get_fs();
415 set_fs(KERNEL_DS);
416 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
417 set_fs(old_fs);
418 if (!err && put_user(offset, uoffset))
419 err = -EFAULT;
420 return err;
421 }
422
423 case SNAPSHOT_CREATE_IMAGE:
424 return snapshot_ioctl(file, cmd,
425 (unsigned long) compat_ptr(arg));
426
427 case SNAPSHOT_SET_SWAP_AREA: {
428 struct compat_resume_swap_area __user *u_swap_area =
429 compat_ptr(arg);
430 struct resume_swap_area swap_area;
431 mm_segment_t old_fs;
432 int err;
433
434 err = get_user(swap_area.offset, &u_swap_area->offset);
435 err |= get_user(swap_area.dev, &u_swap_area->dev);
436 if (err)
437 return -EFAULT;
438 old_fs = get_fs();
439 set_fs(KERNEL_DS);
440 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
441 (unsigned long) &swap_area);
442 set_fs(old_fs);
443 return err;
444 }
445
446 default:
447 return snapshot_ioctl(file, cmd, arg);
448 }
449}
450
451#endif /* CONFIG_COMPAT */
452
467static const struct file_operations snapshot_fops = { 453static const struct file_operations snapshot_fops = {
468 .open = snapshot_open, 454 .open = snapshot_open,
469 .release = snapshot_release, 455 .release = snapshot_release,
@@ -471,6 +457,9 @@ static const struct file_operations snapshot_fops = {
471 .write = snapshot_write, 457 .write = snapshot_write,
472 .llseek = no_llseek, 458 .llseek = no_llseek,
473 .unlocked_ioctl = snapshot_ioctl, 459 .unlocked_ioctl = snapshot_ioctl,
460#ifdef CONFIG_COMPAT
461 .compat_ioctl = snapshot_compat_ioctl,
462#endif
474}; 463};
475 464
476static struct miscdevice snapshot_device = { 465static struct miscdevice snapshot_device = {
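
With the .compat_ioctl handler wired up above, a 32-bit suspend utility running on a 64-bit kernel can issue the same snapshot ioctls as a native one. A hypothetical user-space sketch (not part of the patch), assuming <linux/suspend_ioctls.h> provides SNAPSHOT_GET_IMAGE_SIZE:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
        long long size;                 /* 64-bit, matching the kernel's loff_t */
        int fd = open("/dev/snapshot", O_RDONLY);

        if (fd < 0)
                return 1;
        if (ioctl(fd, SNAPSHOT_GET_IMAGE_SIZE, &size) == 0)
                printf("hibernation image size: %lld bytes\n", size);
        close(fd);
        return 0;
}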
diff --git a/kernel/printk.c b/kernel/printk.c
index 7982a0a841ea..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
199 unsigned long mem; 199 unsigned long mem;
200 200
201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR) 202 if (!mem)
203 return; 203 return;
204 new_log_buf = __va(mem); 204 new_log_buf = __va(mem);
205 } else { 205 } else {
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
521 } 521 }
522} 522}
523 523
524static int __read_mostly ignore_loglevel; 524static bool __read_mostly ignore_loglevel;
525 525
526static int __init ignore_loglevel_setup(char *str) 526static int __init ignore_loglevel_setup(char *str)
527{ 527{
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str)
532} 532}
533 533
534early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); 535module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" 536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
537 "print all kernel messages to the console."); 537 "print all kernel messages to the console.");
538 538
@@ -688,6 +688,7 @@ static void zap_locks(void)
688 688
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 debug_locks_off();
691 /* If a crash is occurring, make sure we can't deadlock */ 692 /* If a crash is occurring, make sure we can't deadlock */
692 raw_spin_lock_init(&logbuf_lock); 693 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 694 /* And make sure that we print immediately */
@@ -695,9 +696,9 @@ static void zap_locks(void)
695} 696}
696 697
697#if defined(CONFIG_PRINTK_TIME) 698#if defined(CONFIG_PRINTK_TIME)
698static int printk_time = 1; 699static bool printk_time = 1;
699#else 700#else
700static int printk_time = 0; 701static bool printk_time = 0;
701#endif 702#endif
702module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
703 704
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
840 boot_delay_msec(); 841 boot_delay_msec();
841 printk_delay(); 842 printk_delay();
842 843
843 preempt_disable();
844 /* This stops the holder of console_sem just where we want him */ 844 /* This stops the holder of console_sem just where we want him */
845 raw_local_irq_save(flags); 845 local_irq_save(flags);
846 this_cpu = smp_processor_id(); 846 this_cpu = smp_processor_id();
847 847
848 /* 848 /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 * recursion and return - but flag the recursion so that 856 * recursion and return - but flag the recursion so that
857 * it can be printed at the next appropriate moment: 857 * it can be printed at the next appropriate moment:
858 */ 858 */
859 if (!oops_in_progress) { 859 if (!oops_in_progress && !lockdep_recursing(current)) {
860 recursion_bug = 1; 860 recursion_bug = 1;
861 goto out_restore_irqs; 861 goto out_restore_irqs;
862 } 862 }
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
962 962
963 lockdep_on(); 963 lockdep_on();
964out_restore_irqs: 964out_restore_irqs:
965 raw_local_irq_restore(flags); 965 local_irq_restore(flags);
966 966
967 preempt_enable();
968 return printed_len; 967 return printed_len;
969} 968}
970EXPORT_SYMBOL(printk); 969EXPORT_SYMBOL(printk);
@@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1099 return -1; 1098 return -1;
1100} 1099}
1101 1100
1102int console_suspend_enabled = 1; 1101bool console_suspend_enabled = 1;
1103EXPORT_SYMBOL(console_suspend_enabled); 1102EXPORT_SYMBOL(console_suspend_enabled);
1104 1103
1105static int __init console_suspend_disable(char *str) 1104static int __init console_suspend_disable(char *str)
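
The int-to-bool churn in printk.c above follows the tree-wide rule that a module parameter declared with the "bool" type should be backed by a real C bool; a minimal sketch of the resulting convention (names are illustrative):

#include <linux/module.h>
#include <linux/moduleparam.h>

static bool example_flag __read_mostly;         /* previously "static int" */
module_param(example_flag, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(example_flag, "Illustrative boolean switch");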
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 24d04477b257..00ab2ca5ed11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child)
96 */ 96 */
97 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
98 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
99 child->signal->group_stop_count)) 99 child->signal->group_stop_count)) {
100 child->jobctl |= JOBCTL_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
101 101
102 /*
103 * This is only possible if this thread was cloned by the
104 * traced task running in the stopped group, set the signal
105 * for the future reports.
106 * FIXME: we should change ptrace_init_task() to handle this
107 * case.
108 */
109 if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
110 child->jobctl |= SIGSTOP;
111 }
112
102 /* 113 /*
103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 114 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
104 * @child in the butt. Note that @resume should be used iff @child 115 * @child in the butt. Note that @resume should be used iff @child
@@ -161,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
161 return ret; 172 return ret;
162} 173}
163 174
175static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
176{
177 if (mode & PTRACE_MODE_NOAUDIT)
178 return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
179 else
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181}
182
164int __ptrace_may_access(struct task_struct *task, unsigned int mode) 183int __ptrace_may_access(struct task_struct *task, unsigned int mode)
165{ 184{
166 const struct cred *cred = current_cred(), *tcred; 185 const struct cred *cred = current_cred(), *tcred;
@@ -187,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
187 cred->gid == tcred->sgid && 206 cred->gid == tcred->sgid &&
188 cred->gid == tcred->gid)) 207 cred->gid == tcred->gid))
189 goto ok; 208 goto ok;
190 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) 209 if (ptrace_has_cap(tcred->user->user_ns, mode))
191 goto ok; 210 goto ok;
192 rcu_read_unlock(); 211 rcu_read_unlock();
193 return -EPERM; 212 return -EPERM;
@@ -196,7 +215,7 @@ ok:
196 smp_rmb(); 215 smp_rmb();
197 if (task->mm) 216 if (task->mm)
198 dumpable = get_dumpable(task->mm); 217 dumpable = get_dumpable(task->mm);
199 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) 218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
200 return -EPERM; 219 return -EPERM;
201 220
202 return security_ptrace_access_check(task, mode); 221 return security_ptrace_access_check(task, mode);
@@ -266,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request,
266 task->ptrace = PT_PTRACED; 285 task->ptrace = PT_PTRACED;
267 if (seize) 286 if (seize)
268 task->ptrace |= PT_SEIZED; 287 task->ptrace |= PT_SEIZED;
269 if (task_ns_capable(task, CAP_SYS_PTRACE)) 288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
270 task->ptrace |= PT_PTRACE_CAP; 289 task->ptrace |= PT_PTRACE_CAP;
271 290
272 __ptrace_link(task, current); 291 __ptrace_link(task, current);
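
The new ptrace_has_cap() helper exists so that access checks can be performed without generating audit noise on denial. Assuming the companion header change defines PTRACE_MODE_NOAUDIT in <linux/ptrace.h>, a caller doing a quiet read-only check might look like this (illustrative only):

#include <linux/ptrace.h>
#include <linux/sched.h>

/* Hypothetical helper: deny silently instead of logging an audit record. */
static bool example_may_peek(struct task_struct *task)
{
        return ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
}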
diff --git a/kernel/rcu.h b/kernel/rcu.h
index f600868d550d..aa88baab5f78 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -30,6 +30,13 @@
30#endif /* #else #ifdef CONFIG_RCU_TRACE */ 30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31 31
32/* 32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the 41 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c5b98e565aee..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
93{ 93{
94 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
95 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
96 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
97} 99}
98EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
316}; 318};
317EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
318#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
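
rcu_read_lock_bh_held() is what backs rcu_dereference_bh() under lockdep, so the rcu_is_cpu_idle() test added above keeps an idle CPU from claiming that a BH read-side critical section is held. A hedged usage sketch with illustrative names:

#include <linux/rcupdate.h>

struct example_cfg {
        int value;
};

static struct example_cfg __rcu *example_cfg;   /* hypothetical shared pointer */

static int example_read_cfg(void)
{
        struct example_cfg *cfg;
        int v = -1;

        rcu_read_lock_bh();
        /* rcu_dereference_bh() consults rcu_read_lock_bh_held() when lockdep is on */
        cfg = rcu_dereference_bh(example_cfg);
        if (cfg)
                v = cfg->value;
        rcu_read_unlock_bh();
        return v;
}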
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 636af6d9c6e5..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state.
74 */ 98 */
75void rcu_exit_nohz(void) 99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
76{ 114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
135 */
136void rcu_idle_exit(void)
137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
175
176#endif /* #ifdef CONFIG_PROVE_RCU */
79 177
80#endif /* #ifdef CONFIG_NO_HZ */ 178/*
179 * Test whether the current CPU was interrupted from idle. Nested
180 * interrupts don't count, we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
126 232
127/* 233/*
128 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
129 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
130 */ 238 */
131void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
132{ 240{
133 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
134 (idle_cpu(cpu) &&
135 !in_softirq() &&
136 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
137 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
138 else if (!in_softirq()) 243 else if (!in_softirq())
139 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
154 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) { 260 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); 262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
158 return; 267 return;
159 } 268 }
160 269
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
183 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
184 } 293 }
185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); 295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
187} 298}
188 299
189static void rcu_process_callbacks(struct softirq_action *unused) 300static void rcu_process_callbacks(struct softirq_action *unused)
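
Conceptually (this is a sketch of the calling convention, not the arch code), the idle loop now brackets its quiescent window with the renamed hooks instead of rcu_enter_nohz()/rcu_exit_nohz():

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Conceptual sketch only; the real callers live in each arch's idle loop. */
static void example_idle_loop(void)
{
        while (1) {
                rcu_idle_enter();       /* extended quiescent state begins */
                while (!need_resched())
                        cpu_relax();    /* stand-in for the arch-specific idle body */
                rcu_idle_exit();        /* RCU read-side critical sections legal again */
                schedule();
        }
}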
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 2b0484a5dc28..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -312,8 +312,8 @@ static int rcu_boost(void)
312 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
314 314
315 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
316 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
317} 317}
318 318
319/* 319/*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
885 wake_up(&rcu_kthread_wq); 885 wake_up(&rcu_kthread_wq);
886} 886}
887 887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
888/* 901/*
889 * This kthread invokes RCU callbacks whose grace periods have 902 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the 903 * elapsed. It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
938 raise_softirq(RCU_SOFTIRQ); 951 raise_softirq(RCU_SOFTIRQ);
939} 952}
940 953
954#ifdef CONFIG_RCU_TRACE
955
956/*
957 * There is no callback kthread, so this thread is never it.
958 */
959static bool rcu_is_callbacks_kthread(void)
960{
961 return false;
962}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
941void rcu_init(void) 966void rcu_init(void)
942{ 967{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 764825c2685c..a58ac285fc69 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -56,14 +56,16 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
57static int stat_interval; /* Interval between stats, in seconds. */ 57static int stat_interval; /* Interval between stats, in seconds. */
58 /* Defaults to "only at end of test". */ 58 /* Defaults to "only at end of test". */
59static int verbose; /* Print more debug info. */ 59static bool verbose; /* Print more debug info. */
60static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 60static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
612 .name = "srcu" 632 .name = "srcu"
613}; 633};
614 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
615static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
616{ 660{
617 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
913 return 0; 957 return 0;
914} 958}
915 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
916/* 972/*
917 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
918 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
934 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
935 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
936 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
937 if (p == NULL) { 994 if (p == NULL) {
938 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
939 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
951 /* Should not happen, but... */ 1008 /* Should not happen, but... */
952 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
953 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
954 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
955 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
956 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
994 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
995 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
996 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
997 if (p == NULL) { 1057 if (p == NULL) {
998 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
999 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
1009 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1010 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1011 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1012 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1013 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1014 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
1056 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1058 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1059 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1060 rcu_torture_current, 1123 rcu_torture_current,
1061 rcu_torture_current_version, 1124 rcu_torture_current_version,
1062 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
1068 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1069 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1072 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1073 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1074 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1232 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1233 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1234 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1235 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1236 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1237 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1238 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1239 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1240 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1241} 1310}
1242 1311
1243static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1287 return 0; 1356 return 0;
1288} 1357}
1289 1358
1359/*
1360 * Cause the rcutorture test to shutdown the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int __cpuinit
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int __cpuinit
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) {
1457 onoff_task = NULL;
1458 return PTR_ERR(onoff_task);
1459 }
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
1473static void
1474rcu_torture_onoff_init(void)
1475{
1476}
1477
1478static void rcu_torture_onoff_cleanup(void)
1479{
1480}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
1290static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1291 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1292{ 1486{
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
1391 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1392 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1393 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1394 1593
1395 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1396 1595
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
1416 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1419 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1421 1620
1422 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
1607 } 1806 }
1608 } 1807 }
1609 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1610 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1611 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1612 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b76d812740c..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
71 }, \ 71 }, \
72 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
73 .gpnum = -300, \ 73 .gpnum = -300, \
74 .completed = -300, \ 74 .completed = -300, \
75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
195} 195}
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 197
198#ifdef CONFIG_NO_HZ
199DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
200 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
201 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
202}; 201};
203#endif /* #ifdef CONFIG_NO_HZ */
204 202
205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
206static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
328 return 1; 326 return 1;
329 } 327 }
330 328
331 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
332 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
333 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
334 332 * where idle loops fail to inform RCU about the CPU being idle.
335 /* The CPU is online, so send it a reschedule IPI. */ 333 */
336 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
337 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
338 else 336 else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
343 341
344#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
345 343
346#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
347 370
348/** 371/**
349 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
350 * 373 *
351 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
352 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
353 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
354 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
355 */ 382 */
356void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
357{ 384{
358 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
359 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
360 388
361 local_irq_save(flags); 389 local_irq_save(flags);
362 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
363 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
364 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
365 return; 393 rcu_idle_enter_common(rdtp, oldval);
366 }
367 trace_rcu_dyntick("Start");
368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
369 smp_mb__before_atomic_inc(); /* See above. */
370 atomic_inc(&rdtp->dynticks);
371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 local_irq_restore(flags); 394 local_irq_restore(flags);
374} 395}
375 396
376/* 397/**
377 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
378 * 403 *
379 * Exit nohz mode, in other words, -enter- the mode in which RCU 404 * This code assumes that the idle loop never does anything that might
380 * read-side critical sections normally occur. 405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
408 *
409 * Use things like work queues to work around this limitation.
410 *
411 * You have been warned.
381 */ 412 */
382void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
383{ 414{
384 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
385 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
386 418
387 local_irq_save(flags); 419 local_irq_save(flags);
388 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
389 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
390 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
391 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
392 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
393 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
394 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
396 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End"); 445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
399 local_irq_restore(flags); 519 local_irq_restore(flags);
400} 520}
401 521
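The four entry points above share one piece of bookkeeping: dynticks_nesting counts how many contexts (the task level plus nested interrupts) still keep the CPU out of idle, and the atomic dynticks counter is bumped on every genuine idle transition so that its value is even while idle and odd otherwise. The sketch below is a single-threaded userspace model of just that protocol; memory barriers, tracing, and the not-idle-task warnings are omitted, TASK_NESTING_BASE stands in for DYNTICK_TASK_NESTING, and the names are illustrative.

/* Userspace model of the idle/irq nesting protocol shown above. */
#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

#define TASK_NESTING_BASE 1      /* stands in for DYNTICK_TASK_NESTING */

static long long dynticks_nesting = TASK_NESTING_BASE; /* 0 == truly idle */
static atomic_long dynticks = 1;                        /* even == idle, odd == non-idle */

static void idle_enter_common(void)
{
    atomic_fetch_add(&dynticks, 1);              /* counter becomes even: idle */
    assert((atomic_load(&dynticks) & 0x1) == 0);
}

static void idle_exit_common(void)
{
    atomic_fetch_add(&dynticks, 1);              /* counter becomes odd: non-idle */
    assert((atomic_load(&dynticks) & 0x1) == 1);
}

static void idle_enter(void)                     /* rcu_idle_enter() */
{
    dynticks_nesting = 0;                        /* crowbar to zero */
    idle_enter_common();
}

static void idle_exit(void)                      /* rcu_idle_exit() */
{
    dynticks_nesting = TASK_NESTING_BASE;        /* crowbar back to the task level */
    idle_exit_common();
}

static void irq_enter_(void)                     /* rcu_irq_enter() */
{
    if (dynticks_nesting++ == 0)                 /* first irq on top of idle */
        idle_exit_common();
}

static void irq_exit_(void)                      /* rcu_irq_exit() */
{
    if (--dynticks_nesting == 0)                 /* last irq, back to idle */
        idle_enter_common();
}

int main(void)
{
    idle_enter();        /* CPU goes idle: dynticks even */
    irq_enter_();        /* interrupt arrives from idle: dynticks odd */
    irq_exit_();         /* interrupt done, back to idle: dynticks even */
    idle_exit();         /* idle loop exits: dynticks odd */
    printf("dynticks=%ld nesting=%lld\n",
           atomic_load(&dynticks), dynticks_nesting);
    return 0;
}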
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
442 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
443} 563}
444 564
565#ifdef CONFIG_PROVE_RCU
566
445/** 567/**
446 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
447 * 569 *
448 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
449 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * nor in an NMI handler, return true.
450 */ 572 */
451void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
452{ 574{
453 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
454} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
455 585
456/** 586/**
457 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
458 * 588 *
459 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
460 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
461 * with no ticks. 591 * disabled preemption.
462 */ 592 */
463void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
464{ 594{
465 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
466} 596}
467 597
468#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
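Given that state, the two new predicates are one-liners: rcu_is_cpu_idle() tests the low bit of the dynticks counter, and rcu_is_cpu_rrupt_from_idle() tests that at most one interrupt level sits on top of the idle loop. A self-contained restatement follows (illustrative names, no per-CPU machinery, no preempt_disable()).

/* Restatement of the two predicates added above. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long dynticks = 1;    /* even == idle, odd == non-idle        */
static long long dynticks_nesting;  /* 0 == idle task, 1 == first irq level */

static int cpu_is_idle(void)                /* rcu_is_cpu_idle()            */
{
    return (atomic_load(&dynticks) & 0x1) == 0;
}

static int rrupt_from_idle(void)            /* rcu_is_cpu_rrupt_from_idle() */
{
    return dynticks_nesting <= 1;
}

int main(void)
{
    printf("idle=%d, idle-or-first-level-irq=%d\n",
           cpu_is_idle(), rrupt_from_idle());
    return 0;
}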
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
475static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
476{ 606{
477 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
478 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
479} 609}
480 610
481/* 611/*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
512 642
513#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
514 644
515#else /* #ifdef CONFIG_NO_HZ */
516
517#ifdef CONFIG_SMP
518
519static int dyntick_save_progress_counter(struct rcu_data *rdp)
520{
521 return 0;
522}
523
524static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
525{
526 return rcu_implicit_offline_qs(rdp);
527}
528
529#endif /* #ifdef CONFIG_SMP */
530
531#endif /* #else #ifdef CONFIG_NO_HZ */
532
533int rcu_cpu_stall_suppress __read_mostly;
534
535static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
536{ 646{
537 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
866 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
867 rsp->gpnum++; 977 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
872 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
873 983
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
877 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
878 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
879 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
881 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
882 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
927 1037
928 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
929 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
930 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
931 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
932 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
933} 1043}
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
991 1101
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
994 rsp->signaled = RCU_GP_IDLE; 1104 rsp->fqs_state = RCU_GP_IDLE;
995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
996} 1106}
997 1107
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1221 else 1331 else
1222 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1223 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1224 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1225 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1226} 1336}
1227 1337
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1263 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0); 1375 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0); 1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1267 return; 1379 return;
1268 } 1380 }
1269 1381
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1291 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1292 __rcu_reclaim(rsp->name, list); 1404 __rcu_reclaim(rsp->name, list);
1293 list = next; 1405 list = next;
1294 if (++count >= bl) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1295 break; 1410 break;
1296 } 1411 }
1297 1412
1298 local_irq_save(flags); 1413 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count); 1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1300 1417
1301 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1302 rdp->qlen -= count; 1419 rdp->qlen -= count;
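The interesting change in rcu_do_batch() is the loop-exit test: hitting blimit no longer stops callback invocation by itself; the CPU must also have something better to do, meaning a reschedule is pending or it is neither the idle task nor the dedicated callbacks kthread. A stand-alone restatement of that predicate, with stubs in place of the kernel helpers:

/* Restatement of the revised batch-limit test; the three helpers are
 * stubs standing in for need_resched(), is_idle_task(current) and
 * rcu_is_callbacks_kthread(). */
#include <stdbool.h>
#include <stdio.h>

static bool need_resched_stub(void)          { return false; }
static bool is_idle_task_stub(void)          { return true;  }
static bool is_callbacks_kthread_stub(void)  { return false; }

static bool should_stop_batch(int count, int bl)
{
    /* Stop only if the limit was reached AND the CPU has other work. */
    return count >= bl &&
           (need_resched_stub() ||
            (!is_idle_task_stub() && !is_callbacks_kthread_stub()));
}

int main(void)
{
    /* An idle CPU with nothing to reschedule keeps draining callbacks
     * even after the nominal limit is reached (prints 0 == keep going). */
    printf("stop after 10 of 10? %d\n", should_stop_batch(10, 10));
    return 0;
}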
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1335 * Also schedule RCU core processing. 1452 * Also schedule RCU core processing.
1336 * 1453 *
1337 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1339 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1340 */ 1457 */
1341void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1342{ 1459{
1343 trace_rcu_utilization("Start scheduler-tick"); 1460 trace_rcu_utilization("Start scheduler-tick");
1344 if (user || 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1347 1462
1348 /* 1463 /*
1349 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1457 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1458 } 1573 }
1459 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1460 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1461 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1462 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1463 1578
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1473 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1474 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1475 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1476 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1477 break; 1592 break;
1478 1593
1479 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
1812 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1813 * 1 if so. 1928 * 1 if so.
1814 */ 1929 */
1815static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1816{ 1931{
1817 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1818 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1913 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1914 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1915 rdp->qlen = 0; 2030 rdp->qlen = 0;
1916#ifdef CONFIG_NO_HZ
1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1918#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1919 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
1920 rdp->rsp = rsp; 2035 rdp->rsp = rsp;
1921 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1942 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1943 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1944 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1945 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1946 2065
1947 /* 2066 /*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2023 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
2024 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
2025 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
2026 break; 2146 break;
2027 case CPU_DEAD: 2147 case CPU_DEAD:
2028 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 849ce9ec51fe..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
91 92
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
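The nesting counter grows to long long so the process level can carry an enormous bias ("worth LLONG_MAX/2" per the comment above): unbalanced irq_enter()/irq_exit() accounting caused by usermode upcalls can then drift the counter without ever dragging it to zero, which is what would falsely mark a busy CPU as idle. The arithmetic below only illustrates that headroom argument; the real DYNTICK_TASK_NESTING value is defined in rcupdate.h and is not quoted here.

/* Headroom illustration only; the LLONG_MAX/2 figure follows the struct
 * comment above, not the actual DYNTICK_TASK_NESTING definition. */
#include <limits.h>
#include <stdio.h>

int main(void)
{
    long long task_bias = LLONG_MAX / 2;
    long long nesting = task_bias;          /* CPU busy running a task */

    /* Even a wildly unbalanced irq count (say, a billion missed
     * irq_exit() calls) cannot drag the counter to zero, so a busy
     * CPU is never mistaken for an idle one. */
    nesting -= 1000000000LL;
    printf("nesting = %lld (still nowhere near zero)\n", nesting);
    return 0;
}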
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -302,16 +299,12 @@ struct rcu_data {
302 struct rcu_state *rsp; 299 struct rcu_state *rsp;
303}; 300};
304 301
305/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
306#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
307#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
308#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
309#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
310#ifdef CONFIG_NO_HZ
311#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
312#else /* #ifdef CONFIG_NO_HZ */
313#define RCU_SIGNAL_INIT RCU_FORCE_QS
314#endif /* #else #ifdef CONFIG_NO_HZ */
315 308
316#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
317 310
@@ -361,7 +354,7 @@ struct rcu_state {
361 354
362 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
363 356
364 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
365 /* Force QS state. */ 358 /* Force QS state. */
366 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
367 /* is running. */ 360 /* is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
451static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
452void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
453#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
454static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
455#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
456static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
457static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
463static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
464#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
465static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
466static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
473#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
474static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
475static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
476 474
477#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4b9b9f8a4184..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
312{ 312{
313 int empty; 313 int empty;
314 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
315 unsigned long flags; 316 unsigned long flags;
316 struct list_head *np; 317 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST 318#ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
382 /* 383 /*
383 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
384 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
386 */ 388 */
389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
388 trace_rcu_quiescent_state_report("preempt_rcu", 391 trace_rcu_quiescent_state_report("preempt_rcu",
389 rnp->gpnum, 392 rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
406 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
407 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
408 */ 411 */
409 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
410 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
411 } else { 414 } else {
412 local_irq_restore(flags); 415 local_irq_restore(flags);
413 } 416 }
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
729 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
730 * iteratively!) 733 * iteratively!)
731 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
732 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
733 */ 739 */
734static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
735{ 742{
736 unsigned long flags; 743 unsigned long flags;
737 unsigned long mask; 744 unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
744 } 751 }
745 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
746 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
747 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
748 break; 756 break;
749 } 757 }
750 mask = rnp->grpmask; 758 mask = rnp->grpmask;
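The new wake argument encodes a simple rule: wake the waiter blocked on sync_rcu_preempt_exp_wq when the report is made on its behalf by some other task, but skip the wakeup when the reporting task is the would-be waiter itself, as in the sync_rcu_preempt_exp_init() call site in the next hunk. A generic userspace shape of that conditional-wakeup pattern, with a condition variable standing in for the wait queue (names illustrative):

/* Generic shape of the conditional wakeup; not the kernel wait queues. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done_cv = PTHREAD_COND_INITIALIZER;
static bool done;

/* Report completion; 'wake' is false when the caller is the task that
 * would otherwise be woken, since it will observe 'done' on its own. */
static void report_done(bool wake)
{
    pthread_mutex_lock(&lock);
    done = true;
    pthread_mutex_unlock(&lock);
    if (wake)
        pthread_cond_broadcast(&done_cv);
}

static void *waiter(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!done)
        pthread_cond_wait(&done_cv, &lock);
    pthread_mutex_unlock(&lock);
    printf("waiter released\n");
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, waiter, NULL);
    report_done(true);      /* some other task reports: wake the waiter */
    pthread_join(t, NULL);
    return 0;
}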
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
777 must_wait = 1; 785 must_wait = 1;
778 } 786 }
779 if (!must_wait) 787 if (!must_wait)
780 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
781} 789}
782 790
783/* 791/*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1069 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1070 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1071 */ 1079 */
1072static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1073{ 1082{
1074 return;
1075} 1083}
1076 1084
1077#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1157 1165
1158#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1166#endif /* #else #ifdef CONFIG_RCU_TRACE */
1159 1167
1160static struct lock_class_key rcu_boost_class;
1161
1162/* 1168/*
1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1169 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1164 * or ->boost_tasks, advancing the pointer to the next task in the 1170 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
1221 */ 1227 */
1222 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1223 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1227 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1228 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1231 1234
1232 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1233} 1237}
1234 1238
1235/* 1239/*
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void)
1329} 1333}
1330 1334
1331/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1332 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1333 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1334 * kthread. 1347 * kthread.
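rcu_is_callbacks_kthread() is a plain identity test: with preemption disabled, compare this CPU's rcu_cpu_kthread_task pointer against current. A userspace restatement with a single thread and an ordinary static handle standing in for the per-CPU variable (names illustrative):

/* Identity check modeled with one thread and one static handle. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_t callbacks_kthread;  /* stands in for per-CPU rcu_cpu_kthread_task */

static bool is_callbacks_kthread(void)
{
    /* The kernel version runs with preemption disabled so the per-CPU
     * pointer and 'current' refer to the same CPU; a single thread
     * needs no such care. */
    return pthread_equal(callbacks_kthread, pthread_self());
}

int main(void)
{
    callbacks_kthread = pthread_self();   /* pretend we are that kthread */
    printf("is_callbacks_kthread: %d\n", is_callbacks_kthread());
    return 0;
}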
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1772 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1773} 1786}
1774 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1775static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1776{ 1794{
1777} 1795}
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
1907 * grace period works for us. 1925 * grace period works for us.
1908 */ 1926 */
1909 get_online_cpus(); 1927 get_online_cpus();
1910 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1911 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1912 } 1930 }
1913 1931
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1939 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1940 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1941 * 1959 *
1942 * Because we have preemptible RCU, just check whether this CPU needs 1960 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1943 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1944 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1945 */ 1962 */
1946int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1947{ 1964{
1948 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1973}
1974
1975/*
1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1977 * after it.
1978 */
1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1983 /*
 1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1988{
1949} 1989}
1950 1990
1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1952 1992
1953#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three preprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1954static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1955static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1956 2031
1957/* 2032/*
1958 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1959 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1960 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1961 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1962 * 2095 *
1963 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1964 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1965 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1966 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1967 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1968 * 2102 *
1969 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1970 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1971 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1972 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1973 */ 2109 */
1974int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1975{ 2111{
1976 int c = 0; 2112 unsigned long flags;
1977 int snap; 2113
1978 int thatcpu; 2114 local_irq_save(flags);
1979 2115
1980 /* Check for being in the holdoff period. */ 2116 /*
1981 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1982 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1983 2119 */
1984 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1985 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1986 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1987 continue; 2123 local_irq_restore(flags);
1988 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1989 thatcpu).dynticks); 2125 return;
1990 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1991 if ((snap & 0x1) != 0) { 2127
1992 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1993 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1994 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1995 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1996 } 2136 }
1997 2137
1998 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1999 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2000 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
2001 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
2002 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2003 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
2004 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2005 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
2006 } 2158 }
2007 2159
2008 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2009 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2010 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
2011 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
2012 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
2013 } 2177 }
2014 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2015 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
2016 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
2017 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
2018 } 2183 }
2019 2184
2020 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
2021 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
2022 invoke_rcu_core(); 2192 invoke_rcu_core();
2023 return c; 2193 } else {
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained");
2196 }
2024} 2197}
2025 2198
2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
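Stripped of the per-CPU plumbing, rcu_prepare_for_idle() above is a small state machine driven by two variables: a drain budget that starts at RCU_IDLE_FLUSHES and a holdoff timestamp that blocks further attempts for the rest of the current jiffy once the budget is spent. The sketch below restates only that control flow; the quiescent-state pushing, the hrtimer arming, and rcu_pending() are reduced to stubs, "now" stands in for jiffies, and the names are illustrative.

/* Control-flow sketch of the dyntick-idle entry state machine above.
 * All kernel work is reduced to stubs. */
#include <stdbool.h>
#include <stdio.h>

#define IDLE_FLUSHES     5   /* RCU_IDLE_FLUSHES     */
#define IDLE_OPT_FLUSHES 3   /* RCU_IDLE_OPT_FLUSHES */

static int drain;            /* per-CPU rcu_dyntick_drain   */
static long holdoff = -1;    /* per-CPU rcu_dyntick_holdoff */

static bool has_callbacks(void)  { return true;  }  /* rcu_cpu_has_callbacks() */
static bool work_pending(void)   { return false; }  /* rcu_pending()           */
static void push_callbacks(void) { }                /* qs + force_quiescent_state */

static void prepare_for_idle(long now)
{
    if (!has_callbacks()) {          /* nothing queued: go idle, reset state */
        holdoff = now - 1;
        drain = 0;
        return;
    }
    if (holdoff == now)              /* already gave up during this jiffy */
        return;
    if (drain <= 0) {                /* first attempt: arm the budget */
        drain = IDLE_FLUSHES;
    } else if (drain <= IDLE_OPT_FLUSHES && !work_pending()) {
        drain = 0;                   /* optional tries done, nothing urgent:  */
        holdoff = now - 1;           /* sleep with callbacks, the timer (not  */
        return;                      /* modeled here) will push them later    */
    } else if (--drain <= 0) {
        holdoff = now;               /* budget exhausted: hold off this jiffy */
        return;                      /* (the kernel also kicks the softirq)   */
    }
    push_callbacks();                /* one more step through the RCU core */
}

int main(void)
{
    for (long now = 0; now < 3; now++)
        prepare_for_idle(now);
    printf("drain=%d holdoff=%ld\n", drain, holdoff);
    return 0;
}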
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9feffa4c0695..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
67 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
69 rdp->qs_pending); 69 rdp->qs_pending);
70#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
71 seq_printf(m, " dt=%d/%d/%d df=%lu",
72 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
73 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
74 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
75 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
76#endif /* #ifdef CONFIG_NO_HZ */
77 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
78 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
79 rdp->qlen, 77 rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
143 rdp->qs_pending); 141 rdp->qs_pending);
144#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
145 seq_printf(m, ",%d,%d,%d,%lu",
146 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
147 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
148 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
149 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
150#endif /* #ifdef CONFIG_NO_HZ */
151 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
152 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
153 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
171static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
172{ 168{
173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
174#ifdef CONFIG_NO_HZ
175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
176#endif /* #ifdef CONFIG_NO_HZ */
177 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
178#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
179 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
279 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
280 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
281 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
282 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
283 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
284 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/relay.c b/kernel/relay.c
index 226fade4d727..4335e1d7ee2d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
302 */ 302 */
303static struct dentry *create_buf_file_default_callback(const char *filename, 303static struct dentry *create_buf_file_default_callback(const char *filename,
304 struct dentry *parent, 304 struct dentry *parent,
305 int mode, 305 umode_t mode,
306 struct rchan_buf *buf, 306 struct rchan_buf *buf,
307 int *is_global) 307 int *is_global)
308{ 308{
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..d508363858b3 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -66,6 +66,31 @@ done:
66 return ret; 66 return ret;
67} 67}
68 68
69int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
70 struct res_counter **limit_fail_at)
71{
72 int ret, r;
73 unsigned long flags;
74 struct res_counter *c;
75
76 r = ret = 0;
77 *limit_fail_at = NULL;
78 local_irq_save(flags);
79 for (c = counter; c != NULL; c = c->parent) {
80 spin_lock(&c->lock);
81 r = res_counter_charge_locked(c, val);
82 if (r)
83 c->usage += val;
84 spin_unlock(&c->lock);
85 if (r < 0 && ret == 0) {
86 *limit_fail_at = c;
87 ret = r;
88 }
89 }
90 local_irq_restore(flags);
91
92 return ret;
93}
69void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 94void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
70{ 95{
71 if (WARN_ON(counter->usage < val)) 96 if (WARN_ON(counter->usage < val))
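res_counter_charge_nofail() walks the hierarchy like the ordinary charge path but never backs out: every level's usage is bumped even when a limit is exceeded, and only the first over-limit level is reported through limit_fail_at. The following is a self-contained userspace model of that behaviour, not the kernel data structures, so a caller can see what the return value and the fail pointer mean.

/* Userspace model of a hierarchical "charge that cannot fail". */
#include <stdio.h>

struct counter {
    unsigned long usage, limit;
    struct counter *parent;
};

static int charge_nofail(struct counter *c, unsigned long val,
                         struct counter **limit_fail_at)
{
    int ret = 0;

    *limit_fail_at = NULL;
    for (; c != NULL; c = c->parent) {
        c->usage += val;                    /* never backed out */
        if (c->usage > c->limit && ret == 0) {
            *limit_fail_at = c;             /* remember the first offender */
            ret = -1;
        }
    }
    return ret;
}

int main(void)
{
    struct counter root  = { .usage = 0, .limit = 100, .parent = NULL  };
    struct counter child = { .usage = 0, .limit = 10,  .parent = &root };
    struct counter *fail;

    if (charge_nofail(&child, 50, &fail) < 0)
        printf("charged anyway, but over limit at counter with limit %lu\n",
               fail->limit);
    return 0;
}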
@@ -159,8 +184,7 @@ int res_counter_memparse_write_strategy(const char *buf,
159 return 0; 184 return 0;
160 } 185 }
161 186
162 /* FIXME - make memparse() take const char* args */ 187 *res = memparse(buf, &end);
163 *res = memparse((char *)buf, &end);
164 if (*end != '\0') 188 if (*end != '\0')
165 return -EINVAL; 189 return -EINVAL;
166 190
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 8eafd1bd273e..16502d3a71c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
101 101
102 printk("\n============================================\n"); 102 printk("\n============================================\n");
103 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
104 printk( "--------------------------------------------\n"); 105 printk( "--------------------------------------------\n");
105 printk("%s/%d is deadlocking current task %s/%d\n\n", 106 printk("%s/%d is deadlocking current task %s/%d\n\n",
106 task->comm, task_pid_nr(task), 107 task->comm, task_pid_nr(task),
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 3d9f31cd79e7..98ec49475460 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
9#include <linux/kthread.h> 10#include <linux/kthread.h>
10#include <linux/export.h> 11#include <linux/export.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct sys_device sysdev; 30 struct device dev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, 274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, sysdev); 282 td = container_of(dev, struct test_thread_data, dev);
283 tid = td->sysdev.id; 283 tid = td->dev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, 337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, sysdev); 345 td = container_of(dev, struct test_thread_data, dev);
346 tsk = threads[td->sysdev.id]; 346 tsk = threads[td->dev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->sysdev.id].owner); 363 mutexes[td->dev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); 368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct sysdev_class rttest_sysclass = { 371static struct bus_type rttest_subsys = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
373}; 374};
374 375
375static int init_test_thread(int id) 376static int init_test_thread(int id)
376{ 377{
377 thread_data[id].sysdev.cls = &rttest_sysclass; 378 thread_data[id].dev.bus = &rttest_subsys;
378 thread_data[id].sysdev.id = id; 379 thread_data[id].dev.id = id;
379 380
380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
381 if (IS_ERR(threads[id])) 382 if (IS_ERR(threads[id]))
382 return PTR_ERR(threads[id]); 383 return PTR_ERR(threads[id]);
383 384
384 return sysdev_register(&thread_data[id].sysdev); 385 return device_register(&thread_data[id].dev);
385} 386}
386 387
387static int init_rttest(void) 388static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
394 rt_mutex_init(&mutexes[i]); 395 rt_mutex_init(&mutexes[i]);
395 396
396 ret = sysdev_class_register(&rttest_sysclass); 397 ret = subsys_system_register(&rttest_subsys, NULL);
397 if (ret) 398 if (ret)
398 return ret; 399 return ret;
399 400
@@ -401,10 +402,10 @@ static int init_rttest(void)
401 ret = init_test_thread(i); 402 ret = init_test_thread(i);
402 if (ret) 403 if (ret)
403 break; 404 break;
404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); 405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
405 if (ret) 406 if (ret)
406 break; 407 break;
407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); 408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
408 if (ret) 409 if (ret)
409 break; 410 break;
410 } 411 }
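This completes the rtmutex-tester half of the tree-wide sysdev removal: the embedded struct sys_device becomes a plain struct device, the sysdev_class becomes a bus_type registered through subsys_system_register(), SYSDEV_ATTR turns into DEVICE_ATTR, and the show/store callbacks now take (struct device *, struct device_attribute *) so container_of() resolves through the new dev member. With .dev_name = "rttest" and ids 0..N-1, the per-thread attributes should surface as /sys/devices/system/rttest/rttest<N>/{status,command}; that path is an assumption based on how the driver core names subsystem devices, the patch itself does not spell it out. A condensed sketch of the same pattern with made-up names (my_data, my_subsys, a foo attribute); the driver-core calls are the ones the hunk uses:

/* Sketch of the sysdev -> device conversion pattern, not the tester itself. */
#include <linux/device.h>
#include <linux/init.h>
#include <linux/kernel.h>

struct my_data {
	int value;
	struct device dev;			/* was: struct sys_device sysdev */
};

static struct my_data instances[4];

static ssize_t foo_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	struct my_data *d = container_of(dev, struct my_data, dev);

	return sprintf(buf, "%d\n", d->value);
}
static DEVICE_ATTR(foo, 0600, foo_show, NULL);	/* was: SYSDEV_ATTR */

static struct bus_type my_subsys = {		/* was: struct sysdev_class */
	.name		= "mytest",
	.dev_name	= "mytest",		/* devices appear as mytest0, mytest1, ... */
};

static int __init my_init(void)
{
	int i, ret;

	ret = subsys_system_register(&my_subsys, NULL);	/* was: sysdev_class_register */
	if (ret)
		return ret;

	for (i = 0; i < 4; i++) {
		instances[i].dev.bus = &my_subsys;	/* was: .sysdev.cls = ... */
		instances[i].dev.id  = i;
		ret = device_register(&instances[i].dev);
		if (ret)
			return ret;
		ret = device_create_file(&instances[i].dev, &dev_attr_foo);
		if (ret)
			return ret;
	}
	return 0;
}
device_initcall(my_init);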
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index f9d8482dd487..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
583 582
584 for (;;) { 583 for (;;) {
585 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
602 601
603 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
604 603
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
609 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
610 605
611 schedule_rt_mutex(lock); 606 schedule_rt_mutex(lock);
612 607
613 if (was_disabled)
614 local_irq_disable();
615
616 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
617 set_current_state(state); 609 set_current_state(state);
618 } 610 }
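The rtmutex change drops the irqs_disabled()/local_irq_enable()/local_irq_disable() juggling that used to bracket the sleep in the slow-lock path; the wait loop now simply releases the wait_lock, schedules, and retakes it. Condensed post-patch shape of that loop, restating the hunk above with the lock-acquisition and signal/timeout checks elided (outside the rtmutex tester, schedule_rt_mutex() is, to the best of my knowledge, just a macro around schedule()):

	for (;;) {
		/* Try to acquire the lock: on success, break out of the loop. */

		raw_spin_unlock(&lock->wait_lock);

		debug_rt_mutex_print_deadlock(waiter);

		/* no irq-state toggling around the sleep any more */
		schedule_rt_mutex(lock);

		raw_spin_lock(&lock->wait_lock);
		set_current_state(state);
	}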
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492df..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index d6b149ccf925..df00cb09263e 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -75,129 +74,17 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h> 78#include <asm/paravirt.h>
81#endif 79#endif
82 80
83#include "sched_cpupri.h" 81#include "sched.h"
84#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
85#include "sched_autogroup.h"
86 83
87#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 85#include <trace/events/sched.h>
89 86
90/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
91 * Convert user-nice values [ -20 ... 0 ... 19 ]
92 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
93 * and back.
94 */
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
99/*
100 * 'User priority' is the nice value converted to something we
101 * can work with better when scaling various scheduler parameters,
102 * it's a [ 0 ... 39 ] range.
103 */
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108/*
109 * Helpers for converting nanosecond timing to jiffy resolution
110 */
111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
112
113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 *
119 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
120 * Timeslices get refilled after they expire.
121 */
122#define DEF_TIMESLICE (100 * HZ / 1000)
123
124/*
125 * single value that denotes runtime == period, ie unlimited time.
126 */
127#define RUNTIME_INF ((u64)~0ULL)
128
129static inline int rt_policy(int policy)
130{
131 if (policy == SCHED_FIFO || policy == SCHED_RR)
132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
141/*
142 * This is the priority-queue data structure of the RT scheduling class:
143 */
144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
146 struct list_head queue[MAX_RT_PRIO];
147};
148
149struct rt_bandwidth {
150 /* nests inside the rq lock: */
151 raw_spinlock_t rt_runtime_lock;
152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
189
190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
193}
194
195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
198}
199
200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{ 88{
202 unsigned long delta; 89 unsigned long delta;
203 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
217 } 104 }
218} 105}
219 106
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 107DEFINE_MUTEX(sched_domains_mutex);
221{ 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 raw_spin_lock(&rt_b->rt_runtime_lock);
229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
230 raw_spin_unlock(&rt_b->rt_runtime_lock);
231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
240/*
241 * sched_domains_mutex serializes calls to init_sched_domains,
242 * detach_destroy_domains and partition_sched_domains.
243 */
244static DEFINE_MUTEX(sched_domains_mutex);
245
246#ifdef CONFIG_CGROUP_SCHED
247
248#include <linux/cgroup.h>
249
250struct cfs_rq;
251
252static LIST_HEAD(task_groups);
253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
272/* task group related information */
273struct task_group {
274 struct cgroup_subsys_state css;
275
276#ifdef CONFIG_FAIR_GROUP_SCHED
277 /* schedulable entities of this group on each cpu */
278 struct sched_entity **se;
279 /* runqueue "owned" by this group on each cpu */
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
282
283 atomic_t load_weight;
284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
290 struct rt_bandwidth rt_bandwidth;
291#endif
292
293 struct rcu_head rcu;
294 struct list_head list;
295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
305};
306
307/* task_group_lock serializes the addition/removal of task groups */
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311
312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
313
314/*
315 * A weight of 0 or 1 can cause arithmetics problems.
316 * A weight of a cfs_rq is the sum of weights of which entities
317 * are queued on this cfs_rq, so a weight of a entity should not be
318 * too large, so as the shares value of a task group.
319 * (The default weight is 1024 - so there's no practical
320 * limitation from this.)
321 */
322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
324
325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
326#endif
327
328/* Default task group.
329 * Every task in system belong to this group at bootup.
330 */
331struct task_group root_task_group;
332
333#endif /* CONFIG_CGROUP_SCHED */
334
335/* CFS-related fields in a runqueue */
336struct cfs_rq {
337 struct load_weight load;
338 unsigned long nr_running, h_nr_running;
339
340 u64 exec_clock;
341 u64 min_vruntime;
342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352 /*
353 * 'curr' points to currently running entity on this cfs_rq.
354 * It is set to NULL otherwise (i.e when none are currently running).
355 */
356 struct sched_entity *curr, *next, *last, *skip;
357
358#ifdef CONFIG_SCHED_DEBUG
359 unsigned int nr_spread_over;
360#endif
361
362#ifdef CONFIG_FAIR_GROUP_SCHED
363 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
364
365 /*
366 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
367 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
368 * (like users, containers etc.)
369 *
370 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
371 * list is used during load balance.
372 */
373 int on_list;
374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg; /* group that "owns" this runqueue */
376
377#ifdef CONFIG_SMP
378 /*
379 * the part of load.weight contributed by tasks
380 */
381 unsigned long task_weight;
382
383 /*
384 * h_load = weight * f(tg)
385 *
386 * Where f(tg) is the recursive weight fraction assigned to
387 * this group.
388 */
389 unsigned long h_load;
390
391 /*
392 * Maintaining per-cpu shares distribution for group scheduling
393 *
394 * load_stamp is the last time we updated the load average
395 * load_last is the last time we updated the load average and saw load
396 * load_unacc_exec_time is currently unaccounted execution time
397 */
398 u64 load_avg;
399 u64 load_period;
400 u64 load_stamp, load_last, load_unacc_exec_time;
401
402 unsigned long load_contribution;
403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
413#endif
414};
415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
518/* Real-Time classes' related field in a runqueue: */
519struct rt_rq {
520 struct rt_prio_array active;
521 unsigned long rt_nr_running;
522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
523 struct {
524 int curr; /* highest queued rt task prio */
525#ifdef CONFIG_SMP
526 int next; /* next highest */
527#endif
528 } highest_prio;
529#endif
530#ifdef CONFIG_SMP
531 unsigned long rt_nr_migratory;
532 unsigned long rt_nr_total;
533 int overloaded;
534 struct plist_head pushable_tasks;
535#endif
536 int rt_throttled;
537 u64 rt_time;
538 u64 rt_runtime;
539 /* Nests inside the rq lock: */
540 raw_spinlock_t rt_runtime_lock;
541
542#ifdef CONFIG_RT_GROUP_SCHED
543 unsigned long rt_nr_boosted;
544
545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
548#endif
549};
550
551#ifdef CONFIG_SMP
552
553/*
554 * We add the notion of a root-domain which will be used to define per-domain
555 * variables. Each exclusive cpuset essentially defines an island domain by
556 * fully partitioning the member cpus from any other cpuset. Whenever a new
557 * exclusive cpuset is created, we also create and attach a new root-domain
558 * object.
559 *
560 */
561struct root_domain {
562 atomic_t refcount;
563 atomic_t rto_count;
564 struct rcu_head rcu;
565 cpumask_var_t span;
566 cpumask_var_t online;
567
568 /*
569 * The "RT overload" flag: it gets set if a CPU has more than
570 * one runnable RT task.
571 */
572 cpumask_var_t rto_mask;
573 struct cpupri cpupri;
574};
575
576/*
577 * By default the system creates a single root-domain with all cpus as
578 * members (mimicking the global state we have today).
579 */
580static struct root_domain def_root_domain;
581
582#endif /* CONFIG_SMP */
583
584/*
585 * This is the main, per-CPU runqueue data structure.
586 *
587 * Locking rule: those places that want to lock multiple runqueues
588 * (such as the load balancing or the thread migration code), lock
589 * acquire operations must be ordered by ascending &runqueue.
590 */
591struct rq {
592 /* runqueue lock: */
593 raw_spinlock_t lock;
594
595 /*
596 * nr_running and cpu_load should be in the same cacheline because
597 * remote CPUs use both these fields when doing load calculation.
598 */
599 unsigned long nr_running;
600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
602 unsigned long last_load_update_tick;
603#ifdef CONFIG_NO_HZ
604 u64 nohz_stamp;
605 unsigned char nohz_balance_kick;
606#endif
607 int skip_clock_update;
608
609 /* capture load from *all* tasks on this cpu: */
610 struct load_weight load;
611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
615 struct rt_rq rt;
616
617#ifdef CONFIG_FAIR_GROUP_SCHED
618 /* list of leaf cfs_rq on this cpu: */
619 struct list_head leaf_cfs_rq_list;
620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
622 struct list_head leaf_rt_rq_list;
623#endif
624
625 /*
626 * This is part of a global counter where only the total sum
627 * over all CPUs matters. A task can increase this counter on
628 * one CPU and if it got migrated afterwards it may decrease
629 * it on another CPU. Always updated under the runqueue lock:
630 */
631 unsigned long nr_uninterruptible;
632
633 struct task_struct *curr, *idle, *stop;
634 unsigned long next_balance;
635 struct mm_struct *prev_mm;
636
637 u64 clock;
638 u64 clock_task;
639
640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
643 struct root_domain *rd;
644 struct sched_domain *sd;
645
646 unsigned long cpu_power;
647
648 unsigned char idle_balance;
649 /* For active balancing */
650 int post_schedule;
651 int active_balance;
652 int push_cpu;
653 struct cpu_stop_work active_balance_work;
654 /* cpu of this runqueue: */
655 int cpu;
656 int online;
657
658 u64 rt_avg;
659 u64 age_stamp;
660 u64 idle_stamp;
661 u64 avg_idle;
662#endif
663
664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
673
674 /* calc_load related fields */
675 unsigned long calc_load_update;
676 long calc_load_active;
677
678#ifdef CONFIG_SCHED_HRTICK
679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
683 struct hrtimer hrtick_timer;
684#endif
685
686#ifdef CONFIG_SCHEDSTATS
687 /* latency stats */
688 struct sched_info rq_sched_info;
689 unsigned long long rq_cpu_time;
690 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
691
692 /* sys_sched_yield() stats */
693 unsigned int yld_count;
694
695 /* schedule() stats */
696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
699
700 /* try_to_wake_up() stats */
701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
703#endif
704
705#ifdef CONFIG_SMP
706 struct llist_head wake_list;
707#endif
708};
709
710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
711
712
713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
714
715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
724#define rcu_dereference_check_sched_domain(p) \
725 rcu_dereference_check((p), \
726 lockdep_is_held(&sched_domains_mutex))
727
728/*
729 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
730 * See detach_destroy_domains: synchronize_sched for details.
731 *
732 * The domain tree of any CPU may only be accessed from within
733 * preempt-disabled sections.
734 */
735#define for_each_domain(cpu, __sd) \
736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
742#define raw_rq() (&__raw_get_cpu_var(runqueues))
743
744#ifdef CONFIG_CGROUP_SCHED
745
746/*
747 * Return the group to which this tasks belongs.
748 *
749 * We use task_subsys_state_check() and extend the RCU verification with
750 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
751 * task it moves into the cgroup. Therefore by holding either of those locks,
752 * we pin the task to the current cgroup.
753 */
754static inline struct task_group *task_group(struct task_struct *p)
755{
756 struct task_group *tg;
757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
765}
766
767/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else /* CONFIG_CGROUP_SCHED */
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif /* CONFIG_CGROUP_SCHED */
790 109
791static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
792 111
793static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
794{ 113{
795 s64 delta; 114 s64 delta;
796 115
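The several hundred lines deleted in this hunk (the nice/priority conversion macros, rt_bandwidth and cfs_bandwidth, struct task_group, struct cfs_rq, struct rt_rq, struct root_domain, struct rq itself, the cpu_rq()/this_rq()/task_rq() accessors, task_group() and set_task_rq()) are not removed from the kernel; per the diffstat they move into the new shared header kernel/sched/sched.h (1166 lines), which fair.c, rt.c, debug.c and the rest of kernel/sched/ can now include as separate compilation units. Only definitions that must live in exactly one object stay in core.c and lose their static qualifier, such as sched_domains_mutex and the runqueues per-cpu variable above. A shape-only sketch of how that split looks for the runqueue accessors (the macro bodies are the ones deleted in this hunk; that they land in sched.h in this form is an inference from the diffstat, not shown in this excerpt):

/* kernel/sched/sched.h: visible to every object in kernel/sched/ */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)	(&per_cpu(runqueues, (cpu)))
#define this_rq()	(&__get_cpu_var(runqueues))
#define task_rq(p)	cpu_rq(task_cpu(p))
#define cpu_curr(cpu)	(cpu_rq(cpu)->curr)

/* kernel/sched/core.c: the single definition, no longer static */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);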
@@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
803} 122}
804 123
805/* 124/*
806 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
807 */
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
814/**
815 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
816 * @cpu: the processor in question.
817 *
818 * This interface allows printk to be called with the runqueue lock
819 * held and know whether or not it is OK to wake up the klogd.
820 */
821int runqueue_is_locked(int cpu)
822{
823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
824}
825
826/*
827 * Debugging: various feature bits 125 * Debugging: various feature bits
828 */ 126 */
829 127
830#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
833enum {
834#include "sched_features.h"
835};
836
837#undef SCHED_FEAT
838
839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
841 130
842const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h" 132#include "features.h"
844 0; 133 0;
845 134
846#undef SCHED_FEAT 135#undef SCHED_FEAT
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
850 #name , 139 #name ,
851 140
852static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
853#include "sched_features.h" 142#include "features.h"
854 NULL 143 NULL
855}; 144};
856 145
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
860{ 149{
861 int i; 150 int i;
862 151
863 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
864 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
870 return 0; 159 return 0;
871} 160}
872 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
873static ssize_t 192static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
893 cmp += 3; 212 cmp += 3;
894 } 213 }
895 214
896 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
897 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
898 if (neg) 217 if (neg) {
899 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
900 else 219 sched_feat_disable(i);
220 } else {
901 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
902 break; 224 break;
903 } 225 }
904 } 226 }
905 227
906 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
907 return -EINVAL; 229 return -EINVAL;
908 230
909 *ppos += cnt; 231 *ppos += cnt;
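Two things change in the feature-bit plumbing here. First, the tables generated from features.h are now walked with an explicit __SCHED_FEAT_NR bound instead of a NULL sentinel (the enum itself has moved into the shared sched.h). Second, flipping a bit in sysctl_sched_features also bumps or drops the matching entry in sched_feat_keys[], so kernels with HAVE_JUMP_LABEL can patch feature checks in and out as static branches rather than re-reading the bitmask; presumably the sched_feat() macro in sched.h consults those keys, though that side is not visible in this excerpt. The user-visible knob stays the sched_features file created by sched_init_debug() (under debugfs, if memory serves). A small stand-alone model of the NO_-prefix parsing and bounded lookup that sched_feat_write() performs; the feature list, default mask and function names are illustrative, and the jump-label half is reduced to a comment since it needs kernel machinery:

#include <stdio.h>
#include <string.h>

static const char *feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", "HRTICK" };
#define NR_FEATS (sizeof(feat_names) / sizeof(feat_names[0]))

static unsigned int features = (1u << 0) | (1u << 1);	/* HRTICK off by default */

static int feat_write(const char *cmp)
{
	size_t i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {	/* "NO_FOO" clears FOO */
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < NR_FEATS; i++) {	/* bounded walk, like __SCHED_FEAT_NR */
		if (strcmp(cmp, feat_names[i]) == 0) {
			if (neg)
				features &= ~(1u << i);
			else
				features |= 1u << i;
			/* the kernel additionally flips sched_feat_keys[i] here */
			break;
		}
	}

	if (i == NR_FEATS)
		return -1;			/* unknown name, -EINVAL in the kernel */
	return 0;
}

int main(void)
{
	feat_write("HRTICK");
	feat_write("NO_START_DEBIT");
	printf("features = %#x\n", features);	/* 0x3 -> 0x7 -> 0x5 */
	return 0;
}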
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void)
932 return 0; 254 return 0;
933} 255}
934late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
935 257#endif /* CONFIG_SCHED_DEBUG */
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
939 258
940/* 259/*
941 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
957 */ 276 */
958unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
959 278
960static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
961 280
962/* 281/*
963 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running;
965 */ 284 */
966int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
967 286
968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972
973static inline u64 global_rt_runtime(void)
974{
975 if (sysctl_sched_rt_runtime < 0)
976 return RUNTIME_INF;
977 287
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
980
981#ifndef prepare_arch_switch
982# define prepare_arch_switch(next) do { } while (0)
983#endif
984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
993static inline int task_running(struct rq *rq, struct task_struct *p)
994{
995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
998 return task_current(rq, p);
999#endif
1000}
1001
1002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1004{
1005#ifdef CONFIG_SMP
1006 /*
1007 * We can optimise this out completely for !SMP, because the
1008 * SMP rebalancing from interrupt is the only thing that cares
1009 * here.
1010 */
1011 next->on_cpu = 1;
1012#endif
1013}
1014
1015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1016{
1017#ifdef CONFIG_SMP
1018 /*
1019 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1020 * We must ensure this doesn't happen until the switch is completely
1021 * finished.
1022 */
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
1026#ifdef CONFIG_DEBUG_SPINLOCK
1027 /* this is a valid case when another task releases the spinlock */
1028 rq->lock.owner = current;
1029#endif
1030 /*
1031 * If we are tracking spinlock dependencies then we have to
1032 * fix up the runqueue lock - which gets 'carried over' from
1033 * prev into current:
1034 */
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
1037 raw_spin_unlock_irq(&rq->lock);
1038}
1039
1040#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1042{
1043#ifdef CONFIG_SMP
1044 /*
1045 * We can optimise this out completely for !SMP, because the
1046 * SMP rebalancing from interrupt is the only thing that cares
1047 * here.
1048 */
1049 next->on_cpu = 1;
1050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1052 raw_spin_unlock_irq(&rq->lock);
1053#else
1054 raw_spin_unlock(&rq->lock);
1055#endif
1056}
1057
1058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1059{
1060#ifdef CONFIG_SMP
1061 /*
1062 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1063 * We must ensure this doesn't happen until the switch is completely
1064 * finished.
1065 */
1066 smp_wmb();
1067 prev->on_cpu = 0;
1068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1074 288
1075/* 289/*
1076 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
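This hunk removes another slab of soon-to-be-shared code: global_rt_period()/global_rt_runtime(), the prepare/finish lock-switch helpers and task_current()/task_running() all disappear from core.c (presumably into sched.h with the rest, although that file is not part of this excerpt). The two sysctls that remain spell out the default real-time throttling policy: RT tasks may consume sysctl_sched_rt_runtime = 950000 us out of every sysctl_sched_rt_period = 1000000 us, i.e. 95% of each one-second period, and a negative runtime means unlimited (RUNTIME_INF). A stand-alone restatement of that arithmetic, lifted from the deleted helpers:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL
#define RUNTIME_INF	(~0ULL)			/* runtime == period: no throttling */

static int sysctl_sched_rt_period = 1000000;	/* us */
static int sysctl_sched_rt_runtime = 950000;	/* us; -1 would mean unlimited */

static uint64_t global_rt_period(void)
{
	return (uint64_t)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static uint64_t global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (uint64_t)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

int main(void)
{
	printf("RT tasks get %.1f%% of each %d us period\n",
	       100.0 * global_rt_runtime() / global_rt_period(),
	       sysctl_sched_rt_period);		/* prints 95.0% of each 1000000 us */
	return 0;
}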
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void)
1153 * rq->lock. 367 * rq->lock.
1154 */ 368 */
1155 369
1156/*
1157 * Use hrtick when:
1158 * - enabled by features
1159 * - hrtimer is actually high res
1160 */
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
1165 if (!cpu_active(cpu_of(rq)))
1166 return 0;
1167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
1170static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1171{ 371{
1172 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg)
1210 * 410 *
1211 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1212 */ 412 */
1213static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1214{ 414{
1215 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void)
1254 * 454 *
1255 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1256 */ 456 */
1257static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1258{ 458{
1259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1260 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void)
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif 506#endif
1307 507
1308static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1309{ 509{
1310 int cpu; 510 int cpu;
1311 511
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p)
1326 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1327} 527}
1328 528
1329static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1330{ 530{
1331 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags; 532 unsigned long flags;
@@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu)
1407 607
1408static inline bool got_nohz_idle_kick(void) 608static inline bool got_nohz_idle_kick(void)
1409{ 609{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; 610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
1411} 612}
1412 613
1413#else /* CONFIG_NO_HZ */ 614#else /* CONFIG_NO_HZ */
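got_nohz_idle_kick() no longer reads a dedicated nohz_balance_kick byte in the local runqueue; it tests a NOHZ_BALANCE_KICK bit through a nohz_flags(cpu) accessor instead, leaving room for additional nohz state bits per CPU. Neither the flag enum nor nohz_flags() is defined in this excerpt; presumably both live in the new shared scheduler header or the nohz code it gained. A trivial stand-alone illustration of replacing a single boolean field with one bit in a flags word (the second flag name is made up):

#include <stdio.h>

enum { NOHZ_BALANCE_KICK, NOHZ_SOME_OTHER_FLAG };	/* illustrative names */

int main(void)
{
	unsigned long nohz_flags = 0;

	nohz_flags |= 1UL << NOHZ_BALANCE_KICK;		/* kick this CPU */

	/* old code: if (rq->nohz_balance_kick); new code: test one bit of many */
	if (nohz_flags & (1UL << NOHZ_BALANCE_KICK))
		printf("balance kick pending\n");
	return 0;
}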
@@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void)
1419 620
1420#endif /* CONFIG_NO_HZ */ 621#endif /* CONFIG_NO_HZ */
1421 622
1422static u64 sched_avg_period(void) 623void sched_avg_update(struct rq *rq)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
1426
1427static void sched_avg_update(struct rq *rq)
1428{ 624{
1429 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1430 626
@@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq)
1440 } 636 }
1441} 637}
1442 638
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
1449#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1450static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1451{ 641{
1452 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1453 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1454} 644}
1455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
1459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
1463#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1464 646
1465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
1473/*
1474 * Shift right and round:
1475 */
1476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1477
1478/*
1479 * delta *= weight / lw
1480 */
1481static unsigned long
1482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
1487 /*
1488 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1489 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1490 * 2^SCHED_LOAD_RESOLUTION.
1491 */
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
1496
1497 if (!lw->inv_weight) {
1498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1501 lw->inv_weight = 1;
1502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
1504 else
1505 lw->inv_weight = WMULT_CONST / w;
1506 }
1507
1508 /*
1509 * Check whether we'd overflow the 64-bit multiplication:
1510 */
1511 if (unlikely(tmp > WMULT_CONST))
1512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1513 WMULT_SHIFT/2);
1514 else
1515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1516
1517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1518}
1519
1520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1521{
1522 lw->weight += inc;
1523 lw->inv_weight = 0;
1524}
1525
1526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1527{
1528 lw->weight -= dec;
1529 lw->inv_weight = 0;
1530}
1531
1532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
1538/*
1539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs the contribution that
1541 * each task makes to its run queue's load is weighted according to its
1542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
1549
1550/*
1551 * Nice levels are multiplicative, with a gentle 10% change for every
1552 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1553 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1554 * that remained on nice 0.
1555 *
1556 * The "10% effect" is relative and cumulative: from _any_ nice level,
1557 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1558 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1559 * If a task goes up by ~10% and another task goes down by ~10% then
1560 * the relative distance between them is ~25%.)
1561 */
1562static const int prio_to_weight[40] = {
1563 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1564 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1565 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1566 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1567 /* 0 */ 1024, 820, 655, 526, 423,
1568 /* 5 */ 335, 272, 215, 172, 137,
1569 /* 10 */ 110, 87, 70, 56, 45,
1570 /* 15 */ 36, 29, 23, 18, 15,
1571};
1572
1573/*
1574 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1575 *
1576 * In cases where the weight does not change often, we can use the
1577 * precalculated inverse to speed up arithmetics by turning divisions
1578 * into multiplications:
1579 */
1580static const u32 prio_to_wmult[40] = {
1581 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1582 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1583 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1584 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1585 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1586 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1587 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1588 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1589};
1590
1591/* Time spent by the tasks of the cpu accounting group executing in ... */
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER, /* ... user mode */
1594 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
1599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
1603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
1607#endif
1608
1609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623/* 649/*
1624 * Iterate task_group tree rooted at *from, calling @down when first entering a 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1625 * node and @up when leaving it for the final time. 651 * node and @up when leaving it for the final time.
1626 * 652 *
1627 * Caller must hold rcu_lock or sufficient equivalent. 653 * Caller must hold rcu_lock or sufficient equivalent.
1628 */ 654 */
1629static int walk_tg_tree_from(struct task_group *from, 655int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data) 656 tg_visitor down, tg_visitor up, void *data)
1631{ 657{
1632 struct task_group *parent, *child; 658 struct task_group *parent, *child;
@@ -1657,270 +683,13 @@ out:
1657 return ret; 683 return ret;
1658} 684}
1659 685
1660/* 686int tg_nop(struct task_group *tg, void *data)
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1672static int tg_nop(struct task_group *tg, void *data)
1673{ 687{
1674 return 0; 688 return 0;
1675} 689}
1676#endif 690#endif
1677 691
1678#ifdef CONFIG_SMP 692void update_cpu_load(struct rq *this_rq);
1679/* Used instead of source_load when we know the type == 0 */
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685/*
1686 * Return a low guess at the load of a migration-source cpu weighted
1687 * according to the scheduling class and "nice" value.
1688 *
1689 * We want to under-estimate the load of migration sources, to
1690 * balance conservatively.
1691 */
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703/*
1704 * Return a high guess at the load of a migration-target cpu weighted
1705 * according to the scheduling class and "nice" value.
1706 */
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
1717
1718static unsigned long power_of(int cpu)
1719{
1720 return cpu_rq(cpu)->cpu_power;
1721}
1722
1723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1724
1725static unsigned long cpu_avg_load_per_task(int cpu)
1726{
1727 struct rq *rq = cpu_rq(cpu);
1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1729
1730 if (nr_running)
1731 return rq->load.weight / nr_running;
1732
1733 return 0;
1734}
1735
1736#ifdef CONFIG_PREEMPT
1737
1738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
1740/*
1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput.
1747 */
1748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
1753 raw_spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760/*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function.
1766 */
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
1774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) {
1776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
1780 ret = 1;
1781 } else
1782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
1784 }
1785 return ret;
1786}
1787
1788#endif /* CONFIG_PREEMPT */
1789
1790/*
1791 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1792 */
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796 /* printk() doesn't work good under rq->lock */
1797 raw_spin_unlock(&this_rq->lock);
1798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
1804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
1807 raw_spin_unlock(&busiest->lock);
1808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
1810
1811/*
1812 * double_rq_lock - safely lock two runqueues
1813 *
1814 * Note this does not disable interrupts like task_rq_lock,
1815 * you need to do so manually before calling.
1816 */
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock); /* Fake it out ;) */
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
1834}
1835
1836/*
1837 * double_rq_unlock - safely unlock two runqueues
1838 *
1839 * Note this does not restore interrupts like task_rq_unlock,
1840 * you need to do so manually after calling.
1841 */
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
1853#else /* CONFIG_SMP */
1854
1855/*
1856 * double_rq_lock - safely lock two runqueues
1857 *
1858 * Note this does not disable interrupts like task_rq_lock,
1859 * you need to do so manually before calling.
1860 */
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock); /* Fake it out ;) */
1869}
1870
1871/*
1872 * double_rq_unlock - safely unlock two runqueues
1873 *
1874 * Note this does not restore interrupts like task_rq_unlock,
1875 * you need to do so manually after calling.
1876 */
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
1886#endif
1887
1888static void calc_load_account_idle(struct rq *this_rq);
1889static void update_sysctl(void);
1890static int get_update_sysctl_factor(void);
1891static void update_cpu_load(struct rq *this_rq);
1892
1893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897 /*
1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1899 * successfully executed on another CPU. We must ensure that updates of
1900 * per-task data have been completed by this moment.
1901 */
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
1906
1907static const struct sched_class rt_sched_class;
1908
1909#define sched_class_highest (&stop_sched_class)
1910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
1912
1913#include "sched_stats.h"
1914
1915static void inc_nr_running(struct rq *rq)
1916{
1917 rq->nr_running++;
1918}
1919
1920static void dec_nr_running(struct rq *rq)
1921{
1922 rq->nr_running--;
1923}
1924 693
1925static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1926{ 695{
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1957/* 726/*
1958 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1959 */ 728 */
1960static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1961{ 730{
1962 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1968/* 737/*
1969 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1970 */ 739 */
1971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1972{ 741{
1973 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2159#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2160static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2161{ 930{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2163 unsigned long flags; 932 unsigned long flags;
2164 u64 latest_ns; 933 u64 latest_ns;
2165 int ret = 0; 934 int ret = 0;
2166 935
2167 local_irq_save(flags); 936 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2170 ret = 1; 939 ret = 1;
2171 local_irq_restore(flags); 940 local_irq_restore(flags);
2172 return ret; 941 return ret;
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void)
2174 943
2175static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2176{ 945{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2178 unsigned long flags; 947 unsigned long flags;
2179 u64 latest_ns; 948 u64 latest_ns;
2180 int ret = 0; 949 int ret = 0;
2181 950
2182 local_irq_save(flags); 951 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2185 ret = 1; 954 ret = 1;
2186 local_irq_restore(flags); 955 local_irq_restore(flags);
2187 return ret; 956 return ret;
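
/*
 * Aside on the two helpers above: each compares the raw per-cpu irq time,
 * converted from nanoseconds, with what has already been folded into the
 * CPUTIME_IRQ / CPUTIME_SOFTIRQ buckets; a non-zero return tells
 * irqtime_account_process_tick() to charge the current tick as hardirq or
 * softirq time instead of charging it to the current task.
 */
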
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void)
2193 962
2194#endif 963#endif
2195 964
2196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
2199#include "sched_autogroup.c"
2200#include "sched_stoptask.c"
2201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
2205void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{ 966{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2299 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2300} 1060}
2301 1061
2302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{ 1063{
2304 const struct sched_class *class; 1064 const struct sched_class *class;
2305 1065
@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2325} 1085}
2326 1086
2327#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2328/*
2329 * Is this task likely cache-hot:
2330 */
2331static int
2332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
2336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
2339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
2342 /*
2343 * Buddy candidates are cache hot:
2344 */
2345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
2348 return 1;
2349
2350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
2355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
2359
2360void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2361{ 1089{
2362#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -2783,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2783 1511
2784} 1512}
2785#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1516{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518}
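
/*
 * Aside: with this helper, ttwu_queue() below only takes the remote-IPI path
 * when the waking CPU and the target CPU do not share a last-level cache;
 * for same-LLC wakeups, grabbing the remote rq->lock directly is cheap
 * enough that queueing the wakeup remotely is not worth it.
 */
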
2786#endif /* CONFIG_SMP */ 1519#endif /* CONFIG_SMP */
2787 1520
2788static void ttwu_queue(struct task_struct *p, int cpu) 1521static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2790,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2790 struct rq *rq = cpu_rq(cpu); 1523 struct rq *rq = cpu_rq(cpu);
2791 1524
2792#if defined(CONFIG_SMP) 1525#if defined(CONFIG_SMP)
2793 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2794 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2795 ttwu_queue_remote(p, cpu); 1528 ttwu_queue_remote(p, cpu);
2796 return; 1529 return;
@@ -3204,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3204 local_irq_enable(); 1937 local_irq_enable();
3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3206 finish_lock_switch(rq, prev); 1939 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
3207 1941
3208 fire_sched_in_preempt_notifiers(current); 1942 fire_sched_in_preempt_notifiers(current);
3209 if (mm) 1943 if (mm)
@@ -3439,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3439 */ 2173 */
3440static atomic_long_t calc_load_tasks_idle; 2174static atomic_long_t calc_load_tasks_idle;
3441 2175
3442static void calc_load_account_idle(struct rq *this_rq) 2176void calc_load_account_idle(struct rq *this_rq)
3443{ 2177{
3444 long delta; 2178 long delta;
3445 2179
@@ -3583,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks)
3583 */ 2317 */
3584} 2318}
3585#else 2319#else
3586static void calc_load_account_idle(struct rq *this_rq) 2320void calc_load_account_idle(struct rq *this_rq)
3587{ 2321{
3588} 2322}
3589 2323
@@ -3726,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3726 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2460 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3727 * every tick. We fix it up based on jiffies. 2461 * every tick. We fix it up based on jiffies.
3728 */ 2462 */
3729static void update_cpu_load(struct rq *this_rq) 2463void update_cpu_load(struct rq *this_rq)
3730{ 2464{
3731 unsigned long this_load = this_rq->load.weight; 2465 unsigned long this_load = this_rq->load.weight;
3732 unsigned long curr_jiffies = jiffies; 2466 unsigned long curr_jiffies = jiffies;
@@ -3804,8 +2538,10 @@ unlock:
3804#endif 2538#endif
3805 2539
3806DEFINE_PER_CPU(struct kernel_stat, kstat); 2540DEFINE_PER_CPU(struct kernel_stat, kstat);
2541DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3807 2542
3808EXPORT_PER_CPU_SYMBOL(kstat); 2543EXPORT_PER_CPU_SYMBOL(kstat);
2544EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3809 2545
3810/* 2546/*
3811 * Return any ns on the sched_clock that have not yet been accounted in 2547 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3858,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3858 return ns; 2594 return ns;
3859} 2595}
3860 2596
2597#ifdef CONFIG_CGROUP_CPUACCT
2598struct cgroup_subsys cpuacct_subsys;
2599struct cpuacct root_cpuacct;
2600#endif
2601
2602static inline void task_group_account_field(struct task_struct *p, int index,
2603 u64 tmp)
2604{
2605#ifdef CONFIG_CGROUP_CPUACCT
2606 struct kernel_cpustat *kcpustat;
2607 struct cpuacct *ca;
2608#endif
2609 /*
 2610 * Since all updates are sure to touch the root cgroup, we go
 2611 * ahead and touch it first. If the root cgroup is the only
 2612 * cgroup, then nothing else should be necessary.
2613 *
2614 */
2615 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2616
2617#ifdef CONFIG_CGROUP_CPUACCT
2618 if (unlikely(!cpuacct_subsys.active))
2619 return;
2620
2621 rcu_read_lock();
2622 ca = task_ca(p);
2623 while (ca && (ca != &root_cpuacct)) {
2624 kcpustat = this_cpu_ptr(ca->cpustat);
2625 kcpustat->cpustat[index] += tmp;
2626 ca = parent_ca(ca);
2627 }
2628 rcu_read_unlock();
2629#endif
2630}
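
/*
 * Worked example: for a task in cgroup /A/B, one call to
 * task_group_account_field(p, CPUTIME_USER, tmp) charges this CPU's root
 * kernel_cpustat unconditionally, then B's and A's per-cpu kernel_cpustat
 * via the task_ca()/parent_ca() walk; the walk stops at root_cpuacct because
 * the root's cpustat is the very kernel_cpustat already updated above.
 */
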
2631
2632
3861/* 2633/*
3862 * Account user cpu time to a process. 2634 * Account user cpu time to a process.
3863 * @p: the process that the cpu time gets accounted to 2635 * @p: the process that the cpu time gets accounted to
@@ -3867,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3867void account_user_time(struct task_struct *p, cputime_t cputime, 2639void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled) 2640 cputime_t cputime_scaled)
3869{ 2641{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2642 int index;
3871 cputime64_t tmp;
3872 2643
3873 /* Add user time to process. */ 2644 /* Add user time to process. */
3874 p->utime = cputime_add(p->utime, cputime); 2645 p->utime += cputime;
3875 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2646 p->utimescaled += cputime_scaled;
3876 account_group_user_time(p, cputime); 2647 account_group_user_time(p, cputime);
3877 2648
2649 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2650
3878 /* Add user time to cpustat. */ 2651 /* Add user time to cpustat. */
3879 tmp = cputime_to_cputime64(cputime); 2652 task_group_account_field(p, index, (__force u64) cputime);
3880 if (TASK_NICE(p) > 0)
3881 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3882 else
3883 cpustat->user = cputime64_add(cpustat->user, tmp);
3884 2653
3885 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3886 /* Account for user time used */ 2654 /* Account for user time used */
3887 acct_update_integrals(p); 2655 acct_update_integrals(p);
3888} 2656}
@@ -3896,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3896static void account_guest_time(struct task_struct *p, cputime_t cputime, 2664static void account_guest_time(struct task_struct *p, cputime_t cputime,
3897 cputime_t cputime_scaled) 2665 cputime_t cputime_scaled)
3898{ 2666{
3899 cputime64_t tmp; 2667 u64 *cpustat = kcpustat_this_cpu->cpustat;
3900 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3901
3902 tmp = cputime_to_cputime64(cputime);
3903 2668
3904 /* Add guest time to process. */ 2669 /* Add guest time to process. */
3905 p->utime = cputime_add(p->utime, cputime); 2670 p->utime += cputime;
3906 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2671 p->utimescaled += cputime_scaled;
3907 account_group_user_time(p, cputime); 2672 account_group_user_time(p, cputime);
3908 p->gtime = cputime_add(p->gtime, cputime); 2673 p->gtime += cputime;
3909 2674
3910 /* Add guest time to cpustat. */ 2675 /* Add guest time to cpustat. */
3911 if (TASK_NICE(p) > 0) { 2676 if (TASK_NICE(p) > 0) {
3912 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2677 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3913 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2678 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3914 } else { 2679 } else {
3915 cpustat->user = cputime64_add(cpustat->user, tmp); 2680 cpustat[CPUTIME_USER] += (__force u64) cputime;
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2681 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3917 } 2682 }
3918} 2683}
3919 2684
@@ -3926,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3926 */ 2691 */
3927static inline 2692static inline
3928void __account_system_time(struct task_struct *p, cputime_t cputime, 2693void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2694 cputime_t cputime_scaled, int index)
3930{ 2695{
3931 cputime64_t tmp = cputime_to_cputime64(cputime);
3932
3933 /* Add system time to process. */ 2696 /* Add system time to process. */
3934 p->stime = cputime_add(p->stime, cputime); 2697 p->stime += cputime;
3935 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2698 p->stimescaled += cputime_scaled;
3936 account_group_system_time(p, cputime); 2699 account_group_system_time(p, cputime);
3937 2700
3938 /* Add system time to cpustat. */ 2701 /* Add system time to cpustat. */
3939 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2702 task_group_account_field(p, index, (__force u64) cputime);
3940 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3941 2703
3942 /* Account for system time used */ 2704 /* Account for system time used */
3943 acct_update_integrals(p); 2705 acct_update_integrals(p);
@@ -3953,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3953void account_system_time(struct task_struct *p, int hardirq_offset, 2715void account_system_time(struct task_struct *p, int hardirq_offset,
3954 cputime_t cputime, cputime_t cputime_scaled) 2716 cputime_t cputime, cputime_t cputime_scaled)
3955{ 2717{
3956 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2718 int index;
3957 cputime64_t *target_cputime64;
3958 2719
3959 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2720 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3960 account_guest_time(p, cputime, cputime_scaled); 2721 account_guest_time(p, cputime, cputime_scaled);
@@ -3962,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3962 } 2723 }
3963 2724
3964 if (hardirq_count() - hardirq_offset) 2725 if (hardirq_count() - hardirq_offset)
3965 target_cputime64 = &cpustat->irq; 2726 index = CPUTIME_IRQ;
3966 else if (in_serving_softirq()) 2727 else if (in_serving_softirq())
3967 target_cputime64 = &cpustat->softirq; 2728 index = CPUTIME_SOFTIRQ;
3968 else 2729 else
3969 target_cputime64 = &cpustat->system; 2730 index = CPUTIME_SYSTEM;
3970 2731
3971 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2732 __account_system_time(p, cputime, cputime_scaled, index);
3972} 2733}
3973 2734
3974/* 2735/*
@@ -3977,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3977 */ 2738 */
3978void account_steal_time(cputime_t cputime) 2739void account_steal_time(cputime_t cputime)
3979{ 2740{
3980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2741 u64 *cpustat = kcpustat_this_cpu->cpustat;
3981 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3982 2742
3983 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2743 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3984} 2744}
3985 2745
3986/* 2746/*
@@ -3989,14 +2749,13 @@ void account_steal_time(cputime_t cputime)
3989 */ 2749 */
3990void account_idle_time(cputime_t cputime) 2750void account_idle_time(cputime_t cputime)
3991{ 2751{
3992 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2752 u64 *cpustat = kcpustat_this_cpu->cpustat;
3993 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3994 struct rq *rq = this_rq(); 2753 struct rq *rq = this_rq();
3995 2754
3996 if (atomic_read(&rq->nr_iowait) > 0) 2755 if (atomic_read(&rq->nr_iowait) > 0)
3997 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2756 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3998 else 2757 else
3999 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2758 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
4000} 2759}
4001 2760
4002static __always_inline bool steal_account_process_tick(void) 2761static __always_inline bool steal_account_process_tick(void)
@@ -4046,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4046 struct rq *rq) 2805 struct rq *rq)
4047{ 2806{
4048 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2807 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4049 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2808 u64 *cpustat = kcpustat_this_cpu->cpustat;
4050 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4051 2809
4052 if (steal_account_process_tick()) 2810 if (steal_account_process_tick())
4053 return; 2811 return;
4054 2812
4055 if (irqtime_account_hi_update()) { 2813 if (irqtime_account_hi_update()) {
4056 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2814 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
4057 } else if (irqtime_account_si_update()) { 2815 } else if (irqtime_account_si_update()) {
4058 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2816 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
4059 } else if (this_cpu_ksoftirqd() == p) { 2817 } else if (this_cpu_ksoftirqd() == p) {
4060 /* 2818 /*
 4061 * ksoftirqd time does not get accounted in cpu_softirq_time. 2819
@@ -4063,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4063 * Also, p->stime needs to be updated for ksoftirqd. 2821 * Also, p->stime needs to be updated for ksoftirqd.
4064 */ 2822 */
4065 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2823 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4066 &cpustat->softirq); 2824 CPUTIME_SOFTIRQ);
4067 } else if (user_tick) { 2825 } else if (user_tick) {
4068 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2826 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4069 } else if (p == rq->idle) { 2827 } else if (p == rq->idle) {
@@ -4072,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4072 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2830 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4073 } else { 2831 } else {
4074 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2832 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4075 &cpustat->system); 2833 CPUTIME_SYSTEM);
4076 } 2834 }
4077} 2835}
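
/*
 * Aside on the ladder above: each tick is charged to exactly one bucket,
 * tested in order: stolen time, hardirq, softirq, ksoftirqd (accounted as
 * CPUTIME_SOFTIRQ system time), user time, idle, guest, and finally plain
 * CPUTIME_SYSTEM.
 */
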
4078 2836
@@ -4171,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4171 2929
4172void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2930void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4173{ 2931{
4174 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2932 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4175 2933
4176 /* 2934 /*
4177 * Use CFS's precise accounting: 2935 * Use CFS's precise accounting:
@@ -4179,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4179 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2937 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4180 2938
4181 if (total) { 2939 if (total) {
4182 u64 temp = rtime; 2940 u64 temp = (__force u64) rtime;
4183 2941
4184 temp *= utime; 2942 temp *= (__force u64) utime;
4185 do_div(temp, total); 2943 do_div(temp, (__force u32) total);
4186 utime = (cputime_t)temp; 2944 utime = (__force cputime_t) temp;
4187 } else 2945 } else
4188 utime = rtime; 2946 utime = rtime;
4189 2947
@@ -4191,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4191 * Compare with previous values, to keep monotonicity: 2949 * Compare with previous values, to keep monotonicity:
4192 */ 2950 */
4193 p->prev_utime = max(p->prev_utime, utime); 2951 p->prev_utime = max(p->prev_utime, utime);
4194 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2952 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4195 2953
4196 *ut = p->prev_utime; 2954 *ut = p->prev_utime;
4197 *st = p->prev_stime; 2955 *st = p->prev_stime;
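
/*
 * Worked example (illustrative values): with utime = 60 and stime = 40
 * (total = 100 cputime units) and rtime = 120 from CFS, the scaled utime is
 * 120 * 60 / 100 = 72, and prev_stime becomes max(prev_stime, 120 - 72) = 48.
 * The max() against the previously reported values is what keeps both
 * figures monotonic even when the utime/stime ratio fluctuates between
 * samples.
 */
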
@@ -4208,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4208 2966
4209 thread_group_cputime(p, &cputime); 2967 thread_group_cputime(p, &cputime);
4210 2968
4211 total = cputime_add(cputime.utime, cputime.stime); 2969 total = cputime.utime + cputime.stime;
4212 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2970 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4213 2971
4214 if (total) { 2972 if (total) {
4215 u64 temp = rtime; 2973 u64 temp = (__force u64) rtime;
4216 2974
4217 temp *= cputime.utime; 2975 temp *= (__force u64) cputime.utime;
4218 do_div(temp, total); 2976 do_div(temp, (__force u32) total);
4219 utime = (cputime_t)temp; 2977 utime = (__force cputime_t) temp;
4220 } else 2978 } else
4221 utime = rtime; 2979 utime = rtime;
4222 2980
4223 sig->prev_utime = max(sig->prev_utime, utime); 2981 sig->prev_utime = max(sig->prev_utime, utime);
4224 sig->prev_stime = max(sig->prev_stime, 2982 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4225 cputime_sub(rtime, sig->prev_utime));
4226 2983
4227 *ut = sig->prev_utime; 2984 *ut = sig->prev_utime;
4228 *st = sig->prev_stime; 2985 *st = sig->prev_stime;
@@ -4321,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4321{ 3078{
4322 struct pt_regs *regs = get_irq_regs(); 3079 struct pt_regs *regs = get_irq_regs();
4323 3080
3081 if (oops_in_progress)
3082 return;
3083
4324 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3084 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4325 prev->comm, prev->pid, preempt_count()); 3085 prev->comm, prev->pid, preempt_count());
4326 3086
@@ -5570,7 +4330,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5570 goto out_free_cpus_allowed; 4330 goto out_free_cpus_allowed;
5571 } 4331 }
5572 retval = -EPERM; 4332 retval = -EPERM;
5573 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 4333 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
5574 goto out_unlock; 4334 goto out_unlock;
5575 4335
5576 retval = security_task_setscheduler(p); 4336 retval = security_task_setscheduler(p);
@@ -5852,6 +4612,13 @@ again:
5852 */ 4612 */
5853 if (preempt && rq != p_rq) 4613 if (preempt && rq != p_rq)
5854 resched_task(p_rq->curr); 4614 resched_task(p_rq->curr);
4615 } else {
4616 /*
 4617 * We might have set it in task_yield_fair(), but we are
 4618 * not going to schedule(), so we don't want to skip
 4619 * the next update.
4620 */
4621 rq->skip_clock_update = 0;
5855 } 4622 }
5856 4623
5857out: 4624out:
@@ -6019,7 +4786,7 @@ void sched_show_task(struct task_struct *p)
6019 free = stack_not_used(p); 4786 free = stack_not_used(p);
6020#endif 4787#endif
6021 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4788 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6022 task_pid_nr(p), task_pid_nr(p->real_parent), 4789 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
6023 (unsigned long)task_thread_info(p)->flags); 4790 (unsigned long)task_thread_info(p)->flags);
6024 4791
6025 show_stack(p, NULL); 4792 show_stack(p, NULL);
@@ -6118,53 +4885,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6118#endif 4885#endif
6119} 4886}
6120 4887
6121/*
6122 * Increase the granularity value when there are more CPUs,
6123 * because with more CPUs the 'effective latency' as visible
6124 * to users decreases. But the relationship is not linear,
6125 * so pick a second-best guess by going with the log2 of the
6126 * number of CPUs.
6127 *
6128 * This idea comes from the SD scheduler of Con Kolivas:
6129 */
6130static int get_update_sysctl_factor(void)
6131{
6132 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6133 unsigned int factor;
6134
6135 switch (sysctl_sched_tunable_scaling) {
6136 case SCHED_TUNABLESCALING_NONE:
6137 factor = 1;
6138 break;
6139 case SCHED_TUNABLESCALING_LINEAR:
6140 factor = cpus;
6141 break;
6142 case SCHED_TUNABLESCALING_LOG:
6143 default:
6144 factor = 1 + ilog2(cpus);
6145 break;
6146 }
6147
6148 return factor;
6149}
6150
6151static void update_sysctl(void)
6152{
6153 unsigned int factor = get_update_sysctl_factor();
6154
6155#define SET_SYSCTL(name) \
6156 (sysctl_##name = (factor) * normalized_sysctl_##name)
6157 SET_SYSCTL(sched_min_granularity);
6158 SET_SYSCTL(sched_latency);
6159 SET_SYSCTL(sched_wakeup_granularity);
6160#undef SET_SYSCTL
6161}
6162
6163static inline void sched_init_granularity(void)
6164{
6165 update_sysctl();
6166}
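
/*
 * Worked example for the tunable scaling removed from this file: with 8 or
 * more online CPUs and the default SCHED_TUNABLESCALING_LOG policy,
 * factor = 1 + ilog2(8) = 4, so sysctl_sched_latency becomes
 * 4 * normalized_sysctl_sched_latency (likewise for min_granularity and
 * wakeup_granularity).
 */
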
6167
6168#ifdef CONFIG_SMP 4888#ifdef CONFIG_SMP
6169void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4889void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6170{ 4890{
@@ -6351,30 +5071,6 @@ static void calc_global_load_remove(struct rq *rq)
6351 rq->calc_load_active = 0; 5071 rq->calc_load_active = 0;
6352} 5072}
6353 5073
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6378/* 5074/*
6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5075 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6380 * try_to_wake_up()->select_task_rq(). 5076 * try_to_wake_up()->select_task_rq().
@@ -6480,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
6480static void 5176static void
6481set_table_entry(struct ctl_table *entry, 5177set_table_entry(struct ctl_table *entry,
6482 const char *procname, void *data, int maxlen, 5178 const char *procname, void *data, int maxlen,
6483 mode_t mode, proc_handler *proc_handler) 5179 umode_t mode, proc_handler *proc_handler)
6484{ 5180{
6485 entry->procname = procname; 5181 entry->procname = procname;
6486 entry->data = data; 5182 entry->data = data;
@@ -6980,6 +5676,12 @@ out:
6980 return -ENOMEM; 5676 return -ENOMEM;
6981} 5677}
6982 5678
5679/*
5680 * By default the system creates a single root-domain with all cpus as
5681 * members (mimicking the global state we have today).
5682 */
5683struct root_domain def_root_domain;
5684
6983static void init_defrootdomain(void) 5685static void init_defrootdomain(void)
6984{ 5686{
6985 init_rootdomain(&def_root_domain); 5687 init_rootdomain(&def_root_domain);
@@ -7051,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7051} 5753}
7052 5754
7053/* 5755/*
 5756 * Keep a special pointer to the highest sched_domain that has
 5757 * SD_SHARE_PKG_RESOURCES set (the Last Level Cache domain); this
 5758 * allows us to avoid some pointer chasing in select_idle_sibling().
 5759 *
 5760 * Also keep a unique ID per domain (we use the first cpu number in
 5761 * the cpumask of the domain); this allows us to quickly tell if
 5762 * two cpus are in the same cache domain, see ttwu_share_cache().
5763 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id);
5766
5767static void update_top_cache_domain(int cpu)
5768{
5769 struct sched_domain *sd;
5770 int id = cpu;
5771
5772 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5773 if (sd)
5774 id = cpumask_first(sched_domain_span(sd));
5775
5776 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5777 per_cpu(sd_llc_id, cpu) = id;
5778}
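
/*
 * Illustration, assuming a hypothetical two-socket box where CPUs 0-3 share
 * one last-level cache and CPUs 4-7 another: update_top_cache_domain()
 * stores sd_llc_id = 0 on CPUs 0-3 and sd_llc_id = 4 on CPUs 4-7, so
 * ttwu_share_cache(1, 3) is true while ttwu_share_cache(1, 5) is false, a
 * single per-cpu compare instead of walking the sched_domain hierarchy.
 */
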
5779
5780/*
7054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5781 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7055 * hold the hotplug lock. 5782 * hold the hotplug lock.
7056 */ 5783 */
@@ -7089,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7089 tmp = rq->sd; 5816 tmp = rq->sd;
7090 rcu_assign_pointer(rq->sd, sd); 5817 rcu_assign_pointer(rq->sd, sd);
7091 destroy_sched_domains(tmp, cpu); 5818 destroy_sched_domains(tmp, cpu);
5819
5820 update_top_cache_domain(cpu);
7092} 5821}
7093 5822
7094/* cpus with isolated domains */ 5823/* cpus with isolated domains */
@@ -7248,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7248 continue; 5977 continue;
7249 5978
7250 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5979 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7251 GFP_KERNEL, cpu_to_node(i)); 5980 GFP_KERNEL, cpu_to_node(cpu));
7252 5981
7253 if (!sg) 5982 if (!sg)
7254 goto fail; 5983 goto fail;
@@ -7386,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7386 return; 6115 return;
7387 6116
7388 update_group_power(sd, cpu); 6117 update_group_power(sd, cpu);
6118 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6119}
6120
6121int __weak arch_sd_sibling_asym_packing(void)
6122{
6123 return 0*SD_ASYM_PACKING;
7389} 6124}
7390 6125
7391/* 6126/*
@@ -7940,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7940} 6675}
7941 6676
7942#ifdef CONFIG_SCHED_MC 6677#ifdef CONFIG_SCHED_MC
7943static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 6678static ssize_t sched_mc_power_savings_show(struct device *dev,
7944 struct sysdev_class_attribute *attr, 6679 struct device_attribute *attr,
7945 char *page) 6680 char *buf)
7946{ 6681{
7947 return sprintf(page, "%u\n", sched_mc_power_savings); 6682 return sprintf(buf, "%u\n", sched_mc_power_savings);
7948} 6683}
7949static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 6684static ssize_t sched_mc_power_savings_store(struct device *dev,
7950 struct sysdev_class_attribute *attr, 6685 struct device_attribute *attr,
7951 const char *buf, size_t count) 6686 const char *buf, size_t count)
7952{ 6687{
7953 return sched_power_savings_store(buf, count, 0); 6688 return sched_power_savings_store(buf, count, 0);
7954} 6689}
7955static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 6690static DEVICE_ATTR(sched_mc_power_savings, 0644,
7956 sched_mc_power_savings_show, 6691 sched_mc_power_savings_show,
7957 sched_mc_power_savings_store); 6692 sched_mc_power_savings_store);
7958#endif 6693#endif
7959 6694
7960#ifdef CONFIG_SCHED_SMT 6695#ifdef CONFIG_SCHED_SMT
7961static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 6696static ssize_t sched_smt_power_savings_show(struct device *dev,
7962 struct sysdev_class_attribute *attr, 6697 struct device_attribute *attr,
7963 char *page) 6698 char *buf)
7964{ 6699{
7965 return sprintf(page, "%u\n", sched_smt_power_savings); 6700 return sprintf(buf, "%u\n", sched_smt_power_savings);
7966} 6701}
7967static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 6702static ssize_t sched_smt_power_savings_store(struct device *dev,
7968 struct sysdev_class_attribute *attr, 6703 struct device_attribute *attr,
7969 const char *buf, size_t count) 6704 const char *buf, size_t count)
7970{ 6705{
7971 return sched_power_savings_store(buf, count, 1); 6706 return sched_power_savings_store(buf, count, 1);
7972} 6707}
7973static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 6708static DEVICE_ATTR(sched_smt_power_savings, 0644,
7974 sched_smt_power_savings_show, 6709 sched_smt_power_savings_show,
7975 sched_smt_power_savings_store); 6710 sched_smt_power_savings_store);
7976#endif 6711#endif
7977 6712
7978int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6713int __init sched_create_sysfs_power_savings_entries(struct device *dev)
7979{ 6714{
7980 int err = 0; 6715 int err = 0;
7981 6716
7982#ifdef CONFIG_SCHED_SMT 6717#ifdef CONFIG_SCHED_SMT
7983 if (smt_capable()) 6718 if (smt_capable())
7984 err = sysfs_create_file(&cls->kset.kobj, 6719 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
7985 &attr_sched_smt_power_savings.attr);
7986#endif 6720#endif
7987#ifdef CONFIG_SCHED_MC 6721#ifdef CONFIG_SCHED_MC
7988 if (!err && mc_capable()) 6722 if (!err && mc_capable())
7989 err = sysfs_create_file(&cls->kset.kobj, 6723 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
7990 &attr_sched_mc_power_savings.attr);
7991#endif 6724#endif
7992 return err; 6725 return err;
7993} 6726}
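
/*
 * Aside: after the sysdev -> struct device conversion these knobs are
 * ordinary device attributes, so from userspace they are reached through
 * sysfs in the usual way, typically (the exact path depends on where the cpu
 * subsystem root device is registered):
 *
 *	cat /sys/devices/system/cpu/sched_mc_power_savings
 *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
 */
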
@@ -8023,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8023 } 6756 }
8024} 6757}
8025 6758
8026static int update_runtime(struct notifier_block *nfb,
8027 unsigned long action, void *hcpu)
8028{
8029 int cpu = (int)(long)hcpu;
8030
8031 switch (action) {
8032 case CPU_DOWN_PREPARE:
8033 case CPU_DOWN_PREPARE_FROZEN:
8034 disable_runtime(cpu_rq(cpu));
8035 return NOTIFY_OK;
8036
8037 case CPU_DOWN_FAILED:
8038 case CPU_DOWN_FAILED_FROZEN:
8039 case CPU_ONLINE:
8040 case CPU_ONLINE_FROZEN:
8041 enable_runtime(cpu_rq(cpu));
8042 return NOTIFY_OK;
8043
8044 default:
8045 return NOTIFY_DONE;
8046 }
8047}
8048
8049void __init sched_init_smp(void) 6759void __init sched_init_smp(void)
8050{ 6760{
8051 cpumask_var_t non_isolated_cpus; 6761 cpumask_var_t non_isolated_cpus;
@@ -8094,104 +6804,11 @@ int in_sched_functions(unsigned long addr)
8094 && addr < (unsigned long)__sched_text_end); 6804 && addr < (unsigned long)__sched_text_end);
8095} 6805}
8096 6806
8097static void init_cfs_rq(struct cfs_rq *cfs_rq) 6807#ifdef CONFIG_CGROUP_SCHED
8098{ 6808struct task_group root_task_group;
8099 cfs_rq->tasks_timeline = RB_ROOT;
8100 INIT_LIST_HEAD(&cfs_rq->tasks);
8101 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8102#ifndef CONFIG_64BIT
8103 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8104#endif
8105}
8106
8107static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8108{
8109 struct rt_prio_array *array;
8110 int i;
8111
8112 array = &rt_rq->active;
8113 for (i = 0; i < MAX_RT_PRIO; i++) {
8114 INIT_LIST_HEAD(array->queue + i);
8115 __clear_bit(i, array->bitmap);
8116 }
8117 /* delimiter for bitsearch: */
8118 __set_bit(MAX_RT_PRIO, array->bitmap);
8119
8120#if defined CONFIG_SMP
8121 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8122 rt_rq->highest_prio.next = MAX_RT_PRIO;
8123 rt_rq->rt_nr_migratory = 0;
8124 rt_rq->overloaded = 0;
8125 plist_head_init(&rt_rq->pushable_tasks);
8126#endif
8127
8128 rt_rq->rt_time = 0;
8129 rt_rq->rt_throttled = 0;
8130 rt_rq->rt_runtime = 0;
8131 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8132}
8133
8134#ifdef CONFIG_FAIR_GROUP_SCHED
8135static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8136 struct sched_entity *se, int cpu,
8137 struct sched_entity *parent)
8138{
8139 struct rq *rq = cpu_rq(cpu);
8140
8141 cfs_rq->tg = tg;
8142 cfs_rq->rq = rq;
8143#ifdef CONFIG_SMP
8144 /* allow initial update_cfs_load() to truncate */
8145 cfs_rq->load_stamp = 1;
8146#endif
8147 init_cfs_rq_runtime(cfs_rq);
8148
8149 tg->cfs_rq[cpu] = cfs_rq;
8150 tg->se[cpu] = se;
8151
8152 /* se could be NULL for root_task_group */
8153 if (!se)
8154 return;
8155
8156 if (!parent)
8157 se->cfs_rq = &rq->cfs;
8158 else
8159 se->cfs_rq = parent->my_q;
8160
8161 se->my_q = cfs_rq;
8162 update_load_set(&se->load, 0);
8163 se->parent = parent;
8164}
8165#endif 6809#endif
8166 6810
8167#ifdef CONFIG_RT_GROUP_SCHED 6811DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8168static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8169 struct sched_rt_entity *rt_se, int cpu,
8170 struct sched_rt_entity *parent)
8171{
8172 struct rq *rq = cpu_rq(cpu);
8173
8174 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8175 rt_rq->rt_nr_boosted = 0;
8176 rt_rq->rq = rq;
8177 rt_rq->tg = tg;
8178
8179 tg->rt_rq[cpu] = rt_rq;
8180 tg->rt_se[cpu] = rt_se;
8181
8182 if (!rt_se)
8183 return;
8184
8185 if (!parent)
8186 rt_se->rt_rq = &rq->rt;
8187 else
8188 rt_se->rt_rq = parent->my_q;
8189
8190 rt_se->my_q = rt_rq;
8191 rt_se->parent = parent;
8192 INIT_LIST_HEAD(&rt_se->run_list);
8193}
8194#endif
8195 6812
8196void __init sched_init(void) 6813void __init sched_init(void)
8197{ 6814{
@@ -8249,9 +6866,17 @@ void __init sched_init(void)
8249#ifdef CONFIG_CGROUP_SCHED 6866#ifdef CONFIG_CGROUP_SCHED
8250 list_add(&root_task_group.list, &task_groups); 6867 list_add(&root_task_group.list, &task_groups);
8251 INIT_LIST_HEAD(&root_task_group.children); 6868 INIT_LIST_HEAD(&root_task_group.children);
6869 INIT_LIST_HEAD(&root_task_group.siblings);
8252 autogroup_init(&init_task); 6870 autogroup_init(&init_task);
6871
8253#endif /* CONFIG_CGROUP_SCHED */ 6872#endif /* CONFIG_CGROUP_SCHED */
8254 6873
6874#ifdef CONFIG_CGROUP_CPUACCT
6875 root_cpuacct.cpustat = &kernel_cpustat;
6876 root_cpuacct.cpuusage = alloc_percpu(u64);
6877 /* Too early, not expected to fail */
6878 BUG_ON(!root_cpuacct.cpuusage);
6879#endif
8255 for_each_possible_cpu(i) { 6880 for_each_possible_cpu(i) {
8256 struct rq *rq; 6881 struct rq *rq;
8257 6882
@@ -8263,7 +6888,7 @@ void __init sched_init(void)
8263 init_cfs_rq(&rq->cfs); 6888 init_cfs_rq(&rq->cfs);
8264 init_rt_rq(&rq->rt, rq); 6889 init_rt_rq(&rq->rt, rq);
8265#ifdef CONFIG_FAIR_GROUP_SCHED 6890#ifdef CONFIG_FAIR_GROUP_SCHED
8266 root_task_group.shares = root_task_group_load; 6891 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8267 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6892 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8268 /* 6893 /*
8269 * How much cpu bandwidth does root_task_group get? 6894 * How much cpu bandwidth does root_task_group get?
@@ -8313,7 +6938,7 @@ void __init sched_init(void)
8313 rq->avg_idle = 2*sysctl_sched_migration_cost; 6938 rq->avg_idle = 2*sysctl_sched_migration_cost;
8314 rq_attach_root(rq, &def_root_domain); 6939 rq_attach_root(rq, &def_root_domain);
8315#ifdef CONFIG_NO_HZ 6940#ifdef CONFIG_NO_HZ
8316 rq->nohz_balance_kick = 0; 6941 rq->nohz_flags = 0;
8317#endif 6942#endif
8318#endif 6943#endif
8319 init_rq_hrtick(rq); 6944 init_rq_hrtick(rq);
@@ -8326,10 +6951,6 @@ void __init sched_init(void)
8326 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6951 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8327#endif 6952#endif
8328 6953
8329#ifdef CONFIG_SMP
8330 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8331#endif
8332
8333#ifdef CONFIG_RT_MUTEXES 6954#ifdef CONFIG_RT_MUTEXES
8334 plist_head_init(&init_task.pi_waiters); 6955 plist_head_init(&init_task.pi_waiters);
8335#endif 6956#endif
@@ -8357,17 +6978,11 @@ void __init sched_init(void)
8357 6978
8358#ifdef CONFIG_SMP 6979#ifdef CONFIG_SMP
8359 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6980 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8360#ifdef CONFIG_NO_HZ
8361 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8362 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8363 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8364 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8365 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8366#endif
8367 /* May be allocated at isolcpus cmdline parse time */ 6981 /* May be allocated at isolcpus cmdline parse time */
8368 if (cpu_isolated_map == NULL) 6982 if (cpu_isolated_map == NULL)
8369 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6983 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8370#endif /* SMP */ 6984#endif
6985 init_sched_fair_class();
8371 6986
8372 scheduler_running = 1; 6987 scheduler_running = 1;
8373} 6988}
@@ -8519,169 +7134,10 @@ void set_curr_task(int cpu, struct task_struct *p)
8519 7134
8520#endif 7135#endif
8521 7136
8522#ifdef CONFIG_FAIR_GROUP_SCHED
8523static void free_fair_sched_group(struct task_group *tg)
8524{
8525 int i;
8526
8527 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8528
8529 for_each_possible_cpu(i) {
8530 if (tg->cfs_rq)
8531 kfree(tg->cfs_rq[i]);
8532 if (tg->se)
8533 kfree(tg->se[i]);
8534 }
8535
8536 kfree(tg->cfs_rq);
8537 kfree(tg->se);
8538}
8539
8540static
8541int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8542{
8543 struct cfs_rq *cfs_rq;
8544 struct sched_entity *se;
8545 int i;
8546
8547 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8548 if (!tg->cfs_rq)
8549 goto err;
8550 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8551 if (!tg->se)
8552 goto err;
8553
8554 tg->shares = NICE_0_LOAD;
8555
8556 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8557
8558 for_each_possible_cpu(i) {
8559 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8560 GFP_KERNEL, cpu_to_node(i));
8561 if (!cfs_rq)
8562 goto err;
8563
8564 se = kzalloc_node(sizeof(struct sched_entity),
8565 GFP_KERNEL, cpu_to_node(i));
8566 if (!se)
8567 goto err_free_rq;
8568
8569 init_cfs_rq(cfs_rq);
8570 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8571 }
8572
8573 return 1;
8574
8575err_free_rq:
8576 kfree(cfs_rq);
8577err:
8578 return 0;
8579}
8580
8581static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8582{
8583 struct rq *rq = cpu_rq(cpu);
8584 unsigned long flags;
8585
8586 /*
8587 * Only empty task groups can be destroyed; so we can speculatively
8588 * check on_list without danger of it being re-added.
8589 */
8590 if (!tg->cfs_rq[cpu]->on_list)
8591 return;
8592
8593 raw_spin_lock_irqsave(&rq->lock, flags);
8594 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8595 raw_spin_unlock_irqrestore(&rq->lock, flags);
8596}
8597#else /* !CONFIG_FAIR_GROUP_SCHED */
8598static inline void free_fair_sched_group(struct task_group *tg)
8599{
8600}
8601
8602static inline
8603int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8604{
8605 return 1;
8606}
8607
8608static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8609{
8610}
8611#endif /* CONFIG_FAIR_GROUP_SCHED */
8612
8613#ifdef CONFIG_RT_GROUP_SCHED
8614static void free_rt_sched_group(struct task_group *tg)
8615{
8616 int i;
8617
8618 if (tg->rt_se)
8619 destroy_rt_bandwidth(&tg->rt_bandwidth);
8620
8621 for_each_possible_cpu(i) {
8622 if (tg->rt_rq)
8623 kfree(tg->rt_rq[i]);
8624 if (tg->rt_se)
8625 kfree(tg->rt_se[i]);
8626 }
8627
8628 kfree(tg->rt_rq);
8629 kfree(tg->rt_se);
8630}
8631
8632static
8633int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8634{
8635 struct rt_rq *rt_rq;
8636 struct sched_rt_entity *rt_se;
8637 int i;
8638
8639 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8640 if (!tg->rt_rq)
8641 goto err;
8642 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8643 if (!tg->rt_se)
8644 goto err;
8645
8646 init_rt_bandwidth(&tg->rt_bandwidth,
8647 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8648
8649 for_each_possible_cpu(i) {
8650 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8651 GFP_KERNEL, cpu_to_node(i));
8652 if (!rt_rq)
8653 goto err;
8654
8655 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8656 GFP_KERNEL, cpu_to_node(i));
8657 if (!rt_se)
8658 goto err_free_rq;
8659
8660 init_rt_rq(rt_rq, cpu_rq(i));
8661 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8662 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8663 }
8664
8665 return 1;
8666
8667err_free_rq:
8668 kfree(rt_rq);
8669err:
8670 return 0;
8671}
8672#else /* !CONFIG_RT_GROUP_SCHED */
8673static inline void free_rt_sched_group(struct task_group *tg)
8674{
8675}
8676
8677static inline
8678int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8679{
8680 return 1;
8681}
8682#endif /* CONFIG_RT_GROUP_SCHED */
8683
8684#ifdef CONFIG_CGROUP_SCHED 7137#ifdef CONFIG_CGROUP_SCHED
7138/* task_group_lock serializes the addition/removal of task groups */
7139static DEFINE_SPINLOCK(task_group_lock);
7140
8685static void free_sched_group(struct task_group *tg) 7141static void free_sched_group(struct task_group *tg)
8686{ 7142{
8687 free_fair_sched_group(tg); 7143 free_fair_sched_group(tg);
@@ -8786,50 +7242,6 @@ void sched_move_task(struct task_struct *tsk)
8786} 7242}
8787#endif /* CONFIG_CGROUP_SCHED */ 7243#endif /* CONFIG_CGROUP_SCHED */
8788 7244
8789#ifdef CONFIG_FAIR_GROUP_SCHED
8790static DEFINE_MUTEX(shares_mutex);
8791
8792int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8793{
8794 int i;
8795 unsigned long flags;
8796
8797 /*
8798 * We can't change the weight of the root cgroup.
8799 */
8800 if (!tg->se[0])
8801 return -EINVAL;
8802
8803 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8804
8805 mutex_lock(&shares_mutex);
8806 if (tg->shares == shares)
8807 goto done;
8808
8809 tg->shares = shares;
8810 for_each_possible_cpu(i) {
8811 struct rq *rq = cpu_rq(i);
8812 struct sched_entity *se;
8813
8814 se = tg->se[i];
8815 /* Propagate contribution to hierarchy */
8816 raw_spin_lock_irqsave(&rq->lock, flags);
8817 for_each_sched_entity(se)
8818 update_cfs_shares(group_cfs_rq(se));
8819 raw_spin_unlock_irqrestore(&rq->lock, flags);
8820 }
8821
8822done:
8823 mutex_unlock(&shares_mutex);
8824 return 0;
8825}
8826
8827unsigned long sched_group_shares(struct task_group *tg)
8828{
8829 return tg->shares;
8830}
8831#endif
8832
8833#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7245#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8834static unsigned long to_ratio(u64 period, u64 runtime) 7246static unsigned long to_ratio(u64 period, u64 runtime)
8835{ 7247{
@@ -8852,7 +7264,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8852 struct task_struct *g, *p; 7264 struct task_struct *g, *p;
8853 7265
8854 do_each_thread(g, p) { 7266 do_each_thread(g, p) {
8855 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7267 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8856 return 1; 7268 return 1;
8857 } while_each_thread(g, p); 7269 } while_each_thread(g, p);
8858 7270
@@ -9144,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9144 sched_destroy_group(tg); 7556 sched_destroy_group(tg);
9145} 7557}
9146 7558
9147static int 7559static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9148cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7560 struct cgroup_taskset *tset)
9149{ 7561{
7562 struct task_struct *task;
7563
7564 cgroup_taskset_for_each(task, cgrp, tset) {
9150#ifdef CONFIG_RT_GROUP_SCHED 7565#ifdef CONFIG_RT_GROUP_SCHED
9151 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 7566 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
9152 return -EINVAL; 7567 return -EINVAL;
9153#else 7568#else
9154 /* We don't support RT-tasks being in separate groups */ 7569 /* We don't support RT-tasks being in separate groups */
9155 if (tsk->sched_class != &fair_sched_class) 7570 if (task->sched_class != &fair_sched_class)
9156 return -EINVAL; 7571 return -EINVAL;
9157#endif 7572#endif
7573 }
9158 return 0; 7574 return 0;
9159} 7575}
9160 7576
9161static void 7577static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9162cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7578 struct cgroup_taskset *tset)
9163{ 7579{
9164 sched_move_task(tsk); 7580 struct task_struct *task;
7581
7582 cgroup_taskset_for_each(task, cgrp, tset)
7583 sched_move_task(task);
9165} 7584}
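
/*
 * Aside: the cgroup core now hands whole task sets to the subsystem, so
 * ->can_attach() must veto the entire set up front (with RT group scheduling
 * it checks sched_rt_can_attach() per task, otherwise it rejects any task
 * outside the fair class) before ->attach() moves each task with
 * sched_move_task().
 */
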
9166 7585
9167static void 7586static void
@@ -9203,8 +7622,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9203 7622
9204static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7623static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9205{ 7624{
9206 int i, ret = 0, runtime_enabled; 7625 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7626 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9208 7627
9209 if (tg == &root_task_group) 7628 if (tg == &root_task_group)
9210 return -EINVAL; 7629 return -EINVAL;
@@ -9231,6 +7650,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9231 goto out_unlock; 7650 goto out_unlock;
9232 7651
9233 runtime_enabled = quota != RUNTIME_INF; 7652 runtime_enabled = quota != RUNTIME_INF;
7653 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7654 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9234 raw_spin_lock_irq(&cfs_b->lock); 7655 raw_spin_lock_irq(&cfs_b->lock);
9235 cfs_b->period = ns_to_ktime(period); 7656 cfs_b->period = ns_to_ktime(period);
9236 cfs_b->quota = quota; 7657 cfs_b->quota = quota;
@@ -9246,13 +7667,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9246 7667
9247 for_each_possible_cpu(i) { 7668 for_each_possible_cpu(i) {
9248 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7669 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9249 struct rq *rq = rq_of(cfs_rq); 7670 struct rq *rq = cfs_rq->rq;
9250 7671
9251 raw_spin_lock_irq(&rq->lock); 7672 raw_spin_lock_irq(&rq->lock);
9252 cfs_rq->runtime_enabled = runtime_enabled; 7673 cfs_rq->runtime_enabled = runtime_enabled;
9253 cfs_rq->runtime_remaining = 0; 7674 cfs_rq->runtime_remaining = 0;
9254 7675
9255 if (cfs_rq_throttled(cfs_rq)) 7676 if (cfs_rq->throttled)
9256 unthrottle_cfs_rq(cfs_rq); 7677 unthrottle_cfs_rq(cfs_rq);
9257 raw_spin_unlock_irq(&rq->lock); 7678 raw_spin_unlock_irq(&rq->lock);
9258 } 7679 }
@@ -9266,7 +7687,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9266{ 7687{
9267 u64 quota, period; 7688 u64 quota, period;
9268 7689
9269 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7690 period = ktime_to_ns(tg->cfs_bandwidth.period);
9270 if (cfs_quota_us < 0) 7691 if (cfs_quota_us < 0)
9271 quota = RUNTIME_INF; 7692 quota = RUNTIME_INF;
9272 else 7693 else
@@ -9279,10 +7700,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9279{ 7700{
9280 u64 quota_us; 7701 u64 quota_us;
9281 7702
9282 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7703 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9283 return -1; 7704 return -1;
9284 7705
9285 quota_us = tg_cfs_bandwidth(tg)->quota; 7706 quota_us = tg->cfs_bandwidth.quota;
9286 do_div(quota_us, NSEC_PER_USEC); 7707 do_div(quota_us, NSEC_PER_USEC);
9287 7708
9288 return quota_us; 7709 return quota_us;
@@ -9293,10 +7714,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9293 u64 quota, period; 7714 u64 quota, period;
9294 7715
9295 period = (u64)cfs_period_us * NSEC_PER_USEC; 7716 period = (u64)cfs_period_us * NSEC_PER_USEC;
9296 quota = tg_cfs_bandwidth(tg)->quota; 7717 quota = tg->cfs_bandwidth.quota;
9297
9298 if (period <= 0)
9299 return -EINVAL;
9300 7718
9301 return tg_set_cfs_bandwidth(tg, period, quota); 7719 return tg_set_cfs_bandwidth(tg, period, quota);
9302} 7720}
@@ -9305,7 +7723,7 @@ long tg_get_cfs_period(struct task_group *tg)
9305{ 7723{
9306 u64 cfs_period_us; 7724 u64 cfs_period_us;
9307 7725
9308 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7726 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9309 do_div(cfs_period_us, NSEC_PER_USEC); 7727 do_div(cfs_period_us, NSEC_PER_USEC);
9310 7728
9311 return cfs_period_us; 7729 return cfs_period_us;
@@ -9365,13 +7783,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9365static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7783static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9366{ 7784{
9367 struct cfs_schedulable_data *d = data; 7785 struct cfs_schedulable_data *d = data;
9368 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7786 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9369 s64 quota = 0, parent_quota = -1; 7787 s64 quota = 0, parent_quota = -1;
9370 7788
9371 if (!tg->parent) { 7789 if (!tg->parent) {
9372 quota = RUNTIME_INF; 7790 quota = RUNTIME_INF;
9373 } else { 7791 } else {
9374 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7792 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9375 7793
9376 quota = normalize_cfs_quota(tg, d); 7794 quota = normalize_cfs_quota(tg, d);
9377 parent_quota = parent_b->hierarchal_quota; 7795 parent_quota = parent_b->hierarchal_quota;
@@ -9415,7 +7833,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9415 struct cgroup_map_cb *cb) 7833 struct cgroup_map_cb *cb)
9416{ 7834{
9417 struct task_group *tg = cgroup_tg(cgrp); 7835 struct task_group *tg = cgroup_tg(cgrp);
9418 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7836 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9419 7837
9420 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7838 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9421 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7839 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9497,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9497 .name = "cpu", 7915 .name = "cpu",
9498 .create = cpu_cgroup_create, 7916 .create = cpu_cgroup_create,
9499 .destroy = cpu_cgroup_destroy, 7917 .destroy = cpu_cgroup_destroy,
9500 .can_attach_task = cpu_cgroup_can_attach_task, 7918 .can_attach = cpu_cgroup_can_attach,
9501 .attach_task = cpu_cgroup_attach_task, 7919 .attach = cpu_cgroup_attach,
9502 .exit = cpu_cgroup_exit, 7920 .exit = cpu_cgroup_exit,
9503 .populate = cpu_cgroup_populate, 7921 .populate = cpu_cgroup_populate,
9504 .subsys_id = cpu_cgroup_subsys_id, 7922 .subsys_id = cpu_cgroup_subsys_id,
@@ -9516,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9516 * (balbir@in.ibm.com). 7934 * (balbir@in.ibm.com).
9517 */ 7935 */
9518 7936
9519/* track cpu usage of a group of tasks and its child groups */
9520struct cpuacct {
9521 struct cgroup_subsys_state css;
9522 /* cpuusage holds pointer to a u64-type object on every cpu */
9523 u64 __percpu *cpuusage;
9524 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9525 struct cpuacct *parent;
9526};
9527
9528struct cgroup_subsys cpuacct_subsys;
9529
9530/* return cpu accounting group corresponding to this container */
9531static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9532{
9533 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9534 struct cpuacct, css);
9535}
9536
9537/* return cpu accounting group to which this task belongs */
9538static inline struct cpuacct *task_ca(struct task_struct *tsk)
9539{
9540 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9541 struct cpuacct, css);
9542}
9543
9544/* create a new cpu accounting group */ 7937/* create a new cpu accounting group */
9545static struct cgroup_subsys_state *cpuacct_create( 7938static struct cgroup_subsys_state *cpuacct_create(
9546 struct cgroup_subsys *ss, struct cgroup *cgrp) 7939 struct cgroup_subsys *ss, struct cgroup *cgrp)
9547{ 7940{
9548 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7941 struct cpuacct *ca;
9549 int i; 7942
7943 if (!cgrp->parent)
7944 return &root_cpuacct.css;
9550 7945
7946 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9551 if (!ca) 7947 if (!ca)
9552 goto out; 7948 goto out;
9553 7949
@@ -9555,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9555 if (!ca->cpuusage) 7951 if (!ca->cpuusage)
9556 goto out_free_ca; 7952 goto out_free_ca;
9557 7953
9558 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7954 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9559 if (percpu_counter_init(&ca->cpustat[i], 0)) 7955 if (!ca->cpustat)
9560 goto out_free_counters; 7956 goto out_free_cpuusage;
9561
9562 if (cgrp->parent)
9563 ca->parent = cgroup_ca(cgrp->parent);
9564 7957
9565 return &ca->css; 7958 return &ca->css;
9566 7959
9567out_free_counters: 7960out_free_cpuusage:
9568 while (--i >= 0)
9569 percpu_counter_destroy(&ca->cpustat[i]);
9570 free_percpu(ca->cpuusage); 7961 free_percpu(ca->cpuusage);
9571out_free_ca: 7962out_free_ca:
9572 kfree(ca); 7963 kfree(ca);
@@ -9579,10 +7970,8 @@ static void
9579cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9580{ 7971{
9581 struct cpuacct *ca = cgroup_ca(cgrp); 7972 struct cpuacct *ca = cgroup_ca(cgrp);
9582 int i;
9583 7973
9584 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7974 free_percpu(ca->cpustat);
9585 percpu_counter_destroy(&ca->cpustat[i]);
9586 free_percpu(ca->cpuusage); 7975 free_percpu(ca->cpuusage);
9587 kfree(ca); 7976 kfree(ca);
9588} 7977}
@@ -9675,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = {
9675}; 8064};
9676 8065
9677static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8066static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9678 struct cgroup_map_cb *cb) 8067 struct cgroup_map_cb *cb)
9679{ 8068{
9680 struct cpuacct *ca = cgroup_ca(cgrp); 8069 struct cpuacct *ca = cgroup_ca(cgrp);
9681 int i; 8070 int cpu;
8071 s64 val = 0;
9682 8072
9683 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8073 for_each_online_cpu(cpu) {
9684 s64 val = percpu_counter_read(&ca->cpustat[i]); 8074 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9685 val = cputime64_to_clock_t(val); 8075 val += kcpustat->cpustat[CPUTIME_USER];
9686 cb->fill(cb, cpuacct_stat_desc[i], val); 8076 val += kcpustat->cpustat[CPUTIME_NICE];
9687 } 8077 }
8078 val = cputime64_to_clock_t(val);
8079 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8080
8081 val = 0;
8082 for_each_online_cpu(cpu) {
8083 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8084 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8085 val += kcpustat->cpustat[CPUTIME_IRQ];
8086 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8087 }
8088
8089 val = cputime64_to_clock_t(val);
8090 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8091
9688 return 0; 8092 return 0;
9689} 8093}
9690 8094
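The rewritten cpuacct_stats_show() above replaces the per-index percpu_counter reads with a walk over the online CPUs, summing the user-side and the system-side buckets of struct kernel_cpustat separately before converting to clock ticks. A minimal standalone sketch of that aggregation pattern (plain C; the struct, enum and helper names are illustrative, not the kernel API):

    #include <stdio.h>
    #include <stdint.h>

    enum { CT_USER, CT_NICE, CT_SYSTEM, CT_IRQ, CT_SOFTIRQ, CT_NSTATS };

    struct cpu_stat { uint64_t t[CT_NSTATS]; };   /* stands in for kernel_cpustat */

    /* sum the "user-like" buckets across all cpus */
    static uint64_t sum_user(const struct cpu_stat *st, int ncpus)
    {
        uint64_t v = 0;
        for (int cpu = 0; cpu < ncpus; cpu++)
            v += st[cpu].t[CT_USER] + st[cpu].t[CT_NICE];
        return v;
    }

    /* sum the "system-like" buckets across all cpus */
    static uint64_t sum_system(const struct cpu_stat *st, int ncpus)
    {
        uint64_t v = 0;
        for (int cpu = 0; cpu < ncpus; cpu++)
            v += st[cpu].t[CT_SYSTEM] + st[cpu].t[CT_IRQ] + st[cpu].t[CT_SOFTIRQ];
        return v;
    }

    int main(void)
    {
        struct cpu_stat st[2] = {
            { .t = { 100, 10, 50, 5, 5 } },
            { .t = { 200, 20, 80, 2, 3 } },
        };
        printf("user=%llu system=%llu\n",
               (unsigned long long)sum_user(st, 2),
               (unsigned long long)sum_system(st, 2));
        return 0;
    }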
@@ -9714,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9714 * 8118 *
9715 * called with rq->lock held. 8119 * called with rq->lock held.
9716 */ 8120 */
9717static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8121void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9718{ 8122{
9719 struct cpuacct *ca; 8123 struct cpuacct *ca;
9720 int cpu; 8124 int cpu;
@@ -9728,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9728 8132
9729 ca = task_ca(tsk); 8133 ca = task_ca(tsk);
9730 8134
9731 for (; ca; ca = ca->parent) { 8135 for (; ca; ca = parent_ca(ca)) {
9732 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8136 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9733 *cpuusage += cputime; 8137 *cpuusage += cputime;
9734 } 8138 }
@@ -9736,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9736 rcu_read_unlock(); 8140 rcu_read_unlock();
9737} 8141}
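cpuacct_charge() above now walks the accounting-group hierarchy through parent_ca() instead of a cached ->parent pointer, adding the same delta to this CPU's usage counter at every level. A rough userspace sketch of that parent-chain charging (the struct is illustrative, not the kernel one, and it keeps a single counter rather than one per CPU):

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    struct acct_group {
        const char *name;
        struct acct_group *parent;
        uint64_t usage;
    };

    /* charge 'delta' to a group and every ancestor, like the for (; ca; ca = parent) loop */
    static void charge(struct acct_group *g, uint64_t delta)
    {
        for (; g; g = g->parent)
            g->usage += delta;
    }

    int main(void)
    {
        struct acct_group root  = { "root",  NULL,  0 };
        struct acct_group child = { "child", &root, 0 };

        charge(&child, 1000);   /* both the child and the root see the charge */
        printf("%s=%llu %s=%llu\n",
               child.name, (unsigned long long)child.usage,
               root.name,  (unsigned long long)root.usage);
        return 0;
    }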
9738 8142
9739/*
9740 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9741 * in cputime_t units. As a result, cpuacct_update_stats calls
9742 * percpu_counter_add with values large enough to always overflow the
9743 * per cpu batch limit causing bad SMP scalability.
9744 *
9745 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9746 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9747 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9748 */
9749#ifdef CONFIG_SMP
9750#define CPUACCT_BATCH \
9751 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9752#else
9753#define CPUACCT_BATCH 0
9754#endif
9755
9756/*
9757 * Charge the system/user time to the task's accounting group.
9758 */
9759static void cpuacct_update_stats(struct task_struct *tsk,
9760 enum cpuacct_stat_index idx, cputime_t val)
9761{
9762 struct cpuacct *ca;
9763 int batch = CPUACCT_BATCH;
9764
9765 if (unlikely(!cpuacct_subsys.active))
9766 return;
9767
9768 rcu_read_lock();
9769 ca = task_ca(tsk);
9770
9771 do {
9772 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9773 ca = ca->parent;
9774 } while (ca);
9775 rcu_read_unlock();
9776}
9777
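The removed comment explains why the percpu_counter batch was scaled in the old code: with CONFIG_VIRT_CPU_ACCOUNTING a single jiffy can be a very large cputime value, so the default batch would overflow on every add. A tiny sketch of that capped scaling (standalone C; the function and parameter names are illustrative):

    #include <stdio.h>
    #include <limits.h>

    /* clamp a scaled batch to INT_MAX, like min_t(long, batch * one_jiffy, INT_MAX) */
    static long scaled_batch(long percpu_batch, long long cputime_one_jiffy)
    {
        long long b = (long long)percpu_batch * cputime_one_jiffy;
        return b > INT_MAX ? INT_MAX : (long)b;
    }

    int main(void)
    {
        /* e.g. a batch of 32 scaled by a jiffy expressed in small cputime units */
        printf("%ld\n", scaled_batch(32, 1000000));    /* 32000000 */
        printf("%ld\n", scaled_batch(32, 1LL << 40));  /* clamped to INT_MAX */
        return 0;
    }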
9778struct cgroup_subsys cpuacct_subsys = { 8143struct cgroup_subsys cpuacct_subsys = {
9779 .name = "cpuacct", 8144 .name = "cpuacct",
9780 .create = cpuacct_create, 8145 .create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb11..d72586fdf660 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
@@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
129 * cpupri_set - update the cpu priority setting 129 * cpupri_set - update the cpu priority setting
130 * @cp: The cpupri context 130 * @cp: The cpupri context
131 * @cpu: The target cpu 131 * @cpu: The target cpu
132 * @pri: The priority (INVALID-RT99) to assign to this CPU 132 * @newpri: The priority (INVALID-RT99) to assign to this CPU
133 * 133 *
134 * Note: Assumes cpu_rq(cpu)->lock is locked 134 * Note: Assumes cpu_rq(cpu)->lock is locked
135 * 135 *
@@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
200/** 200/**
201 * cpupri_init - initialize the cpupri structure 201 * cpupri_init - initialize the cpupri structure
202 * @cp: The cpupri context 202 * @cp: The cpupri context
203 * @bootmem: true if allocations need to use bootmem
204 * 203 *
205 * Returns: -ENOMEM if memory fails. 204 * Returns: -ENOMEM if memory fails.
206 */ 205 */
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 8a39fa3e3c6c..84adb2d66cbd 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
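get_update_sysctl_factor() above scales the latency-related sysctls by 1, by the CPU count, or by 1 + log2(cpus), with the CPU count capped at 8, depending on the tunable-scaling mode. A standalone sketch of that selection (plain C; the enum and helper names are illustrative):

    #include <stdio.h>

    enum scaling { SCALE_NONE, SCALE_LINEAR, SCALE_LOG };

    static unsigned int ilog2_u(unsigned int v)   /* floor(log2(v)), v >= 1 */
    {
        unsigned int r = 0;
        while (v >>= 1)
            r++;
        return r;
    }

    static unsigned int sysctl_factor(unsigned int online_cpus, enum scaling mode)
    {
        unsigned int cpus = online_cpus < 8 ? online_cpus : 8;   /* cap at 8 */

        switch (mode) {
        case SCALE_NONE:   return 1;
        case SCALE_LINEAR: return cpus;
        case SCALE_LOG:
        default:           return 1 + ilog2_u(cpus);
        }
    }

    int main(void)
    {
        /* 4 CPUs, log scaling: factor 3, so a 6ms base latency becomes 18ms */
        printf("%u\n", sysctl_factor(4, SCALE_LOG));
        return 0;
    }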
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
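calc_delta_mine() above is fixed-point arithmetic: delta is scaled by weight/lw->weight using a cached 32-bit inverse (WMULT_CONST / w) and the rounding shift SRR(). A compact userspace model of the same computation (64-bit only, illustrative names, and without the SCHED_LOAD_RESOLUTION handling):

    #include <stdio.h>
    #include <stdint.h>

    #define WMULT_CONST (1ULL << 32)
    #define WMULT_SHIFT 32

    /* shift right and round, as in the SRR() macro */
    static uint64_t srr(uint64_t x, unsigned int y)
    {
        return (x + (1ULL << (y - 1))) >> y;
    }

    /* delta * weight / lw_weight, approximated through a cached inverse of lw_weight */
    static uint64_t calc_delta(uint64_t delta, uint64_t weight, uint64_t lw_weight)
    {
        uint64_t inv = WMULT_CONST / lw_weight;   /* cached as lw->inv_weight */
        uint64_t tmp = delta * weight;

        if (tmp > WMULT_CONST)                    /* avoid overflowing the 64-bit multiply */
            return srr(srr(tmp, WMULT_SHIFT / 2) * inv, WMULT_SHIFT / 2);
        return srr(tmp * inv, WMULT_SHIFT);
    }

    int main(void)
    {
        /* a 1ms slice for a nice-0 task (weight 1024) on a queue of total weight 3072 */
        printf("%llu\n", (unsigned long long)calc_delta(1000000, 1024, 3072));  /* ~333333 */
        return 0;
    }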
215
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -893,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
893 if (unlikely(delta > se->statistics.sleep_max)) 1003 if (unlikely(delta > se->statistics.sleep_max))
894 se->statistics.sleep_max = delta; 1004 se->statistics.sleep_max = delta;
895 1005
896 se->statistics.sleep_start = 0;
897 se->statistics.sum_sleep_runtime += delta; 1006 se->statistics.sum_sleep_runtime += delta;
898 1007
899 if (tsk) { 1008 if (tsk) {
@@ -910,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
910 if (unlikely(delta > se->statistics.block_max)) 1019 if (unlikely(delta > se->statistics.block_max))
911 se->statistics.block_max = delta; 1020 se->statistics.block_max = delta;
912 1021
913 se->statistics.block_start = 0;
914 se->statistics.sum_sleep_runtime += delta; 1022 se->statistics.sum_sleep_runtime += delta;
915 1023
916 if (tsk) { 1024 if (tsk) {
@@ -920,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
920 trace_sched_stat_iowait(tsk, delta); 1028 trace_sched_stat_iowait(tsk, delta);
921 } 1029 }
922 1030
1031 trace_sched_stat_blocked(tsk, delta);
1032
923 /* 1033 /*
924 * Blocking time is in units of nanosecs, so shift by 1034 * Blocking time is in units of nanosecs, so shift by
925 * 20 to get a milliseconds-range estimation of the 1035 * 20 to get a milliseconds-range estimation of the
@@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1287 */ 1397 */
1288 1398
1289#ifdef CONFIG_CFS_BANDWIDTH 1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
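The HAVE_JUMP_LABEL block above keeps a reference count of task groups that currently have a bandwidth limit, so cfs_bandwidth_used() can be compiled down to a patched-out static branch when nothing is throttled. A plain-C sketch of the transition counting, with an ordinary counter standing in for the jump label (illustrative only):

    #include <stdio.h>

    static int bandwidth_users;     /* stands in for the jump_label refcount */

    static int cfs_bandwidth_used(void)
    {
        return bandwidth_users > 0; /* the kernel turns this test into a static branch */
    }

    /* only count groups transitioning between enabled and !enabled */
    static void account_bandwidth_used(int enabled, int was_enabled)
    {
        if (enabled && !was_enabled)
            bandwidth_users++;
        else if (!enabled && was_enabled)
            bandwidth_users--;
    }

    int main(void)
    {
        account_bandwidth_used(1, 0);                 /* a group gains a quota */
        printf("used=%d\n", cfs_bandwidth_used());    /* 1 */
        account_bandwidth_used(1, 1);                 /* quota changed, no transition */
        account_bandwidth_used(0, 1);                 /* quota removed */
        printf("used=%d\n", cfs_bandwidth_used());    /* 0 */
        return 0;
    }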
1290/* 1426/*
1291 * default period for cfs group bandwidth. 1427 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds 1428 * default: 0.1s, units: nanoseconds
@@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1308 * 1444 *
1309 * requires cfs_b->lock 1445 * requires cfs_b->lock
1310 */ 1446 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{ 1448{
1313 u64 now; 1449 u64 now;
1314 1450
@@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321} 1457}
1322 1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1323/* returns 0 on failure to allocate runtime */ 1464/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{ 1466{
@@ -1421,7 +1562,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec) 1563 unsigned long delta_exec)
1423{ 1564{
1424 if (!cfs_rq->runtime_enabled) 1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1425 return; 1566 return;
1426 1567
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec); 1568 __account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1429,13 +1570,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1429 1570
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1571static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{ 1572{
1432 return cfs_rq->throttled; 1573 return cfs_bandwidth_used() && cfs_rq->throttled;
1433} 1574}
1434 1575
1435/* check whether cfs_rq, or any parent, is throttled */ 1576/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 1577static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{ 1578{
1438 return cfs_rq->throttle_count; 1579 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1439} 1580}
1440 1581
1441/* 1582/*
@@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1530 raw_spin_unlock(&cfs_b->lock); 1671 raw_spin_unlock(&cfs_b->lock);
1531} 1672}
1532 1673
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{ 1675{
1535 struct rq *rq = rq_of(cfs_rq); 1676 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1756,6 +1897,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1756 1897
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1898static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{ 1899{
1900 if (!cfs_bandwidth_used())
1901 return;
1902
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 1903 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return; 1904 return;
1761 1905
@@ -1801,6 +1945,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1801 */ 1945 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 1946static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{ 1947{
1948 if (!cfs_bandwidth_used())
1949 return;
1950
1804 /* an active group must be handled by the update_curr()->put() path */ 1951 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 1952 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return; 1953 return;
@@ -1818,6 +1965,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */ 1965/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1966static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{ 1967{
1968 if (!cfs_bandwidth_used())
1969 return;
1970
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 1971 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return; 1972 return;
1823 1973
@@ -1830,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1830 1980
1831 throttle_cfs_rq(cfs_rq); 1981 throttle_cfs_rq(cfs_rq);
1832} 1982}
1833#else 1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
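sched_cfs_period_timer() above leans on hrtimer_forward(): the expiry is pushed forward by whole periods until it lies in the future, and each non-zero overrun count triggers one round of quota refill. A standalone model of that forwarding loop (integer timestamps instead of ktime_t; names and semantics are a simplification, not the hrtimer API):

    #include <stdio.h>
    #include <stdint.h>

    /* advance *expires past 'now' in whole 'period' steps; return how many periods */
    static unsigned long timer_forward(uint64_t *expires, uint64_t now, uint64_t period)
    {
        unsigned long overrun;

        if (now < *expires)
            return 0;               /* still in the future: nothing to do */

        overrun   = (unsigned long)((now - *expires) / period) + 1;
        *expires += (uint64_t)overrun * period;
        return overrun;
    }

    int main(void)
    {
        uint64_t expires = 100, period = 100;

        /* woke up late, at t=350: three periods elapsed, next expiry at t=400 */
        unsigned long n = timer_forward(&expires, 350, period);
        printf("overrun=%lu next=%llu\n", n, (unsigned long long)expires);
        return 0;
    }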
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2044 * (timer_active==0 becomes visible before the hrtimer call-back
2045 * terminates). In either case we ensure that it's re-programmed
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing so we just need to make sure
2080 * there's some valid quota amount
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {} 2090 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1852,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1852{ 2107{
1853 return 0; 2108 return 0;
1854} 2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1855#endif 2115#endif
1856 2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1857/************************************************** 2126/**************************************************
1858 * CFS operations on tasks: 2127 * CFS operations on tasks:
1859 */ 2128 */
@@ -1866,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1866 2135
1867 WARN_ON(task_rq(p) != rq); 2136 WARN_ON(task_rq(p) != rq);
1868 2137
1869 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2138 if (cfs_rq->nr_running > 1) {
1870 u64 slice = sched_slice(cfs_rq, se); 2139 u64 slice = sched_slice(cfs_rq, se);
1871 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2140 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1872 s64 delta = slice - ran; 2141 s64 delta = slice - ran;
@@ -1897,7 +2166,7 @@ static void hrtick_update(struct rq *rq)
1897{ 2166{
1898 struct task_struct *curr = rq->curr; 2167 struct task_struct *curr = rq->curr;
1899 2168
1900 if (curr->sched_class != &fair_sched_class) 2169 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1901 return; 2170 return;
1902 2171
1903 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2172 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2020,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2020} 2289}
2021 2290
2022#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to under-estimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
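source_load() and target_load() above deliberately under- and over-estimate a CPU's load by taking the min or the max of the instantaneous weighted load and a decayed cpu_load[] sample, so migration decisions stay conservative in both directions. A small sketch of that biasing (standalone C, illustrative field names):

    #include <stdio.h>

    struct cpu_load {
        unsigned long now;      /* instantaneous weighted load (rq->load.weight) */
        unsigned long decayed;  /* a smoothed history sample (rq->cpu_load[idx]) */
    };

    /* low guess for a migration source: do not overstate how busy it is */
    static unsigned long source_load(const struct cpu_load *c)
    {
        return c->decayed < c->now ? c->decayed : c->now;
    }

    /* high guess for a migration target: do not understate how busy it is */
    static unsigned long target_load(const struct cpu_load *c)
    {
        return c->decayed > c->now ? c->decayed : c->now;
    }

    int main(void)
    {
        struct cpu_load c = { .now = 2048, .decayed = 1536 };
        printf("source=%lu target=%lu\n", source_load(&c), target_load(&c));
        return 0;
    }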
2023 2347
2024static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
2025{ 2349{
@@ -2327,7 +2651,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2327 int prev_cpu = task_cpu(p); 2651 int prev_cpu = task_cpu(p);
2328 struct sched_domain *sd; 2652 struct sched_domain *sd;
2329 struct sched_group *sg; 2653 struct sched_group *sg;
2330 int i, smt = 0; 2654 int i;
2331 2655
2332 /* 2656 /*
2333 * If the task is going to be woken-up on this cpu and if it is 2657 * If the task is going to be woken-up on this cpu and if it is
@@ -2347,17 +2671,9 @@ static int select_idle_sibling(struct task_struct *p, int target)
2347 * Otherwise, iterate the domains and find an elegible idle cpu. 2671 * Otherwise, iterate the domains and find an elegible idle cpu.
2348 */ 2672 */
2349 rcu_read_lock(); 2673 rcu_read_lock();
2350again:
2351 for_each_domain(target, sd) {
2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2353 continue;
2354
2355 if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
2356 break;
2357
2358 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
2359 break;
2360 2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) {
2361 sg = sd->groups; 2677 sg = sd->groups;
2362 do { 2678 do {
2363 if (!cpumask_intersects(sched_group_cpus(sg), 2679 if (!cpumask_intersects(sched_group_cpus(sg),
@@ -2376,10 +2692,6 @@ next:
2376 sg = sg->next; 2692 sg = sg->next;
2377 } while (sg != sd->groups); 2693 } while (sg != sd->groups);
2378 } 2694 }
2379 if (!smt) {
2380 smt = 1;
2381 goto again;
2382 }
2383done: 2695done:
2384 rcu_read_unlock(); 2696 rcu_read_unlock();
2385 2697
@@ -2408,6 +2720,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2408 int want_sd = 1; 2720 int want_sd = 1;
2409 int sync = wake_flags & WF_SYNC; 2721 int sync = wake_flags & WF_SYNC;
2410 2722
2723 if (p->rt.nr_cpus_allowed == 1)
2724 return prev_cpu;
2725
2411 if (sd_flag & SD_BALANCE_WAKE) { 2726 if (sd_flag & SD_BALANCE_WAKE) {
2412 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 2727 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2413 want_affine = 1; 2728 want_affine = 1;
@@ -2692,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
2692 } while (cfs_rq); 3007 } while (cfs_rq);
2693 3008
2694 p = task_of(se); 3009 p = task_of(se);
2695 hrtick_start_fair(rq, p); 3010 if (hrtick_enabled(rq))
3011 hrtick_start_fair(rq, p);
2696 3012
2697 return p; 3013 return p;
2698} 3014}
@@ -2736,6 +3052,12 @@ static void yield_task_fair(struct rq *rq)
2736 * Update run-time statistics of the 'current'. 3052 * Update run-time statistics of the 'current'.
2737 */ 3053 */
2738 update_curr(cfs_rq); 3054 update_curr(cfs_rq);
3055 /*
3056 * Tell update_rq_clock() that we've just updated,
3057 * so we don't do microscopic update in schedule()
3058 * and double the fastpath cost.
3059 */
3060 rq->skip_clock_update = 1;
2739 } 3061 }
2740 3062
2741 set_skip_buddy(se); 3063 set_skip_buddy(se);
@@ -2776,12 +3098,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2776} 3098}
2777 3099
2778/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
3136#define LBF_ABORT 0x10
3137
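task_hot() above treats a task as cache-hot when it is a buddy candidate or when it ran more recently than sysctl_sched_migration_cost nanoseconds ago, and the balancer skips such tasks when it can. A reduced sketch of the time-based part of that check (plain C; the constants and names are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    /* migration_cost of -1 means "everything is hot", 0 means "nothing is hot" */
    static int task_is_cache_hot(uint64_t now_ns, uint64_t last_exec_ns,
                                 int64_t migration_cost_ns)
    {
        if (migration_cost_ns == -1)
            return 1;
        if (migration_cost_ns == 0)
            return 0;

        return (int64_t)(now_ns - last_exec_ns) < migration_cost_ns;
    }

    int main(void)
    {
        /* with a 0.5ms migration cost, a task that ran 0.1ms ago is still hot */
        printf("%d\n", task_is_cache_hot(1000000,  900000, 500000));  /* 1 */
        printf("%d\n", task_is_cache_hot(2000000,  900000, 500000));  /* 0 */
        return 0;
    }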
3138/*
2779 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2780 */ 3140 */
2781static 3141static
2782int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3142int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2783 struct sched_domain *sd, enum cpu_idle_type idle, 3143 struct sched_domain *sd, enum cpu_idle_type idle,
2784 int *all_pinned) 3144 int *lb_flags)
2785{ 3145{
2786 int tsk_cache_hot = 0; 3146 int tsk_cache_hot = 0;
2787 /* 3147 /*
@@ -2794,7 +3154,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2794 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3154 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2795 return 0; 3155 return 0;
2796 } 3156 }
2797 *all_pinned = 0; 3157 *lb_flags &= ~LBF_ALL_PINNED;
2798 3158
2799 if (task_running(rq, p)) { 3159 if (task_running(rq, p)) {
2800 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3160 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2868,7 +3228,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2868static unsigned long 3228static unsigned long
2869balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3229balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2870 unsigned long max_load_move, struct sched_domain *sd, 3230 unsigned long max_load_move, struct sched_domain *sd,
2871 enum cpu_idle_type idle, int *all_pinned, 3231 enum cpu_idle_type idle, int *lb_flags,
2872 struct cfs_rq *busiest_cfs_rq) 3232 struct cfs_rq *busiest_cfs_rq)
2873{ 3233{
2874 int loops = 0, pulled = 0; 3234 int loops = 0, pulled = 0;
@@ -2879,12 +3239,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2879 goto out; 3239 goto out;
2880 3240
2881 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3241 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2882 if (loops++ > sysctl_sched_nr_migrate) 3242 if (loops++ > sysctl_sched_nr_migrate) {
3243 *lb_flags |= LBF_NEED_BREAK;
2883 break; 3244 break;
3245 }
2884 3246
2885 if ((p->se.load.weight >> 1) > rem_load_move || 3247 if ((p->se.load.weight >> 1) > rem_load_move ||
2886 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3248 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2887 all_pinned)) 3249 lb_flags))
2888 continue; 3250 continue;
2889 3251
2890 pull_task(busiest, p, this_rq, this_cpu); 3252 pull_task(busiest, p, this_rq, this_cpu);
@@ -2897,8 +3259,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2897 * kernels will stop after the first task is pulled to minimize 3259 * kernels will stop after the first task is pulled to minimize
2898 * the critical section. 3260 * the critical section.
2899 */ 3261 */
2900 if (idle == CPU_NEWLY_IDLE) 3262 if (idle == CPU_NEWLY_IDLE) {
3263 *lb_flags |= LBF_ABORT;
2901 break; 3264 break;
3265 }
2902#endif 3266#endif
2903 3267
2904 /* 3268 /*
@@ -3003,7 +3367,7 @@ static unsigned long
3003load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3367load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3004 unsigned long max_load_move, 3368 unsigned long max_load_move,
3005 struct sched_domain *sd, enum cpu_idle_type idle, 3369 struct sched_domain *sd, enum cpu_idle_type idle,
3006 int *all_pinned) 3370 int *lb_flags)
3007{ 3371{
3008 long rem_load_move = max_load_move; 3372 long rem_load_move = max_load_move;
3009 struct cfs_rq *busiest_cfs_rq; 3373 struct cfs_rq *busiest_cfs_rq;
@@ -3016,6 +3380,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3016 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3380 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3017 u64 rem_load, moved_load; 3381 u64 rem_load, moved_load;
3018 3382
3383 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3384 break;
3385
3019 /* 3386 /*
3020 * empty group or part of a throttled hierarchy 3387 * empty group or part of a throttled hierarchy
3021 */ 3388 */
@@ -3027,7 +3394,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3027 rem_load = div_u64(rem_load, busiest_h_load + 1); 3394 rem_load = div_u64(rem_load, busiest_h_load + 1);
3028 3395
3029 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3396 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3030 rem_load, sd, idle, all_pinned, 3397 rem_load, sd, idle, lb_flags,
3031 busiest_cfs_rq); 3398 busiest_cfs_rq);
3032 3399
3033 if (!moved_load) 3400 if (!moved_load)
@@ -3053,10 +3420,10 @@ static unsigned long
3053load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3420load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3054 unsigned long max_load_move, 3421 unsigned long max_load_move,
3055 struct sched_domain *sd, enum cpu_idle_type idle, 3422 struct sched_domain *sd, enum cpu_idle_type idle,
3056 int *all_pinned) 3423 int *lb_flags)
3057{ 3424{
3058 return balance_tasks(this_rq, this_cpu, busiest, 3425 return balance_tasks(this_rq, this_cpu, busiest,
3059 max_load_move, sd, idle, all_pinned, 3426 max_load_move, sd, idle, lb_flags,
3060 &busiest->cfs); 3427 &busiest->cfs);
3061} 3428}
3062#endif 3429#endif
@@ -3071,29 +3438,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3071static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3438static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3072 unsigned long max_load_move, 3439 unsigned long max_load_move,
3073 struct sched_domain *sd, enum cpu_idle_type idle, 3440 struct sched_domain *sd, enum cpu_idle_type idle,
3074 int *all_pinned) 3441 int *lb_flags)
3075{ 3442{
3076 unsigned long total_load_moved = 0, load_moved; 3443 unsigned long total_load_moved = 0, load_moved;
3077 3444
3078 do { 3445 do {
3079 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3446 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3080 max_load_move - total_load_moved, 3447 max_load_move - total_load_moved,
3081 sd, idle, all_pinned); 3448 sd, idle, lb_flags);
3082 3449
3083 total_load_moved += load_moved; 3450 total_load_moved += load_moved;
3084 3451
3452 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3453 break;
3454
3085#ifdef CONFIG_PREEMPT 3455#ifdef CONFIG_PREEMPT
3086 /* 3456 /*
3087 * NEWIDLE balancing is a source of latency, so preemptible 3457 * NEWIDLE balancing is a source of latency, so preemptible
3088 * kernels will stop after the first task is pulled to minimize 3458 * kernels will stop after the first task is pulled to minimize
3089 * the critical section. 3459 * the critical section.
3090 */ 3460 */
3091 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3461 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3092 break; 3462 *lb_flags |= LBF_ABORT;
3093
3094 if (raw_spin_is_contended(&this_rq->lock) ||
3095 raw_spin_is_contended(&busiest->lock))
3096 break; 3463 break;
3464 }
3097#endif 3465#endif
3098 } while (load_moved && max_load_move > total_load_moved); 3466 } while (load_moved && max_load_move > total_load_moved);
3099 3467
@@ -3155,15 +3523,6 @@ struct sg_lb_stats {
3155}; 3523};
3156 3524
3157/** 3525/**
3158 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3159 * @group: The group whose first cpu is to be returned.
3160 */
3161static inline unsigned int group_first_cpu(struct sched_group *group)
3162{
3163 return cpumask_first(sched_group_cpus(group));
3164}
3165
3166/**
3167 * get_sd_load_idx - Obtain the load index for a given sched domain. 3526 * get_sd_load_idx - Obtain the load index for a given sched domain.
3168 * @sd: The sched_domain whose load_idx is to be obtained. 3527 * @sd: The sched_domain whose load_idx is to be obtained.
3169 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 3528 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
@@ -3412,7 +3771,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3412 sdg->sgp->power = power; 3771 sdg->sgp->power = power;
3413} 3772}
3414 3773
3415static void update_group_power(struct sched_domain *sd, int cpu) 3774void update_group_power(struct sched_domain *sd, int cpu)
3416{ 3775{
3417 struct sched_domain *child = sd->child; 3776 struct sched_domain *child = sd->child;
3418 struct sched_group *group, *sdg = sd->groups; 3777 struct sched_group *group, *sdg = sd->groups;
@@ -3678,11 +4037,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3678 } while (sg != sd->groups); 4037 } while (sg != sd->groups);
3679} 4038}
3680 4039
3681int __weak arch_sd_sibling_asym_packing(void)
3682{
3683 return 0*SD_ASYM_PACKING;
3684}
3685
3686/** 4040/**
3687 * check_asym_packing - Check to see if the group is packed into the 4041 * check_asym_packing - Check to see if the group is packed into the
3688 * sched doman. 4042 * sched doman.
@@ -4046,7 +4400,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4046#define MAX_PINNED_INTERVAL 512 4400#define MAX_PINNED_INTERVAL 512
4047 4401
4048/* Working cpumask for load_balance and load_balance_newidle. */ 4402/* Working cpumask for load_balance and load_balance_newidle. */
4049static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4403DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4050 4404
4051static int need_active_balance(struct sched_domain *sd, int idle, 4405static int need_active_balance(struct sched_domain *sd, int idle,
4052 int busiest_cpu, int this_cpu) 4406 int busiest_cpu, int this_cpu)
@@ -4097,7 +4451,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4097 struct sched_domain *sd, enum cpu_idle_type idle, 4451 struct sched_domain *sd, enum cpu_idle_type idle,
4098 int *balance) 4452 int *balance)
4099{ 4453{
4100 int ld_moved, all_pinned = 0, active_balance = 0; 4454 int ld_moved, lb_flags = 0, active_balance = 0;
4101 struct sched_group *group; 4455 struct sched_group *group;
4102 unsigned long imbalance; 4456 unsigned long imbalance;
4103 struct rq *busiest; 4457 struct rq *busiest;
@@ -4138,11 +4492,11 @@ redo:
4138 * still unbalanced. ld_moved simply stays zero, so it is 4492 * still unbalanced. ld_moved simply stays zero, so it is
4139 * correctly treated as an imbalance. 4493 * correctly treated as an imbalance.
4140 */ 4494 */
4141 all_pinned = 1; 4495 lb_flags |= LBF_ALL_PINNED;
4142 local_irq_save(flags); 4496 local_irq_save(flags);
4143 double_rq_lock(this_rq, busiest); 4497 double_rq_lock(this_rq, busiest);
4144 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4498 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4145 imbalance, sd, idle, &all_pinned); 4499 imbalance, sd, idle, &lb_flags);
4146 double_rq_unlock(this_rq, busiest); 4500 double_rq_unlock(this_rq, busiest);
4147 local_irq_restore(flags); 4501 local_irq_restore(flags);
4148 4502
@@ -4152,8 +4506,18 @@ redo:
4152 if (ld_moved && this_cpu != smp_processor_id()) 4506 if (ld_moved && this_cpu != smp_processor_id())
4153 resched_cpu(this_cpu); 4507 resched_cpu(this_cpu);
4154 4508
4509 if (lb_flags & LBF_ABORT)
4510 goto out_balanced;
4511
4512 if (lb_flags & LBF_NEED_BREAK) {
4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4516 goto redo;
4517 }
4518
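The retry logic above packs the load-balance bookkeeping into one lb_flags word: adding (LBF_HAD_BREAK - LBF_NEED_BREAK) clears the pending break and counts it, and repeated breaks eventually carry out of the LBF_HAD_BREAKS field into LBF_ABORT. A standalone sketch of that flag arithmetic, using the same bit values as the hunk above (the loop itself is only a simulation):

    #include <stdio.h>

    #define LBF_ALL_PINNED 0x01
    #define LBF_NEED_BREAK 0x02  /* clears into HAD_BREAK */
    #define LBF_HAD_BREAK  0x04
    #define LBF_HAD_BREAKS 0x0C  /* repeated HAD_BREAKs carry into ABORT */
    #define LBF_ABORT      0x10

    int main(void)
    {
        int flags = 0;

        /* each simulated pass sets NEED_BREAK, then converts it into HAD_BREAK;
         * the addition lets the break count carry upward until ABORT is reached */
        for (int pass = 1; pass <= 4; pass++) {
            flags |= LBF_NEED_BREAK;
            flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
            printf("pass %d: flags=0x%02x abort=%d\n",
                   pass, flags, !!(flags & LBF_ABORT));
        }
        return 0;
    }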
4155 /* All tasks on this runqueue were pinned by CPU affinity */ 4519 /* All tasks on this runqueue were pinned by CPU affinity */
4156 if (unlikely(all_pinned)) { 4520 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
4157 cpumask_clear_cpu(cpu_of(busiest), cpus); 4521 cpumask_clear_cpu(cpu_of(busiest), cpus);
4158 if (!cpumask_empty(cpus)) 4522 if (!cpumask_empty(cpus))
4159 goto redo; 4523 goto redo;
@@ -4183,7 +4547,7 @@ redo:
4183 tsk_cpus_allowed(busiest->curr))) { 4547 tsk_cpus_allowed(busiest->curr))) {
4184 raw_spin_unlock_irqrestore(&busiest->lock, 4548 raw_spin_unlock_irqrestore(&busiest->lock,
4185 flags); 4549 flags);
4186 all_pinned = 1; 4550 lb_flags |= LBF_ALL_PINNED;
4187 goto out_one_pinned; 4551 goto out_one_pinned;
4188 } 4552 }
4189 4553
@@ -4236,7 +4600,8 @@ out_balanced:
4236 4600
4237out_one_pinned: 4601out_one_pinned:
4238 /* tune up the balancing interval */ 4602 /* tune up the balancing interval */
4239 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4603 if (((lb_flags & LBF_ALL_PINNED) &&
4604 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4240 (sd->balance_interval < sd->max_interval)) 4605 (sd->balance_interval < sd->max_interval))
4241 sd->balance_interval *= 2; 4606 sd->balance_interval *= 2;
4242 4607
@@ -4249,7 +4614,7 @@ out:
4249 * idle_balance is called by schedule() if this_cpu is about to become 4614 * idle_balance is called by schedule() if this_cpu is about to become
4250 * idle. Attempts to pull tasks from other CPUs. 4615 * idle. Attempts to pull tasks from other CPUs.
4251 */ 4616 */
4252static void idle_balance(int this_cpu, struct rq *this_rq) 4617void idle_balance(int this_cpu, struct rq *this_rq)
4253{ 4618{
4254 struct sched_domain *sd; 4619 struct sched_domain *sd;
4255 int pulled_task = 0; 4620 int pulled_task = 0;
@@ -4364,28 +4729,16 @@ out_unlock:
4364#ifdef CONFIG_NO_HZ 4729#ifdef CONFIG_NO_HZ
4365/* 4730/*
4366 * idle load balancing details 4731 * idle load balancing details
4367 * - One of the idle CPUs nominates itself as idle load_balancer, while
4368 * entering idle.
4369 * - This idle load balancer CPU will also go into tickless mode when
4370 * it is idle, just like all other idle CPUs
4371 * - When one of the busy CPUs notice that there may be an idle rebalancing 4732 * - When one of the busy CPUs notice that there may be an idle rebalancing
4372 * needed, they will kick the idle load balancer, which then does idle 4733 * needed, they will kick the idle load balancer, which then does idle
4373 * load balancing for all the idle CPUs. 4734 * load balancing for all the idle CPUs.
4374 */ 4735 */
4375static struct { 4736static struct {
4376 atomic_t load_balancer;
4377 atomic_t first_pick_cpu;
4378 atomic_t second_pick_cpu;
4379 cpumask_var_t idle_cpus_mask; 4737 cpumask_var_t idle_cpus_mask;
4380 cpumask_var_t grp_idle_mask; 4738 atomic_t nr_cpus;
4381 unsigned long next_balance; /* in jiffy units */ 4739 unsigned long next_balance; /* in jiffy units */
4382} nohz ____cacheline_aligned; 4740} nohz ____cacheline_aligned;
4383 4741
4384int get_nohz_load_balancer(void)
4385{
4386 return atomic_read(&nohz.load_balancer);
4387}
4388
4389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4742#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4390/** 4743/**
4391 * lowest_flag_domain - Return lowest sched_domain containing flag. 4744 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4422,33 +4775,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4422 (sd && (sd->flags & flag)); sd = sd->parent) 4775 (sd && (sd->flags & flag)); sd = sd->parent)
4423 4776
4424/** 4777/**
4425 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4426 * @ilb_group: group to be checked for semi-idleness
4427 *
4428 * Returns: 1 if the group is semi-idle. 0 otherwise.
4429 *
4430 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4431 * and atleast one non-idle CPU. This helper function checks if the given
4432 * sched_group is semi-idle or not.
4433 */
4434static inline int is_semi_idle_group(struct sched_group *ilb_group)
4435{
4436 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4437 sched_group_cpus(ilb_group));
4438
4439 /*
4440 * A sched_group is semi-idle when it has atleast one busy cpu
4441 * and atleast one idle cpu.
4442 */
4443 if (cpumask_empty(nohz.grp_idle_mask))
4444 return 0;
4445
4446 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4447 return 0;
4448
4449 return 1;
4450}
4451/**
4452 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4778 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4453 * @cpu: The cpu which is nominating a new idle_load_balancer. 4779 * @cpu: The cpu which is nominating a new idle_load_balancer.
4454 * 4780 *
@@ -4462,9 +4788,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
4462 */ 4788 */
4463static int find_new_ilb(int cpu) 4789static int find_new_ilb(int cpu)
4464{ 4790{
4791 int ilb = cpumask_first(nohz.idle_cpus_mask);
4792 struct sched_group *ilbg;
4465 struct sched_domain *sd; 4793 struct sched_domain *sd;
4466 struct sched_group *ilb_group;
4467 int ilb = nr_cpu_ids;
4468 4794
4469 /* 4795 /*
4470 * Have idle load balancer selection from semi-idle packages only 4796 * Have idle load balancer selection from semi-idle packages only
@@ -4482,23 +4808,28 @@ static int find_new_ilb(int cpu)
4482 4808
4483 rcu_read_lock(); 4809 rcu_read_lock();
4484 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4810 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4485 ilb_group = sd->groups; 4811 ilbg = sd->groups;
4486 4812
4487 do { 4813 do {
4488 if (is_semi_idle_group(ilb_group)) { 4814 if (ilbg->group_weight !=
4489 ilb = cpumask_first(nohz.grp_idle_mask); 4815 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4816 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4817 sched_group_cpus(ilbg));
4490 goto unlock; 4818 goto unlock;
4491 } 4819 }
4492 4820
4493 ilb_group = ilb_group->next; 4821 ilbg = ilbg->next;
4494 4822
4495 } while (ilb_group != sd->groups); 4823 } while (ilbg != sd->groups);
4496 } 4824 }
4497unlock: 4825unlock:
4498 rcu_read_unlock(); 4826 rcu_read_unlock();
4499 4827
4500out_done: 4828out_done:
4501 return ilb; 4829 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4830 return ilb;
4831
4832 return nr_cpu_ids;
4502} 4833}
4503#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4834#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4504static inline int find_new_ilb(int call_cpu) 4835static inline int find_new_ilb(int call_cpu)
@@ -4518,99 +4849,68 @@ static void nohz_balancer_kick(int cpu)
4518 4849
4519 nohz.next_balance++; 4850 nohz.next_balance++;
4520 4851
4521 ilb_cpu = get_nohz_load_balancer(); 4852 ilb_cpu = find_new_ilb(cpu);
4522
4523 if (ilb_cpu >= nr_cpu_ids) {
4524 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
4525 if (ilb_cpu >= nr_cpu_ids)
4526 return;
4527 }
4528 4853
4529 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4854 if (ilb_cpu >= nr_cpu_ids)
4530 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4855 return;
4531 4856
4532 smp_mb(); 4857 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4533 /* 4858 return;
4534 * Use smp_send_reschedule() instead of resched_cpu(). 4859 /*
4535 * This way we generate a sched IPI on the target cpu which 4860 * Use smp_send_reschedule() instead of resched_cpu().
4536 * is idle. And the softirq performing nohz idle load balance 4861 * This way we generate a sched IPI on the target cpu which
4537 * will be run before returning from the IPI. 4862 * is idle. And the softirq performing nohz idle load balance
4538 */ 4863 * will be run before returning from the IPI.
4539 smp_send_reschedule(ilb_cpu); 4864 */
4540 } 4865 smp_send_reschedule(ilb_cpu);
4541 return; 4866 return;
4542} 4867}
4543 4868
4544/* 4869static inline void set_cpu_sd_state_busy(void)
4545 * This routine will try to nominate the ilb (idle load balancing)
4546 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4547 * load balancing on behalf of all those cpus.
4548 *
4549 * When the ilb owner becomes busy, we will not have new ilb owner until some
4550 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
4551 * idle load balancing by kicking one of the idle CPUs.
4552 *
4553 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
4554 * ilb owner CPU in future (when there is a need for idle load balancing on
4555 * behalf of all idle CPUs).
4556 */
4557void select_nohz_load_balancer(int stop_tick)
4558{ 4870{
4871 struct sched_domain *sd;
4559 int cpu = smp_processor_id(); 4872 int cpu = smp_processor_id();
4560 4873
4561 if (stop_tick) { 4874 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4562 if (!cpu_active(cpu)) { 4875 return;
4563 if (atomic_read(&nohz.load_balancer) != cpu) 4876 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4564 return;
4565
4566 /*
4567 * If we are going offline and still the leader,
4568 * give up!
4569 */
4570 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4571 nr_cpu_ids) != cpu)
4572 BUG();
4573 4877
4574 return; 4878 rcu_read_lock();
4575 } 4879 for_each_domain(cpu, sd)
4880 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4881 rcu_read_unlock();
4882}
4576 4883
4577 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4884void set_cpu_sd_state_idle(void)
4885{
4886 struct sched_domain *sd;
4887 int cpu = smp_processor_id();
4578 4888
4579 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4889 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4580 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4890 return;
4581 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4891 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4582 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4583 4892
4584 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4893 rcu_read_lock();
4585 int new_ilb; 4894 for_each_domain(cpu, sd)
4895 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4896 rcu_read_unlock();
4897}
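set_cpu_sd_state_busy() and set_cpu_sd_state_idle() above keep a per-group count of busy CPUs, guarded by a per-cpu NOHZ_IDLE bit so the increment and decrement stay balanced even when the hooks run repeatedly. A compact sketch of that idempotent bookkeeping for a single group (ordinary ints instead of atomics, names illustrative):

    #include <stdio.h>
    #include <stdbool.h>

    struct cpu_state {
        bool marked_idle;       /* mirrors the NOHZ_IDLE flag bit */
    };

    static int nr_busy_cpus;    /* mirrors sgp->nr_busy_cpus for one group */

    static void cpu_set_busy(struct cpu_state *c)
    {
        if (!c->marked_idle)    /* already counted as busy: do nothing */
            return;
        c->marked_idle = false;
        nr_busy_cpus++;
    }

    static void cpu_set_idle(struct cpu_state *c)
    {
        if (c->marked_idle)     /* already counted as idle: do nothing */
            return;
        c->marked_idle = true;
        nr_busy_cpus--;
    }

    int main(void)
    {
        struct cpu_state c0 = { false }, c1 = { false };
        nr_busy_cpus = 2;       /* both CPUs start out busy */

        cpu_set_idle(&c0);
        cpu_set_idle(&c0);      /* the repeated call is harmless */
        cpu_set_busy(&c0);
        cpu_set_idle(&c1);
        printf("busy=%d\n", nr_busy_cpus);  /* 1 */
        return 0;
    }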
4586 4898
4587 /* make me the ilb owner */ 4899/*
4588 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4900 * This routine will record that this cpu is going idle with tick stopped.
4589 cpu) != nr_cpu_ids) 4901 * This info will be used in performing idle load balancing in the future.
4590 return; 4902 */
4903void select_nohz_load_balancer(int stop_tick)
4904{
4905 int cpu = smp_processor_id();
4591 4906
4592 /* 4907 if (stop_tick) {
4593 * Check to see if there is a more power-efficient 4908 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4594 * ilb.
4595 */
4596 new_ilb = find_new_ilb(cpu);
4597 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4598 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4599 resched_cpu(new_ilb);
4600 return;
4601 }
4602 return;
4603 }
4604 } else {
4605 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4606 return; 4909 return;
4607 4910
4608 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4911 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4609 4912 atomic_inc(&nohz.nr_cpus);
4610 if (atomic_read(&nohz.load_balancer) == cpu) 4913 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4611 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4612 nr_cpu_ids) != cpu)
4613 BUG();
4614 } 4914 }
4615 return; 4915 return;
4616} 4916}
@@ -4624,7 +4924,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4624 * Scale the max load_balance interval with the number of CPUs in the system. 4924 * Scale the max load_balance interval with the number of CPUs in the system.
4625 * This trades load-balance latency on larger machines for less cross talk. 4925 * This trades load-balance latency on larger machines for less cross talk.
4626 */ 4926 */
4627static void update_max_interval(void) 4927void update_max_interval(void)
4628{ 4928{
4629 max_load_balance_interval = HZ*num_online_cpus()/10; 4929 max_load_balance_interval = HZ*num_online_cpus()/10;
4630} 4930}
@@ -4716,11 +5016,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4716 struct rq *rq; 5016 struct rq *rq;
4717 int balance_cpu; 5017 int balance_cpu;
4718 5018
4719 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5019 if (idle != CPU_IDLE ||
4720 return; 5020 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5021 goto end;
4721 5022
4722 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5023 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4723 if (balance_cpu == this_cpu) 5024 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4724 continue; 5025 continue;
4725 5026
4726 /* 5027 /*
@@ -4728,10 +5029,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4728 * work being done for other cpus. Next load 5029 * work being done for other cpus. Next load
4729 * balancing owner will pick it up. 5030 * balancing owner will pick it up.
4730 */ 5031 */
4731 if (need_resched()) { 5032 if (need_resched())
4732 this_rq->nohz_balance_kick = 0;
4733 break; 5033 break;
4734 }
4735 5034
4736 raw_spin_lock_irq(&this_rq->lock); 5035 raw_spin_lock_irq(&this_rq->lock);
4737 update_rq_clock(this_rq); 5036 update_rq_clock(this_rq);
@@ -4745,53 +5044,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4745 this_rq->next_balance = rq->next_balance; 5044 this_rq->next_balance = rq->next_balance;
4746 } 5045 }
4747 nohz.next_balance = this_rq->next_balance; 5046 nohz.next_balance = this_rq->next_balance;
4748 this_rq->nohz_balance_kick = 0; 5047end:
5048 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4749} 5049}
4750 5050
4751/* 5051/*
4752 * Current heuristic for kicking the idle load balancer 5052 * Current heuristic for kicking the idle load balancer in the presence
4753 * - first_pick_cpu is the one of the busy CPUs. It will kick 5053 * of an idle cpu in the system.
4754 * idle load balancer when it has more than one process active. This 5054 * - This rq has more than one task.
4755 * eliminates the need for idle load balancing altogether when we have 5055 * - At any scheduler domain level, this cpu's scheduler group has multiple
4756 * only one running process in the system (common case). 5056 * busy cpu's exceeding the group's power.
4757 * - If there are more than one busy CPU, idle load balancer may have 5057 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4758 * to run for active_load_balance to happen (i.e., two busy CPUs are 5058 * domain span are idle.
4759 * SMT or core siblings and can run better if they move to different
4760 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4761 * which will kick idle load balancer as soon as it has any load.
4762 */ 5059 */
4763static inline int nohz_kick_needed(struct rq *rq, int cpu) 5060static inline int nohz_kick_needed(struct rq *rq, int cpu)
4764{ 5061{
4765 unsigned long now = jiffies; 5062 unsigned long now = jiffies;
4766 int ret; 5063 struct sched_domain *sd;
4767 int first_pick_cpu, second_pick_cpu;
4768 5064
4769 if (time_before(now, nohz.next_balance)) 5065 if (unlikely(idle_cpu(cpu)))
4770 return 0; 5066 return 0;
4771 5067
4772 if (idle_cpu(cpu)) 5068 /*
4773 return 0; 5069 * We may be recently in ticked or tickless idle mode. At the first
5070 * busy tick after returning from idle, we will update the busy stats.
5071 */
5072 set_cpu_sd_state_busy();
5073 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5074 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5075 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5076 atomic_dec(&nohz.nr_cpus);
5077 }
4774 5078
4775 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5079 /*
4776 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5080 * None are in tickless mode and hence no need for NOHZ idle load
5081 * balancing.
5082 */
5083 if (likely(!atomic_read(&nohz.nr_cpus)))
5084 return 0;
4777 5085
4778 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5086 if (time_before(now, nohz.next_balance))
4779 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4780 return 0; 5087 return 0;
4781 5088
4782 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5089 if (rq->nr_running >= 2)
4783 if (ret == nr_cpu_ids || ret == cpu) { 5090 goto need_kick;
4784 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5091
4785 if (rq->nr_running > 1) 5092 rcu_read_lock();
4786 return 1; 5093 for_each_domain(cpu, sd) {
4787 } else { 5094 struct sched_group *sg = sd->groups;
4788 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5095 struct sched_group_power *sgp = sg->sgp;
4789 if (ret == nr_cpu_ids || ret == cpu) { 5096 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4790 if (rq->nr_running) 5097
4791 return 1; 5098 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4792 } 5099 goto need_kick_unlock;
5100
5101 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5102 && (cpumask_first_and(nohz.idle_cpus_mask,
5103 sched_domain_span(sd)) < cpu))
5104 goto need_kick_unlock;
5105
5106 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5107 break;
4793 } 5108 }
5109 rcu_read_unlock();
4794 return 0; 5110 return 0;
5111
5112need_kick_unlock:
5113 rcu_read_unlock();
5114need_kick:
5115 return 1;
4795} 5116}
4796#else 5117#else
4797static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5118static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
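The rewritten nohz_kick_needed() above replaces the old first_pick/second_pick bookkeeping with a walk over the cpu's sched domains. As a rough, userspace-style sketch of the same decision flow (all struct and field names below are simplified stand-ins invented for the example, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for one sched_domain level's group statistics. */
struct toy_level {
	int nr_busy_cpus;	/* like sgp->nr_busy_cpus */
	int group_weight;
	bool shares_pkg;	/* like SD_SHARE_PKG_RESOURCES */
	bool asym_packing;	/* like SD_ASYM_PACKING */
	int first_idle_cpu;	/* lowest idle cpu in the span, -1 if none */
};

static bool toy_nohz_kick_needed(int cpu, int nr_running, int nr_tickless_cpus,
				 const struct toy_level *lv, int nr_levels)
{
	if (nr_tickless_cpus == 0)	/* nobody is in tickless idle */
		return false;
	if (nr_running >= 2)		/* this rq alone justifies a kick */
		return true;

	for (int i = 0; i < nr_levels; i++) {
		if (lv[i].shares_pkg && lv[i].nr_busy_cpus > 1)
			return true;
		if (lv[i].asym_packing &&
		    lv[i].nr_busy_cpus != lv[i].group_weight &&
		    lv[i].first_idle_cpu >= 0 && lv[i].first_idle_cpu < cpu)
			return true;
		if (!lv[i].shares_pkg && !lv[i].asym_packing)
			break;		/* nothing interesting further up */
	}
	return false;
}

int main(void)
{
	/* Two SMT siblings busy in a package-sharing domain: kick. */
	struct toy_level smt = { .nr_busy_cpus = 2, .group_weight = 2,
				 .shares_pkg = true, .first_idle_cpu = -1 };
	printf("%d\n", toy_nohz_kick_needed(3, 1, 1, &smt, 1)); /* prints 1 */
	return 0;
}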
@@ -4826,14 +5147,14 @@ static inline int on_null_domain(int cpu)
4826/* 5147/*
4827 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5148 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4828 */ 5149 */
4829static inline void trigger_load_balance(struct rq *rq, int cpu) 5150void trigger_load_balance(struct rq *rq, int cpu)
4830{ 5151{
4831 /* Don't need to rebalance while attached to NULL domain */ 5152 /* Don't need to rebalance while attached to NULL domain */
4832 if (time_after_eq(jiffies, rq->next_balance) && 5153 if (time_after_eq(jiffies, rq->next_balance) &&
4833 likely(!on_null_domain(cpu))) 5154 likely(!on_null_domain(cpu)))
4834 raise_softirq(SCHED_SOFTIRQ); 5155 raise_softirq(SCHED_SOFTIRQ);
4835#ifdef CONFIG_NO_HZ 5156#ifdef CONFIG_NO_HZ
4836 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5157 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4837 nohz_balancer_kick(cpu); 5158 nohz_balancer_kick(cpu);
4838#endif 5159#endif
4839} 5160}
@@ -4848,15 +5169,6 @@ static void rq_offline_fair(struct rq *rq)
4848 update_sysctl(); 5169 update_sysctl();
4849} 5170}
4850 5171
4851#else /* CONFIG_SMP */
4852
4853/*
4854 * on UP we do not need to balance between CPUs:
4855 */
4856static inline void idle_balance(int cpu, struct rq *rq)
4857{
4858}
4859
4860#endif /* CONFIG_SMP */ 5172#endif /* CONFIG_SMP */
4861 5173
4862/* 5174/*
@@ -4880,8 +5192,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4880 */ 5192 */
4881static void task_fork_fair(struct task_struct *p) 5193static void task_fork_fair(struct task_struct *p)
4882{ 5194{
4883 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5195 struct cfs_rq *cfs_rq;
4884 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5196 struct sched_entity *se = &p->se, *curr;
4885 int this_cpu = smp_processor_id(); 5197 int this_cpu = smp_processor_id();
4886 struct rq *rq = this_rq(); 5198 struct rq *rq = this_rq();
4887 unsigned long flags; 5199 unsigned long flags;
@@ -4890,6 +5202,9 @@ static void task_fork_fair(struct task_struct *p)
4890 5202
4891 update_rq_clock(rq); 5203 update_rq_clock(rq);
4892 5204
5205 cfs_rq = task_cfs_rq(current);
5206 curr = cfs_rq->curr;
5207
4893 if (unlikely(task_cpu(p) != this_cpu)) { 5208 if (unlikely(task_cpu(p) != this_cpu)) {
4894 rcu_read_lock(); 5209 rcu_read_lock();
4895 __set_task_cpu(p, this_cpu); 5210 __set_task_cpu(p, this_cpu);
@@ -4999,6 +5314,16 @@ static void set_curr_task_fair(struct rq *rq)
4999 } 5314 }
5000} 5315}
5001 5316
5317void init_cfs_rq(struct cfs_rq *cfs_rq)
5318{
5319 cfs_rq->tasks_timeline = RB_ROOT;
5320 INIT_LIST_HEAD(&cfs_rq->tasks);
5321 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5322#ifndef CONFIG_64BIT
5323 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5324#endif
5325}
5326
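The odd-looking initial value in init_cfs_rq() above, (u64)(-(1LL << 20)), is 2^64 - 2^20, i.e. about a millisecond's worth of vruntime below the u64 wrap point, so the signed-difference comparisons CFS relies on are exercised shortly after boot. A minimal standalone illustration of that idiom (my own example, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* CFS compares vruntimes via signed 64-bit differences so the ordering
 * stays correct across u64 wraparound. */
static int vruntime_after(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

int main(void)
{
	uint64_t start = (uint64_t)(-(1LL << 20));  /* ~1 ms before the wrap */
	uint64_t later = start + (1 << 21);         /* past the wrap point */

	printf("start = %llu\n", (unsigned long long)start);
	printf("later > start ? %d\n", vruntime_after(later, start)); /* 1 */
	printf("naive > start ? %d\n", later > start);                 /* 0 */
	return 0;
}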
5002#ifdef CONFIG_FAIR_GROUP_SCHED 5327#ifdef CONFIG_FAIR_GROUP_SCHED
5003static void task_move_group_fair(struct task_struct *p, int on_rq) 5328static void task_move_group_fair(struct task_struct *p, int on_rq)
5004{ 5329{
@@ -5015,13 +5340,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5015 * to another cgroup's rq. This does somewhat interfere with the 5340 * to another cgroup's rq. This does somewhat interfere with the
5016 * fair sleeper stuff for the first placement, but who cares. 5341 * fair sleeper stuff for the first placement, but who cares.
5017 */ 5342 */
5343 /*
5344 * When !on_rq, vruntime of the task has usually NOT been normalized.
5345 * But there are some cases where it has already been normalized:
5346 *
5347 * - Moving a forked child which is waiting for being woken up by
5348 * wake_up_new_task().
5349 * - Moving a task which has been woken up by try_to_wake_up() and
5350 * waiting for actually being woken up by sched_ttwu_pending().
5351 *
5352 * To prevent boost or penalty in the new cfs_rq caused by delta
5353 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5354 */
5355 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5356 on_rq = 1;
5357
5018 if (!on_rq) 5358 if (!on_rq)
5019 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5359 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5020 set_task_rq(p, task_cpu(p)); 5360 set_task_rq(p, task_cpu(p));
5021 if (!on_rq) 5361 if (!on_rq)
5022 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5362 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5023} 5363}
5364
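The rule being encoded here: a dequeued (!on_rq) entity's vruntime is still relative to its old cfs_rq's min_vruntime, so moving it across groups means subtracting the old base and adding the new one, and the two cases listed in the comment are exactly those where that re-basing has already happened. A tiny stand-alone sketch of the subtract/add step (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Re-base a sleeping entity's vruntime from one queue's min_vruntime to
 * another's, mirroring the subtract/add pair in task_move_group_fair(). */
static uint64_t rebase_vruntime(uint64_t vruntime,
				uint64_t old_min, uint64_t new_min)
{
	return vruntime - old_min + new_min;
}

int main(void)
{
	/* entity slept 5 us "ahead" of its old queue's minimum */
	uint64_t v = rebase_vruntime(100000 + 5000, 100000, 7000000);
	printf("%llu\n", (unsigned long long)v);	/* 7005000 */
	return 0;
}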
5365void free_fair_sched_group(struct task_group *tg)
5366{
5367 int i;
5368
5369 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5370
5371 for_each_possible_cpu(i) {
5372 if (tg->cfs_rq)
5373 kfree(tg->cfs_rq[i]);
5374 if (tg->se)
5375 kfree(tg->se[i]);
5376 }
5377
5378 kfree(tg->cfs_rq);
5379 kfree(tg->se);
5380}
5381
5382int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5383{
5384 struct cfs_rq *cfs_rq;
5385 struct sched_entity *se;
5386 int i;
5387
5388 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5389 if (!tg->cfs_rq)
5390 goto err;
5391 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5392 if (!tg->se)
5393 goto err;
5394
5395 tg->shares = NICE_0_LOAD;
5396
5397 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5398
5399 for_each_possible_cpu(i) {
5400 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5401 GFP_KERNEL, cpu_to_node(i));
5402 if (!cfs_rq)
5403 goto err;
5404
5405 se = kzalloc_node(sizeof(struct sched_entity),
5406 GFP_KERNEL, cpu_to_node(i));
5407 if (!se)
5408 goto err_free_rq;
5409
5410 init_cfs_rq(cfs_rq);
5411 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5412 }
5413
5414 return 1;
5415
5416err_free_rq:
5417 kfree(cfs_rq);
5418err:
5419 return 0;
5420}
5421
5422void unregister_fair_sched_group(struct task_group *tg, int cpu)
5423{
5424 struct rq *rq = cpu_rq(cpu);
5425 unsigned long flags;
5426
5427 /*
5428 * Only empty task groups can be destroyed; so we can speculatively
5429 * check on_list without danger of it being re-added.
5430 */
5431 if (!tg->cfs_rq[cpu]->on_list)
5432 return;
5433
5434 raw_spin_lock_irqsave(&rq->lock, flags);
5435 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5436 raw_spin_unlock_irqrestore(&rq->lock, flags);
5437}
5438
5439void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5440 struct sched_entity *se, int cpu,
5441 struct sched_entity *parent)
5442{
5443 struct rq *rq = cpu_rq(cpu);
5444
5445 cfs_rq->tg = tg;
5446 cfs_rq->rq = rq;
5447#ifdef CONFIG_SMP
5448 /* allow initial update_cfs_load() to truncate */
5449 cfs_rq->load_stamp = 1;
5024#endif 5450#endif
5451 init_cfs_rq_runtime(cfs_rq);
5452
5453 tg->cfs_rq[cpu] = cfs_rq;
5454 tg->se[cpu] = se;
5455
5456 /* se could be NULL for root_task_group */
5457 if (!se)
5458 return;
5459
5460 if (!parent)
5461 se->cfs_rq = &rq->cfs;
5462 else
5463 se->cfs_rq = parent->my_q;
5464
5465 se->my_q = cfs_rq;
5466 update_load_set(&se->load, 0);
5467 se->parent = parent;
5468}
5469
5470static DEFINE_MUTEX(shares_mutex);
5471
5472int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5473{
5474 int i;
5475 unsigned long flags;
5476
5477 /*
5478 * We can't change the weight of the root cgroup.
5479 */
5480 if (!tg->se[0])
5481 return -EINVAL;
5482
5483 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5484
5485 mutex_lock(&shares_mutex);
5486 if (tg->shares == shares)
5487 goto done;
5488
5489 tg->shares = shares;
5490 for_each_possible_cpu(i) {
5491 struct rq *rq = cpu_rq(i);
5492 struct sched_entity *se;
5493
5494 se = tg->se[i];
5495 /* Propagate contribution to hierarchy */
5496 raw_spin_lock_irqsave(&rq->lock, flags);
5497 for_each_sched_entity(se)
5498 update_cfs_shares(group_cfs_rq(se));
5499 raw_spin_unlock_irqrestore(&rq->lock, flags);
5500 }
5501
5502done:
5503 mutex_unlock(&shares_mutex);
5504 return 0;
5505}
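For context (not part of the patch): sched_group_set_shares() is what ultimately services the cpu cgroup's cpu.shares file (typically under /sys/fs/cgroup/cpu/<group>/), so writing e.g. 2048 requests twice the weight of the default 1024; the value is clamped to the MIN_SHARES..MAX_SHARES range defined in sched.h, i.e. 2..262144, before being propagated down the group's per-cpu entities.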
5506#else /* CONFIG_FAIR_GROUP_SCHED */
5507
5508void free_fair_sched_group(struct task_group *tg) { }
5509
5510int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5511{
5512 return 1;
5513}
5514
5515void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5516
5517#endif /* CONFIG_FAIR_GROUP_SCHED */
5518
5025 5519
5026static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5520static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5027{ 5521{
@@ -5041,7 +5535,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5041/* 5535/*
5042 * All the scheduling class methods: 5536 * All the scheduling class methods:
5043 */ 5537 */
5044static const struct sched_class fair_sched_class = { 5538const struct sched_class fair_sched_class = {
5045 .next = &idle_sched_class, 5539 .next = &idle_sched_class,
5046 .enqueue_task = enqueue_task_fair, 5540 .enqueue_task = enqueue_task_fair,
5047 .dequeue_task = dequeue_task_fair, 5541 .dequeue_task = dequeue_task_fair,
@@ -5078,7 +5572,7 @@ static const struct sched_class fair_sched_class = {
5078}; 5572};
5079 5573
5080#ifdef CONFIG_SCHED_DEBUG 5574#ifdef CONFIG_SCHED_DEBUG
5081static void print_cfs_stats(struct seq_file *m, int cpu) 5575void print_cfs_stats(struct seq_file *m, int cpu)
5082{ 5576{
5083 struct cfs_rq *cfs_rq; 5577 struct cfs_rq *cfs_rq;
5084 5578
@@ -5088,3 +5582,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
5088 rcu_read_unlock(); 5582 rcu_read_unlock();
5089} 5583}
5090#endif 5584#endif
5585
5586__init void init_sched_fair_class(void)
5587{
5588#ifdef CONFIG_SMP
5589 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5590
5591#ifdef CONFIG_NO_HZ
5592 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5593#endif
5594#endif /* SMP */
5595
5596}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 84802245abd2..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
17 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */ 19 */
20SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
21 21
22/* 22/*
23 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since it's likely going to consume data we 24 * wakeup-preemption), since it's likely going to consume data we
25 * touched, increases cache locality. 25 * touched, increases cache locality.
26 */ 26 */
27SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
28 28
29/* 29/*
30 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
31 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
32 * cache locality. 32 * cache locality.
33 */ 33 */
34SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
35 35
36/* 36/*
37 * Consider buddies to be cache hot, decreases the likelihood of a 37 * Consider buddies to be cache hot, decreases the likelihood of a
38 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
39 */ 39 */
40SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 41
42/* 42/*
43 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
44 */ 44 */
45SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
46 46
47SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
49SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
50 50
51/* 51/*
52 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
53 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
54 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
55 */ 55 */
56SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
57 57
58/* 58/*
59 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
60 */ 60 */
61SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
62 62
63/* 63/*
64 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
65 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
66 */ 66 */
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 583a1368afe6..3640ebbb466b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
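sched_rt_period_timer() above uses the standard hrtimer pattern: hrtimer_forward() advances the expiry by whole periods and returns how many periods were skipped, and the loop keeps replenishing RT runtime until the timer has been pushed into the future. A userspace analogue of that catch-up loop (simplified, no hrtimers involved):

#include <stdio.h>
#include <stdint.h>

/* Advance 'expiry' by whole 'period's until it is past 'now'; return how
 * many periods were skipped -- the same contract as hrtimer_forward(). */
static int forward(uint64_t *expiry, uint64_t now, uint64_t period)
{
	int overrun = 0;

	while (*expiry <= now) {
		*expiry += period;
		overrun++;
	}
	return overrun;
}

int main(void)
{
	uint64_t expiry = 100, now = 350, period = 100;
	int overrun;

	while ((overrun = forward(&expiry, now, period)) != 0)
		printf("replenish for %d period(s)\n", overrun);   /* 3 */
	printf("next expiry at %llu\n", (unsigned long long)expiry); /* 400 */
	return 0;
}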
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
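The extra bit set at MAX_RT_PRIO in init_rt_rq() acts as a sentinel: a find-first-bit scan over the bitmap is then guaranteed to stop at MAX_RT_PRIO when every priority queue is empty, so callers never need a separate "is anything queued?" check. A toy version of that idea (8 priority levels in one word, my own illustration):

#include <stdio.h>

#define TOY_MAX_PRIO 8

int main(void)
{
	/* bit i set == a task is queued at priority i; bit TOY_MAX_PRIO is
	 * the permanent delimiter, as in init_rt_rq(). */
	unsigned int bitmap = 1u << TOY_MAX_PRIO;

	bitmap |= 1u << 3;			/* queue a task at prio 3 */
	printf("%d\n", __builtin_ctz(bitmap));	/* 3: best (lowest-numbered) prio */

	bitmap &= ~(1u << 3);			/* last task dequeued */
	printf("%d\n", __builtin_ctz(bitmap));	/* 8 == TOY_MAX_PRIO: "empty" */
	return 0;
}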
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
@@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
648 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
649 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
650 848
651 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
652 return 0; 850 return 0;
653 851
654 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
957} 1155}
958 1156
959/* 1157/*
960 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
961 * followed by enqueue. 1159 * dequeue followed by enqueue.
962 */ 1160 */
963static void 1161static void
964requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1002 1200
1003 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1004 1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1005 /* For anything but wake ups, just return the task_cpu */ 1206 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out; 1208 goto out;
@@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1178/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1180 1381
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{ 1383{
1185 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
@@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1653 pull_rt_task(rq); 1852 pull_rt_task(rq);
1654} 1853}
1655 1854
1656static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1657{ 1856{
1658 unsigned int i; 1857 unsigned int i;
1659 1858
1660 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1663} 1863}
1664#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1665 1865
@@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1800 return 0; 2000 return 0;
1801} 2001}
1802 2002
1803static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1804 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1805 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1806 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1835#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837 2037
1838static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1839{ 2039{
1840 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1841 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
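Worked example (MAX_RT_PRIO is 100 here): NICE_TO_PRIO(-20) = 100, NICE_TO_PRIO(0) = 120 and NICE_TO_PRIO(19) = 139, so the static priorities of normal tasks occupy 100..139, directly above the 0..99 real-time range, and PRIO_TO_NICE() simply inverts the mapping.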
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
50static inline int rt_policy(int policy)
51{
52 if (policy == SCHED_FIFO || policy == SCHED_RR)
53 return 1;
54 return 0;
55}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetic problems.
147 * The weight of a cfs_rq is the sum of the weights of the entities
148 * queued on it, so neither the weight of an entity nor the shares
149 * value of a task group should be too large.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in the system belongs to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: in places that want to lock multiple runqueues
357 * (such as the load balancing or the thread migration code), the lock
358 * acquire operations must be ordered by ascending runqueue address.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
548 * Return the group to which this task belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
552 * task it moves into the cgroup. Therefore by holding either of those locks,
553 * we pin the task to the current cgroup.
554 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
602 * successfully executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
658
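The SCHED_FEAT() machinery above is an include-twice (X-macro) trick: features.h is expanded once to build the __SCHED_FEAT_* enum and, under SCHED_DEBUG with jump labels, again to stamp out one static_branch_NAME() helper per feature; otherwise sched_feat(x) degrades to a plain bitmask test against sysctl_sched_features. A minimal standalone rendition of the bitmask variant (the feature list and TOY_* names are invented for the example, and the list is passed as a macro parameter rather than re-included):

#include <stdio.h>

#define TOY_FEATURES(F)		\
	F(GENTLE_SLEEPERS, 1)	\
	F(NEXT_BUDDY, 0)	\
	F(HRTICK, 0)

/* Pass 1: build the per-feature bit indices. */
#define TOY_ENUM(name, enabled)	__TOY_FEAT_##name,
enum { TOY_FEATURES(TOY_ENUM) __TOY_FEAT_NR };
#undef TOY_ENUM

/* Pass 2: build the default feature bitmask. */
#define TOY_DEFAULT(name, enabled)	((enabled) << __TOY_FEAT_##name) |
static unsigned int toy_features = TOY_FEATURES(TOY_DEFAULT) 0;
#undef TOY_DEFAULT

#define toy_feat(x)	(toy_features & (1u << __TOY_FEAT_##x))

int main(void)
{
	printf("GENTLE_SLEEPERS: %d\n", !!toy_feat(GENTLE_SLEEPERS));	/* 1 */
	printf("NEXT_BUDDY:      %d\n", !!toy_feat(NEXT_BUDDY));	/* 0 */
	return 0;
}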
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
671
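With the default sysctls (sched_rt_period_us = 1000000, sched_rt_runtime_us = 950000) these helpers yield a 1 s period and 950 ms of runtime, i.e. real-time tasks may consume at most 95% of every second, leaving the remainder for normal tasks; setting sched_rt_runtime_us to -1 makes global_rt_runtime() return RUNTIME_INF and disables the throttling.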
672
673
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To aid in avoiding the subversion of "niceness" due to uneven distribution
790 * of tasks with abnormal "nice" values across CPUs the contribution that
791 * each task makes to its run queue's load is weighted according to its
792 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
793 * scaled version of the new time slice allocation that they receive on time
794 * slice expiry etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetics by turning divisions
828 * into multiplications:
829 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
840
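Worked example of the "10% effect" these tables encode: two CPU-bound tasks at nice 0 and nice 5 have weights 1024 and 335, so the nice-0 task receives 1024/(1024+335), roughly 75% of the CPU, and the nice-5 task roughly 25%; each of the five nice steps in between is close to a 1.25x weight ratio (1.25^5 is about 3.05, matching 1024/335). The wmult entries are 2^32/weight, so a division like delta_exec * weight / lw->weight can be computed as delta_exec * weight * inv_weight >> 32, trading the division for a multiplication and a shift.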
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
846 CPUACCT_STAT_NSTATS,
847};
848
849
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055 /* printk() doesn't work well under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
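For orientation, the two helpers above are meant to bracket any cross-runqueue work done while pulling tasks. The fragment below is an editorial sketch of a caller, not code from this patch; try_pull_one_task() and its body are invented, and the surrounding sched.h context is assumed:

    /* Hypothetical caller: this_rq->lock already held, IRQs disabled. */
    static int try_pull_one_task(struct rq *this_rq, struct rq *busiest)
    {
        int moved = 0;

        /* May drop and retake this_rq->lock to respect the lock ordering. */
        double_lock_balance(this_rq, busiest);

        /* ... pick a task on busiest and migrate it to this_rq ... */

        double_unlock_balance(this_rq, busiest);
        return moved;
    }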
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock,
1074 * you need to do so manually before calling.
1075 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock,
1099 * you need to do so manually after calling.
1100 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock,
1118 * you need to do so manually before calling.
1119 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock,
1134 * you need to do so manually after calling.
1135 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
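For context, the code added above is what backs /proc/schedstat. The reader below is an editorial sketch in user space; it assumes nothing beyond the "version"/"timestamp" header and per-cpu lines printed by show_schedstat():

    /* Minimal /proc/schedstat dump (user space). */
    #include <stdio.h>

    int main(void)
    {
        char line[512];
        FILE *f = fopen("/proc/schedstat", "r");

        if (!f) {
            perror("/proc/schedstat");
            return 1;
        }
        /* First two lines are "version <N>" and "timestamp <jiffies>",
         * followed by one "cpuN ..." line (plus "domainN ..." lines on SMP)
         * per online CPU. */
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }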
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 87f9e36ea56e..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
283 return; 180 return;
284 181
285 raw_spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime);
288 raw_spin_unlock(&cputimer->lock); 184 raw_spin_unlock(&cputimer->lock);
289} 185}
290 186
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
307 return; 203 return;
308 204
309 raw_spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime);
312 raw_spin_unlock(&cputimer->lock); 207 raw_spin_unlock(&cputimer->lock);
313} 208}
314 209
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb3..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13b631d..e8d76c5895ea 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,6 +6,7 @@
6 * This defines a simple but solid secure-computing mode. 6 * This defines a simple but solid secure-computing mode.
7 */ 7 */
8 8
9#include <linux/audit.h>
9#include <linux/seccomp.h> 10#include <linux/seccomp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/compat.h> 12#include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
54#ifdef SECCOMP_DEBUG 55#ifdef SECCOMP_DEBUG
55 dump_stack(); 56 dump_stack();
56#endif 57#endif
58 audit_seccomp(this_syscall);
57 do_exit(SIGKILL); 59 do_exit(SIGKILL);
58} 60}
59 61
diff --git a/kernel/signal.c b/kernel/signal.c
index b3f78d09a105..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h>
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h> 33#include <trace/events/signal.h>
33 34
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1020} 1021}
1021 1022
1023/*
1024 * map the uid in struct cred into user namespace *ns
1025 */
1026static inline uid_t map_cred_ns(const struct cred *cred,
1027 struct user_namespace *ns)
1028{
1029 return user_ns_map_uid(ns, cred, cred->uid);
1030}
1031
1032#ifdef CONFIG_USER_NS
1033static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1034{
1035 if (current_user_ns() == task_cred_xxx(t, user_ns))
1036 return;
1037
1038 if (SI_FROMKERNEL(info))
1039 return;
1040
1041 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
1042 current_cred(), info->si_uid);
1043}
1044#else
1045static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1046{
1047 return;
1048}
1049#endif
1050
1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1051static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1023 int group, int from_ancestor_ns) 1052 int group, int from_ancestor_ns)
1024{ 1053{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1088 q->info.si_pid = 0; 1117 q->info.si_pid = 0;
1089 break; 1118 break;
1090 } 1119 }
1120
1121 userns_fixup_signal_uid(&q->info, t);
1122
1091 } else if (!is_si_special(info)) { 1123 } else if (!is_si_special(info)) {
1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1124 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1093 /* 1125 /*
@@ -1626,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1626 */ 1658 */
1627 rcu_read_lock(); 1659 rcu_read_lock();
1628 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1660 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1629 info.si_uid = __task_cred(tsk)->uid; 1661 info.si_uid = map_cred_ns(__task_cred(tsk),
1662 task_cred_xxx(tsk->parent, user_ns));
1630 rcu_read_unlock(); 1663 rcu_read_unlock();
1631 1664
1632 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1665 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1633 tsk->signal->utime)); 1666 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1634 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1635 tsk->signal->stime));
1636 1667
1637 info.si_status = tsk->exit_code & 0x7f; 1668 info.si_status = tsk->exit_code & 0x7f;
1638 if (tsk->exit_code & 0x80) 1669 if (tsk->exit_code & 0x80)
@@ -1711,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1711 */ 1742 */
1712 rcu_read_lock(); 1743 rcu_read_lock();
1713 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1744 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1714 info.si_uid = __task_cred(tsk)->uid; 1745 info.si_uid = map_cred_ns(__task_cred(tsk),
1746 task_cred_xxx(parent, user_ns));
1715 rcu_read_unlock(); 1747 rcu_read_unlock();
1716 1748
1717 info.si_utime = cputime_to_clock_t(tsk->utime); 1749 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1994,8 +2026,6 @@ static bool do_signal_stop(int signr)
1994 */ 2026 */
1995 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 2027 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1996 sig->group_exit_code = signr; 2028 sig->group_exit_code = signr;
1997 else
1998 WARN_ON_ONCE(!current->ptrace);
1999 2029
2000 sig->group_stop_count = 0; 2030 sig->group_stop_count = 0;
2001 2031
@@ -2129,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
2129 info->si_signo = signr; 2159 info->si_signo = signr;
2130 info->si_errno = 0; 2160 info->si_errno = 0;
2131 info->si_code = SI_USER; 2161 info->si_code = SI_USER;
2162 rcu_read_lock();
2132 info->si_pid = task_pid_vnr(current->parent); 2163 info->si_pid = task_pid_vnr(current->parent);
2133 info->si_uid = task_uid(current->parent); 2164 info->si_uid = map_cred_ns(__task_cred(current->parent),
2165 current_user_ns());
2166 rcu_read_unlock();
2134 } 2167 }
2135 2168
2136 /* If the (new) signal is now blocked, requeue it. */ 2169 /* If the (new) signal is now blocked, requeue it. */
@@ -2322,6 +2355,27 @@ relock:
2322 return signr; 2355 return signr;
2323} 2356}
2324 2357
2358/**
2359 * block_sigmask - add @ka's signal mask to current->blocked
2360 * @ka: action for @signr
2361 * @signr: signal that has been successfully delivered
2362 *
2363 * This function should be called when a signal has successfully been
2364 * delivered. It adds the mask of signals for @ka to current->blocked
2365 * so that they are blocked during the execution of the signal
2366 * handler. In addition, @signr will be blocked unless %SA_NODEFER is
2367 * set in @ka->sa.sa_flags.
2368 */
2369void block_sigmask(struct k_sigaction *ka, int signr)
2370{
2371 sigset_t blocked;
2372
2373 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2374 if (!(ka->sa.sa_flags & SA_NODEFER))
2375 sigaddset(&blocked, signr);
2376 set_current_blocked(&blocked);
2377}
2378
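To make the intended call site concrete (an editorial sketch only, not taken from any architecture in this patch; setup_frame_for() is an invented stand-in for the arch frame-setup code): once the signal frame has been pushed, the delivery path calls block_sigmask() so the handler runs with @ka's mask applied, plus the delivered signal unless SA_NODEFER is set.

    /* Hypothetical arch delivery path, after the frame is written. */
    static void deliver_one_signal(int signr, struct k_sigaction *ka,
                                   siginfo_t *info, struct pt_regs *regs)
    {
        if (setup_frame_for(signr, ka, info, regs)) /* invented helper */
            return;                                 /* delivery failed */

        /* Block ka->sa.sa_mask (and signr unless SA_NODEFER) from now on. */
        block_sigmask(ka, signr);
    }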
2325/* 2379/*
2326 * It could be that complete_signal() picked us to notify about the 2380 * It could be that complete_signal() picked us to notify about the
2327 * group-wide signal. Other threads should be notified now to take 2381 * group-wide signal. Other threads should be notified now to take
@@ -2359,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
2359 int group_stop = 0; 2413 int group_stop = 0;
2360 sigset_t unblocked; 2414 sigset_t unblocked;
2361 2415
2416 /*
2417 * @tsk is about to have PF_EXITING set - lock out users which
2418 * expect stable threadgroup.
2419 */
2420 threadgroup_change_begin(tsk);
2421
2362 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2422 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2363 tsk->flags |= PF_EXITING; 2423 tsk->flags |= PF_EXITING;
2424 threadgroup_change_end(tsk);
2364 return; 2425 return;
2365 } 2426 }
2366 2427
@@ -2370,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
2370 * see wants_signal(), do_signal_stop(). 2431 * see wants_signal(), do_signal_stop().
2371 */ 2432 */
2372 tsk->flags |= PF_EXITING; 2433 tsk->flags |= PF_EXITING;
2434
2435 threadgroup_change_end(tsk);
2436
2373 if (!signal_pending(tsk)) 2437 if (!signal_pending(tsk))
2374 goto out; 2438 goto out;
2375 2439
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c71d91efff0..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611fbd079..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1605 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1606 1606
1607 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1608 utime = stime = cputime_zero; 1608 utime = stime = 0;
1609 1609
1610 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1611 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1635 1635
1636 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1637 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1638 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1639 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1640 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1641 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1642 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
1692 return mask; 1692 return mask;
1693} 1693}
1694 1694
1695#ifdef CONFIG_CHECKPOINT_RESTORE
1696static int prctl_set_mm(int opt, unsigned long addr,
1697 unsigned long arg4, unsigned long arg5)
1698{
1699 unsigned long rlim = rlimit(RLIMIT_DATA);
1700 unsigned long vm_req_flags;
1701 unsigned long vm_bad_flags;
1702 struct vm_area_struct *vma;
1703 int error = 0;
1704 struct mm_struct *mm = current->mm;
1705
1706 if (arg4 | arg5)
1707 return -EINVAL;
1708
1709 if (!capable(CAP_SYS_ADMIN))
1710 return -EPERM;
1711
1712 if (addr >= TASK_SIZE)
1713 return -EINVAL;
1714
1715 down_read(&mm->mmap_sem);
1716 vma = find_vma(mm, addr);
1717
1718 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1719 /* It must be existing VMA */
1720 if (!vma || vma->vm_start > addr)
1721 goto out;
1722 }
1723
1724 error = -EINVAL;
1725 switch (opt) {
1726 case PR_SET_MM_START_CODE:
1727 case PR_SET_MM_END_CODE:
1728 vm_req_flags = VM_READ | VM_EXEC;
1729 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1730
1731 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1732 (vma->vm_flags & vm_bad_flags))
1733 goto out;
1734
1735 if (opt == PR_SET_MM_START_CODE)
1736 mm->start_code = addr;
1737 else
1738 mm->end_code = addr;
1739 break;
1740
1741 case PR_SET_MM_START_DATA:
1742 case PR_SET_MM_END_DATA:
1743 vm_req_flags = VM_READ | VM_WRITE;
1744 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1745
1746 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1747 (vma->vm_flags & vm_bad_flags))
1748 goto out;
1749
1750 if (opt == PR_SET_MM_START_DATA)
1751 mm->start_data = addr;
1752 else
1753 mm->end_data = addr;
1754 break;
1755
1756 case PR_SET_MM_START_STACK:
1757
1758#ifdef CONFIG_STACK_GROWSUP
1759 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1760#else
1761 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1762#endif
1763 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1764 goto out;
1765
1766 mm->start_stack = addr;
1767 break;
1768
1769 case PR_SET_MM_START_BRK:
1770 if (addr <= mm->end_data)
1771 goto out;
1772
1773 if (rlim < RLIM_INFINITY &&
1774 (mm->brk - addr) +
1775 (mm->end_data - mm->start_data) > rlim)
1776 goto out;
1777
1778 mm->start_brk = addr;
1779 break;
1780
1781 case PR_SET_MM_BRK:
1782 if (addr <= mm->end_data)
1783 goto out;
1784
1785 if (rlim < RLIM_INFINITY &&
1786 (addr - mm->start_brk) +
1787 (mm->end_data - mm->start_data) > rlim)
1788 goto out;
1789
1790 mm->brk = addr;
1791 break;
1792
1793 default:
1794 error = -EINVAL;
1795 goto out;
1796 }
1797
1798 error = 0;
1799
1800out:
1801 up_read(&mm->mmap_sem);
1802
1803 return error;
1804}
1805#else /* CONFIG_CHECKPOINT_RESTORE */
1806static int prctl_set_mm(int opt, unsigned long addr,
1807 unsigned long arg4, unsigned long arg5)
1808{
1809 return -EINVAL;
1810}
1811#endif
1812
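From user space the new code above is reached through prctl(PR_SET_MM, ...). The example below is an editorial sketch: it assumes the PR_SET_MM* constants from the matching <linux/prctl.h> (the fallback values are an assumption and should be checked against the headers), and it will fail unless the caller has CAP_SYS_ADMIN and the kernel was built with CONFIG_CHECKPOINT_RESTORE, as enforced above.

    /* Hypothetical checkpoint/restore helper re-asserting the current brk. */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_MM
    #define PR_SET_MM      35   /* assumed values; verify against prctl.h */
    #define PR_SET_MM_BRK  7
    #endif

    int main(void)
    {
        unsigned long cur_brk = (unsigned long)sbrk(0);

        if (prctl(PR_SET_MM, PR_SET_MM_BRK, cur_brk, 0, 0)) {
            perror("prctl(PR_SET_MM, PR_SET_MM_BRK)");
            return 1;
        }
        return 0;
    }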
1695SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1813SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1696 unsigned long, arg4, unsigned long, arg5) 1814 unsigned long, arg4, unsigned long, arg5)
1697{ 1815{
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1841 else 1959 else
1842 error = PR_MCE_KILL_DEFAULT; 1960 error = PR_MCE_KILL_DEFAULT;
1843 break; 1961 break;
1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break;
1844 default: 1965 default:
1845 error = -EINVAL; 1966 error = -EINVAL;
1846 break; 1967 break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
803 .mode = 0644, 803 .mode = 0644,
804 .proc_handler = proc_dointvec, 804 .proc_handler = proc_dointvec,
805 }, 805 },
806#ifdef CONFIG_DEBUG_STACKOVERFLOW
807 {
808 .procname = "panic_on_stackoverflow",
809 .data = &sysctl_panic_on_stackoverflow,
810 .maxlen = sizeof(int),
811 .mode = 0644,
812 .proc_handler = proc_dointvec,
813 },
814#endif
806 { 815 {
807 .procname = "bootloader_type", 816 .procname = "bootloader_type",
808 .data = &bootloader_type, 817 .data = &bootloader_type,
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe92..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
25config GENERIC_CLOCKEVENTS_BUILD 25config GENERIC_CLOCKEVENTS_BUILD
26 bool 26 bool
27 default y 27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR 28 depends on GENERIC_CLOCKEVENTS
29 29
30config GENERIC_CLOCKEVENTS_MIN_ADJUST 30config GENERIC_CLOCKEVENTS_MIN_ADJUST
31 bool 31 bool
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6c..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
21 20
22#include "tick-internal.h" 21#include "tick-internal.h"
23 22
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d3ad022136e5..a45ca167ab24 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
26#include <linux/clocksource.h> 27#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -796,8 +796,8 @@ EXPORT_SYMBOL(clocksource_unregister);
796 * Provides sysfs interface for listing current clocksource. 796 * Provides sysfs interface for listing current clocksource.
797 */ 797 */
798static ssize_t 798static ssize_t
799sysfs_show_current_clocksources(struct sys_device *dev, 799sysfs_show_current_clocksources(struct device *dev,
800 struct sysdev_attribute *attr, char *buf) 800 struct device_attribute *attr, char *buf)
801{ 801{
802 ssize_t count = 0; 802 ssize_t count = 0;
803 803
@@ -818,8 +818,8 @@ sysfs_show_current_clocksources(struct sys_device *dev,
818 * Takes input from sysfs interface for manually overriding the default 818 * Takes input from sysfs interface for manually overriding the default
819 * clocksource selection. 819 * clocksource selection.
820 */ 820 */
821static ssize_t sysfs_override_clocksource(struct sys_device *dev, 821static ssize_t sysfs_override_clocksource(struct device *dev,
822 struct sysdev_attribute *attr, 822 struct device_attribute *attr,
823 const char *buf, size_t count) 823 const char *buf, size_t count)
824{ 824{
825 size_t ret = count; 825 size_t ret = count;
@@ -853,8 +853,8 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
853 * Provides sysfs interface for listing registered clocksources 853 * Provides sysfs interface for listing registered clocksources
854 */ 854 */
855static ssize_t 855static ssize_t
856sysfs_show_available_clocksources(struct sys_device *dev, 856sysfs_show_available_clocksources(struct device *dev,
857 struct sysdev_attribute *attr, 857 struct device_attribute *attr,
858 char *buf) 858 char *buf)
859{ 859{
860 struct clocksource *src; 860 struct clocksource *src;
@@ -883,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
883/* 883/*
884 * Sysfs setup bits: 884 * Sysfs setup bits:
885 */ 885 */
886static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 887 sysfs_override_clocksource);
888 888
889static SYSDEV_ATTR(available_clocksource, 0444, 889static DEVICE_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 890 sysfs_show_available_clocksources, NULL);
891 891
892static struct sysdev_class clocksource_sysclass = { 892static struct bus_type clocksource_subsys = {
893 .name = "clocksource", 893 .name = "clocksource",
894 .dev_name = "clocksource",
894}; 895};
895 896
896static struct sys_device device_clocksource = { 897static struct device device_clocksource = {
897 .id = 0, 898 .id = 0,
898 .cls = &clocksource_sysclass, 899 .bus = &clocksource_subsys,
899}; 900};
900 901
901static int __init init_clocksource_sysfs(void) 902static int __init init_clocksource_sysfs(void)
902{ 903{
903 int error = sysdev_class_register(&clocksource_sysclass); 904 int error = subsys_system_register(&clocksource_subsys, NULL);
904 905
905 if (!error) 906 if (!error)
906 error = sysdev_register(&device_clocksource); 907 error = device_register(&device_clocksource);
907 if (!error) 908 if (!error)
908 error = sysdev_create_file( 909 error = device_create_file(
909 &device_clocksource, 910 &device_clocksource,
910 &attr_current_clocksource); 911 &dev_attr_current_clocksource);
911 if (!error) 912 if (!error)
912 error = sysdev_create_file( 913 error = device_create_file(
913 &device_clocksource, 914 &device_clocksource,
914 &attr_available_clocksource); 915 &dev_attr_available_clocksource);
915 return error; 916 return error;
916} 917}
917 918
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0ba..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
275} 275}
276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
277 277
278/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
279 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
280 *
281 * When the next event is more than a tick into the future, stop the idle tick
282 * Called either from the idle loop or from irq_exit() when an idle period was
283 * just interrupted by an interrupt which did not cause a reschedule.
284 */
285void tick_nohz_stop_sched_tick(int inidle)
286{ 279{
287 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
288 struct tick_sched *ts;
289 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
290 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
291 u64 time_delta; 283 u64 time_delta;
292 int cpu; 284 int cpu;
293 285
294 local_irq_save(flags);
295
296 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
297 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
298 288
299 /*
300 * Call to tick_nohz_start_idle stops the last_update_time from being
301 * updated. Thus, it must not be called in the event we are called from
302 * irq_exit() with the prior state different than idle.
303 */
304 if (!inidle && !ts->inidle)
305 goto end;
306
307 /*
308 * Set ts->inidle unconditionally. Even if the system did not
309 * switch to NOHZ mode the cpu frequency governers rely on the
310 * update of the idle time accounting in tick_nohz_start_idle().
311 */
312 ts->inidle = 1;
313
314 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
315 290
316 /* 291 /*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
326 } 301 }
327 302
328 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
329 goto end; 304 return;
330 305
331 if (need_resched()) 306 if (need_resched())
332 goto end; 307 return;
333 308
334 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
335 static int ratelimit; 310 static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
339 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
340 ratelimit++; 315 ratelimit++;
341 } 316 }
342 goto end; 317 return;
343 } 318 }
344 319
345 ts->idle_calls++; 320 ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
434 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
435 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
436 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
437 rcu_enter_nohz();
438 } 412 }
439 413
440 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
472 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
473 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
474 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
475end: 449}
476 local_irq_restore(flags); 450
451/**
452 * tick_nohz_idle_enter - stop the idle tick from the idle task
453 *
454 * When the next event is more than a tick into the future, stop the idle tick
455 * Called when we start the idle loop.
456 *
457 * The arch is responsible for calling:
458 *
459 * - rcu_idle_enter() after its last use of RCU before the CPU is put
460 * to sleep.
461 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
462 */
463void tick_nohz_idle_enter(void)
464{
465 struct tick_sched *ts;
466
467 WARN_ON_ONCE(irqs_disabled());
468
469 /*
470 * Update the idle state in the scheduler domain hierarchy
471 * when tick_nohz_stop_sched_tick() is called from the idle loop.
472 * State will be updated to busy during the first busy tick after
473 * exiting idle.
474 */
475 set_cpu_sd_state_idle();
476
477 local_irq_disable();
478
479 ts = &__get_cpu_var(tick_cpu_sched);
480 /*
481 * set ts->inidle unconditionally. even if the system did not
482 * switch to nohz mode the cpu frequency governors rely on the
483 * update of the idle time accounting in tick_nohz_start_idle().
484 */
485 ts->inidle = 1;
486 tick_nohz_stop_sched_tick(ts);
487
488 local_irq_enable();
489}
490
491/**
492 * tick_nohz_irq_exit - update next tick event from interrupt exit
493 *
494 * When an interrupt fires while we are idle and it doesn't cause
495 * a reschedule, it may still add, modify or delete a timer, enqueue
496 * an RCU callback, etc...
497 * So we need to re-calculate and reprogram the next tick event.
498 */
499void tick_nohz_irq_exit(void)
500{
501 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
502
503 if (!ts->inidle)
504 return;
505
506 tick_nohz_stop_sched_tick(ts);
477} 507}
478 508
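To make the calling convention from the comment above concrete, here is an editorial sketch of an architecture idle loop under the new API; cpu_sleep() is an invented stand-in for the arch-specific sleep instruction, and the surrounding kernel context is assumed:

    /* Hypothetical arch idle loop using tick_nohz_idle_enter()/_exit(). */
    static void arch_cpu_idle_loop(void)
    {
        while (1) {
            tick_nohz_idle_enter();   /* may stop the periodic tick */

            while (!need_resched()) {
                rcu_idle_enter();     /* last RCU use before sleeping */
                cpu_sleep();          /* invented: arch-specific idle */
                rcu_idle_exit();      /* before the first RCU use after wake */
            }

            tick_nohz_idle_exit();    /* restart the tick */
            schedule();
        }
    }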
479/** 509/**
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
515} 545}
516 546
517/** 547/**
518 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 548 * tick_nohz_idle_exit - restart the idle tick from the idle task
519 * 549 *
520 * Restart the idle tick when the CPU is woken up from idle 550 * Restart the idle tick when the CPU is woken up from idle
551 * This also exits the RCU extended quiescent state. The CPU
552 * can use RCU again after this function is called.
521 */ 553 */
522void tick_nohz_restart_sched_tick(void) 554void tick_nohz_idle_exit(void)
523{ 555{
524 int cpu = smp_processor_id(); 556 int cpu = smp_processor_id();
525 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 557 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
529 ktime_t now; 561 ktime_t now;
530 562
531 local_irq_disable(); 563 local_irq_disable();
564
532 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 565 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
533 now = ktime_get(); 566 now = ktime_get();
534 567
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
543 576
544 ts->inidle = 0; 577 ts->inidle = 0;
545 578
546 rcu_exit_nohz();
547
548 /* Update jiffies first */ 579 /* Update jiffies first */
549 select_nohz_load_balancer(0); 580 select_nohz_load_balancer(0);
550 tick_do_update_jiffies64(now); 581 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 237841378c03..0c6358186401 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
131 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
133 133
134 /* return delta convert to nanoseconds using ntp adjusted mult. */ 134 /* return delta convert to nanoseconds. */
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 136}
137 137
@@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset)
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
814 * 814 *
815 * Note we subtract one in the shift, so that error is really error*2. 815 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) intererval twice, but keeps the 816 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparision as still measuring if error is 817 * (error > interval) comparison as still measuring if error is
818 * larger than half an interval. 818 * larger than half an interval.
819 * 819 *
820 * Note: It does not "save" on aggrivation when reading the code. 820 * Note: It does not "save" on aggravation when reading the code.
821 */ 821 */
822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
823 if (error > interval) { 823 if (error > interval) {
@@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset)
833 * nanosecond, and store the amount rounded up into 833 * nanosecond, and store the amount rounded up into
834 * the error. This causes the likely below to be unlikely. 834 * the error. This causes the likely below to be unlikely.
835 * 835 *
836 * The properfix is to avoid rounding up by using 836 * The proper fix is to avoid rounding up by using
837 * the high precision timekeeper.xtime_nsec instead of 837 * the high precision timekeeper.xtime_nsec instead of
838 * xtime.tv_nsec everywhere. Fixing this will take some 838 * xtime.tv_nsec everywhere. Fixing this will take some
839 * time. 839 * time.
diff --git a/kernel/timer.c b/kernel/timer.c
index 9c3c62b0c4bc..a297ffcf888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 } 427 }
428} 428}
429 429
430/* Stub timer callback for improperly used timers. */
431static void stub_timer(unsigned long data)
432{
433 WARN_ON(1);
434}
435
430/* 436/*
431 * fixup_activate is called when: 437 * fixup_activate is called when:
432 * - an active object is activated 438 * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
450 debug_object_activate(timer, &timer_debug_descr); 456 debug_object_activate(timer, &timer_debug_descr);
451 return 0; 457 return 0;
452 } else { 458 } else {
453 WARN_ON_ONCE(1); 459 setup_timer(timer, stub_timer, 0);
460 return 1;
454 } 461 }
455 return 0; 462 return 0;
456 463
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 } 487 }
481} 488}
482 489
490/*
491 * fixup_assert_init is called when:
492 * - an untracked/uninit-ed object is found
493 */
494static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
495{
496 struct timer_list *timer = addr;
497
498 switch (state) {
499 case ODEBUG_STATE_NOTAVAILABLE:
500 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
501 /*
502 * This is not really a fixup. The timer was
503 * statically initialized. We just make sure that it
504 * is tracked in the object tracker.
505 */
506 debug_object_init(timer, &timer_debug_descr);
507 return 0;
508 } else {
509 setup_timer(timer, stub_timer, 0);
510 return 1;
511 }
512 default:
513 return 0;
514 }
515}
516
483static struct debug_obj_descr timer_debug_descr = { 517static struct debug_obj_descr timer_debug_descr = {
484 .name = "timer_list", 518 .name = "timer_list",
485 .debug_hint = timer_debug_hint, 519 .debug_hint = timer_debug_hint,
486 .fixup_init = timer_fixup_init, 520 .fixup_init = timer_fixup_init,
487 .fixup_activate = timer_fixup_activate, 521 .fixup_activate = timer_fixup_activate,
488 .fixup_free = timer_fixup_free, 522 .fixup_free = timer_fixup_free,
523 .fixup_assert_init = timer_fixup_assert_init,
489}; 524};
490 525
491static inline void debug_timer_init(struct timer_list *timer) 526static inline void debug_timer_init(struct timer_list *timer)
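The assert_init fixup added above targets timers that are operated on before ever being initialized. The fragment below is an editorial sketch of the bug pattern it catches, not code to copy; with CONFIG_DEBUG_OBJECTS_TIMERS the object debugging code reports the access and the fixup installs stub_timer() so later misuse stays harmless:

    /* Buggy pattern that timer_fixup_assert_init() now catches. */
    static void broken_shutdown(void)
    {
        struct timer_list t;      /* uninitialized stack memory */

        del_timer(&t);            /* now reported via debug_assert_init() */
    }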
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
508 debug_object_free(timer, &timer_debug_descr); 543 debug_object_free(timer, &timer_debug_descr);
509} 544}
510 545
546static inline void debug_timer_assert_init(struct timer_list *timer)
547{
548 debug_object_assert_init(timer, &timer_debug_descr);
549}
550
511static void __init_timer(struct timer_list *timer, 551static void __init_timer(struct timer_list *timer,
512 const char *name, 552 const char *name,
513 struct lock_class_key *key); 553 struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
531static inline void debug_timer_init(struct timer_list *timer) { } 571static inline void debug_timer_init(struct timer_list *timer) { }
532static inline void debug_timer_activate(struct timer_list *timer) { } 572static inline void debug_timer_activate(struct timer_list *timer) { }
533static inline void debug_timer_deactivate(struct timer_list *timer) { } 573static inline void debug_timer_deactivate(struct timer_list *timer) { }
574static inline void debug_timer_assert_init(struct timer_list *timer) { }
534#endif 575#endif
535 576
536static inline void debug_init(struct timer_list *timer) 577static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
552 trace_timer_cancel(timer); 593 trace_timer_cancel(timer);
553} 594}
554 595
596static inline void debug_assert_init(struct timer_list *timer)
597{
598 debug_timer_assert_init(timer);
599}
600
555static void __init_timer(struct timer_list *timer, 601static void __init_timer(struct timer_list *timer,
556 const char *name, 602 const char *name,
557 struct lock_class_key *key) 603 struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
902 unsigned long flags; 948 unsigned long flags;
903 int ret = 0; 949 int ret = 0;
904 950
951 debug_assert_init(timer);
952
905 timer_stats_timer_clear_start_info(timer); 953 timer_stats_timer_clear_start_info(timer);
906 if (timer_pending(timer)) { 954 if (timer_pending(timer)) {
907 base = lock_timer_base(timer, &flags); 955 base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
932 unsigned long flags; 980 unsigned long flags;
933 int ret = -1; 981 int ret = -1;
934 982
983 debug_assert_init(timer);
984
935 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
936 986
937 if (base->running_timer == timer) 987 if (base->running_timer == timer)
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806f..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
402 402
403static struct dentry *blk_create_buf_file_callback(const char *filename, 403static struct dentry *blk_create_buf_file_callback(const char *filename,
404 struct dentry *parent, 404 struct dentry *parent,
405 int mode, 405 umode_t mode,
406 struct rchan_buf *buf, 406 struct rchan_buf *buf,
407 int *is_global) 407 int *is_global)
408{ 408{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1e8943fed1d..683d559a0eef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,11 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/bsearch.h>
25#include <linux/module.h> 26#include <linux/module.h>
26#include <linux/ftrace.h> 27#include <linux/ftrace.h>
27#include <linux/sysctl.h> 28#include <linux/sysctl.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/sort.h>
30#include <linux/list.h> 32#include <linux/list.h>
31#include <linux/hash.h> 33#include <linux/hash.h>
32#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
@@ -947,13 +949,6 @@ struct ftrace_func_probe {
947 struct rcu_head rcu; 949 struct rcu_head rcu;
948}; 950};
949 951
950enum {
951 FTRACE_ENABLE_CALLS = (1 << 0),
952 FTRACE_DISABLE_CALLS = (1 << 1),
953 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
954 FTRACE_START_FUNC_RET = (1 << 3),
955 FTRACE_STOP_FUNC_RET = (1 << 4),
956};
957struct ftrace_func_entry { 952struct ftrace_func_entry {
958 struct hlist_node hlist; 953 struct hlist_node hlist;
959 unsigned long ip; 954 unsigned long ip;
@@ -984,18 +979,19 @@ static struct ftrace_ops global_ops = {
984 .filter_hash = EMPTY_HASH, 979 .filter_hash = EMPTY_HASH,
985}; 980};
986 981
987static struct dyn_ftrace *ftrace_new_addrs;
988
989static DEFINE_MUTEX(ftrace_regex_lock); 982static DEFINE_MUTEX(ftrace_regex_lock);
990 983
991struct ftrace_page { 984struct ftrace_page {
992 struct ftrace_page *next; 985 struct ftrace_page *next;
986 struct dyn_ftrace *records;
993 int index; 987 int index;
994 struct dyn_ftrace records[]; 988 int size;
995}; 989};
996 990
997#define ENTRIES_PER_PAGE \ 991static struct ftrace_page *ftrace_new_pgs;
998 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) 992
993#define ENTRY_SIZE sizeof(struct dyn_ftrace)
994#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
999 995
1000/* estimate from running different kernels */ 996/* estimate from running different kernels */
1001#define NR_TO_INIT 10000 997#define NR_TO_INIT 10000
@@ -1003,7 +999,10 @@ struct ftrace_page {
1003static struct ftrace_page *ftrace_pages_start; 999static struct ftrace_page *ftrace_pages_start;
1004static struct ftrace_page *ftrace_pages; 1000static struct ftrace_page *ftrace_pages;
1005 1001
1006static struct dyn_ftrace *ftrace_free_records; 1002static bool ftrace_hash_empty(struct ftrace_hash *hash)
1003{
1004 return !hash || !hash->count;
1005}
1007 1006
1008static struct ftrace_func_entry * 1007static struct ftrace_func_entry *
1009ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1008ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1013,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1013 struct hlist_head *hhd; 1012 struct hlist_head *hhd;
1014 struct hlist_node *n; 1013 struct hlist_node *n;
1015 1014
1016 if (!hash->count) 1015 if (ftrace_hash_empty(hash))
1017 return NULL; 1016 return NULL;
1018 1017
1019 if (hash->size_bits > 0) 1018 if (hash->size_bits > 0)
@@ -1157,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1157 return NULL; 1156 return NULL;
1158 1157
1159 /* Empty hash? */ 1158 /* Empty hash? */
1160 if (!hash || !hash->count) 1159 if (ftrace_hash_empty(hash))
1161 return new_hash; 1160 return new_hash;
1162 1161
1163 size = 1 << hash->size_bits; 1162 size = 1 << hash->size_bits;
@@ -1282,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1282 filter_hash = rcu_dereference_raw(ops->filter_hash); 1281 filter_hash = rcu_dereference_raw(ops->filter_hash);
1283 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1282 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1284 1283
1285 if ((!filter_hash || !filter_hash->count || 1284 if ((ftrace_hash_empty(filter_hash) ||
1286 ftrace_lookup_ip(filter_hash, ip)) && 1285 ftrace_lookup_ip(filter_hash, ip)) &&
1287 (!notrace_hash || !notrace_hash->count || 1286 (ftrace_hash_empty(notrace_hash) ||
1288 !ftrace_lookup_ip(notrace_hash, ip))) 1287 !ftrace_lookup_ip(notrace_hash, ip)))
1289 ret = 1; 1288 ret = 1;
1290 else 1289 else
@@ -1307,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1307 } \ 1306 } \
1308 } 1307 }
1309 1308
1309
1310static int ftrace_cmp_recs(const void *a, const void *b)
1311{
1312 const struct dyn_ftrace *reca = a;
1313 const struct dyn_ftrace *recb = b;
1314
1315 if (reca->ip > recb->ip)
1316 return 1;
1317 if (reca->ip < recb->ip)
1318 return -1;
1319 return 0;
1320}
1321
1322/**
1323 * ftrace_location - return true if the ip given is a traced location
1324 * @ip: the instruction pointer to check
1325 *
1326 * Returns 1 if @ip given is a pointer to a ftrace location.
1327 * That is, the instruction that is either a NOP or call to
1328 * the function tracer. It checks the ftrace internal tables to
1329 * determine if the address belongs or not.
1330 */
1331int ftrace_location(unsigned long ip)
1332{
1333 struct ftrace_page *pg;
1334 struct dyn_ftrace *rec;
1335 struct dyn_ftrace key;
1336
1337 key.ip = ip;
1338
1339 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1340 rec = bsearch(&key, pg->records, pg->index,
1341 sizeof(struct dyn_ftrace),
1342 ftrace_cmp_recs);
1343 if (rec)
1344 return 1;
1345 }
1346
1347 return 0;
1348}
1349
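A hedged example of the kind of caller ftrace_location() anticipates (an editorial sketch; may_patch_address() is invented): code that wants to patch or probe kernel text can first ask whether the address is one of the call sites ftrace manages, and back off if so.

    /* Hypothetical guard before modifying a kernel text address. */
    static int may_patch_address(unsigned long addr)
    {
        if (ftrace_location(addr))
            return 0;   /* ftrace-managed NOP/call site: leave it alone */
        return 1;
    }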
1310static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1350static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1311 int filter_hash, 1351 int filter_hash,
1312 bool inc) 1352 bool inc)
@@ -1336,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1336 if (filter_hash) { 1376 if (filter_hash) {
1337 hash = ops->filter_hash; 1377 hash = ops->filter_hash;
1338 other_hash = ops->notrace_hash; 1378 other_hash = ops->notrace_hash;
1339 if (!hash || !hash->count) 1379 if (ftrace_hash_empty(hash))
1340 all = 1; 1380 all = 1;
1341 } else { 1381 } else {
1342 inc = !inc; 1382 inc = !inc;
@@ -1346,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1346 * If the notrace hash has no items, 1386 * If the notrace hash has no items,
1347 * then there's nothing to do. 1387 * then there's nothing to do.
1348 */ 1388 */
1349 if (hash && !hash->count) 1389 if (ftrace_hash_empty(hash))
1350 return; 1390 return;
1351 } 1391 }
1352 1392
@@ -1363,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1363 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1403 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1364 match = 1; 1404 match = 1;
1365 } else { 1405 } else {
1366 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); 1406 in_hash = !!ftrace_lookup_ip(hash, rec->ip);
1367 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); 1407 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1368 1408
1369 /* 1409 /*
1370 * 1410 *
@@ -1372,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1372 if (filter_hash && in_hash && !in_other_hash) 1412 if (filter_hash && in_hash && !in_other_hash)
1373 match = 1; 1413 match = 1;
1374 else if (!filter_hash && in_hash && 1414 else if (!filter_hash && in_hash &&
1375 (in_other_hash || !other_hash->count)) 1415 (in_other_hash || ftrace_hash_empty(other_hash)))
1376 match = 1; 1416 match = 1;
1377 } 1417 }
1378 if (!match) 1418 if (!match)
@@ -1406,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1406 __ftrace_hash_rec_update(ops, filter_hash, 1); 1446 __ftrace_hash_rec_update(ops, filter_hash, 1);
1407} 1447}
1408 1448
1409static void ftrace_free_rec(struct dyn_ftrace *rec)
1410{
1411 rec->freelist = ftrace_free_records;
1412 ftrace_free_records = rec;
1413 rec->flags |= FTRACE_FL_FREE;
1414}
1415
1416static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 1449static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1417{ 1450{
1418 struct dyn_ftrace *rec; 1451 if (ftrace_pages->index == ftrace_pages->size) {
1419 1452 /* We should have allocated enough */
1420 /* First check for freed records */ 1453 if (WARN_ON(!ftrace_pages->next))
1421 if (ftrace_free_records) {
1422 rec = ftrace_free_records;
1423
1424 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
1425 FTRACE_WARN_ON_ONCE(1);
1426 ftrace_free_records = NULL;
1427 return NULL; 1454 return NULL;
1428 }
1429
1430 ftrace_free_records = rec->freelist;
1431 memset(rec, 0, sizeof(*rec));
1432 return rec;
1433 }
1434
1435 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
1436 if (!ftrace_pages->next) {
1437 /* allocate another page */
1438 ftrace_pages->next =
1439 (void *)get_zeroed_page(GFP_KERNEL);
1440 if (!ftrace_pages->next)
1441 return NULL;
1442 }
1443 ftrace_pages = ftrace_pages->next; 1455 ftrace_pages = ftrace_pages->next;
1444 } 1456 }
1445 1457
@@ -1459,8 +1471,6 @@ ftrace_record_ip(unsigned long ip)
1459 return NULL; 1471 return NULL;
1460 1472
1461 rec->ip = ip; 1473 rec->ip = ip;
1462 rec->newlist = ftrace_new_addrs;
1463 ftrace_new_addrs = rec;
1464 1474
1465 return rec; 1475 return rec;
1466} 1476}
@@ -1475,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1475 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1485 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1476} 1486}
1477 1487
1478static void ftrace_bug(int failed, unsigned long ip) 1488/**
1489 * ftrace_bug - report and shutdown function tracer
1490 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1491 * @ip: The address that failed
1492 *
1493 * The arch code that enables or disables the function tracing
1494 * can call ftrace_bug() when it has detected a problem in
1495 * modifying the code. @failed should be one of either:
1496 * EFAULT - if the problem happens on reading the @ip address
1497 * EINVAL - if what is read at @ip is not what was expected
1498 * EPERM - if the problem happens on writing to the @ip address
1499 */
1500void ftrace_bug(int failed, unsigned long ip)
1479{ 1501{
1480 switch (failed) { 1502 switch (failed) {
1481 case -EFAULT: 1503 case -EFAULT:
@@ -1517,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end)
1517 return 0; 1539 return 0;
1518} 1540}
1519 1541
1520 1542static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1521static int
1522__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1523{ 1543{
1524 unsigned long ftrace_addr;
1525 unsigned long flag = 0UL; 1544 unsigned long flag = 0UL;
1526 1545
1527 ftrace_addr = (unsigned long)FTRACE_ADDR;
1528
1529 /* 1546 /*
1530 * If we are enabling tracing: 1547 * If we are updating calls:
1531 * 1548 *
1532 * If the record has a ref count, then we need to enable it 1549 * If the record has a ref count, then we need to enable it
1533 * because someone is using it. 1550 * because someone is using it.
1534 * 1551 *
1535 * Otherwise we make sure its disabled. 1552 * Otherwise we make sure its disabled.
1536 * 1553 *
1537 * If we are disabling tracing, then disable all records that 1554 * If we are disabling calls, then disable all records that
1538 * are enabled. 1555 * are enabled.
1539 */ 1556 */
1540 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1557 if (enable && (rec->flags & ~FTRACE_FL_MASK))
@@ -1542,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1542 1559
1543 /* If the state of this record hasn't changed, then do nothing */ 1560 /* If the state of this record hasn't changed, then do nothing */
1544 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1561 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1545 return 0; 1562 return FTRACE_UPDATE_IGNORE;
1546 1563
1547 if (flag) { 1564 if (flag) {
1548 rec->flags |= FTRACE_FL_ENABLED; 1565 if (update)
1566 rec->flags |= FTRACE_FL_ENABLED;
1567 return FTRACE_UPDATE_MAKE_CALL;
1568 }
1569
1570 if (update)
1571 rec->flags &= ~FTRACE_FL_ENABLED;
1572
1573 return FTRACE_UPDATE_MAKE_NOP;
1574}
1575
1576/**
 1577 * ftrace_update_record, set a record that is now tracing or not
1578 * @rec: the record to update
1579 * @enable: set to 1 if the record is tracing, zero to force disable
1580 *
1581 * The records that represent all functions that can be traced need
1582 * to be updated when tracing has been enabled.
1583 */
1584int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1585{
1586 return ftrace_check_record(rec, enable, 1);
1587}
1588
1589/**
1590 * ftrace_test_record, check if the record has been enabled or not
1591 * @rec: the record to test
1592 * @enable: set to 1 to check if enabled, 0 if it is disabled
1593 *
1594 * The arch code may need to test if a record is already set to
1595 * tracing to determine how to modify the function code that it
1596 * represents.
1597 */
1598int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1599{
1600 return ftrace_check_record(rec, enable, 0);
1601}
1602
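A minimal sketch of how arch update code could drive the two helpers above, based only on the return values shown here; queue_call()/queue_nop() are hypothetical stand-ins for whatever mechanism an arch uses to stage the new instruction:

	static int sample_update_one(struct dyn_ftrace *rec, int enable)
	{
		unsigned long ftrace_addr = (unsigned long)FTRACE_ADDR;
		int ret;

		/* Peek at what the record wants without touching its flags */
		ret = ftrace_test_record(rec, enable);
		switch (ret) {
		case FTRACE_UPDATE_IGNORE:
			return 0;
		case FTRACE_UPDATE_MAKE_CALL:
			ret = queue_call(rec->ip, ftrace_addr);	/* hypothetical */
			break;
		case FTRACE_UPDATE_MAKE_NOP:
			ret = queue_nop(rec->ip);		/* hypothetical */
			break;
		}
		if (ret)
			return ret;

		/* Only commit the state change once the instruction is staged */
		ftrace_update_record(rec, enable);
		return 0;
	}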
1603static int
1604__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1605{
1606 unsigned long ftrace_addr;
1607 int ret;
1608
1609 ftrace_addr = (unsigned long)FTRACE_ADDR;
1610
1611 ret = ftrace_update_record(rec, enable);
1612
1613 switch (ret) {
1614 case FTRACE_UPDATE_IGNORE:
1615 return 0;
1616
1617 case FTRACE_UPDATE_MAKE_CALL:
1549 return ftrace_make_call(rec, ftrace_addr); 1618 return ftrace_make_call(rec, ftrace_addr);
1619
1620 case FTRACE_UPDATE_MAKE_NOP:
1621 return ftrace_make_nop(NULL, rec, ftrace_addr);
1550 } 1622 }
1551 1623
 1552 rec->flags &= ~FTRACE_FL_ENABLED; 1624 return -1; /* unknown ftrace bug */
1553 return ftrace_make_nop(NULL, rec, ftrace_addr);
1554} 1625}
1555 1626
1556static void ftrace_replace_code(int enable) 1627static void ftrace_replace_code(int update)
1557{ 1628{
1558 struct dyn_ftrace *rec; 1629 struct dyn_ftrace *rec;
1559 struct ftrace_page *pg; 1630 struct ftrace_page *pg;
@@ -1563,11 +1634,7 @@ static void ftrace_replace_code(int enable)
1563 return; 1634 return;
1564 1635
1565 do_for_each_ftrace_rec(pg, rec) { 1636 do_for_each_ftrace_rec(pg, rec) {
1566 /* Skip over free records */ 1637 failed = __ftrace_replace_code(rec, update);
1567 if (rec->flags & FTRACE_FL_FREE)
1568 continue;
1569
1570 failed = __ftrace_replace_code(rec, enable);
1571 if (failed) { 1638 if (failed) {
1572 ftrace_bug(failed, rec->ip); 1639 ftrace_bug(failed, rec->ip);
1573 /* Stop processing */ 1640 /* Stop processing */
@@ -1576,6 +1643,78 @@ static void ftrace_replace_code(int enable)
1576 } while_for_each_ftrace_rec(); 1643 } while_for_each_ftrace_rec();
1577} 1644}
1578 1645
1646struct ftrace_rec_iter {
1647 struct ftrace_page *pg;
1648 int index;
1649};
1650
1651/**
1652 * ftrace_rec_iter_start, start up iterating over traced functions
1653 *
1654 * Returns an iterator handle that is used to iterate over all
1655 * the records that represent address locations where functions
1656 * are traced.
1657 *
1658 * May return NULL if no records are available.
1659 */
1660struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1661{
1662 /*
1663 * We only use a single iterator.
1664 * Protected by the ftrace_lock mutex.
1665 */
1666 static struct ftrace_rec_iter ftrace_rec_iter;
1667 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1668
1669 iter->pg = ftrace_pages_start;
1670 iter->index = 0;
1671
1672 /* Could have empty pages */
1673 while (iter->pg && !iter->pg->index)
1674 iter->pg = iter->pg->next;
1675
1676 if (!iter->pg)
1677 return NULL;
1678
1679 return iter;
1680}
1681
1682/**
1683 * ftrace_rec_iter_next, get the next record to process.
1684 * @iter: The handle to the iterator.
1685 *
1686 * Returns the next iterator after the given iterator @iter.
1687 */
1688struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1689{
1690 iter->index++;
1691
1692 if (iter->index >= iter->pg->index) {
1693 iter->pg = iter->pg->next;
1694 iter->index = 0;
1695
1696 /* Could have empty pages */
1697 while (iter->pg && !iter->pg->index)
1698 iter->pg = iter->pg->next;
1699 }
1700
1701 if (!iter->pg)
1702 return NULL;
1703
1704 return iter;
1705}
1706
1707/**
1708 * ftrace_rec_iter_record, get the record at the iterator location
1709 * @iter: The current iterator location
1710 *
1711 * Returns the record that the current @iter is at.
1712 */
1713struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1714{
1715 return &iter->pg->records[iter->index];
1716}
1717
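The three helpers above let arch code walk every record without knowing about the page layout. A minimal usage sketch, assuming it runs under ftrace_lock since a single static iterator is shared:

	struct ftrace_rec_iter *iter;
	struct dyn_ftrace *rec;

	for (iter = ftrace_rec_iter_start(); iter;
	     iter = ftrace_rec_iter_next(iter)) {
		rec = ftrace_rec_iter_record(iter);
		/* inspect or patch the call site at rec->ip here */
	}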
1579static int 1718static int
1580ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1719ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1581{ 1720{
@@ -1617,13 +1756,7 @@ static int __ftrace_modify_code(void *data)
1617{ 1756{
1618 int *command = data; 1757 int *command = data;
1619 1758
1620 /* 1759 if (*command & FTRACE_UPDATE_CALLS)
1621 * Do not call function tracer while we update the code.
1622 * We are in stop machine, no worrying about races.
1623 */
1624 function_trace_stop++;
1625
1626 if (*command & FTRACE_ENABLE_CALLS)
1627 ftrace_replace_code(1); 1760 ftrace_replace_code(1);
1628 else if (*command & FTRACE_DISABLE_CALLS) 1761 else if (*command & FTRACE_DISABLE_CALLS)
1629 ftrace_replace_code(0); 1762 ftrace_replace_code(0);
@@ -1636,21 +1769,33 @@ static int __ftrace_modify_code(void *data)
1636 else if (*command & FTRACE_STOP_FUNC_RET) 1769 else if (*command & FTRACE_STOP_FUNC_RET)
1637 ftrace_disable_ftrace_graph_caller(); 1770 ftrace_disable_ftrace_graph_caller();
1638 1771
1639#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1640 /*
1641 * For archs that call ftrace_test_stop_func(), we must
1642 * wait till after we update all the function callers
1643 * before we update the callback. This keeps different
1644 * ops that record different functions from corrupting
1645 * each other.
1646 */
1647 __ftrace_trace_function = __ftrace_trace_function_delay;
1648#endif
1649 function_trace_stop--;
1650
1651 return 0; 1772 return 0;
1652} 1773}
1653 1774
1775/**
1776 * ftrace_run_stop_machine, go back to the stop machine method
1777 * @command: The command to tell ftrace what to do
1778 *
 1779 * If an arch needs to fall back to the stop machine method, then
1780 * it can call this function.
1781 */
1782void ftrace_run_stop_machine(int command)
1783{
1784 stop_machine(__ftrace_modify_code, &command, NULL);
1785}
1786
1787/**
1788 * arch_ftrace_update_code, modify the code to trace or not trace
1789 * @command: The command that needs to be done
1790 *
 1791 * Archs can override this function if they do not need to
1792 * run stop_machine() to modify code.
1793 */
1794void __weak arch_ftrace_update_code(int command)
1795{
1796 ftrace_run_stop_machine(command);
1797}
1798
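Because arch_ftrace_update_code() is declared __weak, an architecture with a safer patching scheme only needs to provide its own definition and everything else keeps calling through the same entry point. A sketch, where sample_arch_patch_sites() is a made-up placeholder rather than a real kernel function:

	void arch_ftrace_update_code(int command)
	{
		/* Try the arch's own live-patching scheme first */
		if (sample_arch_patch_sites(command) == 0)	/* hypothetical */
			return;

		/* Otherwise fall back to the common stop_machine() path */
		ftrace_run_stop_machine(command);
	}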
1654static void ftrace_run_update_code(int command) 1799static void ftrace_run_update_code(int command)
1655{ 1800{
1656 int ret; 1801 int ret;
@@ -1659,8 +1804,31 @@ static void ftrace_run_update_code(int command)
1659 FTRACE_WARN_ON(ret); 1804 FTRACE_WARN_ON(ret);
1660 if (ret) 1805 if (ret)
1661 return; 1806 return;
1807 /*
1808 * Do not call function tracer while we update the code.
1809 * We are in stop machine.
1810 */
1811 function_trace_stop++;
1662 1812
1663 stop_machine(__ftrace_modify_code, &command, NULL); 1813 /*
1814 * By default we use stop_machine() to modify the code.
 1815 * But archs can do whatever they want as long as it
1816 * is safe. The stop_machine() is the safest, but also
1817 * produces the most overhead.
1818 */
1819 arch_ftrace_update_code(command);
1820
1821#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1822 /*
1823 * For archs that call ftrace_test_stop_func(), we must
1824 * wait till after we update all the function callers
1825 * before we update the callback. This keeps different
1826 * ops that record different functions from corrupting
1827 * each other.
1828 */
1829 __ftrace_trace_function = __ftrace_trace_function_delay;
1830#endif
1831 function_trace_stop--;
1664 1832
1665 ret = ftrace_arch_code_modify_post_process(); 1833 ret = ftrace_arch_code_modify_post_process();
1666 FTRACE_WARN_ON(ret); 1834 FTRACE_WARN_ON(ret);
@@ -1691,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
1691 return -ENODEV; 1859 return -ENODEV;
1692 1860
1693 ftrace_start_up++; 1861 ftrace_start_up++;
1694 command |= FTRACE_ENABLE_CALLS; 1862 command |= FTRACE_UPDATE_CALLS;
1695 1863
1696 /* ops marked global share the filter hashes */ 1864 /* ops marked global share the filter hashes */
1697 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 1865 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1743,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1743 if (ops != &global_ops || !global_start_up) 1911 if (ops != &global_ops || !global_start_up)
1744 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 1912 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1745 1913
1746 if (!ftrace_start_up) 1914 command |= FTRACE_UPDATE_CALLS;
1747 command |= FTRACE_DISABLE_CALLS;
1748 1915
1749 if (saved_ftrace_func != ftrace_trace_function) { 1916 if (saved_ftrace_func != ftrace_trace_function) {
1750 saved_ftrace_func = ftrace_trace_function; 1917 saved_ftrace_func = ftrace_trace_function;
@@ -1766,7 +1933,7 @@ static void ftrace_startup_sysctl(void)
1766 saved_ftrace_func = NULL; 1933 saved_ftrace_func = NULL;
1767 /* ftrace_start_up is true if we want ftrace running */ 1934 /* ftrace_start_up is true if we want ftrace running */
1768 if (ftrace_start_up) 1935 if (ftrace_start_up)
1769 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1936 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
1770} 1937}
1771 1938
1772static void ftrace_shutdown_sysctl(void) 1939static void ftrace_shutdown_sysctl(void)
@@ -1788,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops)
1788 struct ftrace_hash *hash; 1955 struct ftrace_hash *hash;
1789 1956
1790 hash = ops->filter_hash; 1957 hash = ops->filter_hash;
1791 return !!(!hash || !hash->count); 1958 return ftrace_hash_empty(hash);
1792} 1959}
1793 1960
1794static int ftrace_update_code(struct module *mod) 1961static int ftrace_update_code(struct module *mod)
1795{ 1962{
1963 struct ftrace_page *pg;
1796 struct dyn_ftrace *p; 1964 struct dyn_ftrace *p;
1797 cycle_t start, stop; 1965 cycle_t start, stop;
1798 unsigned long ref = 0; 1966 unsigned long ref = 0;
1967 int i;
1799 1968
1800 /* 1969 /*
1801 * When adding a module, we need to check if tracers are 1970 * When adding a module, we need to check if tracers are
@@ -1817,46 +1986,44 @@ static int ftrace_update_code(struct module *mod)
1817 start = ftrace_now(raw_smp_processor_id()); 1986 start = ftrace_now(raw_smp_processor_id());
1818 ftrace_update_cnt = 0; 1987 ftrace_update_cnt = 0;
1819 1988
1820 while (ftrace_new_addrs) { 1989 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
1821 1990
1822 /* If something went wrong, bail without enabling anything */ 1991 for (i = 0; i < pg->index; i++) {
1823 if (unlikely(ftrace_disabled)) 1992 /* If something went wrong, bail without enabling anything */
1824 return -1; 1993 if (unlikely(ftrace_disabled))
1994 return -1;
1825 1995
1826 p = ftrace_new_addrs; 1996 p = &pg->records[i];
1827 ftrace_new_addrs = p->newlist; 1997 p->flags = ref;
1828 p->flags = ref;
1829 1998
1830 /* 1999 /*
1831 * Do the initial record conversion from mcount jump 2000 * Do the initial record conversion from mcount jump
1832 * to the NOP instructions. 2001 * to the NOP instructions.
1833 */ 2002 */
1834 if (!ftrace_code_disable(mod, p)) { 2003 if (!ftrace_code_disable(mod, p))
1835 ftrace_free_rec(p); 2004 break;
1836 /* Game over */
1837 break;
1838 }
1839 2005
1840 ftrace_update_cnt++; 2006 ftrace_update_cnt++;
1841 2007
1842 /* 2008 /*
1843 * If the tracing is enabled, go ahead and enable the record. 2009 * If the tracing is enabled, go ahead and enable the record.
1844 * 2010 *
 1845 * The reason not to enable the record immediately is the 2011 * inherent check of ftrace_make_nop/ftrace_make_call for
1846 * inherent check of ftrace_make_nop/ftrace_make_call for 2012 * inherent check of ftrace_make_nop/ftrace_make_call for
1847 * correct previous instructions. Making first the NOP 2013 * correct previous instructions. Making first the NOP
1848 * conversion puts the module to the correct state, thus 2014 * conversion puts the module to the correct state, thus
1849 * passing the ftrace_make_call check. 2015 * passing the ftrace_make_call check.
1850 */ 2016 */
1851 if (ftrace_start_up && ref) { 2017 if (ftrace_start_up && ref) {
1852 int failed = __ftrace_replace_code(p, 1); 2018 int failed = __ftrace_replace_code(p, 1);
1853 if (failed) { 2019 if (failed)
1854 ftrace_bug(failed, p->ip); 2020 ftrace_bug(failed, p->ip);
1855 ftrace_free_rec(p);
1856 } 2021 }
1857 } 2022 }
1858 } 2023 }
1859 2024
2025 ftrace_new_pgs = NULL;
2026
1860 stop = ftrace_now(raw_smp_processor_id()); 2027 stop = ftrace_now(raw_smp_processor_id());
1861 ftrace_update_time = stop - start; 2028 ftrace_update_time = stop - start;
1862 ftrace_update_tot_cnt += ftrace_update_cnt; 2029 ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -1864,57 +2031,108 @@ static int ftrace_update_code(struct module *mod)
1864 return 0; 2031 return 0;
1865} 2032}
1866 2033
1867static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) 2034static int ftrace_allocate_records(struct ftrace_page *pg, int count)
1868{ 2035{
1869 struct ftrace_page *pg; 2036 int order;
1870 int cnt; 2037 int cnt;
1871 int i;
1872 2038
1873 /* allocate a few pages */ 2039 if (WARN_ON(!count))
1874 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); 2040 return -EINVAL;
1875 if (!ftrace_pages_start) 2041
1876 return -1; 2042 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
1877 2043
1878 /* 2044 /*
1879 * Allocate a few more pages. 2045 * We want to fill as much as possible. No more than a page
1880 * 2046 * may be empty.
1881 * TODO: have some parser search vmlinux before
1882 * final linking to find all calls to ftrace.
1883 * Then we can:
1884 * a) know how many pages to allocate.
1885 * and/or
1886 * b) set up the table then.
1887 *
1888 * The dynamic code is still necessary for
1889 * modules.
1890 */ 2047 */
2048 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2049 order--;
1891 2050
1892 pg = ftrace_pages = ftrace_pages_start; 2051 again:
2052 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
1893 2053
1894 cnt = num_to_init / ENTRIES_PER_PAGE; 2054 if (!pg->records) {
1895 pr_info("ftrace: allocating %ld entries in %d pages\n", 2055 /* if we can't allocate this size, try something smaller */
1896 num_to_init, cnt + 1); 2056 if (!order)
2057 return -ENOMEM;
2058 order >>= 1;
2059 goto again;
2060 }
1897 2061
1898 for (i = 0; i < cnt; i++) { 2062 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
1899 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 2063 pg->size = cnt;
1900 2064
1901 /* If we fail, we'll try later anyway */ 2065 if (cnt > count)
1902 if (!pg->next) 2066 cnt = count;
2067
2068 return cnt;
2069}
2070
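A worked example of the sizing logic above, using purely illustrative numbers (4 KB pages and 16-byte records, so ENTRIES_PER_PAGE = 256): for count = 600, DIV_ROUND_UP(600, 256) = 3 pages gives an initial order of 2, i.e. four pages with 1024 slots. Since 1024 >= 600 + 256, the while loop drops the order to 1 (two pages, 512 slots) and stops there because 512 < 856. The call returns 512, and ftrace_allocate_pages() below allocates a further group for the remaining 88 records, so in the end no more than one page's worth of slots is left unused.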
2071static struct ftrace_page *
2072ftrace_allocate_pages(unsigned long num_to_init)
2073{
2074 struct ftrace_page *start_pg;
2075 struct ftrace_page *pg;
2076 int order;
2077 int cnt;
2078
2079 if (!num_to_init)
2080 return 0;
2081
2082 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2083 if (!pg)
2084 return NULL;
2085
2086 /*
 2087 * Try to allocate as much as possible in one contiguous
2088 * location that fills in all of the space. We want to
2089 * waste as little space as possible.
2090 */
2091 for (;;) {
2092 cnt = ftrace_allocate_records(pg, num_to_init);
2093 if (cnt < 0)
2094 goto free_pages;
2095
2096 num_to_init -= cnt;
2097 if (!num_to_init)
1903 break; 2098 break;
1904 2099
2100 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
2101 if (!pg->next)
2102 goto free_pages;
2103
1905 pg = pg->next; 2104 pg = pg->next;
1906 } 2105 }
1907 2106
1908 return 0; 2107 return start_pg;
2108
2109 free_pages:
2110 while (start_pg) {
2111 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2112 free_pages((unsigned long)pg->records, order);
2113 start_pg = pg->next;
2114 kfree(pg);
2115 pg = start_pg;
2116 }
2117 pr_info("ftrace: FAILED to allocate memory for functions\n");
2118 return NULL;
1909} 2119}
1910 2120
1911enum { 2121static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1912 FTRACE_ITER_FILTER = (1 << 0), 2122{
1913 FTRACE_ITER_NOTRACE = (1 << 1), 2123 int cnt;
1914 FTRACE_ITER_PRINTALL = (1 << 2), 2124
1915 FTRACE_ITER_HASH = (1 << 3), 2125 if (!num_to_init) {
1916 FTRACE_ITER_ENABLED = (1 << 4), 2126 pr_info("ftrace: No functions to be traced?\n");
1917}; 2127 return -1;
2128 }
2129
2130 cnt = num_to_init / ENTRIES_PER_PAGE;
2131 pr_info("ftrace: allocating %ld entries in %d pages\n",
2132 num_to_init, cnt + 1);
2133
2134 return 0;
2135}
1918 2136
1919#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2137#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1920 2138
@@ -1980,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1980 void *p = NULL; 2198 void *p = NULL;
1981 loff_t l; 2199 loff_t l;
1982 2200
2201 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2202 return NULL;
2203
1983 if (iter->func_pos > *pos) 2204 if (iter->func_pos > *pos)
1984 return NULL; 2205 return NULL;
1985 2206
@@ -2023,7 +2244,7 @@ static void *
2023t_next(struct seq_file *m, void *v, loff_t *pos) 2244t_next(struct seq_file *m, void *v, loff_t *pos)
2024{ 2245{
2025 struct ftrace_iterator *iter = m->private; 2246 struct ftrace_iterator *iter = m->private;
2026 struct ftrace_ops *ops = &global_ops; 2247 struct ftrace_ops *ops = iter->ops;
2027 struct dyn_ftrace *rec = NULL; 2248 struct dyn_ftrace *rec = NULL;
2028 2249
2029 if (unlikely(ftrace_disabled)) 2250 if (unlikely(ftrace_disabled))
@@ -2047,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2047 } 2268 }
2048 } else { 2269 } else {
2049 rec = &iter->pg->records[iter->idx++]; 2270 rec = &iter->pg->records[iter->idx++];
2050 if ((rec->flags & FTRACE_FL_FREE) || 2271 if (((iter->flags & FTRACE_ITER_FILTER) &&
2051
2052 ((iter->flags & FTRACE_ITER_FILTER) &&
2053 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2272 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2054 2273
2055 ((iter->flags & FTRACE_ITER_NOTRACE) && 2274 ((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2081,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2081static void *t_start(struct seq_file *m, loff_t *pos) 2300static void *t_start(struct seq_file *m, loff_t *pos)
2082{ 2301{
2083 struct ftrace_iterator *iter = m->private; 2302 struct ftrace_iterator *iter = m->private;
2084 struct ftrace_ops *ops = &global_ops; 2303 struct ftrace_ops *ops = iter->ops;
2085 void *p = NULL; 2304 void *p = NULL;
2086 loff_t l; 2305 loff_t l;
2087 2306
@@ -2101,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2101 * off, we can short cut and just print out that all 2320 * off, we can short cut and just print out that all
2102 * functions are enabled. 2321 * functions are enabled.
2103 */ 2322 */
2104 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { 2323 if (iter->flags & FTRACE_ITER_FILTER &&
2324 ftrace_hash_empty(ops->filter_hash)) {
2105 if (*pos > 0) 2325 if (*pos > 0)
2106 return t_hash_start(m, pos); 2326 return t_hash_start(m, pos);
2107 iter->flags |= FTRACE_ITER_PRINTALL; 2327 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2126,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2126 break; 2346 break;
2127 } 2347 }
2128 2348
2129 if (!p) { 2349 if (!p)
2130 if (iter->flags & FTRACE_ITER_FILTER) 2350 return t_hash_start(m, pos);
2131 return t_hash_start(m, pos);
2132
2133 return NULL;
2134 }
2135 2351
2136 return iter; 2352 return iter;
2137} 2353}
@@ -2189,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
2189 return -ENOMEM; 2405 return -ENOMEM;
2190 2406
2191 iter->pg = ftrace_pages_start; 2407 iter->pg = ftrace_pages_start;
2408 iter->ops = &global_ops;
2192 2409
2193 ret = seq_open(file, &show_ftrace_seq_ops); 2410 ret = seq_open(file, &show_ftrace_seq_ops);
2194 if (!ret) { 2411 if (!ret) {
@@ -2217,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
2217 2434
2218 iter->pg = ftrace_pages_start; 2435 iter->pg = ftrace_pages_start;
2219 iter->flags = FTRACE_ITER_ENABLED; 2436 iter->flags = FTRACE_ITER_ENABLED;
2437 iter->ops = &global_ops;
2220 2438
2221 ret = seq_open(file, &show_ftrace_seq_ops); 2439 ret = seq_open(file, &show_ftrace_seq_ops);
2222 if (!ret) { 2440 if (!ret) {
@@ -2237,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2237 mutex_unlock(&ftrace_lock); 2455 mutex_unlock(&ftrace_lock);
2238} 2456}
2239 2457
2240static int 2458/**
2459 * ftrace_regex_open - initialize function tracer filter files
2460 * @ops: The ftrace_ops that hold the hash filters
2461 * @flag: The type of filter to process
2462 * @inode: The inode, usually passed in to your open routine
2463 * @file: The file, usually passed in to your open routine
2464 *
2465 * ftrace_regex_open() initializes the filter files for the
2466 * @ops. Depending on @flag it may process the filter hash or
2467 * the notrace hash of @ops. With this called from the open
2468 * routine, you can use ftrace_filter_write() for the write
2469 * routine if @flag has FTRACE_ITER_FILTER set, or
2470 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2471 * ftrace_regex_lseek() should be used as the lseek routine, and
2472 * release must call ftrace_regex_release().
2473 */
2474int
2241ftrace_regex_open(struct ftrace_ops *ops, int flag, 2475ftrace_regex_open(struct ftrace_ops *ops, int flag,
2242 struct inode *inode, struct file *file) 2476 struct inode *inode, struct file *file)
2243{ 2477{
@@ -2306,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2306static int 2540static int
2307ftrace_filter_open(struct inode *inode, struct file *file) 2541ftrace_filter_open(struct inode *inode, struct file *file)
2308{ 2542{
2309 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, 2543 return ftrace_regex_open(&global_ops,
2310 inode, file); 2544 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2545 inode, file);
2311} 2546}
2312 2547
2313static int 2548static int
@@ -2317,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2317 inode, file); 2552 inode, file);
2318} 2553}
2319 2554
2320static loff_t 2555loff_t
2321ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2556ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2322{ 2557{
2323 loff_t ret; 2558 loff_t ret;
@@ -2426,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff,
2426 goto out_unlock; 2661 goto out_unlock;
2427 2662
2428 do_for_each_ftrace_rec(pg, rec) { 2663 do_for_each_ftrace_rec(pg, rec) {
2429
2430 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2664 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2431 ret = enter_record(hash, rec, not); 2665 ret = enter_record(hash, rec, not);
2432 if (ret < 0) { 2666 if (ret < 0) {
@@ -2871,14 +3105,14 @@ out_unlock:
2871 return ret; 3105 return ret;
2872} 3106}
2873 3107
2874static ssize_t 3108ssize_t
2875ftrace_filter_write(struct file *file, const char __user *ubuf, 3109ftrace_filter_write(struct file *file, const char __user *ubuf,
2876 size_t cnt, loff_t *ppos) 3110 size_t cnt, loff_t *ppos)
2877{ 3111{
2878 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 3112 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
2879} 3113}
2880 3114
2881static ssize_t 3115ssize_t
2882ftrace_notrace_write(struct file *file, const char __user *ubuf, 3116ftrace_notrace_write(struct file *file, const char __user *ubuf,
2883 size_t cnt, loff_t *ppos) 3117 size_t cnt, loff_t *ppos)
2884{ 3118{
@@ -2919,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2919 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2920 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3154 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2921 && ftrace_enabled) 3155 && ftrace_enabled)
2922 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3156 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2923 3157
2924 mutex_unlock(&ftrace_lock); 3158 mutex_unlock(&ftrace_lock);
2925 3159
@@ -3045,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf)
3045} 3279}
3046#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3047 3281
3048static void __init 3282void __init
3049set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3283ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3050{ 3284{
3051 char *func; 3285 char *func;
3052 3286
@@ -3059,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3059static void __init set_ftrace_early_filters(void) 3293static void __init set_ftrace_early_filters(void)
3060{ 3294{
3061 if (ftrace_filter_buf[0]) 3295 if (ftrace_filter_buf[0])
3062 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); 3296 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
3063 if (ftrace_notrace_buf[0]) 3297 if (ftrace_notrace_buf[0])
3064 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); 3298 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3065#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3299#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3066 if (ftrace_graph_buf[0]) 3300 if (ftrace_graph_buf[0])
3067 set_ftrace_early_graph(ftrace_graph_buf); 3301 set_ftrace_early_graph(ftrace_graph_buf);
3068#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3302#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3069} 3303}
3070 3304
3071static int 3305int ftrace_regex_release(struct inode *inode, struct file *file)
3072ftrace_regex_release(struct inode *inode, struct file *file)
3073{ 3306{
3074 struct seq_file *m = (struct seq_file *)file->private_data; 3307 struct seq_file *m = (struct seq_file *)file->private_data;
3075 struct ftrace_iterator *iter; 3308 struct ftrace_iterator *iter;
@@ -3107,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3107 orig_hash, iter->hash); 3340 orig_hash, iter->hash);
3108 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3341 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3109 && ftrace_enabled) 3342 && ftrace_enabled)
3110 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3343 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3111 3344
3112 mutex_unlock(&ftrace_lock); 3345 mutex_unlock(&ftrace_lock);
3113 } 3346 }
@@ -3270,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3270 3503
3271 do_for_each_ftrace_rec(pg, rec) { 3504 do_for_each_ftrace_rec(pg, rec) {
3272 3505
3273 if (rec->flags & FTRACE_FL_FREE)
3274 continue;
3275
3276 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3506 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3277 /* if it is in the array */ 3507 /* if it is in the array */
3278 exists = false; 3508 exists = false;
@@ -3381,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3381 return 0; 3611 return 0;
3382} 3612}
3383 3613
3614static void ftrace_swap_recs(void *a, void *b, int size)
3615{
3616 struct dyn_ftrace *reca = a;
3617 struct dyn_ftrace *recb = b;
3618 struct dyn_ftrace t;
3619
3620 t = *reca;
3621 *reca = *recb;
3622 *recb = t;
3623}
3624
3384static int ftrace_process_locs(struct module *mod, 3625static int ftrace_process_locs(struct module *mod,
3385 unsigned long *start, 3626 unsigned long *start,
3386 unsigned long *end) 3627 unsigned long *end)
3387{ 3628{
3629 struct ftrace_page *pg;
3630 unsigned long count;
3388 unsigned long *p; 3631 unsigned long *p;
3389 unsigned long addr; 3632 unsigned long addr;
3390 unsigned long flags = 0; /* Shut up gcc */ 3633 unsigned long flags = 0; /* Shut up gcc */
3634 int ret = -ENOMEM;
3635
3636 count = end - start;
3637
3638 if (!count)
3639 return 0;
3640
3641 pg = ftrace_allocate_pages(count);
3642 if (!pg)
3643 return -ENOMEM;
3391 3644
3392 mutex_lock(&ftrace_lock); 3645 mutex_lock(&ftrace_lock);
3646
3647 /*
3648 * Core and each module needs their own pages, as
3649 * modules will free them when they are removed.
3650 * Force a new page to be allocated for modules.
3651 */
3652 if (!mod) {
3653 WARN_ON(ftrace_pages || ftrace_pages_start);
3654 /* First initialization */
3655 ftrace_pages = ftrace_pages_start = pg;
3656 } else {
3657 if (!ftrace_pages)
3658 goto out;
3659
3660 if (WARN_ON(ftrace_pages->next)) {
3661 /* Hmm, we have free pages? */
3662 while (ftrace_pages->next)
3663 ftrace_pages = ftrace_pages->next;
3664 }
3665
3666 ftrace_pages->next = pg;
3667 ftrace_pages = pg;
3668 }
3669
3393 p = start; 3670 p = start;
3394 while (p < end) { 3671 while (p < end) {
3395 addr = ftrace_call_adjust(*p++); 3672 addr = ftrace_call_adjust(*p++);
@@ -3401,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod,
3401 */ 3678 */
3402 if (!addr) 3679 if (!addr)
3403 continue; 3680 continue;
3404 ftrace_record_ip(addr); 3681 if (!ftrace_record_ip(addr))
3682 break;
3405 } 3683 }
3406 3684
3685 /* These new locations need to be initialized */
3686 ftrace_new_pgs = pg;
3687
3688 /* Make each individual set of pages sorted by ips */
3689 for (; pg; pg = pg->next)
3690 sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
3691 ftrace_cmp_recs, ftrace_swap_recs);
3692
3407 /* 3693 /*
3408 * We only need to disable interrupts on start up 3694 * We only need to disable interrupts on start up
3409 * because we are modifying code that an interrupt 3695 * because we are modifying code that an interrupt
@@ -3417,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod,
3417 ftrace_update_code(mod); 3703 ftrace_update_code(mod);
3418 if (!mod) 3704 if (!mod)
3419 local_irq_restore(flags); 3705 local_irq_restore(flags);
3706 ret = 0;
3707 out:
3420 mutex_unlock(&ftrace_lock); 3708 mutex_unlock(&ftrace_lock);
3421 3709
3422 return 0; 3710 return ret;
3423} 3711}
3424 3712
3425#ifdef CONFIG_MODULES 3713#ifdef CONFIG_MODULES
3714
3715#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3716
3426void ftrace_release_mod(struct module *mod) 3717void ftrace_release_mod(struct module *mod)
3427{ 3718{
3428 struct dyn_ftrace *rec; 3719 struct dyn_ftrace *rec;
3720 struct ftrace_page **last_pg;
3429 struct ftrace_page *pg; 3721 struct ftrace_page *pg;
3722 int order;
3430 3723
3431 mutex_lock(&ftrace_lock); 3724 mutex_lock(&ftrace_lock);
3432 3725
3433 if (ftrace_disabled) 3726 if (ftrace_disabled)
3434 goto out_unlock; 3727 goto out_unlock;
3435 3728
3436 do_for_each_ftrace_rec(pg, rec) { 3729 /*
3730 * Each module has its own ftrace_pages, remove
3731 * them from the list.
3732 */
3733 last_pg = &ftrace_pages_start;
3734 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3735 rec = &pg->records[0];
3437 if (within_module_core(rec->ip, mod)) { 3736 if (within_module_core(rec->ip, mod)) {
3438 /* 3737 /*
3439 * rec->ip is changed in ftrace_free_rec() 3738 * As core pages are first, the first
3440 * It should not between s and e if record was freed. 3739 * page should never be a module page.
3441 */ 3740 */
3442 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); 3741 if (WARN_ON(pg == ftrace_pages_start))
3443 ftrace_free_rec(rec); 3742 goto out_unlock;
3444 } 3743
3445 } while_for_each_ftrace_rec(); 3744 /* Check if we are deleting the last page */
3745 if (pg == ftrace_pages)
3746 ftrace_pages = next_to_ftrace_page(last_pg);
3747
3748 *last_pg = pg->next;
3749 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3750 free_pages((unsigned long)pg->records, order);
3751 kfree(pg);
3752 } else
3753 last_pg = &pg->next;
3754 }
3446 out_unlock: 3755 out_unlock:
3447 mutex_unlock(&ftrace_lock); 3756 mutex_unlock(&ftrace_lock);
3448} 3757}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..a3f1bc5d2a00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
338/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
342 TRACE_ITER_IRQ_INFO;
342 343
343static int trace_stop_count; 344static int trace_stop_count;
344static DEFINE_RAW_SPINLOCK(tracing_start_lock); 345static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
426 "record-cmd", 427 "record-cmd",
427 "overwrite", 428 "overwrite",
428 "disable_on_free", 429 "disable_on_free",
430 "irq-info",
429 NULL 431 NULL
430}; 432};
431 433
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
1843 trace_event_read_unlock(); 1845 trace_event_read_unlock();
1844} 1846}
1845 1847
1848static void
1849get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
1850{
1851 unsigned long count;
1852 int cpu;
1853
1854 *total = 0;
1855 *entries = 0;
1856
1857 for_each_tracing_cpu(cpu) {
1858 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1859 /*
1860 * If this buffer has skipped entries, then we hold all
1861 * entries for the trace and we need to ignore the
1862 * ones before the time stamp.
1863 */
1864 if (tr->data[cpu]->skipped_entries) {
1865 count -= tr->data[cpu]->skipped_entries;
1866 /* total is the same as the entries */
1867 *total += count;
1868 } else
1869 *total += count +
1870 ring_buffer_overrun_cpu(tr->buffer, cpu);
1871 *entries += count;
1872 }
1873}
1874
1846static void print_lat_help_header(struct seq_file *m) 1875static void print_lat_help_header(struct seq_file *m)
1847{ 1876{
1848 seq_puts(m, "# _------=> CPU# \n"); 1877 seq_puts(m, "# _------=> CPU# \n");
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
1855 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1884 seq_puts(m, "# \\ / ||||| \\ | / \n");
1856} 1885}
1857 1886
1858static void print_func_help_header(struct seq_file *m) 1887static void print_event_info(struct trace_array *tr, struct seq_file *m)
1888{
1889 unsigned long total;
1890 unsigned long entries;
1891
1892 get_total_entries(tr, &total, &entries);
1893 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
1894 entries, total, num_online_cpus());
1895 seq_puts(m, "#\n");
1896}
1897
1898static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
1859{ 1899{
1860 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1900 print_event_info(tr, m);
1901 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1861 seq_puts(m, "# | | | | |\n"); 1902 seq_puts(m, "# | | | | |\n");
1862} 1903}
1863 1904
1905static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
1906{
1907 print_event_info(tr, m);
1908 seq_puts(m, "# _-----=> irqs-off\n");
1909 seq_puts(m, "# / _----=> need-resched\n");
1910 seq_puts(m, "# | / _---=> hardirq/softirq\n");
1911 seq_puts(m, "# || / _--=> preempt-depth\n");
1912 seq_puts(m, "# ||| / delay\n");
1913 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
1914 seq_puts(m, "# | | | |||| | |\n");
1915}
1864 1916
1865void 1917void
1866print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1918print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1869 struct trace_array *tr = iter->tr; 1921 struct trace_array *tr = iter->tr;
1870 struct trace_array_cpu *data = tr->data[tr->cpu]; 1922 struct trace_array_cpu *data = tr->data[tr->cpu];
1871 struct tracer *type = current_trace; 1923 struct tracer *type = current_trace;
1872 unsigned long entries = 0; 1924 unsigned long entries;
1873 unsigned long total = 0; 1925 unsigned long total;
1874 unsigned long count;
1875 const char *name = "preemption"; 1926 const char *name = "preemption";
1876 int cpu;
1877 1927
1878 if (type) 1928 if (type)
1879 name = type->name; 1929 name = type->name;
1880 1930
1881 1931 get_total_entries(tr, &total, &entries);
1882 for_each_tracing_cpu(cpu) {
1883 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1884 /*
1885 * If this buffer has skipped entries, then we hold all
1886 * entries for the trace and we need to ignore the
1887 * ones before the time stamp.
1888 */
1889 if (tr->data[cpu]->skipped_entries) {
1890 count -= tr->data[cpu]->skipped_entries;
1891 /* total is the same as the entries */
1892 total += count;
1893 } else
1894 total += count +
1895 ring_buffer_overrun_cpu(tr->buffer, cpu);
1896 entries += count;
1897 }
1898 1932
1899 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1933 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1900 name, UTS_RELEASE); 1934 name, UTS_RELEASE);
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2140 return print_trace_fmt(iter); 2174 return print_trace_fmt(iter);
2141} 2175}
2142 2176
2177void trace_latency_header(struct seq_file *m)
2178{
2179 struct trace_iterator *iter = m->private;
2180
2181 /* print nothing if the buffers are empty */
2182 if (trace_empty(iter))
2183 return;
2184
2185 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2186 print_trace_header(m, iter);
2187
2188 if (!(trace_flags & TRACE_ITER_VERBOSE))
2189 print_lat_help_header(m);
2190}
2191
2143void trace_default_header(struct seq_file *m) 2192void trace_default_header(struct seq_file *m)
2144{ 2193{
2145 struct trace_iterator *iter = m->private; 2194 struct trace_iterator *iter = m->private;
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m)
2155 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2204 if (!(trace_flags & TRACE_ITER_VERBOSE))
2156 print_lat_help_header(m); 2205 print_lat_help_header(m);
2157 } else { 2206 } else {
2158 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2207 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2159 print_func_help_header(m); 2208 if (trace_flags & TRACE_ITER_IRQ_INFO)
2209 print_func_help_header_irq(iter->tr, m);
2210 else
2211 print_func_help_header(iter->tr, m);
2212 }
2160 } 2213 }
2161} 2214}
2162 2215
@@ -4385,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
4385}; 4438};
4386 4439
4387struct dentry *trace_create_file(const char *name, 4440struct dentry *trace_create_file(const char *name,
4388 mode_t mode, 4441 umode_t mode,
4389 struct dentry *parent, 4442 struct dentry *parent,
4390 void *data, 4443 void *data,
4391 const struct file_operations *fops) 4444 const struct file_operations *fops)
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4775{ 4828{
4776 __ftrace_dump(true, oops_dump_mode); 4829 __ftrace_dump(true, oops_dump_mode);
4777} 4830}
4831EXPORT_SYMBOL_GPL(ftrace_dump);
4778 4832
4779__init static int tracer_alloc_buffers(void) 4833__init static int tracer_alloc_buffers(void)
4780{ 4834{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18dc..b93ecbadad6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
312void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
313int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
314struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
315 mode_t mode, 315 umode_t mode,
316 struct dentry *parent, 316 struct dentry *parent,
317 void *data, 317 void *data,
318 const struct file_operations *fops); 318 const struct file_operations *fops);
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
370 unsigned long ip, 370 unsigned long ip,
371 unsigned long parent_ip, 371 unsigned long parent_ip,
372 unsigned long flags, int pc); 372 unsigned long flags, int pc);
373void trace_latency_header(struct seq_file *m);
373void trace_default_header(struct seq_file *m); 374void trace_default_header(struct seq_file *m);
374void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 375void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
375int trace_empty(struct trace_iterator *iter); 376int trace_empty(struct trace_iterator *iter);
@@ -654,6 +655,7 @@ enum trace_iterator_flags {
654 TRACE_ITER_RECORD_CMD = 0x100000, 655 TRACE_ITER_RECORD_CMD = 0x100000,
655 TRACE_ITER_OVERWRITE = 0x200000, 656 TRACE_ITER_OVERWRITE = 0x200000,
656 TRACE_ITER_STOP_ON_FREE = 0x400000, 657 TRACE_ITER_STOP_ON_FREE = 0x400000,
658 TRACE_ITER_IRQ_INFO = 0x800000,
657}; 659};
658 660
659/* 661/*
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 95dc31efd6dd..24aee7127451 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
30enum filter_op_ids 36enum filter_op_ids
31{ 37{
32 OP_OR, 38 OP_OR,
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
646 if (filter && filter->filter_string) 652 if (filter && filter->filter_string)
647 trace_seq_printf(s, "%s\n", filter->filter_string); 653 trace_seq_printf(s, "%s\n", filter->filter_string);
648 else 654 else
649 trace_seq_printf(s, "none\n"); 655 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
650 mutex_unlock(&event_mutex); 656 mutex_unlock(&event_mutex);
651} 657}
652 658
@@ -1732,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system,
1732 return -ENOMEM; 1738 return -ENOMEM;
1733} 1739}
1734 1740
1741static int create_filter_start(char *filter_str, bool set_str,
1742 struct filter_parse_state **psp,
1743 struct event_filter **filterp)
1744{
1745 struct event_filter *filter;
1746 struct filter_parse_state *ps = NULL;
1747 int err = 0;
1748
1749 WARN_ON_ONCE(*psp || *filterp);
1750
1751 /* allocate everything, and if any fails, free all and fail */
1752 filter = __alloc_filter();
1753 if (filter && set_str)
1754 err = replace_filter_string(filter, filter_str);
1755
1756 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1757
1758 if (!filter || !ps || err) {
1759 kfree(ps);
1760 __free_filter(filter);
1761 return -ENOMEM;
1762 }
1763
1764 /* we're committed to creating a new filter */
1765 *filterp = filter;
1766 *psp = ps;
1767
1768 parse_init(ps, filter_ops, filter_str);
1769 err = filter_parse(ps);
1770 if (err && set_str)
1771 append_filter_err(ps, filter);
1772 return err;
1773}
1774
1775static void create_filter_finish(struct filter_parse_state *ps)
1776{
1777 if (ps) {
1778 filter_opstack_clear(ps);
1779 postfix_clear(ps);
1780 kfree(ps);
1781 }
1782}
1783
1784/**
1785 * create_filter - create a filter for a ftrace_event_call
1786 * @call: ftrace_event_call to create a filter for
1787 * @filter_str: filter string
1788 * @set_str: remember @filter_str and enable detailed error in filter
1789 * @filterp: out param for created filter (always updated on return)
1790 *
1791 * Creates a filter for @call with @filter_str. If @set_str is %true,
1792 * @filter_str is copied and recorded in the new filter.
1793 *
1794 * On success, returns 0 and *@filterp points to the new filter. On
1795 * failure, returns -errno and *@filterp may point to %NULL or to a new
1796 * filter. In the latter case, the returned filter contains error
1797 * information if @set_str is %true and the caller is responsible for
1798 * freeing it.
1799 */
1800static int create_filter(struct ftrace_event_call *call,
1801 char *filter_str, bool set_str,
1802 struct event_filter **filterp)
1803{
1804 struct event_filter *filter = NULL;
1805 struct filter_parse_state *ps = NULL;
1806 int err;
1807
1808 err = create_filter_start(filter_str, set_str, &ps, &filter);
1809 if (!err) {
1810 err = replace_preds(call, filter, ps, filter_str, false);
1811 if (err && set_str)
1812 append_filter_err(ps, filter);
1813 }
1814 create_filter_finish(ps);
1815
1816 *filterp = filter;
1817 return err;
1818}
1819
1820/**
1821 * create_system_filter - create a filter for an event_subsystem
1822 * @system: event_subsystem to create a filter for
1823 * @filter_str: filter string
1824 * @filterp: out param for created filter (always updated on return)
1825 *
1826 * Identical to create_filter() except that it creates a subsystem filter
1827 * and always remembers @filter_str.
1828 */
1829static int create_system_filter(struct event_subsystem *system,
1830 char *filter_str, struct event_filter **filterp)
1831{
1832 struct event_filter *filter = NULL;
1833 struct filter_parse_state *ps = NULL;
1834 int err;
1835
1836 err = create_filter_start(filter_str, true, &ps, &filter);
1837 if (!err) {
1838 err = replace_system_preds(system, ps, filter_str);
1839 if (!err) {
1840 /* System filters just show a default message */
1841 kfree(filter->filter_string);
1842 filter->filter_string = NULL;
1843 } else {
1844 append_filter_err(ps, filter);
1845 }
1846 }
1847 create_filter_finish(ps);
1848
1849 *filterp = filter;
1850 return err;
1851}
1852
1735int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1853int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1736{ 1854{
1737 struct filter_parse_state *ps;
1738 struct event_filter *filter; 1855 struct event_filter *filter;
1739 struct event_filter *tmp;
1740 int err = 0; 1856 int err = 0;
1741 1857
1742 mutex_lock(&event_mutex); 1858 mutex_lock(&event_mutex);
@@ -1753,49 +1869,30 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1753 goto out_unlock; 1869 goto out_unlock;
1754 } 1870 }
1755 1871
1756 err = -ENOMEM; 1872 err = create_filter(call, filter_string, true, &filter);
1757 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1758 if (!ps)
1759 goto out_unlock;
1760
1761 filter = __alloc_filter();
1762 if (!filter) {
1763 kfree(ps);
1764 goto out_unlock;
1765 }
1766
1767 replace_filter_string(filter, filter_string);
1768
1769 parse_init(ps, filter_ops, filter_string);
1770 err = filter_parse(ps);
1771 if (err) {
1772 append_filter_err(ps, filter);
1773 goto out;
1774 }
1775 1873
1776 err = replace_preds(call, filter, ps, filter_string, false);
1777 if (err) {
1778 filter_disable(call);
1779 append_filter_err(ps, filter);
1780 } else
1781 call->flags |= TRACE_EVENT_FL_FILTERED;
1782out:
1783 /* 1874 /*
1784 * Always swap the call filter with the new filter 1875 * Always swap the call filter with the new filter
1785 * even if there was an error. If there was an error 1876 * even if there was an error. If there was an error
1786 * in the filter, we disable the filter and show the error 1877 * in the filter, we disable the filter and show the error
1787 * string 1878 * string
1788 */ 1879 */
1789 tmp = call->filter; 1880 if (filter) {
1790 rcu_assign_pointer(call->filter, filter); 1881 struct event_filter *tmp = call->filter;
1791 if (tmp) { 1882
1792 /* Make sure the call is done with the filter */ 1883 if (!err)
1793 synchronize_sched(); 1884 call->flags |= TRACE_EVENT_FL_FILTERED;
1794 __free_filter(tmp); 1885 else
1886 filter_disable(call);
1887
1888 rcu_assign_pointer(call->filter, filter);
1889
1890 if (tmp) {
1891 /* Make sure the call is done with the filter */
1892 synchronize_sched();
1893 __free_filter(tmp);
1894 }
1795 } 1895 }
1796 filter_opstack_clear(ps);
1797 postfix_clear(ps);
1798 kfree(ps);
1799out_unlock: 1896out_unlock:
1800 mutex_unlock(&event_mutex); 1897 mutex_unlock(&event_mutex);
1801 1898
@@ -1805,7 +1902,6 @@ out_unlock:
1805int apply_subsystem_event_filter(struct event_subsystem *system, 1902int apply_subsystem_event_filter(struct event_subsystem *system,
1806 char *filter_string) 1903 char *filter_string)
1807{ 1904{
1808 struct filter_parse_state *ps;
1809 struct event_filter *filter; 1905 struct event_filter *filter;
1810 int err = 0; 1906 int err = 0;
1811 1907
@@ -1829,38 +1925,15 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1829 goto out_unlock; 1925 goto out_unlock;
1830 } 1926 }
1831 1927
1832 err = -ENOMEM; 1928 err = create_system_filter(system, filter_string, &filter);
1833 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1929 if (filter) {
1834 if (!ps) 1930 /*
1835 goto out_unlock; 1931 * No event actually uses the system filter
1836 1932 * we can free it without synchronize_sched().
1837 filter = __alloc_filter(); 1933 */
1838 if (!filter) 1934 __free_filter(system->filter);
1839 goto out; 1935 system->filter = filter;
1840
1841 replace_filter_string(filter, filter_string);
1842 /*
1843 * No event actually uses the system filter
1844 * we can free it without synchronize_sched().
1845 */
1846 __free_filter(system->filter);
1847 system->filter = filter;
1848
1849 parse_init(ps, filter_ops, filter_string);
1850 err = filter_parse(ps);
1851 if (err) {
1852 append_filter_err(ps, system->filter);
1853 goto out;
1854 } 1936 }
1855
1856 err = replace_system_preds(system, ps, filter_string);
1857 if (err)
1858 append_filter_err(ps, system->filter);
1859
1860out:
1861 filter_opstack_clear(ps);
1862 postfix_clear(ps);
1863 kfree(ps);
1864out_unlock: 1937out_unlock:
1865 mutex_unlock(&event_mutex); 1938 mutex_unlock(&event_mutex);
1866 1939
@@ -1882,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1882{ 1955{
1883 int err; 1956 int err;
1884 struct event_filter *filter; 1957 struct event_filter *filter;
1885 struct filter_parse_state *ps;
1886 struct ftrace_event_call *call; 1958 struct ftrace_event_call *call;
1887 1959
1888 mutex_lock(&event_mutex); 1960 mutex_lock(&event_mutex);
@@ -1897,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1897 if (event->filter) 1969 if (event->filter)
1898 goto out_unlock; 1970 goto out_unlock;
1899 1971
1900 filter = __alloc_filter(); 1972 err = create_filter(call, filter_str, false, &filter);
1901 if (!filter) {
1902 err = PTR_ERR(filter);
1903 goto out_unlock;
1904 }
1905
1906 err = -ENOMEM;
1907 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1908 if (!ps)
1909 goto free_filter;
1910
1911 parse_init(ps, filter_ops, filter_str);
1912 err = filter_parse(ps);
1913 if (err)
1914 goto free_ps;
1915
1916 err = replace_preds(call, filter, ps, filter_str, false);
1917 if (!err) 1973 if (!err)
1918 event->filter = filter; 1974 event->filter = filter;
1919 1975 else
1920free_ps:
1921 filter_opstack_clear(ps);
1922 postfix_clear(ps);
1923 kfree(ps);
1924
1925free_filter:
1926 if (err)
1927 __free_filter(filter); 1976 __free_filter(filter);
1928 1977
1929out_unlock: 1978out_unlock:
@@ -1942,43 +1991,6 @@ out_unlock:
1942#define CREATE_TRACE_POINTS 1991#define CREATE_TRACE_POINTS
1943#include "trace_events_filter_test.h" 1992#include "trace_events_filter_test.h"
1944 1993
1945static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1946 struct event_filter **pfilter)
1947{
1948 struct event_filter *filter;
1949 struct filter_parse_state *ps;
1950 int err = -ENOMEM;
1951
1952 filter = __alloc_filter();
1953 if (!filter)
1954 goto out;
1955
1956 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1957 if (!ps)
1958 goto free_filter;
1959
1960 parse_init(ps, filter_ops, filter_str);
1961 err = filter_parse(ps);
1962 if (err)
1963 goto free_ps;
1964
1965 err = replace_preds(call, filter, ps, filter_str, false);
1966 if (!err)
1967 *pfilter = filter;
1968
1969 free_ps:
1970 filter_opstack_clear(ps);
1971 postfix_clear(ps);
1972 kfree(ps);
1973
1974 free_filter:
1975 if (err)
1976 __free_filter(filter);
1977
1978 out:
1979 return err;
1980}
1981
1982#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ 1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1983{ \ 1995{ \
1984 .filter = FILTER, \ 1996 .filter = FILTER, \
@@ -2097,12 +2109,13 @@ static __init int ftrace_test_event_filter(void)
2097 struct test_filter_data_t *d = &test_filter_data[i]; 2109 struct test_filter_data_t *d = &test_filter_data[i];
2098 int err; 2110 int err;
2099 2111
2100 err = test_get_filter(d->filter, &event_ftrace_test_filter, 2112 err = create_filter(&event_ftrace_test_filter, d->filter,
2101 &filter); 2113 false, &filter);
2102 if (err) { 2114 if (err) {
2103 printk(KERN_INFO 2115 printk(KERN_INFO
2104 "Failed to get filter for '%s', err %d\n", 2116 "Failed to get filter for '%s', err %d\n",
2105 d->filter, err); 2117 d->filter, err);
2118 __free_filter(filter);
2106 break; 2119 break;
2107 } 2120 }
2108 2121
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 20dad0d7a163..99d20e920368 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 283static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 284static void irqsoff_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void irqsoff_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void irqsoff_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6cf..0d6ff3555942 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
627 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t; 628 unsigned long secs = (unsigned long)t;
629 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
630 int ret;
630 631
631 trace_find_cmdline(entry->pid, comm); 632 trace_find_cmdline(entry->pid, comm);
632 633
633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", 634 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
634 comm, entry->pid, iter->cpu, secs, usec_rem); 635 comm, entry->pid, iter->cpu);
636 if (!ret)
637 return 0;
638
639 if (trace_flags & TRACE_ITER_IRQ_INFO) {
640 ret = trace_print_lat_fmt(s, entry);
641 if (!ret)
642 return 0;
643 }
644
645 return trace_seq_printf(s, " %5lu.%06lu: ",
646 secs, usec_rem);
635} 647}
636 648
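With irq-info enabled (now on by default), each trace line carries the latency-format flag block between the CPU field and the timestamp, matching the new header printed by print_func_help_header_irq(). An illustrative line with made-up values might look like:

	          <idle>-0     [003] d.h1  1057.726147: some_function <-parent_function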
637int trace_print_lat_context(struct trace_iterator *iter) 649int trace_print_lat_context(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b6..ff791ea48b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 283static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 284static void wakeup_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void wakeup_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void wakeup_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 77575b386d97..d4545f49242e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,9 @@
 #include <linux/sysctl.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+
+#include <asm/setup.h>
+
 #include "trace.h"
 
 #define STACK_TRACE_ENTRIES 500
@@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
 static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = stack_trace_call,
-	.flags = FTRACE_OPS_FL_GLOBAL,
 };
 
 static ssize_t
@@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = {
 	.release = seq_release,
 };
 
+static int
+stack_trace_filter_open(struct inode *inode, struct file *file)
+{
+	return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
+				 inode, file);
+}
+
+static const struct file_operations stack_trace_filter_fops = {
+	.open = stack_trace_filter_open,
+	.read = seq_read,
+	.write = ftrace_filter_write,
+	.llseek = ftrace_regex_lseek,
+	.release = ftrace_regex_release,
+};
+
 int
 stack_trace_sysctl(struct ctl_table *table, int write,
 		   void __user *buffer, size_t *lenp,
@@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
 	return ret;
 }
 
+static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
+
 static __init int enable_stacktrace(char *str)
 {
+	if (strncmp(str, "_filter=", 8) == 0)
+		strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+
 	stack_tracer_enabled = 1;
 	last_stack_tracer_enabled = 1;
 	return 1;
@@ -358,6 +380,12 @@ static __init int stack_trace_init(void)
 	trace_create_file("stack_trace", 0444, d_tracer,
 			NULL, &stack_trace_fops);
 
+	trace_create_file("stack_trace_filter", 0444, d_tracer,
+			NULL, &stack_trace_filter_fops);
+
+	if (stack_trace_filter_buf[0])
+		ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
+
 	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
 
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index db110b8ae030..f1539decd99d 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod)
 	int ret = 0;
 
 	/*
-	 * We skip modules that tain the kernel, especially those with different
-	 * module header (for forced load), to make sure we don't cause a crash.
+	 * We skip modules that taint the kernel, especially those with different
+	 * module headers (for forced load), to make sure we don't cause a crash.
+	 * Staging and out-of-tree GPL modules are fine.
 	 */
-	if (mod->taints)
+	if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
 		return 0;
 	mutex_lock(&tracepoints_mutex);
 	tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
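The new taint check treats module taints as a bitmask and masks out the two acceptable flavours (out-of-tree and staging/TAINT_CRAP) before deciding whether to skip the module, instead of rejecting every tainted module. A minimal sketch of that mask-and-compare test; the bit positions below are illustrative, not copied from the kernel headers.

#include <stdbool.h>
#include <stdio.h>

#define TAINT_PROPRIETARY_MODULE	0	/* illustrative bit numbers */
#define TAINT_CRAP			10
#define TAINT_OOT_MODULE		12

/* Accept only taints in the allowed set; anything else means skip the module. */
static bool tracepoint_module_acceptable(unsigned long taints)
{
	unsigned long allowed = (1UL << TAINT_OOT_MODULE) | (1UL << TAINT_CRAP);

	return (taints & ~allowed) == 0;
}

int main(void)
{
	printf("%d\n", tracepoint_module_acceptable(0));			/* 1 */
	printf("%d\n", tracepoint_module_acceptable(1UL << TAINT_OOT_MODULE));	/* 1 */
	printf("%d\n", tracepoint_module_acceptable(1UL << TAINT_PROPRIETARY_MODULE)); /* 0 */
	return 0;
}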
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
 
 		local_irq_save(flags);
 		time = tsk->stime + tsk->utime;
-		dtime = cputime_sub(time, tsk->acct_timexpd);
+		dtime = time - tsk->acct_timexpd;
 		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
 		delta = value.tv_sec;
 		delta = delta * USEC_PER_SEC + value.tv_usec;
diff --git a/kernel/wait.c b/kernel/wait.c
index 26fa7797f90f..7fdd9eaca2c3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,10 +10,10 @@
 #include <linux/wait.h>
 #include <linux/hash.h>
 
-void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
+void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
 {
 	spin_lock_init(&q->lock);
-	lockdep_set_class(&q->lock, key);
+	lockdep_set_class_and_name(&q->lock, key, name);
 	INIT_LIST_HEAD(&q->task_list);
 }
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a810..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
 
 	int			nr_drainers;	/* W: drain in progress */
 	int			saved_max_active; /* W: saved cwq max_active */
-	const char		*name;		/* I: workqueue name */
 #ifdef CONFIG_LOCKDEP
 	struct lockdep_map	lockdep_map;
 #endif
+	char			name[];		/* I: workqueue name */
 };
 
 struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
 	return clamp_val(max_active, 1, lim);
 }
 
-struct workqueue_struct *__alloc_workqueue_key(const char *name,
+struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 					       unsigned int flags,
 					       int max_active,
 					       struct lock_class_key *key,
-					       const char *lock_name)
+					       const char *lock_name, ...)
 {
+	va_list args, args1;
 	struct workqueue_struct *wq;
 	unsigned int cpu;
+	size_t namelen;
+
+	/* determine namelen, allocate wq and format name */
+	va_start(args, lock_name);
+	va_copy(args1, args);
+	namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+
+	wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
+	if (!wq)
+		goto err;
+
+	vsnprintf(wq->name, namelen, fmt, args1);
+	va_end(args);
+	va_end(args1);
 
 	/*
 	 * Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	flags |= WQ_HIGHPRI;
 
 	max_active = max_active ?: WQ_DFL_ACTIVE;
-	max_active = wq_clamp_max_active(max_active, flags, name);
-
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
-	if (!wq)
-		goto err;
+	max_active = wq_clamp_max_active(max_active, flags, wq->name);
 
+	/* init wq */
 	wq->flags = flags;
 	wq->saved_max_active = max_active;
 	mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	INIT_LIST_HEAD(&wq->flusher_queue);
 	INIT_LIST_HEAD(&wq->flusher_overflow);
 
-	wq->name = name;
 	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
 	INIT_LIST_HEAD(&wq->list);
 
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 		if (!rescuer)
 			goto err;
 
-		rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+		rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+					       wq->name);
 		if (IS_ERR(rescuer->task))
 			goto err;
 
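The __alloc_workqueue_key() changes above replace the borrowed name pointer with a name formatted into a flexible array member at the end of workqueue_struct, sized with a probing vsnprintf(NULL, 0, ...) call before the allocation. A minimal userspace sketch of that size-then-allocate-then-format idiom; the struct and function names here are illustrative, not the kernel's.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct named_obj {
	int	flags;
	char	name[];			/* flexible array member, like wq->name */
};

static struct named_obj *alloc_named_obj(int flags, const char *fmt, ...)
{
	va_list args, args1;
	struct named_obj *obj;
	size_t namelen;

	/* First pass: measure the formatted name (plus the trailing NUL). */
	va_start(args, fmt);
	va_copy(args1, args);
	namelen = vsnprintf(NULL, 0, fmt, args) + 1;
	va_end(args);

	/* Single allocation covers the struct and its name. */
	obj = calloc(1, sizeof(*obj) + namelen);
	if (!obj) {
		va_end(args1);
		return NULL;
	}

	/* Second pass: format the name in place. */
	vsnprintf(obj->name, namelen, fmt, args1);
	va_end(args1);

	obj->flags = flags;
	return obj;
}

int main(void)
{
	struct named_obj *obj = alloc_named_obj(0, "kworker/%d", 3);

	if (obj) {
		printf("%s\n", obj->name);	/* kworker/3 */
		free(obj);
	}
	return 0;
}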