aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile23
-rw-r--r--kernel/acct.c46
-rw-r--r--kernel/async.c2
-rw-r--r--kernel/audit.c13
-rw-r--r--kernel/audit.h6
-rw-r--r--kernel/auditfilter.c17
-rw-r--r--kernel/auditsc.c754
-rw-r--r--kernel/capability.c80
-rw-r--r--kernel/cgroup.c428
-rw-r--r--kernel/cgroup_freezer.c86
-rw-r--r--kernel/cpu.c8
-rw-r--r--kernel/cpuset.c134
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/callchain.c189
-rw-r--r--kernel/events/core.c473
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/events/internal.h42
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/exit.c62
-rw-r--r--kernel/fork.c65
-rw-r--r--kernel/freezer.c203
-rw-r--r--kernel/futex.c28
-rw-r--r--kernel/hrtimer.c6
-rw-r--r--kernel/hung_task.c14
-rw-r--r--kernel/irq/autoprobe.c4
-rw-r--r--kernel/irq/chip.c42
-rw-r--r--kernel/irq/internals.h4
-rw-r--r--kernel/irq/irqdomain.c15
-rw-r--r--kernel/irq/manage.c11
-rw-r--r--kernel/irq/spurious.c6
-rw-r--r--kernel/itimer.c15
-rw-r--r--kernel/jump_label.c54
-rw-r--r--kernel/kexec.c29
-rw-r--r--kernel/kmod.c27
-rw-r--r--kernel/kprobes.c10
-rw-r--r--kernel/kthread.c27
-rw-r--r--kernel/lockdep.c91
-rw-r--r--kernel/module.c205
-rw-r--r--kernel/panic.c43
-rw-r--r--kernel/params.c41
-rw-r--r--kernel/pid.c8
-rw-r--r--kernel/pid_namespace.c31
-rw-r--r--kernel/posix-cpu-timers.c132
-rw-r--r--kernel/power/hibernate.c104
-rw-r--r--kernel/power/main.c10
-rw-r--r--kernel/power/power.h26
-rw-r--r--kernel/power/process.c93
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/suspend.c12
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/power/user.c195
-rw-r--r--kernel/printk.c24
-rw-r--r--kernel/ptrace.c27
-rw-r--r--kernel/rcu.h7
-rw-r--r--kernel/rcupdate.c12
-rw-r--r--kernel/rcutiny.c149
-rw-r--r--kernel/rcutiny_plugin.h29
-rw-r--r--kernel/rcutorture.c229
-rw-r--r--kernel/rcutree.c290
-rw-r--r--kernel/rcutree.h26
-rw-r--r--kernel/rcutree_plugin.h289
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/relay.c12
-rw-r--r--kernel/res_counter.c28
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c37
-rw-r--r--kernel/rtmutex.c8
-rw-r--r--kernel/sched/Makefile20
-rw-r--r--kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c)33
-rw-r--r--kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h)26
-rw-r--r--kernel/sched/clock.c (renamed from kernel/sched_clock.c)0
-rw-r--r--kernel/sched/core.c (renamed from kernel/sched.c)2306
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)7
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)0
-rw-r--r--kernel/sched/debug.c (renamed from kernel/sched_debug.c)6
-rw-r--r--kernel/sched/fair.c (renamed from kernel/sched_fair.c)1155
-rw-r--r--kernel/sched/features.h (renamed from kernel/sched_features.h)29
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)4
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)226
-rw-r--r--kernel/sched/sched.h1166
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h (renamed from kernel/sched_stats.h)109
-rw-r--r--kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c)4
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c82
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c127
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/clocksource.c111
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-sched.c105
-rw-r--r--kernel/time/timekeeping.c94
-rw-r--r--kernel/timer.c64
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/ftrace.c720
-rw-r--r--kernel/trace/trace.c108
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_events_filter.c294
-rw-r--r--kernel/trace/trace_irqsoff.c13
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/trace/trace_sched_wakeup.c13
-rw-r--r--kernel/trace/trace_stack.c30
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/wait.c4
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c32
114 files changed, 7422 insertions, 4767 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..2d9de86b7e76 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,12 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26obj-y += power/
27
27obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 29obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 30obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
52obj-$(CONFIG_UID16) += uid16.o 53obj-$(CONFIG_UID16) += uid16.o
53obj-$(CONFIG_MODULES) += module.o 54obj-$(CONFIG_MODULES) += module.o
54obj-$(CONFIG_KALLSYMS) += kallsyms.o 55obj-$(CONFIG_KALLSYMS) += kallsyms.o
55obj-$(CONFIG_PM) += power/
56obj-$(CONFIG_FREEZER) += power/
57obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 56obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
58obj-$(CONFIG_KEXEC) += kexec.o 57obj-$(CONFIG_KEXEC) += kexec.o
59obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -99,7 +98,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 98obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 99obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 100obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 101obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o 102obj-$(CONFIG_CPU_PM) += cpu_pm.o
105 103
@@ -110,15 +108,6 @@ obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 108obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o 109obj-$(CONFIG_JUMP_LABEL) += jump_label.o
112 110
113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
115# needed for x86 only. Why this used to be enabled for all architectures is beyond
116# me. I suspect most platforms don't need this, but until we know that for sure
117# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
118# to get a correct value for the wait-channel (WCHAN in ps). --davidm
119CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
120endif
121
122$(obj)/configs.o: $(obj)/config_data.h 111$(obj)/configs.o: $(obj)/config_data.h
123 112
124# config_data.h contains the same information as ikconfig.h but gzipped. 113# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..02e6167a53b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
84 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
85 */ 85 */
86struct bsd_acct_struct { 86struct bsd_acct_struct {
87 volatile int active; 87 int active;
88 volatile int needcheck; 88 unsigned long needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer;
92 struct list_head list; 91 struct list_head list;
93}; 92};
94 93
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
96static LIST_HEAD(acct_list); 95static LIST_HEAD(acct_list);
97 96
98/* 97/*
99 * Called whenever the timer says to check the free space.
100 */
101static void acct_timeout(unsigned long x)
102{
103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
105}
106
107/*
108 * Check the amount of free space and suspend/resume accordingly. 98 * Check the amount of free space and suspend/resume accordingly.
109 */ 99 */
110static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 100static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
112 struct kstatfs sbuf; 102 struct kstatfs sbuf;
113 int res; 103 int res;
114 int act; 104 int act;
115 sector_t resume; 105 u64 resume;
116 sector_t suspend; 106 u64 suspend;
117 107
118 spin_lock(&acct_lock); 108 spin_lock(&acct_lock);
119 res = acct->active; 109 res = acct->active;
120 if (!file || !acct->needcheck) 110 if (!file || time_is_before_jiffies(acct->needcheck))
121 goto out; 111 goto out;
122 spin_unlock(&acct_lock); 112 spin_unlock(&acct_lock);
123 113
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
127 suspend = sbuf.f_blocks * SUSPEND; 117 suspend = sbuf.f_blocks * SUSPEND;
128 resume = sbuf.f_blocks * RESUME; 118 resume = sbuf.f_blocks * RESUME;
129 119
130 sector_div(suspend, 100); 120 do_div(suspend, 100);
131 sector_div(resume, 100); 121 do_div(resume, 100);
132 122
133 if (sbuf.f_bavail <= suspend) 123 if (sbuf.f_bavail <= suspend)
134 act = -1; 124 act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
160 } 150 }
161 } 151 }
162 152
163 del_timer(&acct->timer); 153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
164 acct->needcheck = 0;
165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
166 add_timer(&acct->timer);
167 res = acct->active; 154 res = acct->active;
168out: 155out:
169 spin_unlock(&acct_lock); 156 spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
185 if (acct->file) { 172 if (acct->file) {
186 old_acct = acct->file; 173 old_acct = acct->file;
187 old_ns = acct->ns; 174 old_ns = acct->ns;
188 del_timer(&acct->timer);
189 acct->active = 0; 175 acct->active = 0;
190 acct->needcheck = 0;
191 acct->file = NULL; 176 acct->file = NULL;
192 acct->ns = NULL; 177 acct->ns = NULL;
193 list_del(&acct->list); 178 list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
195 if (file) { 180 if (file) {
196 acct->file = file; 181 acct->file = file;
197 acct->ns = ns; 182 acct->ns = ns;
198 acct->needcheck = 0; 183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
199 acct->active = 1; 184 acct->active = 1;
200 list_add(&acct->list, &acct_list); 185 list_add(&acct->list, &acct_list);
201 /* It's been deleted if it was used before so this is safe */
202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
204 add_timer(&acct->timer);
205 } 186 }
206 if (old_acct) { 187 if (old_acct) {
207 mnt_unpin(old_acct->f_path.mnt); 188 mnt_unpin(old_acct->f_path.mnt);
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
334 spin_lock(&acct_lock); 315 spin_lock(&acct_lock);
335restart: 316restart:
336 list_for_each_entry(acct, &acct_list, list) 317 list_for_each_entry(acct, &acct_list, list)
337 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { 318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
338 acct_file_reopen(acct, NULL, NULL); 319 acct_file_reopen(acct, NULL, NULL);
339 goto restart; 320 goto restart;
340 } 321 }
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
348 if (acct == NULL) 329 if (acct == NULL)
349 return; 330 return;
350 331
351 del_timer_sync(&acct->timer);
352 spin_lock(&acct_lock); 332 spin_lock(&acct_lock);
353 if (acct->file != NULL) 333 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL); 334 acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
498 * Fill the accounting struct with the needed info as recorded 478 * Fill the accounting struct with the needed info as recorded
499 * by the different kernel functions. 479 * by the different kernel functions.
500 */ 480 */
501 memset((caddr_t)&ac, 0, sizeof(acct_t)); 481 memset(&ac, 0, sizeof(acct_t));
502 482
503 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 483 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
504 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 484 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 593 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 594 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 595 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 596 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 597 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 598 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 599 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 600 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 80b74b88fefe..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
78 78
79static atomic_t entry_count; 79static atomic_t entry_count;
80 80
81extern int initcall_debug;
82
83 81
84/* 82/*
85 * MUST be called with the lock held! 83 * MUST be called with the lock held!
diff --git a/kernel/audit.c b/kernel/audit.c
index 09fae2677a45..bb0eb5bb9a0a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
601 case AUDIT_TTY_SET: 601 case AUDIT_TTY_SET:
602 case AUDIT_TRIM: 602 case AUDIT_TRIM:
603 case AUDIT_MAKE_EQUIV: 603 case AUDIT_MAKE_EQUIV:
604 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL)) 604 if (!capable(CAP_AUDIT_CONTROL))
605 err = -EPERM; 605 err = -EPERM;
606 break; 606 break;
607 case AUDIT_USER: 607 case AUDIT_USER:
608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: 608 case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: 609 case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
610 if (security_netlink_recv(skb, CAP_AUDIT_WRITE)) 610 if (!capable(CAP_AUDIT_WRITE))
611 err = -EPERM; 611 err = -EPERM;
612 break; 612 break;
613 default: /* bad msg */ 613 default: /* bad msg */
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
631 } 631 }
632 632
633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 633 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
634 audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u", 634 audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
635 pid, uid, auid, ses); 635 pid, uid, auid, ses);
636 if (sid) { 636 if (sid) {
637 rc = security_secid_to_secctx(sid, &ctx, &len); 637 rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1260 avail = audit_expand(ab, 1260 avail = audit_expand(ab,
1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); 1261 max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
1262 if (!avail) 1262 if (!avail)
1263 goto out; 1263 goto out_va_end;
1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1264 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1265 } 1265 }
1266 va_end(args2);
1267 if (len > 0) 1266 if (len > 0)
1268 skb_put(skb, len); 1267 skb_put(skb, len);
1268out_va_end:
1269 va_end(args2);
1269out: 1270out:
1270 return; 1271 return;
1271} 1272}
@@ -1422,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1422 char *p, *pathname; 1423 char *p, *pathname;
1423 1424
1424 if (prefix) 1425 if (prefix)
1425 audit_log_format(ab, " %s", prefix); 1426 audit_log_format(ab, "%s", prefix);
1426 1427
1427 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1428 /* We will allow 11 spaces for ' (deleted)' to be appended */
1428 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1429 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
diff --git a/kernel/audit.h b/kernel/audit.h
index 91e7071c4d2c..816766803371 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,12 +36,8 @@ enum audit_state {
36 AUDIT_DISABLED, /* Do not create per-task audit_context. 36 AUDIT_DISABLED, /* Do not create per-task audit_context.
37 * No syscall-specific audit records can 37 * No syscall-specific audit records can
38 * be generated. */ 38 * be generated. */
39 AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
40 * but don't necessarily fill it in at
41 * syscall entry time (i.e., filter
42 * instead). */
43 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, 39 AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
44 * and always fill it in at syscall 40 * and fill it in at syscall
45 * entry time. This makes a full 41 * entry time. This makes a full
46 * syscall record available if some 42 * syscall record available if some
47 * other part of the kernel decides it 43 * other part of the kernel decides it
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f8277c80d678..a6c3f1abd206 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
235 switch(listnr) { 235 switch(listnr) {
236 default: 236 default:
237 goto exit_err; 237 goto exit_err;
238 case AUDIT_FILTER_USER:
239 case AUDIT_FILTER_TYPE:
240#ifdef CONFIG_AUDITSYSCALL 238#ifdef CONFIG_AUDITSYSCALL
241 case AUDIT_FILTER_ENTRY: 239 case AUDIT_FILTER_ENTRY:
240 if (rule->action == AUDIT_ALWAYS)
241 goto exit_err;
242 case AUDIT_FILTER_EXIT: 242 case AUDIT_FILTER_EXIT:
243 case AUDIT_FILTER_TASK: 243 case AUDIT_FILTER_TASK:
244#endif 244#endif
245 case AUDIT_FILTER_USER:
246 case AUDIT_FILTER_TYPE:
245 ; 247 ;
246 } 248 }
247 if (unlikely(rule->action == AUDIT_POSSIBLE)) { 249 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
385 goto exit_free; 387 goto exit_free;
386 break; 388 break;
387 case AUDIT_FILETYPE: 389 case AUDIT_FILETYPE:
388 if ((f->val & ~S_IFMT) > S_IFMT) 390 if (f->val & ~S_IFMT)
389 goto exit_free; 391 goto exit_free;
390 break; 392 break;
391 case AUDIT_INODE: 393 case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
459 case AUDIT_ARG1: 461 case AUDIT_ARG1:
460 case AUDIT_ARG2: 462 case AUDIT_ARG2:
461 case AUDIT_ARG3: 463 case AUDIT_ARG3:
464 case AUDIT_OBJ_UID:
465 case AUDIT_OBJ_GID:
462 break; 466 break;
463 case AUDIT_ARCH: 467 case AUDIT_ARCH:
464 entry->rule.arch_f = f; 468 entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
522 goto exit_free; 526 goto exit_free;
523 break; 527 break;
524 case AUDIT_FILTERKEY: 528 case AUDIT_FILTERKEY:
525 err = -EINVAL;
526 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) 529 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
527 goto exit_free; 530 goto exit_free;
528 str = audit_unpack_string(&bufp, &remain, f->val); 531 str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
536 goto exit_free; 539 goto exit_free;
537 break; 540 break;
538 case AUDIT_FILETYPE: 541 case AUDIT_FILETYPE:
539 if ((f->val & ~S_IFMT) > S_IFMT) 542 if (f->val & ~S_IFMT)
543 goto exit_free;
544 break;
545 case AUDIT_FIELD_COMPARE:
546 if (f->val > AUDIT_MAX_FIELD_COMPARE)
540 goto exit_free; 547 goto exit_free;
541 break; 548 break;
542 default: 549 default:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47b7fc1ea893..af1de0f34eae 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,9 +70,15 @@
70 70
71#include "audit.h" 71#include "audit.h"
72 72
73/* flags stating the success for a syscall */
74#define AUDITSC_INVALID 0
75#define AUDITSC_SUCCESS 1
76#define AUDITSC_FAILURE 2
77
73/* AUDIT_NAMES is the number of slots we reserve in the audit_context 78/* AUDIT_NAMES is the number of slots we reserve in the audit_context
74 * for saving names from getname(). */ 79 * for saving names from getname(). If we get more names we will allocate
75#define AUDIT_NAMES 20 80 * a name dynamically and also add those to the list anchored by names_list. */
81#define AUDIT_NAMES 5
76 82
77/* Indicates that audit should log the full pathname. */ 83/* Indicates that audit should log the full pathname. */
78#define AUDIT_NAME_FULL -1 84#define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
101 * 107 *
102 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 108 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
103struct audit_names { 109struct audit_names {
110 struct list_head list; /* audit_context->names_list */
104 const char *name; 111 const char *name;
105 int name_len; /* number of name's characters to log */
106 unsigned name_put; /* call __putname() for this name */
107 unsigned long ino; 112 unsigned long ino;
108 dev_t dev; 113 dev_t dev;
109 umode_t mode; 114 umode_t mode;
@@ -113,6 +118,14 @@ struct audit_names {
113 u32 osid; 118 u32 osid;
114 struct audit_cap_data fcap; 119 struct audit_cap_data fcap;
115 unsigned int fcap_ver; 120 unsigned int fcap_ver;
121 int name_len; /* number of name's characters to log */
122 bool name_put; /* call __putname() for this name */
123 /*
124 * This was an allocated audit_names and not from the array of
125 * names allocated in the task audit context. Thus this name
126 * should be freed on syscall exit
127 */
128 bool should_free;
116}; 129};
117 130
118struct audit_aux_data { 131struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
174 long return_code;/* syscall return code */ 187 long return_code;/* syscall return code */
175 u64 prio; 188 u64 prio;
176 int return_valid; /* return code is valid */ 189 int return_valid; /* return code is valid */
177 int name_count; 190 /*
178 struct audit_names names[AUDIT_NAMES]; 191 * The names_list is the list of all audit_names collected during this
192 * syscall. The first AUDIT_NAMES entries in the names_list will
193 * actually be from the preallocated_names array for performance
194 * reasons. Except during allocation they should never be referenced
195 * through the preallocated_names array and should only be found/used
196 * by running the names_list.
197 */
198 struct audit_names preallocated_names[AUDIT_NAMES];
199 int name_count; /* total records in names_list */
200 struct list_head names_list; /* anchor for struct audit_names->list */
179 char * filterkey; /* key for rule that triggered record */ 201 char * filterkey; /* key for rule that triggered record */
180 struct path pwd; 202 struct path pwd;
181 struct audit_context *previous; /* For nested syscalls */ 203 struct audit_context *previous; /* For nested syscalls */
@@ -210,12 +232,12 @@ struct audit_context {
210 struct { 232 struct {
211 uid_t uid; 233 uid_t uid;
212 gid_t gid; 234 gid_t gid;
213 mode_t mode; 235 umode_t mode;
214 u32 osid; 236 u32 osid;
215 int has_perm; 237 int has_perm;
216 uid_t perm_uid; 238 uid_t perm_uid;
217 gid_t perm_gid; 239 gid_t perm_gid;
218 mode_t perm_mode; 240 umode_t perm_mode;
219 unsigned long qbytes; 241 unsigned long qbytes;
220 } ipc; 242 } ipc;
221 struct { 243 struct {
@@ -234,7 +256,7 @@ struct audit_context {
234 } mq_sendrecv; 256 } mq_sendrecv;
235 struct { 257 struct {
236 int oflag; 258 int oflag;
237 mode_t mode; 259 umode_t mode;
238 struct mq_attr attr; 260 struct mq_attr attr;
239 } mq_open; 261 } mq_open;
240 struct { 262 struct {
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
305 } 327 }
306} 328}
307 329
308static int audit_match_filetype(struct audit_context *ctx, int which) 330static int audit_match_filetype(struct audit_context *ctx, int val)
309{ 331{
310 unsigned index = which & ~S_IFMT; 332 struct audit_names *n;
311 mode_t mode = which & S_IFMT; 333 umode_t mode = (umode_t)val;
312 334
313 if (unlikely(!ctx)) 335 if (unlikely(!ctx))
314 return 0; 336 return 0;
315 337
316 if (index >= ctx->name_count) 338 list_for_each_entry(n, &ctx->names_list, list) {
317 return 0; 339 if ((n->ino != -1) &&
318 if (ctx->names[index].ino == -1) 340 ((n->mode & S_IFMT) == mode))
319 return 0; 341 return 1;
320 if ((ctx->names[index].mode ^ mode) & S_IFMT) 342 }
321 return 0; 343
322 return 1; 344 return 0;
323} 345}
324 346
325/* 347/*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
441 return 0; 463 return 0;
442} 464}
443 465
466static int audit_compare_id(uid_t uid1,
467 struct audit_names *name,
468 unsigned long name_offset,
469 struct audit_field *f,
470 struct audit_context *ctx)
471{
472 struct audit_names *n;
473 unsigned long addr;
474 uid_t uid2;
475 int rc;
476
477 BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
478
479 if (name) {
480 addr = (unsigned long)name;
481 addr += name_offset;
482
483 uid2 = *(uid_t *)addr;
484 rc = audit_comparator(uid1, f->op, uid2);
485 if (rc)
486 return rc;
487 }
488
489 if (ctx) {
490 list_for_each_entry(n, &ctx->names_list, list) {
491 addr = (unsigned long)n;
492 addr += name_offset;
493
494 uid2 = *(uid_t *)addr;
495
496 rc = audit_comparator(uid1, f->op, uid2);
497 if (rc)
498 return rc;
499 }
500 }
501 return 0;
502}
503
504static int audit_field_compare(struct task_struct *tsk,
505 const struct cred *cred,
506 struct audit_field *f,
507 struct audit_context *ctx,
508 struct audit_names *name)
509{
510 switch (f->val) {
511 /* process to file object comparisons */
512 case AUDIT_COMPARE_UID_TO_OBJ_UID:
513 return audit_compare_id(cred->uid,
514 name, offsetof(struct audit_names, uid),
515 f, ctx);
516 case AUDIT_COMPARE_GID_TO_OBJ_GID:
517 return audit_compare_id(cred->gid,
518 name, offsetof(struct audit_names, gid),
519 f, ctx);
520 case AUDIT_COMPARE_EUID_TO_OBJ_UID:
521 return audit_compare_id(cred->euid,
522 name, offsetof(struct audit_names, uid),
523 f, ctx);
524 case AUDIT_COMPARE_EGID_TO_OBJ_GID:
525 return audit_compare_id(cred->egid,
526 name, offsetof(struct audit_names, gid),
527 f, ctx);
528 case AUDIT_COMPARE_AUID_TO_OBJ_UID:
529 return audit_compare_id(tsk->loginuid,
530 name, offsetof(struct audit_names, uid),
531 f, ctx);
532 case AUDIT_COMPARE_SUID_TO_OBJ_UID:
533 return audit_compare_id(cred->suid,
534 name, offsetof(struct audit_names, uid),
535 f, ctx);
536 case AUDIT_COMPARE_SGID_TO_OBJ_GID:
537 return audit_compare_id(cred->sgid,
538 name, offsetof(struct audit_names, gid),
539 f, ctx);
540 case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
541 return audit_compare_id(cred->fsuid,
542 name, offsetof(struct audit_names, uid),
543 f, ctx);
544 case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
545 return audit_compare_id(cred->fsgid,
546 name, offsetof(struct audit_names, gid),
547 f, ctx);
548 /* uid comparisons */
549 case AUDIT_COMPARE_UID_TO_AUID:
550 return audit_comparator(cred->uid, f->op, tsk->loginuid);
551 case AUDIT_COMPARE_UID_TO_EUID:
552 return audit_comparator(cred->uid, f->op, cred->euid);
553 case AUDIT_COMPARE_UID_TO_SUID:
554 return audit_comparator(cred->uid, f->op, cred->suid);
555 case AUDIT_COMPARE_UID_TO_FSUID:
556 return audit_comparator(cred->uid, f->op, cred->fsuid);
557 /* auid comparisons */
558 case AUDIT_COMPARE_AUID_TO_EUID:
559 return audit_comparator(tsk->loginuid, f->op, cred->euid);
560 case AUDIT_COMPARE_AUID_TO_SUID:
561 return audit_comparator(tsk->loginuid, f->op, cred->suid);
562 case AUDIT_COMPARE_AUID_TO_FSUID:
563 return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
564 /* euid comparisons */
565 case AUDIT_COMPARE_EUID_TO_SUID:
566 return audit_comparator(cred->euid, f->op, cred->suid);
567 case AUDIT_COMPARE_EUID_TO_FSUID:
568 return audit_comparator(cred->euid, f->op, cred->fsuid);
569 /* suid comparisons */
570 case AUDIT_COMPARE_SUID_TO_FSUID:
571 return audit_comparator(cred->suid, f->op, cred->fsuid);
572 /* gid comparisons */
573 case AUDIT_COMPARE_GID_TO_EGID:
574 return audit_comparator(cred->gid, f->op, cred->egid);
575 case AUDIT_COMPARE_GID_TO_SGID:
576 return audit_comparator(cred->gid, f->op, cred->sgid);
577 case AUDIT_COMPARE_GID_TO_FSGID:
578 return audit_comparator(cred->gid, f->op, cred->fsgid);
579 /* egid comparisons */
580 case AUDIT_COMPARE_EGID_TO_SGID:
581 return audit_comparator(cred->egid, f->op, cred->sgid);
582 case AUDIT_COMPARE_EGID_TO_FSGID:
583 return audit_comparator(cred->egid, f->op, cred->fsgid);
584 /* sgid comparison */
585 case AUDIT_COMPARE_SGID_TO_FSGID:
586 return audit_comparator(cred->sgid, f->op, cred->fsgid);
587 default:
588 WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
589 return 0;
590 }
591 return 0;
592}
593
444/* Determine if any context name data matches a rule's watch data */ 594/* Determine if any context name data matches a rule's watch data */
445/* Compare a task_struct with an audit_rule. Return 1 on match, 0 595/* Compare a task_struct with an audit_rule. Return 1 on match, 0
446 * otherwise. 596 * otherwise.
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
457 bool task_creation) 607 bool task_creation)
458{ 608{
459 const struct cred *cred; 609 const struct cred *cred;
460 int i, j, need_sid = 1; 610 int i, need_sid = 1;
461 u32 sid; 611 u32 sid;
462 612
463 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); 613 cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
464 614
465 for (i = 0; i < rule->field_count; i++) { 615 for (i = 0; i < rule->field_count; i++) {
466 struct audit_field *f = &rule->fields[i]; 616 struct audit_field *f = &rule->fields[i];
617 struct audit_names *n;
467 int result = 0; 618 int result = 0;
468 619
469 switch (f->type) { 620 switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
522 } 673 }
523 break; 674 break;
524 case AUDIT_DEVMAJOR: 675 case AUDIT_DEVMAJOR:
525 if (name) 676 if (name) {
526 result = audit_comparator(MAJOR(name->dev), 677 if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
527 f->op, f->val); 678 audit_comparator(MAJOR(name->rdev), f->op, f->val))
528 else if (ctx) { 679 ++result;
529 for (j = 0; j < ctx->name_count; j++) { 680 } else if (ctx) {
530 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 681 list_for_each_entry(n, &ctx->names_list, list) {
682 if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
683 audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
531 ++result; 684 ++result;
532 break; 685 break;
533 } 686 }
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
535 } 688 }
536 break; 689 break;
537 case AUDIT_DEVMINOR: 690 case AUDIT_DEVMINOR:
538 if (name) 691 if (name) {
539 result = audit_comparator(MINOR(name->dev), 692 if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
540 f->op, f->val); 693 audit_comparator(MINOR(name->rdev), f->op, f->val))
541 else if (ctx) { 694 ++result;
542 for (j = 0; j < ctx->name_count; j++) { 695 } else if (ctx) {
543 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 696 list_for_each_entry(n, &ctx->names_list, list) {
697 if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
698 audit_comparator(MINOR(n->rdev), f->op, f->val)) {
544 ++result; 699 ++result;
545 break; 700 break;
546 } 701 }
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
551 if (name) 706 if (name)
552 result = (name->ino == f->val); 707 result = (name->ino == f->val);
553 else if (ctx) { 708 else if (ctx) {
554 for (j = 0; j < ctx->name_count; j++) { 709 list_for_each_entry(n, &ctx->names_list, list) {
555 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { 710 if (audit_comparator(n->ino, f->op, f->val)) {
711 ++result;
712 break;
713 }
714 }
715 }
716 break;
717 case AUDIT_OBJ_UID:
718 if (name) {
719 result = audit_comparator(name->uid, f->op, f->val);
720 } else if (ctx) {
721 list_for_each_entry(n, &ctx->names_list, list) {
722 if (audit_comparator(n->uid, f->op, f->val)) {
723 ++result;
724 break;
725 }
726 }
727 }
728 break;
729 case AUDIT_OBJ_GID:
730 if (name) {
731 result = audit_comparator(name->gid, f->op, f->val);
732 } else if (ctx) {
733 list_for_each_entry(n, &ctx->names_list, list) {
734 if (audit_comparator(n->gid, f->op, f->val)) {
556 ++result; 735 ++result;
557 break; 736 break;
558 } 737 }
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
607 name->osid, f->type, f->op, 786 name->osid, f->type, f->op,
608 f->lsm_rule, ctx); 787 f->lsm_rule, ctx);
609 } else if (ctx) { 788 } else if (ctx) {
610 for (j = 0; j < ctx->name_count; j++) { 789 list_for_each_entry(n, &ctx->names_list, list) {
611 if (security_audit_rule_match( 790 if (security_audit_rule_match(n->osid, f->type,
612 ctx->names[j].osid, 791 f->op, f->lsm_rule,
613 f->type, f->op, 792 ctx)) {
614 f->lsm_rule, ctx)) {
615 ++result; 793 ++result;
616 break; 794 break;
617 } 795 }
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
643 case AUDIT_FILETYPE: 821 case AUDIT_FILETYPE:
644 result = audit_match_filetype(ctx, f->val); 822 result = audit_match_filetype(ctx, f->val);
645 break; 823 break;
824 case AUDIT_FIELD_COMPARE:
825 result = audit_field_compare(tsk, cred, f, ctx, name);
826 break;
646 } 827 }
647
648 if (!result) 828 if (!result)
649 return 0; 829 return 0;
650 } 830 }
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
722 return AUDIT_BUILD_CONTEXT; 902 return AUDIT_BUILD_CONTEXT;
723} 903}
724 904
725/* At syscall exit time, this filter is called if any audit_names[] have been 905/*
906 * Given an audit_name check the inode hash table to see if they match.
907 * Called holding the rcu read lock to protect the use of audit_inode_hash
908 */
909static int audit_filter_inode_name(struct task_struct *tsk,
910 struct audit_names *n,
911 struct audit_context *ctx) {
912 int word, bit;
913 int h = audit_hash_ino((u32)n->ino);
914 struct list_head *list = &audit_inode_hash[h];
915 struct audit_entry *e;
916 enum audit_state state;
917
918 word = AUDIT_WORD(ctx->major);
919 bit = AUDIT_BIT(ctx->major);
920
921 if (list_empty(list))
922 return 0;
923
924 list_for_each_entry_rcu(e, list, list) {
925 if ((e->rule.mask[word] & bit) == bit &&
926 audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
927 ctx->current_state = state;
928 return 1;
929 }
930 }
931
932 return 0;
933}
934
935/* At syscall exit time, this filter is called if any audit_names have been
726 * collected during syscall processing. We only check rules in sublists at hash 936 * collected during syscall processing. We only check rules in sublists at hash
727 * buckets applicable to the inode numbers in audit_names[]. 937 * buckets applicable to the inode numbers in audit_names.
728 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 938 * Regarding audit_state, same rules apply as for audit_filter_syscall().
729 */ 939 */
730void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) 940void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
731{ 941{
732 int i; 942 struct audit_names *n;
733 struct audit_entry *e;
734 enum audit_state state;
735 943
736 if (audit_pid && tsk->tgid == audit_pid) 944 if (audit_pid && tsk->tgid == audit_pid)
737 return; 945 return;
738 946
739 rcu_read_lock(); 947 rcu_read_lock();
740 for (i = 0; i < ctx->name_count; i++) {
741 int word = AUDIT_WORD(ctx->major);
742 int bit = AUDIT_BIT(ctx->major);
743 struct audit_names *n = &ctx->names[i];
744 int h = audit_hash_ino((u32)n->ino);
745 struct list_head *list = &audit_inode_hash[h];
746
747 if (list_empty(list))
748 continue;
749 948
750 list_for_each_entry_rcu(e, list, list) { 949 list_for_each_entry(n, &ctx->names_list, list) {
751 if ((e->rule.mask[word] & bit) == bit && 950 if (audit_filter_inode_name(tsk, n, ctx))
752 audit_filter_rules(tsk, &e->rule, ctx, n, 951 break;
753 &state, false)) {
754 rcu_read_unlock();
755 ctx->current_state = state;
756 return;
757 }
758 }
759 } 952 }
760 rcu_read_unlock(); 953 rcu_read_unlock();
761} 954}
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
766{ 959{
767 struct audit_context *context = tsk->audit_context; 960 struct audit_context *context = tsk->audit_context;
768 961
769 if (likely(!context)) 962 if (!context)
770 return NULL; 963 return NULL;
771 context->return_valid = return_valid; 964 context->return_valid = return_valid;
772 965
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
799 992
800static inline void audit_free_names(struct audit_context *context) 993static inline void audit_free_names(struct audit_context *context)
801{ 994{
802 int i; 995 struct audit_names *n, *next;
803 996
804#if AUDIT_DEBUG == 2 997#if AUDIT_DEBUG == 2
805 if (context->put_count + context->ino_count != context->name_count) { 998 if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
810 context->serial, context->major, context->in_syscall, 1003 context->serial, context->major, context->in_syscall,
811 context->name_count, context->put_count, 1004 context->name_count, context->put_count,
812 context->ino_count); 1005 context->ino_count);
813 for (i = 0; i < context->name_count; i++) { 1006 list_for_each_entry(n, &context->names_list, list) {
814 printk(KERN_ERR "names[%d] = %p = %s\n", i, 1007 printk(KERN_ERR "names[%d] = %p = %s\n", i,
815 context->names[i].name, 1008 n->name, n->name ?: "(null)");
816 context->names[i].name ?: "(null)");
817 } 1009 }
818 dump_stack(); 1010 dump_stack();
819 return; 1011 return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
824 context->ino_count = 0; 1016 context->ino_count = 0;
825#endif 1017#endif
826 1018
827 for (i = 0; i < context->name_count; i++) { 1019 list_for_each_entry_safe(n, next, &context->names_list, list) {
828 if (context->names[i].name && context->names[i].name_put) 1020 list_del(&n->list);
829 __putname(context->names[i].name); 1021 if (n->name && n->name_put)
1022 __putname(n->name);
1023 if (n->should_free)
1024 kfree(n);
830 } 1025 }
831 context->name_count = 0; 1026 context->name_count = 0;
832 path_put(&context->pwd); 1027 path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
864 return NULL; 1059 return NULL;
865 audit_zero_context(context, state); 1060 audit_zero_context(context, state);
866 INIT_LIST_HEAD(&context->killed_trees); 1061 INIT_LIST_HEAD(&context->killed_trees);
1062 INIT_LIST_HEAD(&context->names_list);
867 return context; 1063 return context;
868} 1064}
869 1065
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
886 return 0; /* Return if not auditing. */ 1082 return 0; /* Return if not auditing. */
887 1083
888 state = audit_filter_task(tsk, &key); 1084 state = audit_filter_task(tsk, &key);
889 if (likely(state == AUDIT_DISABLED)) 1085 if (state == AUDIT_DISABLED)
890 return 0; 1086 return 0;
891 1087
892 if (!(context = audit_alloc_context(state))) { 1088 if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
975 while (vma) { 1171 while (vma) {
976 if ((vma->vm_flags & VM_EXECUTABLE) && 1172 if ((vma->vm_flags & VM_EXECUTABLE) &&
977 vma->vm_file) { 1173 vma->vm_file) {
978 audit_log_d_path(ab, "exe=", 1174 audit_log_d_path(ab, " exe=",
979 &vma->vm_file->f_path); 1175 &vma->vm_file->f_path);
980 break; 1176 break;
981 } 1177 }
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
1166 struct audit_buffer **ab, 1362 struct audit_buffer **ab,
1167 struct audit_aux_data_execve *axi) 1363 struct audit_aux_data_execve *axi)
1168{ 1364{
1169 int i; 1365 int i, len;
1170 size_t len, len_sent = 0; 1366 size_t len_sent = 0;
1171 const char __user *p; 1367 const char __user *p;
1172 char *buf; 1368 char *buf;
1173 1369
@@ -1249,7 +1445,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1249 case AUDIT_IPC: { 1445 case AUDIT_IPC: {
1250 u32 osid = context->ipc.osid; 1446 u32 osid = context->ipc.osid;
1251 1447
1252 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", 1448 audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
1253 context->ipc.uid, context->ipc.gid, context->ipc.mode); 1449 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1254 if (osid) { 1450 if (osid) {
1255 char *ctx = NULL; 1451 char *ctx = NULL;
@@ -1267,7 +1463,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1267 ab = audit_log_start(context, GFP_KERNEL, 1463 ab = audit_log_start(context, GFP_KERNEL,
1268 AUDIT_IPC_SET_PERM); 1464 AUDIT_IPC_SET_PERM);
1269 audit_log_format(ab, 1465 audit_log_format(ab,
1270 "qbytes=%lx ouid=%u ogid=%u mode=%#o", 1466 "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
1271 context->ipc.qbytes, 1467 context->ipc.qbytes,
1272 context->ipc.perm_uid, 1468 context->ipc.perm_uid,
1273 context->ipc.perm_gid, 1469 context->ipc.perm_gid,
@@ -1278,7 +1474,7 @@ static void show_special(struct audit_context *context, int *call_panic)
1278 break; } 1474 break; }
1279 case AUDIT_MQ_OPEN: { 1475 case AUDIT_MQ_OPEN: {
1280 audit_log_format(ab, 1476 audit_log_format(ab,
1281 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " 1477 "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
1282 "mq_msgsize=%ld mq_curmsgs=%ld", 1478 "mq_msgsize=%ld mq_curmsgs=%ld",
1283 context->mq_open.oflag, context->mq_open.mode, 1479 context->mq_open.oflag, context->mq_open.mode,
1284 context->mq_open.attr.mq_flags, 1480 context->mq_open.attr.mq_flags,
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
1324 audit_log_end(ab); 1520 audit_log_end(ab);
1325} 1521}
1326 1522
1523static void audit_log_name(struct audit_context *context, struct audit_names *n,
1524 int record_num, int *call_panic)
1525{
1526 struct audit_buffer *ab;
1527 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
1528 if (!ab)
1529 return; /* audit_panic has been called */
1530
1531 audit_log_format(ab, "item=%d", record_num);
1532
1533 if (n->name) {
1534 switch (n->name_len) {
1535 case AUDIT_NAME_FULL:
1536 /* log the full path */
1537 audit_log_format(ab, " name=");
1538 audit_log_untrustedstring(ab, n->name);
1539 break;
1540 case 0:
1541 /* name was specified as a relative path and the
1542 * directory component is the cwd */
1543 audit_log_d_path(ab, " name=", &context->pwd);
1544 break;
1545 default:
1546 /* log the name's directory component */
1547 audit_log_format(ab, " name=");
1548 audit_log_n_untrustedstring(ab, n->name,
1549 n->name_len);
1550 }
1551 } else
1552 audit_log_format(ab, " name=(null)");
1553
1554 if (n->ino != (unsigned long)-1) {
1555 audit_log_format(ab, " inode=%lu"
1556 " dev=%02x:%02x mode=%#ho"
1557 " ouid=%u ogid=%u rdev=%02x:%02x",
1558 n->ino,
1559 MAJOR(n->dev),
1560 MINOR(n->dev),
1561 n->mode,
1562 n->uid,
1563 n->gid,
1564 MAJOR(n->rdev),
1565 MINOR(n->rdev));
1566 }
1567 if (n->osid != 0) {
1568 char *ctx = NULL;
1569 u32 len;
1570 if (security_secid_to_secctx(
1571 n->osid, &ctx, &len)) {
1572 audit_log_format(ab, " osid=%u", n->osid);
1573 *call_panic = 2;
1574 } else {
1575 audit_log_format(ab, " obj=%s", ctx);
1576 security_release_secctx(ctx, len);
1577 }
1578 }
1579
1580 audit_log_fcaps(ab, n);
1581
1582 audit_log_end(ab);
1583}
1584
1327static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1585static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1328{ 1586{
1329 const struct cred *cred; 1587 const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1331 struct audit_buffer *ab; 1589 struct audit_buffer *ab;
1332 struct audit_aux_data *aux; 1590 struct audit_aux_data *aux;
1333 const char *tty; 1591 const char *tty;
1592 struct audit_names *n;
1334 1593
1335 /* tsk == current */ 1594 /* tsk == current */
1336 context->pid = tsk->pid; 1595 context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1466 if (context->pwd.dentry && context->pwd.mnt) { 1725 if (context->pwd.dentry && context->pwd.mnt) {
1467 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); 1726 ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
1468 if (ab) { 1727 if (ab) {
1469 audit_log_d_path(ab, "cwd=", &context->pwd); 1728 audit_log_d_path(ab, " cwd=", &context->pwd);
1470 audit_log_end(ab); 1729 audit_log_end(ab);
1471 } 1730 }
1472 } 1731 }
1473 for (i = 0; i < context->name_count; i++) {
1474 struct audit_names *n = &context->names[i];
1475 1732
1476 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 1733 i = 0;
1477 if (!ab) 1734 list_for_each_entry(n, &context->names_list, list)
1478 continue; /* audit_panic has been called */ 1735 audit_log_name(context, n, i++, &call_panic);
1479
1480 audit_log_format(ab, "item=%d", i);
1481
1482 if (n->name) {
1483 switch(n->name_len) {
1484 case AUDIT_NAME_FULL:
1485 /* log the full path */
1486 audit_log_format(ab, " name=");
1487 audit_log_untrustedstring(ab, n->name);
1488 break;
1489 case 0:
1490 /* name was specified as a relative path and the
1491 * directory component is the cwd */
1492 audit_log_d_path(ab, "name=", &context->pwd);
1493 break;
1494 default:
1495 /* log the name's directory component */
1496 audit_log_format(ab, " name=");
1497 audit_log_n_untrustedstring(ab, n->name,
1498 n->name_len);
1499 }
1500 } else
1501 audit_log_format(ab, " name=(null)");
1502
1503 if (n->ino != (unsigned long)-1) {
1504 audit_log_format(ab, " inode=%lu"
1505 " dev=%02x:%02x mode=%#o"
1506 " ouid=%u ogid=%u rdev=%02x:%02x",
1507 n->ino,
1508 MAJOR(n->dev),
1509 MINOR(n->dev),
1510 n->mode,
1511 n->uid,
1512 n->gid,
1513 MAJOR(n->rdev),
1514 MINOR(n->rdev));
1515 }
1516 if (n->osid != 0) {
1517 char *ctx = NULL;
1518 u32 len;
1519 if (security_secid_to_secctx(
1520 n->osid, &ctx, &len)) {
1521 audit_log_format(ab, " osid=%u", n->osid);
1522 call_panic = 2;
1523 } else {
1524 audit_log_format(ab, " obj=%s", ctx);
1525 security_release_secctx(ctx, len);
1526 }
1527 }
1528
1529 audit_log_fcaps(ab, n);
1530
1531 audit_log_end(ab);
1532 }
1533 1736
1534 /* Send end of event record to help user space know we are finished */ 1737 /* Send end of event record to help user space know we are finished */
1535 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1738 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1545 * 1748 *
1546 * Called from copy_process and do_exit 1749 * Called from copy_process and do_exit
1547 */ 1750 */
1548void audit_free(struct task_struct *tsk) 1751void __audit_free(struct task_struct *tsk)
1549{ 1752{
1550 struct audit_context *context; 1753 struct audit_context *context;
1551 1754
1552 context = audit_get_context(tsk, 0, 0); 1755 context = audit_get_context(tsk, 0, 0);
1553 if (likely(!context)) 1756 if (!context)
1554 return; 1757 return;
1555 1758
1556 /* Check for system calls that do not go through the exit 1759 /* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
1583 * will only be written if another part of the kernel requests that it 1786 * will only be written if another part of the kernel requests that it
1584 * be written). 1787 * be written).
1585 */ 1788 */
1586void audit_syscall_entry(int arch, int major, 1789void __audit_syscall_entry(int arch, int major,
1587 unsigned long a1, unsigned long a2, 1790 unsigned long a1, unsigned long a2,
1588 unsigned long a3, unsigned long a4) 1791 unsigned long a3, unsigned long a4)
1589{ 1792{
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
1591 struct audit_context *context = tsk->audit_context; 1794 struct audit_context *context = tsk->audit_context;
1592 enum audit_state state; 1795 enum audit_state state;
1593 1796
1594 if (unlikely(!context)) 1797 if (!context)
1595 return; 1798 return;
1596 1799
1597 /* 1800 /*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
1648 context->prio = 0; 1851 context->prio = 0;
1649 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1852 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1650 } 1853 }
1651 if (likely(state == AUDIT_DISABLED)) 1854 if (state == AUDIT_DISABLED)
1652 return; 1855 return;
1653 1856
1654 context->serial = 0; 1857 context->serial = 0;
@@ -1658,45 +1861,29 @@ void audit_syscall_entry(int arch, int major,
1658 context->ppid = 0; 1861 context->ppid = 0;
1659} 1862}
1660 1863
1661void audit_finish_fork(struct task_struct *child)
1662{
1663 struct audit_context *ctx = current->audit_context;
1664 struct audit_context *p = child->audit_context;
1665 if (!p || !ctx)
1666 return;
1667 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1668 return;
1669 p->arch = ctx->arch;
1670 p->major = ctx->major;
1671 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1672 p->ctime = ctx->ctime;
1673 p->dummy = ctx->dummy;
1674 p->in_syscall = ctx->in_syscall;
1675 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1676 p->ppid = current->pid;
1677 p->prio = ctx->prio;
1678 p->current_state = ctx->current_state;
1679}
1680
1681/** 1864/**
1682 * audit_syscall_exit - deallocate audit context after a system call 1865 * audit_syscall_exit - deallocate audit context after a system call
1683 * @valid: success/failure flag 1866 * @success: success value of the syscall
1684 * @return_code: syscall return value 1867 * @return_code: return value of the syscall
1685 * 1868 *
1686 * Tear down after system call. If the audit context has been marked as 1869 * Tear down after system call. If the audit context has been marked as
1687 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 1870 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1688 * filtering, or because some other part of the kernel write an audit 1871 * filtering, or because some other part of the kernel wrote an audit
1689 * message), then write out the syscall information. In call cases, 1872 * message), then write out the syscall information. In call cases,
1690 * free the names stored from getname(). 1873 * free the names stored from getname().
1691 */ 1874 */
1692void audit_syscall_exit(int valid, long return_code) 1875void __audit_syscall_exit(int success, long return_code)
1693{ 1876{
1694 struct task_struct *tsk = current; 1877 struct task_struct *tsk = current;
1695 struct audit_context *context; 1878 struct audit_context *context;
1696 1879
1697 context = audit_get_context(tsk, valid, return_code); 1880 if (success)
1881 success = AUDITSC_SUCCESS;
1882 else
1883 success = AUDITSC_FAILURE;
1698 1884
1699 if (likely(!context)) 1885 context = audit_get_context(tsk, success, return_code);
1886 if (!context)
1700 return; 1887 return;
1701 1888
1702 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) 1889 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2008,30 @@ retry:
1821#endif 2008#endif
1822} 2009}
1823 2010
2011static struct audit_names *audit_alloc_name(struct audit_context *context)
2012{
2013 struct audit_names *aname;
2014
2015 if (context->name_count < AUDIT_NAMES) {
2016 aname = &context->preallocated_names[context->name_count];
2017 memset(aname, 0, sizeof(*aname));
2018 } else {
2019 aname = kzalloc(sizeof(*aname), GFP_NOFS);
2020 if (!aname)
2021 return NULL;
2022 aname->should_free = true;
2023 }
2024
2025 aname->ino = (unsigned long)-1;
2026 list_add_tail(&aname->list, &context->names_list);
2027
2028 context->name_count++;
2029#if AUDIT_DEBUG
2030 context->ino_count++;
2031#endif
2032 return aname;
2033}
2034
1824/** 2035/**
1825 * audit_getname - add a name to the list 2036 * audit_getname - add a name to the list
1826 * @name: name to add 2037 * @name: name to add
@@ -1831,9 +2042,7 @@ retry:
1831void __audit_getname(const char *name) 2042void __audit_getname(const char *name)
1832{ 2043{
1833 struct audit_context *context = current->audit_context; 2044 struct audit_context *context = current->audit_context;
1834 2045 struct audit_names *n;
1835 if (IS_ERR(name) || !name)
1836 return;
1837 2046
1838 if (!context->in_syscall) { 2047 if (!context->in_syscall) {
1839#if AUDIT_DEBUG == 2 2048#if AUDIT_DEBUG == 2
@@ -1843,13 +2052,15 @@ void __audit_getname(const char *name)
1843#endif 2052#endif
1844 return; 2053 return;
1845 } 2054 }
1846 BUG_ON(context->name_count >= AUDIT_NAMES); 2055
1847 context->names[context->name_count].name = name; 2056 n = audit_alloc_name(context);
1848 context->names[context->name_count].name_len = AUDIT_NAME_FULL; 2057 if (!n)
1849 context->names[context->name_count].name_put = 1; 2058 return;
1850 context->names[context->name_count].ino = (unsigned long)-1; 2059
1851 context->names[context->name_count].osid = 0; 2060 n->name = name;
1852 ++context->name_count; 2061 n->name_len = AUDIT_NAME_FULL;
2062 n->name_put = true;
2063
1853 if (!context->pwd.dentry) 2064 if (!context->pwd.dentry)
1854 get_fs_pwd(current->fs, &context->pwd); 2065 get_fs_pwd(current->fs, &context->pwd);
1855} 2066}
@@ -1871,12 +2082,13 @@ void audit_putname(const char *name)
1871 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n", 2082 printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
1872 __FILE__, __LINE__, context->serial, name); 2083 __FILE__, __LINE__, context->serial, name);
1873 if (context->name_count) { 2084 if (context->name_count) {
2085 struct audit_names *n;
1874 int i; 2086 int i;
1875 for (i = 0; i < context->name_count; i++) 2087
2088 list_for_each_entry(n, &context->names_list, list)
1876 printk(KERN_ERR "name[%d] = %p = %s\n", i, 2089 printk(KERN_ERR "name[%d] = %p = %s\n", i,
1877 context->names[i].name, 2090 n->name, n->name ?: "(null)");
1878 context->names[i].name ?: "(null)"); 2091 }
1879 }
1880#endif 2092#endif
1881 __putname(name); 2093 __putname(name);
1882 } 2094 }
@@ -1897,39 +2109,11 @@ void audit_putname(const char *name)
1897#endif 2109#endif
1898} 2110}
1899 2111
1900static int audit_inc_name_count(struct audit_context *context,
1901 const struct inode *inode)
1902{
1903 if (context->name_count >= AUDIT_NAMES) {
1904 if (inode)
1905 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1906 "dev=%02x:%02x, inode=%lu\n",
1907 MAJOR(inode->i_sb->s_dev),
1908 MINOR(inode->i_sb->s_dev),
1909 inode->i_ino);
1910
1911 else
1912 printk(KERN_DEBUG "name_count maxed, losing inode data\n");
1913 return 1;
1914 }
1915 context->name_count++;
1916#if AUDIT_DEBUG
1917 context->ino_count++;
1918#endif
1919 return 0;
1920}
1921
1922
1923static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry) 2112static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1924{ 2113{
1925 struct cpu_vfs_cap_data caps; 2114 struct cpu_vfs_cap_data caps;
1926 int rc; 2115 int rc;
1927 2116
1928 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1929 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1930 name->fcap.fE = 0;
1931 name->fcap_ver = 0;
1932
1933 if (!dentry) 2117 if (!dentry)
1934 return 0; 2118 return 0;
1935 2119
@@ -1969,30 +2153,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
1969 */ 2153 */
1970void __audit_inode(const char *name, const struct dentry *dentry) 2154void __audit_inode(const char *name, const struct dentry *dentry)
1971{ 2155{
1972 int idx;
1973 struct audit_context *context = current->audit_context; 2156 struct audit_context *context = current->audit_context;
1974 const struct inode *inode = dentry->d_inode; 2157 const struct inode *inode = dentry->d_inode;
2158 struct audit_names *n;
1975 2159
1976 if (!context->in_syscall) 2160 if (!context->in_syscall)
1977 return; 2161 return;
1978 if (context->name_count 2162
1979 && context->names[context->name_count-1].name 2163 list_for_each_entry_reverse(n, &context->names_list, list) {
1980 && context->names[context->name_count-1].name == name) 2164 if (n->name && (n->name == name))
1981 idx = context->name_count - 1; 2165 goto out;
1982 else if (context->name_count > 1
1983 && context->names[context->name_count-2].name
1984 && context->names[context->name_count-2].name == name)
1985 idx = context->name_count - 2;
1986 else {
1987 /* FIXME: how much do we care about inodes that have no
1988 * associated name? */
1989 if (audit_inc_name_count(context, inode))
1990 return;
1991 idx = context->name_count - 1;
1992 context->names[idx].name = NULL;
1993 } 2166 }
2167
2168 /* unable to find the name from a previous getname() */
2169 n = audit_alloc_name(context);
2170 if (!n)
2171 return;
2172out:
1994 handle_path(dentry); 2173 handle_path(dentry);
1995 audit_copy_inode(&context->names[idx], dentry, inode); 2174 audit_copy_inode(n, dentry, inode);
1996} 2175}
1997 2176
1998/** 2177/**
@@ -2011,11 +2190,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
2011void __audit_inode_child(const struct dentry *dentry, 2190void __audit_inode_child(const struct dentry *dentry,
2012 const struct inode *parent) 2191 const struct inode *parent)
2013{ 2192{
2014 int idx;
2015 struct audit_context *context = current->audit_context; 2193 struct audit_context *context = current->audit_context;
2016 const char *found_parent = NULL, *found_child = NULL; 2194 const char *found_parent = NULL, *found_child = NULL;
2017 const struct inode *inode = dentry->d_inode; 2195 const struct inode *inode = dentry->d_inode;
2018 const char *dname = dentry->d_name.name; 2196 const char *dname = dentry->d_name.name;
2197 struct audit_names *n;
2019 int dirlen = 0; 2198 int dirlen = 0;
2020 2199
2021 if (!context->in_syscall) 2200 if (!context->in_syscall)
@@ -2025,9 +2204,7 @@ void __audit_inode_child(const struct dentry *dentry,
2025 handle_one(inode); 2204 handle_one(inode);
2026 2205
2027 /* parent is more likely, look for it first */ 2206 /* parent is more likely, look for it first */
2028 for (idx = 0; idx < context->name_count; idx++) { 2207 list_for_each_entry(n, &context->names_list, list) {
2029 struct audit_names *n = &context->names[idx];
2030
2031 if (!n->name) 2208 if (!n->name)
2032 continue; 2209 continue;
2033 2210
@@ -2040,9 +2217,7 @@ void __audit_inode_child(const struct dentry *dentry,
2040 } 2217 }
2041 2218
2042 /* no matching parent, look for matching child */ 2219 /* no matching parent, look for matching child */
2043 for (idx = 0; idx < context->name_count; idx++) { 2220 list_for_each_entry(n, &context->names_list, list) {
2044 struct audit_names *n = &context->names[idx];
2045
2046 if (!n->name) 2221 if (!n->name)
2047 continue; 2222 continue;
2048 2223
@@ -2060,34 +2235,29 @@ void __audit_inode_child(const struct dentry *dentry,
2060 2235
2061add_names: 2236add_names:
2062 if (!found_parent) { 2237 if (!found_parent) {
2063 if (audit_inc_name_count(context, parent)) 2238 n = audit_alloc_name(context);
2239 if (!n)
2064 return; 2240 return;
2065 idx = context->name_count - 1; 2241 audit_copy_inode(n, NULL, parent);
2066 context->names[idx].name = NULL;
2067 audit_copy_inode(&context->names[idx], NULL, parent);
2068 } 2242 }
2069 2243
2070 if (!found_child) { 2244 if (!found_child) {
2071 if (audit_inc_name_count(context, inode)) 2245 n = audit_alloc_name(context);
2246 if (!n)
2072 return; 2247 return;
2073 idx = context->name_count - 1;
2074 2248
2075 /* Re-use the name belonging to the slot for a matching parent 2249 /* Re-use the name belonging to the slot for a matching parent
2076 * directory. All names for this context are relinquished in 2250 * directory. All names for this context are relinquished in
2077 * audit_free_names() */ 2251 * audit_free_names() */
2078 if (found_parent) { 2252 if (found_parent) {
2079 context->names[idx].name = found_parent; 2253 n->name = found_parent;
2080 context->names[idx].name_len = AUDIT_NAME_FULL; 2254 n->name_len = AUDIT_NAME_FULL;
2081 /* don't call __putname() */ 2255 /* don't call __putname() */
2082 context->names[idx].name_put = 0; 2256 n->name_put = false;
2083 } else {
2084 context->names[idx].name = NULL;
2085 } 2257 }
2086 2258
2087 if (inode) 2259 if (inode)
2088 audit_copy_inode(&context->names[idx], NULL, inode); 2260 audit_copy_inode(n, NULL, inode);
2089 else
2090 context->names[idx].ino = (unsigned long)-1;
2091 } 2261 }
2092} 2262}
2093EXPORT_SYMBOL_GPL(__audit_inode_child); 2263EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2291,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
2121static atomic_t session_id = ATOMIC_INIT(0); 2291static atomic_t session_id = ATOMIC_INIT(0);
2122 2292
2123/** 2293/**
2124 * audit_set_loginuid - set a task's audit_context loginuid 2294 * audit_set_loginuid - set current task's audit_context loginuid
2125 * @task: task whose audit context is being modified
2126 * @loginuid: loginuid value 2295 * @loginuid: loginuid value
2127 * 2296 *
2128 * Returns 0. 2297 * Returns 0.
2129 * 2298 *
2130 * Called (set) from fs/proc/base.c::proc_loginuid_write(). 2299 * Called (set) from fs/proc/base.c::proc_loginuid_write().
2131 */ 2300 */
2132int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 2301int audit_set_loginuid(uid_t loginuid)
2133{ 2302{
2134 unsigned int sessionid = atomic_inc_return(&session_id); 2303 struct task_struct *task = current;
2135 struct audit_context *context = task->audit_context; 2304 struct audit_context *context = task->audit_context;
2305 unsigned int sessionid;
2306
2307#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
2308 if (task->loginuid != -1)
2309 return -EPERM;
2310#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2311 if (!capable(CAP_AUDIT_CONTROL))
2312 return -EPERM;
2313#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
2136 2314
2315 sessionid = atomic_inc_return(&session_id);
2137 if (context && context->in_syscall) { 2316 if (context && context->in_syscall) {
2138 struct audit_buffer *ab; 2317 struct audit_buffer *ab;
2139 2318
@@ -2160,7 +2339,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2160 * @attr: queue attributes 2339 * @attr: queue attributes
2161 * 2340 *
2162 */ 2341 */
2163void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2342void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
2164{ 2343{
2165 struct audit_context *context = current->audit_context; 2344 struct audit_context *context = current->audit_context;
2166 2345
@@ -2260,7 +2439,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2260 * 2439 *
2261 * Called only after audit_ipc_obj(). 2440 * Called only after audit_ipc_obj().
2262 */ 2441 */
2263void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2442void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
2264{ 2443{
2265 struct audit_context *context = current->audit_context; 2444 struct audit_context *context = current->audit_context;
2266 2445
@@ -2271,14 +2450,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod
2271 context->ipc.has_perm = 1; 2450 context->ipc.has_perm = 1;
2272} 2451}
2273 2452
2274int audit_bprm(struct linux_binprm *bprm) 2453int __audit_bprm(struct linux_binprm *bprm)
2275{ 2454{
2276 struct audit_aux_data_execve *ax; 2455 struct audit_aux_data_execve *ax;
2277 struct audit_context *context = current->audit_context; 2456 struct audit_context *context = current->audit_context;
2278 2457
2279 if (likely(!audit_enabled || !context || context->dummy))
2280 return 0;
2281
2282 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2458 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2283 if (!ax) 2459 if (!ax)
2284 return -ENOMEM; 2460 return -ENOMEM;
@@ -2299,13 +2475,10 @@ int audit_bprm(struct linux_binprm *bprm)
2299 * @args: args array 2475 * @args: args array
2300 * 2476 *
2301 */ 2477 */
2302void audit_socketcall(int nargs, unsigned long *args) 2478void __audit_socketcall(int nargs, unsigned long *args)
2303{ 2479{
2304 struct audit_context *context = current->audit_context; 2480 struct audit_context *context = current->audit_context;
2305 2481
2306 if (likely(!context || context->dummy))
2307 return;
2308
2309 context->type = AUDIT_SOCKETCALL; 2482 context->type = AUDIT_SOCKETCALL;
2310 context->socketcall.nargs = nargs; 2483 context->socketcall.nargs = nargs;
2311 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); 2484 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2504,10 @@ void __audit_fd_pair(int fd1, int fd2)
2331 * 2504 *
2332 * Returns 0 for success or NULL context or < 0 on error. 2505 * Returns 0 for success or NULL context or < 0 on error.
2333 */ 2506 */
2334int audit_sockaddr(int len, void *a) 2507int __audit_sockaddr(int len, void *a)
2335{ 2508{
2336 struct audit_context *context = current->audit_context; 2509 struct audit_context *context = current->audit_context;
2337 2510
2338 if (likely(!context || context->dummy))
2339 return 0;
2340
2341 if (!context->sockaddr) { 2511 if (!context->sockaddr) {
2342 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL); 2512 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2343 if (!p) 2513 if (!p)
@@ -2499,6 +2669,25 @@ void __audit_mmap_fd(int fd, int flags)
2499 context->type = AUDIT_MMAP; 2669 context->type = AUDIT_MMAP;
2500} 2670}
2501 2671
2672static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
2673{
2674 uid_t auid, uid;
2675 gid_t gid;
2676 unsigned int sessionid;
2677
2678 auid = audit_get_loginuid(current);
2679 sessionid = audit_get_sessionid(current);
2680 current_uid_gid(&uid, &gid);
2681
2682 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2683 auid, uid, gid, sessionid);
2684 audit_log_task_context(ab);
2685 audit_log_format(ab, " pid=%d comm=", current->pid);
2686 audit_log_untrustedstring(ab, current->comm);
2687 audit_log_format(ab, " reason=");
2688 audit_log_string(ab, reason);
2689 audit_log_format(ab, " sig=%ld", signr);
2690}
2502/** 2691/**
2503 * audit_core_dumps - record information about processes that end abnormally 2692 * audit_core_dumps - record information about processes that end abnormally
2504 * @signr: signal value 2693 * @signr: signal value
@@ -2509,10 +2698,6 @@ void __audit_mmap_fd(int fd, int flags)
2509void audit_core_dumps(long signr) 2698void audit_core_dumps(long signr)
2510{ 2699{
2511 struct audit_buffer *ab; 2700 struct audit_buffer *ab;
2512 u32 sid;
2513 uid_t auid = audit_get_loginuid(current), uid;
2514 gid_t gid;
2515 unsigned int sessionid = audit_get_sessionid(current);
2516 2701
2517 if (!audit_enabled) 2702 if (!audit_enabled)
2518 return; 2703 return;
@@ -2521,24 +2706,17 @@ void audit_core_dumps(long signr)
2521 return; 2706 return;
2522 2707
2523 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2708 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2524 current_uid_gid(&uid, &gid); 2709 audit_log_abend(ab, "memory violation", signr);
2525 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2710 audit_log_end(ab);
2526 auid, uid, gid, sessionid); 2711}
2527 security_task_getsecid(current, &sid);
2528 if (sid) {
2529 char *ctx = NULL;
2530 u32 len;
2531 2712
2532 if (security_secid_to_secctx(sid, &ctx, &len)) 2713void __audit_seccomp(unsigned long syscall)
2533 audit_log_format(ab, " ssid=%u", sid); 2714{
2534 else { 2715 struct audit_buffer *ab;
2535 audit_log_format(ab, " subj=%s", ctx); 2716
2536 security_release_secctx(ctx, len); 2717 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2537 } 2718 audit_log_abend(ab, "seccomp", SIGKILL);
2538 } 2719 audit_log_format(ab, " syscall=%ld", syscall);
2539 audit_log_format(ab, " pid=%d comm=", current->pid);
2540 audit_log_untrustedstring(ab, current->comm);
2541 audit_log_format(ab, " sig=%ld", signr);
2542 audit_log_end(ab); 2720 audit_log_end(ab);
2543} 2721}
2544 2722
diff --git a/kernel/capability.c b/kernel/capability.c
index b463871a4e69..3f1adb6c6470 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -287,74 +287,84 @@ error:
287} 287}
288 288
289/** 289/**
290 * has_capability - Does a task have a capability in init_user_ns 290 * has_ns_capability - Does a task have a capability in a specific user ns
291 * @t: The task in question 291 * @t: The task in question
292 * @ns: target user namespace
292 * @cap: The capability to be tested for 293 * @cap: The capability to be tested for
293 * 294 *
294 * Return true if the specified task has the given superior capability 295 * Return true if the specified task has the given superior capability
295 * currently in effect to the initial user namespace, false if not. 296 * currently in effect to the specified user namespace, false if not.
296 * 297 *
297 * Note that this does not set PF_SUPERPRIV on the task. 298 * Note that this does not set PF_SUPERPRIV on the task.
298 */ 299 */
299bool has_capability(struct task_struct *t, int cap) 300bool has_ns_capability(struct task_struct *t,
301 struct user_namespace *ns, int cap)
300{ 302{
301 int ret = security_real_capable(t, &init_user_ns, cap); 303 int ret;
304
305 rcu_read_lock();
306 ret = security_capable(__task_cred(t), ns, cap);
307 rcu_read_unlock();
302 308
303 return (ret == 0); 309 return (ret == 0);
304} 310}
305 311
306/** 312/**
307 * has_capability - Does a task have a capability in a specific user ns 313 * has_capability - Does a task have a capability in init_user_ns
308 * @t: The task in question 314 * @t: The task in question
309 * @ns: target user namespace
310 * @cap: The capability to be tested for 315 * @cap: The capability to be tested for
311 * 316 *
312 * Return true if the specified task has the given superior capability 317 * Return true if the specified task has the given superior capability
313 * currently in effect to the specified user namespace, false if not. 318 * currently in effect to the initial user namespace, false if not.
314 * 319 *
315 * Note that this does not set PF_SUPERPRIV on the task. 320 * Note that this does not set PF_SUPERPRIV on the task.
316 */ 321 */
317bool has_ns_capability(struct task_struct *t, 322bool has_capability(struct task_struct *t, int cap)
318 struct user_namespace *ns, int cap)
319{ 323{
320 int ret = security_real_capable(t, ns, cap); 324 return has_ns_capability(t, &init_user_ns, cap);
321
322 return (ret == 0);
323} 325}
324 326
325/** 327/**
326 * has_capability_noaudit - Does a task have a capability (unaudited) 328 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
329 * in a specific user ns.
327 * @t: The task in question 330 * @t: The task in question
331 * @ns: target user namespace
328 * @cap: The capability to be tested for 332 * @cap: The capability to be tested for
329 * 333 *
330 * Return true if the specified task has the given superior capability 334 * Return true if the specified task has the given superior capability
331 * currently in effect to init_user_ns, false if not. Don't write an 335 * currently in effect to the specified user namespace, false if not.
332 * audit message for the check. 336 * Do not write an audit message for the check.
333 * 337 *
334 * Note that this does not set PF_SUPERPRIV on the task. 338 * Note that this does not set PF_SUPERPRIV on the task.
335 */ 339 */
336bool has_capability_noaudit(struct task_struct *t, int cap) 340bool has_ns_capability_noaudit(struct task_struct *t,
341 struct user_namespace *ns, int cap)
337{ 342{
338 int ret = security_real_capable_noaudit(t, &init_user_ns, cap); 343 int ret;
344
345 rcu_read_lock();
346 ret = security_capable_noaudit(__task_cred(t), ns, cap);
347 rcu_read_unlock();
339 348
340 return (ret == 0); 349 return (ret == 0);
341} 350}
342 351
343/** 352/**
344 * capable - Determine if the current task has a superior capability in effect 353 * has_capability_noaudit - Does a task have a capability (unaudited) in the
354 * initial user ns
355 * @t: The task in question
345 * @cap: The capability to be tested for 356 * @cap: The capability to be tested for
346 * 357 *
347 * Return true if the current task has the given superior capability currently 358 * Return true if the specified task has the given superior capability
348 * available for use, false if not. 359 * currently in effect to init_user_ns, false if not. Don't write an
360 * audit message for the check.
349 * 361 *
350 * This sets PF_SUPERPRIV on the task if the capability is available on the 362 * Note that this does not set PF_SUPERPRIV on the task.
351 * assumption that it's about to be used.
352 */ 363 */
353bool capable(int cap) 364bool has_capability_noaudit(struct task_struct *t, int cap)
354{ 365{
355 return ns_capable(&init_user_ns, cap); 366 return has_ns_capability_noaudit(t, &init_user_ns, cap);
356} 367}
357EXPORT_SYMBOL(capable);
358 368
359/** 369/**
360 * ns_capable - Determine if the current task has a superior capability in effect 370 * ns_capable - Determine if the current task has a superior capability in effect
@@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
374 BUG(); 384 BUG();
375 } 385 }
376 386
377 if (security_capable(ns, current_cred(), cap) == 0) { 387 if (security_capable(current_cred(), ns, cap) == 0) {
378 current->flags |= PF_SUPERPRIV; 388 current->flags |= PF_SUPERPRIV;
379 return true; 389 return true;
380 } 390 }
@@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap)
383EXPORT_SYMBOL(ns_capable); 393EXPORT_SYMBOL(ns_capable);
384 394
385/** 395/**
386 * task_ns_capable - Determine whether current task has a superior 396 * capable - Determine if the current task has a superior capability in effect
387 * capability targeted at a specific task's user namespace. 397 * @cap: The capability to be tested for
388 * @t: The task whose user namespace is targeted. 398 *
389 * @cap: The capability in question. 399 * Return true if the current task has the given superior capability currently
400 * available for use, false if not.
390 * 401 *
391 * Return true if it does, false otherwise. 402 * This sets PF_SUPERPRIV on the task if the capability is available on the
403 * assumption that it's about to be used.
392 */ 404 */
393bool task_ns_capable(struct task_struct *t, int cap) 405bool capable(int cap)
394{ 406{
395 return ns_capable(task_cred_xxx(t, user)->user_ns, cap); 407 return ns_capable(&init_user_ns, cap);
396} 408}
397EXPORT_SYMBOL(task_ns_capable); 409EXPORT_SYMBOL(capable);
398 410
399/** 411/**
400 * nsown_capable - Check superior capability to one's own user_ns 412 * nsown_capable - Check superior capability to one's own user_ns
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d9d5648f3cdc..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
63 63
64#include <linux/atomic.h> 64#include <linux/atomic.h>
65 65
66/*
67 * cgroup_mutex is the master lock. Any modification to cgroup or its
68 * hierarchy must be performed while holding it.
69 *
70 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
71 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
72 * release_agent_path and so on. Modifying requires both cgroup_mutex and
73 * cgroup_root_mutex. Readers can acquire either of the two. This is to
74 * break the following locking order cycle.
75 *
76 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
77 * B. namespace_sem -> cgroup_mutex
78 *
79 * B happens only through cgroup_show_options() and using cgroup_root_mutex
80 * breaks it.
81 */
66static DEFINE_MUTEX(cgroup_mutex); 82static DEFINE_MUTEX(cgroup_mutex);
83static DEFINE_MUTEX(cgroup_root_mutex);
67 84
68/* 85/*
69 * Generate an array of cgroup subsystem pointers. At boot time, this is 86 * Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
760 * -> cgroup_mkdir. 777 * -> cgroup_mkdir.
761 */ 778 */
762 779
763static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 780static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
764static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); 781static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
765static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 782static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
766static int cgroup_populate_dir(struct cgroup *cgrp); 783static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
775static int alloc_css_id(struct cgroup_subsys *ss, 792static int alloc_css_id(struct cgroup_subsys *ss,
776 struct cgroup *parent, struct cgroup *child); 793 struct cgroup *parent, struct cgroup *child);
777 794
778static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 795static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
779{ 796{
780 struct inode *inode = new_inode(sb); 797 struct inode *inode = new_inode(sb);
781 798
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
921 * 938 *
922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 939 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
923 */ 940 */
924DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 941static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
925 942
926static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 943static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
927{ 944{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
953 int i; 970 int i;
954 971
955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 972 BUG_ON(!mutex_is_locked(&cgroup_mutex));
973 BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
956 974
957 removed_bits = root->actual_subsys_bits & ~final_bits; 975 removed_bits = root->actual_subsys_bits & ~final_bits;
958 added_bits = final_bits & ~root->actual_subsys_bits; 976 added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1038 return 0; 1056 return 0;
1039} 1057}
1040 1058
1041static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 1059static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1042{ 1060{
1043 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 1061 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
1044 struct cgroup_subsys *ss; 1062 struct cgroup_subsys *ss;
1045 1063
1046 mutex_lock(&cgroup_mutex); 1064 mutex_lock(&cgroup_root_mutex);
1047 for_each_subsys(root, ss) 1065 for_each_subsys(root, ss)
1048 seq_printf(seq, ",%s", ss->name); 1066 seq_printf(seq, ",%s", ss->name);
1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1067 if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
1054 seq_puts(seq, ",clone_children"); 1072 seq_puts(seq, ",clone_children");
1055 if (strlen(root->name)) 1073 if (strlen(root->name))
1056 seq_printf(seq, ",name=%s", root->name); 1074 seq_printf(seq, ",name=%s", root->name);
1057 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_root_mutex);
1058 return 0; 1076 return 0;
1059} 1077}
1060 1078
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1175 1193
1176 /* 1194 /*
1177 * If the 'all' option was specified select all the subsystems, 1195 * If the 'all' option was specified select all the subsystems,
1178 * otherwise 'all, 'none' and a subsystem name options were not 1196 * otherwise if 'none', 'name=' and a subsystem name options
1179 * specified, let's default to 'all' 1197 * were not specified, let's default to 'all'
1180 */ 1198 */
1181 if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1199 if (all_ss || (!one_ss && !opts->none && !opts->name)) {
1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1200 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
1183 struct cgroup_subsys *ss = subsys[i]; 1201 struct cgroup_subsys *ss = subsys[i];
1184 if (ss == NULL) 1202 if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1269 1287
1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1288 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1271 mutex_lock(&cgroup_mutex); 1289 mutex_lock(&cgroup_mutex);
1290 mutex_lock(&cgroup_root_mutex);
1272 1291
1273 /* See what subsystems are wanted */ 1292 /* See what subsystems are wanted */
1274 ret = parse_cgroupfs_options(data, &opts); 1293 ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1297 out_unlock: 1316 out_unlock:
1298 kfree(opts.release_agent); 1317 kfree(opts.release_agent);
1299 kfree(opts.name); 1318 kfree(opts.name);
1319 mutex_unlock(&cgroup_root_mutex);
1300 mutex_unlock(&cgroup_mutex); 1320 mutex_unlock(&cgroup_mutex);
1301 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
1302 return ret; 1322 return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1481 int ret = 0; 1501 int ret = 0;
1482 struct super_block *sb; 1502 struct super_block *sb;
1483 struct cgroupfs_root *new_root; 1503 struct cgroupfs_root *new_root;
1504 struct inode *inode;
1484 1505
1485 /* First find the desired set of subsystems */ 1506 /* First find the desired set of subsystems */
1486 mutex_lock(&cgroup_mutex); 1507 mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1514 /* We used the new root structure, so this is a new hierarchy */ 1535 /* We used the new root structure, so this is a new hierarchy */
1515 struct list_head tmp_cg_links; 1536 struct list_head tmp_cg_links;
1516 struct cgroup *root_cgrp = &root->top_cgroup; 1537 struct cgroup *root_cgrp = &root->top_cgroup;
1517 struct inode *inode;
1518 struct cgroupfs_root *existing_root; 1538 struct cgroupfs_root *existing_root;
1519 const struct cred *cred; 1539 const struct cred *cred;
1520 int i; 1540 int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1528 1548
1529 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
1530 mutex_lock(&cgroup_mutex); 1550 mutex_lock(&cgroup_mutex);
1551 mutex_lock(&cgroup_root_mutex);
1531 1552
1532 if (strlen(root->name)) { 1553 /* Check for name clashes with existing mounts */
1533 /* Check for name clashes with existing mounts */ 1554 ret = -EBUSY;
1534 for_each_active_root(existing_root) { 1555 if (strlen(root->name))
1535 if (!strcmp(existing_root->name, root->name)) { 1556 for_each_active_root(existing_root)
1536 ret = -EBUSY; 1557 if (!strcmp(existing_root->name, root->name))
1537 mutex_unlock(&cgroup_mutex); 1558 goto unlock_drop;
1538 mutex_unlock(&inode->i_mutex);
1539 goto drop_new_super;
1540 }
1541 }
1542 }
1543 1559
1544 /* 1560 /*
1545 * We're accessing css_set_count without locking 1561 * We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1549 * have some link structures left over 1565 * have some link structures left over
1550 */ 1566 */
1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1567 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1552 if (ret) { 1568 if (ret)
1553 mutex_unlock(&cgroup_mutex); 1569 goto unlock_drop;
1554 mutex_unlock(&inode->i_mutex);
1555 goto drop_new_super;
1556 }
1557 1570
1558 ret = rebind_subsystems(root, root->subsys_bits); 1571 ret = rebind_subsystems(root, root->subsys_bits);
1559 if (ret == -EBUSY) { 1572 if (ret == -EBUSY) {
1560 mutex_unlock(&cgroup_mutex);
1561 mutex_unlock(&inode->i_mutex);
1562 free_cg_links(&tmp_cg_links); 1573 free_cg_links(&tmp_cg_links);
1563 goto drop_new_super; 1574 goto unlock_drop;
1564 } 1575 }
1565 /* 1576 /*
1566 * There must be no failure case after here, since rebinding 1577 * There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1599 cred = override_creds(&init_cred); 1610 cred = override_creds(&init_cred);
1600 cgroup_populate_dir(root_cgrp); 1611 cgroup_populate_dir(root_cgrp);
1601 revert_creds(cred); 1612 revert_creds(cred);
1613 mutex_unlock(&cgroup_root_mutex);
1602 mutex_unlock(&cgroup_mutex); 1614 mutex_unlock(&cgroup_mutex);
1603 mutex_unlock(&inode->i_mutex); 1615 mutex_unlock(&inode->i_mutex);
1604 } else { 1616 } else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1615 kfree(opts.name); 1627 kfree(opts.name);
1616 return dget(sb->s_root); 1628 return dget(sb->s_root);
1617 1629
1630 unlock_drop:
1631 mutex_unlock(&cgroup_root_mutex);
1632 mutex_unlock(&cgroup_mutex);
1633 mutex_unlock(&inode->i_mutex);
1618 drop_new_super: 1634 drop_new_super:
1619 deactivate_locked_super(sb); 1635 deactivate_locked_super(sb);
1620 drop_modules: 1636 drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1639 BUG_ON(!list_empty(&cgrp->sibling)); 1655 BUG_ON(!list_empty(&cgrp->sibling));
1640 1656
1641 mutex_lock(&cgroup_mutex); 1657 mutex_lock(&cgroup_mutex);
1658 mutex_lock(&cgroup_root_mutex);
1642 1659
1643 /* Rebind all subsystems back to the default hierarchy */ 1660 /* Rebind all subsystems back to the default hierarchy */
1644 ret = rebind_subsystems(root, 0); 1661 ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1664 root_count--; 1681 root_count--;
1665 } 1682 }
1666 1683
1684 mutex_unlock(&cgroup_root_mutex);
1667 mutex_unlock(&cgroup_mutex); 1685 mutex_unlock(&cgroup_mutex);
1668 1686
1669 kill_litter_super(sb); 1687 kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1740EXPORT_SYMBOL_GPL(cgroup_path); 1758EXPORT_SYMBOL_GPL(cgroup_path);
1741 1759
1742/* 1760/*
1761 * Control Group taskset
1762 */
1763struct task_and_cgroup {
1764 struct task_struct *task;
1765 struct cgroup *cgrp;
1766};
1767
1768struct cgroup_taskset {
1769 struct task_and_cgroup single;
1770 struct flex_array *tc_array;
1771 int tc_array_len;
1772 int idx;
1773 struct cgroup *cur_cgrp;
1774};
1775
1776/**
1777 * cgroup_taskset_first - reset taskset and return the first task
1778 * @tset: taskset of interest
1779 *
1780 * @tset iteration is initialized and the first task is returned.
1781 */
1782struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1783{
1784 if (tset->tc_array) {
1785 tset->idx = 0;
1786 return cgroup_taskset_next(tset);
1787 } else {
1788 tset->cur_cgrp = tset->single.cgrp;
1789 return tset->single.task;
1790 }
1791}
1792EXPORT_SYMBOL_GPL(cgroup_taskset_first);
1793
1794/**
1795 * cgroup_taskset_next - iterate to the next task in taskset
1796 * @tset: taskset of interest
1797 *
1798 * Return the next task in @tset. Iteration must have been initialized
1799 * with cgroup_taskset_first().
1800 */
1801struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1802{
1803 struct task_and_cgroup *tc;
1804
1805 if (!tset->tc_array || tset->idx >= tset->tc_array_len)
1806 return NULL;
1807
1808 tc = flex_array_get(tset->tc_array, tset->idx++);
1809 tset->cur_cgrp = tc->cgrp;
1810 return tc->task;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_taskset_next);
1813
1814/**
1815 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
1816 * @tset: taskset of interest
1817 *
1818 * Return the cgroup for the current (last returned) task of @tset. This
1819 * function must be preceded by either cgroup_taskset_first() or
1820 * cgroup_taskset_next().
1821 */
1822struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
1823{
1824 return tset->cur_cgrp;
1825}
1826EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
1827
1828/**
1829 * cgroup_taskset_size - return the number of tasks in taskset
1830 * @tset: taskset of interest
1831 */
1832int cgroup_taskset_size(struct cgroup_taskset *tset)
1833{
1834 return tset->tc_array ? tset->tc_array_len : 1;
1835}
1836EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1837
1838
1839/*
1743 * cgroup_task_migrate - move a task from one cgroup to another. 1840 * cgroup_task_migrate - move a task from one cgroup to another.
1744 * 1841 *
1745 * 'guarantee' is set if the caller promises that a new css_set for the task 1842 * 'guarantee' is set if the caller promises that a new css_set for the task
1746 * will already exist. If not set, this function might sleep, and can fail with 1843 * will already exist. If not set, this function might sleep, and can fail with
1747 * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1748 */ 1845 */
1749static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1750 struct task_struct *tsk, bool guarantee) 1847 struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1753 struct css_set *newcg; 1850 struct css_set *newcg;
1754 1851
1755 /* 1852 /*
1756 * get old css_set. we need to take task_lock and refcount it, because 1853 * We are synchronized through threadgroup_lock() against PF_EXITING
1757 * an exiting task can change its css_set to init_css_set and drop its 1854 * setting such that we can't race against cgroup_exit() changing the
1758 * old one without taking cgroup_mutex. 1855 * css_set to init_css_set and dropping the old one.
1759 */ 1856 */
1760 task_lock(tsk); 1857 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1761 oldcg = tsk->cgroups; 1858 oldcg = tsk->cgroups;
1762 get_css_set(oldcg);
1763 task_unlock(tsk);
1764 1859
1765 /* locate or allocate a new css_set for this task. */ 1860 /* locate or allocate a new css_set for this task. */
1766 if (guarantee) { 1861 if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1775 might_sleep(); 1870 might_sleep();
1776 /* find_css_set will give us newcg already referenced. */ 1871 /* find_css_set will give us newcg already referenced. */
1777 newcg = find_css_set(oldcg, cgrp); 1872 newcg = find_css_set(oldcg, cgrp);
1778 if (!newcg) { 1873 if (!newcg)
1779 put_css_set(oldcg);
1780 return -ENOMEM; 1874 return -ENOMEM;
1781 }
1782 } 1875 }
1783 put_css_set(oldcg);
1784 1876
1785 /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
1786 task_lock(tsk); 1877 task_lock(tsk);
1787 if (tsk->flags & PF_EXITING) {
1788 task_unlock(tsk);
1789 put_css_set(newcg);
1790 return -ESRCH;
1791 }
1792 rcu_assign_pointer(tsk->cgroups, newcg); 1878 rcu_assign_pointer(tsk->cgroups, newcg);
1793 task_unlock(tsk); 1879 task_unlock(tsk);
1794 1880
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1814 * @cgrp: the cgroup the task is attaching to 1900 * @cgrp: the cgroup the task is attaching to
1815 * @tsk: the task to be attached 1901 * @tsk: the task to be attached
1816 * 1902 *
1817 * Call holding cgroup_mutex. May take task_lock of 1903 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1818 * the task 'tsk' during call. 1904 * @tsk during call.
1819 */ 1905 */
1820int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1906int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1821{ 1907{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1909 struct cgroup_subsys *ss, *failed_ss = NULL;
1824 struct cgroup *oldcgrp; 1910 struct cgroup *oldcgrp;
1825 struct cgroupfs_root *root = cgrp->root; 1911 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { };
1913
1914 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING)
1916 return -ESRCH;
1826 1917
1827 /* Nothing to do if the task is already in that cgroup */ 1918 /* Nothing to do if the task is already in that cgroup */
1828 oldcgrp = task_cgroup_from_root(tsk, root); 1919 oldcgrp = task_cgroup_from_root(tsk, root);
1829 if (cgrp == oldcgrp) 1920 if (cgrp == oldcgrp)
1830 return 0; 1921 return 0;
1831 1922
1923 tset.single.task = tsk;
1924 tset.single.cgrp = oldcgrp;
1925
1832 for_each_subsys(root, ss) { 1926 for_each_subsys(root, ss) {
1833 if (ss->can_attach) { 1927 if (ss->can_attach) {
1834 retval = ss->can_attach(ss, cgrp, tsk); 1928 retval = ss->can_attach(ss, cgrp, &tset);
1835 if (retval) { 1929 if (retval) {
1836 /* 1930 /*
1837 * Remember on which subsystem the can_attach() 1931 * Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1843 goto out; 1937 goto out;
1844 } 1938 }
1845 } 1939 }
1846 if (ss->can_attach_task) {
1847 retval = ss->can_attach_task(cgrp, tsk);
1848 if (retval) {
1849 failed_ss = ss;
1850 goto out;
1851 }
1852 }
1853 } 1940 }
1854 1941
1855 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1857 goto out; 1944 goto out;
1858 1945
1859 for_each_subsys(root, ss) { 1946 for_each_subsys(root, ss) {
1860 if (ss->pre_attach)
1861 ss->pre_attach(cgrp);
1862 if (ss->attach_task)
1863 ss->attach_task(cgrp, tsk);
1864 if (ss->attach) 1947 if (ss->attach)
1865 ss->attach(ss, cgrp, oldcgrp, tsk); 1948 ss->attach(ss, cgrp, &tset);
1866 } 1949 }
1867 1950
1868 synchronize_rcu(); 1951 synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
1884 */ 1967 */
1885 break; 1968 break;
1886 if (ss->cancel_attach) 1969 if (ss->cancel_attach)
1887 ss->cancel_attach(ss, cgrp, tsk); 1970 ss->cancel_attach(ss, cgrp, &tset);
1888 } 1971 }
1889 } 1972 }
1890 return retval; 1973 return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
1935 2018
1936 read_lock(&css_set_lock); 2019 read_lock(&css_set_lock);
1937 newcg = find_existing_css_set(cg, cgrp, template); 2020 newcg = find_existing_css_set(cg, cgrp, template);
1938 if (newcg)
1939 get_css_set(newcg);
1940 read_unlock(&css_set_lock); 2021 read_unlock(&css_set_lock);
1941 2022
1942 /* doesn't exist at all? */ 2023 /* doesn't exist at all? */
1943 if (!newcg) 2024 if (!newcg)
1944 return false; 2025 return false;
1945 /* see if it's already in the list */ 2026 /* see if it's already in the list */
1946 list_for_each_entry(cg_entry, newcg_list, links) { 2027 list_for_each_entry(cg_entry, newcg_list, links)
1947 if (cg_entry->cg == newcg) { 2028 if (cg_entry->cg == newcg)
1948 put_css_set(newcg);
1949 return true; 2029 return true;
1950 }
1951 }
1952 2030
1953 /* not found */ 2031 /* not found */
1954 put_css_set(newcg);
1955 return false; 2032 return false;
1956} 2033}
1957 2034
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
1985 * @cgrp: the cgroup to attach to 2062 * @cgrp: the cgroup to attach to
1986 * @leader: the threadgroup leader task_struct of the group to be attached 2063 * @leader: the threadgroup leader task_struct of the group to be attached
1987 * 2064 *
1988 * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2065 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
1989 * take task_lock of each thread in leader's threadgroup individually in turn. 2066 * task_lock of each thread in leader's threadgroup individually in turn.
1990 */ 2067 */
1991int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2068static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
1992{ 2069{
1993 int retval, i, group_size; 2070 int retval, i, group_size;
1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2071 struct cgroup_subsys *ss, *failed_ss = NULL;
1995 bool cancel_failed_ss = false;
1996 /* guaranteed to be initialized later, but the compiler needs this */ 2072 /* guaranteed to be initialized later, but the compiler needs this */
1997 struct cgroup *oldcgrp = NULL;
1998 struct css_set *oldcg; 2073 struct css_set *oldcg;
1999 struct cgroupfs_root *root = cgrp->root; 2074 struct cgroupfs_root *root = cgrp->root;
2000 /* threadgroup list cursor and array */ 2075 /* threadgroup list cursor and array */
2001 struct task_struct *tsk; 2076 struct task_struct *tsk;
2077 struct task_and_cgroup *tc;
2002 struct flex_array *group; 2078 struct flex_array *group;
2079 struct cgroup_taskset tset = { };
2003 /* 2080 /*
2004 * we need to make sure we have css_sets for all the tasks we're 2081 * we need to make sure we have css_sets for all the tasks we're
2005 * going to move -before- we actually start moving them, so that in 2082 * going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2012 * step 0: in order to do expensive, possibly blocking operations for 2089 * step 0: in order to do expensive, possibly blocking operations for
2013 * every thread, we cannot iterate the thread group list, since it needs 2090 * every thread, we cannot iterate the thread group list, since it needs
2014 * rcu or tasklist locked. instead, build an array of all threads in the 2091 * rcu or tasklist locked. instead, build an array of all threads in the
2015 * group - threadgroup_fork_lock prevents new threads from appearing, 2092 * group - group_rwsem prevents new threads from appearing, and if
2016 * and if threads exit, this will just be an over-estimate. 2093 * threads exit, this will just be an over-estimate.
2017 */ 2094 */
2018 group_size = get_nr_threads(leader); 2095 group_size = get_nr_threads(leader);
2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2096 /* flex_array supports very large thread-groups better than kmalloc. */
2020 group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2097 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2021 GFP_KERNEL);
2022 if (!group) 2098 if (!group)
2023 return -ENOMEM; 2099 return -ENOMEM;
2024 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2100 /* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2040 retval = -EAGAIN; 2116 retval = -EAGAIN;
2041 goto out_free_group_list; 2117 goto out_free_group_list;
2042 } 2118 }
2043 /* take a reference on each task in the group to go in the array. */ 2119
2044 tsk = leader; 2120 tsk = leader;
2045 i = 0; 2121 i = 0;
2046 do { 2122 do {
2123 struct task_and_cgroup ent;
2124
2125 /* @tsk either already exited or can't exit until the end */
2126 if (tsk->flags & PF_EXITING)
2127 continue;
2128
2047 /* as per above, nr_threads may decrease, but not increase. */ 2129 /* as per above, nr_threads may decrease, but not increase. */
2048 BUG_ON(i >= group_size); 2130 BUG_ON(i >= group_size);
2049 get_task_struct(tsk);
2050 /* 2131 /*
2051 * saying GFP_ATOMIC has no effect here because we did prealloc 2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2052 * earlier, but it's good form to communicate our expectations. 2133 * earlier, but it's good form to communicate our expectations.
2053 */ 2134 */
2054 retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2135 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp)
2139 continue;
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2055 BUG_ON(retval != 0); 2141 BUG_ON(retval != 0);
2056 i++; 2142 i++;
2057 } while_each_thread(leader, tsk); 2143 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2144 /* remember the number of threads in the array for later. */
2059 group_size = i; 2145 group_size = i;
2146 tset.tc_array = group;
2147 tset.tc_array_len = group_size;
2060 read_unlock(&tasklist_lock); 2148 read_unlock(&tasklist_lock);
2061 2149
2150 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0;
2152 if (!group_size)
2153 goto out_free_group_list;
2154
2062 /* 2155 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2156 * step 1: check that we can legitimately attach to the cgroup.
2064 */ 2157 */
2065 for_each_subsys(root, ss) { 2158 for_each_subsys(root, ss) {
2066 if (ss->can_attach) { 2159 if (ss->can_attach) {
2067 retval = ss->can_attach(ss, cgrp, leader); 2160 retval = ss->can_attach(ss, cgrp, &tset);
2068 if (retval) { 2161 if (retval) {
2069 failed_ss = ss; 2162 failed_ss = ss;
2070 goto out_cancel_attach; 2163 goto out_cancel_attach;
2071 } 2164 }
2072 } 2165 }
2073 /* a callback to be run on every thread in the threadgroup. */
2074 if (ss->can_attach_task) {
2075 /* run on each task in the threadgroup. */
2076 for (i = 0; i < group_size; i++) {
2077 tsk = flex_array_get_ptr(group, i);
2078 retval = ss->can_attach_task(cgrp, tsk);
2079 if (retval) {
2080 failed_ss = ss;
2081 cancel_failed_ss = true;
2082 goto out_cancel_attach;
2083 }
2084 }
2085 }
2086 } 2166 }
2087 2167
2088 /* 2168 /*
@@ -2091,72 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2091 */ 2171 */
2092 INIT_LIST_HEAD(&newcg_list); 2172 INIT_LIST_HEAD(&newcg_list);
2093 for (i = 0; i < group_size; i++) { 2173 for (i = 0; i < group_size; i++) {
2094 tsk = flex_array_get_ptr(group, i); 2174 tc = flex_array_get(group, i);
2095 /* nothing to do if this task is already in the cgroup */ 2175 oldcg = tc->task->cgroups;
2096 oldcgrp = task_cgroup_from_root(tsk, root); 2176
2097 if (cgrp == oldcgrp) 2177 /* if we don't already have it in the list get a new one */
2098 continue; 2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg,
2099 /* get old css_set pointer */ 2179 &newcg_list)) {
2100 task_lock(tsk);
2101 if (tsk->flags & PF_EXITING) {
2102 /* ignore this task if it's going away */
2103 task_unlock(tsk);
2104 continue;
2105 }
2106 oldcg = tsk->cgroups;
2107 get_css_set(oldcg);
2108 task_unlock(tsk);
2109 /* see if the new one for us is already in the list? */
2110 if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
2111 /* was already there, nothing to do. */
2112 put_css_set(oldcg);
2113 } else {
2114 /* we don't already have it. get new one. */
2115 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2116 put_css_set(oldcg);
2117 if (retval) 2181 if (retval)
2118 goto out_list_teardown; 2182 goto out_list_teardown;
2119 } 2183 }
2120 } 2184 }
2121 2185
2122 /* 2186 /*
2123 * step 3: now that we're guaranteed success wrt the css_sets, proceed 2187 * step 3: now that we're guaranteed success wrt the css_sets,
2124 * to move all tasks to the new cgroup, calling ss->attach_task for each 2188 * proceed to move all tasks to the new cgroup. There are no
2125 * one along the way. there are no failure cases after here, so this is 2189 * failure cases after here, so this is the commit point.
2126 * the commit point.
2127 */ 2190 */
2128 for_each_subsys(root, ss) {
2129 if (ss->pre_attach)
2130 ss->pre_attach(cgrp);
2131 }
2132 for (i = 0; i < group_size; i++) { 2191 for (i = 0; i < group_size; i++) {
2133 tsk = flex_array_get_ptr(group, i); 2192 tc = flex_array_get(group, i);
2134 /* leave current thread as it is if it's already there */ 2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
2135 oldcgrp = task_cgroup_from_root(tsk, root); 2194 BUG_ON(retval);
2136 if (cgrp == oldcgrp)
2137 continue;
2138 /* if the thread is PF_EXITING, it can just get skipped. */
2139 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2140 if (retval == 0) {
2141 /* attach each task to each subsystem */
2142 for_each_subsys(root, ss) {
2143 if (ss->attach_task)
2144 ss->attach_task(cgrp, tsk);
2145 }
2146 } else {
2147 BUG_ON(retval != -ESRCH);
2148 }
2149 } 2195 }
2150 /* nothing is sensitive to fork() after this point. */ 2196 /* nothing is sensitive to fork() after this point. */
2151 2197
2152 /* 2198 /*
2153 * step 4: do expensive, non-thread-specific subsystem callbacks. 2199 * step 4: do subsystem attach callbacks.
2154 * TODO: if ever a subsystem needs to know the oldcgrp for each task
2155 * being moved, this call will need to be reworked to communicate that.
2156 */ 2200 */
2157 for_each_subsys(root, ss) { 2201 for_each_subsys(root, ss) {
2158 if (ss->attach) 2202 if (ss->attach)
2159 ss->attach(ss, cgrp, oldcgrp, leader); 2203 ss->attach(ss, cgrp, &tset);
2160 } 2204 }
2161 2205
2162 /* 2206 /*
@@ -2176,20 +2220,12 @@ out_cancel_attach:
2176 /* same deal as in cgroup_attach_task */ 2220 /* same deal as in cgroup_attach_task */
2177 if (retval) { 2221 if (retval) {
2178 for_each_subsys(root, ss) { 2222 for_each_subsys(root, ss) {
2179 if (ss == failed_ss) { 2223 if (ss == failed_ss)
2180 if (cancel_failed_ss && ss->cancel_attach)
2181 ss->cancel_attach(ss, cgrp, leader);
2182 break; 2224 break;
2183 }
2184 if (ss->cancel_attach) 2225 if (ss->cancel_attach)
2185 ss->cancel_attach(ss, cgrp, leader); 2226 ss->cancel_attach(ss, cgrp, &tset);
2186 } 2227 }
2187 } 2228 }
2188 /* clean up the array of referenced threads in the group. */
2189 for (i = 0; i < group_size; i++) {
2190 tsk = flex_array_get_ptr(group, i);
2191 put_task_struct(tsk);
2192 }
2193out_free_group_list: 2229out_free_group_list:
2194 flex_array_free(group); 2230 flex_array_free(group);
2195 return retval; 2231 return retval;
@@ -2197,8 +2233,8 @@ out_free_group_list:
2197 2233
2198/* 2234/*
2199 * Find the task_struct of the task to attach by vpid and pass it along to the 2235 * Find the task_struct of the task to attach by vpid and pass it along to the
2200 * function to attach either it or all tasks in its threadgroup. Will take 2236 * function to attach either it or all tasks in its threadgroup. Will lock
2201 * cgroup_mutex; may take task_lock of task. 2237 * cgroup_mutex and threadgroup; may take task_lock of task.
2202 */ 2238 */
2203static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2239static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2204{ 2240{
@@ -2225,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2225 * detect it later. 2261 * detect it later.
2226 */ 2262 */
2227 tsk = tsk->group_leader; 2263 tsk = tsk->group_leader;
2228 } else if (tsk->flags & PF_EXITING) {
2229 /* optimization for the single-task-only case */
2230 rcu_read_unlock();
2231 cgroup_unlock();
2232 return -ESRCH;
2233 } 2264 }
2234
2235 /* 2265 /*
2236 * even if we're attaching all tasks in the thread group, we 2266 * even if we're attaching all tasks in the thread group, we
2237 * only need to check permissions on one of them. 2267 * only need to check permissions on one of them.
@@ -2254,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2254 get_task_struct(tsk); 2284 get_task_struct(tsk);
2255 } 2285 }
2256 2286
2257 if (threadgroup) { 2287 threadgroup_lock(tsk);
2258 threadgroup_fork_write_lock(tsk); 2288
2289 if (threadgroup)
2259 ret = cgroup_attach_proc(cgrp, tsk); 2290 ret = cgroup_attach_proc(cgrp, tsk);
2260 threadgroup_fork_write_unlock(tsk); 2291 else
2261 } else {
2262 ret = cgroup_attach_task(cgrp, tsk); 2292 ret = cgroup_attach_task(cgrp, tsk);
2263 } 2293
2294 threadgroup_unlock(tsk);
2295
2264 put_task_struct(tsk); 2296 put_task_struct(tsk);
2265 cgroup_unlock(); 2297 cgroup_unlock();
2266 return ret; 2298 return ret;
@@ -2311,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2311 return -EINVAL; 2343 return -EINVAL;
2312 if (!cgroup_lock_live_group(cgrp)) 2344 if (!cgroup_lock_live_group(cgrp))
2313 return -ENODEV; 2345 return -ENODEV;
2346 mutex_lock(&cgroup_root_mutex);
2314 strcpy(cgrp->root->release_agent_path, buffer); 2347 strcpy(cgrp->root->release_agent_path, buffer);
2348 mutex_unlock(&cgroup_root_mutex);
2315 cgroup_unlock(); 2349 cgroup_unlock();
2316 return 0; 2350 return 0;
2317} 2351}
@@ -2590,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)
2590 return __d_cft(file->f_dentry); 2624 return __d_cft(file->f_dentry);
2591} 2625}
2592 2626
2593static int cgroup_create_file(struct dentry *dentry, mode_t mode, 2627static int cgroup_create_file(struct dentry *dentry, umode_t mode,
2594 struct super_block *sb) 2628 struct super_block *sb)
2595{ 2629{
2596 struct inode *inode; 2630 struct inode *inode;
@@ -2631,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
2631 * @mode: mode to set on new directory. 2665 * @mode: mode to set on new directory.
2632 */ 2666 */
2633static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 2667static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2634 mode_t mode) 2668 umode_t mode)
2635{ 2669{
2636 struct dentry *parent; 2670 struct dentry *parent;
2637 int error = 0; 2671 int error = 0;
@@ -2658,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
2658 * returns S_IRUGO if it has only a read handler 2692 * returns S_IRUGO if it has only a read handler
2659 * returns S_IWUSR if it has only a write handler 2693 * returns S_IWUSR if it has only a write handler
2660 */ 2694 */
2661static mode_t cgroup_file_mode(const struct cftype *cft) 2695static umode_t cgroup_file_mode(const struct cftype *cft)
2662{ 2696{
2663 mode_t mode = 0; 2697 umode_t mode = 0;
2664 2698
2665 if (cft->mode) 2699 if (cft->mode)
2666 return cft->mode; 2700 return cft->mode;
@@ -2683,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,
2683 struct dentry *dir = cgrp->dentry; 2717 struct dentry *dir = cgrp->dentry;
2684 struct dentry *dentry; 2718 struct dentry *dentry;
2685 int error; 2719 int error;
2686 mode_t mode; 2720 umode_t mode;
2687 2721
2688 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2722 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2689 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2723 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -2794,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
2794} 2828}
2795 2829
2796void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2830void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
2831 __acquires(css_set_lock)
2797{ 2832{
2798 /* 2833 /*
2799 * The first time anyone tries to iterate across a cgroup, 2834 * The first time anyone tries to iterate across a cgroup,
@@ -2833,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
2833} 2868}
2834 2869
2835void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2870void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
2871 __releases(css_set_lock)
2836{ 2872{
2837 read_unlock(&css_set_lock); 2873 read_unlock(&css_set_lock);
2838} 2874}
@@ -3757,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
3757 * Must be called with the mutex on the parent inode held 3793 * Must be called with the mutex on the parent inode held
3758 */ 3794 */
3759static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 3795static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3760 mode_t mode) 3796 umode_t mode)
3761{ 3797{
3762 struct cgroup *cgrp; 3798 struct cgroup *cgrp;
3763 struct cgroupfs_root *root = parent->root; 3799 struct cgroupfs_root *root = parent->root;
@@ -3851,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3851 return err; 3887 return err;
3852} 3888}
3853 3889
3854static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 3890static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3855{ 3891{
3856 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 3892 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
3857 3893
@@ -4496,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
4496 * 4532 *
4497 * A pointer to the shared css_set was automatically copied in 4533 * A pointer to the shared css_set was automatically copied in
4498 * fork.c by dup_task_struct(). However, we ignore that copy, since 4534 * fork.c by dup_task_struct(). However, we ignore that copy, since
4499 * it was not made under the protection of RCU or cgroup_mutex, so 4535 * it was not made under the protection of RCU, cgroup_mutex or
4500 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4536 * threadgroup_change_begin(), so it might no longer be a valid
4501 * have already changed current->cgroups, allowing the previously 4537 * cgroup pointer. cgroup_attach_task() might have already changed
4502 * referenced cgroup group to be removed and freed. 4538 * current->cgroups, allowing the previously referenced cgroup
4539 * group to be removed and freed.
4540 *
4541 * Outside the pointer validity we also need to process the css_set
4542 * inheritance between threadgroup_change_begin() and
4543 * threadgroup_change_end(), this way there is no leak in any process
4544 * wide migration performed by cgroup_attach_proc() that could otherwise
4545 * miss a thread because it is too early or too late in the fork stage.
4503 * 4546 *
4504 * At the point that cgroup_fork() is called, 'current' is the parent 4547 * At the point that cgroup_fork() is called, 'current' is the parent
4505 * task, and the passed argument 'child' points to the child task. 4548 * task, and the passed argument 'child' points to the child task.
4506 */ 4549 */
4507void cgroup_fork(struct task_struct *child) 4550void cgroup_fork(struct task_struct *child)
4508{ 4551{
4509 task_lock(current); 4552 /*
4553 * We don't need to task_lock() current because current->cgroups
4554 * can't be changed concurrently here. The parent obviously hasn't
4555 * exited and called cgroup_exit(), and we are synchronized against
4556 * cgroup migration through threadgroup_change_begin().
4557 */
4510 child->cgroups = current->cgroups; 4558 child->cgroups = current->cgroups;
4511 get_css_set(child->cgroups); 4559 get_css_set(child->cgroups);
4512 task_unlock(current);
4513 INIT_LIST_HEAD(&child->cg_list); 4560 INIT_LIST_HEAD(&child->cg_list);
4514} 4561}
4515 4562
@@ -4551,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
4551{ 4598{
4552 if (use_task_css_set_links) { 4599 if (use_task_css_set_links) {
4553 write_lock(&css_set_lock); 4600 write_lock(&css_set_lock);
4554 task_lock(child); 4601 if (list_empty(&child->cg_list)) {
4555 if (list_empty(&child->cg_list)) 4602 /*
4603 * It's safe to use child->cgroups without task_lock()
4604 * here because we are protected through
4605 * threadgroup_change_begin() against concurrent
4606 * css_set change in cgroup_task_migrate(). Also
4607 * the task can't exit at that point until
4608 * wake_up_new_task() is called, so we are protected
4609 * against cgroup_exit() setting child->cgroups to
4610 * init_css_set.
4611 */
4556 list_add(&child->cg_list, &child->cgroups->tasks); 4612 list_add(&child->cg_list, &child->cgroups->tasks);
4557 task_unlock(child); 4613 }
4558 write_unlock(&css_set_lock); 4614 write_unlock(&css_set_lock);
4559 } 4615 }
4560} 4616}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 5e828a2ca8e6..fc0646b78a64 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
48 struct freezer, css); 48 struct freezer, css);
49} 49}
50 50
51static inline int __cgroup_freezing_or_frozen(struct task_struct *task) 51bool cgroup_freezing(struct task_struct *task)
52{ 52{
53 enum freezer_state state = task_freezer(task)->state; 53 enum freezer_state state;
54 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); 54 bool ret;
55}
56 55
57int cgroup_freezing_or_frozen(struct task_struct *task) 56 rcu_read_lock();
58{ 57 state = task_freezer(task)->state;
59 int result; 58 ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
60 task_lock(task); 59 rcu_read_unlock();
61 result = __cgroup_freezing_or_frozen(task); 60
62 task_unlock(task); 61 return ret;
63 return result;
64} 62}
65 63
66/* 64/*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
102 * freezer_can_attach(): 100 * freezer_can_attach():
103 * cgroup_mutex (held by caller of can_attach) 101 * cgroup_mutex (held by caller of can_attach)
104 * 102 *
105 * cgroup_freezing_or_frozen():
106 * task->alloc_lock (to get task's cgroup)
107 *
108 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 103 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
109 * freezer->lock 104 * freezer->lock
110 * sighand->siglock (if the cgroup is freezing) 105 * sighand->siglock (if the cgroup is freezing)
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;
130 * write_lock css_set_lock (cgroup iterator start) 125 * write_lock css_set_lock (cgroup iterator start)
131 * task->alloc_lock 126 * task->alloc_lock
132 * read_lock css_set_lock (cgroup iterator start) 127 * read_lock css_set_lock (cgroup iterator start)
133 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
134 * sighand->siglock 129 * sighand->siglock
135 */ 130 */
136static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -150,7 +145,18 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
150static void freezer_destroy(struct cgroup_subsys *ss, 145static void freezer_destroy(struct cgroup_subsys *ss,
151 struct cgroup *cgroup) 146 struct cgroup *cgroup)
152{ 147{
153 kfree(cgroup_freezer(cgroup)); 148 struct freezer *freezer = cgroup_freezer(cgroup);
149
150 if (freezer->state != CGROUP_THAWED)
151 atomic_dec(&system_freezing_cnt);
152 kfree(freezer);
153}
154
155/* task is frozen or will freeze immediately when next it gets woken */
156static bool is_task_frozen_enough(struct task_struct *task)
157{
158 return frozen(task) ||
159 (task_is_stopped_or_traced(task) && freezing(task));
154} 160}
155 161
156/* 162/*
@@ -160,13 +166,17 @@ static void freezer_destroy(struct cgroup_subsys *ss,
160 */ 166 */
161static int freezer_can_attach(struct cgroup_subsys *ss, 167static int freezer_can_attach(struct cgroup_subsys *ss,
162 struct cgroup *new_cgroup, 168 struct cgroup *new_cgroup,
163 struct task_struct *task) 169 struct cgroup_taskset *tset)
164{ 170{
165 struct freezer *freezer; 171 struct freezer *freezer;
172 struct task_struct *task;
166 173
167 /* 174 /*
168 * Anything frozen can't move or be moved to/from. 175 * Anything frozen can't move or be moved to/from.
169 */ 176 */
177 cgroup_taskset_for_each(task, new_cgroup, tset)
178 if (cgroup_freezing(task))
179 return -EBUSY;
170 180
171 freezer = cgroup_freezer(new_cgroup); 181 freezer = cgroup_freezer(new_cgroup);
172 if (freezer->state != CGROUP_THAWED) 182 if (freezer->state != CGROUP_THAWED)
@@ -175,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
175 return 0; 185 return 0;
176} 186}
177 187
178static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
179{
180 rcu_read_lock();
181 if (__cgroup_freezing_or_frozen(tsk)) {
182 rcu_read_unlock();
183 return -EBUSY;
184 }
185 rcu_read_unlock();
186 return 0;
187}
188
189static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
190{ 189{
191 struct freezer *freezer; 190 struct freezer *freezer;
@@ -213,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
213 212
214 /* Locking avoids race with FREEZING -> THAWED transitions. */ 213 /* Locking avoids race with FREEZING -> THAWED transitions. */
215 if (freezer->state == CGROUP_FREEZING) 214 if (freezer->state == CGROUP_FREEZING)
216 freeze_task(task, true); 215 freeze_task(task);
217 spin_unlock_irq(&freezer->lock); 216 spin_unlock_irq(&freezer->lock);
218} 217}
219 218
@@ -231,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,
231 cgroup_iter_start(cgroup, &it); 230 cgroup_iter_start(cgroup, &it);
232 while ((task = cgroup_iter_next(cgroup, &it))) { 231 while ((task = cgroup_iter_next(cgroup, &it))) {
233 ntotal++; 232 ntotal++;
234 if (frozen(task)) 233 if (freezing(task) && is_task_frozen_enough(task))
235 nfrozen++; 234 nfrozen++;
236 } 235 }
237 236
@@ -279,12 +278,11 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
279 struct task_struct *task; 278 struct task_struct *task;
280 unsigned int num_cant_freeze_now = 0; 279 unsigned int num_cant_freeze_now = 0;
281 280
282 freezer->state = CGROUP_FREEZING;
283 cgroup_iter_start(cgroup, &it); 281 cgroup_iter_start(cgroup, &it);
284 while ((task = cgroup_iter_next(cgroup, &it))) { 282 while ((task = cgroup_iter_next(cgroup, &it))) {
285 if (!freeze_task(task, true)) 283 if (!freeze_task(task))
286 continue; 284 continue;
287 if (frozen(task)) 285 if (is_task_frozen_enough(task))
288 continue; 286 continue;
289 if (!freezing(task) && !freezer_should_skip(task)) 287 if (!freezing(task) && !freezer_should_skip(task))
290 num_cant_freeze_now++; 288 num_cant_freeze_now++;
@@ -300,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
300 struct task_struct *task; 298 struct task_struct *task;
301 299
302 cgroup_iter_start(cgroup, &it); 300 cgroup_iter_start(cgroup, &it);
303 while ((task = cgroup_iter_next(cgroup, &it))) { 301 while ((task = cgroup_iter_next(cgroup, &it)))
304 thaw_process(task); 302 __thaw_task(task);
305 }
306 cgroup_iter_end(cgroup, &it); 303 cgroup_iter_end(cgroup, &it);
307
308 freezer->state = CGROUP_THAWED;
309} 304}
310 305
311static int freezer_change_state(struct cgroup *cgroup, 306static int freezer_change_state(struct cgroup *cgroup,
@@ -319,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,
319 spin_lock_irq(&freezer->lock); 314 spin_lock_irq(&freezer->lock);
320 315
321 update_if_frozen(cgroup, freezer); 316 update_if_frozen(cgroup, freezer);
322 if (goal_state == freezer->state)
323 goto out;
324 317
325 switch (goal_state) { 318 switch (goal_state) {
326 case CGROUP_THAWED: 319 case CGROUP_THAWED:
320 if (freezer->state != CGROUP_THAWED)
321 atomic_dec(&system_freezing_cnt);
322 freezer->state = CGROUP_THAWED;
327 unfreeze_cgroup(cgroup, freezer); 323 unfreeze_cgroup(cgroup, freezer);
328 break; 324 break;
329 case CGROUP_FROZEN: 325 case CGROUP_FROZEN:
326 if (freezer->state == CGROUP_THAWED)
327 atomic_inc(&system_freezing_cnt);
328 freezer->state = CGROUP_FREEZING;
330 retval = try_to_freeze_cgroup(cgroup, freezer); 329 retval = try_to_freeze_cgroup(cgroup, freezer);
331 break; 330 break;
332 default: 331 default:
333 BUG(); 332 BUG();
334 } 333 }
335out: 334
336 spin_unlock_irq(&freezer->lock); 335 spin_unlock_irq(&freezer->lock);
337 336
338 return retval; 337 return retval;
@@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
381 .populate = freezer_populate, 380 .populate = freezer_populate,
382 .subsys_id = freezer_subsys_id, 381 .subsys_id = freezer_subsys_id,
383 .can_attach = freezer_can_attach, 382 .can_attach = freezer_can_attach,
384 .can_attach_task = freezer_can_attach_task,
385 .pre_attach = NULL,
386 .attach_task = NULL,
387 .attach = NULL,
388 .fork = freezer_fork, 383 .fork = freezer_fork,
389 .exit = NULL,
390}; 384};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..2060c6e57027 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
178 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
179 for_each_process(p) { 179 for_each_process(p) {
180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
181 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
182 !cputime_eq(p->stime, cputime_zero)))
183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
184 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
185 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
@@ -380,6 +379,7 @@ out:
380 cpu_maps_update_done(); 379 cpu_maps_update_done();
381 return err; 380 return err;
382} 381}
382EXPORT_SYMBOL_GPL(cpu_up);
383 383
384#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
385static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
@@ -470,7 +470,7 @@ out:
470 cpu_maps_update_done(); 470 cpu_maps_update_done();
471} 471}
472 472
473static int alloc_frozen_cpus(void) 473static int __init alloc_frozen_cpus(void)
474{ 474{
475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) 475 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
476 return -ENOMEM; 476 return -ENOMEM;
@@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
543} 543}
544 544
545 545
546int cpu_hotplug_pm_sync_init(void) 546static int __init cpu_hotplug_pm_sync_init(void)
547{ 547{
548 pm_notifier(cpu_hotplug_pm_callback, 0); 548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0; 549 return 0;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fe58c46a426..a09ac2b9a661 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
123 struct cpuset, css); 123 struct cpuset, css);
124} 124}
125 125
126#ifdef CONFIG_NUMA
127static inline bool task_has_mempolicy(struct task_struct *task)
128{
129 return task->mempolicy;
130}
131#else
132static inline bool task_has_mempolicy(struct task_struct *task)
133{
134 return false;
135}
136#endif
137
138
126/* bits in struct cpuset flags field */ 139/* bits in struct cpuset flags field */
127typedef enum { 140typedef enum {
128 CS_CPU_EXCLUSIVE, 141 CS_CPU_EXCLUSIVE,
@@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949static void cpuset_change_task_nodemask(struct task_struct *tsk, 962static void cpuset_change_task_nodemask(struct task_struct *tsk,
950 nodemask_t *newmems) 963 nodemask_t *newmems)
951{ 964{
952 bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); 965 bool need_loop;
953 966
954repeat: 967repeat:
955 /* 968 /*
@@ -962,6 +975,14 @@ repeat:
962 return; 975 return;
963 976
964 task_lock(tsk); 977 task_lock(tsk);
978 /*
979 * Determine if a loop is necessary if another thread is doing
980 * get_mems_allowed(). If at least one node remains unchanged and
981 * tsk does not have a mempolicy, then an empty nodemask will not be
982 * possible when mems_allowed is larger than a word.
983 */
984 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed);
965 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
966 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
967 988
@@ -981,11 +1002,9 @@ repeat:
981 1002
982 /* 1003 /*
983 * Allocation of memory is very fast, we needn't sleep when waiting 1004 * Allocation of memory is very fast, we needn't sleep when waiting
984 * for the read-side. No wait is necessary, however, if at least one 1005 * for the read-side.
985 * node remains unchanged.
986 */ 1006 */
987 while (masks_disjoint && 1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
988 ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
989 task_unlock(tsk); 1008 task_unlock(tsk);
990 if (!task_curr(tsk)) 1009 if (!task_curr(tsk))
991 yield(); 1010 yield();
@@ -1370,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
1370 return val; 1389 return val;
1371} 1390}
1372 1391
1373/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1374static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1375 struct task_struct *tsk)
1376{
1377 struct cpuset *cs = cgroup_cs(cont);
1378
1379 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1380 return -ENOSPC;
1381
1382 /*
1383 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1384 * cannot change their cpu affinity and isolating such threads by their
1385 * set of allowed nodes is unnecessary. Thus, cpusets are not
1386 * applicable for such threads. This prevents checking for success of
1387 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
1388 * be changed.
1389 */
1390 if (tsk->flags & PF_THREAD_BOUND)
1391 return -EINVAL;
1392
1393 return 0;
1394}
1395
1396static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
1397{
1398 return security_task_setscheduler(task);
1399}
1400
1401/* 1392/*
1402 * Protected by cgroup_lock. The nodemasks must be stored globally because 1393 * Protected by cgroup_lock. The nodemasks must be stored globally because
1403 * dynamically allocating them is not allowed in pre_attach, and they must 1394 * dynamically allocating them is not allowed in can_attach, and they must
1404 * persist among pre_attach, attach_task, and attach. 1395 * persist until attach.
1405 */ 1396 */
1406static cpumask_var_t cpus_attach; 1397static cpumask_var_t cpus_attach;
1407static nodemask_t cpuset_attach_nodemask_from; 1398static nodemask_t cpuset_attach_nodemask_from;
1408static nodemask_t cpuset_attach_nodemask_to; 1399static nodemask_t cpuset_attach_nodemask_to;
1409 1400
1410/* Set-up work for before attaching each task. */ 1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1411static void cpuset_pre_attach(struct cgroup *cont) 1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1403 struct cgroup_taskset *tset)
1412{ 1404{
1413 struct cpuset *cs = cgroup_cs(cont); 1405 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task;
1407 int ret;
1414 1408
1409 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1410 return -ENOSPC;
1411
1412 cgroup_taskset_for_each(task, cgrp, tset) {
1413 /*
1414 * Kthreads bound to specific cpus cannot be moved to a new
1415 * cpuset; we cannot change their cpu affinity and
1416 * isolating such threads by their set of allowed nodes is
1417 * unnecessary. Thus, cpusets are not applicable for such
1418 * threads. This prevents checking for success of
1419 * set_cpus_allowed_ptr() on all attached tasks before
1420 * cpus_allowed may be changed.
1421 */
1422 if (task->flags & PF_THREAD_BOUND)
1423 return -EINVAL;
1424 if ((ret = security_task_setscheduler(task)))
1425 return ret;
1426 }
1427
1428 /* prepare for attach */
1415 if (cs == &top_cpuset) 1429 if (cs == &top_cpuset)
1416 cpumask_copy(cpus_attach, cpu_possible_mask); 1430 cpumask_copy(cpus_attach, cpu_possible_mask);
1417 else 1431 else
1418 guarantee_online_cpus(cs, cpus_attach); 1432 guarantee_online_cpus(cs, cpus_attach);
1419 1433
1420 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
1421}
1422
1423/* Per-thread attachment work. */
1424static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
1425{
1426 int err;
1427 struct cpuset *cs = cgroup_cs(cont);
1428 1435
1429 /* 1436 return 0;
1430 * can_attach beforehand should guarantee that this doesn't fail.
1431 * TODO: have a better way to handle failure here
1432 */
1433 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1434 WARN_ON_ONCE(err);
1435
1436 cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
1437 cpuset_update_task_spread_flag(cs, tsk);
1438} 1437}
1439 1438
1440static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1441 struct cgroup *oldcont, struct task_struct *tsk) 1440 struct cgroup_taskset *tset)
1442{ 1441{
1443 struct mm_struct *mm; 1442 struct mm_struct *mm;
1444 struct cpuset *cs = cgroup_cs(cont); 1443 struct task_struct *task;
1445 struct cpuset *oldcs = cgroup_cs(oldcont); 1444 struct task_struct *leader = cgroup_taskset_first(tset);
1445 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1446 struct cpuset *cs = cgroup_cs(cgrp);
1447 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1448
1449 cgroup_taskset_for_each(task, cgrp, tset) {
1450 /*
1451 * can_attach beforehand should guarantee that this doesn't
1452 * fail. TODO: have a better way to handle failure here
1453 */
1454 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
1455
1456 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
1457 cpuset_update_task_spread_flag(cs, task);
1458 }
1446 1459
1447 /* 1460 /*
1448 * Change mm, possibly for multiple threads in a threadgroup. This is 1461 * Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1450,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1450 */ 1463 */
1451 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1464 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1452 cpuset_attach_nodemask_to = cs->mems_allowed; 1465 cpuset_attach_nodemask_to = cs->mems_allowed;
1453 mm = get_task_mm(tsk); 1466 mm = get_task_mm(leader);
1454 if (mm) { 1467 if (mm) {
1455 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1468 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1456 if (is_memory_migrate(cs)) 1469 if (is_memory_migrate(cs))
@@ -1906,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
1906 .create = cpuset_create, 1919 .create = cpuset_create,
1907 .destroy = cpuset_destroy, 1920 .destroy = cpuset_destroy,
1908 .can_attach = cpuset_can_attach, 1921 .can_attach = cpuset_can_attach,
1909 .can_attach_task = cpuset_can_attach_task,
1910 .pre_attach = cpuset_pre_attach,
1911 .attach_task = cpuset_attach_task,
1912 .attach = cpuset_attach, 1922 .attach = cpuset_attach,
1913 .populate = cpuset_populate, 1923 .populate = cpuset_populate,
1914 .post_clone = cpuset_post_clone, 1924 .post_clone = cpuset_post_clone,
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3cd..e2ae7349437f 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
1982 kdb_printf("%-20s%8u 0x%p ", mod->name, 1982 kdb_printf("%-20s%8u 0x%p ", mod->name,
1983 mod->core_size, (void *)mod); 1983 mod->core_size, (void *)mod);
1984#ifdef CONFIG_MODULE_UNLOAD 1984#ifdef CONFIG_MODULE_UNLOAD
1985 kdb_printf("%4d ", module_refcount(mod)); 1985 kdb_printf("%4ld ", module_refcount(mod));
1986#endif 1986#endif
1987 if (mod->state == MODULE_STATE_GOING) 1987 if (mod->state == MODULE_STATE_GOING)
1988 kdb_printf(" (Unloading)"); 1988 kdb_printf(" (Unloading)");
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o 5obj-y := core.o ring_buffer.o callchain.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..6581a040f399
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,189 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118exit:
119 mutex_unlock(&callchain_mutex);
120
121 return err;
122}
123
124void put_callchain_buffers(void)
125{
126 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
127 release_callchain_buffers();
128 mutex_unlock(&callchain_mutex);
129 }
130}
131
132static struct perf_callchain_entry *get_callchain_entry(int *rctx)
133{
134 int cpu;
135 struct callchain_cpus_entries *entries;
136
137 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
138 if (*rctx == -1)
139 return NULL;
140
141 entries = rcu_dereference(callchain_cpus_entries);
142 if (!entries)
143 return NULL;
144
145 cpu = smp_processor_id();
146
147 return &entries->cpu_entries[cpu][*rctx];
148}
149
150static void
151put_callchain_entry(int rctx)
152{
153 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
154}
155
156struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
157{
158 int rctx;
159 struct perf_callchain_entry *entry;
160
161
162 entry = get_callchain_entry(&rctx);
163 if (rctx == -1)
164 return NULL;
165
166 if (!entry)
167 goto exit_put;
168
169 entry->nr = 0;
170
171 if (!user_mode(regs)) {
172 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
173 perf_callchain_kernel(entry, regs);
174 if (current->mm)
175 regs = task_pt_regs(current);
176 else
177 regs = NULL;
178 }
179
180 if (regs) {
181 perf_callchain_store(entry, PERF_CONTEXT_USER);
182 perf_callchain_user(entry, regs);
183 }
184
185exit_put:
186 put_callchain_entry(rctx);
187
188 return entry;
189}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e8457da6f95..1b5c081d8b9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -128,7 +128,7 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 128 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 130 */
131struct jump_label_key perf_sched_events __read_mostly; 131struct jump_label_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
133 133
134static atomic_t nr_mmap_events __read_mostly; 134static atomic_t nr_mmap_events __read_mostly;
@@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
185static void update_context_time(struct perf_event_context *ctx); 185static void update_context_time(struct perf_event_context *ctx);
186static u64 perf_event_time(struct perf_event *event); 186static u64 perf_event_time(struct perf_event *event);
187 187
188static void ring_buffer_attach(struct perf_event *event,
189 struct ring_buffer *rb);
190
188void __weak perf_event_print_debug(void) { } 191void __weak perf_event_print_debug(void) { }
189 192
190extern __weak const char *perf_pmu_name(void) 193extern __weak const char *perf_pmu_name(void)
@@ -812,7 +815,7 @@ static void update_event_times(struct perf_event *event)
812 * here. 815 * here.
813 */ 816 */
814 if (is_cgroup_event(event)) 817 if (is_cgroup_event(event))
815 run_end = perf_event_time(event); 818 run_end = perf_cgroup_event_time(event);
816 else if (ctx->is_active) 819 else if (ctx->is_active)
817 run_end = ctx->time; 820 run_end = ctx->time;
818 else 821 else
@@ -1127,6 +1130,8 @@ event_sched_out(struct perf_event *event,
1127 if (!is_software_event(event)) 1130 if (!is_software_event(event))
1128 cpuctx->active_oncpu--; 1131 cpuctx->active_oncpu--;
1129 ctx->nr_active--; 1132 ctx->nr_active--;
1133 if (event->attr.freq && event->attr.sample_freq)
1134 ctx->nr_freq--;
1130 if (event->attr.exclusive || !cpuctx->active_oncpu) 1135 if (event->attr.exclusive || !cpuctx->active_oncpu)
1131 cpuctx->exclusive = 0; 1136 cpuctx->exclusive = 0;
1132} 1137}
@@ -1322,6 +1327,7 @@ retry:
1322 } 1327 }
1323 raw_spin_unlock_irq(&ctx->lock); 1328 raw_spin_unlock_irq(&ctx->lock);
1324} 1329}
1330EXPORT_SYMBOL_GPL(perf_event_disable);
1325 1331
1326static void perf_set_shadow_time(struct perf_event *event, 1332static void perf_set_shadow_time(struct perf_event *event,
1327 struct perf_event_context *ctx, 1333 struct perf_event_context *ctx,
@@ -1403,6 +1409,8 @@ event_sched_in(struct perf_event *event,
1403 if (!is_software_event(event)) 1409 if (!is_software_event(event))
1404 cpuctx->active_oncpu++; 1410 cpuctx->active_oncpu++;
1405 ctx->nr_active++; 1411 ctx->nr_active++;
1412 if (event->attr.freq && event->attr.sample_freq)
1413 ctx->nr_freq++;
1406 1414
1407 if (event->attr.exclusive) 1415 if (event->attr.exclusive)
1408 cpuctx->exclusive = 1; 1416 cpuctx->exclusive = 1;
@@ -1659,8 +1667,7 @@ retry:
1659 * Note: this works for group members as well as group leaders 1667 * Note: this works for group members as well as group leaders
1660 * since the non-leader members' sibling_lists will be empty. 1668 * since the non-leader members' sibling_lists will be empty.
1661 */ 1669 */
1662static void __perf_event_mark_enabled(struct perf_event *event, 1670static void __perf_event_mark_enabled(struct perf_event *event)
1663 struct perf_event_context *ctx)
1664{ 1671{
1665 struct perf_event *sub; 1672 struct perf_event *sub;
1666 u64 tstamp = perf_event_time(event); 1673 u64 tstamp = perf_event_time(event);
@@ -1698,7 +1705,7 @@ static int __perf_event_enable(void *info)
1698 */ 1705 */
1699 perf_cgroup_set_timestamp(current, ctx); 1706 perf_cgroup_set_timestamp(current, ctx);
1700 1707
1701 __perf_event_mark_enabled(event, ctx); 1708 __perf_event_mark_enabled(event);
1702 1709
1703 if (!event_filter_match(event)) { 1710 if (!event_filter_match(event)) {
1704 if (is_cgroup_event(event)) 1711 if (is_cgroup_event(event))
@@ -1779,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
1779 1786
1780retry: 1787retry:
1781 if (!ctx->is_active) { 1788 if (!ctx->is_active) {
1782 __perf_event_mark_enabled(event, ctx); 1789 __perf_event_mark_enabled(event);
1783 goto out; 1790 goto out;
1784 } 1791 }
1785 1792
@@ -1806,6 +1813,7 @@ retry:
1806out: 1813out:
1807 raw_spin_unlock_irq(&ctx->lock); 1814 raw_spin_unlock_irq(&ctx->lock);
1808} 1815}
1816EXPORT_SYMBOL_GPL(perf_event_enable);
1809 1817
1810int perf_event_refresh(struct perf_event *event, int refresh) 1818int perf_event_refresh(struct perf_event *event, int refresh)
1811{ 1819{
@@ -2171,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2171 */ 2179 */
2172 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2180 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2173 2181
2174 perf_event_sched_in(cpuctx, ctx, task); 2182 if (ctx->nr_events)
2183 cpuctx->task_ctx = ctx;
2175 2184
2176 cpuctx->task_ctx = ctx; 2185 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2177 2186
2178 perf_pmu_enable(ctx->pmu); 2187 perf_pmu_enable(ctx->pmu);
2179 perf_ctx_unlock(cpuctx, ctx); 2188 perf_ctx_unlock(cpuctx, ctx);
@@ -2291,7 +2300,10 @@ do { \
2291 return div64_u64(dividend, divisor); 2300 return div64_u64(dividend, divisor);
2292} 2301}
2293 2302
2294static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 2303static DEFINE_PER_CPU(int, perf_throttled_count);
2304static DEFINE_PER_CPU(u64, perf_throttled_seq);
2305
2306static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2295{ 2307{
2296 struct hw_perf_event *hwc = &event->hw; 2308 struct hw_perf_event *hwc = &event->hw;
2297 s64 period, sample_period; 2309 s64 period, sample_period;
@@ -2310,19 +2322,40 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
2310 hwc->sample_period = sample_period; 2322 hwc->sample_period = sample_period;
2311 2323
2312 if (local64_read(&hwc->period_left) > 8*sample_period) { 2324 if (local64_read(&hwc->period_left) > 8*sample_period) {
2313 event->pmu->stop(event, PERF_EF_UPDATE); 2325 if (disable)
2326 event->pmu->stop(event, PERF_EF_UPDATE);
2327
2314 local64_set(&hwc->period_left, 0); 2328 local64_set(&hwc->period_left, 0);
2315 event->pmu->start(event, PERF_EF_RELOAD); 2329
2330 if (disable)
2331 event->pmu->start(event, PERF_EF_RELOAD);
2316 } 2332 }
2317} 2333}
2318 2334
2319static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) 2335/*
2336 * combine freq adjustment with unthrottling to avoid two passes over the
2337 * events. At the same time, make sure, having freq events does not change
2338 * the rate of unthrottling as that would introduce bias.
2339 */
2340static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2341 int needs_unthr)
2320{ 2342{
2321 struct perf_event *event; 2343 struct perf_event *event;
2322 struct hw_perf_event *hwc; 2344 struct hw_perf_event *hwc;
2323 u64 interrupts, now; 2345 u64 now, period = TICK_NSEC;
2324 s64 delta; 2346 s64 delta;
2325 2347
2348 /*
2349 * only need to iterate over all events iff:
2350 * - context have events in frequency mode (needs freq adjust)
2351 * - there are events to unthrottle on this cpu
2352 */
2353 if (!(ctx->nr_freq || needs_unthr))
2354 return;
2355
2356 raw_spin_lock(&ctx->lock);
2357 perf_pmu_disable(ctx->pmu);
2358
2326 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2359 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2327 if (event->state != PERF_EVENT_STATE_ACTIVE) 2360 if (event->state != PERF_EVENT_STATE_ACTIVE)
2328 continue; 2361 continue;
@@ -2332,13 +2365,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2332 2365
2333 hwc = &event->hw; 2366 hwc = &event->hw;
2334 2367
2335 interrupts = hwc->interrupts; 2368 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
2336 hwc->interrupts = 0; 2369 hwc->interrupts = 0;
2337
2338 /*
2339 * unthrottle events on the tick
2340 */
2341 if (interrupts == MAX_INTERRUPTS) {
2342 perf_log_throttle(event, 1); 2370 perf_log_throttle(event, 1);
2343 event->pmu->start(event, 0); 2371 event->pmu->start(event, 0);
2344 } 2372 }
@@ -2346,14 +2374,30 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2346 if (!event->attr.freq || !event->attr.sample_freq) 2374 if (!event->attr.freq || !event->attr.sample_freq)
2347 continue; 2375 continue;
2348 2376
2349 event->pmu->read(event); 2377 /*
2378 * stop the event and update event->count
2379 */
2380 event->pmu->stop(event, PERF_EF_UPDATE);
2381
2350 now = local64_read(&event->count); 2382 now = local64_read(&event->count);
2351 delta = now - hwc->freq_count_stamp; 2383 delta = now - hwc->freq_count_stamp;
2352 hwc->freq_count_stamp = now; 2384 hwc->freq_count_stamp = now;
2353 2385
2386 /*
2387 * restart the event
2388 * reload only if value has changed
2389 * we have stopped the event so tell that
2390 * to perf_adjust_period() to avoid stopping it
2391 * twice.
2392 */
2354 if (delta > 0) 2393 if (delta > 0)
2355 perf_adjust_period(event, period, delta); 2394 perf_adjust_period(event, period, delta, false);
2395
2396 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2356 } 2397 }
2398
2399 perf_pmu_enable(ctx->pmu);
2400 raw_spin_unlock(&ctx->lock);
2357} 2401}
2358 2402
2359/* 2403/*
@@ -2376,7 +2420,6 @@ static void rotate_ctx(struct perf_event_context *ctx)
2376 */ 2420 */
2377static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2421static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2378{ 2422{
2379 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2380 struct perf_event_context *ctx = NULL; 2423 struct perf_event_context *ctx = NULL;
2381 int rotate = 0, remove = 1; 2424 int rotate = 0, remove = 1;
2382 2425
@@ -2393,15 +2436,12 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2393 rotate = 1; 2436 rotate = 1;
2394 } 2437 }
2395 2438
2396 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2397 perf_pmu_disable(cpuctx->ctx.pmu);
2398 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2399 if (ctx)
2400 perf_ctx_adjust_freq(ctx, interval);
2401
2402 if (!rotate) 2439 if (!rotate)
2403 goto done; 2440 goto done;
2404 2441
2442 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2443 perf_pmu_disable(cpuctx->ctx.pmu);
2444
2405 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2445 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2406 if (ctx) 2446 if (ctx)
2407 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 2447 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
@@ -2412,22 +2452,33 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2412 2452
2413 perf_event_sched_in(cpuctx, ctx, current); 2453 perf_event_sched_in(cpuctx, ctx, current);
2414 2454
2455 perf_pmu_enable(cpuctx->ctx.pmu);
2456 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2415done: 2457done:
2416 if (remove) 2458 if (remove)
2417 list_del_init(&cpuctx->rotation_list); 2459 list_del_init(&cpuctx->rotation_list);
2418
2419 perf_pmu_enable(cpuctx->ctx.pmu);
2420 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2421} 2460}
2422 2461
2423void perf_event_task_tick(void) 2462void perf_event_task_tick(void)
2424{ 2463{
2425 struct list_head *head = &__get_cpu_var(rotation_list); 2464 struct list_head *head = &__get_cpu_var(rotation_list);
2426 struct perf_cpu_context *cpuctx, *tmp; 2465 struct perf_cpu_context *cpuctx, *tmp;
2466 struct perf_event_context *ctx;
2467 int throttled;
2427 2468
2428 WARN_ON(!irqs_disabled()); 2469 WARN_ON(!irqs_disabled());
2429 2470
2471 __this_cpu_inc(perf_throttled_seq);
2472 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2473
2430 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 2474 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2475 ctx = &cpuctx->ctx;
2476 perf_adjust_freq_unthr_context(ctx, throttled);
2477
2478 ctx = cpuctx->task_ctx;
2479 if (ctx)
2480 perf_adjust_freq_unthr_context(ctx, throttled);
2481
2431 if (cpuctx->jiffies_interval == 1 || 2482 if (cpuctx->jiffies_interval == 1 ||
2432 !(jiffies % cpuctx->jiffies_interval)) 2483 !(jiffies % cpuctx->jiffies_interval))
2433 perf_rotate_context(cpuctx); 2484 perf_rotate_context(cpuctx);
@@ -2444,7 +2495,7 @@ static int event_enable_on_exec(struct perf_event *event,
2444 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2495 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2445 return 0; 2496 return 0;
2446 2497
2447 __perf_event_mark_enabled(event, ctx); 2498 __perf_event_mark_enabled(event);
2448 2499
2449 return 1; 2500 return 1;
2450} 2501}
@@ -2476,13 +2527,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2476 raw_spin_lock(&ctx->lock); 2527 raw_spin_lock(&ctx->lock);
2477 task_ctx_sched_out(ctx); 2528 task_ctx_sched_out(ctx);
2478 2529
2479 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2530 list_for_each_entry(event, &ctx->event_list, event_entry) {
2480 ret = event_enable_on_exec(event, ctx);
2481 if (ret)
2482 enabled = 1;
2483 }
2484
2485 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2486 ret = event_enable_on_exec(event, ctx); 2531 ret = event_enable_on_exec(event, ctx);
2487 if (ret) 2532 if (ret)
2488 enabled = 1; 2533 enabled = 1;
@@ -2570,215 +2615,6 @@ static u64 perf_event_read(struct perf_event *event)
2570} 2615}
2571 2616
2572/* 2617/*
2573 * Callchain support
2574 */
2575
2576struct callchain_cpus_entries {
2577 struct rcu_head rcu_head;
2578 struct perf_callchain_entry *cpu_entries[0];
2579};
2580
2581static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2582static atomic_t nr_callchain_events;
2583static DEFINE_MUTEX(callchain_mutex);
2584struct callchain_cpus_entries *callchain_cpus_entries;
2585
2586
2587__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2588 struct pt_regs *regs)
2589{
2590}
2591
2592__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2593 struct pt_regs *regs)
2594{
2595}
2596
2597static void release_callchain_buffers_rcu(struct rcu_head *head)
2598{
2599 struct callchain_cpus_entries *entries;
2600 int cpu;
2601
2602 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2603
2604 for_each_possible_cpu(cpu)
2605 kfree(entries->cpu_entries[cpu]);
2606
2607 kfree(entries);
2608}
2609
2610static void release_callchain_buffers(void)
2611{
2612 struct callchain_cpus_entries *entries;
2613
2614 entries = callchain_cpus_entries;
2615 rcu_assign_pointer(callchain_cpus_entries, NULL);
2616 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2617}
2618
2619static int alloc_callchain_buffers(void)
2620{
2621 int cpu;
2622 int size;
2623 struct callchain_cpus_entries *entries;
2624
2625 /*
2626 * We can't use the percpu allocation API for data that can be
2627 * accessed from NMI. Use a temporary manual per cpu allocation
2628 * until that gets sorted out.
2629 */
2630 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2631
2632 entries = kzalloc(size, GFP_KERNEL);
2633 if (!entries)
2634 return -ENOMEM;
2635
2636 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2637
2638 for_each_possible_cpu(cpu) {
2639 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2640 cpu_to_node(cpu));
2641 if (!entries->cpu_entries[cpu])
2642 goto fail;
2643 }
2644
2645 rcu_assign_pointer(callchain_cpus_entries, entries);
2646
2647 return 0;
2648
2649fail:
2650 for_each_possible_cpu(cpu)
2651 kfree(entries->cpu_entries[cpu]);
2652 kfree(entries);
2653
2654 return -ENOMEM;
2655}
2656
2657static int get_callchain_buffers(void)
2658{
2659 int err = 0;
2660 int count;
2661
2662 mutex_lock(&callchain_mutex);
2663
2664 count = atomic_inc_return(&nr_callchain_events);
2665 if (WARN_ON_ONCE(count < 1)) {
2666 err = -EINVAL;
2667 goto exit;
2668 }
2669
2670 if (count > 1) {
2671 /* If the allocation failed, give up */
2672 if (!callchain_cpus_entries)
2673 err = -ENOMEM;
2674 goto exit;
2675 }
2676
2677 err = alloc_callchain_buffers();
2678 if (err)
2679 release_callchain_buffers();
2680exit:
2681 mutex_unlock(&callchain_mutex);
2682
2683 return err;
2684}
2685
2686static void put_callchain_buffers(void)
2687{
2688 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2689 release_callchain_buffers();
2690 mutex_unlock(&callchain_mutex);
2691 }
2692}
2693
2694static int get_recursion_context(int *recursion)
2695{
2696 int rctx;
2697
2698 if (in_nmi())
2699 rctx = 3;
2700 else if (in_irq())
2701 rctx = 2;
2702 else if (in_softirq())
2703 rctx = 1;
2704 else
2705 rctx = 0;
2706
2707 if (recursion[rctx])
2708 return -1;
2709
2710 recursion[rctx]++;
2711 barrier();
2712
2713 return rctx;
2714}
2715
2716static inline void put_recursion_context(int *recursion, int rctx)
2717{
2718 barrier();
2719 recursion[rctx]--;
2720}
2721
2722static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2723{
2724 int cpu;
2725 struct callchain_cpus_entries *entries;
2726
2727 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2728 if (*rctx == -1)
2729 return NULL;
2730
2731 entries = rcu_dereference(callchain_cpus_entries);
2732 if (!entries)
2733 return NULL;
2734
2735 cpu = smp_processor_id();
2736
2737 return &entries->cpu_entries[cpu][*rctx];
2738}
2739
2740static void
2741put_callchain_entry(int rctx)
2742{
2743 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2744}
2745
2746static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2747{
2748 int rctx;
2749 struct perf_callchain_entry *entry;
2750
2751
2752 entry = get_callchain_entry(&rctx);
2753 if (rctx == -1)
2754 return NULL;
2755
2756 if (!entry)
2757 goto exit_put;
2758
2759 entry->nr = 0;
2760
2761 if (!user_mode(regs)) {
2762 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2763 perf_callchain_kernel(entry, regs);
2764 if (current->mm)
2765 regs = task_pt_regs(current);
2766 else
2767 regs = NULL;
2768 }
2769
2770 if (regs) {
2771 perf_callchain_store(entry, PERF_CONTEXT_USER);
2772 perf_callchain_user(entry, regs);
2773 }
2774
2775exit_put:
2776 put_callchain_entry(rctx);
2777
2778 return entry;
2779}
2780
2781/*
2782 * Initialize the perf_event context in a task_struct: 2618 * Initialize the perf_event context in a task_struct:
2783 */ 2619 */
2784static void __perf_event_init_context(struct perf_event_context *ctx) 2620static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2942,7 +2778,7 @@ static void free_event(struct perf_event *event)
2942 2778
2943 if (!event->parent) { 2779 if (!event->parent) {
2944 if (event->attach_state & PERF_ATTACH_TASK) 2780 if (event->attach_state & PERF_ATTACH_TASK)
2945 jump_label_dec(&perf_sched_events); 2781 jump_label_dec_deferred(&perf_sched_events);
2946 if (event->attr.mmap || event->attr.mmap_data) 2782 if (event->attr.mmap || event->attr.mmap_data)
2947 atomic_dec(&nr_mmap_events); 2783 atomic_dec(&nr_mmap_events);
2948 if (event->attr.comm) 2784 if (event->attr.comm)
@@ -2953,7 +2789,7 @@ static void free_event(struct perf_event *event)
2953 put_callchain_buffers(); 2789 put_callchain_buffers();
2954 if (is_cgroup_event(event)) { 2790 if (is_cgroup_event(event)) {
2955 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2791 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2956 jump_label_dec(&perf_sched_events); 2792 jump_label_dec_deferred(&perf_sched_events);
2957 } 2793 }
2958 } 2794 }
2959 2795
@@ -3190,12 +3026,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3190 struct ring_buffer *rb; 3026 struct ring_buffer *rb;
3191 unsigned int events = POLL_HUP; 3027 unsigned int events = POLL_HUP;
3192 3028
3029 /*
3030 * Race between perf_event_set_output() and perf_poll(): perf_poll()
3031 * grabs the rb reference but perf_event_set_output() overrides it.
3032 * Here is the timeline for two threads T1, T2:
3033 * t0: T1, rb = rcu_dereference(event->rb)
3034 * t1: T2, old_rb = event->rb
3035 * t2: T2, event->rb = new rb
3036 * t3: T2, ring_buffer_detach(old_rb)
3037 * t4: T1, ring_buffer_attach(rb1)
3038 * t5: T1, poll_wait(event->waitq)
3039 *
3040 * To avoid this problem, we grab mmap_mutex in perf_poll()
3041 * thereby ensuring that the assignment of the new ring buffer
3042 * and the detachment of the old buffer appear atomic to perf_poll()
3043 */
3044 mutex_lock(&event->mmap_mutex);
3045
3193 rcu_read_lock(); 3046 rcu_read_lock();
3194 rb = rcu_dereference(event->rb); 3047 rb = rcu_dereference(event->rb);
3195 if (rb) 3048 if (rb) {
3049 ring_buffer_attach(event, rb);
3196 events = atomic_xchg(&rb->poll, 0); 3050 events = atomic_xchg(&rb->poll, 0);
3051 }
3197 rcu_read_unlock(); 3052 rcu_read_unlock();
3198 3053
3054 mutex_unlock(&event->mmap_mutex);
3055
3199 poll_wait(file, &event->waitq, wait); 3056 poll_wait(file, &event->waitq, wait);
3200 3057
3201 return events; 3058 return events;
@@ -3496,6 +3353,53 @@ unlock:
3496 return ret; 3353 return ret;
3497} 3354}
3498 3355
3356static void ring_buffer_attach(struct perf_event *event,
3357 struct ring_buffer *rb)
3358{
3359 unsigned long flags;
3360
3361 if (!list_empty(&event->rb_entry))
3362 return;
3363
3364 spin_lock_irqsave(&rb->event_lock, flags);
3365 if (!list_empty(&event->rb_entry))
3366 goto unlock;
3367
3368 list_add(&event->rb_entry, &rb->event_list);
3369unlock:
3370 spin_unlock_irqrestore(&rb->event_lock, flags);
3371}
3372
3373static void ring_buffer_detach(struct perf_event *event,
3374 struct ring_buffer *rb)
3375{
3376 unsigned long flags;
3377
3378 if (list_empty(&event->rb_entry))
3379 return;
3380
3381 spin_lock_irqsave(&rb->event_lock, flags);
3382 list_del_init(&event->rb_entry);
3383 wake_up_all(&event->waitq);
3384 spin_unlock_irqrestore(&rb->event_lock, flags);
3385}
3386
3387static void ring_buffer_wakeup(struct perf_event *event)
3388{
3389 struct ring_buffer *rb;
3390
3391 rcu_read_lock();
3392 rb = rcu_dereference(event->rb);
3393 if (!rb)
3394 goto unlock;
3395
3396 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3397 wake_up_all(&event->waitq);
3398
3399unlock:
3400 rcu_read_unlock();
3401}
3402
3499static void rb_free_rcu(struct rcu_head *rcu_head) 3403static void rb_free_rcu(struct rcu_head *rcu_head)
3500{ 3404{
3501 struct ring_buffer *rb; 3405 struct ring_buffer *rb;
@@ -3521,9 +3425,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3521 3425
3522static void ring_buffer_put(struct ring_buffer *rb) 3426static void ring_buffer_put(struct ring_buffer *rb)
3523{ 3427{
3428 struct perf_event *event, *n;
3429 unsigned long flags;
3430
3524 if (!atomic_dec_and_test(&rb->refcount)) 3431 if (!atomic_dec_and_test(&rb->refcount))
3525 return; 3432 return;
3526 3433
3434 spin_lock_irqsave(&rb->event_lock, flags);
3435 list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
3436 list_del_init(&event->rb_entry);
3437 wake_up_all(&event->waitq);
3438 }
3439 spin_unlock_irqrestore(&rb->event_lock, flags);
3440
3527 call_rcu(&rb->rcu_head, rb_free_rcu); 3441 call_rcu(&rb->rcu_head, rb_free_rcu);
3528} 3442}
3529 3443
@@ -3546,6 +3460,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3546 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3460 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3547 vma->vm_mm->pinned_vm -= event->mmap_locked; 3461 vma->vm_mm->pinned_vm -= event->mmap_locked;
3548 rcu_assign_pointer(event->rb, NULL); 3462 rcu_assign_pointer(event->rb, NULL);
3463 ring_buffer_detach(event, rb);
3549 mutex_unlock(&event->mmap_mutex); 3464 mutex_unlock(&event->mmap_mutex);
3550 3465
3551 ring_buffer_put(rb); 3466 ring_buffer_put(rb);
@@ -3700,7 +3615,7 @@ static const struct file_operations perf_fops = {
3700 3615
3701void perf_event_wakeup(struct perf_event *event) 3616void perf_event_wakeup(struct perf_event *event)
3702{ 3617{
3703 wake_up_all(&event->waitq); 3618 ring_buffer_wakeup(event);
3704 3619
3705 if (event->pending_kill) { 3620 if (event->pending_kill) {
3706 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 3621 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -4624,6 +4539,7 @@ static int __perf_event_overflow(struct perf_event *event,
4624{ 4539{
4625 int events = atomic_read(&event->event_limit); 4540 int events = atomic_read(&event->event_limit);
4626 struct hw_perf_event *hwc = &event->hw; 4541 struct hw_perf_event *hwc = &event->hw;
4542 u64 seq;
4627 int ret = 0; 4543 int ret = 0;
4628 4544
4629 /* 4545 /*
@@ -4633,14 +4549,20 @@ static int __perf_event_overflow(struct perf_event *event,
4633 if (unlikely(!is_sampling_event(event))) 4549 if (unlikely(!is_sampling_event(event)))
4634 return 0; 4550 return 0;
4635 4551
4636 if (unlikely(hwc->interrupts >= max_samples_per_tick)) { 4552 seq = __this_cpu_read(perf_throttled_seq);
4637 if (throttle) { 4553 if (seq != hwc->interrupts_seq) {
4554 hwc->interrupts_seq = seq;
4555 hwc->interrupts = 1;
4556 } else {
4557 hwc->interrupts++;
4558 if (unlikely(throttle
4559 && hwc->interrupts >= max_samples_per_tick)) {
4560 __this_cpu_inc(perf_throttled_count);
4638 hwc->interrupts = MAX_INTERRUPTS; 4561 hwc->interrupts = MAX_INTERRUPTS;
4639 perf_log_throttle(event, 0); 4562 perf_log_throttle(event, 0);
4640 ret = 1; 4563 ret = 1;
4641 } 4564 }
4642 } else 4565 }
4643 hwc->interrupts++;
4644 4566
4645 if (event->attr.freq) { 4567 if (event->attr.freq) {
4646 u64 now = perf_clock(); 4568 u64 now = perf_clock();
@@ -4649,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event,
4649 hwc->freq_time_stamp = now; 4571 hwc->freq_time_stamp = now;
4650 4572
4651 if (delta > 0 && delta < 2*TICK_NSEC) 4573 if (delta > 0 && delta < 2*TICK_NSEC)
4652 perf_adjust_period(event, delta, hwc->last_period); 4574 perf_adjust_period(event, delta, hwc->last_period, true);
4653 } 4575 }
4654 4576
4655 /* 4577 /*
@@ -4737,7 +4659,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4737 struct hw_perf_event *hwc = &event->hw; 4659 struct hw_perf_event *hwc = &event->hw;
4738 int throttle = 0; 4660 int throttle = 0;
4739 4661
4740 data->period = event->hw.last_period;
4741 if (!overflow) 4662 if (!overflow)
4742 overflow = perf_swevent_set_period(event); 4663 overflow = perf_swevent_set_period(event);
4743 4664
@@ -4771,6 +4692,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4771 if (!is_sampling_event(event)) 4692 if (!is_sampling_event(event))
4772 return; 4693 return;
4773 4694
4695 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4696 data->period = nr;
4697 return perf_swevent_overflow(event, 1, data, regs);
4698 } else
4699 data->period = event->hw.last_period;
4700
4774 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4701 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4775 return perf_swevent_overflow(event, 1, data, regs); 4702 return perf_swevent_overflow(event, 1, data, regs);
4776 4703
@@ -5283,7 +5210,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5283 regs = get_irq_regs(); 5210 regs = get_irq_regs();
5284 5211
5285 if (regs && !perf_exclude_event(event, regs)) { 5212 if (regs && !perf_exclude_event(event, regs)) {
5286 if (!(event->attr.exclude_idle && current->pid == 0)) 5213 if (!(event->attr.exclude_idle && is_idle_task(current)))
5287 if (perf_event_overflow(event, &data, regs)) 5214 if (perf_event_overflow(event, &data, regs))
5288 ret = HRTIMER_NORESTART; 5215 ret = HRTIMER_NORESTART;
5289 } 5216 }
@@ -5822,6 +5749,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5822 INIT_LIST_HEAD(&event->group_entry); 5749 INIT_LIST_HEAD(&event->group_entry);
5823 INIT_LIST_HEAD(&event->event_entry); 5750 INIT_LIST_HEAD(&event->event_entry);
5824 INIT_LIST_HEAD(&event->sibling_list); 5751 INIT_LIST_HEAD(&event->sibling_list);
5752 INIT_LIST_HEAD(&event->rb_entry);
5753
5825 init_waitqueue_head(&event->waitq); 5754 init_waitqueue_head(&event->waitq);
5826 init_irq_work(&event->pending, perf_pending_event); 5755 init_irq_work(&event->pending, perf_pending_event);
5827 5756
@@ -5896,7 +5825,7 @@ done:
5896 5825
5897 if (!event->parent) { 5826 if (!event->parent) {
5898 if (event->attach_state & PERF_ATTACH_TASK) 5827 if (event->attach_state & PERF_ATTACH_TASK)
5899 jump_label_inc(&perf_sched_events); 5828 jump_label_inc(&perf_sched_events.key);
5900 if (event->attr.mmap || event->attr.mmap_data) 5829 if (event->attr.mmap || event->attr.mmap_data)
5901 atomic_inc(&nr_mmap_events); 5830 atomic_inc(&nr_mmap_events);
5902 if (event->attr.comm) 5831 if (event->attr.comm)
@@ -6028,6 +5957,8 @@ set:
6028 5957
6029 old_rb = event->rb; 5958 old_rb = event->rb;
6030 rcu_assign_pointer(event->rb, rb); 5959 rcu_assign_pointer(event->rb, rb);
5960 if (old_rb)
5961 ring_buffer_detach(event, old_rb);
6031 ret = 0; 5962 ret = 0;
6032unlock: 5963unlock:
6033 mutex_unlock(&event->mmap_mutex); 5964 mutex_unlock(&event->mmap_mutex);
@@ -6132,7 +6063,7 @@ SYSCALL_DEFINE5(perf_event_open,
6132 * - that may need work on context switch 6063 * - that may need work on context switch
6133 */ 6064 */
6134 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6065 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6135 jump_label_inc(&perf_sched_events); 6066 jump_label_inc(&perf_sched_events.key);
6136 } 6067 }
6137 6068
6138 /* 6069 /*
@@ -6978,6 +6909,9 @@ void __init perf_event_init(void)
6978 6909
6979 ret = init_hw_breakpoint(); 6910 ret = init_hw_breakpoint();
6980 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6911 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6912
6913 /* do not patch jump label more than once per second */
6914 jump_label_rate_limit(&perf_sched_events, HZ);
6981} 6915}
6982 6916
6983static int __init perf_event_sysfs_init(void) 6917static int __init perf_event_sysfs_init(void)
@@ -7044,10 +6978,13 @@ static int __perf_cgroup_move(void *info)
7044 return 0; 6978 return 0;
7045} 6979}
7046 6980
7047static void 6981static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7048perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) 6982 struct cgroup_taskset *tset)
7049{ 6983{
7050 task_function_call(task, __perf_cgroup_move, task); 6984 struct task_struct *task;
6985
6986 cgroup_taskset_for_each(task, cgrp, tset)
6987 task_function_call(task, __perf_cgroup_move, task);
7051} 6988}
7052 6989
7053static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 6990static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -7061,7 +6998,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7061 if (!(task->flags & PF_EXITING)) 6998 if (!(task->flags & PF_EXITING))
7062 return; 6999 return;
7063 7000
7064 perf_cgroup_attach_task(cgrp, task); 7001 task_function_call(task, __perf_cgroup_move, task);
7065} 7002}
7066 7003
7067struct cgroup_subsys perf_subsys = { 7004struct cgroup_subsys perf_subsys = {
@@ -7070,6 +7007,6 @@ struct cgroup_subsys perf_subsys = {
7070 .create = perf_cgroup_create, 7007 .create = perf_cgroup_create,
7071 .destroy = perf_cgroup_destroy, 7008 .destroy = perf_cgroup_destroy,
7072 .exit = perf_cgroup_exit, 7009 .exit = perf_cgroup_exit,
7073 .attach_task = perf_cgroup_attach_task, 7010 .attach = perf_cgroup_attach,
7074}; 7011};
7075#endif /* CONFIG_CGROUP_PERF */ 7012#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38bf..ee706ce44aa0 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -651,10 +651,10 @@ int __init init_hw_breakpoint(void)
651 651
652 err_alloc: 652 err_alloc:
653 for_each_possible_cpu(err_cpu) { 653 for_each_possible_cpu(err_cpu) {
654 if (err_cpu == cpu)
655 break;
656 for (i = 0; i < TYPE_MAX; i++) 654 for (i = 0; i < TYPE_MAX; i++)
657 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 655 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
656 if (err_cpu == cpu)
657 break;
658 } 658 }
659 659
660 return -ENOMEM; 660 return -ENOMEM;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09097dd8116c..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5
6/* Buffer handling */
7
4#define RING_BUFFER_WRITABLE 0x01 8#define RING_BUFFER_WRITABLE 0x01
5 9
6struct ring_buffer { 10struct ring_buffer {
@@ -22,6 +26,9 @@ struct ring_buffer {
22 local_t lost; /* nr records lost */ 26 local_t lost; /* nr records lost */
23 27
24 long watermark; /* wakeup watermark */ 28 long watermark; /* wakeup watermark */
29 /* poll crap */
30 spinlock_t event_lock;
31 struct list_head event_list;
25 32
26 struct perf_event_mmap_page *user_page; 33 struct perf_event_mmap_page *user_page;
27 void *data_pages[0]; 34 void *data_pages[0];
@@ -64,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
64} 71}
65#endif 72#endif
66 73
67static unsigned long perf_data_size(struct ring_buffer *rb) 74static inline unsigned long perf_data_size(struct ring_buffer *rb)
68{ 75{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70} 77}
@@ -93,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
93 } while (len); 100 } while (len);
94} 101}
95 102
103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
105extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void);
107
108static inline int get_recursion_context(int *recursion)
109{
110 int rctx;
111
112 if (in_nmi())
113 rctx = 3;
114 else if (in_irq())
115 rctx = 2;
116 else if (in_softirq())
117 rctx = 1;
118 else
119 rctx = 0;
120
121 if (recursion[rctx])
122 return -1;
123
124 recursion[rctx]++;
125 barrier();
126
127 return rctx;
128}
129
130static inline void put_recursion_context(int *recursion, int rctx)
131{
132 barrier();
133 recursion[rctx]--;
134}
135
96#endif /* _KERNEL_EVENTS_INTERNAL_H */ 136#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a2a29205cc0f..6ddaba43fb7a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 * 8 *
9 * For licensing details see kernel-base/COPYING 9 * For licensing details see kernel-base/COPYING
10 */ 10 */
@@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
209 rb->writable = 1; 209 rb->writable = 1;
210 210
211 atomic_set(&rb->refcount, 1); 211 atomic_set(&rb->refcount, 1);
212
213 INIT_LIST_HEAD(&rb->event_list);
214 spin_lock_init(&rb->event_lock);
212} 215}
213 216
214#ifndef CONFIG_PERF_USE_VMALLOC 217#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/exit.c b/kernel/exit.c
index d0b7d988f873..4b4042f9bc6a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h>
54 55
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
121 * We won't ever get here for the group leader, since it 122 * We won't ever get here for the group leader, since it
122 * will have been the last reference on the signal_struct. 123 * will have been the last reference on the signal_struct.
123 */ 124 */
124 sig->utime = cputime_add(sig->utime, tsk->utime); 125 sig->utime += tsk->utime;
125 sig->stime = cputime_add(sig->stime, tsk->stime); 126 sig->stime += tsk->stime;
126 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 127 sig->gtime += tsk->gtime;
127 sig->min_flt += tsk->min_flt; 128 sig->min_flt += tsk->min_flt;
128 sig->maj_flt += tsk->maj_flt; 129 sig->maj_flt += tsk->maj_flt;
129 sig->nvcsw += tsk->nvcsw; 130 sig->nvcsw += tsk->nvcsw;
@@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk)
679 tsk->mm = NULL; 680 tsk->mm = NULL;
680 up_read(&mm->mmap_sem); 681 up_read(&mm->mmap_sem);
681 enter_lazy_tlb(mm, current); 682 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk);
684 task_unlock(tsk); 683 task_unlock(tsk);
685 mm_update_next_owner(mm); 684 mm_update_next_owner(mm);
686 mmput(mm); 685 mmput(mm);
@@ -888,7 +887,7 @@ static void check_stack_usage(void)
888static inline void check_stack_usage(void) {} 887static inline void check_stack_usage(void) {}
889#endif 888#endif
890 889
891NORET_TYPE void do_exit(long code) 890void do_exit(long code)
892{ 891{
893 struct task_struct *tsk = current; 892 struct task_struct *tsk = current;
894 int group_dead; 893 int group_dead;
@@ -965,8 +964,7 @@ NORET_TYPE void do_exit(long code)
965 acct_collect(code, group_dead); 964 acct_collect(code, group_dead);
966 if (group_dead) 965 if (group_dead)
967 tty_audit_exit(); 966 tty_audit_exit();
968 if (unlikely(tsk->audit_context)) 967 audit_free(tsk);
969 audit_free(tsk);
970 968
971 tsk->exit_code = code; 969 tsk->exit_code = code;
972 taskstats_exit(tsk, group_dead); 970 taskstats_exit(tsk, group_dead);
@@ -1037,9 +1035,28 @@ NORET_TYPE void do_exit(long code)
1037 validate_creds_for_do_exit(tsk); 1035 validate_creds_for_do_exit(tsk);
1038 1036
1039 preempt_disable(); 1037 preempt_disable();
1038 if (tsk->nr_dirtied)
1039 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1040 exit_rcu(); 1040 exit_rcu();
1041
1042 /*
1043 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
1044 * when the following two conditions become true.
1045 * - There is race condition of mmap_sem (It is acquired by
1046 * exit_mm()), and
1047 * - SMI occurs before setting TASK_RUNINNG.
1048 * (or hypervisor of virtual machine switches to other guest)
1049 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
1050 *
1051 * To avoid it, we have to wait for releasing tsk->pi_lock which
1052 * is held by try_to_wake_up()
1053 */
1054 smp_mb();
1055 raw_spin_unlock_wait(&tsk->pi_lock);
1056
1041 /* causes final put_task_struct in finish_task_switch(). */ 1057 /* causes final put_task_struct in finish_task_switch(). */
1042 tsk->state = TASK_DEAD; 1058 tsk->state = TASK_DEAD;
1059 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
1043 schedule(); 1060 schedule();
1044 BUG(); 1061 BUG();
1045 /* Avoid "noreturn function does return". */ 1062 /* Avoid "noreturn function does return". */
@@ -1049,7 +1066,7 @@ NORET_TYPE void do_exit(long code)
1049 1066
1050EXPORT_SYMBOL_GPL(do_exit); 1067EXPORT_SYMBOL_GPL(do_exit);
1051 1068
1052NORET_TYPE void complete_and_exit(struct completion *comp, long code) 1069void complete_and_exit(struct completion *comp, long code)
1053{ 1070{
1054 if (comp) 1071 if (comp)
1055 complete(comp); 1072 complete(comp);
@@ -1068,7 +1085,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
1068 * Take down every thread in the group. This is called by fatal signals 1085 * Take down every thread in the group. This is called by fatal signals
1069 * as well as by sys_exit_group (below). 1086 * as well as by sys_exit_group (below).
1070 */ 1087 */
1071NORET_TYPE void 1088void
1072do_group_exit(int exit_code) 1089do_group_exit(int exit_code)
1073{ 1090{
1074 struct signal_struct *sig = current->signal; 1091 struct signal_struct *sig = current->signal;
@@ -1255,19 +1272,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1255 spin_lock_irq(&p->real_parent->sighand->siglock); 1272 spin_lock_irq(&p->real_parent->sighand->siglock);
1256 psig = p->real_parent->signal; 1273 psig = p->real_parent->signal;
1257 sig = p->signal; 1274 sig = p->signal;
1258 psig->cutime = 1275 psig->cutime += tgutime + sig->cutime;
1259 cputime_add(psig->cutime, 1276 psig->cstime += tgstime + sig->cstime;
1260 cputime_add(tgutime, 1277 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1261 sig->cutime));
1262 psig->cstime =
1263 cputime_add(psig->cstime,
1264 cputime_add(tgstime,
1265 sig->cstime));
1266 psig->cgtime =
1267 cputime_add(psig->cgtime,
1268 cputime_add(p->gtime,
1269 cputime_add(sig->gtime,
1270 sig->cgtime)));
1271 psig->cmin_flt += 1278 psig->cmin_flt +=
1272 p->min_flt + sig->min_flt + sig->cmin_flt; 1279 p->min_flt + sig->min_flt + sig->cmin_flt;
1273 psig->cmaj_flt += 1280 psig->cmaj_flt +=
@@ -1540,8 +1547,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
1540 } 1547 }
1541 1548
1542 /* dead body doesn't have much to contribute */ 1549 /* dead body doesn't have much to contribute */
1543 if (p->exit_state == EXIT_DEAD) 1550 if (unlikely(p->exit_state == EXIT_DEAD)) {
1551 /*
1552 * But do not ignore this task until the tracer does
1553 * wait_task_zombie()->do_notify_parent().
1554 */
1555 if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
1556 wo->notask_error = 0;
1544 return 0; 1557 return 0;
1558 }
1545 1559
1546 /* slay zombie? */ 1560 /* slay zombie? */
1547 if (p->exit_state == EXIT_ZOMBIE) { 1561 if (p->exit_state == EXIT_ZOMBIE) {
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..e2cd3e2a5ae8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/user-return-notifier.h> 66#include <linux/user-return-notifier.h>
67#include <linux/oom.h> 67#include <linux/oom.h>
68#include <linux/khugepaged.h> 68#include <linux/khugepaged.h>
69#include <linux/signalfd.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -76,6 +77,9 @@
76 77
77#include <trace/events/sched.h> 78#include <trace/events/sched.h>
78 79
80#define CREATE_TRACE_POINTS
81#include <trace/events/task.h>
82
79/* 83/*
80 * Protected counters by write_lock_irq(&tasklist_lock) 84 * Protected counters by write_lock_irq(&tasklist_lock)
81 */ 85 */
@@ -644,6 +648,26 @@ struct mm_struct *get_task_mm(struct task_struct *task)
644} 648}
645EXPORT_SYMBOL_GPL(get_task_mm); 649EXPORT_SYMBOL_GPL(get_task_mm);
646 650
651struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
652{
653 struct mm_struct *mm;
654 int err;
655
656 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
657 if (err)
658 return ERR_PTR(err);
659
660 mm = get_task_mm(task);
661 if (mm && mm != current->mm &&
662 !ptrace_may_access(task, mode)) {
663 mmput(mm);
664 mm = ERR_PTR(-EACCES);
665 }
666 mutex_unlock(&task->signal->cred_guard_mutex);
667
668 return mm;
669}
670
647/* Please note the differences between mmput and mm_release. 671/* Please note the differences between mmput and mm_release.
648 * mmput is called whenever we stop holding onto a mm_struct, 672 * mmput is called whenever we stop holding onto a mm_struct,
649 * error success whatever. 673 * error success whatever.
@@ -870,6 +894,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
870{ 894{
871#ifdef CONFIG_BLOCK 895#ifdef CONFIG_BLOCK
872 struct io_context *ioc = current->io_context; 896 struct io_context *ioc = current->io_context;
897 struct io_context *new_ioc;
873 898
874 if (!ioc) 899 if (!ioc)
875 return 0; 900 return 0;
@@ -881,11 +906,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
881 if (unlikely(!tsk->io_context)) 906 if (unlikely(!tsk->io_context))
882 return -ENOMEM; 907 return -ENOMEM;
883 } else if (ioprio_valid(ioc->ioprio)) { 908 } else if (ioprio_valid(ioc->ioprio)) {
884 tsk->io_context = alloc_io_context(GFP_KERNEL, -1); 909 new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
885 if (unlikely(!tsk->io_context)) 910 if (unlikely(!new_ioc))
886 return -ENOMEM; 911 return -ENOMEM;
887 912
888 tsk->io_context->ioprio = ioc->ioprio; 913 new_ioc->ioprio = ioc->ioprio;
914 put_io_context(new_ioc);
889 } 915 }
890#endif 916#endif
891 return 0; 917 return 0;
@@ -910,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
910 936
911void __cleanup_sighand(struct sighand_struct *sighand) 937void __cleanup_sighand(struct sighand_struct *sighand)
912{ 938{
913 if (atomic_dec_and_test(&sighand->count)) 939 if (atomic_dec_and_test(&sighand->count)) {
940 signalfd_cleanup(sighand);
914 kmem_cache_free(sighand_cachep, sighand); 941 kmem_cache_free(sighand_cachep, sighand);
942 }
915} 943}
916 944
917 945
@@ -972,7 +1000,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
972 sched_autogroup_fork(sig); 1000 sched_autogroup_fork(sig);
973 1001
974#ifdef CONFIG_CGROUPS 1002#ifdef CONFIG_CGROUPS
975 init_rwsem(&sig->threadgroup_fork_lock); 1003 init_rwsem(&sig->group_rwsem);
976#endif 1004#endif
977 1005
978 sig->oom_adj = current->signal->oom_adj; 1006 sig->oom_adj = current->signal->oom_adj;
@@ -992,7 +1020,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
992 new_flags |= PF_FORKNOEXEC; 1020 new_flags |= PF_FORKNOEXEC;
993 new_flags |= PF_STARTING; 1021 new_flags |= PF_STARTING;
994 p->flags = new_flags; 1022 p->flags = new_flags;
995 clear_freeze_flag(p);
996} 1023}
997 1024
998SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) 1025SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1023,8 +1050,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1023 */ 1050 */
1024static void posix_cpu_timers_init(struct task_struct *tsk) 1051static void posix_cpu_timers_init(struct task_struct *tsk)
1025{ 1052{
1026 tsk->cputime_expires.prof_exp = cputime_zero; 1053 tsk->cputime_expires.prof_exp = 0;
1027 tsk->cputime_expires.virt_exp = cputime_zero; 1054 tsk->cputime_expires.virt_exp = 0;
1028 tsk->cputime_expires.sched_exp = 0; 1055 tsk->cputime_expires.sched_exp = 0;
1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1056 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1057 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1159,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1132 1159
1133 init_sigpending(&p->pending); 1160 init_sigpending(&p->pending);
1134 1161
1135 p->utime = cputime_zero; 1162 p->utime = p->stime = p->gtime = 0;
1136 p->stime = cputime_zero; 1163 p->utimescaled = p->stimescaled = 0;
1137 p->gtime = cputime_zero;
1138 p->utimescaled = cputime_zero;
1139 p->stimescaled = cputime_zero;
1140#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1164#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1141 p->prev_utime = cputime_zero; 1165 p->prev_utime = p->prev_stime = 0;
1142 p->prev_stime = cputime_zero;
1143#endif 1166#endif
1144#if defined(SPLIT_RSS_COUNTING) 1167#if defined(SPLIT_RSS_COUNTING)
1145 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1168 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1158,7 +1181,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1158 p->io_context = NULL; 1181 p->io_context = NULL;
1159 p->audit_context = NULL; 1182 p->audit_context = NULL;
1160 if (clone_flags & CLONE_THREAD) 1183 if (clone_flags & CLONE_THREAD)
1161 threadgroup_fork_read_lock(current); 1184 threadgroup_change_begin(current);
1162 cgroup_fork(p); 1185 cgroup_fork(p);
1163#ifdef CONFIG_NUMA 1186#ifdef CONFIG_NUMA
1164 p->mempolicy = mpol_dup(p->mempolicy); 1187 p->mempolicy = mpol_dup(p->mempolicy);
@@ -1296,6 +1319,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1296 1319
1297 p->nr_dirtied = 0; 1320 p->nr_dirtied = 0;
1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); 1321 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1322 p->dirty_paused_when = 0;
1299 1323
1300 /* 1324 /*
1301 * Ok, make it visible to the rest of the system. 1325 * Ok, make it visible to the rest of the system.
@@ -1373,8 +1397,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1373 proc_fork_connector(p); 1397 proc_fork_connector(p);
1374 cgroup_post_fork(p); 1398 cgroup_post_fork(p);
1375 if (clone_flags & CLONE_THREAD) 1399 if (clone_flags & CLONE_THREAD)
1376 threadgroup_fork_read_unlock(current); 1400 threadgroup_change_end(current);
1377 perf_event_fork(p); 1401 perf_event_fork(p);
1402
1403 trace_task_newtask(p, clone_flags);
1404
1378 return p; 1405 return p;
1379 1406
1380bad_fork_free_pid: 1407bad_fork_free_pid:
@@ -1408,7 +1435,7 @@ bad_fork_cleanup_policy:
1408bad_fork_cleanup_cgroup: 1435bad_fork_cleanup_cgroup:
1409#endif 1436#endif
1410 if (clone_flags & CLONE_THREAD) 1437 if (clone_flags & CLONE_THREAD)
1411 threadgroup_fork_read_unlock(current); 1438 threadgroup_change_end(current);
1412 cgroup_exit(p, cgroup_callbacks_done); 1439 cgroup_exit(p, cgroup_callbacks_done);
1413 delayacct_tsk_free(p); 1440 delayacct_tsk_free(p);
1414 module_put(task_thread_info(p)->exec_domain->module); 1441 module_put(task_thread_info(p)->exec_domain->module);
@@ -1523,8 +1550,6 @@ long do_fork(unsigned long clone_flags,
1523 init_completion(&vfork); 1550 init_completion(&vfork);
1524 } 1551 }
1525 1552
1526 audit_finish_fork(p);
1527
1528 /* 1553 /*
1529 * We set PF_STARTING at creation in case tracing wants to 1554 * We set PF_STARTING at creation in case tracing wants to
1530 * use this to distinguish a fully live task from one that 1555 * use this to distinguish a fully live task from one that
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7be56c534397..9815b8d1eed5 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -9,101 +9,114 @@
9#include <linux/export.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12#include <linux/kthread.h>
12 13
13/* 14/* total number of freezing conditions in effect */
14 * freezing is complete, mark current process as frozen 15atomic_t system_freezing_cnt = ATOMIC_INIT(0);
16EXPORT_SYMBOL(system_freezing_cnt);
17
18/* indicate whether PM freezing is in effect, protected by pm_mutex */
19bool pm_freezing;
20bool pm_nosig_freezing;
21
22/* protects freezing and frozen transitions */
23static DEFINE_SPINLOCK(freezer_lock);
24
25/**
26 * freezing_slow_path - slow path for testing whether a task needs to be frozen
27 * @p: task to be tested
28 *
29 * This function is called by freezing() if system_freezing_cnt isn't zero
30 * and tests whether @p needs to enter and stay in frozen state. Can be
31 * called under any context. The freezers are responsible for ensuring the
32 * target tasks see the updated state.
15 */ 33 */
16static inline void frozen_process(void) 34bool freezing_slow_path(struct task_struct *p)
17{ 35{
18 if (!unlikely(current->flags & PF_NOFREEZE)) { 36 if (p->flags & PF_NOFREEZE)
19 current->flags |= PF_FROZEN; 37 return false;
20 smp_wmb(); 38
21 } 39 if (pm_nosig_freezing || cgroup_freezing(p))
22 clear_freeze_flag(current); 40 return true;
41
42 if (pm_freezing && !(p->flags & PF_KTHREAD))
43 return true;
44
45 return false;
23} 46}
47EXPORT_SYMBOL(freezing_slow_path);
24 48
25/* Refrigerator is place where frozen processes are stored :-). */ 49/* Refrigerator is place where frozen processes are stored :-). */
26void refrigerator(void) 50bool __refrigerator(bool check_kthr_stop)
27{ 51{
28 /* Hmm, should we be allowed to suspend when there are realtime 52 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */ 53 processes around? */
30 long save; 54 bool was_frozen = false;
55 long save = current->state;
31 56
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm); 57 pr_debug("%s entered refrigerator\n", current->comm);
42 58
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 /* prevent accounting of that task to load */
48 current->flags |= PF_FREEZING;
49
50 for (;;) { 59 for (;;) {
51 set_current_state(TASK_UNINTERRUPTIBLE); 60 set_current_state(TASK_UNINTERRUPTIBLE);
52 if (!frozen(current)) 61
62 spin_lock_irq(&freezer_lock);
63 current->flags |= PF_FROZEN;
64 if (!freezing(current) ||
65 (check_kthr_stop && kthread_should_stop()))
66 current->flags &= ~PF_FROZEN;
67 spin_unlock_irq(&freezer_lock);
68
69 if (!(current->flags & PF_FROZEN))
53 break; 70 break;
71 was_frozen = true;
54 schedule(); 72 schedule();
55 } 73 }
56 74
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
60 pr_debug("%s left refrigerator\n", current->comm); 75 pr_debug("%s left refrigerator\n", current->comm);
61 __set_current_state(save); 76
77 /*
78 * Restore saved task state before returning. The mb'd version
79 * needs to be used; otherwise, it might silently break
80 * synchronization which depends on ordered task state change.
81 */
82 set_current_state(save);
83
84 return was_frozen;
62} 85}
63EXPORT_SYMBOL(refrigerator); 86EXPORT_SYMBOL(__refrigerator);
64 87
65static void fake_signal_wake_up(struct task_struct *p) 88static void fake_signal_wake_up(struct task_struct *p)
66{ 89{
67 unsigned long flags; 90 unsigned long flags;
68 91
69 spin_lock_irqsave(&p->sighand->siglock, flags); 92 if (lock_task_sighand(p, &flags)) {
70 signal_wake_up(p, 0); 93 signal_wake_up(p, 0);
71 spin_unlock_irqrestore(&p->sighand->siglock, flags); 94 unlock_task_sighand(p, &flags);
95 }
72} 96}
73 97
74/** 98/**
75 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
76 * @p: task to send the request to 100 * @p: task to send the request to
77 * @sig_only: if set, the request will only be sent if the task has the 101 *
78 * PF_FREEZER_NOSIG flag unset 102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
79 * Return value: 'false', if @sig_only is set and the task has 103 * flag and either sending a fake signal to it or waking it up, depending
80 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise 104 * on whether it has %PF_FREEZER_NOSIG set.
81 * 105 *
82 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and 106 * RETURNS:
83 * either sending a fake signal to it or waking it up, depending on whether 107 * %false, if @p is not freezing or already frozen; %true, otherwise
84 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
85 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
86 * TIF_FREEZE flag will not be set.
87 */ 108 */
88bool freeze_task(struct task_struct *p, bool sig_only) 109bool freeze_task(struct task_struct *p)
89{ 110{
90 /* 111 unsigned long flags;
91 * We first check if the task is freezing and next if it has already 112
92 * been frozen to avoid the race with frozen_process() which first marks 113 spin_lock_irqsave(&freezer_lock, flags);
93 * the task as frozen and next clears its TIF_FREEZE. 114 if (!freezing(p) || frozen(p)) {
94 */ 115 spin_unlock_irqrestore(&freezer_lock, flags);
95 if (!freezing(p)) { 116 return false;
96 smp_rmb();
97 if (frozen(p))
98 return false;
99
100 if (!sig_only || should_send_signal(p))
101 set_freeze_flag(p);
102 else
103 return false;
104 } 117 }
105 118
106 if (should_send_signal(p)) { 119 if (!(p->flags & PF_KTHREAD)) {
107 fake_signal_wake_up(p); 120 fake_signal_wake_up(p);
108 /* 121 /*
109 * fake_signal_wake_up() goes through p's scheduler 122 * fake_signal_wake_up() goes through p's scheduler
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)
111 * TASK_RUNNING transition can't race with task state 124 * TASK_RUNNING transition can't race with task state
112 * testing in try_to_freeze_tasks(). 125 * testing in try_to_freeze_tasks().
113 */ 126 */
114 } else if (sig_only) {
115 return false;
116 } else { 127 } else {
117 wake_up_state(p, TASK_INTERRUPTIBLE); 128 wake_up_state(p, TASK_INTERRUPTIBLE);
118 } 129 }
119 130
131 spin_unlock_irqrestore(&freezer_lock, flags);
120 return true; 132 return true;
121} 133}
122 134
123void cancel_freezing(struct task_struct *p) 135void __thaw_task(struct task_struct *p)
124{ 136{
125 unsigned long flags; 137 unsigned long flags;
126 138
127 if (freezing(p)) { 139 /*
128 pr_debug(" clean up: %s\n", p->comm); 140 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
129 clear_freeze_flag(p); 141 * be visible to @p as waking up implies wmb. Waking up inside
130 spin_lock_irqsave(&p->sighand->siglock, flags); 142 * freezer_lock also prevents wakeups from leaking outside
131 recalc_sigpending_and_wake(p); 143 * refrigerator.
132 spin_unlock_irqrestore(&p->sighand->siglock, flags); 144 */
133 } 145 spin_lock_irqsave(&freezer_lock, flags);
134} 146 if (frozen(p))
135 147 wake_up_process(p);
136static int __thaw_process(struct task_struct *p) 148 spin_unlock_irqrestore(&freezer_lock, flags);
137{
138 if (frozen(p)) {
139 p->flags &= ~PF_FROZEN;
140 return 1;
141 }
142 clear_freeze_flag(p);
143 return 0;
144} 149}
145 150
146/* 151/**
147 * Wake up a frozen process 152 * set_freezable - make %current freezable
148 * 153 *
149 * task_lock() is needed to prevent the race with refrigerator() which may 154 * Mark %current freezable and enter refrigerator if necessary.
150 * occur if the freezing of tasks fails. Namely, without the lock, if the
151 * freezing of tasks failed, thaw_tasks() might have run before a task in
152 * refrigerator() could call frozen_process(), in which case the task would be
153 * frozen and no one would thaw it.
154 */ 155 */
155int thaw_process(struct task_struct *p) 156bool set_freezable(void)
156{ 157{
157 task_lock(p); 158 might_sleep();
158 if (__thaw_process(p) == 1) { 159
159 task_unlock(p); 160 /*
160 wake_up_process(p); 161 * Modify flags while holding freezer_lock. This ensures the
161 return 1; 162 * freezer notices that we aren't frozen yet or the freezing
162 } 163 * condition is visible to try_to_freeze() below.
163 task_unlock(p); 164 */
164 return 0; 165 spin_lock_irq(&freezer_lock);
166 current->flags &= ~PF_NOFREEZE;
167 spin_unlock_irq(&freezer_lock);
168
169 return try_to_freeze();
165} 170}
166EXPORT_SYMBOL(thaw_process); 171EXPORT_SYMBOL(set_freezable);
diff --git a/kernel/futex.c b/kernel/futex.c
index ea87f4d2f455..1614be20173d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -314,17 +314,29 @@ again:
314#endif 314#endif
315 315
316 lock_page(page_head); 316 lock_page(page_head);
317
318 /*
319 * If page_head->mapping is NULL, then it cannot be a PageAnon
320 * page; but it might be the ZERO_PAGE or in the gate area or
321 * in a special mapping (all cases which we are happy to fail);
322 * or it may have been a good file page when get_user_pages_fast
323 * found it, but truncated or holepunched or subjected to
324 * invalidate_complete_page2 before we got the page lock (also
325 * cases which we are happy to fail). And we hold a reference,
326 * so refcount care in invalidate_complete_page's remove_mapping
327 * prevents drop_caches from setting mapping to NULL beneath us.
328 *
329 * The case we do have to guard against is when memory pressure made
330 * shmem_writepage move it from filecache to swapcache beneath us:
331 * an unlikely race, but we do need to retry for page_head->mapping.
332 */
317 if (!page_head->mapping) { 333 if (!page_head->mapping) {
334 int shmem_swizzled = PageSwapCache(page_head);
318 unlock_page(page_head); 335 unlock_page(page_head);
319 put_page(page_head); 336 put_page(page_head);
320 /* 337 if (shmem_swizzled)
321 * ZERO_PAGE pages don't have a mapping. Avoid a busy loop 338 goto again;
322 * trying to find one. RW mapping would have COW'd (and thus 339 return -EFAULT;
323 * have a mapping) so this page is RO and won't ever change.
324 */
325 if ((page_head == ZERO_PAGE(address)))
326 return -EFAULT;
327 goto again;
328 } 340 }
329 341
330 /* 342 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 422e567eecf6..ae34bf51682b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
885 struct hrtimer_clock_base *base, 885 struct hrtimer_clock_base *base,
886 unsigned long newstate, int reprogram) 886 unsigned long newstate, int reprogram)
887{ 887{
888 struct timerqueue_node *next_timer;
888 if (!(timer->state & HRTIMER_STATE_ENQUEUED)) 889 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
889 goto out; 890 goto out;
890 891
891 if (&timer->node == timerqueue_getnext(&base->active)) { 892 next_timer = timerqueue_getnext(&base->active);
893 timerqueue_del(&base->active, &timer->node);
894 if (&timer->node == next_timer) {
892#ifdef CONFIG_HIGH_RES_TIMERS 895#ifdef CONFIG_HIGH_RES_TIMERS
893 /* Reprogram the clock event device. if enabled */ 896 /* Reprogram the clock event device. if enabled */
894 if (reprogram && hrtimer_hres_active()) { 897 if (reprogram && hrtimer_hres_active()) {
@@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
901 } 904 }
902#endif 905#endif
903 } 906 }
904 timerqueue_del(&base->active, &timer->node);
905 if (!timerqueue_getnext(&base->active)) 907 if (!timerqueue_getnext(&base->active))
906 base->cpu_base->active_bases &= ~(1 << base->index); 908 base->cpu_base->active_bases &= ~(1 << base->index);
907out: 909out:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 8b1748d0172c..2e48ec0c2e91 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
74 74
75 /* 75 /*
76 * Ensure the task is not frozen. 76 * Ensure the task is not frozen.
77 * Also, when a freshly created task is scheduled once, changes 77 * Also, skip vfork and any other user process that freezer should skip.
78 * its state to TASK_UNINTERRUPTIBLE without having ever been
79 * switched out once, it musn't be checked.
80 */ 78 */
81 if (unlikely(t->flags & PF_FROZEN || !switch_count)) 79 if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
80 return;
81
82 /*
83 * When a freshly created task is scheduled once, changes its state to
84 * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
85 * musn't be checked.
86 */
87 if (unlikely(!switch_count))
82 return; 88 return;
83 89
84 if (switch_count != t->last_switch_count) { 90 if (switch_count != t->last_switch_count) {
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 342d8f44e401..0119b9d467ae 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
53 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
54 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
55 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
56 irq_startup(desc); 56 irq_startup(desc, false);
57 } 57 }
58 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
59 } 59 }
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
70 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
71 if (!desc->action && irq_settings_can_probe(desc)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
73 if (irq_startup(desc)) 73 if (irq_startup(desc, false))
74 desc->istate |= IRQS_PENDING; 74 desc->istate |= IRQS_PENDING;
75 } 75 }
76 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f7c543a801d9..fb7db75ee0c8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -157,19 +157,22 @@ static void irq_state_set_masked(struct irq_desc *desc)
157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); 157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
158} 158}
159 159
160int irq_startup(struct irq_desc *desc) 160int irq_startup(struct irq_desc *desc, bool resend)
161{ 161{
162 int ret = 0;
163
162 irq_state_clr_disabled(desc); 164 irq_state_clr_disabled(desc);
163 desc->depth = 0; 165 desc->depth = 0;
164 166
165 if (desc->irq_data.chip->irq_startup) { 167 if (desc->irq_data.chip->irq_startup) {
166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 168 ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
167 irq_state_clr_masked(desc); 169 irq_state_clr_masked(desc);
168 return ret; 170 } else {
171 irq_enable(desc);
169 } 172 }
170 173 if (resend)
171 irq_enable(desc); 174 check_irq_resend(desc, desc->irq_data.irq);
172 return 0; 175 return ret;
173} 176}
174 177
175void irq_shutdown(struct irq_desc *desc) 178void irq_shutdown(struct irq_desc *desc)
@@ -330,6 +333,24 @@ out_unlock:
330} 333}
331EXPORT_SYMBOL_GPL(handle_simple_irq); 334EXPORT_SYMBOL_GPL(handle_simple_irq);
332 335
336/*
337 * Called unconditionally from handle_level_irq() and only for oneshot
338 * interrupts from handle_fasteoi_irq()
339 */
340static void cond_unmask_irq(struct irq_desc *desc)
341{
342 /*
343 * We need to unmask in the following cases:
344 * - Standard level irq (IRQF_ONESHOT is not set)
345 * - Oneshot irq which did not wake the thread (caused by a
346 * spurious interrupt or a primary handler handling it
347 * completely).
348 */
349 if (!irqd_irq_disabled(&desc->irq_data) &&
350 irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
351 unmask_irq(desc);
352}
353
333/** 354/**
334 * handle_level_irq - Level type irq handler 355 * handle_level_irq - Level type irq handler
335 * @irq: the interrupt number 356 * @irq: the interrupt number
@@ -362,8 +383,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
362 383
363 handle_irq_event(desc); 384 handle_irq_event(desc);
364 385
365 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) 386 cond_unmask_irq(desc);
366 unmask_irq(desc); 387
367out_unlock: 388out_unlock:
368 raw_spin_unlock(&desc->lock); 389 raw_spin_unlock(&desc->lock);
369} 390}
@@ -417,6 +438,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
417 preflow_handler(desc); 438 preflow_handler(desc);
418 handle_irq_event(desc); 439 handle_irq_event(desc);
419 440
441 if (desc->istate & IRQS_ONESHOT)
442 cond_unmask_irq(desc);
443
420out_eoi: 444out_eoi:
421 desc->irq_data.chip->irq_eoi(&desc->irq_data); 445 desc->irq_data.chip->irq_eoi(&desc->irq_data);
422out_unlock: 446out_unlock:
@@ -625,7 +649,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
625 irq_settings_set_noprobe(desc); 649 irq_settings_set_noprobe(desc);
626 irq_settings_set_norequest(desc); 650 irq_settings_set_norequest(desc);
627 irq_settings_set_nothread(desc); 651 irq_settings_set_nothread(desc);
628 irq_startup(desc); 652 irq_startup(desc, true);
629 } 653 }
630out: 654out:
631 irq_put_desc_busunlock(desc, flags); 655 irq_put_desc_busunlock(desc, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a73dd6c7372d..40378ff877e7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
15 15
16#define istate core_internal_state__do_not_mess_with_it 16#define istate core_internal_state__do_not_mess_with_it
17 17
18extern int noirqdebug; 18extern bool noirqdebug;
19 19
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
@@ -67,7 +67,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
69 69
70extern int irq_startup(struct irq_desc *desc); 70extern int irq_startup(struct irq_desc *desc, bool resend);
71extern void irq_shutdown(struct irq_desc *desc); 71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 73extern void irq_disable(struct irq_desc *desc);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..1f9e26526b69 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
135 return -EINVAL; 135 return -EINVAL;
136 if (intsize < 1) 136 if (intsize < 1)
137 return -EINVAL; 137 return -EINVAL;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
139 (intspec[0] >= d->hwirq_base + d->nr_irq)))
140 return -EINVAL;
138 141
139 *out_hwirq = intspec[0]; 142 *out_hwirq = intspec[0];
140 *out_type = IRQ_TYPE_NONE; 143 *out_type = IRQ_TYPE_NONE;
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 146 return 0;
144} 147}
145 148
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 149/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 150 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 151 */
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 180}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 181EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 182#endif /* CONFIG_OF_IRQ */
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 67ce837ae52c..32313c084442 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -623,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
623 623
624static int irq_wait_for_interrupt(struct irqaction *action) 624static int irq_wait_for_interrupt(struct irqaction *action)
625{ 625{
626 set_current_state(TASK_INTERRUPTIBLE);
627
626 while (!kthread_should_stop()) { 628 while (!kthread_should_stop()) {
627 set_current_state(TASK_INTERRUPTIBLE);
628 629
629 if (test_and_clear_bit(IRQTF_RUNTHREAD, 630 if (test_and_clear_bit(IRQTF_RUNTHREAD,
630 &action->thread_flags)) { 631 &action->thread_flags)) {
@@ -632,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
632 return 0; 633 return 0;
633 } 634 }
634 schedule(); 635 schedule();
636 set_current_state(TASK_INTERRUPTIBLE);
635 } 637 }
638 __set_current_state(TASK_RUNNING);
636 return -1; 639 return -1;
637} 640}
638 641
@@ -1024,7 +1027,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1024 desc->istate |= IRQS_ONESHOT; 1027 desc->istate |= IRQS_ONESHOT;
1025 1028
1026 if (irq_settings_can_autoenable(desc)) 1029 if (irq_settings_can_autoenable(desc))
1027 irq_startup(desc); 1030 irq_startup(desc, true);
1028 else 1031 else
1029 /* Undo nested disables: */ 1032 /* Undo nested disables: */
1030 desc->depth = 1; 1033 desc->depth = 1;
@@ -1289,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);
1289 * and to set up the interrupt handler in the right order. 1292 * and to set up the interrupt handler in the right order.
1290 * 1293 *
1291 * If you want to set up a threaded irq handler for your device 1294 * If you want to set up a threaded irq handler for your device
1292 * then you need to supply @handler and @thread_fn. @handler ist 1295 * then you need to supply @handler and @thread_fn. @handler is
1293 * still called in hard interrupt context and has to check 1296 * still called in hard interrupt context and has to check
1294 * whether the interrupt originates from the device. If yes it 1297 * whether the interrupt originates from the device. If yes it
1295 * needs to disable the interrupt on the device and return 1298 * needs to disable the interrupt on the device and return
@@ -1596,7 +1599,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1596 return -ENOMEM; 1599 return -ENOMEM;
1597 1600
1598 action->handler = handler; 1601 action->handler = handler;
1599 action->flags = IRQF_PERCPU; 1602 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1600 action->name = devname; 1603 action->name = devname;
1601 action->percpu_dev_id = dev_id; 1604 action->percpu_dev_id = dev_id;
1602 1605
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index b5f4742693c0..611cd6003c45 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next) 87 (action->flags & __IRQF_TIMER) ||
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
88 goto out; 90 goto out;
89 91
90 /* Already running on another processor */ 92 /* Already running on another processor */
@@ -323,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
323 desc->irqs_unhandled = 0; 325 desc->irqs_unhandled = 0;
324} 326}
325 327
326int noirqdebug __read_mostly; 328bool noirqdebug __read_mostly;
327 329
328int noirqdebug_setup(char *str) 330int noirqdebug_setup(char *str)
329{ 331{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) { 55 if (cval) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime); 61 t = cputime.utime + cputime.stime;
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cputime_le(cval, t)) 66 if (cval < t)
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cputime_sub(cval, t); 70 cval = cval - t;
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (!cputime_eq(cval, cputime_zero) || 164 if (cval || nval) {
165 !cputime_eq(nval, cputime_zero)) { 165 if (nval > 0)
166 if (cputime_gt(nval, cputime_zero)) 166 nval += cputime_one_jiffy;
167 nval = cputime_add(nval, cputime_one_jiffy);
168 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
169 } 168 }
170 it->expires = nval; 169 it->expires = nval;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bbdfe2a462a0..01d3b70fc98a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -66,19 +66,53 @@ void jump_label_inc(struct jump_label_key *key)
66 return; 66 return;
67 67
68 jump_label_lock(); 68 jump_label_lock();
69 if (atomic_add_return(1, &key->enabled) == 1) 69 if (atomic_read(&key->enabled) == 0)
70 jump_label_update(key, JUMP_LABEL_ENABLE); 70 jump_label_update(key, JUMP_LABEL_ENABLE);
71 atomic_inc(&key->enabled);
71 jump_label_unlock(); 72 jump_label_unlock();
72} 73}
74EXPORT_SYMBOL_GPL(jump_label_inc);
73 75
74void jump_label_dec(struct jump_label_key *key) 76static void __jump_label_dec(struct jump_label_key *key,
77 unsigned long rate_limit, struct delayed_work *work)
75{ 78{
76 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
77 return; 80 return;
78 81
79 jump_label_update(key, JUMP_LABEL_DISABLE); 82 if (rate_limit) {
83 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit);
85 } else
86 jump_label_update(key, JUMP_LABEL_DISABLE);
87
80 jump_label_unlock(); 88 jump_label_unlock();
81} 89}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91
92static void jump_label_update_timeout(struct work_struct *work)
93{
94 struct jump_label_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL);
97}
98
99void jump_label_dec(struct jump_label_key *key)
100{
101 __jump_label_dec(key, 0, NULL);
102}
103
104void jump_label_dec_deferred(struct jump_label_key_deferred *key)
105{
106 __jump_label_dec(&key->key, key->timeout, &key->work);
107}
108
109
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl)
112{
113 key->timeout = rl;
114 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
115}
82 116
83static int addr_conflict(struct jump_entry *entry, void *start, void *end) 117static int addr_conflict(struct jump_entry *entry, void *start, void *end)
84{ 118{
@@ -110,7 +144,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
110 * running code can override this to make the non-live update case 144 * running code can override this to make the non-live update case
111 * cheaper. 145 * cheaper.
112 */ 146 */
113void __weak arch_jump_label_transform_static(struct jump_entry *entry, 147void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
114 enum jump_label_type type) 148 enum jump_label_type type)
115{ 149{
116 arch_jump_label_transform(entry, type); 150 arch_jump_label_transform(entry, type);
@@ -216,8 +250,13 @@ void jump_label_apply_nops(struct module *mod)
216 if (iter_start == iter_stop) 250 if (iter_start == iter_stop)
217 return; 251 return;
218 252
219 for (iter = iter_start; iter < iter_stop; iter++) 253 for (iter = iter_start; iter < iter_stop; iter++) {
220 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 254 struct jump_label_key *iterk;
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 }
221} 260}
222 261
223static int jump_label_add_module(struct module *mod) 262static int jump_label_add_module(struct module *mod)
@@ -257,8 +296,7 @@ static int jump_label_add_module(struct module *mod)
257 key->next = jlm; 296 key->next = jlm;
258 297
259 if (jump_label_enabled(key)) 298 if (jump_label_enabled(key))
260 __jump_label_update(key, iter, iter_stop, 299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
261 JUMP_LABEL_ENABLE);
262 } 300 }
263 301
264 return 0; 302 return 0;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index dc7bc0829286..7b0886786701 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
36#include <linux/syscore_ops.h> 35#include <linux/syscore_ops.h>
37 36
38#include <asm/page.h> 37#include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
1094 if (kexec_crash_image) { 1093 if (kexec_crash_image) {
1095 struct pt_regs fixed_regs; 1094 struct pt_regs fixed_regs;
1096 1095
1097 kmsg_dump(KMSG_DUMP_KEXEC);
1098
1099 crash_setup_regs(&fixed_regs, regs); 1096 crash_setup_regs(&fixed_regs, regs);
1100 crash_save_vmcoreinfo(); 1097 crash_save_vmcoreinfo();
1101 machine_crash_shutdown(&fixed_regs); 1098 machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
1132{ 1129{
1133 int ret = 0; 1130 int ret = 0;
1134 unsigned long start, end; 1131 unsigned long start, end;
1132 unsigned long old_size;
1133 struct resource *ram_res;
1135 1134
1136 mutex_lock(&kexec_mutex); 1135 mutex_lock(&kexec_mutex);
1137 1136
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
1141 } 1140 }
1142 start = crashk_res.start; 1141 start = crashk_res.start;
1143 end = crashk_res.end; 1142 end = crashk_res.end;
1143 old_size = (end == 0) ? 0 : end - start + 1;
1144 if (new_size >= old_size) {
1145 ret = (new_size == old_size) ? 0 : -EINVAL;
1146 goto unlock;
1147 }
1144 1148
1145 if (new_size >= end - start + 1) { 1149 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1146 ret = -EINVAL; 1150 if (!ram_res) {
1147 if (new_size == end - start + 1) 1151 ret = -ENOMEM;
1148 ret = 0;
1149 goto unlock; 1152 goto unlock;
1150 } 1153 }
1151 1154
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
1157 1160
1158 if ((start == end) && (crashk_res.parent != NULL)) 1161 if ((start == end) && (crashk_res.parent != NULL))
1159 release_resource(&crashk_res); 1162 release_resource(&crashk_res);
1163
1164 ram_res->start = end;
1165 ram_res->end = crashk_res.end;
1166 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1167 ram_res->name = "System RAM";
1168
1160 crashk_res.end = end - 1; 1169 crashk_res.end = end - 1;
1170
1171 insert_resource(&iomem_resource, ram_res);
1161 crash_unmap_reserved_pages(); 1172 crash_unmap_reserved_pages();
1162 1173
1163unlock: 1174unlock:
@@ -1523,7 +1534,7 @@ int kernel_kexec(void)
1523 1534
1524#ifdef CONFIG_KEXEC_JUMP 1535#ifdef CONFIG_KEXEC_JUMP
1525 if (kexec_image->preserve_context) { 1536 if (kexec_image->preserve_context) {
1526 mutex_lock(&pm_mutex); 1537 lock_system_sleep();
1527 pm_prepare_console(); 1538 pm_prepare_console();
1528 error = freeze_processes(); 1539 error = freeze_processes();
1529 if (error) { 1540 if (error) {
@@ -1576,7 +1587,7 @@ int kernel_kexec(void)
1576 thaw_processes(); 1587 thaw_processes();
1577 Restore_console: 1588 Restore_console:
1578 pm_restore_console(); 1589 pm_restore_console();
1579 mutex_unlock(&pm_mutex); 1590 unlock_system_sleep();
1580 } 1591 }
1581#endif 1592#endif
1582 1593
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a4bea97c75b6..a0a88543934e 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
36#include <linux/resource.h> 36#include <linux/resource.h>
37#include <linux/notifier.h> 37#include <linux/notifier.h>
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/rwsem.h>
39#include <asm/uaccess.h> 40#include <asm/uaccess.h>
40 41
41#include <trace/events/module.h> 42#include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
50static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; 51static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
51static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; 52static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
52static DEFINE_SPINLOCK(umh_sysctl_lock); 53static DEFINE_SPINLOCK(umh_sysctl_lock);
54static DECLARE_RWSEM(umhelper_sem);
53 55
54#ifdef CONFIG_MODULES 56#ifdef CONFIG_MODULES
55 57
@@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
275 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 277 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
276 * (used for preventing user land processes from being created after the user 278 * (used for preventing user land processes from being created after the user
277 * land has been frozen during a system-wide hibernation or suspend operation). 279 * land has been frozen during a system-wide hibernation or suspend operation).
280 * Should always be manipulated under umhelper_sem acquired for write.
278 */ 281 */
279static int usermodehelper_disabled = 1; 282static int usermodehelper_disabled = 1;
280 283
@@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1;
282static atomic_t running_helpers = ATOMIC_INIT(0); 285static atomic_t running_helpers = ATOMIC_INIT(0);
283 286
284/* 287/*
285 * Wait queue head used by usermodehelper_pm_callback() to wait for all running 288 * Wait queue head used by usermodehelper_disable() to wait for all running
286 * helpers to finish. 289 * helpers to finish.
287 */ 290 */
288static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); 291static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
289 292
290/* 293/*
291 * Time to wait for running_helpers to become zero before the setting of 294 * Time to wait for running_helpers to become zero before the setting of
292 * usermodehelper_disabled in usermodehelper_pm_callback() fails 295 * usermodehelper_disabled in usermodehelper_disable() fails
293 */ 296 */
294#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 297#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
295 298
299void read_lock_usermodehelper(void)
300{
301 down_read(&umhelper_sem);
302}
303EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
304
305void read_unlock_usermodehelper(void)
306{
307 up_read(&umhelper_sem);
308}
309EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
310
296/** 311/**
297 * usermodehelper_disable - prevent new helpers from being started 312 * usermodehelper_disable - prevent new helpers from being started
298 */ 313 */
@@ -300,8 +315,10 @@ int usermodehelper_disable(void)
300{ 315{
301 long retval; 316 long retval;
302 317
318 down_write(&umhelper_sem);
303 usermodehelper_disabled = 1; 319 usermodehelper_disabled = 1;
304 smp_mb(); 320 up_write(&umhelper_sem);
321
305 /* 322 /*
306 * From now on call_usermodehelper_exec() won't start any new 323 * From now on call_usermodehelper_exec() won't start any new
307 * helpers, so it is sufficient if running_helpers turns out to 324 * helpers, so it is sufficient if running_helpers turns out to
@@ -314,7 +331,9 @@ int usermodehelper_disable(void)
314 if (retval) 331 if (retval)
315 return 0; 332 return 0;
316 333
334 down_write(&umhelper_sem);
317 usermodehelper_disabled = 0; 335 usermodehelper_disabled = 0;
336 up_write(&umhelper_sem);
318 return -EAGAIN; 337 return -EAGAIN;
319} 338}
320 339
@@ -323,7 +342,9 @@ int usermodehelper_disable(void)
323 */ 342 */
324void usermodehelper_enable(void) 343void usermodehelper_enable(void)
325{ 344{
345 down_write(&umhelper_sem);
326 usermodehelper_disabled = 0; 346 usermodehelper_disabled = 0;
347 up_write(&umhelper_sem);
327} 348}
328 349
329/** 350/**
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823b..9788c0ec6f43 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1077 /* Early boot. kretprobe_table_locks not yet initialized. */ 1077 /* Early boot. kretprobe_table_locks not yet initialized. */
1078 return; 1078 return;
1079 1079
1080 INIT_HLIST_HEAD(&empty_rp);
1080 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1081 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1081 head = &kretprobe_inst_table[hash]; 1082 head = &kretprobe_inst_table[hash];
1082 kretprobe_table_lock(hash, &flags); 1083 kretprobe_table_lock(hash, &flags);
@@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1085 recycle_rp_inst(ri, &empty_rp); 1086 recycle_rp_inst(ri, &empty_rp);
1086 } 1087 }
1087 kretprobe_table_unlock(hash, &flags); 1088 kretprobe_table_unlock(hash, &flags);
1088 INIT_HLIST_HEAD(&empty_rp);
1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
1090 hlist_del(&ri->hlist); 1090 hlist_del(&ri->hlist);
1091 kfree(ri); 1091 kfree(ri);
@@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1673 ri->rp = rp; 1673 ri->rp = rp;
1674 ri->task = current; 1674 ri->task = current;
1675 1675
1676 if (rp->entry_handler && rp->entry_handler(ri, regs)) 1676 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
1677 raw_spin_lock_irqsave(&rp->lock, flags);
1678 hlist_add_head(&ri->hlist, &rp->free_instances);
1679 raw_spin_unlock_irqrestore(&rp->lock, flags);
1677 return 0; 1680 return 0;
1681 }
1678 1682
1679 arch_prepare_kretprobe(ri, regs); 1683 arch_prepare_kretprobe(ri, regs);
1680 1684
@@ -2198,7 +2202,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
2198 const char __user *user_buf, size_t count, loff_t *ppos) 2202 const char __user *user_buf, size_t count, loff_t *ppos)
2199{ 2203{
2200 char buf[32]; 2204 char buf[32];
2201 int buf_size; 2205 size_t buf_size;
2202 2206
2203 buf_size = min(count, (sizeof(buf)-1)); 2207 buf_size = min(count, (sizeof(buf)-1));
2204 if (copy_from_user(buf, user_buf, buf_size)) 2208 if (copy_from_user(buf, user_buf, buf_size))
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6d216a92639..3d3de633702e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -59,6 +59,31 @@ int kthread_should_stop(void)
59EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
60 60
61/** 61/**
62 * kthread_freezable_should_stop - should this freezable kthread return now?
63 * @was_frozen: optional out parameter, indicates whether %current was frozen
64 *
65 * kthread_should_stop() for freezable kthreads, which will enter
66 * the refrigerator if necessary. This function is safe from kthread_stop() /
67 * freezer deadlock and freezable kthreads should use this function instead
68 * of calling try_to_freeze() directly.
69 */
70bool kthread_freezable_should_stop(bool *was_frozen)
71{
72 bool frozen = false;
73
74 might_sleep();
75
76 if (unlikely(freezing(current)))
77 frozen = __refrigerator(true);
78
79 if (was_frozen)
80 *was_frozen = frozen;
81
82 return kthread_should_stop();
83}
84EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
85
86/**
62 * kthread_data - return data value specified on kthread creation 87 * kthread_data - return data value specified on kthread creation
63 * @task: kthread task in question 88 * @task: kthread task in question
64 * 89 *
@@ -257,7 +282,7 @@ int kthreadd(void *unused)
257 set_cpus_allowed_ptr(tsk, cpu_all_mask); 282 set_cpus_allowed_ptr(tsk, cpu_all_mask);
258 set_mems_allowed(node_states[N_HIGH_MEMORY]); 283 set_mems_allowed(node_states[N_HIGH_MEMORY]);
259 284
260 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 285 current->flags |= PF_NOFREEZE;
261 286
262 for (;;) { 287 for (;;) {
263 set_current_state(TASK_INTERRUPTIBLE); 288 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e69434b070da..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -44,6 +44,7 @@
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h> 46#include <linux/gfp.h>
47#include <linux/kmemcheck.h>
47 48
48#include <asm/sections.h> 49#include <asm/sections.h>
49 50
@@ -430,6 +431,7 @@ unsigned int max_lockdep_depth;
430 * about it later on, in lockdep_info(). 431 * about it later on, in lockdep_info().
431 */ 432 */
432static int lockdep_init_error; 433static int lockdep_init_error;
434static const char *lock_init_error;
433static unsigned long lockdep_init_trace_data[20]; 435static unsigned long lockdep_init_trace_data[20];
434static struct stack_trace lockdep_init_trace = { 436static struct stack_trace lockdep_init_trace = {
435 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -498,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
498 usage[i] = '\0'; 500 usage[i] = '\0';
499} 501}
500 502
501static int __print_lock_name(struct lock_class *class) 503static void __print_lock_name(struct lock_class *class)
502{ 504{
503 char str[KSYM_NAME_LEN]; 505 char str[KSYM_NAME_LEN];
504 const char *name; 506 const char *name;
505 507
506 name = class->name; 508 name = class->name;
507 if (!name)
508 name = __get_key_name(class->key, str);
509
510 return printk("%s", name);
511}
512
513static void print_lock_name(struct lock_class *class)
514{
515 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
516 const char *name;
517
518 get_usage_chars(class, usage);
519
520 name = class->name;
521 if (!name) { 509 if (!name) {
522 name = __get_key_name(class->key, str); 510 name = __get_key_name(class->key, str);
523 printk(" (%s", name); 511 printk("%s", name);
524 } else { 512 } else {
525 printk(" (%s", name); 513 printk("%s", name);
526 if (class->name_version > 1) 514 if (class->name_version > 1)
527 printk("#%d", class->name_version); 515 printk("#%d", class->name_version);
528 if (class->subclass) 516 if (class->subclass)
529 printk("/%d", class->subclass); 517 printk("/%d", class->subclass);
530 } 518 }
519}
520
521static void print_lock_name(struct lock_class *class)
522{
523 char usage[LOCK_USAGE_CHARS];
524
525 get_usage_chars(class, usage);
526
527 printk(" (");
528 __print_lock_name(class);
531 printk("){%s}", usage); 529 printk("){%s}", usage);
532} 530}
533 531
@@ -567,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
567 } 565 }
568} 566}
569 567
570static void print_kernel_version(void) 568static void print_kernel_ident(void)
571{ 569{
572 printk("%s %.*s\n", init_utsname()->release, 570 printk("%s %.*s %s\n", init_utsname()->release,
573 (int)strcspn(init_utsname()->version, " "), 571 (int)strcspn(init_utsname()->version, " "),
574 init_utsname()->version); 572 init_utsname()->version,
573 print_tainted());
575} 574}
576 575
577static int very_verbose(struct lock_class *class) 576static int very_verbose(struct lock_class *class)
@@ -655,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
655 if (unlikely(!lockdep_initialized)) { 654 if (unlikely(!lockdep_initialized)) {
656 lockdep_init(); 655 lockdep_init();
657 lockdep_init_error = 1; 656 lockdep_init_error = 1;
657 lock_init_error = lock->name;
658 save_stack_trace(&lockdep_init_trace); 658 save_stack_trace(&lockdep_init_trace);
659 } 659 }
660#endif 660#endif
@@ -722,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
722 722
723 class = look_up_lock_class(lock, subclass); 723 class = look_up_lock_class(lock, subclass);
724 if (likely(class)) 724 if (likely(class))
725 return class; 725 goto out_set_class_cache;
726 726
727 /* 727 /*
728 * Debug-check: all keys must be persistent! 728 * Debug-check: all keys must be persistent!
@@ -807,6 +807,7 @@ out_unlock_set:
807 graph_unlock(); 807 graph_unlock();
808 raw_local_irq_restore(flags); 808 raw_local_irq_restore(flags);
809 809
810out_set_class_cache:
810 if (!subclass || force) 811 if (!subclass || force)
811 lock->class_cache[0] = class; 812 lock->class_cache[0] = class;
812 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
@@ -1148,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1148 printk("\n"); 1149 printk("\n");
1149 printk("======================================================\n"); 1150 printk("======================================================\n");
1150 printk("[ INFO: possible circular locking dependency detected ]\n"); 1151 printk("[ INFO: possible circular locking dependency detected ]\n");
1151 print_kernel_version(); 1152 print_kernel_ident();
1152 printk("-------------------------------------------------------\n"); 1153 printk("-------------------------------------------------------\n");
1153 printk("%s/%d is trying to acquire lock:\n", 1154 printk("%s/%d is trying to acquire lock:\n",
1154 curr->comm, task_pid_nr(curr)); 1155 curr->comm, task_pid_nr(curr));
@@ -1487,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1487 printk("======================================================\n"); 1488 printk("======================================================\n");
1488 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1489 irqclass, irqclass); 1490 irqclass, irqclass);
1490 print_kernel_version(); 1491 print_kernel_ident();
1491 printk("------------------------------------------------------\n"); 1492 printk("------------------------------------------------------\n");
1492 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1493 curr->comm, task_pid_nr(curr), 1494 curr->comm, task_pid_nr(curr),
@@ -1716,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1716 printk("\n"); 1717 printk("\n");
1717 printk("=============================================\n"); 1718 printk("=============================================\n");
1718 printk("[ INFO: possible recursive locking detected ]\n"); 1719 printk("[ INFO: possible recursive locking detected ]\n");
1719 print_kernel_version(); 1720 print_kernel_ident();
1720 printk("---------------------------------------------\n"); 1721 printk("---------------------------------------------\n");
1721 printk("%s/%d is trying to acquire lock:\n", 1722 printk("%s/%d is trying to acquire lock:\n",
1722 curr->comm, task_pid_nr(curr)); 1723 curr->comm, task_pid_nr(curr));
@@ -2223,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2223 printk("\n"); 2224 printk("\n");
2224 printk("=================================\n"); 2225 printk("=================================\n");
2225 printk("[ INFO: inconsistent lock state ]\n"); 2226 printk("[ INFO: inconsistent lock state ]\n");
2226 print_kernel_version(); 2227 print_kernel_ident();
2227 printk("---------------------------------\n"); 2228 printk("---------------------------------\n");
2228 2229
2229 printk("inconsistent {%s} -> {%s} usage.\n", 2230 printk("inconsistent {%s} -> {%s} usage.\n",
@@ -2288,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr,
2288 printk("\n"); 2289 printk("\n");
2289 printk("=========================================================\n"); 2290 printk("=========================================================\n");
2290 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2291 print_kernel_version(); 2292 print_kernel_ident();
2292 printk("---------------------------------------------------------\n"); 2293 printk("---------------------------------------------------------\n");
2293 printk("%s/%d just changed the state of lock:\n", 2294 printk("%s/%d just changed the state of lock:\n",
2294 curr->comm, task_pid_nr(curr)); 2295 curr->comm, task_pid_nr(curr));
@@ -2948,7 +2949,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2948void lockdep_init_map(struct lockdep_map *lock, const char *name, 2949void lockdep_init_map(struct lockdep_map *lock, const char *name,
2949 struct lock_class_key *key, int subclass) 2950 struct lock_class_key *key, int subclass)
2950{ 2951{
2951 memset(lock, 0, sizeof(*lock)); 2952 int i;
2953
2954 kmemcheck_mark_initialized(lock, sizeof(*lock));
2955
2956 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2957 lock->class_cache[i] = NULL;
2952 2958
2953#ifdef CONFIG_LOCK_STAT 2959#ifdef CONFIG_LOCK_STAT
2954 lock->cpu = raw_smp_processor_id(); 2960 lock->cpu = raw_smp_processor_id();
@@ -3169,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3169 printk("\n"); 3175 printk("\n");
3170 printk("=====================================\n"); 3176 printk("=====================================\n");
3171 printk("[ BUG: bad unlock balance detected! ]\n"); 3177 printk("[ BUG: bad unlock balance detected! ]\n");
3178 print_kernel_ident();
3172 printk("-------------------------------------\n"); 3179 printk("-------------------------------------\n");
3173 printk("%s/%d is trying to release lock (", 3180 printk("%s/%d is trying to release lock (",
3174 curr->comm, task_pid_nr(curr)); 3181 curr->comm, task_pid_nr(curr));
@@ -3613,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3613 printk("\n"); 3620 printk("\n");
3614 printk("=================================\n"); 3621 printk("=================================\n");
3615 printk("[ BUG: bad contention detected! ]\n"); 3622 printk("[ BUG: bad contention detected! ]\n");
3623 print_kernel_ident();
3616 printk("---------------------------------\n"); 3624 printk("---------------------------------\n");
3617 printk("%s/%d is trying to contend lock (", 3625 printk("%s/%d is trying to contend lock (",
3618 curr->comm, task_pid_nr(curr)); 3626 curr->comm, task_pid_nr(curr));
@@ -3968,7 +3976,8 @@ void __init lockdep_info(void)
3968 3976
3969#ifdef CONFIG_DEBUG_LOCKDEP 3977#ifdef CONFIG_DEBUG_LOCKDEP
3970 if (lockdep_init_error) { 3978 if (lockdep_init_error) {
3971 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); 3979 printk("WARNING: lockdep init error! lock-%s was acquired"
3980 "before lockdep_init\n", lock_init_error);
3972 printk("Call stack leading to lockdep invocation was:\n"); 3981 printk("Call stack leading to lockdep invocation was:\n");
3973 print_stack_trace(&lockdep_init_trace, 0); 3982 print_stack_trace(&lockdep_init_trace, 0);
3974 } 3983 }
@@ -3987,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3987 printk("\n"); 3996 printk("\n");
3988 printk("=========================\n"); 3997 printk("=========================\n");
3989 printk("[ BUG: held lock freed! ]\n"); 3998 printk("[ BUG: held lock freed! ]\n");
3999 print_kernel_ident();
3990 printk("-------------------------\n"); 4000 printk("-------------------------\n");
3991 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4001 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3992 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4002 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
@@ -4044,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr)
4044 printk("\n"); 4054 printk("\n");
4045 printk("=====================================\n"); 4055 printk("=====================================\n");
4046 printk("[ BUG: lock held at task exit time! ]\n"); 4056 printk("[ BUG: lock held at task exit time! ]\n");
4057 print_kernel_ident();
4047 printk("-------------------------------------\n"); 4058 printk("-------------------------------------\n");
4048 printk("%s/%d is exiting with locks still held!\n", 4059 printk("%s/%d is exiting with locks still held!\n",
4049 curr->comm, task_pid_nr(curr)); 4060 curr->comm, task_pid_nr(curr));
@@ -4141,6 +4152,7 @@ void lockdep_sys_exit(void)
4141 printk("\n"); 4152 printk("\n");
4142 printk("================================================\n"); 4153 printk("================================================\n");
4143 printk("[ BUG: lock held when returning to user space! ]\n"); 4154 printk("[ BUG: lock held when returning to user space! ]\n");
4155 print_kernel_ident();
4144 printk("------------------------------------------------\n"); 4156 printk("------------------------------------------------\n");
4145 printk("%s/%d is leaving the kernel with locks still held!\n", 4157 printk("%s/%d is leaving the kernel with locks still held!\n",
4146 curr->comm, curr->pid); 4158 curr->comm, curr->pid);
@@ -4160,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4160 printk("\n"); 4172 printk("\n");
4161 printk("===============================\n"); 4173 printk("===============================\n");
4162 printk("[ INFO: suspicious RCU usage. ]\n"); 4174 printk("[ INFO: suspicious RCU usage. ]\n");
4175 print_kernel_ident();
4163 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4164 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4165 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4166 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4180
4181 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section
4183 * between rcu_idle_enter() and rcu_idle_exit()), then RCU
4184 * considers that CPU to be in an "extended quiescent state",
4185 * which means that RCU will be completely ignoring that CPU.
4186 * Therefore, rcu_read_lock() and friends have absolutely no
4187 * effect on a CPU running in that state. In other words, even if
4188 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
4189 * delete data structures out from under it. RCU really has no
4190 * choice here: we need to keep an RCU-free window in idle where
4191 * the CPU may possibly enter into low power mode. This way we can
4192 * report an extended quiescent state to other CPUs that started a grace
4193 * period. Otherwise we would delay any grace period as long as we run
4194 * in the idle task.
4195 *
4196 * So complain bitterly if someone does call rcu_read_lock(),
4197 * rcu_read_lock_bh() and so on from extended quiescent states.
4198 */
4199 if (rcu_is_cpu_idle())
4200 printk("RCU used illegally from extended quiescent state!\n");
4201
4167 lockdep_print_held_locks(curr); 4202 lockdep_print_held_locks(curr);
4168 printk("\nstack backtrace:\n"); 4203 printk("\nstack backtrace:\n");
4169 dump_stack(); 4204 dump_stack();
diff --git a/kernel/module.c b/kernel/module.c
index 178333c48d1e..2c932760fd33 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,12 +62,6 @@
62#define CREATE_TRACE_POINTS 62#define CREATE_TRACE_POINTS
63#include <trace/events/module.h> 63#include <trace/events/module.h>
64 64
65#if 0
66#define DEBUGP printk
67#else
68#define DEBUGP(fmt , a...)
69#endif
70
71#ifndef ARCH_SHF_SMALL 65#ifndef ARCH_SHF_SMALL
72#define ARCH_SHF_SMALL 0 66#define ARCH_SHF_SMALL 0
73#endif 67#endif
@@ -138,7 +132,6 @@ struct load_info {
138 unsigned long len; 132 unsigned long len;
139 Elf_Shdr *sechdrs; 133 Elf_Shdr *sechdrs;
140 char *secstrings, *strtab; 134 char *secstrings, *strtab;
141 unsigned long *strmap;
142 unsigned long symoffs, stroffs; 135 unsigned long symoffs, stroffs;
143 struct _ddebug *debug; 136 struct _ddebug *debug;
144 unsigned int num_debug; 137 unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
410 return fsa.sym; 403 return fsa.sym;
411 } 404 }
412 405
413 DEBUGP("Failed to find symbol %s\n", name); 406 pr_debug("Failed to find symbol %s\n", name);
414 return NULL; 407 return NULL;
415} 408}
416EXPORT_SYMBOL_GPL(find_symbol); 409EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
600 593
601 list_for_each_entry(use, &b->source_list, source_list) { 594 list_for_each_entry(use, &b->source_list, source_list) {
602 if (use->source == a) { 595 if (use->source == a) {
603 DEBUGP("%s uses %s!\n", a->name, b->name); 596 pr_debug("%s uses %s!\n", a->name, b->name);
604 return 1; 597 return 1;
605 } 598 }
606 } 599 }
607 DEBUGP("%s does not use %s!\n", a->name, b->name); 600 pr_debug("%s does not use %s!\n", a->name, b->name);
608 return 0; 601 return 0;
609} 602}
610 603
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
619{ 612{
620 struct module_use *use; 613 struct module_use *use;
621 614
622 DEBUGP("Allocating new usage for %s.\n", a->name); 615 pr_debug("Allocating new usage for %s.\n", a->name);
623 use = kmalloc(sizeof(*use), GFP_ATOMIC); 616 use = kmalloc(sizeof(*use), GFP_ATOMIC);
624 if (!use) { 617 if (!use) {
625 printk(KERN_WARNING "%s: out of memory loading\n", a->name); 618 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
663 mutex_lock(&module_mutex); 656 mutex_lock(&module_mutex);
664 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { 657 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
665 struct module *i = use->target; 658 struct module *i = use->target;
666 DEBUGP("%s unusing %s\n", mod->name, i->name); 659 pr_debug("%s unusing %s\n", mod->name, i->name);
667 module_put(i); 660 module_put(i);
668 list_del(&use->source_list); 661 list_del(&use->source_list);
669 list_del(&use->target_list); 662 list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
726 } 719 }
727} 720}
728 721
729unsigned int module_refcount(struct module *mod) 722unsigned long module_refcount(struct module *mod)
730{ 723{
731 unsigned int incs = 0, decs = 0; 724 unsigned long incs = 0, decs = 0;
732 int cpu; 725 int cpu;
733 726
734 for_each_possible_cpu(cpu) 727 for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
761 /* Since we might sleep for some time, release the mutex first */ 754 /* Since we might sleep for some time, release the mutex first */
762 mutex_unlock(&module_mutex); 755 mutex_unlock(&module_mutex);
763 for (;;) { 756 for (;;) {
764 DEBUGP("Looking at refcount...\n"); 757 pr_debug("Looking at refcount...\n");
765 set_current_state(TASK_UNINTERRUPTIBLE); 758 set_current_state(TASK_UNINTERRUPTIBLE);
766 if (module_refcount(mod) == 0) 759 if (module_refcount(mod) == 0)
767 break; 760 break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
804 if (mod->state != MODULE_STATE_LIVE) { 797 if (mod->state != MODULE_STATE_LIVE) {
805 /* FIXME: if (force), slam module count and wake up 798 /* FIXME: if (force), slam module count and wake up
806 waiter --RR */ 799 waiter --RR */
807 DEBUGP("%s already dying\n", mod->name); 800 pr_debug("%s already dying\n", mod->name);
808 ret = -EBUSY; 801 ret = -EBUSY;
809 goto out; 802 goto out;
810 } 803 }
@@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
854 struct module_use *use; 847 struct module_use *use;
855 int printed_something = 0; 848 int printed_something = 0;
856 849
857 seq_printf(m, " %u ", module_refcount(mod)); 850 seq_printf(m, " %lu ", module_refcount(mod));
858 851
859 /* Always include a trailing , so userspace can differentiate 852 /* Always include a trailing , so userspace can differentiate
860 between this and the old multi-field proc format. */ 853 between this and the old multi-field proc format. */
@@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
904static ssize_t show_refcnt(struct module_attribute *mattr, 897static ssize_t show_refcnt(struct module_attribute *mattr,
905 struct module_kobject *mk, char *buffer) 898 struct module_kobject *mk, char *buffer)
906{ 899{
907 return sprintf(buffer, "%u\n", module_refcount(mk->mod)); 900 return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
908} 901}
909 902
910static struct module_attribute refcnt = { 903static struct module_attribute modinfo_refcnt =
911 .attr = { .name = "refcnt", .mode = 0444 }, 904 __ATTR(refcnt, 0444, show_refcnt, NULL);
912 .show = show_refcnt,
913};
914 905
915void module_put(struct module *module) 906void module_put(struct module *module)
916{ 907{
@@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod)
951} 942}
952#endif /* CONFIG_MODULE_UNLOAD */ 943#endif /* CONFIG_MODULE_UNLOAD */
953 944
945static size_t module_flags_taint(struct module *mod, char *buf)
946{
947 size_t l = 0;
948
949 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
950 buf[l++] = 'P';
951 if (mod->taints & (1 << TAINT_OOT_MODULE))
952 buf[l++] = 'O';
953 if (mod->taints & (1 << TAINT_FORCED_MODULE))
954 buf[l++] = 'F';
955 if (mod->taints & (1 << TAINT_CRAP))
956 buf[l++] = 'C';
957 /*
958 * TAINT_FORCED_RMMOD: could be added.
959 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
960 * apply to modules.
961 */
962 return l;
963}
964
954static ssize_t show_initstate(struct module_attribute *mattr, 965static ssize_t show_initstate(struct module_attribute *mattr,
955 struct module_kobject *mk, char *buffer) 966 struct module_kobject *mk, char *buffer)
956{ 967{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
970 return sprintf(buffer, "%s\n", state); 981 return sprintf(buffer, "%s\n", state);
971} 982}
972 983
973static struct module_attribute initstate = { 984static struct module_attribute modinfo_initstate =
974 .attr = { .name = "initstate", .mode = 0444 }, 985 __ATTR(initstate, 0444, show_initstate, NULL);
975 .show = show_initstate,
976};
977 986
978static ssize_t store_uevent(struct module_attribute *mattr, 987static ssize_t store_uevent(struct module_attribute *mattr,
979 struct module_kobject *mk, 988 struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
986 return count; 995 return count;
987} 996}
988 997
989struct module_attribute module_uevent = { 998struct module_attribute module_uevent =
990 .attr = { .name = "uevent", .mode = 0200 }, 999 __ATTR(uevent, 0200, NULL, store_uevent);
991 .store = store_uevent, 1000
992}; 1001static ssize_t show_coresize(struct module_attribute *mattr,
1002 struct module_kobject *mk, char *buffer)
1003{
1004 return sprintf(buffer, "%u\n", mk->mod->core_size);
1005}
1006
1007static struct module_attribute modinfo_coresize =
1008 __ATTR(coresize, 0444, show_coresize, NULL);
1009
1010static ssize_t show_initsize(struct module_attribute *mattr,
1011 struct module_kobject *mk, char *buffer)
1012{
1013 return sprintf(buffer, "%u\n", mk->mod->init_size);
1014}
1015
1016static struct module_attribute modinfo_initsize =
1017 __ATTR(initsize, 0444, show_initsize, NULL);
1018
1019static ssize_t show_taint(struct module_attribute *mattr,
1020 struct module_kobject *mk, char *buffer)
1021{
1022 size_t l;
1023
1024 l = module_flags_taint(mk->mod, buffer);
1025 buffer[l++] = '\n';
1026 return l;
1027}
1028
1029static struct module_attribute modinfo_taint =
1030 __ATTR(taint, 0444, show_taint, NULL);
993 1031
994static struct module_attribute *modinfo_attrs[] = { 1032static struct module_attribute *modinfo_attrs[] = {
1033 &module_uevent,
995 &modinfo_version, 1034 &modinfo_version,
996 &modinfo_srcversion, 1035 &modinfo_srcversion,
997 &initstate, 1036 &modinfo_initstate,
998 &module_uevent, 1037 &modinfo_coresize,
1038 &modinfo_initsize,
1039 &modinfo_taint,
999#ifdef CONFIG_MODULE_UNLOAD 1040#ifdef CONFIG_MODULE_UNLOAD
1000 &refcnt, 1041 &modinfo_refcnt,
1001#endif 1042#endif
1002 NULL, 1043 NULL,
1003}; 1044};
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
1057 1098
1058 if (versions[i].crc == maybe_relocated(*crc, crc_owner)) 1099 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 1100 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 1101 pr_debug("Found checksum %lX vs module %lX\n",
1061 maybe_relocated(*crc, crc_owner), versions[i].crc); 1102 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 1103 goto bad_version;
1063 } 1104 }
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1834 case SHN_COMMON: 1875 case SHN_COMMON:
1835 /* We compiled with -fno-common. These are not 1876 /* We compiled with -fno-common. These are not
1836 supposed to happen. */ 1877 supposed to happen. */
1837 DEBUGP("Common symbol: %s\n", name); 1878 pr_debug("Common symbol: %s\n", name);
1838 printk("%s: please compile with -fno-common\n", 1879 printk("%s: please compile with -fno-common\n",
1839 mod->name); 1880 mod->name);
1840 ret = -ENOEXEC; 1881 ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
1842 1883
1843 case SHN_ABS: 1884 case SHN_ABS:
1844 /* Don't need to do anything */ 1885 /* Don't need to do anything */
1845 DEBUGP("Absolute symbol: 0x%08lx\n", 1886 pr_debug("Absolute symbol: 0x%08lx\n",
1846 (long)sym[i].st_value); 1887 (long)sym[i].st_value);
1847 break; 1888 break;
1848 1889
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1966 for (i = 0; i < info->hdr->e_shnum; i++) 2007 for (i = 0; i < info->hdr->e_shnum; i++)
1967 info->sechdrs[i].sh_entsize = ~0UL; 2008 info->sechdrs[i].sh_entsize = ~0UL;
1968 2009
1969 DEBUGP("Core section allocation order:\n"); 2010 pr_debug("Core section allocation order:\n");
1970 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2011 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
1971 for (i = 0; i < info->hdr->e_shnum; ++i) { 2012 for (i = 0; i < info->hdr->e_shnum; ++i) {
1972 Elf_Shdr *s = &info->sechdrs[i]; 2013 Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1978 || strstarts(sname, ".init")) 2019 || strstarts(sname, ".init"))
1979 continue; 2020 continue;
1980 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 2021 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1981 DEBUGP("\t%s\n", name); 2022 pr_debug("\t%s\n", sname);
1982 } 2023 }
1983 switch (m) { 2024 switch (m) {
1984 case 0: /* executable */ 2025 case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
1995 } 2036 }
1996 } 2037 }
1997 2038
1998 DEBUGP("Init section allocation order:\n"); 2039 pr_debug("Init section allocation order:\n");
1999 for (m = 0; m < ARRAY_SIZE(masks); ++m) { 2040 for (m = 0; m < ARRAY_SIZE(masks); ++m) {
2000 for (i = 0; i < info->hdr->e_shnum; ++i) { 2041 for (i = 0; i < info->hdr->e_shnum; ++i) {
2001 Elf_Shdr *s = &info->sechdrs[i]; 2042 Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
2008 continue; 2049 continue;
2009 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 2050 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
2010 | INIT_OFFSET_MASK); 2051 | INIT_OFFSET_MASK);
2011 DEBUGP("\t%s\n", sname); 2052 pr_debug("\t%s\n", sname);
2012 } 2053 }
2013 switch (m) { 2054 switch (m) {
2014 case 0: /* executable */ 2055 case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
2178 return true; 2219 return true;
2179} 2220}
2180 2221
2222/*
2223 * We only allocate and copy the strings needed by the parts of symtab
2224 * we keep. This is simple, but has the effect of making multiple
2225 * copies of duplicates. We could be more sophisticated, see
2226 * linux-kernel thread starting with
2227 * <73defb5e4bca04a6431392cc341112b1@localhost>.
2228 */
2181static void layout_symtab(struct module *mod, struct load_info *info) 2229static void layout_symtab(struct module *mod, struct load_info *info)
2182{ 2230{
2183 Elf_Shdr *symsect = info->sechdrs + info->index.sym; 2231 Elf_Shdr *symsect = info->sechdrs + info->index.sym;
2184 Elf_Shdr *strsect = info->sechdrs + info->index.str; 2232 Elf_Shdr *strsect = info->sechdrs + info->index.str;
2185 const Elf_Sym *src; 2233 const Elf_Sym *src;
2186 unsigned int i, nsrc, ndst; 2234 unsigned int i, nsrc, ndst, strtab_size;
2187 2235
2188 /* Put symbol section at end of init part of module. */ 2236 /* Put symbol section at end of init part of module. */
2189 symsect->sh_flags |= SHF_ALLOC; 2237 symsect->sh_flags |= SHF_ALLOC;
2190 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, 2238 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
2191 info->index.sym) | INIT_OFFSET_MASK; 2239 info->index.sym) | INIT_OFFSET_MASK;
2192 DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); 2240 pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
2193 2241
2194 src = (void *)info->hdr + symsect->sh_offset; 2242 src = (void *)info->hdr + symsect->sh_offset;
2195 nsrc = symsect->sh_size / sizeof(*src); 2243 nsrc = symsect->sh_size / sizeof(*src);
2196 for (ndst = i = 1; i < nsrc; ++i, ++src)
2197 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2198 unsigned int j = src->st_name;
2199 2244
2200 while (!__test_and_set_bit(j, info->strmap) 2245 /* Compute total space required for the core symbols' strtab. */
2201 && info->strtab[j]) 2246 for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
2202 ++j; 2247 if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
2203 ++ndst; 2248 strtab_size += strlen(&info->strtab[src->st_name]) + 1;
2249 ndst++;
2204 } 2250 }
2205 2251
2206 /* Append room for core symbols at end of core part. */ 2252 /* Append room for core symbols at end of core part. */
2207 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); 2253 info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
2208 mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); 2254 info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
2255 mod->core_size += strtab_size;
2209 2256
2210 /* Put string table section at end of init part of module. */ 2257 /* Put string table section at end of init part of module. */
2211 strsect->sh_flags |= SHF_ALLOC; 2258 strsect->sh_flags |= SHF_ALLOC;
2212 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, 2259 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
2213 info->index.str) | INIT_OFFSET_MASK; 2260 info->index.str) | INIT_OFFSET_MASK;
2214 DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); 2261 pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
2215
2216 /* Append room for core symbols' strings at end of core part. */
2217 info->stroffs = mod->core_size;
2218 __set_bit(0, info->strmap);
2219 mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
2220} 2262}
2221 2263
2222static void add_kallsyms(struct module *mod, const struct load_info *info) 2264static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
2237 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); 2279 mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
2238 2280
2239 mod->core_symtab = dst = mod->module_core + info->symoffs; 2281 mod->core_symtab = dst = mod->module_core + info->symoffs;
2282 mod->core_strtab = s = mod->module_core + info->stroffs;
2240 src = mod->symtab; 2283 src = mod->symtab;
2241 *dst = *src; 2284 *dst = *src;
2285 *s++ = 0;
2242 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { 2286 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
2243 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) 2287 if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
2244 continue; 2288 continue;
2289
2245 dst[ndst] = *src; 2290 dst[ndst] = *src;
2246 dst[ndst].st_name = bitmap_weight(info->strmap, 2291 dst[ndst++].st_name = s - mod->core_strtab;
2247 dst[ndst].st_name); 2292 s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
2248 ++ndst;
2249 } 2293 }
2250 mod->core_num_syms = ndst; 2294 mod->core_num_syms = ndst;
2251
2252 mod->core_strtab = s = mod->module_core + info->stroffs;
2253 for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
2254 if (test_bit(i, info->strmap))
2255 *++s = mod->strtab[i];
2256} 2295}
2257#else 2296#else
2258static inline void layout_symtab(struct module *mod, struct load_info *info) 2297static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
2621 mod->module_init = ptr; 2660 mod->module_init = ptr;
2622 2661
2623 /* Transfer each section which specifies SHF_ALLOC */ 2662 /* Transfer each section which specifies SHF_ALLOC */
2624 DEBUGP("final section addresses:\n"); 2663 pr_debug("final section addresses:\n");
2625 for (i = 0; i < info->hdr->e_shnum; i++) { 2664 for (i = 0; i < info->hdr->e_shnum; i++) {
2626 void *dest; 2665 void *dest;
2627 Elf_Shdr *shdr = &info->sechdrs[i]; 2666 Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
2639 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); 2678 memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
2640 /* Update sh_addr to point to copy in image. */ 2679 /* Update sh_addr to point to copy in image. */
2641 shdr->sh_addr = (unsigned long)dest; 2680 shdr->sh_addr = (unsigned long)dest;
2642 DEBUGP("\t0x%lx %s\n", 2681 pr_debug("\t0x%lx %s\n",
2643 shdr->sh_addr, info->secstrings + shdr->sh_name); 2682 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
2644 } 2683 }
2645 2684
2646 return 0; 2685 return 0;
@@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
2742 this is done generically; there doesn't appear to be any 2781 this is done generically; there doesn't appear to be any
2743 special cases for the architectures. */ 2782 special cases for the architectures. */
2744 layout_sections(mod, info); 2783 layout_sections(mod, info);
2745
2746 info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
2747 * sizeof(long), GFP_KERNEL);
2748 if (!info->strmap) {
2749 err = -ENOMEM;
2750 goto free_percpu;
2751 }
2752 layout_symtab(mod, info); 2784 layout_symtab(mod, info);
2753 2785
2754 /* Allocate and move to the final place */ 2786 /* Allocate and move to the final place */
2755 err = move_module(mod, info); 2787 err = move_module(mod, info);
2756 if (err) 2788 if (err)
2757 goto free_strmap; 2789 goto free_percpu;
2758 2790
2759 /* Module has been copied to its final place now: return it. */ 2791 /* Module has been copied to its final place now: return it. */
2760 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2792 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2761 kmemleak_load_module(mod, info); 2793 kmemleak_load_module(mod, info);
2762 return mod; 2794 return mod;
2763 2795
2764free_strmap:
2765 kfree(info->strmap);
2766free_percpu: 2796free_percpu:
2767 percpu_modfree(mod); 2797 percpu_modfree(mod);
2768out: 2798out:
@@ -2772,7 +2802,6 @@ out:
2772/* mod is no longer valid after this! */ 2802/* mod is no longer valid after this! */
2773static void module_deallocate(struct module *mod, struct load_info *info) 2803static void module_deallocate(struct module *mod, struct load_info *info)
2774{ 2804{
2775 kfree(info->strmap);
2776 percpu_modfree(mod); 2805 percpu_modfree(mod);
2777 module_free(mod, mod->module_init); 2806 module_free(mod, mod->module_init);
2778 module_free(mod, mod->module_core); 2807 module_free(mod, mod->module_core);
@@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod,
2811 struct module *mod; 2840 struct module *mod;
2812 long err; 2841 long err;
2813 2842
2814 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2843 pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
2815 umod, len, uargs); 2844 umod, len, uargs);
2816 2845
2817 /* Copy in the blobs from userspace, check they are vaguely sane. */ 2846 /* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod,
2902 if (err < 0) 2931 if (err < 0)
2903 goto unlink; 2932 goto unlink;
2904 2933
2905 /* Get rid of temporary copy and strmap. */ 2934 /* Get rid of temporary copy. */
2906 kfree(info.strmap);
2907 free_copy(&info); 2935 free_copy(&info);
2908 2936
2909 /* Done! */ 2937 /* Done! */
@@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
3256 mod->state == MODULE_STATE_GOING || 3284 mod->state == MODULE_STATE_GOING ||
3257 mod->state == MODULE_STATE_COMING) { 3285 mod->state == MODULE_STATE_COMING) {
3258 buf[bx++] = '('; 3286 buf[bx++] = '(';
3259 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3287 bx += module_flags_taint(mod, buf + bx);
3260 buf[bx++] = 'P';
3261 else if (mod->taints & (1 << TAINT_OOT_MODULE))
3262 buf[bx++] = 'O';
3263 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3264 buf[bx++] = 'F';
3265 if (mod->taints & (1 << TAINT_CRAP))
3266 buf[bx++] = 'C';
3267 /*
3268 * TAINT_FORCED_RMMOD: could be added.
3269 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
3270 * apply to modules.
3271 */
3272
3273 /* Show a - for module-is-being-unloaded */ 3288 /* Show a - for module-is-being-unloaded */
3274 if (mod->state == MODULE_STATE_GOING) 3289 if (mod->state == MODULE_STATE_GOING)
3275 buf[bx++] = '-'; 3290 buf[bx++] = '-';
diff --git a/kernel/panic.c b/kernel/panic.c
index b26593604214..80aed44e345a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
49long (*panic_blink)(int state); 49long (*panic_blink)(int state);
50EXPORT_SYMBOL(panic_blink); 50EXPORT_SYMBOL(panic_blink);
51 51
52/*
53 * Stop ourself in panic -- architecture code may override this
54 */
55void __weak panic_smp_self_stop(void)
56{
57 while (1)
58 cpu_relax();
59}
60
52/** 61/**
53 * panic - halt the system 62 * panic - halt the system
54 * @fmt: The text string to print 63 * @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
57 * 66 *
58 * This function never returns. 67 * This function never returns.
59 */ 68 */
60NORET_TYPE void panic(const char * fmt, ...) 69void panic(const char *fmt, ...)
61{ 70{
71 static DEFINE_SPINLOCK(panic_lock);
62 static char buf[1024]; 72 static char buf[1024];
63 va_list args; 73 va_list args;
64 long i, i_next = 0; 74 long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
68 * It's possible to come here directly from a panic-assertion and 78 * It's possible to come here directly from a panic-assertion and
69 * not have preempt disabled. Some functions called from here want 79 * not have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though... 80 * preempt to be disabled. No point enabling it later though...
81 *
82 * Only one CPU is allowed to execute the panic code from here. For
83 * multiple parallel invocations of panic, all other CPUs either
84 * stop themself or will wait until they are stopped by the 1st CPU
85 * with smp_send_stop().
71 */ 86 */
72 preempt_disable(); 87 if (!spin_trylock(&panic_lock))
88 panic_smp_self_stop();
73 89
74 console_verbose(); 90 console_verbose();
75 bust_spinlocks(1); 91 bust_spinlocks(1);
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
78 va_end(args); 94 va_end(args);
79 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 95 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
80#ifdef CONFIG_DEBUG_BUGVERBOSE 96#ifdef CONFIG_DEBUG_BUGVERBOSE
81 dump_stack(); 97 /*
98 * Avoid nested stack-dumping if a panic occurs during oops processing
99 */
100 if (!oops_in_progress)
101 dump_stack();
82#endif 102#endif
83 103
84 /* 104 /*
@@ -237,11 +257,20 @@ void add_taint(unsigned flag)
237 * Can't trust the integrity of the kernel anymore. 257 * Can't trust the integrity of the kernel anymore.
238 * We don't call directly debug_locks_off() because the issue 258 * We don't call directly debug_locks_off() because the issue
239 * is not necessarily serious enough to set oops_in_progress to 1 259 * is not necessarily serious enough to set oops_in_progress to 1
240 * Also we want to keep up lockdep for staging development and 260 * Also we want to keep up lockdep for staging/out-of-tree
241 * post-warning case. 261 * development and post-warning case.
242 */ 262 */
243 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) 263 switch (flag) {
244 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); 264 case TAINT_CRAP:
265 case TAINT_OOT_MODULE:
266 case TAINT_WARN:
267 case TAINT_FIRMWARE_WORKAROUND:
268 break;
269
270 default:
271 if (__debug_locks_off())
272 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
273 }
245 274
246 set_bit(flag, &tainted_mask); 275 set_bit(flag, &tainted_mask);
247} 276}
diff --git a/kernel/params.c b/kernel/params.c
index 65aae11eb93f..4bc965d8a1fe 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,6 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27 27
28#if 0
29#define DEBUGP printk
30#else
31#define DEBUGP(fmt, a...)
32#endif
33
34/* Protects all parameters, and incidentally kmalloced_param list. */ 28/* Protects all parameters, and incidentally kmalloced_param list. */
35static DEFINE_MUTEX(param_lock); 29static DEFINE_MUTEX(param_lock);
36 30
@@ -103,9 +97,10 @@ static int parse_one(char *param,
103 for (i = 0; i < num_params; i++) { 97 for (i = 0; i < num_params; i++) {
104 if (parameq(param, params[i].name)) { 98 if (parameq(param, params[i].name)) {
105 /* No one handled NULL, so do it here. */ 99 /* No one handled NULL, so do it here. */
106 if (!val && params[i].ops->set != param_set_bool) 100 if (!val && params[i].ops->set != param_set_bool
101 && params[i].ops->set != param_set_bint)
107 return -EINVAL; 102 return -EINVAL;
108 DEBUGP("They are equal! Calling %p\n", 103 pr_debug("They are equal! Calling %p\n",
109 params[i].ops->set); 104 params[i].ops->set);
110 mutex_lock(&param_lock); 105 mutex_lock(&param_lock);
111 err = params[i].ops->set(val, &params[i]); 106 err = params[i].ops->set(val, &params[i]);
@@ -115,11 +110,11 @@ static int parse_one(char *param,
115 } 110 }
116 111
117 if (handle_unknown) { 112 if (handle_unknown) {
118 DEBUGP("Unknown argument: calling %p\n", handle_unknown); 113 pr_debug("Unknown argument: calling %p\n", handle_unknown);
119 return handle_unknown(param, val); 114 return handle_unknown(param, val);
120 } 115 }
121 116
122 DEBUGP("Unknown argument `%s'\n", param); 117 pr_debug("Unknown argument `%s'\n", param);
123 return -ENOENT; 118 return -ENOENT;
124} 119}
125 120
@@ -184,7 +179,7 @@ int parse_args(const char *name,
184{ 179{
185 char *param, *val; 180 char *param, *val;
186 181
187 DEBUGP("Parsing ARGS: %s\n", args); 182 pr_debug("Parsing ARGS: %s\n", args);
188 183
189 /* Chew leading spaces */ 184 /* Chew leading spaces */
190 args = skip_spaces(args); 185 args = skip_spaces(args);
@@ -369,6 +364,30 @@ struct kernel_param_ops param_ops_invbool = {
369}; 364};
370EXPORT_SYMBOL(param_ops_invbool); 365EXPORT_SYMBOL(param_ops_invbool);
371 366
367int param_set_bint(const char *val, const struct kernel_param *kp)
368{
369 struct kernel_param boolkp;
370 bool v;
371 int ret;
372
373 /* Match bool exactly, by re-using it. */
374 boolkp = *kp;
375 boolkp.arg = &v;
376 boolkp.flags |= KPARAM_ISBOOL;
377
378 ret = param_set_bool(val, &boolkp);
379 if (ret == 0)
380 *(int *)kp->arg = v;
381 return ret;
382}
383EXPORT_SYMBOL(param_set_bint);
384
385struct kernel_param_ops param_ops_bint = {
386 .set = param_set_bint,
387 .get = param_get_int,
388};
389EXPORT_SYMBOL(param_ops_bint);
390
372/* We break the rule and mangle the string. */ 391/* We break the rule and mangle the string. */
373static int param_array(const char *name, 392static int param_array(const char *name,
374 const char *val, 393 const char *val,
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5f..9f08dfabaf13 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
137} 137}
138 138
139/* 139/*
140 * We might be racing with someone else trying to set pid_ns->last_pid. 140 * We might be racing with someone else trying to set pid_ns->last_pid
141 * at the pid allocation time (there's also a sysctl for this, but racing
142 * with this one is OK, see comment in kernel/pid_namespace.c about it).
141 * We want the winner to have the "later" value, because if the 143 * We want the winner to have the "later" value, because if the
142 * "earlier" value prevails, then a pid may get reused immediately. 144 * "earlier" value prevails, then a pid may get reused immediately.
143 * 145 *
@@ -541,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
541 */ 543 */
542void __init pidhash_init(void) 544void __init pidhash_init(void)
543{ 545{
544 int i, pidhash_size; 546 unsigned int i, pidhash_size;
545 547
546 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
547 HASH_EARLY | HASH_SMALL, 549 HASH_EARLY | HASH_SMALL,
548 &pidhash_shift, NULL, 4096); 550 &pidhash_shift, NULL, 4096);
549 pidhash_size = 1 << pidhash_shift; 551 pidhash_size = 1U << pidhash_shift;
550 552
551 for (i = 0; i < pidhash_size; i++) 553 for (i = 0; i < pidhash_size; i++)
552 INIT_HLIST_HEAD(&pid_hash[i]); 554 INIT_HLIST_HEAD(&pid_hash[i]);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca6..a8968396046d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
191 return; 191 return;
192} 192}
193 193
194static int pid_ns_ctl_handler(struct ctl_table *table, int write,
195 void __user *buffer, size_t *lenp, loff_t *ppos)
196{
197 struct ctl_table tmp = *table;
198
199 if (write && !capable(CAP_SYS_ADMIN))
200 return -EPERM;
201
202 /*
203 * Writing directly to ns' last_pid field is OK, since this field
204 * is volatile in a living namespace anyway and a code writing to
205 * it should synchronize its usage with external means.
206 */
207
208 tmp.data = &current->nsproxy->pid_ns->last_pid;
209 return proc_dointvec(&tmp, write, buffer, lenp, ppos);
210}
211
212static struct ctl_table pid_ns_ctl_table[] = {
213 {
214 .procname = "ns_last_pid",
215 .maxlen = sizeof(int),
216 .mode = 0666, /* permissions are checked in the handler */
217 .proc_handler = pid_ns_ctl_handler,
218 },
219 { }
220};
221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223
194static __init int pid_namespaces_init(void) 224static __init int pid_namespaces_init(void)
195{ 225{
196 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
227 register_sysctl_paths(kern_path, pid_ns_ctl_table);
197 return 0; 228 return 0;
198} 229}
199 230
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e7cb76dc18f5..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79 return now.sched < then.sched; 79 return now.sched < then.sched;
80 } else { 80 } else {
81 return cputime_lt(now.cpu, then.cpu); 81 return now.cpu < then.cpu;
82 } 82 }
83} 83}
84static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89 acc->sched += val.sched; 89 acc->sched += val.sched;
90 } else { 90 } else {
91 acc->cpu = cputime_add(acc->cpu, val.cpu); 91 acc->cpu += val.cpu;
92 } 92 }
93} 93}
94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99 a.sched -= b.sched; 99 a.sched -= b.sched;
100 } else { 100 } else {
101 a.cpu = cputime_sub(a.cpu, b.cpu); 101 a.cpu -= b.cpu;
102 } 102 }
103 return a; 103 return a;
104} 104}
105 105
106/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
120 * Update expiry time from increment, and increase overrun count, 107 * Update expiry time from increment, and increase overrun count,
121 * given the current clock sample. 108 * given the current clock sample.
122 */ 109 */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
148 } else { 135 } else {
149 cputime_t delta, incr; 136 cputime_t delta, incr;
150 137
151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) 138 if (now.cpu < timer->it.cpu.expires.cpu)
152 return; 139 return;
153 incr = timer->it.cpu.incr.cpu; 140 incr = timer->it.cpu.incr.cpu;
154 delta = cputime_sub(cputime_add(now.cpu, incr), 141 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
155 timer->it.cpu.expires.cpu);
156 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 142 /* Don't use (incr*2 < delta), incr*2 might overflow. */
157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 143 for (i = 0; incr < delta - incr; i++)
158 incr = cputime_add(incr, incr); 144 incr += incr;
159 for (; i >= 0; incr = cputime_halve(incr), i--) { 145 for (; i >= 0; incr = incr >> 1, i--) {
160 if (cputime_lt(delta, incr)) 146 if (delta < incr)
161 continue; 147 continue;
162 timer->it.cpu.expires.cpu = 148 timer->it.cpu.expires.cpu += incr;
163 cputime_add(timer->it.cpu.expires.cpu, incr);
164 timer->it_overrun += 1 << i; 149 timer->it_overrun += 1 << i;
165 delta = cputime_sub(delta, incr); 150 delta -= incr;
166 } 151 }
167 } 152 }
168} 153}
169 154
170static inline cputime_t prof_ticks(struct task_struct *p) 155static inline cputime_t prof_ticks(struct task_struct *p)
171{ 156{
172 return cputime_add(p->utime, p->stime); 157 return p->utime + p->stime;
173} 158}
174static inline cputime_t virt_ticks(struct task_struct *p) 159static inline cputime_t virt_ticks(struct task_struct *p)
175{ 160{
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
248 233
249 t = tsk; 234 t = tsk;
250 do { 235 do {
251 times->utime = cputime_add(times->utime, t->utime); 236 times->utime += t->utime;
252 times->stime = cputime_add(times->stime, t->stime); 237 times->stime += t->stime;
253 times->sum_exec_runtime += task_sched_runtime(t); 238 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 239 } while_each_thread(tsk, t);
255out: 240out:
@@ -258,10 +243,10 @@ out:
258 243
259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
260{ 245{
261 if (cputime_gt(b->utime, a->utime)) 246 if (b->utime > a->utime)
262 a->utime = b->utime; 247 a->utime = b->utime;
263 248
264 if (cputime_gt(b->stime, a->stime)) 249 if (b->stime > a->stime)
265 a->stime = b->stime; 250 a->stime = b->stime;
266 251
267 if (b->sum_exec_runtime > a->sum_exec_runtime) 252 if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
306 return -EINVAL; 291 return -EINVAL;
307 case CPUCLOCK_PROF: 292 case CPUCLOCK_PROF:
308 thread_group_cputime(p, &cputime); 293 thread_group_cputime(p, &cputime);
309 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 294 cpu->cpu = cputime.utime + cputime.stime;
310 break; 295 break;
311 case CPUCLOCK_VIRT: 296 case CPUCLOCK_VIRT:
312 thread_group_cputime(p, &cputime); 297 thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
470 unsigned long long sum_exec_runtime) 455 unsigned long long sum_exec_runtime)
471{ 456{
472 struct cpu_timer_list *timer, *next; 457 struct cpu_timer_list *timer, *next;
473 cputime_t ptime = cputime_add(utime, stime); 458 cputime_t ptime = utime + stime;
474 459
475 list_for_each_entry_safe(timer, next, head, entry) { 460 list_for_each_entry_safe(timer, next, head, entry) {
476 list_del_init(&timer->entry); 461 list_del_init(&timer->entry);
477 if (cputime_lt(timer->expires.cpu, ptime)) { 462 if (timer->expires.cpu < ptime) {
478 timer->expires.cpu = cputime_zero; 463 timer->expires.cpu = 0;
479 } else { 464 } else {
480 timer->expires.cpu = cputime_sub(timer->expires.cpu, 465 timer->expires.cpu -= ptime;
481 ptime);
482 } 466 }
483 } 467 }
484 468
485 ++head; 469 ++head;
486 list_for_each_entry_safe(timer, next, head, entry) { 470 list_for_each_entry_safe(timer, next, head, entry) {
487 list_del_init(&timer->entry); 471 list_del_init(&timer->entry);
488 if (cputime_lt(timer->expires.cpu, utime)) { 472 if (timer->expires.cpu < utime) {
489 timer->expires.cpu = cputime_zero; 473 timer->expires.cpu = 0;
490 } else { 474 } else {
491 timer->expires.cpu = cputime_sub(timer->expires.cpu, 475 timer->expires.cpu -= utime;
492 utime);
493 } 476 }
494 } 477 }
495 478
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
520 struct signal_struct *const sig = tsk->signal; 503 struct signal_struct *const sig = tsk->signal;
521 504
522 cleanup_timers(tsk->signal->cpu_timers, 505 cleanup_timers(tsk->signal->cpu_timers,
523 cputime_add(tsk->utime, sig->utime), 506 tsk->utime + sig->utime, tsk->stime + sig->stime,
524 cputime_add(tsk->stime, sig->stime),
525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
526} 508}
527 509
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
540 522
541static inline int expires_gt(cputime_t expires, cputime_t new_exp) 523static inline int expires_gt(cputime_t expires, cputime_t new_exp)
542{ 524{
543 return cputime_eq(expires, cputime_zero) || 525 return expires == 0 || expires > new_exp;
544 cputime_gt(expires, new_exp);
545} 526}
546 527
547/* 528/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
651 default: 632 default:
652 return -EINVAL; 633 return -EINVAL;
653 case CPUCLOCK_PROF: 634 case CPUCLOCK_PROF:
654 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 635 cpu->cpu = cputime.utime + cputime.stime;
655 break; 636 break;
656 case CPUCLOCK_VIRT: 637 case CPUCLOCK_VIRT:
657 cpu->cpu = cputime.utime; 638 cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
918 unsigned long soft; 899 unsigned long soft;
919 900
920 maxfire = 20; 901 maxfire = 20;
921 tsk->cputime_expires.prof_exp = cputime_zero; 902 tsk->cputime_expires.prof_exp = 0;
922 while (!list_empty(timers)) { 903 while (!list_empty(timers)) {
923 struct cpu_timer_list *t = list_first_entry(timers, 904 struct cpu_timer_list *t = list_first_entry(timers,
924 struct cpu_timer_list, 905 struct cpu_timer_list,
925 entry); 906 entry);
926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 907 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
927 tsk->cputime_expires.prof_exp = t->expires.cpu; 908 tsk->cputime_expires.prof_exp = t->expires.cpu;
928 break; 909 break;
929 } 910 }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
933 914
934 ++timers; 915 ++timers;
935 maxfire = 20; 916 maxfire = 20;
936 tsk->cputime_expires.virt_exp = cputime_zero; 917 tsk->cputime_expires.virt_exp = 0;
937 while (!list_empty(timers)) { 918 while (!list_empty(timers)) {
938 struct cpu_timer_list *t = list_first_entry(timers, 919 struct cpu_timer_list *t = list_first_entry(timers,
939 struct cpu_timer_list, 920 struct cpu_timer_list,
940 entry); 921 entry);
941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 922 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
942 tsk->cputime_expires.virt_exp = t->expires.cpu; 923 tsk->cputime_expires.virt_exp = t->expires.cpu;
943 break; 924 break;
944 } 925 }
@@ -1009,20 +990,19 @@ static u32 onecputick;
1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 990static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1010 cputime_t *expires, cputime_t cur_time, int signo) 991 cputime_t *expires, cputime_t cur_time, int signo)
1011{ 992{
1012 if (cputime_eq(it->expires, cputime_zero)) 993 if (!it->expires)
1013 return; 994 return;
1014 995
1015 if (cputime_ge(cur_time, it->expires)) { 996 if (cur_time >= it->expires) {
1016 if (!cputime_eq(it->incr, cputime_zero)) { 997 if (it->incr) {
1017 it->expires = cputime_add(it->expires, it->incr); 998 it->expires += it->incr;
1018 it->error += it->incr_error; 999 it->error += it->incr_error;
1019 if (it->error >= onecputick) { 1000 if (it->error >= onecputick) {
1020 it->expires = cputime_sub(it->expires, 1001 it->expires -= cputime_one_jiffy;
1021 cputime_one_jiffy);
1022 it->error -= onecputick; 1002 it->error -= onecputick;
1023 } 1003 }
1024 } else { 1004 } else {
1025 it->expires = cputime_zero; 1005 it->expires = 0;
1026 } 1006 }
1027 1007
1028 trace_itimer_expire(signo == SIGPROF ? 1008 trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1011 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1032 } 1012 }
1033 1013
1034 if (!cputime_eq(it->expires, cputime_zero) && 1014 if (it->expires && (!*expires || it->expires < *expires)) {
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
1037 *expires = it->expires; 1015 *expires = it->expires;
1038 } 1016 }
1039} 1017}
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1048 */ 1026 */
1049static inline int task_cputime_zero(const struct task_cputime *cputime) 1027static inline int task_cputime_zero(const struct task_cputime *cputime)
1050{ 1028{
1051 if (cputime_eq(cputime->utime, cputime_zero) && 1029 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1054 return 1; 1030 return 1;
1055 return 0; 1031 return 0;
1056} 1032}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
1076 */ 1052 */
1077 thread_group_cputimer(tsk, &cputime); 1053 thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 196c01268ebd..6d6d28870335 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -43,8 +43,6 @@ int in_suspend __nosavedata;
43enum { 43enum {
44 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
45 HIBERNATION_PLATFORM, 45 HIBERNATION_PLATFORM,
46 HIBERNATION_TEST,
47 HIBERNATION_TESTPROC,
48 HIBERNATION_SHUTDOWN, 46 HIBERNATION_SHUTDOWN,
49 HIBERNATION_REBOOT, 47 HIBERNATION_REBOOT,
50 /* keep last */ 48 /* keep last */
@@ -55,7 +53,7 @@ enum {
55 53
56static int hibernation_mode = HIBERNATION_SHUTDOWN; 54static int hibernation_mode = HIBERNATION_SHUTDOWN;
57 55
58static bool freezer_test_done; 56bool freezer_test_done;
59 57
60static const struct platform_hibernation_ops *hibernation_ops; 58static const struct platform_hibernation_ops *hibernation_ops;
61 59
@@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
71 WARN_ON(1); 69 WARN_ON(1);
72 return; 70 return;
73 } 71 }
74 mutex_lock(&pm_mutex); 72 lock_system_sleep();
75 hibernation_ops = ops; 73 hibernation_ops = ops;
76 if (ops) 74 if (ops)
77 hibernation_mode = HIBERNATION_PLATFORM; 75 hibernation_mode = HIBERNATION_PLATFORM;
78 else if (hibernation_mode == HIBERNATION_PLATFORM) 76 else if (hibernation_mode == HIBERNATION_PLATFORM)
79 hibernation_mode = HIBERNATION_SHUTDOWN; 77 hibernation_mode = HIBERNATION_SHUTDOWN;
80 78
81 mutex_unlock(&pm_mutex); 79 unlock_system_sleep();
82} 80}
83 81
84static bool entering_platform_hibernation; 82static bool entering_platform_hibernation;
@@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void)
96 mdelay(5000); 94 mdelay(5000);
97} 95}
98 96
99static int hibernation_testmode(int mode)
100{
101 if (hibernation_mode == mode) {
102 hibernation_debug_sleep();
103 return 1;
104 }
105 return 0;
106}
107
108static int hibernation_test(int level) 97static int hibernation_test(int level)
109{ 98{
110 if (pm_test_level == level) { 99 if (pm_test_level == level) {
@@ -114,7 +103,6 @@ static int hibernation_test(int level)
114 return 0; 103 return 0;
115} 104}
116#else /* !CONFIG_PM_DEBUG */ 105#else /* !CONFIG_PM_DEBUG */
117static int hibernation_testmode(int mode) { return 0; }
118static int hibernation_test(int level) { return 0; } 106static int hibernation_test(int level) { return 0; }
119#endif /* !CONFIG_PM_DEBUG */ 107#endif /* !CONFIG_PM_DEBUG */
120 108
@@ -278,8 +266,7 @@ static int create_image(int platform_mode)
278 goto Platform_finish; 266 goto Platform_finish;
279 267
280 error = disable_nonboot_cpus(); 268 error = disable_nonboot_cpus();
281 if (error || hibernation_test(TEST_CPUS) 269 if (error || hibernation_test(TEST_CPUS))
282 || hibernation_testmode(HIBERNATION_TEST))
283 goto Enable_cpus; 270 goto Enable_cpus;
284 271
285 local_irq_disable(); 272 local_irq_disable();
@@ -333,7 +320,7 @@ static int create_image(int platform_mode)
333 */ 320 */
334int hibernation_snapshot(int platform_mode) 321int hibernation_snapshot(int platform_mode)
335{ 322{
336 pm_message_t msg = PMSG_RECOVER; 323 pm_message_t msg;
337 int error; 324 int error;
338 325
339 error = platform_begin(platform_mode); 326 error = platform_begin(platform_mode);
@@ -347,39 +334,40 @@ int hibernation_snapshot(int platform_mode)
347 334
348 error = freeze_kernel_threads(); 335 error = freeze_kernel_threads();
349 if (error) 336 if (error)
350 goto Close; 337 goto Cleanup;
351 338
352 if (hibernation_test(TEST_FREEZER) || 339 if (hibernation_test(TEST_FREEZER)) {
353 hibernation_testmode(HIBERNATION_TESTPROC)) {
354 340
355 /* 341 /*
356 * Indicate to the caller that we are returning due to a 342 * Indicate to the caller that we are returning due to a
357 * successful freezer test. 343 * successful freezer test.
358 */ 344 */
359 freezer_test_done = true; 345 freezer_test_done = true;
360 goto Close; 346 goto Cleanup;
361 } 347 }
362 348
363 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
364 if (error) 350 if (error) {
365 goto Complete_devices; 351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup;
353 }
366 354
367 suspend_console(); 355 suspend_console();
368 pm_restrict_gfp_mask(); 356 pm_restrict_gfp_mask();
357
369 error = dpm_suspend(PMSG_FREEZE); 358 error = dpm_suspend(PMSG_FREEZE);
370 if (error)
371 goto Recover_platform;
372 359
373 if (hibernation_test(TEST_DEVICES)) 360 if (error || hibernation_test(TEST_DEVICES))
374 goto Recover_platform; 361 platform_recover(platform_mode);
362 else
363 error = create_image(platform_mode);
375 364
376 error = create_image(platform_mode);
377 /* 365 /*
378 * Control returns here (1) after the image has been created or the 366 * In the case that we call create_image() above, the control
367 * returns here (1) after the image has been created or the
379 * image creation has failed and (2) after a successful restore. 368 * image creation has failed and (2) after a successful restore.
380 */ 369 */
381 370
382 Resume_devices:
383 /* We may need to release the preallocated image pages here. */ 371 /* We may need to release the preallocated image pages here. */
384 if (error || !in_suspend) 372 if (error || !in_suspend)
385 swsusp_free(); 373 swsusp_free();
@@ -391,17 +379,15 @@ int hibernation_snapshot(int platform_mode)
391 pm_restore_gfp_mask(); 379 pm_restore_gfp_mask();
392 380
393 resume_console(); 381 resume_console();
394
395 Complete_devices:
396 dpm_complete(msg); 382 dpm_complete(msg);
397 383
398 Close: 384 Close:
399 platform_end(platform_mode); 385 platform_end(platform_mode);
400 return error; 386 return error;
401 387
402 Recover_platform: 388 Cleanup:
403 platform_recover(platform_mode); 389 swsusp_free();
404 goto Resume_devices; 390 goto Close;
405} 391}
406 392
407/** 393/**
@@ -586,9 +572,6 @@ int hibernation_platform_enter(void)
586static void power_down(void) 572static void power_down(void)
587{ 573{
588 switch (hibernation_mode) { 574 switch (hibernation_mode) {
589 case HIBERNATION_TEST:
590 case HIBERNATION_TESTPROC:
591 break;
592 case HIBERNATION_REBOOT: 575 case HIBERNATION_REBOOT:
593 kernel_restart(NULL); 576 kernel_restart(NULL);
594 break; 577 break;
@@ -607,17 +590,6 @@ static void power_down(void)
607 while(1); 590 while(1);
608} 591}
609 592
610static int prepare_processes(void)
611{
612 int error = 0;
613
614 if (freeze_processes()) {
615 error = -EBUSY;
616 thaw_processes();
617 }
618 return error;
619}
620
621/** 593/**
622 * hibernate - Carry out system hibernation, including saving the image. 594 * hibernate - Carry out system hibernation, including saving the image.
623 */ 595 */
@@ -625,7 +597,7 @@ int hibernate(void)
625{ 597{
626 int error; 598 int error;
627 599
628 mutex_lock(&pm_mutex); 600 lock_system_sleep();
629 /* The snapshot device should not be opened while we're running */ 601 /* The snapshot device should not be opened while we're running */
630 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 602 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
631 error = -EBUSY; 603 error = -EBUSY;
@@ -650,7 +622,7 @@ int hibernate(void)
650 sys_sync(); 622 sys_sync();
651 printk("done.\n"); 623 printk("done.\n");
652 624
653 error = prepare_processes(); 625 error = freeze_processes();
654 if (error) 626 if (error)
655 goto Finish; 627 goto Finish;
656 628
@@ -693,7 +665,7 @@ int hibernate(void)
693 pm_restore_console(); 665 pm_restore_console();
694 atomic_inc(&snapshot_device_available); 666 atomic_inc(&snapshot_device_available);
695 Unlock: 667 Unlock:
696 mutex_unlock(&pm_mutex); 668 unlock_system_sleep();
697 return error; 669 return error;
698} 670}
699 671
@@ -807,11 +779,13 @@ static int software_resume(void)
807 goto close_finish; 779 goto close_finish;
808 780
809 error = create_basic_memory_bitmaps(); 781 error = create_basic_memory_bitmaps();
810 if (error) 782 if (error) {
783 usermodehelper_enable();
811 goto close_finish; 784 goto close_finish;
785 }
812 786
813 pr_debug("PM: Preparing processes for restore.\n"); 787 pr_debug("PM: Preparing processes for restore.\n");
814 error = prepare_processes(); 788 error = freeze_processes();
815 if (error) { 789 if (error) {
816 swsusp_close(FMODE_READ); 790 swsusp_close(FMODE_READ);
817 goto Done; 791 goto Done;
@@ -851,8 +825,6 @@ static const char * const hibernation_modes[] = {
851 [HIBERNATION_PLATFORM] = "platform", 825 [HIBERNATION_PLATFORM] = "platform",
852 [HIBERNATION_SHUTDOWN] = "shutdown", 826 [HIBERNATION_SHUTDOWN] = "shutdown",
853 [HIBERNATION_REBOOT] = "reboot", 827 [HIBERNATION_REBOOT] = "reboot",
854 [HIBERNATION_TEST] = "test",
855 [HIBERNATION_TESTPROC] = "testproc",
856}; 828};
857 829
858/* 830/*
@@ -861,17 +833,15 @@ static const char * const hibernation_modes[] = {
861 * Hibernation can be handled in several ways. There are a few different ways 833 * Hibernation can be handled in several ways. There are a few different ways
862 * to put the system into the sleep state: using the platform driver (e.g. ACPI 834 * to put the system into the sleep state: using the platform driver (e.g. ACPI
863 * or other hibernation_ops), powering it off or rebooting it (for testing 835 * or other hibernation_ops), powering it off or rebooting it (for testing
864 * mostly), or using one of the two available test modes. 836 * mostly).
865 * 837 *
866 * The sysfs file /sys/power/disk provides an interface for selecting the 838 * The sysfs file /sys/power/disk provides an interface for selecting the
867 * hibernation mode to use. Reading from this file causes the available modes 839 * hibernation mode to use. Reading from this file causes the available modes
868 * to be printed. There are 5 modes that can be supported: 840 * to be printed. There are 3 modes that can be supported:
869 * 841 *
870 * 'platform' 842 * 'platform'
871 * 'shutdown' 843 * 'shutdown'
872 * 'reboot' 844 * 'reboot'
873 * 'test'
874 * 'testproc'
875 * 845 *
876 * If a platform hibernation driver is in use, 'platform' will be supported 846 * If a platform hibernation driver is in use, 'platform' will be supported
877 * and will be used by default. Otherwise, 'shutdown' will be used by default. 847 * and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -895,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
895 switch (i) { 865 switch (i) {
896 case HIBERNATION_SHUTDOWN: 866 case HIBERNATION_SHUTDOWN:
897 case HIBERNATION_REBOOT: 867 case HIBERNATION_REBOOT:
898 case HIBERNATION_TEST:
899 case HIBERNATION_TESTPROC:
900 break; 868 break;
901 case HIBERNATION_PLATFORM: 869 case HIBERNATION_PLATFORM:
902 if (hibernation_ops) 870 if (hibernation_ops)
@@ -925,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
925 p = memchr(buf, '\n', n); 893 p = memchr(buf, '\n', n);
926 len = p ? p - buf : n; 894 len = p ? p - buf : n;
927 895
928 mutex_lock(&pm_mutex); 896 lock_system_sleep();
929 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 897 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
930 if (len == strlen(hibernation_modes[i]) 898 if (len == strlen(hibernation_modes[i])
931 && !strncmp(buf, hibernation_modes[i], len)) { 899 && !strncmp(buf, hibernation_modes[i], len)) {
@@ -937,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
937 switch (mode) { 905 switch (mode) {
938 case HIBERNATION_SHUTDOWN: 906 case HIBERNATION_SHUTDOWN:
939 case HIBERNATION_REBOOT: 907 case HIBERNATION_REBOOT:
940 case HIBERNATION_TEST:
941 case HIBERNATION_TESTPROC:
942 hibernation_mode = mode; 908 hibernation_mode = mode;
943 break; 909 break;
944 case HIBERNATION_PLATFORM: 910 case HIBERNATION_PLATFORM:
@@ -953,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
953 if (!error) 919 if (!error)
954 pr_debug("PM: Hibernation mode set to '%s'\n", 920 pr_debug("PM: Hibernation mode set to '%s'\n",
955 hibernation_modes[mode]); 921 hibernation_modes[mode]);
956 mutex_unlock(&pm_mutex); 922 unlock_system_sleep();
957 return error ? error : n; 923 return error ? error : n;
958} 924}
959 925
@@ -980,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
980 if (maj != MAJOR(res) || min != MINOR(res)) 946 if (maj != MAJOR(res) || min != MINOR(res))
981 goto out; 947 goto out;
982 948
983 mutex_lock(&pm_mutex); 949 lock_system_sleep();
984 swsusp_resume_device = res; 950 swsusp_resume_device = res;
985 mutex_unlock(&pm_mutex); 951 unlock_system_sleep();
986 printk(KERN_INFO "PM: Starting manual resume from disk\n"); 952 printk(KERN_INFO "PM: Starting manual resume from disk\n");
987 noresume = 0; 953 noresume = 0;
988 software_resume(); 954 software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 36e0f0903c32..9824b41e5a18 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * 6 *
7 * This file is released under the GPLv2 7 * This file is released under the GPLv2
8 * 8 *
9 */ 9 */
@@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
116 p = memchr(buf, '\n', n); 116 p = memchr(buf, '\n', n);
117 len = p ? p - buf : n; 117 len = p ? p - buf : n;
118 118
119 mutex_lock(&pm_mutex); 119 lock_system_sleep();
120 120
121 level = TEST_FIRST; 121 level = TEST_FIRST;
122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) 122 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
126 break; 126 break;
127 } 127 }
128 128
129 mutex_unlock(&pm_mutex); 129 unlock_system_sleep();
130 130
131 return error ? error : n; 131 return error ? error : n;
132} 132}
@@ -240,7 +240,7 @@ struct kobject *power_kobj;
240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and 240 * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
241 * 'disk' (Suspend-to-Disk). 241 * 'disk' (Suspend-to-Disk).
242 * 242 *
243 * store() accepts one of those strings, translates it into the 243 * store() accepts one of those strings, translates it into the
244 * proper enumerated value, and initiates a suspend transition. 244 * proper enumerated value, and initiates a suspend transition.
245 */ 245 */
246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 246static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
282 /* First, check if we are requested to hibernate */ 282 /* First, check if we are requested to hibernate */
283 if (len == 4 && !strncmp(buf, "disk", len)) { 283 if (len == 4 && !strncmp(buf, "disk", len)) {
284 error = hibernate(); 284 error = hibernate();
285 goto Exit; 285 goto Exit;
286 } 286 }
287 287
288#ifdef CONFIG_SUSPEND 288#ifdef CONFIG_SUSPEND
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 23a2db1ec442..21724eee5206 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 50#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
51 51
52/* kernel/power/hibernate.c */ 52/* kernel/power/hibernate.c */
53extern bool freezer_test_done;
54
53extern int hibernation_snapshot(int platform_mode); 55extern int hibernation_snapshot(int platform_mode);
54extern int hibernation_restore(int platform_mode); 56extern int hibernation_restore(int platform_mode);
55extern int hibernation_platform_enter(void); 57extern int hibernation_platform_enter(void);
@@ -229,8 +231,28 @@ extern int pm_test_level;
229#ifdef CONFIG_SUSPEND_FREEZER 231#ifdef CONFIG_SUSPEND_FREEZER
230static inline int suspend_freeze_processes(void) 232static inline int suspend_freeze_processes(void)
231{ 233{
232 int error = freeze_processes(); 234 int error;
233 return error ? : freeze_kernel_threads(); 235
236 error = freeze_processes();
237
238 /*
239 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error.
241 */
242 if (error)
243 goto Finish;
244
245 error = freeze_kernel_threads();
246
247 /*
248 * freeze_kernel_threads() thaws only kernel threads upon freezing
249 * failure. So we have to thaw the userspace tasks ourselves.
250 */
251 if (error)
252 thaw_processes();
253
254 Finish:
255 return error;
234} 256}
235 257
236static inline void suspend_thaw_processes(void) 258static inline void suspend_thaw_processes(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index addbbe5531bc..7e426459e60a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,16 +22,7 @@
22 */ 22 */
23#define TIMEOUT (20 * HZ) 23#define TIMEOUT (20 * HZ)
24 24
25static inline int freezable(struct task_struct * p) 25static int try_to_freeze_tasks(bool user_only)
26{
27 if ((p == current) ||
28 (p->flags & PF_NOFREEZE) ||
29 (p->exit_state != 0))
30 return 0;
31 return 1;
32}
33
34static int try_to_freeze_tasks(bool sig_only)
35{ 26{
36 struct task_struct *g, *p; 27 struct task_struct *g, *p;
37 unsigned long end_time; 28 unsigned long end_time;
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
46 37
47 end_time = jiffies + TIMEOUT; 38 end_time = jiffies + TIMEOUT;
48 39
49 if (!sig_only) 40 if (!user_only)
50 freeze_workqueues_begin(); 41 freeze_workqueues_begin();
51 42
52 while (true) { 43 while (true) {
53 todo = 0; 44 todo = 0;
54 read_lock(&tasklist_lock); 45 read_lock(&tasklist_lock);
55 do_each_thread(g, p) { 46 do_each_thread(g, p) {
56 if (frozen(p) || !freezable(p)) 47 if (p == current || !freeze_task(p))
57 continue;
58
59 if (!freeze_task(p, sig_only))
60 continue; 48 continue;
61 49
62 /* 50 /*
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
77 } while_each_thread(g, p); 65 } while_each_thread(g, p);
78 read_unlock(&tasklist_lock); 66 read_unlock(&tasklist_lock);
79 67
80 if (!sig_only) { 68 if (!user_only) {
81 wq_busy = freeze_workqueues_busy(); 69 wq_busy = freeze_workqueues_busy();
82 todo += wq_busy; 70 todo += wq_busy;
83 } 71 }
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
103 elapsed_csecs = elapsed_csecs64; 91 elapsed_csecs = elapsed_csecs64;
104 92
105 if (todo) { 93 if (todo) {
106 /* This does not unfreeze processes that are already frozen
107 * (we have slightly ugly calling convention in that respect,
108 * and caller must call thaw_processes() if something fails),
109 * but it cleans up leftover PF_FREEZE requests.
110 */
111 printk("\n"); 94 printk("\n");
112 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " 95 printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
113 "(%d tasks refusing to freeze, wq_busy=%d):\n", 96 "(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
115 elapsed_csecs / 100, elapsed_csecs % 100, 98 elapsed_csecs / 100, elapsed_csecs % 100,
116 todo - wq_busy, wq_busy); 99 todo - wq_busy, wq_busy);
117 100
118 thaw_workqueues();
119
120 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
121 do_each_thread(g, p) { 102 do_each_thread(g, p) {
122 task_lock(p); 103 if (!wakeup && !freezer_should_skip(p) &&
123 if (!wakeup && freezing(p) && !freezer_should_skip(p)) 104 p != current && freezing(p) && !frozen(p))
124 sched_show_task(p); 105 sched_show_task(p);
125 cancel_freezing(p);
126 task_unlock(p);
127 } while_each_thread(g, p); 106 } while_each_thread(g, p);
128 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
129 } else { 108 } else {
@@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only)
136 115
137/** 116/**
138 * freeze_processes - Signal user space processes to enter the refrigerator. 117 * freeze_processes - Signal user space processes to enter the refrigerator.
118 *
119 * On success, returns 0. On failure, -errno and system is fully thawed.
139 */ 120 */
140int freeze_processes(void) 121int freeze_processes(void)
141{ 122{
142 int error; 123 int error;
143 124
125 if (!pm_freezing)
126 atomic_inc(&system_freezing_cnt);
127
144 printk("Freezing user space processes ... "); 128 printk("Freezing user space processes ... ");
129 pm_freezing = true;
145 error = try_to_freeze_tasks(true); 130 error = try_to_freeze_tasks(true);
146 if (!error) { 131 if (!error) {
147 printk("done."); 132 printk("done.");
@@ -150,17 +135,25 @@ int freeze_processes(void)
150 printk("\n"); 135 printk("\n");
151 BUG_ON(in_atomic()); 136 BUG_ON(in_atomic());
152 137
138 if (error)
139 thaw_processes();
153 return error; 140 return error;
154} 141}
155 142
156/** 143/**
157 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. 144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
145 *
146 * On success, returns 0. On failure, -errno and only the kernel threads are
147 * thawed, so as to give a chance to the caller to do additional cleanups
148 * (if any) before thawing the userspace tasks. So, it is the responsibility
149 * of the caller to thaw the userspace tasks, when the time is right.
158 */ 150 */
159int freeze_kernel_threads(void) 151int freeze_kernel_threads(void)
160{ 152{
161 int error; 153 int error;
162 154
163 printk("Freezing remaining freezable tasks ... "); 155 printk("Freezing remaining freezable tasks ... ");
156 pm_nosig_freezing = true;
164 error = try_to_freeze_tasks(false); 157 error = try_to_freeze_tasks(false);
165 if (!error) 158 if (!error)
166 printk("done."); 159 printk("done.");
@@ -168,38 +161,52 @@ int freeze_kernel_threads(void)
168 printk("\n"); 161 printk("\n");
169 BUG_ON(in_atomic()); 162 BUG_ON(in_atomic());
170 163
164 if (error)
165 thaw_kernel_threads();
171 return error; 166 return error;
172} 167}
173 168
174static void thaw_tasks(bool nosig_only) 169void thaw_processes(void)
175{ 170{
176 struct task_struct *g, *p; 171 struct task_struct *g, *p;
177 172
178 read_lock(&tasklist_lock); 173 if (pm_freezing)
179 do_each_thread(g, p) { 174 atomic_dec(&system_freezing_cnt);
180 if (!freezable(p)) 175 pm_freezing = false;
181 continue; 176 pm_nosig_freezing = false;
182 177
183 if (nosig_only && should_send_signal(p)) 178 oom_killer_enable();
184 continue; 179
180 printk("Restarting tasks ... ");
185 181
186 if (cgroup_freezing_or_frozen(p)) 182 thaw_workqueues();
187 continue;
188 183
189 thaw_process(p); 184 read_lock(&tasklist_lock);
185 do_each_thread(g, p) {
186 __thaw_task(p);
190 } while_each_thread(g, p); 187 } while_each_thread(g, p);
191 read_unlock(&tasklist_lock); 188 read_unlock(&tasklist_lock);
189
190 schedule();
191 printk("done.\n");
192} 192}
193 193
194void thaw_processes(void) 194void thaw_kernel_threads(void)
195{ 195{
196 oom_killer_enable(); 196 struct task_struct *g, *p;
197
198 pm_nosig_freezing = false;
199 printk("Restarting kernel threads ... ");
197 200
198 printk("Restarting tasks ... ");
199 thaw_workqueues(); 201 thaw_workqueues();
200 thaw_tasks(true); 202
201 thaw_tasks(false); 203 read_lock(&tasklist_lock);
204 do_each_thread(g, p) {
205 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
206 __thaw_task(p);
207 } while_each_thread(g, p);
208 read_unlock(&tasklist_lock);
209
202 schedule(); 210 schedule();
203 printk("done.\n"); 211 printk("done.\n");
204} 212}
205
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cbe2c1441392..6a768e537001 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone)
812 unsigned int res; 812 unsigned int res;
813 813
814 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 814 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
815 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 815 res += DIV_ROUND_UP(res * sizeof(struct bm_block),
816 LINKED_PAGE_DATA_SIZE);
816 return 2 * res; 817 return 2 * res;
817} 818}
818 819
@@ -858,6 +859,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
858 PageReserved(page)) 859 PageReserved(page))
859 return NULL; 860 return NULL;
860 861
862 if (page_is_guard(page))
863 return NULL;
864
861 return page; 865 return page;
862} 866}
863 867
@@ -920,6 +924,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
920 && (!kernel_page_present(page) || pfn_is_nosave(pfn))) 924 && (!kernel_page_present(page) || pfn_is_nosave(pfn)))
921 return NULL; 925 return NULL;
922 926
927 if (page_is_guard(page))
928 return NULL;
929
923 return page; 930 return page;
924} 931}
925 932
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4953dc054c53..4fd51beed879 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
45 mutex_lock(&pm_mutex); 45 lock_system_sleep();
46 suspend_ops = ops; 46 suspend_ops = ops;
47 mutex_unlock(&pm_mutex); 47 unlock_system_sleep();
48} 48}
49EXPORT_SYMBOL_GPL(suspend_set_ops); 49EXPORT_SYMBOL_GPL(suspend_set_ops);
50 50
@@ -106,13 +106,11 @@ static int suspend_prepare(void)
106 goto Finish; 106 goto Finish;
107 107
108 error = suspend_freeze_processes(); 108 error = suspend_freeze_processes();
109 if (error) { 109 if (!error)
110 suspend_stats.failed_freeze++;
111 dpm_save_failed_step(SUSPEND_FREEZE);
112 } else
113 return 0; 110 return 0;
114 111
115 suspend_thaw_processes(); 112 suspend_stats.failed_freeze++;
113 dpm_save_failed_step(SUSPEND_FREEZE);
116 usermodehelper_enable(); 114 usermodehelper_enable();
117 Finish: 115 Finish:
118 pm_notifier_call_chain(PM_POST_SUSPEND); 116 pm_notifier_call_chain(PM_POST_SUSPEND);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11a594c4ba25..8742fd013a94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/genhd.h> 19#include <linux/genhd.h>
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/buffer_head.h>
22#include <linux/bio.h> 21#include <linux/bio.h>
23#include <linux/blkdev.h> 22#include <linux/blkdev.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -774,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
774 773
775 pr_debug("PM: Free swap pages: %u\n", free_swap); 774 pr_debug("PM: Free swap pages: %u\n", free_swap);
776 775
777 required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? 776 required = PAGES_FOR_IO + nr_pages;
778 nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
779 return free_swap > required; 777 return free_swap > required;
780} 778}
781 779
@@ -803,10 +801,12 @@ int swsusp_write(unsigned int flags)
803 printk(KERN_ERR "PM: Cannot get swap writer\n"); 801 printk(KERN_ERR "PM: Cannot get swap writer\n");
804 return error; 802 return error;
805 } 803 }
806 if (!enough_swap(pages, flags)) { 804 if (flags & SF_NOCOMPRESS_MODE) {
807 printk(KERN_ERR "PM: Not enough free swap\n"); 805 if (!enough_swap(pages, flags)) {
808 error = -ENOSPC; 806 printk(KERN_ERR "PM: Not enough free swap\n");
809 goto out_finish; 807 error = -ENOSPC;
808 goto out_finish;
809 }
810 } 810 }
811 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 811 memset(&snapshot, 0, sizeof(struct snapshot_handle));
812 error = snapshot_read_next(&snapshot); 812 error = snapshot_read_next(&snapshot);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6d8f535c2b88..3e100075b13c 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -21,6 +21,7 @@
21#include <linux/swapops.h> 21#include <linux/swapops.h>
22#include <linux/pm.h> 22#include <linux/pm.h>
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/compat.h>
24#include <linux/console.h> 25#include <linux/console.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/freezer.h> 27#include <linux/freezer.h>
@@ -30,28 +31,6 @@
30 31
31#include "power.h" 32#include "power.h"
32 33
33/*
34 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
35 * will be removed in the future. They are only preserved here for
36 * compatibility with existing userland utilities.
37 */
38#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
39#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
40
41#define PMOPS_PREPARE 1
42#define PMOPS_ENTER 2
43#define PMOPS_FINISH 3
44
45/*
46 * NOTE: The following ioctl definitions are wrong and have been replaced with
47 * correct ones. They are only preserved here for compatibility with existing
48 * userland utilities and will be removed in the future.
49 */
50#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
51#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
52#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
53#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
54
55 34
56#define SNAPSHOT_MINOR 231 35#define SNAPSHOT_MINOR 231
57 36
@@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
71 struct snapshot_data *data; 50 struct snapshot_data *data;
72 int error; 51 int error;
73 52
74 mutex_lock(&pm_mutex); 53 lock_system_sleep();
75 54
76 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { 55 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
77 error = -EBUSY; 56 error = -EBUSY;
@@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
123 data->platform_support = 0; 102 data->platform_support = 0;
124 103
125 Unlock: 104 Unlock:
126 mutex_unlock(&pm_mutex); 105 unlock_system_sleep();
127 106
128 return error; 107 return error;
129} 108}
@@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
132{ 111{
133 struct snapshot_data *data; 112 struct snapshot_data *data;
134 113
135 mutex_lock(&pm_mutex); 114 lock_system_sleep();
136 115
137 swsusp_free(); 116 swsusp_free();
138 free_basic_memory_bitmaps(); 117 free_basic_memory_bitmaps();
@@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
146 PM_POST_HIBERNATION : PM_POST_RESTORE); 125 PM_POST_HIBERNATION : PM_POST_RESTORE);
147 atomic_inc(&snapshot_device_available); 126 atomic_inc(&snapshot_device_available);
148 127
149 mutex_unlock(&pm_mutex); 128 unlock_system_sleep();
150 129
151 return 0; 130 return 0;
152} 131}
@@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
158 ssize_t res; 137 ssize_t res;
159 loff_t pg_offp = *offp & ~PAGE_MASK; 138 loff_t pg_offp = *offp & ~PAGE_MASK;
160 139
161 mutex_lock(&pm_mutex); 140 lock_system_sleep();
162 141
163 data = filp->private_data; 142 data = filp->private_data;
164 if (!data->ready) { 143 if (!data->ready) {
@@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
179 *offp += res; 158 *offp += res;
180 159
181 Unlock: 160 Unlock:
182 mutex_unlock(&pm_mutex); 161 unlock_system_sleep();
183 162
184 return res; 163 return res;
185} 164}
@@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
191 ssize_t res; 170 ssize_t res;
192 loff_t pg_offp = *offp & ~PAGE_MASK; 171 loff_t pg_offp = *offp & ~PAGE_MASK;
193 172
194 mutex_lock(&pm_mutex); 173 lock_system_sleep();
195 174
196 data = filp->private_data; 175 data = filp->private_data;
197 176
@@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
208 if (res > 0) 187 if (res > 0)
209 *offp += res; 188 *offp += res;
210unlock: 189unlock:
211 mutex_unlock(&pm_mutex); 190 unlock_system_sleep();
212 191
213 return res; 192 return res;
214} 193}
215 194
216static void snapshot_deprecated_ioctl(unsigned int cmd)
217{
218 if (printk_ratelimit())
219 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
220 "be removed soon, update your suspend-to-disk "
221 "utilities\n",
222 __builtin_return_address(0), cmd);
223}
224
225static long snapshot_ioctl(struct file *filp, unsigned int cmd, 195static long snapshot_ioctl(struct file *filp, unsigned int cmd,
226 unsigned long arg) 196 unsigned long arg)
227{ 197{
@@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
257 break; 227 break;
258 228
259 error = freeze_processes(); 229 error = freeze_processes();
260 if (error) { 230 if (error)
261 thaw_processes();
262 usermodehelper_enable(); 231 usermodehelper_enable();
263 } 232 else
264 if (!error)
265 data->frozen = 1; 233 data->frozen = 1;
266 break; 234 break;
267 235
@@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
274 data->frozen = 0; 242 data->frozen = 0;
275 break; 243 break;
276 244
277 case SNAPSHOT_ATOMIC_SNAPSHOT:
278 snapshot_deprecated_ioctl(cmd);
279 case SNAPSHOT_CREATE_IMAGE: 245 case SNAPSHOT_CREATE_IMAGE:
280 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 246 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
281 error = -EPERM; 247 error = -EPERM;
@@ -283,10 +249,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
283 } 249 }
284 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
285 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
286 if (!error) 252 if (error) {
253 thaw_kernel_threads();
254 } else {
287 error = put_user(in_suspend, (int __user *)arg); 255 error = put_user(in_suspend, (int __user *)arg);
288 if (!error) 256 if (!error && !freezer_test_done)
289 data->ready = 1; 257 data->ready = 1;
258 if (freezer_test_done) {
259 freezer_test_done = false;
260 thaw_kernel_threads();
261 }
262 }
290 break; 263 break;
291 264
292 case SNAPSHOT_ATOMIC_RESTORE: 265 case SNAPSHOT_ATOMIC_RESTORE:
@@ -303,10 +276,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
303 swsusp_free(); 276 swsusp_free();
304 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 277 memset(&data->handle, 0, sizeof(struct snapshot_handle));
305 data->ready = 0; 278 data->ready = 0;
279 /*
280 * It is necessary to thaw kernel threads here, because
281 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
282 * SNAPSHOT_FREE. In that case, if kernel threads were not
283 * thawed, the preallocation of memory carried out by
284 * hibernation_snapshot() might run into problems (i.e. it
285 * might fail or even deadlock).
286 */
287 thaw_kernel_threads();
306 break; 288 break;
307 289
308 case SNAPSHOT_SET_IMAGE_SIZE:
309 snapshot_deprecated_ioctl(cmd);
310 case SNAPSHOT_PREF_IMAGE_SIZE: 290 case SNAPSHOT_PREF_IMAGE_SIZE:
311 image_size = arg; 291 image_size = arg;
312 break; 292 break;
@@ -321,16 +301,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
322 break; 302 break;
323 303
324 case SNAPSHOT_AVAIL_SWAP:
325 snapshot_deprecated_ioctl(cmd);
326 case SNAPSHOT_AVAIL_SWAP_SIZE: 304 case SNAPSHOT_AVAIL_SWAP_SIZE:
327 size = count_swap_pages(data->swap, 1); 305 size = count_swap_pages(data->swap, 1);
328 size <<= PAGE_SHIFT; 306 size <<= PAGE_SHIFT;
329 error = put_user(size, (loff_t __user *)arg); 307 error = put_user(size, (loff_t __user *)arg);
330 break; 308 break;
331 309
332 case SNAPSHOT_GET_SWAP_PAGE:
333 snapshot_deprecated_ioctl(cmd);
334 case SNAPSHOT_ALLOC_SWAP_PAGE: 310 case SNAPSHOT_ALLOC_SWAP_PAGE:
335 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 311 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
336 error = -ENODEV; 312 error = -ENODEV;
@@ -353,27 +329,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
353 free_all_swap_pages(data->swap); 329 free_all_swap_pages(data->swap);
354 break; 330 break;
355 331
356 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
357 snapshot_deprecated_ioctl(cmd);
358 if (!swsusp_swap_in_use()) {
359 /*
360 * User space encodes device types as two-byte values,
361 * so we need to recode them
362 */
363 if (old_decode_dev(arg)) {
364 data->swap = swap_type_of(old_decode_dev(arg),
365 0, NULL);
366 if (data->swap < 0)
367 error = -ENODEV;
368 } else {
369 data->swap = -1;
370 error = -EINVAL;
371 }
372 } else {
373 error = -EPERM;
374 }
375 break;
376
377 case SNAPSHOT_S2RAM: 332 case SNAPSHOT_S2RAM:
378 if (!data->frozen) { 333 if (!data->frozen) {
379 error = -EPERM; 334 error = -EPERM;
@@ -396,33 +351,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
396 error = hibernation_platform_enter(); 351 error = hibernation_platform_enter();
397 break; 352 break;
398 353
399 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
400 snapshot_deprecated_ioctl(cmd);
401 error = -EINVAL;
402
403 switch (arg) {
404
405 case PMOPS_PREPARE:
406 data->platform_support = 1;
407 error = 0;
408 break;
409
410 case PMOPS_ENTER:
411 if (data->platform_support)
412 error = hibernation_platform_enter();
413 break;
414
415 case PMOPS_FINISH:
416 if (data->platform_support)
417 error = 0;
418 break;
419
420 default:
421 printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
422
423 }
424 break;
425
426 case SNAPSHOT_SET_SWAP_AREA: 354 case SNAPSHOT_SET_SWAP_AREA:
427 if (swsusp_swap_in_use()) { 355 if (swsusp_swap_in_use()) {
428 error = -EPERM; 356 error = -EPERM;
@@ -464,6 +392,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
464 return error; 392 return error;
465} 393}
466 394
395#ifdef CONFIG_COMPAT
396
397struct compat_resume_swap_area {
398 compat_loff_t offset;
399 u32 dev;
400} __packed;
401
402static long
403snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
404{
405 BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
406
407 switch (cmd) {
408 case SNAPSHOT_GET_IMAGE_SIZE:
409 case SNAPSHOT_AVAIL_SWAP_SIZE:
410 case SNAPSHOT_ALLOC_SWAP_PAGE: {
411 compat_loff_t __user *uoffset = compat_ptr(arg);
412 loff_t offset;
413 mm_segment_t old_fs;
414 int err;
415
416 old_fs = get_fs();
417 set_fs(KERNEL_DS);
418 err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
419 set_fs(old_fs);
420 if (!err && put_user(offset, uoffset))
421 err = -EFAULT;
422 return err;
423 }
424
425 case SNAPSHOT_CREATE_IMAGE:
426 return snapshot_ioctl(file, cmd,
427 (unsigned long) compat_ptr(arg));
428
429 case SNAPSHOT_SET_SWAP_AREA: {
430 struct compat_resume_swap_area __user *u_swap_area =
431 compat_ptr(arg);
432 struct resume_swap_area swap_area;
433 mm_segment_t old_fs;
434 int err;
435
436 err = get_user(swap_area.offset, &u_swap_area->offset);
437 err |= get_user(swap_area.dev, &u_swap_area->dev);
438 if (err)
439 return -EFAULT;
440 old_fs = get_fs();
441 set_fs(KERNEL_DS);
442 err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
443 (unsigned long) &swap_area);
444 set_fs(old_fs);
445 return err;
446 }
447
448 default:
449 return snapshot_ioctl(file, cmd, arg);
450 }
451}
452
453#endif /* CONFIG_COMPAT */
454
467static const struct file_operations snapshot_fops = { 455static const struct file_operations snapshot_fops = {
468 .open = snapshot_open, 456 .open = snapshot_open,
469 .release = snapshot_release, 457 .release = snapshot_release,
@@ -471,6 +459,9 @@ static const struct file_operations snapshot_fops = {
471 .write = snapshot_write, 459 .write = snapshot_write,
472 .llseek = no_llseek, 460 .llseek = no_llseek,
473 .unlocked_ioctl = snapshot_ioctl, 461 .unlocked_ioctl = snapshot_ioctl,
462#ifdef CONFIG_COMPAT
463 .compat_ioctl = snapshot_compat_ioctl,
464#endif
474}; 465};
475 466
476static struct miscdevice snapshot_device = { 467static struct miscdevice snapshot_device = {
diff --git a/kernel/printk.c b/kernel/printk.c
index 1455a0d4eedd..13c0a1143f49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
199 unsigned long mem; 199 unsigned long mem;
200 200
201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR) 202 if (!mem)
203 return; 203 return;
204 new_log_buf = __va(mem); 204 new_log_buf = __va(mem);
205 } else { 205 } else {
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
521 } 521 }
522} 522}
523 523
524static int __read_mostly ignore_loglevel; 524static bool __read_mostly ignore_loglevel;
525 525
526static int __init ignore_loglevel_setup(char *str) 526static int __init ignore_loglevel_setup(char *str)
527{ 527{
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str)
532} 532}
533 533
534early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR); 535module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" 536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
537 "print all kernel messages to the console."); 537 "print all kernel messages to the console.");
538 538
@@ -688,6 +688,7 @@ static void zap_locks(void)
688 688
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 debug_locks_off();
691 /* If a crash is occurring, make sure we can't deadlock */ 692 /* If a crash is occurring, make sure we can't deadlock */
692 raw_spin_lock_init(&logbuf_lock); 693 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 694 /* And make sure that we print immediately */
@@ -695,9 +696,9 @@ static void zap_locks(void)
695} 696}
696 697
697#if defined(CONFIG_PRINTK_TIME) 698#if defined(CONFIG_PRINTK_TIME)
698static int printk_time = 1; 699static bool printk_time = 1;
699#else 700#else
700static int printk_time = 0; 701static bool printk_time = 0;
701#endif 702#endif
702module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
703 704
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
840 boot_delay_msec(); 841 boot_delay_msec();
841 printk_delay(); 842 printk_delay();
842 843
843 preempt_disable();
844 /* This stops the holder of console_sem just where we want him */ 844 /* This stops the holder of console_sem just where we want him */
845 raw_local_irq_save(flags); 845 local_irq_save(flags);
846 this_cpu = smp_processor_id(); 846 this_cpu = smp_processor_id();
847 847
848 /* 848 /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 * recursion and return - but flag the recursion so that 856 * recursion and return - but flag the recursion so that
857 * it can be printed at the next appropriate moment: 857 * it can be printed at the next appropriate moment:
858 */ 858 */
859 if (!oops_in_progress) { 859 if (!oops_in_progress && !lockdep_recursing(current)) {
860 recursion_bug = 1; 860 recursion_bug = 1;
861 goto out_restore_irqs; 861 goto out_restore_irqs;
862 } 862 }
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
962 962
963 lockdep_on(); 963 lockdep_on();
964out_restore_irqs: 964out_restore_irqs:
965 raw_local_irq_restore(flags); 965 local_irq_restore(flags);
966 966
967 preempt_enable();
968 return printed_len; 967 return printed_len;
969} 968}
970EXPORT_SYMBOL(printk); 969EXPORT_SYMBOL(printk);
@@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1099 return -1; 1098 return -1;
1100} 1099}
1101 1100
1102int console_suspend_enabled = 1; 1101bool console_suspend_enabled = 1;
1103EXPORT_SYMBOL(console_suspend_enabled); 1102EXPORT_SYMBOL(console_suspend_enabled);
1104 1103
1105static int __init console_suspend_disable(char *str) 1104static int __init console_suspend_disable(char *str)
@@ -1293,10 +1292,11 @@ again:
1293 raw_spin_lock(&logbuf_lock); 1292 raw_spin_lock(&logbuf_lock);
1294 if (con_start != log_end) 1293 if (con_start != log_end)
1295 retry = 1; 1294 retry = 1;
1295 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1296
1296 if (retry && console_trylock()) 1297 if (retry && console_trylock())
1297 goto again; 1298 goto again;
1298 1299
1299 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1300 if (wake_klogd) 1300 if (wake_klogd)
1301 wake_up_klogd(); 1301 wake_up_klogd();
1302} 1302}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 24d04477b257..00ab2ca5ed11 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child)
96 */ 96 */
97 if (!(child->flags & PF_EXITING) && 97 if (!(child->flags & PF_EXITING) &&
98 (child->signal->flags & SIGNAL_STOP_STOPPED || 98 (child->signal->flags & SIGNAL_STOP_STOPPED ||
99 child->signal->group_stop_count)) 99 child->signal->group_stop_count)) {
100 child->jobctl |= JOBCTL_STOP_PENDING; 100 child->jobctl |= JOBCTL_STOP_PENDING;
101 101
102 /*
103 * This is only possible if this thread was cloned by the
104 * traced task running in the stopped group, set the signal
105 * for the future reports.
106 * FIXME: we should change ptrace_init_task() to handle this
107 * case.
108 */
109 if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
110 child->jobctl |= SIGSTOP;
111 }
112
102 /* 113 /*
103 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick 114 * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
104 * @child in the butt. Note that @resume should be used iff @child 115 * @child in the butt. Note that @resume should be used iff @child
@@ -161,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
161 return ret; 172 return ret;
162} 173}
163 174
175static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
176{
177 if (mode & PTRACE_MODE_NOAUDIT)
178 return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
179 else
180 return has_ns_capability(current, ns, CAP_SYS_PTRACE);
181}
182
164int __ptrace_may_access(struct task_struct *task, unsigned int mode) 183int __ptrace_may_access(struct task_struct *task, unsigned int mode)
165{ 184{
166 const struct cred *cred = current_cred(), *tcred; 185 const struct cred *cred = current_cred(), *tcred;
@@ -187,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
187 cred->gid == tcred->sgid && 206 cred->gid == tcred->sgid &&
188 cred->gid == tcred->gid)) 207 cred->gid == tcred->gid))
189 goto ok; 208 goto ok;
190 if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) 209 if (ptrace_has_cap(tcred->user->user_ns, mode))
191 goto ok; 210 goto ok;
192 rcu_read_unlock(); 211 rcu_read_unlock();
193 return -EPERM; 212 return -EPERM;
@@ -196,7 +215,7 @@ ok:
196 smp_rmb(); 215 smp_rmb();
197 if (task->mm) 216 if (task->mm)
198 dumpable = get_dumpable(task->mm); 217 dumpable = get_dumpable(task->mm);
199 if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) 218 if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
200 return -EPERM; 219 return -EPERM;
201 220
202 return security_ptrace_access_check(task, mode); 221 return security_ptrace_access_check(task, mode);
@@ -266,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request,
266 task->ptrace = PT_PTRACED; 285 task->ptrace = PT_PTRACED;
267 if (seize) 286 if (seize)
268 task->ptrace |= PT_SEIZED; 287 task->ptrace |= PT_SEIZED;
269 if (task_ns_capable(task, CAP_SYS_PTRACE)) 288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
270 task->ptrace |= PT_PTRACE_CAP; 289 task->ptrace |= PT_PTRACE_CAP;
271 290
272 __ptrace_link(task, current); 291 __ptrace_link(task, current);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index f600868d550d..aa88baab5f78 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -30,6 +30,13 @@
30#endif /* #else #ifdef CONFIG_RCU_TRACE */ 30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31 31
32/* 32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the 41 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c5b98e565aee..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
93{ 93{
94 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
95 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
96 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
97} 99}
98EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
316}; 318};
317EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
318#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 636af6d9c6e5..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state.
74 */ 98 */
75void rcu_exit_nohz(void) 99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
76{ 114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
135 */
136void rcu_idle_exit(void)
137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
175
176#endif /* #ifdef CONFIG_PROVE_RCU */
79 177
80#endif /* #ifdef CONFIG_NO_HZ */ 178/*
179 * Test whether the current CPU was interrupted from idle. Nested
180 * interrupts don't count, we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
126 232
127/* 233/*
128 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
129 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
130 */ 238 */
131void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
132{ 240{
133 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
134 (idle_cpu(cpu) &&
135 !in_softirq() &&
136 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
137 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
138 else if (!in_softirq()) 243 else if (!in_softirq())
139 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
154 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) { 260 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); 262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
158 return; 267 return;
159 } 268 }
160 269
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
183 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
184 } 293 }
185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); 295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
187} 298}
188 299
189static void rcu_process_callbacks(struct softirq_action *unused) 300static void rcu_process_callbacks(struct softirq_action *unused)
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 2b0484a5dc28..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -312,8 +312,8 @@ static int rcu_boost(void)
312 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
314 314
315 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
316 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
317} 317}
318 318
319/* 319/*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
885 wake_up(&rcu_kthread_wq); 885 wake_up(&rcu_kthread_wq);
886} 886}
887 887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
888/* 901/*
889 * This kthread invokes RCU callbacks whose grace periods have 902 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the 903 * elapsed. It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
938 raise_softirq(RCU_SOFTIRQ); 951 raise_softirq(RCU_SOFTIRQ);
939} 952}
940 953
954#ifdef CONFIG_RCU_TRACE
955
/*
 * Without a callbacks kthread the current task can never be that
 * kthread, so the answer is unconditionally "no".
 */
static bool rcu_is_callbacks_kthread(void)
{
	/* No callbacks kthread exists in this variant. */
	return false;
}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
941void rcu_init(void) 966void rcu_init(void)
942{ 967{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 764825c2685c..a58ac285fc69 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -56,14 +56,16 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
57static int stat_interval; /* Interval between stats, in seconds. */ 57static int stat_interval; /* Interval between stats, in seconds. */
58 /* Defaults to "only at end of test". */ 58 /* Defaults to "only at end of test". */
59static int verbose; /* Print more debug info. */ 59static bool verbose; /* Print more debug info. */
60static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 60static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
612 .name = "srcu" 632 .name = "srcu"
613}; 633};
614 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
615static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
616{ 660{
617 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
913 return 0; 957 return 0;
914} 958}
915 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
916/* 972/*
917 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
918 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
934 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
935 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
936 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
937 if (p == NULL) { 994 if (p == NULL) {
938 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
939 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
951 /* Should not happen, but... */ 1008 /* Should not happen, but... */
952 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
953 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
954 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
955 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
956 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
994 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
995 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
996 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
997 if (p == NULL) { 1057 if (p == NULL) {
998 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
999 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
1009 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1010 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1011 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1012 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1013 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1014 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
1056 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1058 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1059 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1060 rcu_torture_current, 1123 rcu_torture_current,
1061 rcu_torture_current_version, 1124 rcu_torture_current_version,
1062 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
1068 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1069 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1072 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1073 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1074 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1232 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1233 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1234 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1235 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1236 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1237 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1238 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1239 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1240 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1241} 1310}
1242 1311
1243static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1287 return 0; 1356 return 0;
1288} 1357}
1289 1358
1359/*
1360 * Cause the rcutorture test to shutdown the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int __cpuinit
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int __cpuinit
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) {
1457 onoff_task = NULL;
1458 return PTR_ERR(onoff_task);
1459 }
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
/*
 * CPU hotplug is not configured, so there is no onoff kthread to start.
 * Returns 0 so that this stub has the same int-returning signature as
 * the CONFIG_HOTPLUG_CPU variant and callers may check the result in
 * either configuration.
 */
static int
rcu_torture_onoff_init(void)
{
	return 0;
}
1477
static void rcu_torture_onoff_cleanup(void)
{
	/* No onoff kthread exists without CONFIG_HOTPLUG_CPU; nothing to stop. */
}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
1290static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1291 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1292{ 1486{
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
1391 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1392 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1393 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1394 1593
1395 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1396 1595
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
1416 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1419 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1421 1620
1422 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
1607 } 1806 }
1608 } 1807 }
1609 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1610 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1611 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1612 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b76d812740c..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
71 }, \ 71 }, \
72 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
73 .gpnum = -300, \ 73 .gpnum = -300, \
74 .completed = -300, \ 74 .completed = -300, \
75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
195} 195}
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 197
198#ifdef CONFIG_NO_HZ
199DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
200 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
201 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
202}; 201};
203#endif /* #ifdef CONFIG_NO_HZ */
204 202
205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
206static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
328 return 1; 326 return 1;
329 } 327 }
330 328
331 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
332 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
333 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
334 332 * where idle loops fail to inform RCU about the CPU being idle.
335 /* The CPU is online, so send it a reschedule IPI. */ 333 */
336 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
337 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
338 else 336 else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
343 341
344#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
345 343
346#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
347 370
348/** 371/**
349 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
350 * 373 *
351 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
352 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
353 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
354 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
355 */ 382 */
356void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
357{ 384{
358 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
359 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
360 388
361 local_irq_save(flags); 389 local_irq_save(flags);
362 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
363 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
364 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
365 return; 393 rcu_idle_enter_common(rdtp, oldval);
366 }
367 trace_rcu_dyntick("Start");
368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
369 smp_mb__before_atomic_inc(); /* See above. */
370 atomic_inc(&rdtp->dynticks);
371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 local_irq_restore(flags); 394 local_irq_restore(flags);
374} 395}
375 396
376/* 397/**
377 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
378 * 403 *
379 * Exit nohz mode, in other words, -enter- the mode in which RCU 404 * This code assumes that the idle loop never does anything that might
380 * read-side critical sections normally occur. 405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
408 *
409 * Use things like work queues to work around this limitation.
410 *
411 * You have been warned.
381 */ 412 */
382void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
383{ 414{
384 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
385 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
386 418
387 local_irq_save(flags); 419 local_irq_save(flags);
388 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
389 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
390 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
391 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
392 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
393 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
394 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
396 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End"); 445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
399 local_irq_restore(flags); 519 local_irq_restore(flags);
400} 520}
401 521
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
442 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
443} 563}
444 564
565#ifdef CONFIG_PROVE_RCU
566
445/** 567/**
446 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
447 * 569 *
448 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
449 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * or NMI handler, return true.
450 */ 572 */
451void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
452{ 574{
453 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
454} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
455 585
456/** 586/**
457 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
458 * 588 *
459 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
460 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
461 * with no ticks. 591 * disabled preemption.
462 */ 592 */
463void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
464{ 594{
465 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
466} 596}
467 597
468#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
475static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
476{ 606{
477 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
478 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
479} 609}
480 610
481/* 611/*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
512 642
513#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
514 644
515#else /* #ifdef CONFIG_NO_HZ */
516
517#ifdef CONFIG_SMP
518
519static int dyntick_save_progress_counter(struct rcu_data *rdp)
520{
521 return 0;
522}
523
524static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
525{
526 return rcu_implicit_offline_qs(rdp);
527}
528
529#endif /* #ifdef CONFIG_SMP */
530
531#endif /* #else #ifdef CONFIG_NO_HZ */
532
533int rcu_cpu_stall_suppress __read_mostly;
534
535static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
536{ 646{
537 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
866 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
867 rsp->gpnum++; 977 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
872 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
873 983
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
877 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
878 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
879 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
881 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
882 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
927 1037
928 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
929 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
930 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
931 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
932 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
933} 1043}
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
991 1101
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
994 rsp->signaled = RCU_GP_IDLE; 1104 rsp->fqs_state = RCU_GP_IDLE;
995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
996} 1106}
997 1107
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1221 else 1331 else
1222 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1223 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1224 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1225 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1226} 1336}
1227 1337
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1263 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0); 1375 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0); 1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1267 return; 1379 return;
1268 } 1380 }
1269 1381
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1291 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1292 __rcu_reclaim(rsp->name, list); 1404 __rcu_reclaim(rsp->name, list);
1293 list = next; 1405 list = next;
1294 if (++count >= bl) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1295 break; 1410 break;
1296 } 1411 }
1297 1412
1298 local_irq_save(flags); 1413 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count); 1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1300 1417
1301 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1302 rdp->qlen -= count; 1419 rdp->qlen -= count;
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1335 * Also schedule RCU core processing. 1452 * Also schedule RCU core processing.
1336 * 1453 *
1337 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1339 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1340 */ 1457 */
1341void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1342{ 1459{
1343 trace_rcu_utilization("Start scheduler-tick"); 1460 trace_rcu_utilization("Start scheduler-tick");
1344 if (user || 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1347 1462
1348 /* 1463 /*
1349 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1457 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1458 } 1573 }
1459 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1460 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1461 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1462 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1463 1578
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1473 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1474 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1475 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1476 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1477 break; 1592 break;
1478 1593
1479 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
1812 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1813 * 1 if so. 1928 * 1 if so.
1814 */ 1929 */
1815static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1816{ 1931{
1817 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1818 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1913 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1914 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1915 rdp->qlen = 0; 2030 rdp->qlen = 0;
1916#ifdef CONFIG_NO_HZ
1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1918#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1919 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
1920 rdp->rsp = rsp; 2035 rdp->rsp = rsp;
1921 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1942 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1943 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1944 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1945 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1946 2065
1947 /* 2066 /*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2023 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
2024 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
2025 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
2026 break; 2146 break;
2027 case CPU_DEAD: 2147 case CPU_DEAD:
2028 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 849ce9ec51fe..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
91 92
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -302,16 +299,12 @@ struct rcu_data {
302 struct rcu_state *rsp; 299 struct rcu_state *rsp;
303}; 300};
304 301
305/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
306#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
307#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
308#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
309#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
310#ifdef CONFIG_NO_HZ
311#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
312#else /* #ifdef CONFIG_NO_HZ */
313#define RCU_SIGNAL_INIT RCU_FORCE_QS
314#endif /* #else #ifdef CONFIG_NO_HZ */
315 308
316#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
317 310
@@ -361,7 +354,7 @@ struct rcu_state {
361 354
362 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
363 356
364 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
365 /* Force QS state. */ 358 /* Force QS state. */
366 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
367 /* is running. */ 360 /* is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
451static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
452void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
453#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
454static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
455#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
456static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
457static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
463static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
464#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
465static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
466static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
473#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
474static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
475static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
476 474
477#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4b9b9f8a4184..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
312{ 312{
313 int empty; 313 int empty;
314 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
315 unsigned long flags; 316 unsigned long flags;
316 struct list_head *np; 317 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST 318#ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
382 /* 383 /*
383 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
384 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
386 */ 388 */
389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
388 trace_rcu_quiescent_state_report("preempt_rcu", 391 trace_rcu_quiescent_state_report("preempt_rcu",
389 rnp->gpnum, 392 rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
406 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
407 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
408 */ 411 */
409 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
410 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
411 } else { 414 } else {
412 local_irq_restore(flags); 415 local_irq_restore(flags);
413 } 416 }
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
729 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
730 * iteratively!) 733 * iteratively!)
731 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
732 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
733 */ 739 */
734static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
735{ 742{
736 unsigned long flags; 743 unsigned long flags;
737 unsigned long mask; 744 unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
744 } 751 }
745 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
746 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
747 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
748 break; 756 break;
749 } 757 }
750 mask = rnp->grpmask; 758 mask = rnp->grpmask;
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
777 must_wait = 1; 785 must_wait = 1;
778 } 786 }
779 if (!must_wait) 787 if (!must_wait)
780 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
781} 789}
782 790
783/* 791/*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1069 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1070 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1071 */ 1079 */
1072static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1073{ 1082{
1074 return;
1075} 1083}
1076 1084
1077#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1157 1165
1158#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1166#endif /* #else #ifdef CONFIG_RCU_TRACE */
1159 1167
1160static struct lock_class_key rcu_boost_class;
1161
1162/* 1168/*
1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1169 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1164 * or ->boost_tasks, advancing the pointer to the next task in the 1170 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
1221 */ 1227 */
1222 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1223 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1227 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1228 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1231 1234
1232 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1233} 1237}
1234 1238
1235/* 1239/*
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void)
1329} 1333}
1330 1334
1331/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1332 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1333 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1334 * kthread. 1347 * kthread.
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1772 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1773} 1786}
1774 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1775static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1776{ 1794{
1777} 1795}
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
1907 * grace period works for us. 1925 * grace period works for us.
1908 */ 1926 */
1909 get_online_cpus(); 1927 get_online_cpus();
1910 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1911 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1912 } 1930 }
1913 1931
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1939 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1940 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1941 * 1959 *
1942 * Because we have preemptible RCU, just check whether this CPU needs 1960 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1943 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1944 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1945 */ 1962 */
1946int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1947{ 1964{
1948 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1973}
1974
1975/*
1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1977 * after it.
1978 */
1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1988{
1949} 1989}
1950 1990
1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1952 1992
1953#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three preprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1954static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1955static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1956 2031
1957/* 2032/*
1958 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1959 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1960 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1961 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1962 * 2095 *
1963 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1964 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1965 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1966 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1967 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1968 * 2102 *
1969 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1970 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1971 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1972 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1973 */ 2109 */
1974int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1975{ 2111{
1976 int c = 0; 2112 unsigned long flags;
1977 int snap; 2113
1978 int thatcpu; 2114 local_irq_save(flags);
1979 2115
1980 /* Check for being in the holdoff period. */ 2116 /*
1981 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1982 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1983 2119 */
1984 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1985 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1986 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1987 continue; 2123 local_irq_restore(flags);
1988 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1989 thatcpu).dynticks); 2125 return;
1990 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1991 if ((snap & 0x1) != 0) { 2127
1992 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1993 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1994 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1995 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1996 } 2136 }
1997 2137
1998 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1999 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2000 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
2001 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
2002 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2003 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
2004 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2005 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
2006 } 2158 }
2007 2159
2008 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2009 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2010 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
2011 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
2012 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
2013 } 2177 }
2014 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2015 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
2016 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
2017 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
2018 } 2183 }
2019 2184
2020 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
2021 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
2022 invoke_rcu_core(); 2192 invoke_rcu_core();
2023 return c; 2193 } else {
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained");
2196 }
2024} 2197}
2025 2198
2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9feffa4c0695..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
67 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
69 rdp->qs_pending); 69 rdp->qs_pending);
70#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
71 seq_printf(m, " dt=%d/%d/%d df=%lu",
72 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
73 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
74 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
75 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
76#endif /* #ifdef CONFIG_NO_HZ */
77 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
78 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
79 rdp->qlen, 77 rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
143 rdp->qs_pending); 141 rdp->qs_pending);
144#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
145 seq_printf(m, ",%d,%d,%d,%lu",
146 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
147 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
148 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
149 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
150#endif /* #ifdef CONFIG_NO_HZ */
151 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
152 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
153 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
171static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
172{ 168{
173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
174#ifdef CONFIG_NO_HZ
175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
176#endif /* #ifdef CONFIG_NO_HZ */
177 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
178#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
179 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
279 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
280 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
281 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
282 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
283 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
284 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/relay.c b/kernel/relay.c
index 226fade4d727..ab56a1764d4d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -164,10 +164,14 @@ depopulate:
164 */ 164 */
165static struct rchan_buf *relay_create_buf(struct rchan *chan) 165static struct rchan_buf *relay_create_buf(struct rchan *chan)
166{ 166{
167 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); 167 struct rchan_buf *buf;
168 if (!buf) 168
169 if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
169 return NULL; 170 return NULL;
170 171
172 buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
173 if (!buf)
174 return NULL;
171 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); 175 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
172 if (!buf->padding) 176 if (!buf->padding)
173 goto free_buf; 177 goto free_buf;
@@ -302,7 +306,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
302 */ 306 */
303static struct dentry *create_buf_file_default_callback(const char *filename, 307static struct dentry *create_buf_file_default_callback(const char *filename,
304 struct dentry *parent, 308 struct dentry *parent,
305 int mode, 309 umode_t mode,
306 struct rchan_buf *buf, 310 struct rchan_buf *buf,
307 int *is_global) 311 int *is_global)
308{ 312{
@@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename,
574 578
575 if (!(subbuf_size && n_subbufs)) 579 if (!(subbuf_size && n_subbufs))
576 return NULL; 580 return NULL;
581 if (subbuf_size > UINT_MAX / n_subbufs)
582 return NULL;
577 583
578 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); 584 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
579 if (!chan) 585 if (!chan)
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cce..d508363858b3 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -66,6 +66,31 @@ done:
66 return ret; 66 return ret;
67} 67}
68 68
69int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
70 struct res_counter **limit_fail_at)
71{
72 int ret, r;
73 unsigned long flags;
74 struct res_counter *c;
75
76 r = ret = 0;
77 *limit_fail_at = NULL;
78 local_irq_save(flags);
79 for (c = counter; c != NULL; c = c->parent) {
80 spin_lock(&c->lock);
81 r = res_counter_charge_locked(c, val);
82 if (r)
83 c->usage += val;
84 spin_unlock(&c->lock);
85 if (r < 0 && ret == 0) {
86 *limit_fail_at = c;
87 ret = r;
88 }
89 }
90 local_irq_restore(flags);
91
92 return ret;
93}
69void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 94void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
70{ 95{
71 if (WARN_ON(counter->usage < val)) 96 if (WARN_ON(counter->usage < val))
@@ -159,8 +184,7 @@ int res_counter_memparse_write_strategy(const char *buf,
159 return 0; 184 return 0;
160 } 185 }
161 186
162 /* FIXME - make memparse() take const char* args */ 187 *res = memparse(buf, &end);
163 *res = memparse((char *)buf, &end);
164 if (*end != '\0') 188 if (*end != '\0')
165 return -EINVAL; 189 return -EINVAL;
166 190
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 8eafd1bd273e..16502d3a71c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
101 101
102 printk("\n============================================\n"); 102 printk("\n============================================\n");
103 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
104 printk( "--------------------------------------------\n"); 105 printk( "--------------------------------------------\n");
105 printk("%s/%d is deadlocking current task %s/%d\n\n", 106 printk("%s/%d is deadlocking current task %s/%d\n\n",
106 task->comm, task_pid_nr(task), 107 task->comm, task_pid_nr(task),
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 3d9f31cd79e7..98ec49475460 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> 6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 * 7 *
8 */ 8 */
9#include <linux/device.h>
9#include <linux/kthread.h> 10#include <linux/kthread.h>
10#include <linux/export.h> 11#include <linux/export.h>
11#include <linux/sched.h> 12#include <linux/sched.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
13#include <linux/sysdev.h>
14#include <linux/timer.h> 14#include <linux/timer.h>
15#include <linux/freezer.h> 15#include <linux/freezer.h>
16 16
@@ -27,7 +27,7 @@ struct test_thread_data {
27 int opdata; 27 int opdata;
28 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
29 int event; 29 int event;
30 struct sys_device sysdev; 30 struct device dev;
31}; 31};
32 32
33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; 33static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
271 * 271 *
272 * opcode:data 272 * opcode:data
273 */ 273 */
274static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, 274static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
275 const char *buf, size_t count) 275 const char *buf, size_t count)
276{ 276{
277 struct sched_param schedpar; 277 struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
279 char cmdbuf[32]; 279 char cmdbuf[32];
280 int op, dat, tid, ret; 280 int op, dat, tid, ret;
281 281
282 td = container_of(dev, struct test_thread_data, sysdev); 282 td = container_of(dev, struct test_thread_data, dev);
283 tid = td->sysdev.id; 283 tid = td->dev.id;
284 284
285 /* strings from sysfs write are not 0 terminated! */ 285 /* strings from sysfs write are not 0 terminated! */
286 if (count >= sizeof(cmdbuf)) 286 if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
334 * @dev: thread to query 334 * @dev: thread to query
335 * @buf: char buffer to be filled with thread status info 335 * @buf: char buffer to be filled with thread status info
336 */ 336 */
337static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, 337static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
338 char *buf) 338 char *buf)
339{ 339{
340 struct test_thread_data *td; 340 struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
342 char *curr = buf; 342 char *curr = buf;
343 int i; 343 int i;
344 344
345 td = container_of(dev, struct test_thread_data, sysdev); 345 td = container_of(dev, struct test_thread_data, dev);
346 tsk = threads[td->sysdev.id]; 346 tsk = threads[td->dev.id];
347 347
348 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
349 349
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
360 spin_unlock(&rttest_lock); 360 spin_unlock(&rttest_lock);
361 361
362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk, 362 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
363 mutexes[td->sysdev.id].owner); 363 mutexes[td->dev.id].owner);
364 364
365 return curr - buf; 365 return curr - buf;
366} 366}
367 367
368static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); 368static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
369static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 369static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
370 370
371static struct sysdev_class rttest_sysclass = { 371static struct bus_type rttest_subsys = {
372 .name = "rttest", 372 .name = "rttest",
373 .dev_name = "rttest",
373}; 374};
374 375
375static int init_test_thread(int id) 376static int init_test_thread(int id)
376{ 377{
377 thread_data[id].sysdev.cls = &rttest_sysclass; 378 thread_data[id].dev.bus = &rttest_subsys;
378 thread_data[id].sysdev.id = id; 379 thread_data[id].dev.id = id;
379 380
380 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); 381 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
381 if (IS_ERR(threads[id])) 382 if (IS_ERR(threads[id]))
382 return PTR_ERR(threads[id]); 383 return PTR_ERR(threads[id]);
383 384
384 return sysdev_register(&thread_data[id].sysdev); 385 return device_register(&thread_data[id].dev);
385} 386}
386 387
387static int init_rttest(void) 388static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
393 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) 394 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
394 rt_mutex_init(&mutexes[i]); 395 rt_mutex_init(&mutexes[i]);
395 396
396 ret = sysdev_class_register(&rttest_sysclass); 397 ret = subsys_system_register(&rttest_subsys, NULL);
397 if (ret) 398 if (ret)
398 return ret; 399 return ret;
399 400
@@ -401,10 +402,10 @@ static int init_rttest(void)
401 ret = init_test_thread(i); 402 ret = init_test_thread(i);
402 if (ret) 403 if (ret)
403 break; 404 break;
404 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); 405 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
405 if (ret) 406 if (ret)
406 break; 407 break;
407 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); 408 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
408 if (ret) 409 if (ret)
409 break; 410 break;
410 } 411 }
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index f9d8482dd487..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
583 582
584 for (;;) { 583 for (;;) {
585 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
602 601
603 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
604 603
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
609 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
610 605
611 schedule_rt_mutex(lock); 606 schedule_rt_mutex(lock);
612 607
613 if (was_disabled)
614 local_irq_disable();
615
616 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
617 set_current_state(state); 609 set_current_state(state);
618 } 610 }
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492df..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index 0e9344a71be3..33a0676ea744 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -71,6 +70,7 @@
71#include <linux/ctype.h> 70#include <linux/ctype.h>
72#include <linux/ftrace.h> 71#include <linux/ftrace.h>
73#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h>
74 74
75#include <asm/tlb.h> 75#include <asm/tlb.h>
76#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
@@ -79,124 +79,13 @@
79#include <asm/paravirt.h> 79#include <asm/paravirt.h>
80#endif 80#endif
81 81
82#include "sched_cpupri.h" 82#include "sched.h"
83#include "workqueue_sched.h" 83#include "../workqueue_sched.h"
84#include "sched_autogroup.h"
85 84
86#define CREATE_TRACE_POINTS 85#define CREATE_TRACE_POINTS
87#include <trace/events/sched.h> 86#include <trace/events/sched.h>
88 87
89/* 88void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
90 * Convert user-nice values [ -20 ... 0 ... 19 ]
91 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
92 * and back.
93 */
94#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
95#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
96#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
97
98/*
99 * 'User priority' is the nice value converted to something we
100 * can work with better when scaling various scheduler parameters,
101 * it's a [ 0 ... 39 ] range.
102 */
103#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
104#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
105#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
106
107/*
108 * Helpers for converting nanosecond timing to jiffy resolution
109 */
110#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
111
112#define NICE_0_LOAD SCHED_LOAD_SCALE
113#define NICE_0_SHIFT SCHED_LOAD_SHIFT
114
115/*
116 * These are the 'tuning knobs' of the scheduler:
117 *
118 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
119 * Timeslices get refilled after they expire.
120 */
121#define DEF_TIMESLICE (100 * HZ / 1000)
122
123/*
124 * single value that denotes runtime == period, ie unlimited time.
125 */
126#define RUNTIME_INF ((u64)~0ULL)
127
128static inline int rt_policy(int policy)
129{
130 if (policy == SCHED_FIFO || policy == SCHED_RR)
131 return 1;
132 return 0;
133}
134
135static inline int task_has_rt_policy(struct task_struct *p)
136{
137 return rt_policy(p->policy);
138}
139
140/*
141 * This is the priority-queue data structure of the RT scheduling class:
142 */
143struct rt_prio_array {
144 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
145 struct list_head queue[MAX_RT_PRIO];
146};
147
148struct rt_bandwidth {
149 /* nests inside the rq lock: */
150 raw_spinlock_t rt_runtime_lock;
151 ktime_t rt_period;
152 u64 rt_runtime;
153 struct hrtimer rt_period_timer;
154};
155
156static struct rt_bandwidth def_rt_bandwidth;
157
158static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
159
160static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
161{
162 struct rt_bandwidth *rt_b =
163 container_of(timer, struct rt_bandwidth, rt_period_timer);
164 ktime_t now;
165 int overrun;
166 int idle = 0;
167
168 for (;;) {
169 now = hrtimer_cb_get_time(timer);
170 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
171
172 if (!overrun)
173 break;
174
175 idle = do_sched_rt_period_timer(rt_b, overrun);
176 }
177
178 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
179}
180
181static
182void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
183{
184 rt_b->rt_period = ns_to_ktime(period);
185 rt_b->rt_runtime = runtime;
186
187 raw_spin_lock_init(&rt_b->rt_runtime_lock);
188
189 hrtimer_init(&rt_b->rt_period_timer,
190 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
191 rt_b->rt_period_timer.function = sched_rt_period_timer;
192}
193
194static inline int rt_bandwidth_enabled(void)
195{
196 return sysctl_sched_rt_runtime >= 0;
197}
198
199static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 89{
201 unsigned long delta; 90 unsigned long delta;
202 ktime_t soft, hard, now; 91 ktime_t soft, hard, now;
@@ -216,580 +105,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
216 } 105 }
217} 106}
218 107
219static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 108DEFINE_MUTEX(sched_domains_mutex);
220{ 109DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
221 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
222 return;
223
224 if (hrtimer_active(&rt_b->rt_period_timer))
225 return;
226
227 raw_spin_lock(&rt_b->rt_runtime_lock);
228 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
229 raw_spin_unlock(&rt_b->rt_runtime_lock);
230}
231
232#ifdef CONFIG_RT_GROUP_SCHED
233static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
234{
235 hrtimer_cancel(&rt_b->rt_period_timer);
236}
237#endif
238
239/*
240 * sched_domains_mutex serializes calls to init_sched_domains,
241 * detach_destroy_domains and partition_sched_domains.
242 */
243static DEFINE_MUTEX(sched_domains_mutex);
244
245#ifdef CONFIG_CGROUP_SCHED
246
247#include <linux/cgroup.h>
248
249struct cfs_rq;
250
251static LIST_HEAD(task_groups);
252
253struct cfs_bandwidth {
254#ifdef CONFIG_CFS_BANDWIDTH
255 raw_spinlock_t lock;
256 ktime_t period;
257 u64 quota, runtime;
258 s64 hierarchal_quota;
259 u64 runtime_expires;
260
261 int idle, timer_active;
262 struct hrtimer period_timer, slack_timer;
263 struct list_head throttled_cfs_rq;
264
265 /* statistics */
266 int nr_periods, nr_throttled;
267 u64 throttled_time;
268#endif
269};
270
271/* task group related information */
272struct task_group {
273 struct cgroup_subsys_state css;
274
275#ifdef CONFIG_FAIR_GROUP_SCHED
276 /* schedulable entities of this group on each cpu */
277 struct sched_entity **se;
278 /* runqueue "owned" by this group on each cpu */
279 struct cfs_rq **cfs_rq;
280 unsigned long shares;
281
282 atomic_t load_weight;
283#endif
284
285#ifdef CONFIG_RT_GROUP_SCHED
286 struct sched_rt_entity **rt_se;
287 struct rt_rq **rt_rq;
288
289 struct rt_bandwidth rt_bandwidth;
290#endif
291
292 struct rcu_head rcu;
293 struct list_head list;
294
295 struct task_group *parent;
296 struct list_head siblings;
297 struct list_head children;
298
299#ifdef CONFIG_SCHED_AUTOGROUP
300 struct autogroup *autogroup;
301#endif
302
303 struct cfs_bandwidth cfs_bandwidth;
304};
305
306/* task_group_lock serializes the addition/removal of task groups */
307static DEFINE_SPINLOCK(task_group_lock);
308
309#ifdef CONFIG_FAIR_GROUP_SCHED
310
311# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
312
313/*
314 * A weight of 0 or 1 can cause arithmetics problems.
315 * A weight of a cfs_rq is the sum of weights of which entities
316 * are queued on this cfs_rq, so a weight of a entity should not be
317 * too large, so as the shares value of a task group.
318 * (The default weight is 1024 - so there's no practical
319 * limitation from this.)
320 */
321#define MIN_SHARES (1UL << 1)
322#define MAX_SHARES (1UL << 18)
323
324static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
325#endif
326
327/* Default task group.
328 * Every task in system belong to this group at bootup.
329 */
330struct task_group root_task_group;
331
332#endif /* CONFIG_CGROUP_SCHED */
333
334/* CFS-related fields in a runqueue */
335struct cfs_rq {
336 struct load_weight load;
337 unsigned long nr_running, h_nr_running;
338
339 u64 exec_clock;
340 u64 min_vruntime;
341#ifndef CONFIG_64BIT
342 u64 min_vruntime_copy;
343#endif
344
345 struct rb_root tasks_timeline;
346 struct rb_node *rb_leftmost;
347
348 struct list_head tasks;
349 struct list_head *balance_iterator;
350
351 /*
352 * 'curr' points to currently running entity on this cfs_rq.
353 * It is set to NULL otherwise (i.e when none are currently running).
354 */
355 struct sched_entity *curr, *next, *last, *skip;
356
357#ifdef CONFIG_SCHED_DEBUG
358 unsigned int nr_spread_over;
359#endif
360
361#ifdef CONFIG_FAIR_GROUP_SCHED
362 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
363
364 /*
365 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
366 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
367 * (like users, containers etc.)
368 *
369 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
370 * list is used during load balance.
371 */
372 int on_list;
373 struct list_head leaf_cfs_rq_list;
374 struct task_group *tg; /* group that "owns" this runqueue */
375
376#ifdef CONFIG_SMP
377 /*
378 * the part of load.weight contributed by tasks
379 */
380 unsigned long task_weight;
381
382 /*
383 * h_load = weight * f(tg)
384 *
385 * Where f(tg) is the recursive weight fraction assigned to
386 * this group.
387 */
388 unsigned long h_load;
389
390 /*
391 * Maintaining per-cpu shares distribution for group scheduling
392 *
393 * load_stamp is the last time we updated the load average
394 * load_last is the last time we updated the load average and saw load
395 * load_unacc_exec_time is currently unaccounted execution time
396 */
397 u64 load_avg;
398 u64 load_period;
399 u64 load_stamp, load_last, load_unacc_exec_time;
400
401 unsigned long load_contribution;
402#endif
403#ifdef CONFIG_CFS_BANDWIDTH
404 int runtime_enabled;
405 u64 runtime_expires;
406 s64 runtime_remaining;
407
408 u64 throttled_timestamp;
409 int throttled, throttle_count;
410 struct list_head throttled_list;
411#endif
412#endif
413};
414
415#ifdef CONFIG_FAIR_GROUP_SCHED
416#ifdef CONFIG_CFS_BANDWIDTH
417static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
418{
419 return &tg->cfs_bandwidth;
420}
421
422static inline u64 default_cfs_period(void);
423static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
424static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
425
426static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
427{
428 struct cfs_bandwidth *cfs_b =
429 container_of(timer, struct cfs_bandwidth, slack_timer);
430 do_sched_cfs_slack_timer(cfs_b);
431
432 return HRTIMER_NORESTART;
433}
434
435static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
436{
437 struct cfs_bandwidth *cfs_b =
438 container_of(timer, struct cfs_bandwidth, period_timer);
439 ktime_t now;
440 int overrun;
441 int idle = 0;
442
443 for (;;) {
444 now = hrtimer_cb_get_time(timer);
445 overrun = hrtimer_forward(timer, now, cfs_b->period);
446
447 if (!overrun)
448 break;
449
450 idle = do_sched_cfs_period_timer(cfs_b, overrun);
451 }
452
453 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
454}
455
456static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
457{
458 raw_spin_lock_init(&cfs_b->lock);
459 cfs_b->runtime = 0;
460 cfs_b->quota = RUNTIME_INF;
461 cfs_b->period = ns_to_ktime(default_cfs_period());
462
463 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
464 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
465 cfs_b->period_timer.function = sched_cfs_period_timer;
466 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->slack_timer.function = sched_cfs_slack_timer;
468}
469
470static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
471{
472 cfs_rq->runtime_enabled = 0;
473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
474}
475
476/* requires cfs_b->lock, may release to reprogram timer */
477static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
478{
479 /*
480 * The timer may be active because we're trying to set a new bandwidth
481 * period or because we're racing with the tear-down path
482 * (timer_active==0 becomes visible before the hrtimer call-back
483 * terminates). In either case we ensure that it's re-programmed
484 */
485 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
486 raw_spin_unlock(&cfs_b->lock);
487 /* ensure cfs_b->lock is available while we wait */
488 hrtimer_cancel(&cfs_b->period_timer);
489
490 raw_spin_lock(&cfs_b->lock);
491 /* if someone else restarted the timer then we're done */
492 if (cfs_b->timer_active)
493 return;
494 }
495
496 cfs_b->timer_active = 1;
497 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
498}
499
500static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
501{
502 hrtimer_cancel(&cfs_b->period_timer);
503 hrtimer_cancel(&cfs_b->slack_timer);
504}
505#else
506static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
507static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
508static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509
510static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
511{
512 return NULL;
513}
514#endif /* CONFIG_CFS_BANDWIDTH */
515#endif /* CONFIG_FAIR_GROUP_SCHED */
516
517/* Real-Time classes' related field in a runqueue: */
518struct rt_rq {
519 struct rt_prio_array active;
520 unsigned long rt_nr_running;
521#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
522 struct {
523 int curr; /* highest queued rt task prio */
524#ifdef CONFIG_SMP
525 int next; /* next highest */
526#endif
527 } highest_prio;
528#endif
529#ifdef CONFIG_SMP
530 unsigned long rt_nr_migratory;
531 unsigned long rt_nr_total;
532 int overloaded;
533 struct plist_head pushable_tasks;
534#endif
535 int rt_throttled;
536 u64 rt_time;
537 u64 rt_runtime;
538 /* Nests inside the rq lock: */
539 raw_spinlock_t rt_runtime_lock;
540
541#ifdef CONFIG_RT_GROUP_SCHED
542 unsigned long rt_nr_boosted;
543
544 struct rq *rq;
545 struct list_head leaf_rt_rq_list;
546 struct task_group *tg;
547#endif
548};
549
550#ifdef CONFIG_SMP
551
552/*
553 * We add the notion of a root-domain which will be used to define per-domain
554 * variables. Each exclusive cpuset essentially defines an island domain by
555 * fully partitioning the member cpus from any other cpuset. Whenever a new
556 * exclusive cpuset is created, we also create and attach a new root-domain
557 * object.
558 *
559 */
560struct root_domain {
561 atomic_t refcount;
562 atomic_t rto_count;
563 struct rcu_head rcu;
564 cpumask_var_t span;
565 cpumask_var_t online;
566
567 /*
568 * The "RT overload" flag: it gets set if a CPU has more than
569 * one runnable RT task.
570 */
571 cpumask_var_t rto_mask;
572 struct cpupri cpupri;
573};
574
575/*
576 * By default the system creates a single root-domain with all cpus as
577 * members (mimicking the global state we have today).
578 */
579static struct root_domain def_root_domain;
580
581#endif /* CONFIG_SMP */
582
583/*
584 * This is the main, per-CPU runqueue data structure.
585 *
586 * Locking rule: those places that want to lock multiple runqueues
587 * (such as the load balancing or the thread migration code), lock
588 * acquire operations must be ordered by ascending &runqueue.
589 */
590struct rq {
591 /* runqueue lock: */
592 raw_spinlock_t lock;
593
594 /*
595 * nr_running and cpu_load should be in the same cacheline because
596 * remote CPUs use both these fields when doing load calculation.
597 */
598 unsigned long nr_running;
599 #define CPU_LOAD_IDX_MAX 5
600 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
601 unsigned long last_load_update_tick;
602#ifdef CONFIG_NO_HZ
603 u64 nohz_stamp;
604 unsigned char nohz_balance_kick;
605#endif
606 int skip_clock_update;
607
608 /* capture load from *all* tasks on this cpu: */
609 struct load_weight load;
610 unsigned long nr_load_updates;
611 u64 nr_switches;
612
613 struct cfs_rq cfs;
614 struct rt_rq rt;
615
616#ifdef CONFIG_FAIR_GROUP_SCHED
617 /* list of leaf cfs_rq on this cpu: */
618 struct list_head leaf_cfs_rq_list;
619#endif
620#ifdef CONFIG_RT_GROUP_SCHED
621 struct list_head leaf_rt_rq_list;
622#endif
623
624 /*
625 * This is part of a global counter where only the total sum
626 * over all CPUs matters. A task can increase this counter on
627 * one CPU and if it got migrated afterwards it may decrease
628 * it on another CPU. Always updated under the runqueue lock:
629 */
630 unsigned long nr_uninterruptible;
631
632 struct task_struct *curr, *idle, *stop;
633 unsigned long next_balance;
634 struct mm_struct *prev_mm;
635
636 u64 clock;
637 u64 clock_task;
638
639 atomic_t nr_iowait;
640
641#ifdef CONFIG_SMP
642 struct root_domain *rd;
643 struct sched_domain *sd;
644
645 unsigned long cpu_power;
646
647 unsigned char idle_balance;
648 /* For active balancing */
649 int post_schedule;
650 int active_balance;
651 int push_cpu;
652 struct cpu_stop_work active_balance_work;
653 /* cpu of this runqueue: */
654 int cpu;
655 int online;
656
657 u64 rt_avg;
658 u64 age_stamp;
659 u64 idle_stamp;
660 u64 avg_idle;
661#endif
662
663#ifdef CONFIG_IRQ_TIME_ACCOUNTING
664 u64 prev_irq_time;
665#endif
666#ifdef CONFIG_PARAVIRT
667 u64 prev_steal_time;
668#endif
669#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
670 u64 prev_steal_time_rq;
671#endif
672
673 /* calc_load related fields */
674 unsigned long calc_load_update;
675 long calc_load_active;
676
677#ifdef CONFIG_SCHED_HRTICK
678#ifdef CONFIG_SMP
679 int hrtick_csd_pending;
680 struct call_single_data hrtick_csd;
681#endif
682 struct hrtimer hrtick_timer;
683#endif
684
685#ifdef CONFIG_SCHEDSTATS
686 /* latency stats */
687 struct sched_info rq_sched_info;
688 unsigned long long rq_cpu_time;
689 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
690
691 /* sys_sched_yield() stats */
692 unsigned int yld_count;
693
694 /* schedule() stats */
695 unsigned int sched_switch;
696 unsigned int sched_count;
697 unsigned int sched_goidle;
698
699 /* try_to_wake_up() stats */
700 unsigned int ttwu_count;
701 unsigned int ttwu_local;
702#endif
703
704#ifdef CONFIG_SMP
705 struct llist_head wake_list;
706#endif
707};
708
709static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
710
711
712static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
713
714static inline int cpu_of(struct rq *rq)
715{
716#ifdef CONFIG_SMP
717 return rq->cpu;
718#else
719 return 0;
720#endif
721}
722
723#define rcu_dereference_check_sched_domain(p) \
724 rcu_dereference_check((p), \
725 lockdep_is_held(&sched_domains_mutex))
726
727/*
728 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
729 * See detach_destroy_domains: synchronize_sched for details.
730 *
731 * The domain tree of any CPU may only be accessed from within
732 * preempt-disabled sections.
733 */
734#define for_each_domain(cpu, __sd) \
735 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
736
737#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
738#define this_rq() (&__get_cpu_var(runqueues))
739#define task_rq(p) cpu_rq(task_cpu(p))
740#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
741#define raw_rq() (&__raw_get_cpu_var(runqueues))
742
743#ifdef CONFIG_CGROUP_SCHED
744
745/*
746 * Return the group to which this tasks belongs.
747 *
748 * We use task_subsys_state_check() and extend the RCU verification with
749 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
750 * task it moves into the cgroup. Therefore by holding either of those locks,
751 * we pin the task to the current cgroup.
752 */
753static inline struct task_group *task_group(struct task_struct *p)
754{
755 struct task_group *tg;
756 struct cgroup_subsys_state *css;
757
758 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
759 lockdep_is_held(&p->pi_lock) ||
760 lockdep_is_held(&task_rq(p)->lock));
761 tg = container_of(css, struct task_group, css);
762
763 return autogroup_task_group(p, tg);
764}
765
766/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
767static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
768{
769#ifdef CONFIG_FAIR_GROUP_SCHED
770 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
771 p->se.parent = task_group(p)->se[cpu];
772#endif
773
774#ifdef CONFIG_RT_GROUP_SCHED
775 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
776 p->rt.parent = task_group(p)->rt_se[cpu];
777#endif
778}
779
780#else /* CONFIG_CGROUP_SCHED */
781
782static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
783static inline struct task_group *task_group(struct task_struct *p)
784{
785 return NULL;
786}
787
788#endif /* CONFIG_CGROUP_SCHED */
789 110
790static void update_rq_clock_task(struct rq *rq, s64 delta); 111static void update_rq_clock_task(struct rq *rq, s64 delta);
791 112
792static void update_rq_clock(struct rq *rq) 113void update_rq_clock(struct rq *rq)
793{ 114{
794 s64 delta; 115 s64 delta;
795 116
@@ -802,44 +123,14 @@ static void update_rq_clock(struct rq *rq)
802} 123}
803 124
804/* 125/*
805 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
806 */
807#ifdef CONFIG_SCHED_DEBUG
808# define const_debug __read_mostly
809#else
810# define const_debug static const
811#endif
812
813/**
814 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
815 * @cpu: the processor in question.
816 *
817 * This interface allows printk to be called with the runqueue lock
818 * held and know whether or not it is OK to wake up the klogd.
819 */
820int runqueue_is_locked(int cpu)
821{
822 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
823}
824
825/*
826 * Debugging: various feature bits 126 * Debugging: various feature bits
827 */ 127 */
828 128
829#define SCHED_FEAT(name, enabled) \ 129#define SCHED_FEAT(name, enabled) \
830 __SCHED_FEAT_##name ,
831
832enum {
833#include "sched_features.h"
834};
835
836#undef SCHED_FEAT
837
838#define SCHED_FEAT(name, enabled) \
839 (1UL << __SCHED_FEAT_##name) * enabled | 130 (1UL << __SCHED_FEAT_##name) * enabled |
840 131
841const_debug unsigned int sysctl_sched_features = 132const_debug unsigned int sysctl_sched_features =
842#include "sched_features.h" 133#include "features.h"
843 0; 134 0;
844 135
845#undef SCHED_FEAT 136#undef SCHED_FEAT
@@ -849,7 +140,7 @@ const_debug unsigned int sysctl_sched_features =
849 #name , 140 #name ,
850 141
851static __read_mostly char *sched_feat_names[] = { 142static __read_mostly char *sched_feat_names[] = {
852#include "sched_features.h" 143#include "features.h"
853 NULL 144 NULL
854}; 145};
855 146
@@ -859,7 +150,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
859{ 150{
860 int i; 151 int i;
861 152
862 for (i = 0; sched_feat_names[i]; i++) { 153 for (i = 0; i < __SCHED_FEAT_NR; i++) {
863 if (!(sysctl_sched_features & (1UL << i))) 154 if (!(sysctl_sched_features & (1UL << i)))
864 seq_puts(m, "NO_"); 155 seq_puts(m, "NO_");
865 seq_printf(m, "%s ", sched_feat_names[i]); 156 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -869,6 +160,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
869 return 0; 160 return 0;
870} 161}
871 162
163#ifdef HAVE_JUMP_LABEL
164
165#define jump_label_key__true jump_label_key_enabled
166#define jump_label_key__false jump_label_key_disabled
167
168#define SCHED_FEAT(name, enabled) \
169 jump_label_key__##enabled ,
170
171struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
172#include "features.h"
173};
174
175#undef SCHED_FEAT
176
177static void sched_feat_disable(int i)
178{
179 if (jump_label_enabled(&sched_feat_keys[i]))
180 jump_label_dec(&sched_feat_keys[i]);
181}
182
183static void sched_feat_enable(int i)
184{
185 if (!jump_label_enabled(&sched_feat_keys[i]))
186 jump_label_inc(&sched_feat_keys[i]);
187}
188#else
189static void sched_feat_disable(int i) { };
190static void sched_feat_enable(int i) { };
191#endif /* HAVE_JUMP_LABEL */
192
872static ssize_t 193static ssize_t
873sched_feat_write(struct file *filp, const char __user *ubuf, 194sched_feat_write(struct file *filp, const char __user *ubuf,
874 size_t cnt, loff_t *ppos) 195 size_t cnt, loff_t *ppos)
@@ -892,17 +213,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
892 cmp += 3; 213 cmp += 3;
893 } 214 }
894 215
895 for (i = 0; sched_feat_names[i]; i++) { 216 for (i = 0; i < __SCHED_FEAT_NR; i++) {
896 if (strcmp(cmp, sched_feat_names[i]) == 0) { 217 if (strcmp(cmp, sched_feat_names[i]) == 0) {
897 if (neg) 218 if (neg) {
898 sysctl_sched_features &= ~(1UL << i); 219 sysctl_sched_features &= ~(1UL << i);
899 else 220 sched_feat_disable(i);
221 } else {
900 sysctl_sched_features |= (1UL << i); 222 sysctl_sched_features |= (1UL << i);
223 sched_feat_enable(i);
224 }
901 break; 225 break;
902 } 226 }
903 } 227 }
904 228
905 if (!sched_feat_names[i]) 229 if (i == __SCHED_FEAT_NR)
906 return -EINVAL; 230 return -EINVAL;
907 231
908 *ppos += cnt; 232 *ppos += cnt;
@@ -931,10 +255,7 @@ static __init int sched_init_debug(void)
931 return 0; 255 return 0;
932} 256}
933late_initcall(sched_init_debug); 257late_initcall(sched_init_debug);
934 258#endif /* CONFIG_SCHED_DEBUG */
935#endif
936
937#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
938 259
939/* 260/*
940 * Number of tasks to iterate in a single balance run. 261 * Number of tasks to iterate in a single balance run.
@@ -956,7 +277,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
956 */ 277 */
957unsigned int sysctl_sched_rt_period = 1000000; 278unsigned int sysctl_sched_rt_period = 1000000;
958 279
959static __read_mostly int scheduler_running; 280__read_mostly int scheduler_running;
960 281
961/* 282/*
962 * part of the period that we allow rt tasks to run in us. 283 * part of the period that we allow rt tasks to run in us.
@@ -964,112 +285,7 @@ static __read_mostly int scheduler_running;
964 */ 285 */
965int sysctl_sched_rt_runtime = 950000; 286int sysctl_sched_rt_runtime = 950000;
966 287
967static inline u64 global_rt_period(void)
968{
969 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
970}
971
972static inline u64 global_rt_runtime(void)
973{
974 if (sysctl_sched_rt_runtime < 0)
975 return RUNTIME_INF;
976
977 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
978}
979
980#ifndef prepare_arch_switch
981# define prepare_arch_switch(next) do { } while (0)
982#endif
983#ifndef finish_arch_switch
984# define finish_arch_switch(prev) do { } while (0)
985#endif
986
987static inline int task_current(struct rq *rq, struct task_struct *p)
988{
989 return rq->curr == p;
990}
991
992static inline int task_running(struct rq *rq, struct task_struct *p)
993{
994#ifdef CONFIG_SMP
995 return p->on_cpu;
996#else
997 return task_current(rq, p);
998#endif
999}
1000
1001#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1002static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1003{
1004#ifdef CONFIG_SMP
1005 /*
1006 * We can optimise this out completely for !SMP, because the
1007 * SMP rebalancing from interrupt is the only thing that cares
1008 * here.
1009 */
1010 next->on_cpu = 1;
1011#endif
1012}
1013 288
1014static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1015{
1016#ifdef CONFIG_SMP
1017 /*
1018 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1019 * We must ensure this doesn't happen until the switch is completely
1020 * finished.
1021 */
1022 smp_wmb();
1023 prev->on_cpu = 0;
1024#endif
1025#ifdef CONFIG_DEBUG_SPINLOCK
1026 /* this is a valid case when another task releases the spinlock */
1027 rq->lock.owner = current;
1028#endif
1029 /*
1030 * If we are tracking spinlock dependencies then we have to
1031 * fix up the runqueue lock - which gets 'carried over' from
1032 * prev into current:
1033 */
1034 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1035
1036 raw_spin_unlock_irq(&rq->lock);
1037}
1038
1039#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1040static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1041{
1042#ifdef CONFIG_SMP
1043 /*
1044 * We can optimise this out completely for !SMP, because the
1045 * SMP rebalancing from interrupt is the only thing that cares
1046 * here.
1047 */
1048 next->on_cpu = 1;
1049#endif
1050#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1051 raw_spin_unlock_irq(&rq->lock);
1052#else
1053 raw_spin_unlock(&rq->lock);
1054#endif
1055}
1056
1057static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1058{
1059#ifdef CONFIG_SMP
1060 /*
1061 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1062 * We must ensure this doesn't happen until the switch is completely
1063 * finished.
1064 */
1065 smp_wmb();
1066 prev->on_cpu = 0;
1067#endif
1068#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1069 local_irq_enable();
1070#endif
1071}
1072#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1073 289
1074/* 290/*
1075 * __task_rq_lock - lock the rq @p resides on. 291 * __task_rq_lock - lock the rq @p resides on.
@@ -1152,20 +368,6 @@ static struct rq *this_rq_lock(void)
1152 * rq->lock. 368 * rq->lock.
1153 */ 369 */
1154 370
1155/*
1156 * Use hrtick when:
1157 * - enabled by features
1158 * - hrtimer is actually high res
1159 */
1160static inline int hrtick_enabled(struct rq *rq)
1161{
1162 if (!sched_feat(HRTICK))
1163 return 0;
1164 if (!cpu_active(cpu_of(rq)))
1165 return 0;
1166 return hrtimer_is_hres_active(&rq->hrtick_timer);
1167}
1168
1169static void hrtick_clear(struct rq *rq) 371static void hrtick_clear(struct rq *rq)
1170{ 372{
1171 if (hrtimer_active(&rq->hrtick_timer)) 373 if (hrtimer_active(&rq->hrtick_timer))
@@ -1209,7 +411,7 @@ static void __hrtick_start(void *arg)
1209 * 411 *
1210 * called with rq->lock held and irqs disabled 412 * called with rq->lock held and irqs disabled
1211 */ 413 */
1212static void hrtick_start(struct rq *rq, u64 delay) 414void hrtick_start(struct rq *rq, u64 delay)
1213{ 415{
1214 struct hrtimer *timer = &rq->hrtick_timer; 416 struct hrtimer *timer = &rq->hrtick_timer;
1215 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 417 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1253,7 +455,7 @@ static __init void init_hrtick(void)
1253 * 455 *
1254 * called with rq->lock held and irqs disabled 456 * called with rq->lock held and irqs disabled
1255 */ 457 */
1256static void hrtick_start(struct rq *rq, u64 delay) 458void hrtick_start(struct rq *rq, u64 delay)
1257{ 459{
1258 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 460 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1259 HRTIMER_MODE_REL_PINNED, 0); 461 HRTIMER_MODE_REL_PINNED, 0);
@@ -1304,7 +506,7 @@ static inline void init_hrtick(void)
1304#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 506#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1305#endif 507#endif
1306 508
1307static void resched_task(struct task_struct *p) 509void resched_task(struct task_struct *p)
1308{ 510{
1309 int cpu; 511 int cpu;
1310 512
@@ -1325,7 +527,7 @@ static void resched_task(struct task_struct *p)
1325 smp_send_reschedule(cpu); 527 smp_send_reschedule(cpu);
1326} 528}
1327 529
1328static void resched_cpu(int cpu) 530void resched_cpu(int cpu)
1329{ 531{
1330 struct rq *rq = cpu_rq(cpu); 532 struct rq *rq = cpu_rq(cpu);
1331 unsigned long flags; 533 unsigned long flags;
@@ -1406,7 +608,8 @@ void wake_up_idle_cpu(int cpu)
1406 608
1407static inline bool got_nohz_idle_kick(void) 609static inline bool got_nohz_idle_kick(void)
1408{ 610{
1409 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; 611 int cpu = smp_processor_id();
612 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
1410} 613}
1411 614
1412#else /* CONFIG_NO_HZ */ 615#else /* CONFIG_NO_HZ */
@@ -1418,12 +621,7 @@ static inline bool got_nohz_idle_kick(void)
1418 621
1419#endif /* CONFIG_NO_HZ */ 622#endif /* CONFIG_NO_HZ */
1420 623
1421static u64 sched_avg_period(void) 624void sched_avg_update(struct rq *rq)
1422{
1423 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1424}
1425
1426static void sched_avg_update(struct rq *rq)
1427{ 625{
1428 s64 period = sched_avg_period(); 626 s64 period = sched_avg_period();
1429 627
@@ -1439,193 +637,23 @@ static void sched_avg_update(struct rq *rq)
1439 } 637 }
1440} 638}
1441 639
1442static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1443{
1444 rq->rt_avg += rt_delta;
1445 sched_avg_update(rq);
1446}
1447
1448#else /* !CONFIG_SMP */ 640#else /* !CONFIG_SMP */
1449static void resched_task(struct task_struct *p) 641void resched_task(struct task_struct *p)
1450{ 642{
1451 assert_raw_spin_locked(&task_rq(p)->lock); 643 assert_raw_spin_locked(&task_rq(p)->lock);
1452 set_tsk_need_resched(p); 644 set_tsk_need_resched(p);
1453} 645}
1454
1455static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1456{
1457}
1458
1459static void sched_avg_update(struct rq *rq)
1460{
1461}
1462#endif /* CONFIG_SMP */ 646#endif /* CONFIG_SMP */
1463 647
1464#if BITS_PER_LONG == 32
1465# define WMULT_CONST (~0UL)
1466#else
1467# define WMULT_CONST (1UL << 32)
1468#endif
1469
1470#define WMULT_SHIFT 32
1471
1472/*
1473 * Shift right and round:
1474 */
1475#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1476
1477/*
1478 * delta *= weight / lw
1479 */
1480static unsigned long
1481calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1482 struct load_weight *lw)
1483{
1484 u64 tmp;
1485
1486 /*
1487 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1488 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1489 * 2^SCHED_LOAD_RESOLUTION.
1490 */
1491 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1492 tmp = (u64)delta_exec * scale_load_down(weight);
1493 else
1494 tmp = (u64)delta_exec;
1495
1496 if (!lw->inv_weight) {
1497 unsigned long w = scale_load_down(lw->weight);
1498
1499 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1500 lw->inv_weight = 1;
1501 else if (unlikely(!w))
1502 lw->inv_weight = WMULT_CONST;
1503 else
1504 lw->inv_weight = WMULT_CONST / w;
1505 }
1506
1507 /*
1508 * Check whether we'd overflow the 64-bit multiplication:
1509 */
1510 if (unlikely(tmp > WMULT_CONST))
1511 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1512 WMULT_SHIFT/2);
1513 else
1514 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1515
1516 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1517}
1518
1519static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1520{
1521 lw->weight += inc;
1522 lw->inv_weight = 0;
1523}
1524
1525static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1526{
1527 lw->weight -= dec;
1528 lw->inv_weight = 0;
1529}
1530
1531static inline void update_load_set(struct load_weight *lw, unsigned long w)
1532{
1533 lw->weight = w;
1534 lw->inv_weight = 0;
1535}
1536
1537/*
1538 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1539 * of tasks with abnormal "nice" values across CPUs the contribution that
1540 * each task makes to its run queue's load is weighted according to its
1541 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1542 * scaled version of the new time slice allocation that they receive on time
1543 * slice expiry etc.
1544 */
1545
1546#define WEIGHT_IDLEPRIO 3
1547#define WMULT_IDLEPRIO 1431655765
1548
1549/*
1550 * Nice levels are multiplicative, with a gentle 10% change for every
1551 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1552 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1553 * that remained on nice 0.
1554 *
1555 * The "10% effect" is relative and cumulative: from _any_ nice level,
1556 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1557 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1558 * If a task goes up by ~10% and another task goes down by ~10% then
1559 * the relative distance between them is ~25%.)
1560 */
1561static const int prio_to_weight[40] = {
1562 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1563 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1564 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1565 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1566 /* 0 */ 1024, 820, 655, 526, 423,
1567 /* 5 */ 335, 272, 215, 172, 137,
1568 /* 10 */ 110, 87, 70, 56, 45,
1569 /* 15 */ 36, 29, 23, 18, 15,
1570};
1571
1572/*
1573 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1574 *
1575 * In cases where the weight does not change often, we can use the
1576 * precalculated inverse to speed up arithmetics by turning divisions
1577 * into multiplications:
1578 */
1579static const u32 prio_to_wmult[40] = {
1580 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1581 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1582 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1583 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1584 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1585 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1586 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1587 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1588};
1589
1590/* Time spent by the tasks of the cpu accounting group executing in ... */
1591enum cpuacct_stat_index {
1592 CPUACCT_STAT_USER, /* ... user mode */
1593 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1594
1595 CPUACCT_STAT_NSTATS,
1596};
1597
1598#ifdef CONFIG_CGROUP_CPUACCT
1599static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1600static void cpuacct_update_stats(struct task_struct *tsk,
1601 enum cpuacct_stat_index idx, cputime_t val);
1602#else
1603static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1604static inline void cpuacct_update_stats(struct task_struct *tsk,
1605 enum cpuacct_stat_index idx, cputime_t val) {}
1606#endif
1607
1608static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1609{
1610 update_load_add(&rq->load, load);
1611}
1612
1613static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1614{
1615 update_load_sub(&rq->load, load);
1616}
1617
1618#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 648#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1619 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 649 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1620typedef int (*tg_visitor)(struct task_group *, void *);
1621
1622/* 650/*
1623 * Iterate task_group tree rooted at *from, calling @down when first entering a 651 * Iterate task_group tree rooted at *from, calling @down when first entering a
1624 * node and @up when leaving it for the final time. 652 * node and @up when leaving it for the final time.
1625 * 653 *
1626 * Caller must hold rcu_lock or sufficient equivalent. 654 * Caller must hold rcu_lock or sufficient equivalent.
1627 */ 655 */
1628static int walk_tg_tree_from(struct task_group *from, 656int walk_tg_tree_from(struct task_group *from,
1629 tg_visitor down, tg_visitor up, void *data) 657 tg_visitor down, tg_visitor up, void *data)
1630{ 658{
1631 struct task_group *parent, *child; 659 struct task_group *parent, *child;
@@ -1656,270 +684,13 @@ out:
1656 return ret; 684 return ret;
1657} 685}
1658 686
1659/* 687int tg_nop(struct task_group *tg, void *data)
1660 * Iterate the full tree, calling @down when first entering a node and @up when
1661 * leaving it for the final time.
1662 *
1663 * Caller must hold rcu_lock or sufficient equivalent.
1664 */
1665
1666static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1667{
1668 return walk_tg_tree_from(&root_task_group, down, up, data);
1669}
1670
1671static int tg_nop(struct task_group *tg, void *data)
1672{ 688{
1673 return 0; 689 return 0;
1674} 690}
1675#endif 691#endif
1676 692
1677#ifdef CONFIG_SMP 693void update_cpu_load(struct rq *this_rq);
1678/* Used instead of source_load when we know the type == 0 */
1679static unsigned long weighted_cpuload(const int cpu)
1680{
1681 return cpu_rq(cpu)->load.weight;
1682}
1683
1684/*
1685 * Return a low guess at the load of a migration-source cpu weighted
1686 * according to the scheduling class and "nice" value.
1687 *
1688 * We want to under-estimate the load of migration sources, to
1689 * balance conservatively.
1690 */
1691static unsigned long source_load(int cpu, int type)
1692{
1693 struct rq *rq = cpu_rq(cpu);
1694 unsigned long total = weighted_cpuload(cpu);
1695
1696 if (type == 0 || !sched_feat(LB_BIAS))
1697 return total;
1698
1699 return min(rq->cpu_load[type-1], total);
1700}
1701
1702/*
1703 * Return a high guess at the load of a migration-target cpu weighted
1704 * according to the scheduling class and "nice" value.
1705 */
1706static unsigned long target_load(int cpu, int type)
1707{
1708 struct rq *rq = cpu_rq(cpu);
1709 unsigned long total = weighted_cpuload(cpu);
1710
1711 if (type == 0 || !sched_feat(LB_BIAS))
1712 return total;
1713
1714 return max(rq->cpu_load[type-1], total);
1715}
1716
1717static unsigned long power_of(int cpu)
1718{
1719 return cpu_rq(cpu)->cpu_power;
1720}
1721
1722static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1723
1724static unsigned long cpu_avg_load_per_task(int cpu)
1725{
1726 struct rq *rq = cpu_rq(cpu);
1727 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1728
1729 if (nr_running)
1730 return rq->load.weight / nr_running;
1731
1732 return 0;
1733}
1734
1735#ifdef CONFIG_PREEMPT
1736
1737static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1738
1739/*
1740 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1741 * way at the expense of forcing extra atomic operations in all
1742 * invocations. This assures that the double_lock is acquired using the
1743 * same underlying policy as the spinlock_t on this architecture, which
1744 * reduces latency compared to the unfair variant below. However, it
1745 * also adds more overhead and therefore may reduce throughput.
1746 */
1747static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1748 __releases(this_rq->lock)
1749 __acquires(busiest->lock)
1750 __acquires(this_rq->lock)
1751{
1752 raw_spin_unlock(&this_rq->lock);
1753 double_rq_lock(this_rq, busiest);
1754
1755 return 1;
1756}
1757
1758#else
1759/*
1760 * Unfair double_lock_balance: Optimizes throughput at the expense of
1761 * latency by eliminating extra atomic operations when the locks are
1762 * already in proper order on entry. This favors lower cpu-ids and will
1763 * grant the double lock to lower cpus over higher ids under contention,
1764 * regardless of entry order into the function.
1765 */
1766static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1767 __releases(this_rq->lock)
1768 __acquires(busiest->lock)
1769 __acquires(this_rq->lock)
1770{
1771 int ret = 0;
1772
1773 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1774 if (busiest < this_rq) {
1775 raw_spin_unlock(&this_rq->lock);
1776 raw_spin_lock(&busiest->lock);
1777 raw_spin_lock_nested(&this_rq->lock,
1778 SINGLE_DEPTH_NESTING);
1779 ret = 1;
1780 } else
1781 raw_spin_lock_nested(&busiest->lock,
1782 SINGLE_DEPTH_NESTING);
1783 }
1784 return ret;
1785}
1786
1787#endif /* CONFIG_PREEMPT */
1788
1789/*
1790 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1791 */
1792static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1793{
1794 if (unlikely(!irqs_disabled())) {
1795 /* printk() doesn't work good under rq->lock */
1796 raw_spin_unlock(&this_rq->lock);
1797 BUG_ON(1);
1798 }
1799
1800 return _double_lock_balance(this_rq, busiest);
1801}
1802
1803static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1804 __releases(busiest->lock)
1805{
1806 raw_spin_unlock(&busiest->lock);
1807 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1808}
1809
1810/*
1811 * double_rq_lock - safely lock two runqueues
1812 *
1813 * Note this does not disable interrupts like task_rq_lock,
1814 * you need to do so manually before calling.
1815 */
1816static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1817 __acquires(rq1->lock)
1818 __acquires(rq2->lock)
1819{
1820 BUG_ON(!irqs_disabled());
1821 if (rq1 == rq2) {
1822 raw_spin_lock(&rq1->lock);
1823 __acquire(rq2->lock); /* Fake it out ;) */
1824 } else {
1825 if (rq1 < rq2) {
1826 raw_spin_lock(&rq1->lock);
1827 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1828 } else {
1829 raw_spin_lock(&rq2->lock);
1830 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1831 }
1832 }
1833}
1834
1835/*
1836 * double_rq_unlock - safely unlock two runqueues
1837 *
1838 * Note this does not restore interrupts like task_rq_unlock,
1839 * you need to do so manually after calling.
1840 */
1841static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1842 __releases(rq1->lock)
1843 __releases(rq2->lock)
1844{
1845 raw_spin_unlock(&rq1->lock);
1846 if (rq1 != rq2)
1847 raw_spin_unlock(&rq2->lock);
1848 else
1849 __release(rq2->lock);
1850}
1851
1852#else /* CONFIG_SMP */
1853
1854/*
1855 * double_rq_lock - safely lock two runqueues
1856 *
1857 * Note this does not disable interrupts like task_rq_lock,
1858 * you need to do so manually before calling.
1859 */
1860static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1861 __acquires(rq1->lock)
1862 __acquires(rq2->lock)
1863{
1864 BUG_ON(!irqs_disabled());
1865 BUG_ON(rq1 != rq2);
1866 raw_spin_lock(&rq1->lock);
1867 __acquire(rq2->lock); /* Fake it out ;) */
1868}
1869
1870/*
1871 * double_rq_unlock - safely unlock two runqueues
1872 *
1873 * Note this does not restore interrupts like task_rq_unlock,
1874 * you need to do so manually after calling.
1875 */
1876static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1877 __releases(rq1->lock)
1878 __releases(rq2->lock)
1879{
1880 BUG_ON(rq1 != rq2);
1881 raw_spin_unlock(&rq1->lock);
1882 __release(rq2->lock);
1883}
1884
1885#endif
1886
1887static void calc_load_account_idle(struct rq *this_rq);
1888static void update_sysctl(void);
1889static int get_update_sysctl_factor(void);
1890static void update_cpu_load(struct rq *this_rq);
1891
1892static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1893{
1894 set_task_rq(p, cpu);
1895#ifdef CONFIG_SMP
1896 /*
1897 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1898 * successfully executed on another CPU. We must ensure that updates of
1899 * per-task data have been completed by this moment.
1900 */
1901 smp_wmb();
1902 task_thread_info(p)->cpu = cpu;
1903#endif
1904}
1905
1906static const struct sched_class rt_sched_class;
1907
1908#define sched_class_highest (&stop_sched_class)
1909#define for_each_class(class) \
1910 for (class = sched_class_highest; class; class = class->next)
1911
1912#include "sched_stats.h"
1913
1914static void inc_nr_running(struct rq *rq)
1915{
1916 rq->nr_running++;
1917}
1918
1919static void dec_nr_running(struct rq *rq)
1920{
1921 rq->nr_running--;
1922}
1923 694
1924static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
1925{ 696{
@@ -1953,10 +724,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1953 p->sched_class->dequeue_task(rq, p, flags); 724 p->sched_class->dequeue_task(rq, p, flags);
1954} 725}
1955 726
1956/* 727void activate_task(struct rq *rq, struct task_struct *p, int flags)
1957 * activate_task - move a task to the runqueue.
1958 */
1959static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1960{ 728{
1961 if (task_contributes_to_load(p)) 729 if (task_contributes_to_load(p))
1962 rq->nr_uninterruptible--; 730 rq->nr_uninterruptible--;
@@ -1964,10 +732,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1964 enqueue_task(rq, p, flags); 732 enqueue_task(rq, p, flags);
1965} 733}
1966 734
1967/* 735void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1968 * deactivate_task - remove a task from the runqueue.
1969 */
1970static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1971{ 736{
1972 if (task_contributes_to_load(p)) 737 if (task_contributes_to_load(p))
1973 rq->nr_uninterruptible++; 738 rq->nr_uninterruptible++;
@@ -2158,14 +923,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2158#ifdef CONFIG_IRQ_TIME_ACCOUNTING 923#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2159static int irqtime_account_hi_update(void) 924static int irqtime_account_hi_update(void)
2160{ 925{
2161 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 926 u64 *cpustat = kcpustat_this_cpu->cpustat;
2162 unsigned long flags; 927 unsigned long flags;
2163 u64 latest_ns; 928 u64 latest_ns;
2164 int ret = 0; 929 int ret = 0;
2165 930
2166 local_irq_save(flags); 931 local_irq_save(flags);
2167 latest_ns = this_cpu_read(cpu_hardirq_time); 932 latest_ns = this_cpu_read(cpu_hardirq_time);
2168 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2169 ret = 1; 934 ret = 1;
2170 local_irq_restore(flags); 935 local_irq_restore(flags);
2171 return ret; 936 return ret;
@@ -2173,14 +938,14 @@ static int irqtime_account_hi_update(void)
2173 938
2174static int irqtime_account_si_update(void) 939static int irqtime_account_si_update(void)
2175{ 940{
2176 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 941 u64 *cpustat = kcpustat_this_cpu->cpustat;
2177 unsigned long flags; 942 unsigned long flags;
2178 u64 latest_ns; 943 u64 latest_ns;
2179 int ret = 0; 944 int ret = 0;
2180 945
2181 local_irq_save(flags); 946 local_irq_save(flags);
2182 latest_ns = this_cpu_read(cpu_softirq_time); 947 latest_ns = this_cpu_read(cpu_softirq_time);
2183 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2184 ret = 1; 949 ret = 1;
2185 local_irq_restore(flags); 950 local_irq_restore(flags);
2186 return ret; 951 return ret;
@@ -2192,15 +957,6 @@ static int irqtime_account_si_update(void)
2192 957
2193#endif 958#endif
2194 959
2195#include "sched_idletask.c"
2196#include "sched_fair.c"
2197#include "sched_rt.c"
2198#include "sched_autogroup.c"
2199#include "sched_stoptask.c"
2200#ifdef CONFIG_SCHED_DEBUG
2201# include "sched_debug.c"
2202#endif
2203
2204void sched_set_stop_task(int cpu, struct task_struct *stop) 960void sched_set_stop_task(int cpu, struct task_struct *stop)
2205{ 961{
2206 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2298,7 +1054,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2298 p->sched_class->prio_changed(rq, p, oldprio); 1054 p->sched_class->prio_changed(rq, p, oldprio);
2299} 1055}
2300 1056
2301static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1057void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2302{ 1058{
2303 const struct sched_class *class; 1059 const struct sched_class *class;
2304 1060
@@ -2324,38 +1080,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2324} 1080}
2325 1081
2326#ifdef CONFIG_SMP 1082#ifdef CONFIG_SMP
2327/*
2328 * Is this task likely cache-hot:
2329 */
2330static int
2331task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2332{
2333 s64 delta;
2334
2335 if (p->sched_class != &fair_sched_class)
2336 return 0;
2337
2338 if (unlikely(p->policy == SCHED_IDLE))
2339 return 0;
2340
2341 /*
2342 * Buddy candidates are cache hot:
2343 */
2344 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2345 (&p->se == cfs_rq_of(&p->se)->next ||
2346 &p->se == cfs_rq_of(&p->se)->last))
2347 return 1;
2348
2349 if (sysctl_sched_migration_cost == -1)
2350 return 1;
2351 if (sysctl_sched_migration_cost == 0)
2352 return 0;
2353
2354 delta = now - p->se.exec_start;
2355
2356 return delta < (s64)sysctl_sched_migration_cost;
2357}
2358
2359void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1083void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2360{ 1084{
2361#ifdef CONFIG_SCHED_DEBUG 1085#ifdef CONFIG_SCHED_DEBUG
@@ -2782,6 +1506,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2782 1506
2783} 1507}
2784#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1508#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1509
1510static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1511{
1512 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1513}
2785#endif /* CONFIG_SMP */ 1514#endif /* CONFIG_SMP */
2786 1515
2787static void ttwu_queue(struct task_struct *p, int cpu) 1516static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2789,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2789 struct rq *rq = cpu_rq(cpu); 1518 struct rq *rq = cpu_rq(cpu);
2790 1519
2791#if defined(CONFIG_SMP) 1520#if defined(CONFIG_SMP)
2792 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1521 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2793 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1522 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2794 ttwu_queue_remote(p, cpu); 1523 ttwu_queue_remote(p, cpu);
2795 return; 1524 return;
@@ -3438,7 +2167,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3438 */ 2167 */
3439static atomic_long_t calc_load_tasks_idle; 2168static atomic_long_t calc_load_tasks_idle;
3440 2169
3441static void calc_load_account_idle(struct rq *this_rq) 2170void calc_load_account_idle(struct rq *this_rq)
3442{ 2171{
3443 long delta; 2172 long delta;
3444 2173
@@ -3582,7 +2311,7 @@ static void calc_global_nohz(unsigned long ticks)
3582 */ 2311 */
3583} 2312}
3584#else 2313#else
3585static void calc_load_account_idle(struct rq *this_rq) 2314void calc_load_account_idle(struct rq *this_rq)
3586{ 2315{
3587} 2316}
3588 2317
@@ -3725,7 +2454,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3725 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2454 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3726 * every tick. We fix it up based on jiffies. 2455 * every tick. We fix it up based on jiffies.
3727 */ 2456 */
3728static void update_cpu_load(struct rq *this_rq) 2457void update_cpu_load(struct rq *this_rq)
3729{ 2458{
3730 unsigned long this_load = this_rq->load.weight; 2459 unsigned long this_load = this_rq->load.weight;
3731 unsigned long curr_jiffies = jiffies; 2460 unsigned long curr_jiffies = jiffies;
@@ -3803,8 +2532,10 @@ unlock:
3803#endif 2532#endif
3804 2533
3805DEFINE_PER_CPU(struct kernel_stat, kstat); 2534DEFINE_PER_CPU(struct kernel_stat, kstat);
2535DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3806 2536
3807EXPORT_PER_CPU_SYMBOL(kstat); 2537EXPORT_PER_CPU_SYMBOL(kstat);
2538EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3808 2539
3809/* 2540/*
3810 * Return any ns on the sched_clock that have not yet been accounted in 2541 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3857,6 +2588,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3857 return ns; 2588 return ns;
3858} 2589}
3859 2590
2591#ifdef CONFIG_CGROUP_CPUACCT
2592struct cgroup_subsys cpuacct_subsys;
2593struct cpuacct root_cpuacct;
2594#endif
2595
2596static inline void task_group_account_field(struct task_struct *p, int index,
2597 u64 tmp)
2598{
2599#ifdef CONFIG_CGROUP_CPUACCT
2600 struct kernel_cpustat *kcpustat;
2601 struct cpuacct *ca;
2602#endif
2603 /*
2604 * Since all updates are sure to touch the root cgroup, we
2605 * get ourselves ahead and touch it first. If the root cgroup
2606 * is the only cgroup, then nothing else should be necessary.
2607 *
2608 */
2609 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2610
2611#ifdef CONFIG_CGROUP_CPUACCT
2612 if (unlikely(!cpuacct_subsys.active))
2613 return;
2614
2615 rcu_read_lock();
2616 ca = task_ca(p);
2617 while (ca && (ca != &root_cpuacct)) {
2618 kcpustat = this_cpu_ptr(ca->cpustat);
2619 kcpustat->cpustat[index] += tmp;
2620 ca = parent_ca(ca);
2621 }
2622 rcu_read_unlock();
2623#endif
2624}
2625
2626
3860/* 2627/*
3861 * Account user cpu time to a process. 2628 * Account user cpu time to a process.
3862 * @p: the process that the cpu time gets accounted to 2629 * @p: the process that the cpu time gets accounted to
@@ -3866,22 +2633,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3866void account_user_time(struct task_struct *p, cputime_t cputime, 2633void account_user_time(struct task_struct *p, cputime_t cputime,
3867 cputime_t cputime_scaled) 2634 cputime_t cputime_scaled)
3868{ 2635{
3869 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2636 int index;
3870 cputime64_t tmp;
3871 2637
3872 /* Add user time to process. */ 2638 /* Add user time to process. */
3873 p->utime = cputime_add(p->utime, cputime); 2639 p->utime += cputime;
3874 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2640 p->utimescaled += cputime_scaled;
3875 account_group_user_time(p, cputime); 2641 account_group_user_time(p, cputime);
3876 2642
2643 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2644
3877 /* Add user time to cpustat. */ 2645 /* Add user time to cpustat. */
3878 tmp = cputime_to_cputime64(cputime); 2646 task_group_account_field(p, index, (__force u64) cputime);
3879 if (TASK_NICE(p) > 0)
3880 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3881 else
3882 cpustat->user = cputime64_add(cpustat->user, tmp);
3883 2647
3884 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3885 /* Account for user time used */ 2648 /* Account for user time used */
3886 acct_update_integrals(p); 2649 acct_update_integrals(p);
3887} 2650}
@@ -3895,24 +2658,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3895static void account_guest_time(struct task_struct *p, cputime_t cputime, 2658static void account_guest_time(struct task_struct *p, cputime_t cputime,
3896 cputime_t cputime_scaled) 2659 cputime_t cputime_scaled)
3897{ 2660{
3898 cputime64_t tmp; 2661 u64 *cpustat = kcpustat_this_cpu->cpustat;
3899 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3900
3901 tmp = cputime_to_cputime64(cputime);
3902 2662
3903 /* Add guest time to process. */ 2663 /* Add guest time to process. */
3904 p->utime = cputime_add(p->utime, cputime); 2664 p->utime += cputime;
3905 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2665 p->utimescaled += cputime_scaled;
3906 account_group_user_time(p, cputime); 2666 account_group_user_time(p, cputime);
3907 p->gtime = cputime_add(p->gtime, cputime); 2667 p->gtime += cputime;
3908 2668
3909 /* Add guest time to cpustat. */ 2669 /* Add guest time to cpustat. */
3910 if (TASK_NICE(p) > 0) { 2670 if (TASK_NICE(p) > 0) {
3911 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2671 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3912 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2672 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3913 } else { 2673 } else {
3914 cpustat->user = cputime64_add(cpustat->user, tmp); 2674 cpustat[CPUTIME_USER] += (__force u64) cputime;
3915 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2675 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3916 } 2676 }
3917} 2677}
3918 2678
@@ -3925,18 +2685,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3925 */ 2685 */
3926static inline 2686static inline
3927void __account_system_time(struct task_struct *p, cputime_t cputime, 2687void __account_system_time(struct task_struct *p, cputime_t cputime,
3928 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2688 cputime_t cputime_scaled, int index)
3929{ 2689{
3930 cputime64_t tmp = cputime_to_cputime64(cputime);
3931
3932 /* Add system time to process. */ 2690 /* Add system time to process. */
3933 p->stime = cputime_add(p->stime, cputime); 2691 p->stime += cputime;
3934 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2692 p->stimescaled += cputime_scaled;
3935 account_group_system_time(p, cputime); 2693 account_group_system_time(p, cputime);
3936 2694
3937 /* Add system time to cpustat. */ 2695 /* Add system time to cpustat. */
3938 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2696 task_group_account_field(p, index, (__force u64) cputime);
3939 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3940 2697
3941 /* Account for system time used */ 2698 /* Account for system time used */
3942 acct_update_integrals(p); 2699 acct_update_integrals(p);
@@ -3952,8 +2709,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3952void account_system_time(struct task_struct *p, int hardirq_offset, 2709void account_system_time(struct task_struct *p, int hardirq_offset,
3953 cputime_t cputime, cputime_t cputime_scaled) 2710 cputime_t cputime, cputime_t cputime_scaled)
3954{ 2711{
3955 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2712 int index;
3956 cputime64_t *target_cputime64;
3957 2713
3958 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2714 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3959 account_guest_time(p, cputime, cputime_scaled); 2715 account_guest_time(p, cputime, cputime_scaled);
@@ -3961,13 +2717,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3961 } 2717 }
3962 2718
3963 if (hardirq_count() - hardirq_offset) 2719 if (hardirq_count() - hardirq_offset)
3964 target_cputime64 = &cpustat->irq; 2720 index = CPUTIME_IRQ;
3965 else if (in_serving_softirq()) 2721 else if (in_serving_softirq())
3966 target_cputime64 = &cpustat->softirq; 2722 index = CPUTIME_SOFTIRQ;
3967 else 2723 else
3968 target_cputime64 = &cpustat->system; 2724 index = CPUTIME_SYSTEM;
3969 2725
3970 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2726 __account_system_time(p, cputime, cputime_scaled, index);
3971} 2727}
3972 2728
3973/* 2729/*
@@ -3976,10 +2732,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3976 */ 2732 */
3977void account_steal_time(cputime_t cputime) 2733void account_steal_time(cputime_t cputime)
3978{ 2734{
3979 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2735 u64 *cpustat = kcpustat_this_cpu->cpustat;
3980 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3981 2736
3982 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2737 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3983} 2738}
3984 2739
3985/* 2740/*
@@ -3988,14 +2743,13 @@ void account_steal_time(cputime_t cputime)
3988 */ 2743 */
3989void account_idle_time(cputime_t cputime) 2744void account_idle_time(cputime_t cputime)
3990{ 2745{
3991 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2746 u64 *cpustat = kcpustat_this_cpu->cpustat;
3992 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3993 struct rq *rq = this_rq(); 2747 struct rq *rq = this_rq();
3994 2748
3995 if (atomic_read(&rq->nr_iowait) > 0) 2749 if (atomic_read(&rq->nr_iowait) > 0)
3996 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2750 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3997 else 2751 else
3998 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2752 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
3999} 2753}
4000 2754
4001static __always_inline bool steal_account_process_tick(void) 2755static __always_inline bool steal_account_process_tick(void)
@@ -4045,16 +2799,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4045 struct rq *rq) 2799 struct rq *rq)
4046{ 2800{
4047 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2801 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4048 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2802 u64 *cpustat = kcpustat_this_cpu->cpustat;
4049 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4050 2803
4051 if (steal_account_process_tick()) 2804 if (steal_account_process_tick())
4052 return; 2805 return;
4053 2806
4054 if (irqtime_account_hi_update()) { 2807 if (irqtime_account_hi_update()) {
4055 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2808 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
4056 } else if (irqtime_account_si_update()) { 2809 } else if (irqtime_account_si_update()) {
4057 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2810 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
4058 } else if (this_cpu_ksoftirqd() == p) { 2811 } else if (this_cpu_ksoftirqd() == p) {
4059 /* 2812 /*
4060 * ksoftirqd time do not get accounted in cpu_softirq_time. 2813 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -4062,7 +2815,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4062 * Also, p->stime needs to be updated for ksoftirqd. 2815 * Also, p->stime needs to be updated for ksoftirqd.
4063 */ 2816 */
4064 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2817 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4065 &cpustat->softirq); 2818 CPUTIME_SOFTIRQ);
4066 } else if (user_tick) { 2819 } else if (user_tick) {
4067 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2820 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4068 } else if (p == rq->idle) { 2821 } else if (p == rq->idle) {
@@ -4071,7 +2824,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4071 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2824 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4072 } else { 2825 } else {
4073 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2826 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4074 &cpustat->system); 2827 CPUTIME_SYSTEM);
4075 } 2828 }
4076} 2829}
4077 2830
@@ -4170,7 +2923,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4170 2923
4171void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2924void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4172{ 2925{
4173 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2926 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4174 2927
4175 /* 2928 /*
4176 * Use CFS's precise accounting: 2929 * Use CFS's precise accounting:
@@ -4178,11 +2931,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4178 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2931 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4179 2932
4180 if (total) { 2933 if (total) {
4181 u64 temp = rtime; 2934 u64 temp = (__force u64) rtime;
4182 2935
4183 temp *= utime; 2936 temp *= (__force u64) utime;
4184 do_div(temp, total); 2937 do_div(temp, (__force u32) total);
4185 utime = (cputime_t)temp; 2938 utime = (__force cputime_t) temp;
4186 } else 2939 } else
4187 utime = rtime; 2940 utime = rtime;
4188 2941
@@ -4190,7 +2943,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4190 * Compare with previous values, to keep monotonicity: 2943 * Compare with previous values, to keep monotonicity:
4191 */ 2944 */
4192 p->prev_utime = max(p->prev_utime, utime); 2945 p->prev_utime = max(p->prev_utime, utime);
4193 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2946 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4194 2947
4195 *ut = p->prev_utime; 2948 *ut = p->prev_utime;
4196 *st = p->prev_stime; 2949 *st = p->prev_stime;
@@ -4207,21 +2960,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4207 2960
4208 thread_group_cputime(p, &cputime); 2961 thread_group_cputime(p, &cputime);
4209 2962
4210 total = cputime_add(cputime.utime, cputime.stime); 2963 total = cputime.utime + cputime.stime;
4211 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2964 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4212 2965
4213 if (total) { 2966 if (total) {
4214 u64 temp = rtime; 2967 u64 temp = (__force u64) rtime;
4215 2968
4216 temp *= cputime.utime; 2969 temp *= (__force u64) cputime.utime;
4217 do_div(temp, total); 2970 do_div(temp, (__force u32) total);
4218 utime = (cputime_t)temp; 2971 utime = (__force cputime_t) temp;
4219 } else 2972 } else
4220 utime = rtime; 2973 utime = rtime;
4221 2974
4222 sig->prev_utime = max(sig->prev_utime, utime); 2975 sig->prev_utime = max(sig->prev_utime, utime);
4223 sig->prev_stime = max(sig->prev_stime, 2976 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4224 cputime_sub(rtime, sig->prev_utime));
4225 2977
4226 *ut = sig->prev_utime; 2978 *ut = sig->prev_utime;
4227 *st = sig->prev_stime; 2979 *st = sig->prev_stime;
@@ -4320,6 +3072,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4320{ 3072{
4321 struct pt_regs *regs = get_irq_regs(); 3073 struct pt_regs *regs = get_irq_regs();
4322 3074
3075 if (oops_in_progress)
3076 return;
3077
4323 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3078 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4324 prev->comm, prev->pid, preempt_count()); 3079 prev->comm, prev->pid, preempt_count());
4325 3080
@@ -4810,6 +3565,9 @@ EXPORT_SYMBOL(wait_for_completion);
4810 * This waits for either a completion of a specific task to be signaled or for a 3565 * This waits for either a completion of a specific task to be signaled or for a
4811 * specified timeout to expire. The timeout is in jiffies. It is not 3566 * specified timeout to expire. The timeout is in jiffies. It is not
4812 * interruptible. 3567 * interruptible.
3568 *
3569 * The return value is 0 if timed out, and positive (at least 1, or number of
3570 * jiffies left till timeout) if completed.
4813 */ 3571 */
4814unsigned long __sched 3572unsigned long __sched
4815wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3573wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +3582,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
4824 * 3582 *
4825 * This waits for completion of a specific task to be signaled. It is 3583 * This waits for completion of a specific task to be signaled. It is
4826 * interruptible. 3584 * interruptible.
3585 *
3586 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4827 */ 3587 */
4828int __sched wait_for_completion_interruptible(struct completion *x) 3588int __sched wait_for_completion_interruptible(struct completion *x)
4829{ 3589{
@@ -4841,6 +3601,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4841 * 3601 *
4842 * This waits for either a completion of a specific task to be signaled or for a 3602 * This waits for either a completion of a specific task to be signaled or for a
4843 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 3603 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3604 *
3605 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3606 * positive (at least 1, or number of jiffies left till timeout) if completed.
4844 */ 3607 */
4845long __sched 3608long __sched
4846wait_for_completion_interruptible_timeout(struct completion *x, 3609wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +3619,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4856 * 3619 *
4857 * This waits to be signaled for completion of a specific task. It can be 3620 * This waits to be signaled for completion of a specific task. It can be
4858 * interrupted by a kill signal. 3621 * interrupted by a kill signal.
3622 *
3623 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4859 */ 3624 */
4860int __sched wait_for_completion_killable(struct completion *x) 3625int __sched wait_for_completion_killable(struct completion *x)
4861{ 3626{
@@ -4874,6 +3639,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4874 * This waits for either a completion of a specific task to be 3639 * This waits for either a completion of a specific task to be
4875 * signaled or for a specified timeout to expire. It can be 3640 * signaled or for a specified timeout to expire. It can be
4876 * interrupted by a kill signal. The timeout is in jiffies. 3641 * interrupted by a kill signal. The timeout is in jiffies.
3642 *
3643 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3644 * positive (at least 1, or number of jiffies left till timeout) if completed.
4877 */ 3645 */
4878long __sched 3646long __sched
4879wait_for_completion_killable_timeout(struct completion *x, 3647wait_for_completion_killable_timeout(struct completion *x,
@@ -5360,7 +4128,7 @@ recheck:
5360 on_rq = p->on_rq; 4128 on_rq = p->on_rq;
5361 running = task_current(rq, p); 4129 running = task_current(rq, p);
5362 if (on_rq) 4130 if (on_rq)
5363 deactivate_task(rq, p, 0); 4131 dequeue_task(rq, p, 0);
5364 if (running) 4132 if (running)
5365 p->sched_class->put_prev_task(rq, p); 4133 p->sched_class->put_prev_task(rq, p);
5366 4134
@@ -5373,7 +4141,7 @@ recheck:
5373 if (running) 4141 if (running)
5374 p->sched_class->set_curr_task(rq); 4142 p->sched_class->set_curr_task(rq);
5375 if (on_rq) 4143 if (on_rq)
5376 activate_task(rq, p, 0); 4144 enqueue_task(rq, p, 0);
5377 4145
5378 check_class_changed(rq, p, prev_class, oldprio); 4146 check_class_changed(rq, p, prev_class, oldprio);
5379 task_rq_unlock(rq, p, &flags); 4147 task_rq_unlock(rq, p, &flags);
@@ -5556,7 +4324,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5556 goto out_free_cpus_allowed; 4324 goto out_free_cpus_allowed;
5557 } 4325 }
5558 retval = -EPERM; 4326 retval = -EPERM;
5559 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) 4327 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
5560 goto out_unlock; 4328 goto out_unlock;
5561 4329
5562 retval = security_task_setscheduler(p); 4330 retval = security_task_setscheduler(p);
@@ -5838,6 +4606,13 @@ again:
5838 */ 4606 */
5839 if (preempt && rq != p_rq) 4607 if (preempt && rq != p_rq)
5840 resched_task(p_rq->curr); 4608 resched_task(p_rq->curr);
4609 } else {
4610 /*
4611 * We might have set it in task_yield_fair(), but are
4612 * not going to schedule(), so don't want to skip
4613 * the next update.
4614 */
4615 rq->skip_clock_update = 0;
5841 } 4616 }
5842 4617
5843out: 4618out:
@@ -6005,7 +4780,7 @@ void sched_show_task(struct task_struct *p)
6005 free = stack_not_used(p); 4780 free = stack_not_used(p);
6006#endif 4781#endif
6007 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4782 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6008 task_pid_nr(p), task_pid_nr(p->real_parent), 4783 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
6009 (unsigned long)task_thread_info(p)->flags); 4784 (unsigned long)task_thread_info(p)->flags);
6010 4785
6011 show_stack(p, NULL); 4786 show_stack(p, NULL);
@@ -6099,53 +4874,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6099 */ 4874 */
6100 idle->sched_class = &idle_sched_class; 4875 idle->sched_class = &idle_sched_class;
6101 ftrace_graph_init_idle_task(idle, cpu); 4876 ftrace_graph_init_idle_task(idle, cpu);
6102} 4877#if defined(CONFIG_SMP)
6103 4878 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6104/* 4879#endif
6105 * Increase the granularity value when there are more CPUs,
6106 * because with more CPUs the 'effective latency' as visible
6107 * to users decreases. But the relationship is not linear,
6108 * so pick a second-best guess by going with the log2 of the
6109 * number of CPUs.
6110 *
6111 * This idea comes from the SD scheduler of Con Kolivas:
6112 */
6113static int get_update_sysctl_factor(void)
6114{
6115 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6116 unsigned int factor;
6117
6118 switch (sysctl_sched_tunable_scaling) {
6119 case SCHED_TUNABLESCALING_NONE:
6120 factor = 1;
6121 break;
6122 case SCHED_TUNABLESCALING_LINEAR:
6123 factor = cpus;
6124 break;
6125 case SCHED_TUNABLESCALING_LOG:
6126 default:
6127 factor = 1 + ilog2(cpus);
6128 break;
6129 }
6130
6131 return factor;
6132}
6133
6134static void update_sysctl(void)
6135{
6136 unsigned int factor = get_update_sysctl_factor();
6137
6138#define SET_SYSCTL(name) \
6139 (sysctl_##name = (factor) * normalized_sysctl_##name)
6140 SET_SYSCTL(sched_min_granularity);
6141 SET_SYSCTL(sched_latency);
6142 SET_SYSCTL(sched_wakeup_granularity);
6143#undef SET_SYSCTL
6144}
6145
6146static inline void sched_init_granularity(void)
6147{
6148 update_sysctl();
6149} 4880}
6150 4881
6151#ifdef CONFIG_SMP 4882#ifdef CONFIG_SMP
@@ -6261,9 +4992,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6261 * placed properly. 4992 * placed properly.
6262 */ 4993 */
6263 if (p->on_rq) { 4994 if (p->on_rq) {
6264 deactivate_task(rq_src, p, 0); 4995 dequeue_task(rq_src, p, 0);
6265 set_task_cpu(p, dest_cpu); 4996 set_task_cpu(p, dest_cpu);
6266 activate_task(rq_dest, p, 0); 4997 enqueue_task(rq_dest, p, 0);
6267 check_preempt_curr(rq_dest, p, 0); 4998 check_preempt_curr(rq_dest, p, 0);
6268 } 4999 }
6269done: 5000done:
@@ -6334,30 +5065,6 @@ static void calc_global_load_remove(struct rq *rq)
6334 rq->calc_load_active = 0; 5065 rq->calc_load_active = 0;
6335} 5066}
6336 5067
6337#ifdef CONFIG_CFS_BANDWIDTH
6338static void unthrottle_offline_cfs_rqs(struct rq *rq)
6339{
6340 struct cfs_rq *cfs_rq;
6341
6342 for_each_leaf_cfs_rq(rq, cfs_rq) {
6343 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6344
6345 if (!cfs_rq->runtime_enabled)
6346 continue;
6347
6348 /*
6349 * clock_task is not advancing so we just need to make sure
6350 * there's some valid quota amount
6351 */
6352 cfs_rq->runtime_remaining = cfs_b->quota;
6353 if (cfs_rq_throttled(cfs_rq))
6354 unthrottle_cfs_rq(cfs_rq);
6355 }
6356}
6357#else
6358static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6359#endif
6360
6361/* 5068/*
6362 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5069 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6363 * try_to_wake_up()->select_task_rq(). 5070 * try_to_wake_up()->select_task_rq().
@@ -6463,7 +5170,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
6463static void 5170static void
6464set_table_entry(struct ctl_table *entry, 5171set_table_entry(struct ctl_table *entry,
6465 const char *procname, void *data, int maxlen, 5172 const char *procname, void *data, int maxlen,
6466 mode_t mode, proc_handler *proc_handler) 5173 umode_t mode, proc_handler *proc_handler)
6467{ 5174{
6468 entry->procname = procname; 5175 entry->procname = procname;
6469 entry->data = data; 5176 entry->data = data;
@@ -6963,6 +5670,12 @@ out:
6963 return -ENOMEM; 5670 return -ENOMEM;
6964} 5671}
6965 5672
5673/*
5674 * By default the system creates a single root-domain with all cpus as
5675 * members (mimicking the global state we have today).
5676 */
5677struct root_domain def_root_domain;
5678
6966static void init_defrootdomain(void) 5679static void init_defrootdomain(void)
6967{ 5680{
6968 init_rootdomain(&def_root_domain); 5681 init_rootdomain(&def_root_domain);
@@ -7034,6 +5747,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7034} 5747}
7035 5748
7036/* 5749/*
5750 * Keep a special pointer to the highest sched_domain that has
5751 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
5752 * allows us to avoid some pointer chasing select_idle_sibling().
5753 *
5754 * Also keep a unique ID per domain (we use the first cpu number in
5755 * the cpumask of the domain), this allows us to quickly tell if
5756 * two cpus are in the same cache domain, see ttwu_share_cache().
5757 */
5758DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5759DEFINE_PER_CPU(int, sd_llc_id);
5760
5761static void update_top_cache_domain(int cpu)
5762{
5763 struct sched_domain *sd;
5764 int id = cpu;
5765
5766 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5767 if (sd)
5768 id = cpumask_first(sched_domain_span(sd));
5769
5770 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5771 per_cpu(sd_llc_id, cpu) = id;
5772}
5773
5774/*
7037 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5775 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7038 * hold the hotplug lock. 5776 * hold the hotplug lock.
7039 */ 5777 */
@@ -7072,6 +5810,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7072 tmp = rq->sd; 5810 tmp = rq->sd;
7073 rcu_assign_pointer(rq->sd, sd); 5811 rcu_assign_pointer(rq->sd, sd);
7074 destroy_sched_domains(tmp, cpu); 5812 destroy_sched_domains(tmp, cpu);
5813
5814 update_top_cache_domain(cpu);
7075} 5815}
7076 5816
7077/* cpus with isolated domains */ 5817/* cpus with isolated domains */
@@ -7231,7 +5971,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7231 continue; 5971 continue;
7232 5972
7233 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5973 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7234 GFP_KERNEL, cpu_to_node(i)); 5974 GFP_KERNEL, cpu_to_node(cpu));
7235 5975
7236 if (!sg) 5976 if (!sg)
7237 goto fail; 5977 goto fail;
@@ -7369,6 +6109,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7369 return; 6109 return;
7370 6110
7371 update_group_power(sd, cpu); 6111 update_group_power(sd, cpu);
6112 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6113}
6114
6115int __weak arch_sd_sibling_asym_packing(void)
6116{
6117 return 0*SD_ASYM_PACKING;
7372} 6118}
7373 6119
7374/* 6120/*
@@ -7923,54 +6669,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7923} 6669}
7924 6670
7925#ifdef CONFIG_SCHED_MC 6671#ifdef CONFIG_SCHED_MC
7926static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 6672static ssize_t sched_mc_power_savings_show(struct device *dev,
7927 struct sysdev_class_attribute *attr, 6673 struct device_attribute *attr,
7928 char *page) 6674 char *buf)
7929{ 6675{
7930 return sprintf(page, "%u\n", sched_mc_power_savings); 6676 return sprintf(buf, "%u\n", sched_mc_power_savings);
7931} 6677}
7932static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 6678static ssize_t sched_mc_power_savings_store(struct device *dev,
7933 struct sysdev_class_attribute *attr, 6679 struct device_attribute *attr,
7934 const char *buf, size_t count) 6680 const char *buf, size_t count)
7935{ 6681{
7936 return sched_power_savings_store(buf, count, 0); 6682 return sched_power_savings_store(buf, count, 0);
7937} 6683}
7938static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 6684static DEVICE_ATTR(sched_mc_power_savings, 0644,
7939 sched_mc_power_savings_show, 6685 sched_mc_power_savings_show,
7940 sched_mc_power_savings_store); 6686 sched_mc_power_savings_store);
7941#endif 6687#endif
7942 6688
7943#ifdef CONFIG_SCHED_SMT 6689#ifdef CONFIG_SCHED_SMT
7944static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 6690static ssize_t sched_smt_power_savings_show(struct device *dev,
7945 struct sysdev_class_attribute *attr, 6691 struct device_attribute *attr,
7946 char *page) 6692 char *buf)
7947{ 6693{
7948 return sprintf(page, "%u\n", sched_smt_power_savings); 6694 return sprintf(buf, "%u\n", sched_smt_power_savings);
7949} 6695}
7950static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 6696static ssize_t sched_smt_power_savings_store(struct device *dev,
7951 struct sysdev_class_attribute *attr, 6697 struct device_attribute *attr,
7952 const char *buf, size_t count) 6698 const char *buf, size_t count)
7953{ 6699{
7954 return sched_power_savings_store(buf, count, 1); 6700 return sched_power_savings_store(buf, count, 1);
7955} 6701}
7956static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 6702static DEVICE_ATTR(sched_smt_power_savings, 0644,
7957 sched_smt_power_savings_show, 6703 sched_smt_power_savings_show,
7958 sched_smt_power_savings_store); 6704 sched_smt_power_savings_store);
7959#endif 6705#endif
7960 6706
7961int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6707int __init sched_create_sysfs_power_savings_entries(struct device *dev)
7962{ 6708{
7963 int err = 0; 6709 int err = 0;
7964 6710
7965#ifdef CONFIG_SCHED_SMT 6711#ifdef CONFIG_SCHED_SMT
7966 if (smt_capable()) 6712 if (smt_capable())
7967 err = sysfs_create_file(&cls->kset.kobj, 6713 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
7968 &attr_sched_smt_power_savings.attr);
7969#endif 6714#endif
7970#ifdef CONFIG_SCHED_MC 6715#ifdef CONFIG_SCHED_MC
7971 if (!err && mc_capable()) 6716 if (!err && mc_capable())
7972 err = sysfs_create_file(&cls->kset.kobj, 6717 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
7973 &attr_sched_mc_power_savings.attr);
7974#endif 6718#endif
7975 return err; 6719 return err;
7976} 6720}
@@ -7984,7 +6728,7 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7984static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6728static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7985 void *hcpu) 6729 void *hcpu)
7986{ 6730{
7987 switch (action & ~CPU_TASKS_FROZEN) { 6731 switch (action) {
7988 case CPU_ONLINE: 6732 case CPU_ONLINE:
7989 case CPU_DOWN_FAILED: 6733 case CPU_DOWN_FAILED:
7990 cpuset_update_active_cpus(); 6734 cpuset_update_active_cpus();
@@ -7997,33 +6741,10 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7997static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6741static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7998 void *hcpu) 6742 void *hcpu)
7999{ 6743{
8000 switch (action & ~CPU_TASKS_FROZEN) {
8001 case CPU_DOWN_PREPARE:
8002 cpuset_update_active_cpus();
8003 return NOTIFY_OK;
8004 default:
8005 return NOTIFY_DONE;
8006 }
8007}
8008
8009static int update_runtime(struct notifier_block *nfb,
8010 unsigned long action, void *hcpu)
8011{
8012 int cpu = (int)(long)hcpu;
8013
8014 switch (action) { 6744 switch (action) {
8015 case CPU_DOWN_PREPARE: 6745 case CPU_DOWN_PREPARE:
8016 case CPU_DOWN_PREPARE_FROZEN: 6746 cpuset_update_active_cpus();
8017 disable_runtime(cpu_rq(cpu));
8018 return NOTIFY_OK;
8019
8020 case CPU_DOWN_FAILED:
8021 case CPU_DOWN_FAILED_FROZEN:
8022 case CPU_ONLINE:
8023 case CPU_ONLINE_FROZEN:
8024 enable_runtime(cpu_rq(cpu));
8025 return NOTIFY_OK; 6747 return NOTIFY_OK;
8026
8027 default: 6748 default:
8028 return NOTIFY_DONE; 6749 return NOTIFY_DONE;
8029 } 6750 }
@@ -8077,104 +6798,11 @@ int in_sched_functions(unsigned long addr)
8077 && addr < (unsigned long)__sched_text_end); 6798 && addr < (unsigned long)__sched_text_end);
8078} 6799}
8079 6800
8080static void init_cfs_rq(struct cfs_rq *cfs_rq) 6801#ifdef CONFIG_CGROUP_SCHED
8081{ 6802struct task_group root_task_group;
8082 cfs_rq->tasks_timeline = RB_ROOT;
8083 INIT_LIST_HEAD(&cfs_rq->tasks);
8084 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8085#ifndef CONFIG_64BIT
8086 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8087#endif
8088}
8089
8090static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8091{
8092 struct rt_prio_array *array;
8093 int i;
8094
8095 array = &rt_rq->active;
8096 for (i = 0; i < MAX_RT_PRIO; i++) {
8097 INIT_LIST_HEAD(array->queue + i);
8098 __clear_bit(i, array->bitmap);
8099 }
8100 /* delimiter for bitsearch: */
8101 __set_bit(MAX_RT_PRIO, array->bitmap);
8102
8103#if defined CONFIG_SMP
8104 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8105 rt_rq->highest_prio.next = MAX_RT_PRIO;
8106 rt_rq->rt_nr_migratory = 0;
8107 rt_rq->overloaded = 0;
8108 plist_head_init(&rt_rq->pushable_tasks);
8109#endif
8110
8111 rt_rq->rt_time = 0;
8112 rt_rq->rt_throttled = 0;
8113 rt_rq->rt_runtime = 0;
8114 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8115}
8116
8117#ifdef CONFIG_FAIR_GROUP_SCHED
8118static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8119 struct sched_entity *se, int cpu,
8120 struct sched_entity *parent)
8121{
8122 struct rq *rq = cpu_rq(cpu);
8123
8124 cfs_rq->tg = tg;
8125 cfs_rq->rq = rq;
8126#ifdef CONFIG_SMP
8127 /* allow initial update_cfs_load() to truncate */
8128 cfs_rq->load_stamp = 1;
8129#endif
8130 init_cfs_rq_runtime(cfs_rq);
8131
8132 tg->cfs_rq[cpu] = cfs_rq;
8133 tg->se[cpu] = se;
8134
8135 /* se could be NULL for root_task_group */
8136 if (!se)
8137 return;
8138
8139 if (!parent)
8140 se->cfs_rq = &rq->cfs;
8141 else
8142 se->cfs_rq = parent->my_q;
8143
8144 se->my_q = cfs_rq;
8145 update_load_set(&se->load, 0);
8146 se->parent = parent;
8147}
8148#endif 6803#endif
8149 6804
8150#ifdef CONFIG_RT_GROUP_SCHED 6805DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8151static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8152 struct sched_rt_entity *rt_se, int cpu,
8153 struct sched_rt_entity *parent)
8154{
8155 struct rq *rq = cpu_rq(cpu);
8156
8157 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8158 rt_rq->rt_nr_boosted = 0;
8159 rt_rq->rq = rq;
8160 rt_rq->tg = tg;
8161
8162 tg->rt_rq[cpu] = rt_rq;
8163 tg->rt_se[cpu] = rt_se;
8164
8165 if (!rt_se)
8166 return;
8167
8168 if (!parent)
8169 rt_se->rt_rq = &rq->rt;
8170 else
8171 rt_se->rt_rq = parent->my_q;
8172
8173 rt_se->my_q = rt_rq;
8174 rt_se->parent = parent;
8175 INIT_LIST_HEAD(&rt_se->run_list);
8176}
8177#endif
8178 6806
8179void __init sched_init(void) 6807void __init sched_init(void)
8180{ 6808{
@@ -8232,9 +6860,17 @@ void __init sched_init(void)
8232#ifdef CONFIG_CGROUP_SCHED 6860#ifdef CONFIG_CGROUP_SCHED
8233 list_add(&root_task_group.list, &task_groups); 6861 list_add(&root_task_group.list, &task_groups);
8234 INIT_LIST_HEAD(&root_task_group.children); 6862 INIT_LIST_HEAD(&root_task_group.children);
6863 INIT_LIST_HEAD(&root_task_group.siblings);
8235 autogroup_init(&init_task); 6864 autogroup_init(&init_task);
6865
8236#endif /* CONFIG_CGROUP_SCHED */ 6866#endif /* CONFIG_CGROUP_SCHED */
8237 6867
6868#ifdef CONFIG_CGROUP_CPUACCT
6869 root_cpuacct.cpustat = &kernel_cpustat;
6870 root_cpuacct.cpuusage = alloc_percpu(u64);
6871 /* Too early, not expected to fail */
6872 BUG_ON(!root_cpuacct.cpuusage);
6873#endif
8238 for_each_possible_cpu(i) { 6874 for_each_possible_cpu(i) {
8239 struct rq *rq; 6875 struct rq *rq;
8240 6876
@@ -8246,7 +6882,7 @@ void __init sched_init(void)
8246 init_cfs_rq(&rq->cfs); 6882 init_cfs_rq(&rq->cfs);
8247 init_rt_rq(&rq->rt, rq); 6883 init_rt_rq(&rq->rt, rq);
8248#ifdef CONFIG_FAIR_GROUP_SCHED 6884#ifdef CONFIG_FAIR_GROUP_SCHED
8249 root_task_group.shares = root_task_group_load; 6885 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8250 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6886 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8251 /* 6887 /*
8252 * How much cpu bandwidth does root_task_group get? 6888 * How much cpu bandwidth does root_task_group get?
@@ -8296,7 +6932,7 @@ void __init sched_init(void)
8296 rq->avg_idle = 2*sysctl_sched_migration_cost; 6932 rq->avg_idle = 2*sysctl_sched_migration_cost;
8297 rq_attach_root(rq, &def_root_domain); 6933 rq_attach_root(rq, &def_root_domain);
8298#ifdef CONFIG_NO_HZ 6934#ifdef CONFIG_NO_HZ
8299 rq->nohz_balance_kick = 0; 6935 rq->nohz_flags = 0;
8300#endif 6936#endif
8301#endif 6937#endif
8302 init_rq_hrtick(rq); 6938 init_rq_hrtick(rq);
@@ -8309,10 +6945,6 @@ void __init sched_init(void)
8309 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6945 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8310#endif 6946#endif
8311 6947
8312#ifdef CONFIG_SMP
8313 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8314#endif
8315
8316#ifdef CONFIG_RT_MUTEXES 6948#ifdef CONFIG_RT_MUTEXES
8317 plist_head_init(&init_task.pi_waiters); 6949 plist_head_init(&init_task.pi_waiters);
8318#endif 6950#endif
@@ -8340,17 +6972,11 @@ void __init sched_init(void)
8340 6972
8341#ifdef CONFIG_SMP 6973#ifdef CONFIG_SMP
8342 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6974 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8343#ifdef CONFIG_NO_HZ
8344 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8345 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8346 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8347 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8348 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8349#endif
8350 /* May be allocated at isolcpus cmdline parse time */ 6975 /* May be allocated at isolcpus cmdline parse time */
8351 if (cpu_isolated_map == NULL) 6976 if (cpu_isolated_map == NULL)
8352 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6977 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8353#endif /* SMP */ 6978#endif
6979 init_sched_fair_class();
8354 6980
8355 scheduler_running = 1; 6981 scheduler_running = 1;
8356} 6982}
@@ -8400,10 +7026,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8400 7026
8401 on_rq = p->on_rq; 7027 on_rq = p->on_rq;
8402 if (on_rq) 7028 if (on_rq)
8403 deactivate_task(rq, p, 0); 7029 dequeue_task(rq, p, 0);
8404 __setscheduler(rq, p, SCHED_NORMAL, 0); 7030 __setscheduler(rq, p, SCHED_NORMAL, 0);
8405 if (on_rq) { 7031 if (on_rq) {
8406 activate_task(rq, p, 0); 7032 enqueue_task(rq, p, 0);
8407 resched_task(rq->curr); 7033 resched_task(rq->curr);
8408 } 7034 }
8409 7035
@@ -8502,169 +7128,10 @@ void set_curr_task(int cpu, struct task_struct *p)
8502 7128
8503#endif 7129#endif
8504 7130
8505#ifdef CONFIG_FAIR_GROUP_SCHED
8506static void free_fair_sched_group(struct task_group *tg)
8507{
8508 int i;
8509
8510 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8511
8512 for_each_possible_cpu(i) {
8513 if (tg->cfs_rq)
8514 kfree(tg->cfs_rq[i]);
8515 if (tg->se)
8516 kfree(tg->se[i]);
8517 }
8518
8519 kfree(tg->cfs_rq);
8520 kfree(tg->se);
8521}
8522
8523static
8524int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8525{
8526 struct cfs_rq *cfs_rq;
8527 struct sched_entity *se;
8528 int i;
8529
8530 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8531 if (!tg->cfs_rq)
8532 goto err;
8533 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8534 if (!tg->se)
8535 goto err;
8536
8537 tg->shares = NICE_0_LOAD;
8538
8539 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8540
8541 for_each_possible_cpu(i) {
8542 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8543 GFP_KERNEL, cpu_to_node(i));
8544 if (!cfs_rq)
8545 goto err;
8546
8547 se = kzalloc_node(sizeof(struct sched_entity),
8548 GFP_KERNEL, cpu_to_node(i));
8549 if (!se)
8550 goto err_free_rq;
8551
8552 init_cfs_rq(cfs_rq);
8553 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8554 }
8555
8556 return 1;
8557
8558err_free_rq:
8559 kfree(cfs_rq);
8560err:
8561 return 0;
8562}
8563
8564static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8565{
8566 struct rq *rq = cpu_rq(cpu);
8567 unsigned long flags;
8568
8569 /*
8570 * Only empty task groups can be destroyed; so we can speculatively
8571 * check on_list without danger of it being re-added.
8572 */
8573 if (!tg->cfs_rq[cpu]->on_list)
8574 return;
8575
8576 raw_spin_lock_irqsave(&rq->lock, flags);
8577 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8578 raw_spin_unlock_irqrestore(&rq->lock, flags);
8579}
8580#else /* !CONFIG_FAIR_GROUP_SCHED */
8581static inline void free_fair_sched_group(struct task_group *tg)
8582{
8583}
8584
8585static inline
8586int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8587{
8588 return 1;
8589}
8590
8591static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8592{
8593}
8594#endif /* CONFIG_FAIR_GROUP_SCHED */
8595
8596#ifdef CONFIG_RT_GROUP_SCHED
8597static void free_rt_sched_group(struct task_group *tg)
8598{
8599 int i;
8600
8601 if (tg->rt_se)
8602 destroy_rt_bandwidth(&tg->rt_bandwidth);
8603
8604 for_each_possible_cpu(i) {
8605 if (tg->rt_rq)
8606 kfree(tg->rt_rq[i]);
8607 if (tg->rt_se)
8608 kfree(tg->rt_se[i]);
8609 }
8610
8611 kfree(tg->rt_rq);
8612 kfree(tg->rt_se);
8613}
8614
8615static
8616int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8617{
8618 struct rt_rq *rt_rq;
8619 struct sched_rt_entity *rt_se;
8620 int i;
8621
8622 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8623 if (!tg->rt_rq)
8624 goto err;
8625 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8626 if (!tg->rt_se)
8627 goto err;
8628
8629 init_rt_bandwidth(&tg->rt_bandwidth,
8630 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8631
8632 for_each_possible_cpu(i) {
8633 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8634 GFP_KERNEL, cpu_to_node(i));
8635 if (!rt_rq)
8636 goto err;
8637
8638 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8639 GFP_KERNEL, cpu_to_node(i));
8640 if (!rt_se)
8641 goto err_free_rq;
8642
8643 init_rt_rq(rt_rq, cpu_rq(i));
8644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8645 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8646 }
8647
8648 return 1;
8649
8650err_free_rq:
8651 kfree(rt_rq);
8652err:
8653 return 0;
8654}
8655#else /* !CONFIG_RT_GROUP_SCHED */
8656static inline void free_rt_sched_group(struct task_group *tg)
8657{
8658}
8659
8660static inline
8661int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8662{
8663 return 1;
8664}
8665#endif /* CONFIG_RT_GROUP_SCHED */
8666
8667#ifdef CONFIG_CGROUP_SCHED 7131#ifdef CONFIG_CGROUP_SCHED
7132/* task_group_lock serializes the addition/removal of task groups */
7133static DEFINE_SPINLOCK(task_group_lock);
7134
8668static void free_sched_group(struct task_group *tg) 7135static void free_sched_group(struct task_group *tg)
8669{ 7136{
8670 free_fair_sched_group(tg); 7137 free_fair_sched_group(tg);
@@ -8769,50 +7236,6 @@ void sched_move_task(struct task_struct *tsk)
8769} 7236}
8770#endif /* CONFIG_CGROUP_SCHED */ 7237#endif /* CONFIG_CGROUP_SCHED */
8771 7238
8772#ifdef CONFIG_FAIR_GROUP_SCHED
8773static DEFINE_MUTEX(shares_mutex);
8774
8775int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8776{
8777 int i;
8778 unsigned long flags;
8779
8780 /*
8781 * We can't change the weight of the root cgroup.
8782 */
8783 if (!tg->se[0])
8784 return -EINVAL;
8785
8786 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8787
8788 mutex_lock(&shares_mutex);
8789 if (tg->shares == shares)
8790 goto done;
8791
8792 tg->shares = shares;
8793 for_each_possible_cpu(i) {
8794 struct rq *rq = cpu_rq(i);
8795 struct sched_entity *se;
8796
8797 se = tg->se[i];
8798 /* Propagate contribution to hierarchy */
8799 raw_spin_lock_irqsave(&rq->lock, flags);
8800 for_each_sched_entity(se)
8801 update_cfs_shares(group_cfs_rq(se));
8802 raw_spin_unlock_irqrestore(&rq->lock, flags);
8803 }
8804
8805done:
8806 mutex_unlock(&shares_mutex);
8807 return 0;
8808}
8809
8810unsigned long sched_group_shares(struct task_group *tg)
8811{
8812 return tg->shares;
8813}
8814#endif
8815
8816#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7239#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8817static unsigned long to_ratio(u64 period, u64 runtime) 7240static unsigned long to_ratio(u64 period, u64 runtime)
8818{ 7241{
@@ -8835,7 +7258,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8835 struct task_struct *g, *p; 7258 struct task_struct *g, *p;
8836 7259
8837 do_each_thread(g, p) { 7260 do_each_thread(g, p) {
8838 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7261 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8839 return 1; 7262 return 1;
8840 } while_each_thread(g, p); 7263 } while_each_thread(g, p);
8841 7264
@@ -9127,24 +7550,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9127 sched_destroy_group(tg); 7550 sched_destroy_group(tg);
9128} 7551}
9129 7552
9130static int 7553static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9131cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7554 struct cgroup_taskset *tset)
9132{ 7555{
7556 struct task_struct *task;
7557
7558 cgroup_taskset_for_each(task, cgrp, tset) {
9133#ifdef CONFIG_RT_GROUP_SCHED 7559#ifdef CONFIG_RT_GROUP_SCHED
9134 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 7560 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
9135 return -EINVAL; 7561 return -EINVAL;
9136#else 7562#else
9137 /* We don't support RT-tasks being in separate groups */ 7563 /* We don't support RT-tasks being in separate groups */
9138 if (tsk->sched_class != &fair_sched_class) 7564 if (task->sched_class != &fair_sched_class)
9139 return -EINVAL; 7565 return -EINVAL;
9140#endif 7566#endif
7567 }
9141 return 0; 7568 return 0;
9142} 7569}
9143 7570
9144static void 7571static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9145cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7572 struct cgroup_taskset *tset)
9146{ 7573{
9147 sched_move_task(tsk); 7574 struct task_struct *task;
7575
7576 cgroup_taskset_for_each(task, cgrp, tset)
7577 sched_move_task(task);
9148} 7578}
9149 7579
9150static void 7580static void
@@ -9186,8 +7616,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9186 7616
9187static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7617static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9188{ 7618{
9189 int i, ret = 0, runtime_enabled; 7619 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9190 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7620 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9191 7621
9192 if (tg == &root_task_group) 7622 if (tg == &root_task_group)
9193 return -EINVAL; 7623 return -EINVAL;
@@ -9214,6 +7644,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9214 goto out_unlock; 7644 goto out_unlock;
9215 7645
9216 runtime_enabled = quota != RUNTIME_INF; 7646 runtime_enabled = quota != RUNTIME_INF;
7647 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7648 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9217 raw_spin_lock_irq(&cfs_b->lock); 7649 raw_spin_lock_irq(&cfs_b->lock);
9218 cfs_b->period = ns_to_ktime(period); 7650 cfs_b->period = ns_to_ktime(period);
9219 cfs_b->quota = quota; 7651 cfs_b->quota = quota;
@@ -9229,13 +7661,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9229 7661
9230 for_each_possible_cpu(i) { 7662 for_each_possible_cpu(i) {
9231 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7663 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9232 struct rq *rq = rq_of(cfs_rq); 7664 struct rq *rq = cfs_rq->rq;
9233 7665
9234 raw_spin_lock_irq(&rq->lock); 7666 raw_spin_lock_irq(&rq->lock);
9235 cfs_rq->runtime_enabled = runtime_enabled; 7667 cfs_rq->runtime_enabled = runtime_enabled;
9236 cfs_rq->runtime_remaining = 0; 7668 cfs_rq->runtime_remaining = 0;
9237 7669
9238 if (cfs_rq_throttled(cfs_rq)) 7670 if (cfs_rq->throttled)
9239 unthrottle_cfs_rq(cfs_rq); 7671 unthrottle_cfs_rq(cfs_rq);
9240 raw_spin_unlock_irq(&rq->lock); 7672 raw_spin_unlock_irq(&rq->lock);
9241 } 7673 }
@@ -9249,7 +7681,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9249{ 7681{
9250 u64 quota, period; 7682 u64 quota, period;
9251 7683
9252 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7684 period = ktime_to_ns(tg->cfs_bandwidth.period);
9253 if (cfs_quota_us < 0) 7685 if (cfs_quota_us < 0)
9254 quota = RUNTIME_INF; 7686 quota = RUNTIME_INF;
9255 else 7687 else
@@ -9262,10 +7694,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9262{ 7694{
9263 u64 quota_us; 7695 u64 quota_us;
9264 7696
9265 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7697 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9266 return -1; 7698 return -1;
9267 7699
9268 quota_us = tg_cfs_bandwidth(tg)->quota; 7700 quota_us = tg->cfs_bandwidth.quota;
9269 do_div(quota_us, NSEC_PER_USEC); 7701 do_div(quota_us, NSEC_PER_USEC);
9270 7702
9271 return quota_us; 7703 return quota_us;
@@ -9276,10 +7708,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9276 u64 quota, period; 7708 u64 quota, period;
9277 7709
9278 period = (u64)cfs_period_us * NSEC_PER_USEC; 7710 period = (u64)cfs_period_us * NSEC_PER_USEC;
9279 quota = tg_cfs_bandwidth(tg)->quota; 7711 quota = tg->cfs_bandwidth.quota;
9280
9281 if (period <= 0)
9282 return -EINVAL;
9283 7712
9284 return tg_set_cfs_bandwidth(tg, period, quota); 7713 return tg_set_cfs_bandwidth(tg, period, quota);
9285} 7714}
@@ -9288,7 +7717,7 @@ long tg_get_cfs_period(struct task_group *tg)
9288{ 7717{
9289 u64 cfs_period_us; 7718 u64 cfs_period_us;
9290 7719
9291 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7720 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9292 do_div(cfs_period_us, NSEC_PER_USEC); 7721 do_div(cfs_period_us, NSEC_PER_USEC);
9293 7722
9294 return cfs_period_us; 7723 return cfs_period_us;
@@ -9348,13 +7777,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9348static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7777static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9349{ 7778{
9350 struct cfs_schedulable_data *d = data; 7779 struct cfs_schedulable_data *d = data;
9351 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7780 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9352 s64 quota = 0, parent_quota = -1; 7781 s64 quota = 0, parent_quota = -1;
9353 7782
9354 if (!tg->parent) { 7783 if (!tg->parent) {
9355 quota = RUNTIME_INF; 7784 quota = RUNTIME_INF;
9356 } else { 7785 } else {
9357 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7786 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9358 7787
9359 quota = normalize_cfs_quota(tg, d); 7788 quota = normalize_cfs_quota(tg, d);
9360 parent_quota = parent_b->hierarchal_quota; 7789 parent_quota = parent_b->hierarchal_quota;
@@ -9398,7 +7827,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9398 struct cgroup_map_cb *cb) 7827 struct cgroup_map_cb *cb)
9399{ 7828{
9400 struct task_group *tg = cgroup_tg(cgrp); 7829 struct task_group *tg = cgroup_tg(cgrp);
9401 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7830 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9402 7831
9403 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7832 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9404 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7833 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9480,8 +7909,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9480 .name = "cpu", 7909 .name = "cpu",
9481 .create = cpu_cgroup_create, 7910 .create = cpu_cgroup_create,
9482 .destroy = cpu_cgroup_destroy, 7911 .destroy = cpu_cgroup_destroy,
9483 .can_attach_task = cpu_cgroup_can_attach_task, 7912 .can_attach = cpu_cgroup_can_attach,
9484 .attach_task = cpu_cgroup_attach_task, 7913 .attach = cpu_cgroup_attach,
9485 .exit = cpu_cgroup_exit, 7914 .exit = cpu_cgroup_exit,
9486 .populate = cpu_cgroup_populate, 7915 .populate = cpu_cgroup_populate,
9487 .subsys_id = cpu_cgroup_subsys_id, 7916 .subsys_id = cpu_cgroup_subsys_id,
@@ -9499,38 +7928,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9499 * (balbir@in.ibm.com). 7928 * (balbir@in.ibm.com).
9500 */ 7929 */
9501 7930
9502/* track cpu usage of a group of tasks and its child groups */
9503struct cpuacct {
9504 struct cgroup_subsys_state css;
9505 /* cpuusage holds pointer to a u64-type object on every cpu */
9506 u64 __percpu *cpuusage;
9507 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9508 struct cpuacct *parent;
9509};
9510
9511struct cgroup_subsys cpuacct_subsys;
9512
9513/* return cpu accounting group corresponding to this container */
9514static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9515{
9516 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9517 struct cpuacct, css);
9518}
9519
9520/* return cpu accounting group to which this task belongs */
9521static inline struct cpuacct *task_ca(struct task_struct *tsk)
9522{
9523 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9524 struct cpuacct, css);
9525}
9526
9527/* create a new cpu accounting group */ 7931/* create a new cpu accounting group */
9528static struct cgroup_subsys_state *cpuacct_create( 7932static struct cgroup_subsys_state *cpuacct_create(
9529 struct cgroup_subsys *ss, struct cgroup *cgrp) 7933 struct cgroup_subsys *ss, struct cgroup *cgrp)
9530{ 7934{
9531 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7935 struct cpuacct *ca;
9532 int i; 7936
7937 if (!cgrp->parent)
7938 return &root_cpuacct.css;
9533 7939
7940 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9534 if (!ca) 7941 if (!ca)
9535 goto out; 7942 goto out;
9536 7943
@@ -9538,18 +7945,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9538 if (!ca->cpuusage) 7945 if (!ca->cpuusage)
9539 goto out_free_ca; 7946 goto out_free_ca;
9540 7947
9541 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7948 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9542 if (percpu_counter_init(&ca->cpustat[i], 0)) 7949 if (!ca->cpustat)
9543 goto out_free_counters; 7950 goto out_free_cpuusage;
9544
9545 if (cgrp->parent)
9546 ca->parent = cgroup_ca(cgrp->parent);
9547 7951
9548 return &ca->css; 7952 return &ca->css;
9549 7953
9550out_free_counters: 7954out_free_cpuusage:
9551 while (--i >= 0)
9552 percpu_counter_destroy(&ca->cpustat[i]);
9553 free_percpu(ca->cpuusage); 7955 free_percpu(ca->cpuusage);
9554out_free_ca: 7956out_free_ca:
9555 kfree(ca); 7957 kfree(ca);
@@ -9562,10 +7964,8 @@ static void
9562cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7964cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9563{ 7965{
9564 struct cpuacct *ca = cgroup_ca(cgrp); 7966 struct cpuacct *ca = cgroup_ca(cgrp);
9565 int i;
9566 7967
9567 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7968 free_percpu(ca->cpustat);
9568 percpu_counter_destroy(&ca->cpustat[i]);
9569 free_percpu(ca->cpuusage); 7969 free_percpu(ca->cpuusage);
9570 kfree(ca); 7970 kfree(ca);
9571} 7971}
@@ -9658,16 +8058,31 @@ static const char *cpuacct_stat_desc[] = {
9658}; 8058};
9659 8059
9660static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8060static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9661 struct cgroup_map_cb *cb) 8061 struct cgroup_map_cb *cb)
9662{ 8062{
9663 struct cpuacct *ca = cgroup_ca(cgrp); 8063 struct cpuacct *ca = cgroup_ca(cgrp);
9664 int i; 8064 int cpu;
8065 s64 val = 0;
9665 8066
9666 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8067 for_each_online_cpu(cpu) {
9667 s64 val = percpu_counter_read(&ca->cpustat[i]); 8068 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9668 val = cputime64_to_clock_t(val); 8069 val += kcpustat->cpustat[CPUTIME_USER];
9669 cb->fill(cb, cpuacct_stat_desc[i], val); 8070 val += kcpustat->cpustat[CPUTIME_NICE];
9670 } 8071 }
8072 val = cputime64_to_clock_t(val);
8073 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8074
8075 val = 0;
8076 for_each_online_cpu(cpu) {
8077 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8078 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8079 val += kcpustat->cpustat[CPUTIME_IRQ];
8080 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8081 }
8082
8083 val = cputime64_to_clock_t(val);
8084 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8085
9671 return 0; 8086 return 0;
9672} 8087}
9673 8088
@@ -9697,7 +8112,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9697 * 8112 *
9698 * called with rq->lock held. 8113 * called with rq->lock held.
9699 */ 8114 */
9700static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8115void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9701{ 8116{
9702 struct cpuacct *ca; 8117 struct cpuacct *ca;
9703 int cpu; 8118 int cpu;
@@ -9711,7 +8126,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9711 8126
9712 ca = task_ca(tsk); 8127 ca = task_ca(tsk);
9713 8128
9714 for (; ca; ca = ca->parent) { 8129 for (; ca; ca = parent_ca(ca)) {
9715 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8130 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9716 *cpuusage += cputime; 8131 *cpuusage += cputime;
9717 } 8132 }
@@ -9719,45 +8134,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9719 rcu_read_unlock(); 8134 rcu_read_unlock();
9720} 8135}
9721 8136
9722/*
9723 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9724 * in cputime_t units. As a result, cpuacct_update_stats calls
9725 * percpu_counter_add with values large enough to always overflow the
9726 * per cpu batch limit causing bad SMP scalability.
9727 *
9728 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9729 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9730 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9731 */
9732#ifdef CONFIG_SMP
9733#define CPUACCT_BATCH \
9734 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9735#else
9736#define CPUACCT_BATCH 0
9737#endif
9738
9739/*
9740 * Charge the system/user time to the task's accounting group.
9741 */
9742static void cpuacct_update_stats(struct task_struct *tsk,
9743 enum cpuacct_stat_index idx, cputime_t val)
9744{
9745 struct cpuacct *ca;
9746 int batch = CPUACCT_BATCH;
9747
9748 if (unlikely(!cpuacct_subsys.active))
9749 return;
9750
9751 rcu_read_lock();
9752 ca = task_ca(tsk);
9753
9754 do {
9755 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9756 ca = ca->parent;
9757 } while (ca);
9758 rcu_read_unlock();
9759}
9760
9761struct cgroup_subsys cpuacct_subsys = { 8137struct cgroup_subsys cpuacct_subsys = {
9762 .name = "cpuacct", 8138 .name = "cpuacct",
9763 .create = cpuacct_create, 8139 .create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb11..d72586fdf660 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
@@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
129 * cpupri_set - update the cpu priority setting 129 * cpupri_set - update the cpu priority setting
130 * @cp: The cpupri context 130 * @cp: The cpupri context
131 * @cpu: The target cpu 131 * @cpu: The target cpu
132 * @pri: The priority (INVALID-RT99) to assign to this CPU 132 * @newpri: The priority (INVALID-RT99) to assign to this CPU
133 * 133 *
134 * Note: Assumes cpu_rq(cpu)->lock is locked 134 * Note: Assumes cpu_rq(cpu)->lock is locked
135 * 135 *
@@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
200/** 200/**
201 * cpupri_init - initialize the cpupri structure 201 * cpupri_init - initialize the cpupri structure
202 * @cp: The cpupri context 202 * @cp: The cpupri context
203 * @bootmem: true if allocations need to use bootmem
204 * 203 *
205 * Returns: -ENOMEM if memory fails. 204 * Returns: -ENOMEM if memory fails.
206 */ 205 */
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 5c9e67923b7c..aca16b843b7e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -772,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
772 list_del_leaf_cfs_rq(cfs_rq); 882 list_del_leaf_cfs_rq(cfs_rq);
773} 883}
774 884
885static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
886{
887 long tg_weight;
888
889 /*
890 * Use this CPU's actual weight instead of the last load_contribution
891 * to gain a more accurate current total weight. See
892 * update_cfs_rq_load_contribution().
893 */
894 tg_weight = atomic_read(&tg->load_weight);
895 tg_weight -= cfs_rq->load_contribution;
896 tg_weight += cfs_rq->load.weight;
897
898 return tg_weight;
899}
900
775static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 901static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
776{ 902{
777 long load_weight, load, shares; 903 long tg_weight, load, shares;
778 904
905 tg_weight = calc_tg_weight(tg, cfs_rq);
779 load = cfs_rq->load.weight; 906 load = cfs_rq->load.weight;
780 907
781 load_weight = atomic_read(&tg->load_weight);
782 load_weight += load;
783 load_weight -= cfs_rq->load_contribution;
784
785 shares = (tg->shares * load); 908 shares = (tg->shares * load);
786 if (load_weight) 909 if (tg_weight)
787 shares /= load_weight; 910 shares /= tg_weight;
788 911
789 if (shares < MIN_SHARES) 912 if (shares < MIN_SHARES)
790 shares = MIN_SHARES; 913 shares = MIN_SHARES;
@@ -907,6 +1030,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
907 trace_sched_stat_iowait(tsk, delta); 1030 trace_sched_stat_iowait(tsk, delta);
908 } 1031 }
909 1032
1033 trace_sched_stat_blocked(tsk, delta);
1034
910 /* 1035 /*
911 * Blocking time is in units of nanosecs, so shift by 1036 * Blocking time is in units of nanosecs, so shift by
912 * 20 to get a milliseconds-range estimation of the 1037 * 20 to get a milliseconds-range estimation of the
@@ -1274,6 +1399,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1274 */ 1399 */
1275 1400
1276#ifdef CONFIG_CFS_BANDWIDTH 1401#ifdef CONFIG_CFS_BANDWIDTH
1402
1403#ifdef HAVE_JUMP_LABEL
1404static struct jump_label_key __cfs_bandwidth_used;
1405
1406static inline bool cfs_bandwidth_used(void)
1407{
1408 return static_branch(&__cfs_bandwidth_used);
1409}
1410
1411void account_cfs_bandwidth_used(int enabled, int was_enabled)
1412{
1413 /* only need to count groups transitioning between enabled/!enabled */
1414 if (enabled && !was_enabled)
1415 jump_label_inc(&__cfs_bandwidth_used);
1416 else if (!enabled && was_enabled)
1417 jump_label_dec(&__cfs_bandwidth_used);
1418}
1419#else /* HAVE_JUMP_LABEL */
1420static bool cfs_bandwidth_used(void)
1421{
1422 return true;
1423}
1424
1425void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1426#endif /* HAVE_JUMP_LABEL */
1427
1277/* 1428/*
1278 * default period for cfs group bandwidth. 1429 * default period for cfs group bandwidth.
1279 * default: 0.1s, units: nanoseconds 1430 * default: 0.1s, units: nanoseconds
@@ -1295,7 +1446,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1295 * 1446 *
1296 * requires cfs_b->lock 1447 * requires cfs_b->lock
1297 */ 1448 */
1298static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1449void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1299{ 1450{
1300 u64 now; 1451 u64 now;
1301 1452
@@ -1307,6 +1458,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1307 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1458 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1308} 1459}
1309 1460
1461static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1462{
1463 return &tg->cfs_bandwidth;
1464}
1465
1310/* returns 0 on failure to allocate runtime */ 1466/* returns 0 on failure to allocate runtime */
1311static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1467static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1312{ 1468{
@@ -1408,7 +1564,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1408static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1564static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1409 unsigned long delta_exec) 1565 unsigned long delta_exec)
1410{ 1566{
1411 if (!cfs_rq->runtime_enabled) 1567 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1412 return; 1568 return;
1413 1569
1414 __account_cfs_rq_runtime(cfs_rq, delta_exec); 1570 __account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,13 +1572,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1416 1572
1417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1573static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1418{ 1574{
1419 return cfs_rq->throttled; 1575 return cfs_bandwidth_used() && cfs_rq->throttled;
1420} 1576}
1421 1577
1422/* check whether cfs_rq, or any parent, is throttled */ 1578/* check whether cfs_rq, or any parent, is throttled */
1423static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 1579static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1424{ 1580{
1425 return cfs_rq->throttle_count; 1581 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1426} 1582}
1427 1583
1428/* 1584/*
@@ -1517,7 +1673,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1517 raw_spin_unlock(&cfs_b->lock); 1673 raw_spin_unlock(&cfs_b->lock);
1518} 1674}
1519 1675
1520static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1676void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1521{ 1677{
1522 struct rq *rq = rq_of(cfs_rq); 1678 struct rq *rq = rq_of(cfs_rq);
1523 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1679 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1743,7 +1899,10 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1743 1899
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1900static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{ 1901{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) 1902 if (!cfs_bandwidth_used())
1903 return;
1904
1905 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1747 return; 1906 return;
1748 1907
1749 __return_cfs_rq_runtime(cfs_rq); 1908 __return_cfs_rq_runtime(cfs_rq);
@@ -1788,6 +1947,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1788 */ 1947 */
1789static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 1948static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1790{ 1949{
1950 if (!cfs_bandwidth_used())
1951 return;
1952
1791 /* an active group must be handled by the update_curr()->put() path */ 1953 /* an active group must be handled by the update_curr()->put() path */
1792 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 1954 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1793 return; 1955 return;
@@ -1805,6 +1967,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1805/* conditionally throttle active cfs_rq's from put_prev_entity() */ 1967/* conditionally throttle active cfs_rq's from put_prev_entity() */
1806static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1968static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1807{ 1969{
1970 if (!cfs_bandwidth_used())
1971 return;
1972
1808 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 1973 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1809 return; 1974 return;
1810 1975
@@ -1817,7 +1982,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1817 1982
1818 throttle_cfs_rq(cfs_rq); 1983 throttle_cfs_rq(cfs_rq);
1819} 1984}
1820#else 1985
1986static inline u64 default_cfs_period(void);
1987static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1988static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1989
1990static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1991{
1992 struct cfs_bandwidth *cfs_b =
1993 container_of(timer, struct cfs_bandwidth, slack_timer);
1994 do_sched_cfs_slack_timer(cfs_b);
1995
1996 return HRTIMER_NORESTART;
1997}
1998
1999static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2000{
2001 struct cfs_bandwidth *cfs_b =
2002 container_of(timer, struct cfs_bandwidth, period_timer);
2003 ktime_t now;
2004 int overrun;
2005 int idle = 0;
2006
2007 for (;;) {
2008 now = hrtimer_cb_get_time(timer);
2009 overrun = hrtimer_forward(timer, now, cfs_b->period);
2010
2011 if (!overrun)
2012 break;
2013
2014 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2015 }
2016
2017 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2018}
2019
2020void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2021{
2022 raw_spin_lock_init(&cfs_b->lock);
2023 cfs_b->runtime = 0;
2024 cfs_b->quota = RUNTIME_INF;
2025 cfs_b->period = ns_to_ktime(default_cfs_period());
2026
2027 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2028 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->period_timer.function = sched_cfs_period_timer;
2030 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2031 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2032}
2033
2034static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2035{
2036 cfs_rq->runtime_enabled = 0;
2037 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2038}
2039
2040/* requires cfs_b->lock, may release to reprogram timer */
2041void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2042{
2043 /*
2044 * The timer may be active because we're trying to set a new bandwidth
2045 * period or because we're racing with the tear-down path
2046 * (timer_active==0 becomes visible before the hrtimer call-back
2047 * terminates). In either case we ensure that it's re-programmed
2048 */
2049 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2050 raw_spin_unlock(&cfs_b->lock);
2051 /* ensure cfs_b->lock is available while we wait */
2052 hrtimer_cancel(&cfs_b->period_timer);
2053
2054 raw_spin_lock(&cfs_b->lock);
2055 /* if someone else restarted the timer then we're done */
2056 if (cfs_b->timer_active)
2057 return;
2058 }
2059
2060 cfs_b->timer_active = 1;
2061 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2062}
2063
2064static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2065{
2066 hrtimer_cancel(&cfs_b->period_timer);
2067 hrtimer_cancel(&cfs_b->slack_timer);
2068}
2069
2070void unthrottle_offline_cfs_rqs(struct rq *rq)
2071{
2072 struct cfs_rq *cfs_rq;
2073
2074 for_each_leaf_cfs_rq(rq, cfs_rq) {
2075 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2076
2077 if (!cfs_rq->runtime_enabled)
2078 continue;
2079
2080 /*
2081 * clock_task is not advancing so we just need to make sure
2082 * there's some valid quota amount
2083 */
2084 cfs_rq->runtime_remaining = cfs_b->quota;
2085 if (cfs_rq_throttled(cfs_rq))
2086 unthrottle_cfs_rq(cfs_rq);
2087 }
2088}
2089
2090#else /* CONFIG_CFS_BANDWIDTH */
1821static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2091static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1822 unsigned long delta_exec) {} 2092 unsigned long delta_exec) {}
1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2093static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1839,8 +2109,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1839{ 2109{
1840 return 0; 2110 return 0;
1841} 2111}
2112
2113void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2114
2115#ifdef CONFIG_FAIR_GROUP_SCHED
2116static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1842#endif 2117#endif
1843 2118
2119static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2120{
2121 return NULL;
2122}
2123static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2124void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2125
2126#endif /* CONFIG_CFS_BANDWIDTH */
2127
1844/************************************************** 2128/**************************************************
1845 * CFS operations on tasks: 2129 * CFS operations on tasks:
1846 */ 2130 */
@@ -1853,7 +2137,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1853 2137
1854 WARN_ON(task_rq(p) != rq); 2138 WARN_ON(task_rq(p) != rq);
1855 2139
1856 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2140 if (cfs_rq->nr_running > 1) {
1857 u64 slice = sched_slice(cfs_rq, se); 2141 u64 slice = sched_slice(cfs_rq, se);
1858 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2142 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1859 s64 delta = slice - ran; 2143 s64 delta = slice - ran;
@@ -1884,7 +2168,7 @@ static void hrtick_update(struct rq *rq)
1884{ 2168{
1885 struct task_struct *curr = rq->curr; 2169 struct task_struct *curr = rq->curr;
1886 2170
1887 if (curr->sched_class != &fair_sched_class) 2171 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1888 return; 2172 return;
1889 2173
1890 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2174 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2007,6 +2291,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2007} 2291}
2008 2292
2009#ifdef CONFIG_SMP 2293#ifdef CONFIG_SMP
2294/* Used instead of source_load when we know the type == 0 */
2295static unsigned long weighted_cpuload(const int cpu)
2296{
2297 return cpu_rq(cpu)->load.weight;
2298}
2299
2300/*
2301 * Return a low guess at the load of a migration-source cpu weighted
2302 * according to the scheduling class and "nice" value.
2303 *
2304 * We want to under-estimate the load of migration sources, to
2305 * balance conservatively.
2306 */
2307static unsigned long source_load(int cpu, int type)
2308{
2309 struct rq *rq = cpu_rq(cpu);
2310 unsigned long total = weighted_cpuload(cpu);
2311
2312 if (type == 0 || !sched_feat(LB_BIAS))
2313 return total;
2314
2315 return min(rq->cpu_load[type-1], total);
2316}
2317
2318/*
2319 * Return a high guess at the load of a migration-target cpu weighted
2320 * according to the scheduling class and "nice" value.
2321 */
2322static unsigned long target_load(int cpu, int type)
2323{
2324 struct rq *rq = cpu_rq(cpu);
2325 unsigned long total = weighted_cpuload(cpu);
2326
2327 if (type == 0 || !sched_feat(LB_BIAS))
2328 return total;
2329
2330 return max(rq->cpu_load[type-1], total);
2331}
2332
2333static unsigned long power_of(int cpu)
2334{
2335 return cpu_rq(cpu)->cpu_power;
2336}
2337
2338static unsigned long cpu_avg_load_per_task(int cpu)
2339{
2340 struct rq *rq = cpu_rq(cpu);
2341 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2342
2343 if (nr_running)
2344 return rq->load.weight / nr_running;
2345
2346 return 0;
2347}
2348
2010 2349
2011static void task_waking_fair(struct task_struct *p) 2350static void task_waking_fair(struct task_struct *p)
2012{ 2351{
@@ -2036,36 +2375,100 @@ static void task_waking_fair(struct task_struct *p)
2036 * Adding load to a group doesn't make a group heavier, but can cause movement 2375 * Adding load to a group doesn't make a group heavier, but can cause movement
2037 * of group shares between cpus. Assuming the shares were perfectly aligned one 2376 * of group shares between cpus. Assuming the shares were perfectly aligned one
2038 * can calculate the shift in shares. 2377 * can calculate the shift in shares.
2378 *
2379 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2380 * on this @cpu and results in a total addition (subtraction) of @wg to the
2381 * total group weight.
2382 *
2383 * Given a runqueue weight distribution (rw_i) we can compute a shares
2384 * distribution (s_i) using:
2385 *
2386 * s_i = rw_i / \Sum rw_j (1)
2387 *
2388 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2389 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2390 * shares distribution (s_i):
2391 *
2392 * rw_i = { 2, 4, 1, 0 }
2393 * s_i = { 2/7, 4/7, 1/7, 0 }
2394 *
2395 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2396 * task used to run on and the CPU the waker is running on), we need to
2397 * compute the effect of waking a task on either CPU and, in case of a sync
2398 * wakeup, compute the effect of the current task going to sleep.
2399 *
2400 * So for a change of @wl to the local @cpu with an overall group weight change
2401 * of @wg we can compute the new shares distribution (s'_i) using:
2402 *
2403 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
2404 *
2405 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2406 * differences in waking a task to CPU 0. The additional task changes the
2407 * weight and shares distributions like:
2408 *
2409 * rw'_i = { 3, 4, 1, 0 }
2410 * s'_i = { 3/8, 4/8, 1/8, 0 }
2411 *
2412 * We can then compute the difference in effective weight by using:
2413 *
2414 * dw_i = S * (s'_i - s_i) (3)
2415 *
2416 * Where 'S' is the group weight as seen by its parent.
2417 *
2418 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2419 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2420 * 4/7) times the weight of the group.
2039 */ 2421 */
2040static long effective_load(struct task_group *tg, int cpu, long wl, long wg) 2422static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
2041{ 2423{
2042 struct sched_entity *se = tg->se[cpu]; 2424 struct sched_entity *se = tg->se[cpu];
2043 2425
2044 if (!tg->parent) 2426 if (!tg->parent) /* the trivial, non-cgroup case */
2045 return wl; 2427 return wl;
2046 2428
2047 for_each_sched_entity(se) { 2429 for_each_sched_entity(se) {
2048 long lw, w; 2430 long w, W;
2049 2431
2050 tg = se->my_q->tg; 2432 tg = se->my_q->tg;
2051 w = se->my_q->load.weight;
2052 2433
2053 /* use this cpu's instantaneous contribution */ 2434 /*
2054 lw = atomic_read(&tg->load_weight); 2435 * W = @wg + \Sum rw_j
2055 lw -= se->my_q->load_contribution; 2436 */
2056 lw += w + wg; 2437 W = wg + calc_tg_weight(tg, se->my_q);
2057 2438
2058 wl += w; 2439 /*
2440 * w = rw_i + @wl
2441 */
2442 w = se->my_q->load.weight + wl;
2059 2443
2060 if (lw > 0 && wl < lw) 2444 /*
2061 wl = (wl * tg->shares) / lw; 2445 * wl = S * s'_i; see (2)
2446 */
2447 if (W > 0 && w < W)
2448 wl = (w * tg->shares) / W;
2062 else 2449 else
2063 wl = tg->shares; 2450 wl = tg->shares;
2064 2451
2065 /* zero point is MIN_SHARES */ 2452 /*
2453 * Per the above, wl is the new se->load.weight value; since
2454 * those are clipped to [MIN_SHARES, ...) do so now. See
2455 * calc_cfs_shares().
2456 */
2066 if (wl < MIN_SHARES) 2457 if (wl < MIN_SHARES)
2067 wl = MIN_SHARES; 2458 wl = MIN_SHARES;
2459
2460 /*
2461 * wl = dw_i = S * (s'_i - s_i); see (3)
2462 */
2068 wl -= se->load.weight; 2463 wl -= se->load.weight;
2464
2465 /*
2466 * Recursively apply this logic to all parent groups to compute
2467 * the final effective load change on the root group. Since
2468 * only the @tg group gets extra weight, all parent groups can
2469 * only redistribute existing shares. @wl is the shift in shares
2470 * resulting from this level per the above.
2471 */
2069 wg = 0; 2472 wg = 0;
2070 } 2473 }
2071 2474
@@ -2249,6 +2652,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2249 int cpu = smp_processor_id(); 2652 int cpu = smp_processor_id();
2250 int prev_cpu = task_cpu(p); 2653 int prev_cpu = task_cpu(p);
2251 struct sched_domain *sd; 2654 struct sched_domain *sd;
2655 struct sched_group *sg;
2252 int i; 2656 int i;
2253 2657
2254 /* 2658 /*
@@ -2269,25 +2673,28 @@ static int select_idle_sibling(struct task_struct *p, int target)
2269 * Otherwise, iterate the domains and find an eligible idle cpu. 2673 * Otherwise, iterate the domains and find an eligible idle cpu.
2270 */ 2674 */
2271 rcu_read_lock(); 2675 rcu_read_lock();
2272 for_each_domain(target, sd) {
2273 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
2274 break;
2275 2676
2276 for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) { 2677 sd = rcu_dereference(per_cpu(sd_llc, target));
2277 if (idle_cpu(i)) { 2678 for_each_lower_domain(sd) {
2278 target = i; 2679 sg = sd->groups;
2279 break; 2680 do {
2681 if (!cpumask_intersects(sched_group_cpus(sg),
2682 tsk_cpus_allowed(p)))
2683 goto next;
2684
2685 for_each_cpu(i, sched_group_cpus(sg)) {
2686 if (!idle_cpu(i))
2687 goto next;
2280 } 2688 }
2281 }
2282 2689
2283 /* 2690 target = cpumask_first_and(sched_group_cpus(sg),
2284 * Lets stop looking for an idle sibling when we reached 2691 tsk_cpus_allowed(p));
2285 * the domain that spans the current cpu and prev_cpu. 2692 goto done;
2286 */ 2693next:
2287 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && 2694 sg = sg->next;
2288 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 2695 } while (sg != sd->groups);
2289 break;
2290 } 2696 }
2697done:
2291 rcu_read_unlock(); 2698 rcu_read_unlock();
2292 2699
2293 return target; 2700 return target;
@@ -2315,6 +2722,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2315 int want_sd = 1; 2722 int want_sd = 1;
2316 int sync = wake_flags & WF_SYNC; 2723 int sync = wake_flags & WF_SYNC;
2317 2724
2725 if (p->rt.nr_cpus_allowed == 1)
2726 return prev_cpu;
2727
2318 if (sd_flag & SD_BALANCE_WAKE) { 2728 if (sd_flag & SD_BALANCE_WAKE) {
2319 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 2729 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2320 want_affine = 1; 2730 want_affine = 1;
@@ -2599,7 +3009,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
2599 } while (cfs_rq); 3009 } while (cfs_rq);
2600 3010
2601 p = task_of(se); 3011 p = task_of(se);
2602 hrtick_start_fair(rq, p); 3012 if (hrtick_enabled(rq))
3013 hrtick_start_fair(rq, p);
2603 3014
2604 return p; 3015 return p;
2605} 3016}
@@ -2643,6 +3054,12 @@ static void yield_task_fair(struct rq *rq)
2643 * Update run-time statistics of the 'current'. 3054 * Update run-time statistics of the 'current'.
2644 */ 3055 */
2645 update_curr(cfs_rq); 3056 update_curr(cfs_rq);
3057 /*
3058 * Tell update_rq_clock() that we've just updated,
3059 * so we don't do microscopic update in schedule()
3060 * and double the fastpath cost.
3061 */
3062 rq->skip_clock_update = 1;
2646 } 3063 }
2647 3064
2648 set_skip_buddy(se); 3065 set_skip_buddy(se);
@@ -2683,12 +3100,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2683} 3100}
2684 3101
2685/* 3102/*
3103 * Is this task likely cache-hot:
3104 */
3105static int
3106task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3107{
3108 s64 delta;
3109
3110 if (p->sched_class != &fair_sched_class)
3111 return 0;
3112
3113 if (unlikely(p->policy == SCHED_IDLE))
3114 return 0;
3115
3116 /*
3117 * Buddy candidates are cache hot:
3118 */
3119 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3120 (&p->se == cfs_rq_of(&p->se)->next ||
3121 &p->se == cfs_rq_of(&p->se)->last))
3122 return 1;
3123
3124 if (sysctl_sched_migration_cost == -1)
3125 return 1;
3126 if (sysctl_sched_migration_cost == 0)
3127 return 0;
3128
3129 delta = now - p->se.exec_start;
3130
3131 return delta < (s64)sysctl_sched_migration_cost;
3132}
3133
3134#define LBF_ALL_PINNED 0x01
3135#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3136#define LBF_HAD_BREAK 0x04
3137#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
3138#define LBF_ABORT 0x10
3139
3140/*
2686 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3141 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2687 */ 3142 */
2688static 3143static
2689int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3144int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2690 struct sched_domain *sd, enum cpu_idle_type idle, 3145 struct sched_domain *sd, enum cpu_idle_type idle,
2691 int *all_pinned) 3146 int *lb_flags)
2692{ 3147{
2693 int tsk_cache_hot = 0; 3148 int tsk_cache_hot = 0;
2694 /* 3149 /*
@@ -2701,7 +3156,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2701 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3156 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2702 return 0; 3157 return 0;
2703 } 3158 }
2704 *all_pinned = 0; 3159 *lb_flags &= ~LBF_ALL_PINNED;
2705 3160
2706 if (task_running(rq, p)) { 3161 if (task_running(rq, p)) {
2707 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3162 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2775,7 +3230,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2775static unsigned long 3230static unsigned long
2776balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3231balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2777 unsigned long max_load_move, struct sched_domain *sd, 3232 unsigned long max_load_move, struct sched_domain *sd,
2778 enum cpu_idle_type idle, int *all_pinned, 3233 enum cpu_idle_type idle, int *lb_flags,
2779 struct cfs_rq *busiest_cfs_rq) 3234 struct cfs_rq *busiest_cfs_rq)
2780{ 3235{
2781 int loops = 0, pulled = 0; 3236 int loops = 0, pulled = 0;
@@ -2786,12 +3241,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2786 goto out; 3241 goto out;
2787 3242
2788 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3243 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2789 if (loops++ > sysctl_sched_nr_migrate) 3244 if (loops++ > sysctl_sched_nr_migrate) {
3245 *lb_flags |= LBF_NEED_BREAK;
2790 break; 3246 break;
3247 }
2791 3248
2792 if ((p->se.load.weight >> 1) > rem_load_move || 3249 if ((p->se.load.weight >> 1) > rem_load_move ||
2793 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3250 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2794 all_pinned)) 3251 lb_flags))
2795 continue; 3252 continue;
2796 3253
2797 pull_task(busiest, p, this_rq, this_cpu); 3254 pull_task(busiest, p, this_rq, this_cpu);
@@ -2804,8 +3261,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2804 * kernels will stop after the first task is pulled to minimize 3261 * kernels will stop after the first task is pulled to minimize
2805 * the critical section. 3262 * the critical section.
2806 */ 3263 */
2807 if (idle == CPU_NEWLY_IDLE) 3264 if (idle == CPU_NEWLY_IDLE) {
3265 *lb_flags |= LBF_ABORT;
2808 break; 3266 break;
3267 }
2809#endif 3268#endif
2810 3269
2811 /* 3270 /*
@@ -2910,7 +3369,7 @@ static unsigned long
2910load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3369load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2911 unsigned long max_load_move, 3370 unsigned long max_load_move,
2912 struct sched_domain *sd, enum cpu_idle_type idle, 3371 struct sched_domain *sd, enum cpu_idle_type idle,
2913 int *all_pinned) 3372 int *lb_flags)
2914{ 3373{
2915 long rem_load_move = max_load_move; 3374 long rem_load_move = max_load_move;
2916 struct cfs_rq *busiest_cfs_rq; 3375 struct cfs_rq *busiest_cfs_rq;
@@ -2923,6 +3382,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2923 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3382 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
2924 u64 rem_load, moved_load; 3383 u64 rem_load, moved_load;
2925 3384
3385 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3386 break;
3387
2926 /* 3388 /*
2927 * empty group or part of a throttled hierarchy 3389 * empty group or part of a throttled hierarchy
2928 */ 3390 */
@@ -2934,7 +3396,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2934 rem_load = div_u64(rem_load, busiest_h_load + 1); 3396 rem_load = div_u64(rem_load, busiest_h_load + 1);
2935 3397
2936 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3398 moved_load = balance_tasks(this_rq, this_cpu, busiest,
2937 rem_load, sd, idle, all_pinned, 3399 rem_load, sd, idle, lb_flags,
2938 busiest_cfs_rq); 3400 busiest_cfs_rq);
2939 3401
2940 if (!moved_load) 3402 if (!moved_load)
@@ -2960,10 +3422,10 @@ static unsigned long
2960load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3422load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2961 unsigned long max_load_move, 3423 unsigned long max_load_move,
2962 struct sched_domain *sd, enum cpu_idle_type idle, 3424 struct sched_domain *sd, enum cpu_idle_type idle,
2963 int *all_pinned) 3425 int *lb_flags)
2964{ 3426{
2965 return balance_tasks(this_rq, this_cpu, busiest, 3427 return balance_tasks(this_rq, this_cpu, busiest,
2966 max_load_move, sd, idle, all_pinned, 3428 max_load_move, sd, idle, lb_flags,
2967 &busiest->cfs); 3429 &busiest->cfs);
2968} 3430}
2969#endif 3431#endif
@@ -2978,29 +3440,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2978static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3440static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2979 unsigned long max_load_move, 3441 unsigned long max_load_move,
2980 struct sched_domain *sd, enum cpu_idle_type idle, 3442 struct sched_domain *sd, enum cpu_idle_type idle,
2981 int *all_pinned) 3443 int *lb_flags)
2982{ 3444{
2983 unsigned long total_load_moved = 0, load_moved; 3445 unsigned long total_load_moved = 0, load_moved;
2984 3446
2985 do { 3447 do {
2986 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3448 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2987 max_load_move - total_load_moved, 3449 max_load_move - total_load_moved,
2988 sd, idle, all_pinned); 3450 sd, idle, lb_flags);
2989 3451
2990 total_load_moved += load_moved; 3452 total_load_moved += load_moved;
2991 3453
3454 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3455 break;
3456
2992#ifdef CONFIG_PREEMPT 3457#ifdef CONFIG_PREEMPT
2993 /* 3458 /*
2994 * NEWIDLE balancing is a source of latency, so preemptible 3459 * NEWIDLE balancing is a source of latency, so preemptible
2995 * kernels will stop after the first task is pulled to minimize 3460 * kernels will stop after the first task is pulled to minimize
2996 * the critical section. 3461 * the critical section.
2997 */ 3462 */
2998 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3463 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
2999 break; 3464 *lb_flags |= LBF_ABORT;
3000
3001 if (raw_spin_is_contended(&this_rq->lock) ||
3002 raw_spin_is_contended(&busiest->lock))
3003 break; 3465 break;
3466 }
3004#endif 3467#endif
3005 } while (load_moved && max_load_move > total_load_moved); 3468 } while (load_moved && max_load_move > total_load_moved);
3006 3469
@@ -3062,15 +3525,6 @@ struct sg_lb_stats {
3062}; 3525};
3063 3526
3064/** 3527/**
3065 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3066 * @group: The group whose first cpu is to be returned.
3067 */
3068static inline unsigned int group_first_cpu(struct sched_group *group)
3069{
3070 return cpumask_first(sched_group_cpus(group));
3071}
3072
3073/**
3074 * get_sd_load_idx - Obtain the load index for a given sched domain. 3528 * get_sd_load_idx - Obtain the load index for a given sched domain.
3075 * @sd: The sched_domain whose load_idx is to be obtained. 3529 * @sd: The sched_domain whose load_idx is to be obtained.
3076 * @idle: The Idle status of the CPU for whose sd load_idx is obtained. 3530 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
@@ -3319,7 +3773,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3319 sdg->sgp->power = power; 3773 sdg->sgp->power = power;
3320} 3774}
3321 3775
3322static void update_group_power(struct sched_domain *sd, int cpu) 3776void update_group_power(struct sched_domain *sd, int cpu)
3323{ 3777{
3324 struct sched_domain *child = sd->child; 3778 struct sched_domain *child = sd->child;
3325 struct sched_group *group, *sdg = sd->groups; 3779 struct sched_group *group, *sdg = sd->groups;
@@ -3511,7 +3965,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3511} 3965}
3512 3966
3513/** 3967/**
3514 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3968 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3515 * @sd: sched_domain whose statistics are to be updated. 3969 * @sd: sched_domain whose statistics are to be updated.
3516 * @this_cpu: Cpu for which load balance is currently performed. 3970 * @this_cpu: Cpu for which load balance is currently performed.
3517 * @idle: Idle status of this_cpu 3971 * @idle: Idle status of this_cpu
@@ -3585,11 +4039,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3585 } while (sg != sd->groups); 4039 } while (sg != sd->groups);
3586} 4040}
3587 4041
3588int __weak arch_sd_sibling_asym_packing(void)
3589{
3590 return 0*SD_ASYM_PACKING;
3591}
3592
3593/** 4042/**
3594 * check_asym_packing - Check to see if the group is packed into the 4043 * check_asym_packing - Check to see if the group is packed into the
3595 * sched domain. 4044 * sched domain.
@@ -3953,7 +4402,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3953#define MAX_PINNED_INTERVAL 512 4402#define MAX_PINNED_INTERVAL 512
3954 4403
3955/* Working cpumask for load_balance and load_balance_newidle. */ 4404/* Working cpumask for load_balance and load_balance_newidle. */
3956static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4405DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3957 4406
3958static int need_active_balance(struct sched_domain *sd, int idle, 4407static int need_active_balance(struct sched_domain *sd, int idle,
3959 int busiest_cpu, int this_cpu) 4408 int busiest_cpu, int this_cpu)
@@ -4004,7 +4453,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4004 struct sched_domain *sd, enum cpu_idle_type idle, 4453 struct sched_domain *sd, enum cpu_idle_type idle,
4005 int *balance) 4454 int *balance)
4006{ 4455{
4007 int ld_moved, all_pinned = 0, active_balance = 0; 4456 int ld_moved, lb_flags = 0, active_balance = 0;
4008 struct sched_group *group; 4457 struct sched_group *group;
4009 unsigned long imbalance; 4458 unsigned long imbalance;
4010 struct rq *busiest; 4459 struct rq *busiest;
@@ -4045,11 +4494,11 @@ redo:
4045 * still unbalanced. ld_moved simply stays zero, so it is 4494 * still unbalanced. ld_moved simply stays zero, so it is
4046 * correctly treated as an imbalance. 4495 * correctly treated as an imbalance.
4047 */ 4496 */
4048 all_pinned = 1; 4497 lb_flags |= LBF_ALL_PINNED;
4049 local_irq_save(flags); 4498 local_irq_save(flags);
4050 double_rq_lock(this_rq, busiest); 4499 double_rq_lock(this_rq, busiest);
4051 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4500 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4052 imbalance, sd, idle, &all_pinned); 4501 imbalance, sd, idle, &lb_flags);
4053 double_rq_unlock(this_rq, busiest); 4502 double_rq_unlock(this_rq, busiest);
4054 local_irq_restore(flags); 4503 local_irq_restore(flags);
4055 4504
@@ -4059,8 +4508,18 @@ redo:
4059 if (ld_moved && this_cpu != smp_processor_id()) 4508 if (ld_moved && this_cpu != smp_processor_id())
4060 resched_cpu(this_cpu); 4509 resched_cpu(this_cpu);
4061 4510
4511 if (lb_flags & LBF_ABORT)
4512 goto out_balanced;
4513
4514 if (lb_flags & LBF_NEED_BREAK) {
4515 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4516 if (lb_flags & LBF_ABORT)
4517 goto out_balanced;
4518 goto redo;
4519 }
4520
4062 /* All tasks on this runqueue were pinned by CPU affinity */ 4521 /* All tasks on this runqueue were pinned by CPU affinity */
4063 if (unlikely(all_pinned)) { 4522 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
4064 cpumask_clear_cpu(cpu_of(busiest), cpus); 4523 cpumask_clear_cpu(cpu_of(busiest), cpus);
4065 if (!cpumask_empty(cpus)) 4524 if (!cpumask_empty(cpus))
4066 goto redo; 4525 goto redo;
@@ -4090,7 +4549,7 @@ redo:
4090 tsk_cpus_allowed(busiest->curr))) { 4549 tsk_cpus_allowed(busiest->curr))) {
4091 raw_spin_unlock_irqrestore(&busiest->lock, 4550 raw_spin_unlock_irqrestore(&busiest->lock,
4092 flags); 4551 flags);
4093 all_pinned = 1; 4552 lb_flags |= LBF_ALL_PINNED;
4094 goto out_one_pinned; 4553 goto out_one_pinned;
4095 } 4554 }
4096 4555
@@ -4143,7 +4602,8 @@ out_balanced:
4143 4602
4144out_one_pinned: 4603out_one_pinned:
4145 /* tune up the balancing interval */ 4604 /* tune up the balancing interval */
4146 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4605 if (((lb_flags & LBF_ALL_PINNED) &&
4606 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4147 (sd->balance_interval < sd->max_interval)) 4607 (sd->balance_interval < sd->max_interval))
4148 sd->balance_interval *= 2; 4608 sd->balance_interval *= 2;
4149 4609
@@ -4156,7 +4616,7 @@ out:
4156 * idle_balance is called by schedule() if this_cpu is about to become 4616 * idle_balance is called by schedule() if this_cpu is about to become
4157 * idle. Attempts to pull tasks from other CPUs. 4617 * idle. Attempts to pull tasks from other CPUs.
4158 */ 4618 */
4159static void idle_balance(int this_cpu, struct rq *this_rq) 4619void idle_balance(int this_cpu, struct rq *this_rq)
4160{ 4620{
4161 struct sched_domain *sd; 4621 struct sched_domain *sd;
4162 int pulled_task = 0; 4622 int pulled_task = 0;
@@ -4271,28 +4731,16 @@ out_unlock:
4271#ifdef CONFIG_NO_HZ 4731#ifdef CONFIG_NO_HZ
4272/* 4732/*
4273 * idle load balancing details 4733 * idle load balancing details
4274 * - One of the idle CPUs nominates itself as idle load_balancer, while
4275 * entering idle.
4276 * - This idle load balancer CPU will also go into tickless mode when
4277 * it is idle, just like all other idle CPUs
4278 * - When one of the busy CPUs notice that there may be an idle rebalancing 4734 * - When one of the busy CPUs notice that there may be an idle rebalancing
4279 * needed, they will kick the idle load balancer, which then does idle 4735 * needed, they will kick the idle load balancer, which then does idle
4280 * load balancing for all the idle CPUs. 4736 * load balancing for all the idle CPUs.
4281 */ 4737 */
4282static struct { 4738static struct {
4283 atomic_t load_balancer;
4284 atomic_t first_pick_cpu;
4285 atomic_t second_pick_cpu;
4286 cpumask_var_t idle_cpus_mask; 4739 cpumask_var_t idle_cpus_mask;
4287 cpumask_var_t grp_idle_mask; 4740 atomic_t nr_cpus;
4288 unsigned long next_balance; /* in jiffy units */ 4741 unsigned long next_balance; /* in jiffy units */
4289} nohz ____cacheline_aligned; 4742} nohz ____cacheline_aligned;
4290 4743
4291int get_nohz_load_balancer(void)
4292{
4293 return atomic_read(&nohz.load_balancer);
4294}
4295
4296#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4744#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4297/** 4745/**
4298 * lowest_flag_domain - Return lowest sched_domain containing flag. 4746 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4329,33 +4777,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4329 (sd && (sd->flags & flag)); sd = sd->parent) 4777 (sd && (sd->flags & flag)); sd = sd->parent)
4330 4778
4331/** 4779/**
4332 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4333 * @ilb_group: group to be checked for semi-idleness
4334 *
4335 * Returns: 1 if the group is semi-idle. 0 otherwise.
4336 *
4337 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4338 * and atleast one non-idle CPU. This helper function checks if the given
4339 * sched_group is semi-idle or not.
4340 */
4341static inline int is_semi_idle_group(struct sched_group *ilb_group)
4342{
4343 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4344 sched_group_cpus(ilb_group));
4345
4346 /*
4347 * A sched_group is semi-idle when it has atleast one busy cpu
4348 * and atleast one idle cpu.
4349 */
4350 if (cpumask_empty(nohz.grp_idle_mask))
4351 return 0;
4352
4353 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4354 return 0;
4355
4356 return 1;
4357}
4358/**
4359 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4780 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4360 * @cpu: The cpu which is nominating a new idle_load_balancer. 4781 * @cpu: The cpu which is nominating a new idle_load_balancer.
4361 * 4782 *
@@ -4369,9 +4790,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
4369 */ 4790 */
4370static int find_new_ilb(int cpu) 4791static int find_new_ilb(int cpu)
4371{ 4792{
4793 int ilb = cpumask_first(nohz.idle_cpus_mask);
4794 struct sched_group *ilbg;
4372 struct sched_domain *sd; 4795 struct sched_domain *sd;
4373 struct sched_group *ilb_group;
4374 int ilb = nr_cpu_ids;
4375 4796
4376 /* 4797 /*
4377 * Have idle load balancer selection from semi-idle packages only 4798 * Have idle load balancer selection from semi-idle packages only
@@ -4389,23 +4810,28 @@ static int find_new_ilb(int cpu)
4389 4810
4390 rcu_read_lock(); 4811 rcu_read_lock();
4391 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4812 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4392 ilb_group = sd->groups; 4813 ilbg = sd->groups;
4393 4814
4394 do { 4815 do {
4395 if (is_semi_idle_group(ilb_group)) { 4816 if (ilbg->group_weight !=
4396 ilb = cpumask_first(nohz.grp_idle_mask); 4817 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4818 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4819 sched_group_cpus(ilbg));
4397 goto unlock; 4820 goto unlock;
4398 } 4821 }
4399 4822
4400 ilb_group = ilb_group->next; 4823 ilbg = ilbg->next;
4401 4824
4402 } while (ilb_group != sd->groups); 4825 } while (ilbg != sd->groups);
4403 } 4826 }
4404unlock: 4827unlock:
4405 rcu_read_unlock(); 4828 rcu_read_unlock();
4406 4829
4407out_done: 4830out_done:
4408 return ilb; 4831 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4832 return ilb;
4833
4834 return nr_cpu_ids;
4409} 4835}
4410#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4836#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4411static inline int find_new_ilb(int call_cpu) 4837static inline int find_new_ilb(int call_cpu)
@@ -4425,102 +4851,98 @@ static void nohz_balancer_kick(int cpu)
4425 4851
4426 nohz.next_balance++; 4852 nohz.next_balance++;
4427 4853
4428 ilb_cpu = get_nohz_load_balancer(); 4854 ilb_cpu = find_new_ilb(cpu);
4429 4855
4430 if (ilb_cpu >= nr_cpu_ids) { 4856 if (ilb_cpu >= nr_cpu_ids)
4431 ilb_cpu = cpumask_first(nohz.idle_cpus_mask); 4857 return;
4432 if (ilb_cpu >= nr_cpu_ids)
4433 return;
4434 }
4435 4858
4436 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4859 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4437 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4860 return;
4861 /*
4862 * Use smp_send_reschedule() instead of resched_cpu().
4863 * This way we generate a sched IPI on the target cpu which
4864 * is idle. And the softirq performing nohz idle load balance
4865 * will be run before returning from the IPI.
4866 */
4867 smp_send_reschedule(ilb_cpu);
4868 return;
4869}
4438 4870
4439 smp_mb(); 4871static inline void clear_nohz_tick_stopped(int cpu)
4440 /* 4872{
4441 * Use smp_send_reschedule() instead of resched_cpu(). 4873 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4442 * This way we generate a sched IPI on the target cpu which 4874 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4443 * is idle. And the softirq performing nohz idle load balance 4875 atomic_dec(&nohz.nr_cpus);
4444 * will be run before returning from the IPI. 4876 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4445 */
4446 smp_send_reschedule(ilb_cpu);
4447 } 4877 }
4448 return;
4449} 4878}
4450 4879
4451/* 4880static inline void set_cpu_sd_state_busy(void)
4452 * This routine will try to nominate the ilb (idle load balancing)
4453 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4454 * load balancing on behalf of all those cpus.
4455 *
4456 * When the ilb owner becomes busy, we will not have new ilb owner until some
4457 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
4458 * idle load balancing by kicking one of the idle CPUs.
4459 *
4460 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
4461 * ilb owner CPU in future (when there is a need for idle load balancing on
4462 * behalf of all idle CPUs).
4463 */
4464void select_nohz_load_balancer(int stop_tick)
4465{ 4881{
4882 struct sched_domain *sd;
4466 int cpu = smp_processor_id(); 4883 int cpu = smp_processor_id();
4467 4884
4468 if (stop_tick) { 4885 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4469 if (!cpu_active(cpu)) { 4886 return;
4470 if (atomic_read(&nohz.load_balancer) != cpu) 4887 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4471 return;
4472 4888
4473 /* 4889 rcu_read_lock();
4474 * If we are going offline and still the leader, 4890 for_each_domain(cpu, sd)
4475 * give up! 4891 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4476 */ 4892 rcu_read_unlock();
4477 if (atomic_cmpxchg(&nohz.load_balancer, cpu, 4893}
4478 nr_cpu_ids) != cpu)
4479 BUG();
4480 4894
4481 return; 4895void set_cpu_sd_state_idle(void)
4482 } 4896{
4897 struct sched_domain *sd;
4898 int cpu = smp_processor_id();
4483 4899
4484 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4900 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4901 return;
4902 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4485 4903
4486 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4904 rcu_read_lock();
4487 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4905 for_each_domain(cpu, sd)
4488 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4906 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4489 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 4907 rcu_read_unlock();
4908}
4490 4909
4491 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4910/*
4492 int new_ilb; 4911 * This routine will record that this cpu is going idle with tick stopped.
4912 * This info will be used in performing idle load balancing in the future.
4913 */
4914void select_nohz_load_balancer(int stop_tick)
4915{
4916 int cpu = smp_processor_id();
4493 4917
4494 /* make me the ilb owner */ 4918 /*
4495 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4919 * If this cpu is going down, then nothing needs to be done.
4496 cpu) != nr_cpu_ids) 4920 */
4497 return; 4921 if (!cpu_active(cpu))
4922 return;
4498 4923
4499 /* 4924 if (stop_tick) {
4500 * Check to see if there is a more power-efficient 4925 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4501 * ilb.
4502 */
4503 new_ilb = find_new_ilb(cpu);
4504 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4505 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4506 resched_cpu(new_ilb);
4507 return;
4508 }
4509 return;
4510 }
4511 } else {
4512 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4513 return; 4926 return;
4514 4927
4515 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4928 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4516 4929 atomic_inc(&nohz.nr_cpus);
4517 if (atomic_read(&nohz.load_balancer) == cpu) 4930 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4518 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4519 nr_cpu_ids) != cpu)
4520 BUG();
4521 } 4931 }
4522 return; 4932 return;
4523} 4933}
4934
4935static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4936 unsigned long action, void *hcpu)
4937{
4938 switch (action & ~CPU_TASKS_FROZEN) {
4939 case CPU_DYING:
4940 clear_nohz_tick_stopped(smp_processor_id());
4941 return NOTIFY_OK;
4942 default:
4943 return NOTIFY_DONE;
4944 }
4945}
4524#endif 4946#endif
4525 4947
4526static DEFINE_SPINLOCK(balancing); 4948static DEFINE_SPINLOCK(balancing);
@@ -4531,7 +4953,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4531 * Scale the max load_balance interval with the number of CPUs in the system. 4953 * Scale the max load_balance interval with the number of CPUs in the system.
4532 * This trades load-balance latency on larger machines for less cross talk. 4954 * This trades load-balance latency on larger machines for less cross talk.
4533 */ 4955 */
4534static void update_max_interval(void) 4956void update_max_interval(void)
4535{ 4957{
4536 max_load_balance_interval = HZ*num_online_cpus()/10; 4958 max_load_balance_interval = HZ*num_online_cpus()/10;
4537} 4959}
@@ -4623,11 +5045,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4623 struct rq *rq; 5045 struct rq *rq;
4624 int balance_cpu; 5046 int balance_cpu;
4625 5047
4626 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5048 if (idle != CPU_IDLE ||
4627 return; 5049 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5050 goto end;
4628 5051
4629 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5052 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4630 if (balance_cpu == this_cpu) 5053 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4631 continue; 5054 continue;
4632 5055
4633 /* 5056 /*
@@ -4635,10 +5058,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4635 * work being done for other cpus. Next load 5058 * work being done for other cpus. Next load
4636 * balancing owner will pick it up. 5059 * balancing owner will pick it up.
4637 */ 5060 */
4638 if (need_resched()) { 5061 if (need_resched())
4639 this_rq->nohz_balance_kick = 0;
4640 break; 5062 break;
4641 }
4642 5063
4643 raw_spin_lock_irq(&this_rq->lock); 5064 raw_spin_lock_irq(&this_rq->lock);
4644 update_rq_clock(this_rq); 5065 update_rq_clock(this_rq);
@@ -4652,53 +5073,71 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4652 this_rq->next_balance = rq->next_balance; 5073 this_rq->next_balance = rq->next_balance;
4653 } 5074 }
4654 nohz.next_balance = this_rq->next_balance; 5075 nohz.next_balance = this_rq->next_balance;
4655 this_rq->nohz_balance_kick = 0; 5076end:
5077 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4656} 5078}
4657 5079
4658/* 5080/*
4659 * Current heuristic for kicking the idle load balancer 5081 * Current heuristic for kicking the idle load balancer in the presence
4660 * - first_pick_cpu is the one of the busy CPUs. It will kick 5082 * of an idle cpu is the system.
4661 * idle load balancer when it has more than one process active. This 5083 * - This rq has more than one task.
4662 * eliminates the need for idle load balancing altogether when we have 5084 * - At any scheduler domain level, this cpu's scheduler group has multiple
4663 * only one running process in the system (common case). 5085 * busy cpu's exceeding the group's power.
4664 * - If there are more than one busy CPU, idle load balancer may have 5086 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4665 * to run for active_load_balance to happen (i.e., two busy CPUs are 5087 * domain span are idle.
4666 * SMT or core siblings and can run better if they move to different
4667 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4668 * which will kick idle load balancer as soon as it has any load.
4669 */ 5088 */
4670static inline int nohz_kick_needed(struct rq *rq, int cpu) 5089static inline int nohz_kick_needed(struct rq *rq, int cpu)
4671{ 5090{
4672 unsigned long now = jiffies; 5091 unsigned long now = jiffies;
4673 int ret; 5092 struct sched_domain *sd;
4674 int first_pick_cpu, second_pick_cpu;
4675 5093
4676 if (time_before(now, nohz.next_balance)) 5094 if (unlikely(idle_cpu(cpu)))
4677 return 0; 5095 return 0;
4678 5096
4679 if (idle_cpu(cpu)) 5097 /*
4680 return 0; 5098 * We may be recently in ticked or tickless idle mode. At the first
5099 * busy tick after returning from idle, we will update the busy stats.
5100 */
5101 set_cpu_sd_state_busy();
5102 clear_nohz_tick_stopped(cpu);
4681 5103
4682 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5104 /*
4683 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5105 * None are in tickless mode and hence no need for NOHZ idle load
5106 * balancing.
5107 */
5108 if (likely(!atomic_read(&nohz.nr_cpus)))
5109 return 0;
4684 5110
4685 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5111 if (time_before(now, nohz.next_balance))
4686 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4687 return 0; 5112 return 0;
4688 5113
4689 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5114 if (rq->nr_running >= 2)
4690 if (ret == nr_cpu_ids || ret == cpu) { 5115 goto need_kick;
4691 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5116
4692 if (rq->nr_running > 1) 5117 rcu_read_lock();
4693 return 1; 5118 for_each_domain(cpu, sd) {
4694 } else { 5119 struct sched_group *sg = sd->groups;
4695 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5120 struct sched_group_power *sgp = sg->sgp;
4696 if (ret == nr_cpu_ids || ret == cpu) { 5121 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4697 if (rq->nr_running) 5122
4698 return 1; 5123 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4699 } 5124 goto need_kick_unlock;
5125
5126 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5127 && (cpumask_first_and(nohz.idle_cpus_mask,
5128 sched_domain_span(sd)) < cpu))
5129 goto need_kick_unlock;
5130
5131 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5132 break;
4700 } 5133 }
5134 rcu_read_unlock();
4701 return 0; 5135 return 0;
5136
5137need_kick_unlock:
5138 rcu_read_unlock();
5139need_kick:
5140 return 1;
4702} 5141}
4703#else 5142#else
4704static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5143static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -4733,14 +5172,14 @@ static inline int on_null_domain(int cpu)
4733/* 5172/*
4734 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5173 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4735 */ 5174 */
4736static inline void trigger_load_balance(struct rq *rq, int cpu) 5175void trigger_load_balance(struct rq *rq, int cpu)
4737{ 5176{
4738 /* Don't need to rebalance while attached to NULL domain */ 5177 /* Don't need to rebalance while attached to NULL domain */
4739 if (time_after_eq(jiffies, rq->next_balance) && 5178 if (time_after_eq(jiffies, rq->next_balance) &&
4740 likely(!on_null_domain(cpu))) 5179 likely(!on_null_domain(cpu)))
4741 raise_softirq(SCHED_SOFTIRQ); 5180 raise_softirq(SCHED_SOFTIRQ);
4742#ifdef CONFIG_NO_HZ 5181#ifdef CONFIG_NO_HZ
4743 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5182 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4744 nohz_balancer_kick(cpu); 5183 nohz_balancer_kick(cpu);
4745#endif 5184#endif
4746} 5185}
@@ -4755,15 +5194,6 @@ static void rq_offline_fair(struct rq *rq)
4755 update_sysctl(); 5194 update_sysctl();
4756} 5195}
4757 5196
4758#else /* CONFIG_SMP */
4759
4760/*
4761 * on UP we do not need to balance between CPUs:
4762 */
4763static inline void idle_balance(int cpu, struct rq *rq)
4764{
4765}
4766
4767#endif /* CONFIG_SMP */ 5197#endif /* CONFIG_SMP */
4768 5198
4769/* 5199/*
@@ -4787,8 +5217,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4787 */ 5217 */
4788static void task_fork_fair(struct task_struct *p) 5218static void task_fork_fair(struct task_struct *p)
4789{ 5219{
4790 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5220 struct cfs_rq *cfs_rq;
4791 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5221 struct sched_entity *se = &p->se, *curr;
4792 int this_cpu = smp_processor_id(); 5222 int this_cpu = smp_processor_id();
4793 struct rq *rq = this_rq(); 5223 struct rq *rq = this_rq();
4794 unsigned long flags; 5224 unsigned long flags;
@@ -4797,6 +5227,9 @@ static void task_fork_fair(struct task_struct *p)
4797 5227
4798 update_rq_clock(rq); 5228 update_rq_clock(rq);
4799 5229
5230 cfs_rq = task_cfs_rq(current);
5231 curr = cfs_rq->curr;
5232
4800 if (unlikely(task_cpu(p) != this_cpu)) { 5233 if (unlikely(task_cpu(p) != this_cpu)) {
4801 rcu_read_lock(); 5234 rcu_read_lock();
4802 __set_task_cpu(p, this_cpu); 5235 __set_task_cpu(p, this_cpu);
@@ -4906,6 +5339,16 @@ static void set_curr_task_fair(struct rq *rq)
4906 } 5339 }
4907} 5340}
4908 5341
5342void init_cfs_rq(struct cfs_rq *cfs_rq)
5343{
5344 cfs_rq->tasks_timeline = RB_ROOT;
5345 INIT_LIST_HEAD(&cfs_rq->tasks);
5346 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5347#ifndef CONFIG_64BIT
5348 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5349#endif
5350}
5351
4909#ifdef CONFIG_FAIR_GROUP_SCHED 5352#ifdef CONFIG_FAIR_GROUP_SCHED
4910static void task_move_group_fair(struct task_struct *p, int on_rq) 5353static void task_move_group_fair(struct task_struct *p, int on_rq)
4911{ 5354{
@@ -4922,13 +5365,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
4922 * to another cgroup's rq. This does somewhat interfere with the 5365 * to another cgroup's rq. This does somewhat interfere with the
4923 * fair sleeper stuff for the first placement, but who cares. 5366 * fair sleeper stuff for the first placement, but who cares.
4924 */ 5367 */
5368 /*
5369 * When !on_rq, vruntime of the task has usually NOT been normalized.
5370 * But there are some cases where it has already been normalized:
5371 *
5372 * - Moving a forked child which is waiting for being woken up by
5373 * wake_up_new_task().
5374 * - Moving a task which has been woken up by try_to_wake_up() and
5375 * waiting for actually being woken up by sched_ttwu_pending().
5376 *
5377 * To prevent boost or penalty in the new cfs_rq caused by delta
5378 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5379 */
5380 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5381 on_rq = 1;
5382
4925 if (!on_rq) 5383 if (!on_rq)
4926 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5384 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
4927 set_task_rq(p, task_cpu(p)); 5385 set_task_rq(p, task_cpu(p));
4928 if (!on_rq) 5386 if (!on_rq)
4929 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5387 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
4930} 5388}
5389
5390void free_fair_sched_group(struct task_group *tg)
5391{
5392 int i;
5393
5394 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5395
5396 for_each_possible_cpu(i) {
5397 if (tg->cfs_rq)
5398 kfree(tg->cfs_rq[i]);
5399 if (tg->se)
5400 kfree(tg->se[i]);
5401 }
5402
5403 kfree(tg->cfs_rq);
5404 kfree(tg->se);
5405}
5406
5407int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5408{
5409 struct cfs_rq *cfs_rq;
5410 struct sched_entity *se;
5411 int i;
5412
5413 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5414 if (!tg->cfs_rq)
5415 goto err;
5416 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5417 if (!tg->se)
5418 goto err;
5419
5420 tg->shares = NICE_0_LOAD;
5421
5422 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5423
5424 for_each_possible_cpu(i) {
5425 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5426 GFP_KERNEL, cpu_to_node(i));
5427 if (!cfs_rq)
5428 goto err;
5429
5430 se = kzalloc_node(sizeof(struct sched_entity),
5431 GFP_KERNEL, cpu_to_node(i));
5432 if (!se)
5433 goto err_free_rq;
5434
5435 init_cfs_rq(cfs_rq);
5436 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5437 }
5438
5439 return 1;
5440
5441err_free_rq:
5442 kfree(cfs_rq);
5443err:
5444 return 0;
5445}
5446
5447void unregister_fair_sched_group(struct task_group *tg, int cpu)
5448{
5449 struct rq *rq = cpu_rq(cpu);
5450 unsigned long flags;
5451
5452 /*
5453 * Only empty task groups can be destroyed; so we can speculatively
5454 * check on_list without danger of it being re-added.
5455 */
5456 if (!tg->cfs_rq[cpu]->on_list)
5457 return;
5458
5459 raw_spin_lock_irqsave(&rq->lock, flags);
5460 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5461 raw_spin_unlock_irqrestore(&rq->lock, flags);
5462}
5463
5464void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5465 struct sched_entity *se, int cpu,
5466 struct sched_entity *parent)
5467{
5468 struct rq *rq = cpu_rq(cpu);
5469
5470 cfs_rq->tg = tg;
5471 cfs_rq->rq = rq;
5472#ifdef CONFIG_SMP
5473 /* allow initial update_cfs_load() to truncate */
5474 cfs_rq->load_stamp = 1;
4931#endif 5475#endif
5476 init_cfs_rq_runtime(cfs_rq);
5477
5478 tg->cfs_rq[cpu] = cfs_rq;
5479 tg->se[cpu] = se;
5480
5481 /* se could be NULL for root_task_group */
5482 if (!se)
5483 return;
5484
5485 if (!parent)
5486 se->cfs_rq = &rq->cfs;
5487 else
5488 se->cfs_rq = parent->my_q;
5489
5490 se->my_q = cfs_rq;
5491 update_load_set(&se->load, 0);
5492 se->parent = parent;
5493}
5494
5495static DEFINE_MUTEX(shares_mutex);
5496
5497int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5498{
5499 int i;
5500 unsigned long flags;
5501
5502 /*
5503 * We can't change the weight of the root cgroup.
5504 */
5505 if (!tg->se[0])
5506 return -EINVAL;
5507
5508 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5509
5510 mutex_lock(&shares_mutex);
5511 if (tg->shares == shares)
5512 goto done;
5513
5514 tg->shares = shares;
5515 for_each_possible_cpu(i) {
5516 struct rq *rq = cpu_rq(i);
5517 struct sched_entity *se;
5518
5519 se = tg->se[i];
5520 /* Propagate contribution to hierarchy */
5521 raw_spin_lock_irqsave(&rq->lock, flags);
5522 for_each_sched_entity(se)
5523 update_cfs_shares(group_cfs_rq(se));
5524 raw_spin_unlock_irqrestore(&rq->lock, flags);
5525 }
5526
5527done:
5528 mutex_unlock(&shares_mutex);
5529 return 0;
5530}
5531#else /* CONFIG_FAIR_GROUP_SCHED */
5532
5533void free_fair_sched_group(struct task_group *tg) { }
5534
5535int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5536{
5537 return 1;
5538}
5539
5540void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5541
5542#endif /* CONFIG_FAIR_GROUP_SCHED */
5543
4932 5544
4933static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5545static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
4934{ 5546{
@@ -4948,7 +5560,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
4948/* 5560/*
4949 * All the scheduling class methods: 5561 * All the scheduling class methods:
4950 */ 5562 */
4951static const struct sched_class fair_sched_class = { 5563const struct sched_class fair_sched_class = {
4952 .next = &idle_sched_class, 5564 .next = &idle_sched_class,
4953 .enqueue_task = enqueue_task_fair, 5565 .enqueue_task = enqueue_task_fair,
4954 .dequeue_task = dequeue_task_fair, 5566 .dequeue_task = dequeue_task_fair,
@@ -4985,7 +5597,7 @@ static const struct sched_class fair_sched_class = {
4985}; 5597};
4986 5598
4987#ifdef CONFIG_SCHED_DEBUG 5599#ifdef CONFIG_SCHED_DEBUG
4988static void print_cfs_stats(struct seq_file *m, int cpu) 5600void print_cfs_stats(struct seq_file *m, int cpu)
4989{ 5601{
4990 struct cfs_rq *cfs_rq; 5602 struct cfs_rq *cfs_rq;
4991 5603
@@ -4995,3 +5607,16 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
4995 rcu_read_unlock(); 5607 rcu_read_unlock();
4996} 5608}
4997#endif 5609#endif
5610
5611__init void init_sched_fair_class(void)
5612{
5613#ifdef CONFIG_SMP
5614 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5615
5616#ifdef CONFIG_NO_HZ
5617 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5618 cpu_notifier(sched_ilb_notifier, 0);
5619#endif
5620#endif /* SMP */
5621
5622}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index efa0a7b75dde..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -17,53 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
17 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */ 19 */
20SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
21 21
22/* 22/*
23 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since its likely going to consume data we 24 * wakeup-preemption), since its likely going to consume data we
25 * touched, increases cache locality. 25 * touched, increases cache locality.
26 */ 26 */
27SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
28 28
29/* 29/*
30 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
31 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
32 * cache locality. 32 * cache locality.
33 */ 33 */
34SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
35 35
36/* 36/*
37 * Consider buddies to be cache hot, decreases the likelyness of a 37 * Consider buddies to be cache hot, decreases the likelyness of a
38 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
39 */ 39 */
40SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 41
42/* 42/*
43 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
44 */ 44 */
45SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
46 46
47SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
49SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
50 50
51/* 51/*
52 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
53 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
54 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
55 */ 55 */
56SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
57 57
58/* 58/*
59 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
60 */ 60 */
61SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
62 62
63/* 63/*
64 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
65 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
66 */ 66 */
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true)
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 056cbd2e2a27..f42ae7fb5ec5 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,10 +732,35 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
562 760
761 if (!sched_feat(RT_RUNTIME_SHARE))
762 return more;
763
563 if (rt_rq->rt_time > rt_rq->rt_runtime) { 764 if (rt_rq->rt_time > rt_rq->rt_runtime) {
564 raw_spin_unlock(&rt_rq->rt_runtime_lock); 765 raw_spin_unlock(&rt_rq->rt_runtime_lock);
565 more = do_balance_runtime(rt_rq); 766 more = do_balance_runtime(rt_rq);
@@ -645,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
645 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
646 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
647 848
648 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
649 return 0; 850 return 0;
650 851
651 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -954,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
954} 1155}
955 1156
956/* 1157/*
957 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
958 * followed by enqueue. 1159 * dequeue followed by enqueue.
959 */ 1160 */
960static void 1161static void
961requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -999,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
999 1200
1000 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1001 1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1002 /* For anything but wake ups, just return the task_cpu */ 1206 /* For anything but wake ups, just return the task_cpu */
1003 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1004 goto out; 1208 goto out;
@@ -1175,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1175/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1176#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1177 1381
1178static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1179
1180static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1181{ 1383{
1182 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
@@ -1385,6 +1587,11 @@ static int push_rt_task(struct rq *rq)
1385 if (!next_task) 1587 if (!next_task)
1386 return 0; 1588 return 0;
1387 1589
1590#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1591 if (unlikely(task_running(rq, next_task)))
1592 return 0;
1593#endif
1594
1388retry: 1595retry:
1389 if (unlikely(next_task == rq->curr)) { 1596 if (unlikely(next_task == rq->curr)) {
1390 WARN_ON(1); 1597 WARN_ON(1);
@@ -1650,13 +1857,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1650 pull_rt_task(rq); 1857 pull_rt_task(rq);
1651} 1858}
1652 1859
1653static inline void init_sched_rt_class(void) 1860void init_sched_rt_class(void)
1654{ 1861{
1655 unsigned int i; 1862 unsigned int i;
1656 1863
1657 for_each_possible_cpu(i) 1864 for_each_possible_cpu(i) {
1658 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1865 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1659 GFP_KERNEL, cpu_to_node(i)); 1866 GFP_KERNEL, cpu_to_node(i));
1867 }
1660} 1868}
1661#endif /* CONFIG_SMP */ 1869#endif /* CONFIG_SMP */
1662 1870
@@ -1797,7 +2005,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1797 return 0; 2005 return 0;
1798} 2006}
1799 2007
1800static const struct sched_class rt_sched_class = { 2008const struct sched_class rt_sched_class = {
1801 .next = &fair_sched_class, 2009 .next = &fair_sched_class,
1802 .enqueue_task = enqueue_task_rt, 2010 .enqueue_task = enqueue_task_rt,
1803 .dequeue_task = dequeue_task_rt, 2011 .dequeue_task = dequeue_task_rt,
@@ -1832,7 +2040,7 @@ static const struct sched_class rt_sched_class = {
1832#ifdef CONFIG_SCHED_DEBUG 2040#ifdef CONFIG_SCHED_DEBUG
1833extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2041extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1834 2042
1835static void print_rt_stats(struct seq_file *m, int cpu) 2043void print_rt_stats(struct seq_file *m, int cpu)
1836{ 2044{
1837 rt_rq_iter_t iter; 2045 rt_rq_iter_t iter;
1838 struct rt_rq *rt_rq; 2046 struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
50static inline int rt_policy(int policy)
51{
52 if (policy == SCHED_FIFO || policy == SCHED_RR)
53 return 1;
54 return 0;
55}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetic problems.
147 * The weight of a cfs_rq is the sum of the weights of the entities
148 * queued on this cfs_rq, so the weight of an entity should not be
149 * too large, and neither should the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in the system belongs to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: in those places that want to lock multiple runqueues
357 * (such as the load balancing or the thread migration code), lock
358 * acquire operations must be ordered by ascending &runqueue.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
548 * Return the group to which this task belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * p->pi_lock and rq->lock because cpu_cgroup_attach() holds those locks for each
552 * task it moves into the cgroup. Therefore by holding either of those locks,
553 * we pin the task to the current cgroup.
554 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
602 * successfuly executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
658
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
671
672
673
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
/*
 * finish_lock_switch - finish switching away from @prev.
 *
 * This is the __ARCH_WANT_UNLOCKED_CTXSW variant. rq->lock is not touched
 * here; the matching prepare_lock_switch() variant already released it.
 * On CONFIG_SMP it issues smp_wmb() and then clears prev->on_cpu. Unless
 * __ARCH_WANT_INTERRUPTS_ON_CTXSW is defined, it finally calls
 * local_irq_enable().
 */
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
/*
 * update_load_add - add @inc to lw->weight.
 *
 * Also resets lw->inv_weight to 0.
 * NOTE(review): presumably 0 marks the cached inverse as stale so it is
 * recomputed on next use - confirm against the readers of inv_weight.
 */
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
/*
 * update_load_sub - subtract @dec from lw->weight.
 *
 * Also resets lw->inv_weight to 0. No check is made here that @dec does
 * not exceed the current weight.
 */
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
/*
 * update_load_set - overwrite lw->weight with @w.
 *
 * Also resets lw->inv_weight to 0.
 */
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To aid in avoiding the subversion of "niceness" due to uneven distribution
790 * of tasks with abnormal "nice" values across CPUs the contribution that
791 * each task makes to its run queue's load is weighted according to its
792 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
793 * scaled version of the new time slice allocation that they receive on time
794 * slice expiry etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
/*
 * Load weight for each of the 40 nice levels. Going by the row labels,
 * index 0 is nice -20 and index 39 is nice 19 (index = nice + 20); the
 * nice 0 entry is 1024.
 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetics by turning divisions
828 * into multiplications:
829 */
/*
 * Same 40-entry layout as the row labels show (index 0 is nice -20).
 * Example: the nice 0 entry, 4194304, is 2^32 / 1024.
 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
840
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
 /* last enumerator, so its value is the number of stats above (2) */
846 CPUACCT_STAT_NSTATS,
847};
848
849
/*
 * sched_class_highest is the address of stop_sched_class.
 * for_each_class() starts there and follows ->next until it reaches a
 * NULL pointer. @class is evaluated several times, so it must be a plain
 * pointer variable.
 */
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
/* !CONFIG_SMP stub: both arguments are ignored and nothing is done. */
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
 /* embedded cgroup state; container_of() on this member yields the cpuacct */
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
 /* per-cpu (see __percpu) struct kernel_cpustat */
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
/*
 * Looks up the cpuacct subsystem state of @cgrp with
 * cgroup_subsys_state() and converts it to the enclosing struct cpuacct
 * through its css member.
 */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
/*
 * Looks up the cpuacct subsystem state of @tsk with task_subsys_state()
 * and converts it to the enclosing struct cpuacct through its css member.
 */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
/*
 * parent_ca - return the cpuacct of the parent cgroup of @ca.
 *
 * Returns NULL when @ca is NULL or when its cgroup has no parent.
 */
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
/* Increment rq->nr_running by one. No locking is done here. */
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
/* Decrement rq->nr_running by one. No locking is done here. */
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
/*
 * sched_avg_period - half of sysctl_sched_time_avg, scaled by NSEC_PER_MSEC.
 *
 * The (u64) cast is applied before the multiplication, so the arithmetic
 * is carried out in 64 bits.
 * NOTE(review): the NSEC_PER_MSEC factor suggests the sysctl is in
 * milliseconds and the result in nanoseconds - confirm at the sysctl
 * definition.
 */
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
/*
 * Returns 0 when the HRTICK feature is off or when the CPU of @rq is not
 * active; otherwise returns the result of hrtimer_is_hres_active() on
 * rq->hrtick_timer.
 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
 /* additional condition: the runqueue's CPU must be active */
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
/* !CONFIG_SCHED_HRTICK stub: hrtick is never enabled, always returns 0. */
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
/*
 * sched_rt_avg_update - add @rt_delta to rq->rt_avg, then call
 * sched_avg_update() on the same runqueue.
 */
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
/* !CONFIG_SMP stubs: both helpers are no-ops. */
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
/*
 * CONFIG_PREEMPT variant. Always drops this_rq->lock, then takes both
 * locks through double_rq_lock(). Always returns 1.
 * NOTE(review): the unfair variant returns 1 only when it dropped and
 * retook this_rq->lock, so 1 presumably carries the same meaning here -
 * confirm at the callers.
 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
/*
 * !CONFIG_PREEMPT variant.
 *
 * Returns 1 if this_rq->lock had to be dropped and retaken in order to
 * acquire busiest->lock, 0 if this_rq->lock stayed held throughout.
 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
 /* fast path: trylock succeeded, nothing else to do */
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
 /*
  * Ordering is decided by comparing the runqueue addresses: when
  * busiest is the lower one, this_rq->lock is released so that the
  * two locks can be taken lower address first.
  */
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
/*
 * Requires interrupts to be disabled: if they are not, this_rq->lock is
 * released and BUG_ON(1) fires. Otherwise returns whatever
 * _double_lock_balance() returns.
 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055 /* printk() doesn't work well under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
/*
 * double_unlock_balance - release busiest->lock; this_rq->lock stays held.
 *
 * Afterwards the lockdep subclass of this_rq->lock is set back to 0 with
 * lock_set_subclass().
 * NOTE(review): presumably this undoes the SINGLE_DEPTH_NESTING subclass
 * that the locking side may have used for this_rq->lock - confirm.
 */
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock,
1074 * you need to do so manually before calling.
1075 */
/*
 * CONFIG_SMP variant. BUGs if interrupts are enabled. When both arguments
 * are the same runqueue the lock is taken once; otherwise the runqueue
 * with the lower address is locked first and the other one is taken with
 * raw_spin_lock_nested(..., SINGLE_DEPTH_NESTING).
 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock,
1099 * you need to do so manually after calling.
1100 */
/*
 * CONFIG_SMP variant. Releases rq1->lock, then rq2->lock if it is a
 * different runqueue. When both are the same runqueue only the
 * __release() annotation is emitted for rq2, mirroring the __acquire()
 * annotation used on the locking side.
 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock,
1118 * you need to do so manually before calling.
1119 */
/*
 * !CONFIG_SMP variant. BUGs if interrupts are enabled or if the two
 * arguments are different runqueues; takes the single lock once and emits
 * the __acquire() annotation for rq2.
 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock,
1134 * you need to do so manually after calling.
1135 */
/*
 * !CONFIG_SMP variant. BUGs if the two arguments are different
 * runqueues; releases the single lock once and emits the __release()
 * annotation for rq2.
 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
/*
 * NOTE(review): going by the enum name these are bit numbers (0, 1, 2),
 * not masks, presumably for use with the bitops on the word returned by
 * nohz_flags() - confirm at the users.
 */
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
/* Address of the nohz_flags member of the runqueue of @cpu. */
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
/*
 * show_schedstat - seq_file show callback that emits the schedstat text.
 *
 * Output, in order:
 *  - "version <SCHEDSTAT_VERSION>" and "timestamp <jiffies>";
 *  - for every online CPU one "cpuN ..." line with nine runqueue counters;
 *  - on CONFIG_SMP, after each cpu line, one "domainN <mask> ..." line per
 *    scheduling domain of that CPU: eight load-balance counters for every
 *    cpu_idle_type, followed by twelve further counters.
 *
 * @v is unused. Returns 0, or -ENOMEM when the temporary cpumask string
 * cannot be allocated (nothing has been printed at that point).
 */
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
 /*
  * NOTE(review): 9 bytes per 32 CPUs presumably covers 8 hex digits
  * plus one separator/terminator per 32-bit word - confirm against
  * cpumask_scnprintf().
  */
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
 /* running index printed as the N in "domainN", restarted per CPU */
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
 /* the domain walk below runs entirely inside an RCU read-side section */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
79
/*
 * schedstat_open - open handler for the schedstat file.
 *
 * Allocates an output buffer of one page plus one more page per 32 online
 * CPUs, opens the file with single_open() using show_schedstat() as the
 * show routine, and on success installs the buffer and its size into the
 * seq_file found in file->private_data. @inode is unused.
 *
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
 * error returned by single_open() (the buffer is freed in that case).
 */
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
 /* hand the pre-sized buffer over to the seq_file */
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
/*
 * File operations for the schedstat file: a custom open handler, with
 * read, llseek and release delegated to the seq_file/single_open helpers.
 */
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
/*
 * Registers the "schedstat" proc entry (mode 0, NULL parent) with
 * proc_schedstat_operations. The return value of proc_create() is not
 * checked; this initcall always returns 0.
 */
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 87f9e36ea56e..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
283 return; 180 return;
284 181
285 raw_spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime);
288 raw_spin_unlock(&cputimer->lock); 184 raw_spin_unlock(&cputimer->lock);
289} 185}
290 186
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
307 return; 203 return;
308 204
309 raw_spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime);
312 raw_spin_unlock(&cputimer->lock); 207 raw_spin_unlock(&cputimer->lock);
313} 208}
314 209
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb3..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13b631d..e8d76c5895ea 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,6 +6,7 @@
6 * This defines a simple but solid secure-computing mode. 6 * This defines a simple but solid secure-computing mode.
7 */ 7 */
8 8
9#include <linux/audit.h>
9#include <linux/seccomp.h> 10#include <linux/seccomp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/compat.h> 12#include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
54#ifdef SECCOMP_DEBUG 55#ifdef SECCOMP_DEBUG
55 dump_stack(); 56 dump_stack();
56#endif 57#endif
58 audit_seccomp(this_syscall);
57 do_exit(SIGKILL); 59 do_exit(SIGKILL);
58} 60}
59 61
diff --git a/kernel/signal.c b/kernel/signal.c
index b3f78d09a105..c73c4284160e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,6 +28,7 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/pid_namespace.h> 29#include <linux/pid_namespace.h>
30#include <linux/nsproxy.h> 30#include <linux/nsproxy.h>
31#include <linux/user_namespace.h>
31#define CREATE_TRACE_POINTS 32#define CREATE_TRACE_POINTS
32#include <trace/events/signal.h> 33#include <trace/events/signal.h>
33 34
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
1019 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); 1020 return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
1020} 1021}
1021 1022
1023/*
1024 * map the uid in struct cred into user namespace *ns
 *
 * Thin wrapper: returns user_ns_map_uid(ns, cred, cred->uid).
1025 */
1026static inline uid_t map_cred_ns(const struct cred *cred,
1027 struct user_namespace *ns)
1028{
1029 return user_ns_map_uid(ns, cred, cred->uid);
1030}
1031
1032#ifdef CONFIG_USER_NS
/*
 * userns_fixup_signal_uid - rewrite info->si_uid for the receiver @t.
 *
 * Does nothing when the current task and @t are in the same user
 * namespace, or when SI_FROMKERNEL(info) is true. Otherwise si_uid is
 * replaced by the result of user_ns_map_uid() for @t's user namespace,
 * using the current task's credentials.
 */
1033static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1034{
1035 if (current_user_ns() == task_cred_xxx(t, user_ns))
1036 return;
1037
1038 if (SI_FROMKERNEL(info))
1039 return;
1040
1041 info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
1042 current_cred(), info->si_uid);
1043}
1044#else
/* !CONFIG_USER_NS stub: @info is left untouched. */
1045static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
1046{
1047 return;
1048}
1049#endif
1050
1022static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, 1051static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1023 int group, int from_ancestor_ns) 1052 int group, int from_ancestor_ns)
1024{ 1053{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1088 q->info.si_pid = 0; 1117 q->info.si_pid = 0;
1089 break; 1118 break;
1090 } 1119 }
1120
1121 userns_fixup_signal_uid(&q->info, t);
1122
1091 } else if (!is_si_special(info)) { 1123 } else if (!is_si_special(info)) {
1092 if (sig >= SIGRTMIN && info->si_code != SI_USER) { 1124 if (sig >= SIGRTMIN && info->si_code != SI_USER) {
1093 /* 1125 /*
@@ -1626,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1626 */ 1658 */
1627 rcu_read_lock(); 1659 rcu_read_lock();
1628 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1660 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1629 info.si_uid = __task_cred(tsk)->uid; 1661 info.si_uid = map_cred_ns(__task_cred(tsk),
1662 task_cred_xxx(tsk->parent, user_ns));
1630 rcu_read_unlock(); 1663 rcu_read_unlock();
1631 1664
1632 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1665 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1633 tsk->signal->utime)); 1666 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1634 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1635 tsk->signal->stime));
1636 1667
1637 info.si_status = tsk->exit_code & 0x7f; 1668 info.si_status = tsk->exit_code & 0x7f;
1638 if (tsk->exit_code & 0x80) 1669 if (tsk->exit_code & 0x80)
@@ -1711,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
1711 */ 1742 */
1712 rcu_read_lock(); 1743 rcu_read_lock();
1713 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); 1744 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1714 info.si_uid = __task_cred(tsk)->uid; 1745 info.si_uid = map_cred_ns(__task_cred(tsk),
1746 task_cred_xxx(parent, user_ns));
1715 rcu_read_unlock(); 1747 rcu_read_unlock();
1716 1748
1717 info.si_utime = cputime_to_clock_t(tsk->utime); 1749 info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1994,8 +2026,6 @@ static bool do_signal_stop(int signr)
1994 */ 2026 */
1995 if (!(sig->flags & SIGNAL_STOP_STOPPED)) 2027 if (!(sig->flags & SIGNAL_STOP_STOPPED))
1996 sig->group_exit_code = signr; 2028 sig->group_exit_code = signr;
1997 else
1998 WARN_ON_ONCE(!current->ptrace);
1999 2029
2000 sig->group_stop_count = 0; 2030 sig->group_stop_count = 0;
2001 2031
@@ -2129,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
2129 info->si_signo = signr; 2159 info->si_signo = signr;
2130 info->si_errno = 0; 2160 info->si_errno = 0;
2131 info->si_code = SI_USER; 2161 info->si_code = SI_USER;
2162 rcu_read_lock();
2132 info->si_pid = task_pid_vnr(current->parent); 2163 info->si_pid = task_pid_vnr(current->parent);
2133 info->si_uid = task_uid(current->parent); 2164 info->si_uid = map_cred_ns(__task_cred(current->parent),
2165 current_user_ns());
2166 rcu_read_unlock();
2134 } 2167 }
2135 2168
2136 /* If the (new) signal is now blocked, requeue it. */ 2169 /* If the (new) signal is now blocked, requeue it. */
@@ -2322,6 +2355,27 @@ relock:
2322 return signr; 2355 return signr;
2323} 2356}
2324 2357
2358/**
2359 * block_sigmask - add @ka's signal mask to current->blocked
2360 * @ka: action for @signr
2361 * @signr: signal that has been successfully delivered
2362 *
2363 * This function should be called when a signal has successfully been
2364 * delivered. It adds the mask of signals for @ka to current->blocked
2365 * so that they are blocked during the execution of the signal
2366 * handler. In addition, @signr will be blocked unless %SA_NODEFER is
2367 * set in @ka->sa.sa_flags.
2368 */
2369void block_sigmask(struct k_sigaction *ka, int signr)
2370{
2371 sigset_t blocked;
2372
 /* build the new mask in a local copy, then install it in one step */
2373 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
2374 if (!(ka->sa.sa_flags & SA_NODEFER))
2375 sigaddset(&blocked, signr);
2376 set_current_blocked(&blocked);
2377}
2378
2325/* 2379/*
2326 * It could be that complete_signal() picked us to notify about the 2380 * It could be that complete_signal() picked us to notify about the
2327 * group-wide signal. Other threads should be notified now to take 2381 * group-wide signal. Other threads should be notified now to take
@@ -2359,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
2359 int group_stop = 0; 2413 int group_stop = 0;
2360 sigset_t unblocked; 2414 sigset_t unblocked;
2361 2415
2416 /*
2417 * @tsk is about to have PF_EXITING set - lock out users which
2418 * expect stable threadgroup.
2419 */
2420 threadgroup_change_begin(tsk);
2421
2362 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2422 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
2363 tsk->flags |= PF_EXITING; 2423 tsk->flags |= PF_EXITING;
2424 threadgroup_change_end(tsk);
2364 return; 2425 return;
2365 } 2426 }
2366 2427
@@ -2370,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
2370 * see wants_signal(), do_signal_stop(). 2431 * see wants_signal(), do_signal_stop().
2371 */ 2432 */
2372 tsk->flags |= PF_EXITING; 2433 tsk->flags |= PF_EXITING;
2434
2435 threadgroup_change_end(tsk);
2436
2373 if (!signal_pending(tsk)) 2437 if (!signal_pending(tsk))
2374 goto out; 2438 goto out;
2375 2439
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c71d91efff0..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611fbd079..40701538fbd1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1605 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1606 1606
1607 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1608 utime = stime = cputime_zero; 1608 utime = stime = 0;
1609 1609
1610 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1611 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1635 1635
1636 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1637 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1638 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1639 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1640 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1641 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1642 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
1692 return mask; 1692 return mask;
1693} 1693}
1694 1694
1695#ifdef CONFIG_CHECKPOINT_RESTORE
1696static int prctl_set_mm(int opt, unsigned long addr,
1697 unsigned long arg4, unsigned long arg5)
1698{
1699 unsigned long rlim = rlimit(RLIMIT_DATA);
1700 unsigned long vm_req_flags;
1701 unsigned long vm_bad_flags;
1702 struct vm_area_struct *vma;
1703 int error = 0;
1704 struct mm_struct *mm = current->mm;
1705
1706 if (arg4 | arg5)
1707 return -EINVAL;
1708
1709 if (!capable(CAP_SYS_ADMIN))
1710 return -EPERM;
1711
1712 if (addr >= TASK_SIZE)
1713 return -EINVAL;
1714
1715 down_read(&mm->mmap_sem);
1716 vma = find_vma(mm, addr);
1717
1718 if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
1719 /* It must be existing VMA */
1720 if (!vma || vma->vm_start > addr)
1721 goto out;
1722 }
1723
1724 error = -EINVAL;
1725 switch (opt) {
1726 case PR_SET_MM_START_CODE:
1727 case PR_SET_MM_END_CODE:
1728 vm_req_flags = VM_READ | VM_EXEC;
1729 vm_bad_flags = VM_WRITE | VM_MAYSHARE;
1730
1731 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1732 (vma->vm_flags & vm_bad_flags))
1733 goto out;
1734
1735 if (opt == PR_SET_MM_START_CODE)
1736 mm->start_code = addr;
1737 else
1738 mm->end_code = addr;
1739 break;
1740
1741 case PR_SET_MM_START_DATA:
1742 case PR_SET_MM_END_DATA:
1743 vm_req_flags = VM_READ | VM_WRITE;
1744 vm_bad_flags = VM_EXEC | VM_MAYSHARE;
1745
1746 if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
1747 (vma->vm_flags & vm_bad_flags))
1748 goto out;
1749
1750 if (opt == PR_SET_MM_START_DATA)
1751 mm->start_data = addr;
1752 else
1753 mm->end_data = addr;
1754 break;
1755
1756 case PR_SET_MM_START_STACK:
1757
1758#ifdef CONFIG_STACK_GROWSUP
1759 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
1760#else
1761 vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
1762#endif
1763 if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
1764 goto out;
1765
1766 mm->start_stack = addr;
1767 break;
1768
1769 case PR_SET_MM_START_BRK:
1770 if (addr <= mm->end_data)
1771 goto out;
1772
1773 if (rlim < RLIM_INFINITY &&
1774 (mm->brk - addr) +
1775 (mm->end_data - mm->start_data) > rlim)
1776 goto out;
1777
1778 mm->start_brk = addr;
1779 break;
1780
1781 case PR_SET_MM_BRK:
1782 if (addr <= mm->end_data)
1783 goto out;
1784
1785 if (rlim < RLIM_INFINITY &&
1786 (addr - mm->start_brk) +
1787 (mm->end_data - mm->start_data) > rlim)
1788 goto out;
1789
1790 mm->brk = addr;
1791 break;
1792
1793 default:
1794 error = -EINVAL;
1795 goto out;
1796 }
1797
1798 error = 0;
1799
1800out:
1801 up_read(&mm->mmap_sem);
1802
1803 return error;
1804}
1805#else /* CONFIG_CHECKPOINT_RESTORE */
/* !CONFIG_CHECKPOINT_RESTORE stub: PR_SET_MM always fails with -EINVAL. */
1806static int prctl_set_mm(int opt, unsigned long addr,
1807 unsigned long arg4, unsigned long arg5)
1808{
1809 return -EINVAL;
1810}
1811#endif
1812
1695SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, 1813SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1696 unsigned long, arg4, unsigned long, arg5) 1814 unsigned long, arg4, unsigned long, arg5)
1697{ 1815{
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1841 else 1959 else
1842 error = PR_MCE_KILL_DEFAULT; 1960 error = PR_MCE_KILL_DEFAULT;
1843 break; 1961 break;
1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break;
1844 default: 1965 default:
1845 error = -EINVAL; 1966 error = -EINVAL;
1846 break; 1967 break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae2719643854..f487f257e05e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
803 .mode = 0644, 803 .mode = 0644,
804 .proc_handler = proc_dointvec, 804 .proc_handler = proc_dointvec,
805 }, 805 },
806#ifdef CONFIG_DEBUG_STACKOVERFLOW
807 {
808 .procname = "panic_on_stackoverflow",
809 .data = &sysctl_panic_on_stackoverflow,
810 .maxlen = sizeof(int),
811 .mode = 0644,
812 .proc_handler = proc_dointvec,
813 },
814#endif
806 { 815 {
807 .procname = "bootloader_type", 816 .procname = "bootloader_type",
808 .data = &bootloader_type, 817 .data = &bootloader_type,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 6318b511afa1..a650694883a1 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1354 1354
1355 fput(file); 1355 fput(file);
1356out_putname: 1356out_putname:
1357 putname(pathname); 1357 __putname(pathname);
1358out: 1358out:
1359 return result; 1359 return result;
1360} 1360}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe92..2cf9cc7aa103 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
25config GENERIC_CLOCKEVENTS_BUILD 25config GENERIC_CLOCKEVENTS_BUILD
26 bool 26 bool
27 default y 27 default y
28 depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR 28 depends on GENERIC_CLOCKEVENTS
29 29
30config GENERIC_CLOCKEVENTS_MIN_ADJUST 30config GENERIC_CLOCKEVENTS_MIN_ADJUST
31 bool 31 bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c436e790b21b..8a46f5d64504 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -195,7 +195,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
195 struct alarm *alarm; 195 struct alarm *alarm;
196 ktime_t expired = next->expires; 196 ktime_t expired = next->expires;
197 197
198 if (expired.tv64 >= now.tv64) 198 if (expired.tv64 > now.tv64)
199 break; 199 break;
200 200
201 alarm = container_of(next, struct alarm, node); 201 alarm = container_of(next, struct alarm, node);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6c..9cd928f7a7c6 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h>
21 20
22#include "tick-internal.h" 21#include "tick-internal.h"
23 22
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cf52fda2e096..a45ca167ab24 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 */ 24 */
25 25
26#include <linux/device.h>
26#include <linux/clocksource.h> 27#include <linux/clocksource.h>
27#include <linux/sysdev.h>
28#include <linux/init.h> 28#include <linux/init.h>
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void)
492} 492}
493 493
494/** 494/**
495 * clocksource_max_adjustment - Returns max adjustment amount
496 * @cs: Pointer to clocksource
497 *
498 */
499static u32 clocksource_max_adjustment(struct clocksource *cs)
500{
501 u64 ret;
502 /*
503 * We won't try to correct for more than 11% adjustments (110,000 ppm),
504 */
505 ret = (u64)cs->mult * 11;
506 do_div(ret,100);
507 return (u32)ret;
508}
509
510/**
495 * clocksource_max_deferment - Returns max time the clocksource can be deferred 511 * clocksource_max_deferment - Returns max time the clocksource can be deferred
496 * @cs: Pointer to clocksource 512 * @cs: Pointer to clocksource
497 * 513 *
@@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
503 /* 519 /*
504 * Calculate the maximum number of cycles that we can pass to the 520 * Calculate the maximum number of cycles that we can pass to the
505 * cyc2ns function without overflowing a 64-bit signed result. The 521 * cyc2ns function without overflowing a 64-bit signed result. The
506 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which 522 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
507 * is equivalent to the below. 523 * which is equivalent to the below.
508 * max_cycles < (2^63)/cs->mult 524 * max_cycles < (2^63)/(cs->mult + cs->maxadj)
509 * max_cycles < 2^(log2((2^63)/cs->mult)) 525 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
510 * max_cycles < 2^(log2(2^63) - log2(cs->mult)) 526 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
511 * max_cycles < 2^(63 - log2(cs->mult)) 527 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
512 * max_cycles < 1 << (63 - log2(cs->mult)) 528 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
513 * Please note that we add 1 to the result of the log2 to account for 529 * Please note that we add 1 to the result of the log2 to account for
514 * any rounding errors, ensure the above inequality is satisfied and 530 * any rounding errors, ensure the above inequality is satisfied and
515 * no overflow will occur. 531 * no overflow will occur.
516 */ 532 */
517 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); 533 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
518 534
519 /* 535 /*
520 * The actual maximum number of cycles we can defer the clocksource is 536 * The actual maximum number of cycles we can defer the clocksource is
521 * determined by the minimum of max_cycles and cs->mask. 537 * determined by the minimum of max_cycles and cs->mask.
538 * Note: Here we subtract the maxadj to make sure we don't sleep for
539 * too long if there's a large negative adjustment.
522 */ 540 */
523 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 541 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
524 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); 542 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
543 cs->shift);
525 544
526 /* 545 /*
527 * To ensure that the clocksource does not wrap whilst we are idle, 546 * To ensure that the clocksource does not wrap whilst we are idle,
@@ -529,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
529 * note a margin of 12.5% is used because this can be computed with 548 * note a margin of 12.5% is used because this can be computed with
530 * a shift, versus say 10% which would require division. 549 * a shift, versus say 10% which would require division.
531 */ 550 */
532 return max_nsecs - (max_nsecs >> 5); 551 return max_nsecs - (max_nsecs >> 3);
533} 552}
534 553
535#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 554#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -628,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs)
628 647
629/** 648/**
630 * __clocksource_updatefreq_scale - Used update clocksource with new freq 649 * __clocksource_updatefreq_scale - Used update clocksource with new freq
631 * @t: clocksource to be registered 650 * @cs: clocksource to be registered
632 * @scale: Scale factor multiplied against freq to get clocksource hz 651 * @scale: Scale factor multiplied against freq to get clocksource hz
633 * @freq: clocksource frequency (cycles per second) divided by scale 652 * @freq: clocksource frequency (cycles per second) divided by scale
634 * 653 *
@@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs)
640void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) 659void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
641{ 660{
642 u64 sec; 661 u64 sec;
643
644 /* 662 /*
645 * Calc the maximum number of seconds which we can run before 663 * Calc the maximum number of seconds which we can run before
646 * wrapping around. For clocksources which have a mask > 32bit 664 * wrapping around. For clocksources which have a mask > 32bit
@@ -651,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
651 * ~ 0.06ppm granularity for NTP. We apply the same 12.5% 669 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
652 * margin as we do in clocksource_max_deferment() 670 * margin as we do in clocksource_max_deferment()
653 */ 671 */
654 sec = (cs->mask - (cs->mask >> 5)); 672 sec = (cs->mask - (cs->mask >> 3));
655 do_div(sec, freq); 673 do_div(sec, freq);
656 do_div(sec, scale); 674 do_div(sec, scale);
657 if (!sec) 675 if (!sec)
@@ -661,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
661 679
662 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, 680 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
663 NSEC_PER_SEC / scale, sec * scale); 681 NSEC_PER_SEC / scale, sec * scale);
682
683 /*
684 * Reduce mult for clocksources that have large mults, to avoid overflow.
685 * Since mult may be adjusted by ntp, add an extra safety margin
686 *
687 */
688 cs->maxadj = clocksource_max_adjustment(cs);
689 while ((cs->mult + cs->maxadj < cs->mult)
690 || (cs->mult - cs->maxadj > cs->mult)) {
691 cs->mult >>= 1;
692 cs->shift--;
693 cs->maxadj = clocksource_max_adjustment(cs);
694 }
695
664 cs->max_idle_ns = clocksource_max_deferment(cs); 696 cs->max_idle_ns = clocksource_max_deferment(cs);
665} 697}
666EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); 698EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
667 699
668/** 700/**
669 * __clocksource_register_scale - Used to install new clocksources 701 * __clocksource_register_scale - Used to install new clocksources
670 * @t: clocksource to be registered 702 * @cs: clocksource to be registered
671 * @scale: Scale factor multiplied against freq to get clocksource hz 703 * @scale: Scale factor multiplied against freq to get clocksource hz
672 * @freq: clocksource frequency (cycles per second) divided by scale 704 * @freq: clocksource frequency (cycles per second) divided by scale
673 * 705 *
@@ -695,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
695 727
696/** 728/**
697 * clocksource_register - Used to install new clocksources 729 * clocksource_register - Used to install new clocksources
698 * @t: clocksource to be registered 730 * @cs: clocksource to be registered
699 * 731 *
700 * Returns -EBUSY if registration fails, zero otherwise. 732 * Returns -EBUSY if registration fails, zero otherwise.
701 */ 733 */
702int clocksource_register(struct clocksource *cs) 734int clocksource_register(struct clocksource *cs)
703{ 735{
736 /* calculate max adjustment for given mult/shift */
737 cs->maxadj = clocksource_max_adjustment(cs);
738 WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
739 "Clocksource %s might overflow on 11%% adjustment\n",
740 cs->name);
741
704 /* calculate max idle time permitted for this clocksource */ 742 /* calculate max idle time permitted for this clocksource */
705 cs->max_idle_ns = clocksource_max_deferment(cs); 743 cs->max_idle_ns = clocksource_max_deferment(cs);
706 744
@@ -723,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
723 761
724/** 762/**
725 * clocksource_change_rating - Change the rating of a registered clocksource 763 * clocksource_change_rating - Change the rating of a registered clocksource
764 * @cs: clocksource to be changed
765 * @rating: new rating
726 */ 766 */
727void clocksource_change_rating(struct clocksource *cs, int rating) 767void clocksource_change_rating(struct clocksource *cs, int rating)
728{ 768{
@@ -734,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating);
734 774
735/** 775/**
736 * clocksource_unregister - remove a registered clocksource 776 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered
737 */ 778 */
738void clocksource_unregister(struct clocksource *cs) 779void clocksource_unregister(struct clocksource *cs)
739{ 780{
@@ -749,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister);
749/** 790/**
750 * sysfs_show_current_clocksources - sysfs interface for current clocksource 791 * sysfs_show_current_clocksources - sysfs interface for current clocksource
751 * @dev: unused 792 * @dev: unused
793 * @attr: unused
752 * @buf: char buffer to be filled with clocksource list 794 * @buf: char buffer to be filled with clocksource list
753 * 795 *
754 * Provides sysfs interface for listing current clocksource. 796 * Provides sysfs interface for listing current clocksource.
755 */ 797 */
756static ssize_t 798static ssize_t
757sysfs_show_current_clocksources(struct sys_device *dev, 799sysfs_show_current_clocksources(struct device *dev,
758 struct sysdev_attribute *attr, char *buf) 800 struct device_attribute *attr, char *buf)
759{ 801{
760 ssize_t count = 0; 802 ssize_t count = 0;
761 803
@@ -769,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev,
769/** 811/**
770 * sysfs_override_clocksource - interface for manually overriding clocksource 812 * sysfs_override_clocksource - interface for manually overriding clocksource
771 * @dev: unused 813 * @dev: unused
814 * @attr: unused
772 * @buf: name of override clocksource 815 * @buf: name of override clocksource
773 * @count: length of buffer 816 * @count: length of buffer
774 * 817 *
775 * Takes input from sysfs interface for manually overriding the default 818 * Takes input from sysfs interface for manually overriding the default
776 * clocksource selection. 819 * clocksource selection.
777 */ 820 */
778static ssize_t sysfs_override_clocksource(struct sys_device *dev, 821static ssize_t sysfs_override_clocksource(struct device *dev,
779 struct sysdev_attribute *attr, 822 struct device_attribute *attr,
780 const char *buf, size_t count) 823 const char *buf, size_t count)
781{ 824{
782 size_t ret = count; 825 size_t ret = count;
@@ -804,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
804/** 847/**
805 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
806 * @dev: unused 849 * @dev: unused
850 * @attr: unused
807 * @buf: char buffer to be filled with clocksource list 851 * @buf: char buffer to be filled with clocksource list
808 * 852 *
809 * Provides sysfs interface for listing registered clocksources 853 * Provides sysfs interface for listing registered clocksources
810 */ 854 */
811static ssize_t 855static ssize_t
812sysfs_show_available_clocksources(struct sys_device *dev, 856sysfs_show_available_clocksources(struct device *dev,
813 struct sysdev_attribute *attr, 857 struct device_attribute *attr,
814 char *buf) 858 char *buf)
815{ 859{
816 struct clocksource *src; 860 struct clocksource *src;
@@ -839,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
839/* 883/*
840 * Sysfs setup bits: 884 * Sysfs setup bits:
841 */ 885 */
842static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
843 sysfs_override_clocksource); 887 sysfs_override_clocksource);
844 888
845static SYSDEV_ATTR(available_clocksource, 0444, 889static DEVICE_ATTR(available_clocksource, 0444,
846 sysfs_show_available_clocksources, NULL); 890 sysfs_show_available_clocksources, NULL);
847 891
848static struct sysdev_class clocksource_sysclass = { 892static struct bus_type clocksource_subsys = {
849 .name = "clocksource", 893 .name = "clocksource",
894 .dev_name = "clocksource",
850}; 895};
851 896
852static struct sys_device device_clocksource = { 897static struct device device_clocksource = {
853 .id = 0, 898 .id = 0,
854 .cls = &clocksource_sysclass, 899 .bus = &clocksource_subsys,
855}; 900};
856 901
857static int __init init_clocksource_sysfs(void) 902static int __init init_clocksource_sysfs(void)
858{ 903{
859 int error = sysdev_class_register(&clocksource_sysclass); 904 int error = subsys_system_register(&clocksource_subsys, NULL);
860 905
861 if (!error) 906 if (!error)
862 error = sysdev_register(&device_clocksource); 907 error = device_register(&device_clocksource);
863 if (!error) 908 if (!error)
864 error = sysdev_create_file( 909 error = device_create_file(
865 &device_clocksource, 910 &device_clocksource,
866 &attr_current_clocksource); 911 &dev_attr_current_clocksource);
867 if (!error) 912 if (!error)
868 error = sysdev_create_file( 913 error = device_create_file(
869 &device_clocksource, 914 &device_clocksource,
870 &attr_available_clocksource); 915 &dev_attr_available_clocksource);
871 return error; 916 return error;
872} 917}
873 918
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f954282d9a82..fd4a7b1625a2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
71 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 71 (dev->features & CLOCK_EVT_FEAT_C3STOP))
72 return 0; 72 return 0;
73 73
74 clockevents_exchange_device(NULL, dev); 74 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
75 tick_broadcast_device.evtdev = dev; 75 tick_broadcast_device.evtdev = dev;
76 if (!cpumask_empty(tick_get_broadcast_mask())) 76 if (!cpumask_empty(tick_get_broadcast_mask()))
77 tick_broadcast_start_periodic(dev); 77 tick_broadcast_start_periodic(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0ba..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
275} 275}
276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
277 277
278/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
279 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
280 *
281 * When the next event is more than a tick into the future, stop the idle tick
282 * Called either from the idle loop or from irq_exit() when an idle period was
283 * just interrupted by an interrupt which did not cause a reschedule.
284 */
285void tick_nohz_stop_sched_tick(int inidle)
286{ 279{
287 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
288 struct tick_sched *ts;
289 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
290 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
291 u64 time_delta; 283 u64 time_delta;
292 int cpu; 284 int cpu;
293 285
294 local_irq_save(flags);
295
296 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
297 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
298 288
299 /*
300 * Call to tick_nohz_start_idle stops the last_update_time from being
301 * updated. Thus, it must not be called in the event we are called from
302 * irq_exit() with the prior state different than idle.
303 */
304 if (!inidle && !ts->inidle)
305 goto end;
306
307 /*
308 * Set ts->inidle unconditionally. Even if the system did not
309 * switch to NOHZ mode the cpu frequency governers rely on the
310 * update of the idle time accounting in tick_nohz_start_idle().
311 */
312 ts->inidle = 1;
313
314 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
315 290
316 /* 291 /*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
326 } 301 }
327 302
328 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
329 goto end; 304 return;
330 305
331 if (need_resched()) 306 if (need_resched())
332 goto end; 307 return;
333 308
334 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
335 static int ratelimit; 310 static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
339 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
340 ratelimit++; 315 ratelimit++;
341 } 316 }
342 goto end; 317 return;
343 } 318 }
344 319
345 ts->idle_calls++; 320 ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
434 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
435 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
436 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
437 rcu_enter_nohz();
438 } 412 }
439 413
440 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
472 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
473 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
474 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
475end: 449}
476 local_irq_restore(flags); 450
451/**
452 * tick_nohz_idle_enter - stop the idle tick from the idle task
453 *
454 * When the next event is more than a tick into the future, stop the idle tick
455 * Called when we start the idle loop.
456 *
457 * The arch is responsible for calling:
458 *
459 * - rcu_idle_enter() after its last use of RCU before the CPU is put
460 * to sleep.
461 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
462 */
463void tick_nohz_idle_enter(void)
464{
465 struct tick_sched *ts;
466
467 WARN_ON_ONCE(irqs_disabled());
468
469 /*
470 * Update the idle state in the scheduler domain hierarchy
471 * when tick_nohz_stop_sched_tick() is called from the idle loop.
472 * State will be updated to busy during the first busy tick after
473 * exiting idle.
474 */
475 set_cpu_sd_state_idle();
476
477 local_irq_disable();
478
479 ts = &__get_cpu_var(tick_cpu_sched);
480 /*
481 * Set ts->inidle unconditionally. Even if the system did not
482 * switch to nohz mode, the cpu frequency governors rely on the
483 * update of the idle time accounting in tick_nohz_start_idle().
484 */
485 ts->inidle = 1;
486 tick_nohz_stop_sched_tick(ts);
487
488 local_irq_enable();
489}
490
491/**
492 * tick_nohz_irq_exit - update next tick event from interrupt exit
493 *
494 * When an interrupt fires while we are idle and it doesn't cause
495 * a reschedule, it may still add, modify or delete a timer, enqueue
496 * an RCU callback, etc...
497 * So we need to re-calculate and reprogram the next tick event.
498 */
499void tick_nohz_irq_exit(void)
500{
501 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
502
503 if (!ts->inidle)
504 return;
505
506 tick_nohz_stop_sched_tick(ts);
477} 507}
478 508
479/** 509/**
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
515} 545}
516 546
517/** 547/**
518 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 548 * tick_nohz_idle_exit - restart the idle tick from the idle task
519 * 549 *
520 * Restart the idle tick when the CPU is woken up from idle 550 * Restart the idle tick when the CPU is woken up from idle
551 * This also exit the RCU extended quiescent state. The CPU
552 * can use RCU again after this function is called.
521 */ 553 */
522void tick_nohz_restart_sched_tick(void) 554void tick_nohz_idle_exit(void)
523{ 555{
524 int cpu = smp_processor_id(); 556 int cpu = smp_processor_id();
525 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 557 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
529 ktime_t now; 561 ktime_t now;
530 562
531 local_irq_disable(); 563 local_irq_disable();
564
532 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 565 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
533 now = ktime_get(); 566 now = ktime_get();
534 567
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
543 576
544 ts->inidle = 0; 577 ts->inidle = 0;
545 578
546 rcu_exit_nohz();
547
548 /* Update jiffies first */ 579 /* Update jiffies first */
549 select_nohz_load_balancer(0); 580 select_nohz_load_balancer(0);
550 tick_do_update_jiffies64(now); 581 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 169479994755..e6a5a6bc2769 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
131 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
133 133
134 /* return delta convert to nanoseconds using ntp adjusted mult. */ 134 /* return delta convert to nanoseconds. */
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 136}
137 137
@@ -249,6 +249,8 @@ ktime_t ktime_get(void)
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 251 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset();
252 254
253 } while (read_seqretry(&xtime_lock, seq)); 255 } while (read_seqretry(&xtime_lock, seq));
254 /* 256 /*
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)
280 *ts = xtime; 282 *ts = xtime;
281 tomono = wall_to_monotonic; 283 tomono = wall_to_monotonic;
282 nsecs = timekeeping_get_ns(); 284 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset();
283 287
284 } while (read_seqretry(&xtime_lock, seq)); 288 } while (read_seqretry(&xtime_lock, seq));
285 289
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset)
802 s64 error, interval = timekeeper.cycle_interval; 806 s64 error, interval = timekeeper.cycle_interval;
803 int adj; 807 int adj;
804 808
809 /*
810 * The point of this is to check if the error is greater than half
811 * an interval.
812 *
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
814 *
815 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparison as still measuring if error is
818 * larger than half an interval.
819 *
820 * Note: It does not "save" on aggravation when reading the code.
821 */
805 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
806 if (error > interval) { 823 if (error > interval) {
824 /*
825 * We now divide error by 4(via shift), which checks if
826 * the error is greater than twice the interval.
827 * If it is greater, we need a bigadjust, if it's smaller,
828 * we can adjust by 1.
829 */
807 error >>= 2; 830 error >>= 2;
831 /*
832 * XXX - In update_wall_time, we round up to the next
833 * nanosecond, and store the amount rounded up into
834 * the error. This causes the likely below to be unlikely.
835 *
836 * The proper fix is to avoid rounding up by using
837 * the high precision timekeeper.xtime_nsec instead of
838 * xtime.tv_nsec everywhere. Fixing this will take some
839 * time.
840 */
808 if (likely(error <= interval)) 841 if (likely(error <= interval))
809 adj = 1; 842 adj = 1;
810 else 843 else
811 adj = timekeeping_bigadjust(error, &interval, &offset); 844 adj = timekeeping_bigadjust(error, &interval, &offset);
812 } else if (error < -interval) { 845 } else if (error < -interval) {
846 /* See comment above, this is just switched for the negative */
813 error >>= 2; 847 error >>= 2;
814 if (likely(error >= -interval)) { 848 if (likely(error >= -interval)) {
815 adj = -1; 849 adj = -1;
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset)
817 offset = -offset; 851 offset = -offset;
818 } else 852 } else
819 adj = timekeeping_bigadjust(error, &interval, &offset); 853 adj = timekeeping_bigadjust(error, &interval, &offset);
820 } else 854 } else /* No adjustment needed */
821 return; 855 return;
822 856
857 WARN_ONCE(timekeeper.clock->maxadj &&
858 (timekeeper.mult + adj > timekeeper.clock->mult +
859 timekeeper.clock->maxadj),
860 "Adjusting %s more then 11%% (%ld vs %ld)\n",
861 timekeeper.clock->name, (long)timekeeper.mult + adj,
862 (long)timekeeper.clock->mult +
863 timekeeper.clock->maxadj);
864 /*
865 * So the following can be confusing.
866 *
867 * To keep things simple, lets assume adj == 1 for now.
868 *
869 * When adj != 1, remember that the interval and offset values
870 * have been appropriately scaled so the math is the same.
871 *
872 * The basic idea here is that we're increasing the multiplier
873 * by one, this causes the xtime_interval to be incremented by
874 * one cycle_interval. This is because:
875 * xtime_interval = cycle_interval * mult
876 * So if mult is being incremented by one:
877 * xtime_interval = cycle_interval * (mult + 1)
878 * It's the same as:
879 * xtime_interval = (cycle_interval * mult) + cycle_interval
880 * Which can be shortened to:
881 * xtime_interval += cycle_interval
882 *
883 * So offset stores the non-accumulated cycles. Thus the current
884 * time (in shifted nanoseconds) is:
885 * now = (offset * adj) + xtime_nsec
886 * Now, even though we're adjusting the clock frequency, we have
887 * to keep time consistent. In other words, we can't jump back
888 * in time, and we also want to avoid jumping forward in time.
889 *
890 * So given the same offset value, we need the time to be the same
891 * both before and after the freq adjustment.
892 * now = (offset * adj_1) + xtime_nsec_1
893 * now = (offset * adj_2) + xtime_nsec_2
894 * So:
895 * (offset * adj_1) + xtime_nsec_1 =
896 * (offset * adj_2) + xtime_nsec_2
897 * And we know:
898 * adj_2 = adj_1 + 1
899 * So:
900 * (offset * adj_1) + xtime_nsec_1 =
901 * (offset * (adj_1+1)) + xtime_nsec_2
902 * (offset * adj_1) + xtime_nsec_1 =
903 * (offset * adj_1) + offset + xtime_nsec_2
904 * Canceling the sides:
905 * xtime_nsec_1 = offset + xtime_nsec_2
906 * Which gives us:
907 * xtime_nsec_2 = xtime_nsec_1 - offset
908 * Which simplifies to:
909 * xtime_nsec -= offset
910 *
911 * XXX - TODO: Doc ntp_error calculation.
912 */
823 timekeeper.mult += adj; 913 timekeeper.mult += adj;
824 timekeeper.xtime_interval += interval; 914 timekeeper.xtime_interval += interval;
825 timekeeper.xtime_nsec -= offset; 915 timekeeper.xtime_nsec -= offset;
diff --git a/kernel/timer.c b/kernel/timer.c
index dbaa62422b13..a297ffcf888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 } 427 }
428} 428}
429 429
430/* Stub timer callback for improperly used timers. */
431static void stub_timer(unsigned long data)
432{
433 WARN_ON(1);
434}
435
430/* 436/*
431 * fixup_activate is called when: 437 * fixup_activate is called when:
432 * - an active object is activated 438 * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
450 debug_object_activate(timer, &timer_debug_descr); 456 debug_object_activate(timer, &timer_debug_descr);
451 return 0; 457 return 0;
452 } else { 458 } else {
453 WARN_ON_ONCE(1); 459 setup_timer(timer, stub_timer, 0);
460 return 1;
454 } 461 }
455 return 0; 462 return 0;
456 463
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 } 487 }
481} 488}
482 489
490/*
491 * fixup_assert_init is called when:
492 * - an untracked/uninit-ed object is found
493 */
494static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
495{
496 struct timer_list *timer = addr;
497
498 switch (state) {
499 case ODEBUG_STATE_NOTAVAILABLE:
500 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
501 /*
502 * This is not really a fixup. The timer was
503 * statically initialized. We just make sure that it
504 * is tracked in the object tracker.
505 */
506 debug_object_init(timer, &timer_debug_descr);
507 return 0;
508 } else {
509 setup_timer(timer, stub_timer, 0);
510 return 1;
511 }
512 default:
513 return 0;
514 }
515}
516
483static struct debug_obj_descr timer_debug_descr = { 517static struct debug_obj_descr timer_debug_descr = {
484 .name = "timer_list", 518 .name = "timer_list",
485 .debug_hint = timer_debug_hint, 519 .debug_hint = timer_debug_hint,
486 .fixup_init = timer_fixup_init, 520 .fixup_init = timer_fixup_init,
487 .fixup_activate = timer_fixup_activate, 521 .fixup_activate = timer_fixup_activate,
488 .fixup_free = timer_fixup_free, 522 .fixup_free = timer_fixup_free,
523 .fixup_assert_init = timer_fixup_assert_init,
489}; 524};
490 525
491static inline void debug_timer_init(struct timer_list *timer) 526static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
508 debug_object_free(timer, &timer_debug_descr); 543 debug_object_free(timer, &timer_debug_descr);
509} 544}
510 545
546static inline void debug_timer_assert_init(struct timer_list *timer)
547{
548 debug_object_assert_init(timer, &timer_debug_descr);
549}
550
511static void __init_timer(struct timer_list *timer, 551static void __init_timer(struct timer_list *timer,
512 const char *name, 552 const char *name,
513 struct lock_class_key *key); 553 struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
531static inline void debug_timer_init(struct timer_list *timer) { } 571static inline void debug_timer_init(struct timer_list *timer) { }
532static inline void debug_timer_activate(struct timer_list *timer) { } 572static inline void debug_timer_activate(struct timer_list *timer) { }
533static inline void debug_timer_deactivate(struct timer_list *timer) { } 573static inline void debug_timer_deactivate(struct timer_list *timer) { }
574static inline void debug_timer_assert_init(struct timer_list *timer) { }
534#endif 575#endif
535 576
536static inline void debug_init(struct timer_list *timer) 577static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
552 trace_timer_cancel(timer); 593 trace_timer_cancel(timer);
553} 594}
554 595
596static inline void debug_assert_init(struct timer_list *timer)
597{
598 debug_timer_assert_init(timer);
599}
600
555static void __init_timer(struct timer_list *timer, 601static void __init_timer(struct timer_list *timer,
556 const char *name, 602 const char *name,
557 struct lock_class_key *key) 603 struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
902 unsigned long flags; 948 unsigned long flags;
903 int ret = 0; 949 int ret = 0;
904 950
951 debug_assert_init(timer);
952
905 timer_stats_timer_clear_start_info(timer); 953 timer_stats_timer_clear_start_info(timer);
906 if (timer_pending(timer)) { 954 if (timer_pending(timer)) {
907 base = lock_timer_base(timer, &flags); 955 base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
932 unsigned long flags; 980 unsigned long flags;
933 int ret = -1; 981 int ret = -1;
934 982
983 debug_assert_init(timer);
984
935 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
936 986
937 if (base->running_timer == timer) 987 if (base->running_timer == timer)
@@ -1368,7 +1418,7 @@ SYSCALL_DEFINE0(getppid)
1368 int pid; 1418 int pid;
1369 1419
1370 rcu_read_lock(); 1420 rcu_read_lock();
1371 pid = task_tgid_vnr(current->real_parent); 1421 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1372 rcu_read_unlock(); 1422 rcu_read_unlock();
1373 1423
1374 return pid; 1424 return pid;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806f..cdea7b56b0c9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
402 402
403static struct dentry *blk_create_buf_file_callback(const char *filename, 403static struct dentry *blk_create_buf_file_callback(const char *filename,
404 struct dentry *parent, 404 struct dentry *parent,
405 int mode, 405 umode_t mode,
406 struct rchan_buf *buf, 406 struct rchan_buf *buf,
407 int *is_global) 407 int *is_global)
408{ 408{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900b409543db..683d559a0eef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,11 +22,13 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/bsearch.h>
25#include <linux/module.h> 26#include <linux/module.h>
26#include <linux/ftrace.h> 27#include <linux/ftrace.h>
27#include <linux/sysctl.h> 28#include <linux/sysctl.h>
28#include <linux/slab.h> 29#include <linux/slab.h>
29#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/sort.h>
30#include <linux/list.h> 32#include <linux/list.h>
31#include <linux/hash.h> 33#include <linux/hash.h>
32#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
@@ -152,7 +154,6 @@ void clear_ftrace_function(void)
152 ftrace_pid_function = ftrace_stub; 154 ftrace_pid_function = ftrace_stub;
153} 155}
154 156
155#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
156#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST 157#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
157/* 158/*
158 * For those archs that do not test ftrace_trace_stop in their 159 * For those archs that do not test ftrace_trace_stop in their
@@ -948,13 +949,6 @@ struct ftrace_func_probe {
948 struct rcu_head rcu; 949 struct rcu_head rcu;
949}; 950};
950 951
951enum {
952 FTRACE_ENABLE_CALLS = (1 << 0),
953 FTRACE_DISABLE_CALLS = (1 << 1),
954 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
955 FTRACE_START_FUNC_RET = (1 << 3),
956 FTRACE_STOP_FUNC_RET = (1 << 4),
957};
958struct ftrace_func_entry { 952struct ftrace_func_entry {
959 struct hlist_node hlist; 953 struct hlist_node hlist;
960 unsigned long ip; 954 unsigned long ip;
@@ -985,18 +979,19 @@ static struct ftrace_ops global_ops = {
985 .filter_hash = EMPTY_HASH, 979 .filter_hash = EMPTY_HASH,
986}; 980};
987 981
988static struct dyn_ftrace *ftrace_new_addrs;
989
990static DEFINE_MUTEX(ftrace_regex_lock); 982static DEFINE_MUTEX(ftrace_regex_lock);
991 983
992struct ftrace_page { 984struct ftrace_page {
993 struct ftrace_page *next; 985 struct ftrace_page *next;
986 struct dyn_ftrace *records;
994 int index; 987 int index;
995 struct dyn_ftrace records[]; 988 int size;
996}; 989};
997 990
998#define ENTRIES_PER_PAGE \ 991static struct ftrace_page *ftrace_new_pgs;
999 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) 992
993#define ENTRY_SIZE sizeof(struct dyn_ftrace)
994#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
1000 995
1001/* estimate from running different kernels */ 996/* estimate from running different kernels */
1002#define NR_TO_INIT 10000 997#define NR_TO_INIT 10000
@@ -1004,7 +999,10 @@ struct ftrace_page {
1004static struct ftrace_page *ftrace_pages_start; 999static struct ftrace_page *ftrace_pages_start;
1005static struct ftrace_page *ftrace_pages; 1000static struct ftrace_page *ftrace_pages;
1006 1001
1007static struct dyn_ftrace *ftrace_free_records; 1002static bool ftrace_hash_empty(struct ftrace_hash *hash)
1003{
1004 return !hash || !hash->count;
1005}
1008 1006
1009static struct ftrace_func_entry * 1007static struct ftrace_func_entry *
1010ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) 1008ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1014,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
1014 struct hlist_head *hhd; 1012 struct hlist_head *hhd;
1015 struct hlist_node *n; 1013 struct hlist_node *n;
1016 1014
1017 if (!hash->count) 1015 if (ftrace_hash_empty(hash))
1018 return NULL; 1016 return NULL;
1019 1017
1020 if (hash->size_bits > 0) 1018 if (hash->size_bits > 0)
@@ -1158,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1158 return NULL; 1156 return NULL;
1159 1157
1160 /* Empty hash? */ 1158 /* Empty hash? */
1161 if (!hash || !hash->count) 1159 if (ftrace_hash_empty(hash))
1162 return new_hash; 1160 return new_hash;
1163 1161
1164 size = 1 << hash->size_bits; 1162 size = 1 << hash->size_bits;
@@ -1212,7 +1210,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1212 if (!src->count) { 1210 if (!src->count) {
1213 free_ftrace_hash_rcu(*dst); 1211 free_ftrace_hash_rcu(*dst);
1214 rcu_assign_pointer(*dst, EMPTY_HASH); 1212 rcu_assign_pointer(*dst, EMPTY_HASH);
1215 return 0; 1213 /* still need to update the function records */
1214 ret = 0;
1215 goto out;
1216 } 1216 }
1217 1217
1218 /* 1218 /*
@@ -1281,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1281 filter_hash = rcu_dereference_raw(ops->filter_hash); 1281 filter_hash = rcu_dereference_raw(ops->filter_hash);
1282 notrace_hash = rcu_dereference_raw(ops->notrace_hash); 1282 notrace_hash = rcu_dereference_raw(ops->notrace_hash);
1283 1283
1284 if ((!filter_hash || !filter_hash->count || 1284 if ((ftrace_hash_empty(filter_hash) ||
1285 ftrace_lookup_ip(filter_hash, ip)) && 1285 ftrace_lookup_ip(filter_hash, ip)) &&
1286 (!notrace_hash || !notrace_hash->count || 1286 (ftrace_hash_empty(notrace_hash) ||
1287 !ftrace_lookup_ip(notrace_hash, ip))) 1287 !ftrace_lookup_ip(notrace_hash, ip)))
1288 ret = 1; 1288 ret = 1;
1289 else 1289 else
@@ -1306,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
1306 } \ 1306 } \
1307 } 1307 }
1308 1308
1309
1310static int ftrace_cmp_recs(const void *a, const void *b)
1311{
1312 const struct dyn_ftrace *reca = a;
1313 const struct dyn_ftrace *recb = b;
1314
1315 if (reca->ip > recb->ip)
1316 return 1;
1317 if (reca->ip < recb->ip)
1318 return -1;
1319 return 0;
1320}
1321
1322/**
1323 * ftrace_location - return true if the ip given is a traced location
1324 * @ip: the instruction pointer to check
1325 *
1326 * Returns 1 if @ip given is a pointer to a ftrace location.
1327 * That is, the instruction that is either a NOP or call to
1328 * the function tracer. It checks the ftrace internal tables to
1329 * determine if the address belongs or not.
1330 */
1331int ftrace_location(unsigned long ip)
1332{
1333 struct ftrace_page *pg;
1334 struct dyn_ftrace *rec;
1335 struct dyn_ftrace key;
1336
1337 key.ip = ip;
1338
1339 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1340 rec = bsearch(&key, pg->records, pg->index,
1341 sizeof(struct dyn_ftrace),
1342 ftrace_cmp_recs);
1343 if (rec)
1344 return 1;
1345 }
1346
1347 return 0;
1348}
1349
1309static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1350static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1310 int filter_hash, 1351 int filter_hash,
1311 bool inc) 1352 bool inc)
@@ -1335,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1335 if (filter_hash) { 1376 if (filter_hash) {
1336 hash = ops->filter_hash; 1377 hash = ops->filter_hash;
1337 other_hash = ops->notrace_hash; 1378 other_hash = ops->notrace_hash;
1338 if (!hash || !hash->count) 1379 if (ftrace_hash_empty(hash))
1339 all = 1; 1380 all = 1;
1340 } else { 1381 } else {
1341 inc = !inc; 1382 inc = !inc;
@@ -1345,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1345 * If the notrace hash has no items, 1386 * If the notrace hash has no items,
1346 * then there's nothing to do. 1387 * then there's nothing to do.
1347 */ 1388 */
1348 if (hash && !hash->count) 1389 if (ftrace_hash_empty(hash))
1349 return; 1390 return;
1350 } 1391 }
1351 1392
@@ -1362,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1362 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) 1403 if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
1363 match = 1; 1404 match = 1;
1364 } else { 1405 } else {
1365 in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); 1406 in_hash = !!ftrace_lookup_ip(hash, rec->ip);
1366 in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); 1407 in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
1367 1408
1368 /* 1409 /*
1369 * 1410 *
@@ -1371,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1371 if (filter_hash && in_hash && !in_other_hash) 1412 if (filter_hash && in_hash && !in_other_hash)
1372 match = 1; 1413 match = 1;
1373 else if (!filter_hash && in_hash && 1414 else if (!filter_hash && in_hash &&
1374 (in_other_hash || !other_hash->count)) 1415 (in_other_hash || ftrace_hash_empty(other_hash)))
1375 match = 1; 1416 match = 1;
1376 } 1417 }
1377 if (!match) 1418 if (!match)
@@ -1405,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1405 __ftrace_hash_rec_update(ops, filter_hash, 1); 1446 __ftrace_hash_rec_update(ops, filter_hash, 1);
1406} 1447}
1407 1448
1408static void ftrace_free_rec(struct dyn_ftrace *rec)
1409{
1410 rec->freelist = ftrace_free_records;
1411 ftrace_free_records = rec;
1412 rec->flags |= FTRACE_FL_FREE;
1413}
1414
1415static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 1449static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
1416{ 1450{
1417 struct dyn_ftrace *rec; 1451 if (ftrace_pages->index == ftrace_pages->size) {
1418 1452 /* We should have allocated enough */
1419 /* First check for freed records */ 1453 if (WARN_ON(!ftrace_pages->next))
1420 if (ftrace_free_records) {
1421 rec = ftrace_free_records;
1422
1423 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
1424 FTRACE_WARN_ON_ONCE(1);
1425 ftrace_free_records = NULL;
1426 return NULL; 1454 return NULL;
1427 }
1428
1429 ftrace_free_records = rec->freelist;
1430 memset(rec, 0, sizeof(*rec));
1431 return rec;
1432 }
1433
1434 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
1435 if (!ftrace_pages->next) {
1436 /* allocate another page */
1437 ftrace_pages->next =
1438 (void *)get_zeroed_page(GFP_KERNEL);
1439 if (!ftrace_pages->next)
1440 return NULL;
1441 }
1442 ftrace_pages = ftrace_pages->next; 1455 ftrace_pages = ftrace_pages->next;
1443 } 1456 }
1444 1457
@@ -1458,8 +1471,6 @@ ftrace_record_ip(unsigned long ip)
1458 return NULL; 1471 return NULL;
1459 1472
1460 rec->ip = ip; 1473 rec->ip = ip;
1461 rec->newlist = ftrace_new_addrs;
1462 ftrace_new_addrs = rec;
1463 1474
1464 return rec; 1475 return rec;
1465} 1476}
@@ -1474,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
1474 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); 1485 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
1475} 1486}
1476 1487
1477static void ftrace_bug(int failed, unsigned long ip) 1488/**
1489 * ftrace_bug - report and shutdown function tracer
1490 * @failed: The failed type (EFAULT, EINVAL, EPERM)
1491 * @ip: The address that failed
1492 *
1493 * The arch code that enables or disables the function tracing
1494 * can call ftrace_bug() when it has detected a problem in
1495 * modifying the code. @failed should be one of either:
1496 * EFAULT - if the problem happens on reading the @ip address
1497 * EINVAL - if what is read at @ip is not what was expected
1498 * EPERM - if the problem happens on writing to the @ip address
1499 */
1500void ftrace_bug(int failed, unsigned long ip)
1478{ 1501{
1479 switch (failed) { 1502 switch (failed) {
1480 case -EFAULT: 1503 case -EFAULT:
@@ -1516,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end)
1516 return 0; 1539 return 0;
1517} 1540}
1518 1541
1519 1542static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
1520static int
1521__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1522{ 1543{
1523 unsigned long ftrace_addr;
1524 unsigned long flag = 0UL; 1544 unsigned long flag = 0UL;
1525 1545
1526 ftrace_addr = (unsigned long)FTRACE_ADDR;
1527
1528 /* 1546 /*
1529 * If we are enabling tracing: 1547 * If we are updating calls:
1530 * 1548 *
1531 * If the record has a ref count, then we need to enable it 1549 * If the record has a ref count, then we need to enable it
1532 * because someone is using it. 1550 * because someone is using it.
1533 * 1551 *
1534 * Otherwise we make sure its disabled. 1552 * Otherwise we make sure its disabled.
1535 * 1553 *
1536 * If we are disabling tracing, then disable all records that 1554 * If we are disabling calls, then disable all records that
1537 * are enabled. 1555 * are enabled.
1538 */ 1556 */
1539 if (enable && (rec->flags & ~FTRACE_FL_MASK)) 1557 if (enable && (rec->flags & ~FTRACE_FL_MASK))
@@ -1541,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1541 1559
1542 /* If the state of this record hasn't changed, then do nothing */ 1560 /* If the state of this record hasn't changed, then do nothing */
1543 if ((rec->flags & FTRACE_FL_ENABLED) == flag) 1561 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1544 return 0; 1562 return FTRACE_UPDATE_IGNORE;
1545 1563
1546 if (flag) { 1564 if (flag) {
1547 rec->flags |= FTRACE_FL_ENABLED; 1565 if (update)
1566 rec->flags |= FTRACE_FL_ENABLED;
1567 return FTRACE_UPDATE_MAKE_CALL;
1568 }
1569
1570 if (update)
1571 rec->flags &= ~FTRACE_FL_ENABLED;
1572
1573 return FTRACE_UPDATE_MAKE_NOP;
1574}
1575
1576/**
1577 * ftrace_update_record, set a record that now is tracing or not
1578 * @rec: the record to update
1579 * @enable: set to 1 if the record is tracing, zero to force disable
1580 *
1581 * The records that represent all functions that can be traced need
1582 * to be updated when tracing has been enabled.
1583 */
1584int ftrace_update_record(struct dyn_ftrace *rec, int enable)
1585{
1586 return ftrace_check_record(rec, enable, 1);
1587}
1588
1589/**
1590 * ftrace_test_record, check if the record has been enabled or not
1591 * @rec: the record to test
1592 * @enable: set to 1 to check if enabled, 0 if it is disabled
1593 *
1594 * The arch code may need to test if a record is already set to
1595 * tracing to determine how to modify the function code that it
1596 * represents.
1597 */
1598int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1599{
1600 return ftrace_check_record(rec, enable, 0);
1601}
1602
1603static int
1604__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1605{
1606 unsigned long ftrace_addr;
1607 int ret;
1608
1609 ftrace_addr = (unsigned long)FTRACE_ADDR;
1610
1611 ret = ftrace_update_record(rec, enable);
1612
1613 switch (ret) {
1614 case FTRACE_UPDATE_IGNORE:
1615 return 0;
1616
1617 case FTRACE_UPDATE_MAKE_CALL:
1548 return ftrace_make_call(rec, ftrace_addr); 1618 return ftrace_make_call(rec, ftrace_addr);
1619
1620 case FTRACE_UPDATE_MAKE_NOP:
1621 return ftrace_make_nop(NULL, rec, ftrace_addr);
1549 } 1622 }
1550 1623
1551 rec->flags &= ~FTRACE_FL_ENABLED; 1624 return -1; /* unknown ftrace bug */
1552 return ftrace_make_nop(NULL, rec, ftrace_addr);
1553} 1625}
1554 1626
1555static void ftrace_replace_code(int enable) 1627static void ftrace_replace_code(int update)
1556{ 1628{
1557 struct dyn_ftrace *rec; 1629 struct dyn_ftrace *rec;
1558 struct ftrace_page *pg; 1630 struct ftrace_page *pg;
@@ -1562,11 +1634,7 @@ static void ftrace_replace_code(int enable)
1562 return; 1634 return;
1563 1635
1564 do_for_each_ftrace_rec(pg, rec) { 1636 do_for_each_ftrace_rec(pg, rec) {
1565 /* Skip over free records */ 1637 failed = __ftrace_replace_code(rec, update);
1566 if (rec->flags & FTRACE_FL_FREE)
1567 continue;
1568
1569 failed = __ftrace_replace_code(rec, enable);
1570 if (failed) { 1638 if (failed) {
1571 ftrace_bug(failed, rec->ip); 1639 ftrace_bug(failed, rec->ip);
1572 /* Stop processing */ 1640 /* Stop processing */
@@ -1575,6 +1643,78 @@ static void ftrace_replace_code(int enable)
1575 } while_for_each_ftrace_rec(); 1643 } while_for_each_ftrace_rec();
1576} 1644}
1577 1645
1646struct ftrace_rec_iter {
1647 struct ftrace_page *pg;
1648 int index;
1649};
1650
1651/**
1652 * ftrace_rec_iter_start, start up iterating over traced functions
1653 *
1654 * Returns an iterator handle that is used to iterate over all
1655 * the records that represent address locations where functions
1656 * are traced.
1657 *
1658 * May return NULL if no records are available.
1659 */
1660struct ftrace_rec_iter *ftrace_rec_iter_start(void)
1661{
1662 /*
1663 * We only use a single iterator.
1664 * Protected by the ftrace_lock mutex.
1665 */
1666 static struct ftrace_rec_iter ftrace_rec_iter;
1667 struct ftrace_rec_iter *iter = &ftrace_rec_iter;
1668
1669 iter->pg = ftrace_pages_start;
1670 iter->index = 0;
1671
1672 /* Could have empty pages */
1673 while (iter->pg && !iter->pg->index)
1674 iter->pg = iter->pg->next;
1675
1676 if (!iter->pg)
1677 return NULL;
1678
1679 return iter;
1680}
1681
1682/**
1683 * ftrace_rec_iter_next, get the next record to process.
1684 * @iter: The handle to the iterator.
1685 *
1686 * Returns the next iterator after the given iterator @iter.
1687 */
1688struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
1689{
1690 iter->index++;
1691
1692 if (iter->index >= iter->pg->index) {
1693 iter->pg = iter->pg->next;
1694 iter->index = 0;
1695
1696 /* Could have empty pages */
1697 while (iter->pg && !iter->pg->index)
1698 iter->pg = iter->pg->next;
1699 }
1700
1701 if (!iter->pg)
1702 return NULL;
1703
1704 return iter;
1705}
1706
1707/**
1708 * ftrace_rec_iter_record, get the record at the iterator location
1709 * @iter: The current iterator location
1710 *
1711 * Returns the record that the current @iter is at.
1712 */
1713struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
1714{
1715 return &iter->pg->records[iter->index];
1716}
1717
1578static int 1718static int
1579ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) 1719ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
1580{ 1720{
@@ -1616,13 +1756,7 @@ static int __ftrace_modify_code(void *data)
1616{ 1756{
1617 int *command = data; 1757 int *command = data;
1618 1758
1619 /* 1759 if (*command & FTRACE_UPDATE_CALLS)
1620 * Do not call function tracer while we update the code.
1621 * We are in stop machine, no worrying about races.
1622 */
1623 function_trace_stop++;
1624
1625 if (*command & FTRACE_ENABLE_CALLS)
1626 ftrace_replace_code(1); 1760 ftrace_replace_code(1);
1627 else if (*command & FTRACE_DISABLE_CALLS) 1761 else if (*command & FTRACE_DISABLE_CALLS)
1628 ftrace_replace_code(0); 1762 ftrace_replace_code(0);
@@ -1635,21 +1769,33 @@ static int __ftrace_modify_code(void *data)
1635 else if (*command & FTRACE_STOP_FUNC_RET) 1769 else if (*command & FTRACE_STOP_FUNC_RET)
1636 ftrace_disable_ftrace_graph_caller(); 1770 ftrace_disable_ftrace_graph_caller();
1637 1771
1638#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1639 /*
1640 * For archs that call ftrace_test_stop_func(), we must
1641 * wait till after we update all the function callers
1642 * before we update the callback. This keeps different
1643 * ops that record different functions from corrupting
1644 * each other.
1645 */
1646 __ftrace_trace_function = __ftrace_trace_function_delay;
1647#endif
1648 function_trace_stop--;
1649
1650 return 0; 1772 return 0;
1651} 1773}
1652 1774
1775/**
1776 * ftrace_run_stop_machine, go back to the stop machine method
1777 * @command: The command to tell ftrace what to do
1778 *
1779 * If an arch needs to fall back to the stop machine method, then
1780 * it can call this function.
1781 */
1782void ftrace_run_stop_machine(int command)
1783{
1784 stop_machine(__ftrace_modify_code, &command, NULL);
1785}
1786
1787/**
1788 * arch_ftrace_update_code, modify the code to trace or not trace
1789 * @command: The command that needs to be done
1790 *
1791 * Archs can override this function if it does not need to
1792 * run stop_machine() to modify code.
1793 */
1794void __weak arch_ftrace_update_code(int command)
1795{
1796 ftrace_run_stop_machine(command);
1797}
1798
1653static void ftrace_run_update_code(int command) 1799static void ftrace_run_update_code(int command)
1654{ 1800{
1655 int ret; 1801 int ret;
@@ -1658,8 +1804,31 @@ static void ftrace_run_update_code(int command)
1658 FTRACE_WARN_ON(ret); 1804 FTRACE_WARN_ON(ret);
1659 if (ret) 1805 if (ret)
1660 return; 1806 return;
1807 /*
1808 * Do not call function tracer while we update the code.
1809 * We are in stop machine.
1810 */
1811 function_trace_stop++;
1661 1812
1662 stop_machine(__ftrace_modify_code, &command, NULL); 1813 /*
1814 * By default we use stop_machine() to modify the code.
1815 * But archs can do what ever they want as long as it
1816 * is safe. The stop_machine() is the safest, but also
1817 * produces the most overhead.
1818 */
1819 arch_ftrace_update_code(command);
1820
1821#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
1822 /*
1823 * For archs that call ftrace_test_stop_func(), we must
1824 * wait till after we update all the function callers
1825 * before we update the callback. This keeps different
1826 * ops that record different functions from corrupting
1827 * each other.
1828 */
1829 __ftrace_trace_function = __ftrace_trace_function_delay;
1830#endif
1831 function_trace_stop--;
1663 1832
1664 ret = ftrace_arch_code_modify_post_process(); 1833 ret = ftrace_arch_code_modify_post_process();
1665 FTRACE_WARN_ON(ret); 1834 FTRACE_WARN_ON(ret);
@@ -1690,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
1690 return -ENODEV; 1859 return -ENODEV;
1691 1860
1692 ftrace_start_up++; 1861 ftrace_start_up++;
1693 command |= FTRACE_ENABLE_CALLS; 1862 command |= FTRACE_UPDATE_CALLS;
1694 1863
1695 /* ops marked global share the filter hashes */ 1864 /* ops marked global share the filter hashes */
1696 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 1865 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1742,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
1742 if (ops != &global_ops || !global_start_up) 1911 if (ops != &global_ops || !global_start_up)
1743 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 1912 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
1744 1913
1745 if (!ftrace_start_up) 1914 command |= FTRACE_UPDATE_CALLS;
1746 command |= FTRACE_DISABLE_CALLS;
1747 1915
1748 if (saved_ftrace_func != ftrace_trace_function) { 1916 if (saved_ftrace_func != ftrace_trace_function) {
1749 saved_ftrace_func = ftrace_trace_function; 1917 saved_ftrace_func = ftrace_trace_function;
@@ -1765,7 +1933,7 @@ static void ftrace_startup_sysctl(void)
1765 saved_ftrace_func = NULL; 1933 saved_ftrace_func = NULL;
1766 /* ftrace_start_up is true if we want ftrace running */ 1934 /* ftrace_start_up is true if we want ftrace running */
1767 if (ftrace_start_up) 1935 if (ftrace_start_up)
1768 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1936 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
1769} 1937}
1770 1938
1771static void ftrace_shutdown_sysctl(void) 1939static void ftrace_shutdown_sysctl(void)
@@ -1787,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops)
1787 struct ftrace_hash *hash; 1955 struct ftrace_hash *hash;
1788 1956
1789 hash = ops->filter_hash; 1957 hash = ops->filter_hash;
1790 return !!(!hash || !hash->count); 1958 return ftrace_hash_empty(hash);
1791} 1959}
1792 1960
1793static int ftrace_update_code(struct module *mod) 1961static int ftrace_update_code(struct module *mod)
1794{ 1962{
1963 struct ftrace_page *pg;
1795 struct dyn_ftrace *p; 1964 struct dyn_ftrace *p;
1796 cycle_t start, stop; 1965 cycle_t start, stop;
1797 unsigned long ref = 0; 1966 unsigned long ref = 0;
1967 int i;
1798 1968
1799 /* 1969 /*
1800 * When adding a module, we need to check if tracers are 1970 * When adding a module, we need to check if tracers are
@@ -1816,46 +1986,44 @@ static int ftrace_update_code(struct module *mod)
1816 start = ftrace_now(raw_smp_processor_id()); 1986 start = ftrace_now(raw_smp_processor_id());
1817 ftrace_update_cnt = 0; 1987 ftrace_update_cnt = 0;
1818 1988
1819 while (ftrace_new_addrs) { 1989 for (pg = ftrace_new_pgs; pg; pg = pg->next) {
1820 1990
1821 /* If something went wrong, bail without enabling anything */ 1991 for (i = 0; i < pg->index; i++) {
1822 if (unlikely(ftrace_disabled)) 1992 /* If something went wrong, bail without enabling anything */
1823 return -1; 1993 if (unlikely(ftrace_disabled))
1994 return -1;
1824 1995
1825 p = ftrace_new_addrs; 1996 p = &pg->records[i];
1826 ftrace_new_addrs = p->newlist; 1997 p->flags = ref;
1827 p->flags = ref;
1828 1998
1829 /* 1999 /*
1830 * Do the initial record conversion from mcount jump 2000 * Do the initial record conversion from mcount jump
1831 * to the NOP instructions. 2001 * to the NOP instructions.
1832 */ 2002 */
1833 if (!ftrace_code_disable(mod, p)) { 2003 if (!ftrace_code_disable(mod, p))
1834 ftrace_free_rec(p); 2004 break;
1835 /* Game over */
1836 break;
1837 }
1838 2005
1839 ftrace_update_cnt++; 2006 ftrace_update_cnt++;
1840 2007
1841 /* 2008 /*
1842 * If the tracing is enabled, go ahead and enable the record. 2009 * If the tracing is enabled, go ahead and enable the record.
1843 * 2010 *
1844 * The reason not to enable the record immediatelly is the 2011 * The reason not to enable the record immediatelly is the
1845 * inherent check of ftrace_make_nop/ftrace_make_call for 2012 * inherent check of ftrace_make_nop/ftrace_make_call for
1846 * correct previous instructions. Making first the NOP 2013 * correct previous instructions. Making first the NOP
1847 * conversion puts the module to the correct state, thus 2014 * conversion puts the module to the correct state, thus
1848 * passing the ftrace_make_call check. 2015 * passing the ftrace_make_call check.
1849 */ 2016 */
1850 if (ftrace_start_up && ref) { 2017 if (ftrace_start_up && ref) {
1851 int failed = __ftrace_replace_code(p, 1); 2018 int failed = __ftrace_replace_code(p, 1);
1852 if (failed) { 2019 if (failed)
1853 ftrace_bug(failed, p->ip); 2020 ftrace_bug(failed, p->ip);
1854 ftrace_free_rec(p);
1855 } 2021 }
1856 } 2022 }
1857 } 2023 }
1858 2024
2025 ftrace_new_pgs = NULL;
2026
1859 stop = ftrace_now(raw_smp_processor_id()); 2027 stop = ftrace_now(raw_smp_processor_id());
1860 ftrace_update_time = stop - start; 2028 ftrace_update_time = stop - start;
1861 ftrace_update_tot_cnt += ftrace_update_cnt; 2029 ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -1863,57 +2031,108 @@ static int ftrace_update_code(struct module *mod)
1863 return 0; 2031 return 0;
1864} 2032}
1865 2033
1866static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) 2034static int ftrace_allocate_records(struct ftrace_page *pg, int count)
1867{ 2035{
1868 struct ftrace_page *pg; 2036 int order;
1869 int cnt; 2037 int cnt;
1870 int i;
1871 2038
1872 /* allocate a few pages */ 2039 if (WARN_ON(!count))
1873 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); 2040 return -EINVAL;
1874 if (!ftrace_pages_start) 2041
1875 return -1; 2042 order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
1876 2043
1877 /* 2044 /*
1878 * Allocate a few more pages. 2045 * We want to fill as much as possible. No more than a page
1879 * 2046 * may be empty.
1880 * TODO: have some parser search vmlinux before
1881 * final linking to find all calls to ftrace.
1882 * Then we can:
1883 * a) know how many pages to allocate.
1884 * and/or
1885 * b) set up the table then.
1886 *
1887 * The dynamic code is still necessary for
1888 * modules.
1889 */ 2047 */
2048 while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
2049 order--;
1890 2050
1891 pg = ftrace_pages = ftrace_pages_start; 2051 again:
2052 pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
1892 2053
1893 cnt = num_to_init / ENTRIES_PER_PAGE; 2054 if (!pg->records) {
1894 pr_info("ftrace: allocating %ld entries in %d pages\n", 2055 /* if we can't allocate this size, try something smaller */
1895 num_to_init, cnt + 1); 2056 if (!order)
2057 return -ENOMEM;
2058 order >>= 1;
2059 goto again;
2060 }
1896 2061
1897 for (i = 0; i < cnt; i++) { 2062 cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
1898 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 2063 pg->size = cnt;
1899 2064
1900 /* If we fail, we'll try later anyway */ 2065 if (cnt > count)
1901 if (!pg->next) 2066 cnt = count;
2067
2068 return cnt;
2069}
2070
2071static struct ftrace_page *
2072ftrace_allocate_pages(unsigned long num_to_init)
2073{
2074 struct ftrace_page *start_pg;
2075 struct ftrace_page *pg;
2076 int order;
2077 int cnt;
2078
2079 if (!num_to_init)
2080 return 0;
2081
2082 start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
2083 if (!pg)
2084 return NULL;
2085
2086 /*
2087 * Try to allocate as much as possible in one continues
2088 * location that fills in all of the space. We want to
2089 * waste as little space as possible.
2090 */
2091 for (;;) {
2092 cnt = ftrace_allocate_records(pg, num_to_init);
2093 if (cnt < 0)
2094 goto free_pages;
2095
2096 num_to_init -= cnt;
2097 if (!num_to_init)
1902 break; 2098 break;
1903 2099
2100 pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
2101 if (!pg->next)
2102 goto free_pages;
2103
1904 pg = pg->next; 2104 pg = pg->next;
1905 } 2105 }
1906 2106
1907 return 0; 2107 return start_pg;
2108
2109 free_pages:
2110 while (start_pg) {
2111 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
2112 free_pages((unsigned long)pg->records, order);
2113 start_pg = pg->next;
2114 kfree(pg);
2115 pg = start_pg;
2116 }
2117 pr_info("ftrace: FAILED to allocate memory for functions\n");
2118 return NULL;
1908} 2119}
1909 2120
1910enum { 2121static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1911 FTRACE_ITER_FILTER = (1 << 0), 2122{
1912 FTRACE_ITER_NOTRACE = (1 << 1), 2123 int cnt;
1913 FTRACE_ITER_PRINTALL = (1 << 2), 2124
1914 FTRACE_ITER_HASH = (1 << 3), 2125 if (!num_to_init) {
1915 FTRACE_ITER_ENABLED = (1 << 4), 2126 pr_info("ftrace: No functions to be traced?\n");
1916}; 2127 return -1;
2128 }
2129
2130 cnt = num_to_init / ENTRIES_PER_PAGE;
2131 pr_info("ftrace: allocating %ld entries in %d pages\n",
2132 num_to_init, cnt + 1);
2133
2134 return 0;
2135}
1917 2136
1918#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 2137#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
1919 2138
@@ -1979,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
1979 void *p = NULL; 2198 void *p = NULL;
1980 loff_t l; 2199 loff_t l;
1981 2200
2201 if (!(iter->flags & FTRACE_ITER_DO_HASH))
2202 return NULL;
2203
1982 if (iter->func_pos > *pos) 2204 if (iter->func_pos > *pos)
1983 return NULL; 2205 return NULL;
1984 2206
@@ -2022,7 +2244,7 @@ static void *
2022t_next(struct seq_file *m, void *v, loff_t *pos) 2244t_next(struct seq_file *m, void *v, loff_t *pos)
2023{ 2245{
2024 struct ftrace_iterator *iter = m->private; 2246 struct ftrace_iterator *iter = m->private;
2025 struct ftrace_ops *ops = &global_ops; 2247 struct ftrace_ops *ops = iter->ops;
2026 struct dyn_ftrace *rec = NULL; 2248 struct dyn_ftrace *rec = NULL;
2027 2249
2028 if (unlikely(ftrace_disabled)) 2250 if (unlikely(ftrace_disabled))
@@ -2046,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2046 } 2268 }
2047 } else { 2269 } else {
2048 rec = &iter->pg->records[iter->idx++]; 2270 rec = &iter->pg->records[iter->idx++];
2049 if ((rec->flags & FTRACE_FL_FREE) || 2271 if (((iter->flags & FTRACE_ITER_FILTER) &&
2050
2051 ((iter->flags & FTRACE_ITER_FILTER) &&
2052 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2272 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
2053 2273
2054 ((iter->flags & FTRACE_ITER_NOTRACE) && 2274 ((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2080,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
2080static void *t_start(struct seq_file *m, loff_t *pos) 2300static void *t_start(struct seq_file *m, loff_t *pos)
2081{ 2301{
2082 struct ftrace_iterator *iter = m->private; 2302 struct ftrace_iterator *iter = m->private;
2083 struct ftrace_ops *ops = &global_ops; 2303 struct ftrace_ops *ops = iter->ops;
2084 void *p = NULL; 2304 void *p = NULL;
2085 loff_t l; 2305 loff_t l;
2086 2306
@@ -2100,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2100 * off, we can short cut and just print out that all 2320 * off, we can short cut and just print out that all
2101 * functions are enabled. 2321 * functions are enabled.
2102 */ 2322 */
2103 if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { 2323 if (iter->flags & FTRACE_ITER_FILTER &&
2324 ftrace_hash_empty(ops->filter_hash)) {
2104 if (*pos > 0) 2325 if (*pos > 0)
2105 return t_hash_start(m, pos); 2326 return t_hash_start(m, pos);
2106 iter->flags |= FTRACE_ITER_PRINTALL; 2327 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2125,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2125 break; 2346 break;
2126 } 2347 }
2127 2348
2128 if (!p) { 2349 if (!p)
2129 if (iter->flags & FTRACE_ITER_FILTER) 2350 return t_hash_start(m, pos);
2130 return t_hash_start(m, pos);
2131
2132 return NULL;
2133 }
2134 2351
2135 return iter; 2352 return iter;
2136} 2353}
@@ -2188,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
2188 return -ENOMEM; 2405 return -ENOMEM;
2189 2406
2190 iter->pg = ftrace_pages_start; 2407 iter->pg = ftrace_pages_start;
2408 iter->ops = &global_ops;
2191 2409
2192 ret = seq_open(file, &show_ftrace_seq_ops); 2410 ret = seq_open(file, &show_ftrace_seq_ops);
2193 if (!ret) { 2411 if (!ret) {
@@ -2216,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
2216 2434
2217 iter->pg = ftrace_pages_start; 2435 iter->pg = ftrace_pages_start;
2218 iter->flags = FTRACE_ITER_ENABLED; 2436 iter->flags = FTRACE_ITER_ENABLED;
2437 iter->ops = &global_ops;
2219 2438
2220 ret = seq_open(file, &show_ftrace_seq_ops); 2439 ret = seq_open(file, &show_ftrace_seq_ops);
2221 if (!ret) { 2440 if (!ret) {
@@ -2236,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2236 mutex_unlock(&ftrace_lock); 2455 mutex_unlock(&ftrace_lock);
2237} 2456}
2238 2457
2239static int 2458/**
2459 * ftrace_regex_open - initialize function tracer filter files
2460 * @ops: The ftrace_ops that hold the hash filters
2461 * @flag: The type of filter to process
2462 * @inode: The inode, usually passed in to your open routine
2463 * @file: The file, usually passed in to your open routine
2464 *
2465 * ftrace_regex_open() initializes the filter files for the
2466 * @ops. Depending on @flag it may process the filter hash or
2467 * the notrace hash of @ops. With this called from the open
2468 * routine, you can use ftrace_filter_write() for the write
2469 * routine if @flag has FTRACE_ITER_FILTER set, or
2470 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2471 * ftrace_regex_lseek() should be used as the lseek routine, and
2472 * release must call ftrace_regex_release().
2473 */
2474int
2240ftrace_regex_open(struct ftrace_ops *ops, int flag, 2475ftrace_regex_open(struct ftrace_ops *ops, int flag,
2241 struct inode *inode, struct file *file) 2476 struct inode *inode, struct file *file)
2242{ 2477{
@@ -2305,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
2305static int 2540static int
2306ftrace_filter_open(struct inode *inode, struct file *file) 2541ftrace_filter_open(struct inode *inode, struct file *file)
2307{ 2542{
2308 return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, 2543 return ftrace_regex_open(&global_ops,
2309 inode, file); 2544 FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
2545 inode, file);
2310} 2546}
2311 2547
2312static int 2548static int
@@ -2316,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2316 inode, file); 2552 inode, file);
2317} 2553}
2318 2554
2319static loff_t 2555loff_t
2320ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 2556ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
2321{ 2557{
2322 loff_t ret; 2558 loff_t ret;
@@ -2425,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff,
2425 goto out_unlock; 2661 goto out_unlock;
2426 2662
2427 do_for_each_ftrace_rec(pg, rec) { 2663 do_for_each_ftrace_rec(pg, rec) {
2428
2429 if (ftrace_match_record(rec, mod, search, search_len, type)) { 2664 if (ftrace_match_record(rec, mod, search, search_len, type)) {
2430 ret = enter_record(hash, rec, not); 2665 ret = enter_record(hash, rec, not);
2431 if (ret < 0) { 2666 if (ret < 0) {
@@ -2870,14 +3105,14 @@ out_unlock:
2870 return ret; 3105 return ret;
2871} 3106}
2872 3107
2873static ssize_t 3108ssize_t
2874ftrace_filter_write(struct file *file, const char __user *ubuf, 3109ftrace_filter_write(struct file *file, const char __user *ubuf,
2875 size_t cnt, loff_t *ppos) 3110 size_t cnt, loff_t *ppos)
2876{ 3111{
2877 return ftrace_regex_write(file, ubuf, cnt, ppos, 1); 3112 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
2878} 3113}
2879 3114
2880static ssize_t 3115ssize_t
2881ftrace_notrace_write(struct file *file, const char __user *ubuf, 3116ftrace_notrace_write(struct file *file, const char __user *ubuf,
2882 size_t cnt, loff_t *ppos) 3117 size_t cnt, loff_t *ppos)
2883{ 3118{
@@ -2918,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
2918 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
2919 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED 3154 if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
2920 && ftrace_enabled) 3155 && ftrace_enabled)
2921 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3156 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
2922 3157
2923 mutex_unlock(&ftrace_lock); 3158 mutex_unlock(&ftrace_lock);
2924 3159
@@ -3044,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf)
3044} 3279}
3045#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3046 3281
3047static void __init 3282void __init
3048set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) 3283ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3049{ 3284{
3050 char *func; 3285 char *func;
3051 3286
@@ -3058,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
3058static void __init set_ftrace_early_filters(void) 3293static void __init set_ftrace_early_filters(void)
3059{ 3294{
3060 if (ftrace_filter_buf[0]) 3295 if (ftrace_filter_buf[0])
3061 set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); 3296 ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
3062 if (ftrace_notrace_buf[0]) 3297 if (ftrace_notrace_buf[0])
3063 set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); 3298 ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
3064#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3299#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3065 if (ftrace_graph_buf[0]) 3300 if (ftrace_graph_buf[0])
3066 set_ftrace_early_graph(ftrace_graph_buf); 3301 set_ftrace_early_graph(ftrace_graph_buf);
3067#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3302#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3068} 3303}
3069 3304
3070static int 3305int ftrace_regex_release(struct inode *inode, struct file *file)
3071ftrace_regex_release(struct inode *inode, struct file *file)
3072{ 3306{
3073 struct seq_file *m = (struct seq_file *)file->private_data; 3307 struct seq_file *m = (struct seq_file *)file->private_data;
3074 struct ftrace_iterator *iter; 3308 struct ftrace_iterator *iter;
@@ -3106,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file)
3106 orig_hash, iter->hash); 3340 orig_hash, iter->hash);
3107 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) 3341 if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
3108 && ftrace_enabled) 3342 && ftrace_enabled)
3109 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 3343 ftrace_run_update_code(FTRACE_UPDATE_CALLS);
3110 3344
3111 mutex_unlock(&ftrace_lock); 3345 mutex_unlock(&ftrace_lock);
3112 } 3346 }
@@ -3269,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
3269 3503
3270 do_for_each_ftrace_rec(pg, rec) { 3504 do_for_each_ftrace_rec(pg, rec) {
3271 3505
3272 if (rec->flags & FTRACE_FL_FREE)
3273 continue;
3274
3275 if (ftrace_match_record(rec, NULL, search, search_len, type)) { 3506 if (ftrace_match_record(rec, NULL, search, search_len, type)) {
3276 /* if it is in the array */ 3507 /* if it is in the array */
3277 exists = false; 3508 exists = false;
@@ -3380,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
3380 return 0; 3611 return 0;
3381} 3612}
3382 3613
3614static void ftrace_swap_recs(void *a, void *b, int size)
3615{
3616 struct dyn_ftrace *reca = a;
3617 struct dyn_ftrace *recb = b;
3618 struct dyn_ftrace t;
3619
3620 t = *reca;
3621 *reca = *recb;
3622 *recb = t;
3623}
3624
3383static int ftrace_process_locs(struct module *mod, 3625static int ftrace_process_locs(struct module *mod,
3384 unsigned long *start, 3626 unsigned long *start,
3385 unsigned long *end) 3627 unsigned long *end)
3386{ 3628{
3629 struct ftrace_page *pg;
3630 unsigned long count;
3387 unsigned long *p; 3631 unsigned long *p;
3388 unsigned long addr; 3632 unsigned long addr;
3389 unsigned long flags = 0; /* Shut up gcc */ 3633 unsigned long flags = 0; /* Shut up gcc */
3634 int ret = -ENOMEM;
3635
3636 count = end - start;
3637
3638 if (!count)
3639 return 0;
3640
3641 pg = ftrace_allocate_pages(count);
3642 if (!pg)
3643 return -ENOMEM;
3390 3644
3391 mutex_lock(&ftrace_lock); 3645 mutex_lock(&ftrace_lock);
3646
3647 /*
3648 * Core and each module needs their own pages, as
3649 * modules will free them when they are removed.
3650 * Force a new page to be allocated for modules.
3651 */
3652 if (!mod) {
3653 WARN_ON(ftrace_pages || ftrace_pages_start);
3654 /* First initialization */
3655 ftrace_pages = ftrace_pages_start = pg;
3656 } else {
3657 if (!ftrace_pages)
3658 goto out;
3659
3660 if (WARN_ON(ftrace_pages->next)) {
3661 /* Hmm, we have free pages? */
3662 while (ftrace_pages->next)
3663 ftrace_pages = ftrace_pages->next;
3664 }
3665
3666 ftrace_pages->next = pg;
3667 ftrace_pages = pg;
3668 }
3669
3392 p = start; 3670 p = start;
3393 while (p < end) { 3671 while (p < end) {
3394 addr = ftrace_call_adjust(*p++); 3672 addr = ftrace_call_adjust(*p++);
@@ -3400,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod,
3400 */ 3678 */
3401 if (!addr) 3679 if (!addr)
3402 continue; 3680 continue;
3403 ftrace_record_ip(addr); 3681 if (!ftrace_record_ip(addr))
3682 break;
3404 } 3683 }
3405 3684
3685 /* These new locations need to be initialized */
3686 ftrace_new_pgs = pg;
3687
3688 /* Make each individual set of pages sorted by ips */
3689 for (; pg; pg = pg->next)
3690 sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
3691 ftrace_cmp_recs, ftrace_swap_recs);
3692
3406 /* 3693 /*
3407 * We only need to disable interrupts on start up 3694 * We only need to disable interrupts on start up
3408 * because we are modifying code that an interrupt 3695 * because we are modifying code that an interrupt
@@ -3416,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod,
3416 ftrace_update_code(mod); 3703 ftrace_update_code(mod);
3417 if (!mod) 3704 if (!mod)
3418 local_irq_restore(flags); 3705 local_irq_restore(flags);
3706 ret = 0;
3707 out:
3419 mutex_unlock(&ftrace_lock); 3708 mutex_unlock(&ftrace_lock);
3420 3709
3421 return 0; 3710 return ret;
3422} 3711}
3423 3712
3424#ifdef CONFIG_MODULES 3713#ifdef CONFIG_MODULES
3714
3715#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
3716
3425void ftrace_release_mod(struct module *mod) 3717void ftrace_release_mod(struct module *mod)
3426{ 3718{
3427 struct dyn_ftrace *rec; 3719 struct dyn_ftrace *rec;
3720 struct ftrace_page **last_pg;
3428 struct ftrace_page *pg; 3721 struct ftrace_page *pg;
3722 int order;
3429 3723
3430 mutex_lock(&ftrace_lock); 3724 mutex_lock(&ftrace_lock);
3431 3725
3432 if (ftrace_disabled) 3726 if (ftrace_disabled)
3433 goto out_unlock; 3727 goto out_unlock;
3434 3728
3435 do_for_each_ftrace_rec(pg, rec) { 3729 /*
3730 * Each module has its own ftrace_pages, remove
3731 * them from the list.
3732 */
3733 last_pg = &ftrace_pages_start;
3734 for (pg = ftrace_pages_start; pg; pg = *last_pg) {
3735 rec = &pg->records[0];
3436 if (within_module_core(rec->ip, mod)) { 3736 if (within_module_core(rec->ip, mod)) {
3437 /* 3737 /*
3438 * rec->ip is changed in ftrace_free_rec() 3738 * As core pages are first, the first
3439 * It should not between s and e if record was freed. 3739 * page should never be a module page.
3440 */ 3740 */
3441 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); 3741 if (WARN_ON(pg == ftrace_pages_start))
3442 ftrace_free_rec(rec); 3742 goto out_unlock;
3443 } 3743
3444 } while_for_each_ftrace_rec(); 3744 /* Check if we are deleting the last page */
3745 if (pg == ftrace_pages)
3746 ftrace_pages = next_to_ftrace_page(last_pg);
3747
3748 *last_pg = pg->next;
3749 order = get_count_order(pg->size / ENTRIES_PER_PAGE);
3750 free_pages((unsigned long)pg->records, order);
3751 kfree(pg);
3752 } else
3753 last_pg = &pg->next;
3754 }
3445 out_unlock: 3755 out_unlock:
3446 mutex_unlock(&ftrace_lock); 3756 mutex_unlock(&ftrace_lock);
3447} 3757}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..a3f1bc5d2a00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
338/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
342 TRACE_ITER_IRQ_INFO;
342 343
343static int trace_stop_count; 344static int trace_stop_count;
344static DEFINE_RAW_SPINLOCK(tracing_start_lock); 345static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
426 "record-cmd", 427 "record-cmd",
427 "overwrite", 428 "overwrite",
428 "disable_on_free", 429 "disable_on_free",
430 "irq-info",
429 NULL 431 NULL
430}; 432};
431 433
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
1843 trace_event_read_unlock(); 1845 trace_event_read_unlock();
1844} 1846}
1845 1847
1848static void
1849get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
1850{
1851 unsigned long count;
1852 int cpu;
1853
1854 *total = 0;
1855 *entries = 0;
1856
1857 for_each_tracing_cpu(cpu) {
1858 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1859 /*
1860 * If this buffer has skipped entries, then we hold all
1861 * entries for the trace and we need to ignore the
1862 * ones before the time stamp.
1863 */
1864 if (tr->data[cpu]->skipped_entries) {
1865 count -= tr->data[cpu]->skipped_entries;
1866 /* total is the same as the entries */
1867 *total += count;
1868 } else
1869 *total += count +
1870 ring_buffer_overrun_cpu(tr->buffer, cpu);
1871 *entries += count;
1872 }
1873}
1874
1846static void print_lat_help_header(struct seq_file *m) 1875static void print_lat_help_header(struct seq_file *m)
1847{ 1876{
1848 seq_puts(m, "# _------=> CPU# \n"); 1877 seq_puts(m, "# _------=> CPU# \n");
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
1855 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1884 seq_puts(m, "# \\ / ||||| \\ | / \n");
1856} 1885}
1857 1886
1858static void print_func_help_header(struct seq_file *m) 1887static void print_event_info(struct trace_array *tr, struct seq_file *m)
1888{
1889 unsigned long total;
1890 unsigned long entries;
1891
1892 get_total_entries(tr, &total, &entries);
1893 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
1894 entries, total, num_online_cpus());
1895 seq_puts(m, "#\n");
1896}
1897
1898static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
1859{ 1899{
1860 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1900 print_event_info(tr, m);
1901 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1861 seq_puts(m, "# | | | | |\n"); 1902 seq_puts(m, "# | | | | |\n");
1862} 1903}
1863 1904
1905static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
1906{
1907 print_event_info(tr, m);
1908 seq_puts(m, "# _-----=> irqs-off\n");
1909 seq_puts(m, "# / _----=> need-resched\n");
1910 seq_puts(m, "# | / _---=> hardirq/softirq\n");
1911 seq_puts(m, "# || / _--=> preempt-depth\n");
1912 seq_puts(m, "# ||| / delay\n");
1913 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
1914 seq_puts(m, "# | | | |||| | |\n");
1915}
1864 1916
1865void 1917void
1866print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1918print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1869 struct trace_array *tr = iter->tr; 1921 struct trace_array *tr = iter->tr;
1870 struct trace_array_cpu *data = tr->data[tr->cpu]; 1922 struct trace_array_cpu *data = tr->data[tr->cpu];
1871 struct tracer *type = current_trace; 1923 struct tracer *type = current_trace;
1872 unsigned long entries = 0; 1924 unsigned long entries;
1873 unsigned long total = 0; 1925 unsigned long total;
1874 unsigned long count;
1875 const char *name = "preemption"; 1926 const char *name = "preemption";
1876 int cpu;
1877 1927
1878 if (type) 1928 if (type)
1879 name = type->name; 1929 name = type->name;
1880 1930
1881 1931 get_total_entries(tr, &total, &entries);
1882 for_each_tracing_cpu(cpu) {
1883 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1884 /*
1885 * If this buffer has skipped entries, then we hold all
1886 * entries for the trace and we need to ignore the
1887 * ones before the time stamp.
1888 */
1889 if (tr->data[cpu]->skipped_entries) {
1890 count -= tr->data[cpu]->skipped_entries;
1891 /* total is the same as the entries */
1892 total += count;
1893 } else
1894 total += count +
1895 ring_buffer_overrun_cpu(tr->buffer, cpu);
1896 entries += count;
1897 }
1898 1932
1899 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1933 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1900 name, UTS_RELEASE); 1934 name, UTS_RELEASE);
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2140 return print_trace_fmt(iter); 2174 return print_trace_fmt(iter);
2141} 2175}
2142 2176
2177void trace_latency_header(struct seq_file *m)
2178{
2179 struct trace_iterator *iter = m->private;
2180
2181 /* print nothing if the buffers are empty */
2182 if (trace_empty(iter))
2183 return;
2184
2185 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2186 print_trace_header(m, iter);
2187
2188 if (!(trace_flags & TRACE_ITER_VERBOSE))
2189 print_lat_help_header(m);
2190}
2191
2143void trace_default_header(struct seq_file *m) 2192void trace_default_header(struct seq_file *m)
2144{ 2193{
2145 struct trace_iterator *iter = m->private; 2194 struct trace_iterator *iter = m->private;
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m)
2155 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2204 if (!(trace_flags & TRACE_ITER_VERBOSE))
2156 print_lat_help_header(m); 2205 print_lat_help_header(m);
2157 } else { 2206 } else {
2158 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2207 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2159 print_func_help_header(m); 2208 if (trace_flags & TRACE_ITER_IRQ_INFO)
2209 print_func_help_header_irq(iter->tr, m);
2210 else
2211 print_func_help_header(iter->tr, m);
2212 }
2160 } 2213 }
2161} 2214}
2162 2215
@@ -4385,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
4385}; 4438};
4386 4439
4387struct dentry *trace_create_file(const char *name, 4440struct dentry *trace_create_file(const char *name,
4388 mode_t mode, 4441 umode_t mode,
4389 struct dentry *parent, 4442 struct dentry *parent,
4390 void *data, 4443 void *data,
4391 const struct file_operations *fops) 4444 const struct file_operations *fops)
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4775{ 4828{
4776 __ftrace_dump(true, oops_dump_mode); 4829 __ftrace_dump(true, oops_dump_mode);
4777} 4830}
4831EXPORT_SYMBOL_GPL(ftrace_dump);
4778 4832
4779__init static int tracer_alloc_buffers(void) 4833__init static int tracer_alloc_buffers(void)
4780{ 4834{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18dc..b93ecbadad6d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
312void tracing_reset_current_online_cpus(void); 312void tracing_reset_current_online_cpus(void);
313int tracing_open_generic(struct inode *inode, struct file *filp); 313int tracing_open_generic(struct inode *inode, struct file *filp);
314struct dentry *trace_create_file(const char *name, 314struct dentry *trace_create_file(const char *name,
315 mode_t mode, 315 umode_t mode,
316 struct dentry *parent, 316 struct dentry *parent,
317 void *data, 317 void *data,
318 const struct file_operations *fops); 318 const struct file_operations *fops);
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
370 unsigned long ip, 370 unsigned long ip,
371 unsigned long parent_ip, 371 unsigned long parent_ip,
372 unsigned long flags, int pc); 372 unsigned long flags, int pc);
373void trace_latency_header(struct seq_file *m);
373void trace_default_header(struct seq_file *m); 374void trace_default_header(struct seq_file *m);
374void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 375void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
375int trace_empty(struct trace_iterator *iter); 376int trace_empty(struct trace_iterator *iter);
@@ -654,6 +655,7 @@ enum trace_iterator_flags {
654 TRACE_ITER_RECORD_CMD = 0x100000, 655 TRACE_ITER_RECORD_CMD = 0x100000,
655 TRACE_ITER_OVERWRITE = 0x200000, 656 TRACE_ITER_OVERWRITE = 0x200000,
656 TRACE_ITER_STOP_ON_FREE = 0x400000, 657 TRACE_ITER_STOP_ON_FREE = 0x400000,
658 TRACE_ITER_IRQ_INFO = 0x800000,
657}; 659};
658 660
659/* 661/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 581876f9f387..c212a7f934ec 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
1078 /* First see if we did not already create this dir */ 1078 /* First see if we did not already create this dir */
1079 list_for_each_entry(system, &event_subsystems, list) { 1079 list_for_each_entry(system, &event_subsystems, list) {
1080 if (strcmp(system->name, name) == 0) { 1080 if (strcmp(system->name, name) == 0) {
1081 __get_system(system);
1082 system->nr_events++; 1081 system->nr_events++;
1083 return system->entry; 1082 return system->entry;
1084 } 1083 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 816d3d074979..24aee7127451 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
30enum filter_op_ids 36enum filter_op_ids
31{ 37{
32 OP_OR, 38 OP_OR,
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
646 if (filter && filter->filter_string) 652 if (filter && filter->filter_string)
647 trace_seq_printf(s, "%s\n", filter->filter_string); 653 trace_seq_printf(s, "%s\n", filter->filter_string);
648 else 654 else
649 trace_seq_printf(s, "none\n"); 655 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
650 mutex_unlock(&event_mutex); 656 mutex_unlock(&event_mutex);
651} 657}
652 658
@@ -1649,7 +1655,9 @@ static int replace_system_preds(struct event_subsystem *system,
1649 */ 1655 */
1650 err = replace_preds(call, NULL, ps, filter_string, true); 1656 err = replace_preds(call, NULL, ps, filter_string, true);
1651 if (err) 1657 if (err)
1652 goto fail; 1658 call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
1659 else
1660 call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
1653 } 1661 }
1654 1662
1655 list_for_each_entry(call, &ftrace_events, list) { 1663 list_for_each_entry(call, &ftrace_events, list) {
@@ -1658,6 +1666,9 @@ static int replace_system_preds(struct event_subsystem *system,
1658 if (strcmp(call->class->system, system->name) != 0) 1666 if (strcmp(call->class->system, system->name) != 0)
1659 continue; 1667 continue;
1660 1668
1669 if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
1670 continue;
1671
1661 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); 1672 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1662 if (!filter_item) 1673 if (!filter_item)
1663 goto fail_mem; 1674 goto fail_mem;
@@ -1686,7 +1697,7 @@ static int replace_system_preds(struct event_subsystem *system,
1686 * replace the filter for the call. 1697 * replace the filter for the call.
1687 */ 1698 */
1688 filter = call->filter; 1699 filter = call->filter;
1689 call->filter = filter_item->filter; 1700 rcu_assign_pointer(call->filter, filter_item->filter);
1690 filter_item->filter = filter; 1701 filter_item->filter = filter;
1691 1702
1692 fail = false; 1703 fail = false;
@@ -1727,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system,
1727 return -ENOMEM; 1738 return -ENOMEM;
1728} 1739}
1729 1740
1741static int create_filter_start(char *filter_str, bool set_str,
1742 struct filter_parse_state **psp,
1743 struct event_filter **filterp)
1744{
1745 struct event_filter *filter;
1746 struct filter_parse_state *ps = NULL;
1747 int err = 0;
1748
1749 WARN_ON_ONCE(*psp || *filterp);
1750
1751 /* allocate everything, and if any fails, free all and fail */
1752 filter = __alloc_filter();
1753 if (filter && set_str)
1754 err = replace_filter_string(filter, filter_str);
1755
1756 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1757
1758 if (!filter || !ps || err) {
1759 kfree(ps);
1760 __free_filter(filter);
1761 return -ENOMEM;
1762 }
1763
1764 /* we're committed to creating a new filter */
1765 *filterp = filter;
1766 *psp = ps;
1767
1768 parse_init(ps, filter_ops, filter_str);
1769 err = filter_parse(ps);
1770 if (err && set_str)
1771 append_filter_err(ps, filter);
1772 return err;
1773}
1774
1775static void create_filter_finish(struct filter_parse_state *ps)
1776{
1777 if (ps) {
1778 filter_opstack_clear(ps);
1779 postfix_clear(ps);
1780 kfree(ps);
1781 }
1782}
1783
1784/**
1785 * create_filter - create a filter for a ftrace_event_call
1786 * @call: ftrace_event_call to create a filter for
1787 * @filter_str: filter string
1788 * @set_str: remember @filter_str and enable detailed error in filter
1789 * @filterp: out param for created filter (always updated on return)
1790 *
1791 * Creates a filter for @call with @filter_str. If @set_str is %true,
1792 * @filter_str is copied and recorded in the new filter.
1793 *
1794 * On success, returns 0 and *@filterp points to the new filter. On
1795 * failure, returns -errno and *@filterp may point to %NULL or to a new
1796 * filter. In the latter case, the returned filter contains error
1797 * information if @set_str is %true and the caller is responsible for
1798 * freeing it.
1799 */
1800static int create_filter(struct ftrace_event_call *call,
1801 char *filter_str, bool set_str,
1802 struct event_filter **filterp)
1803{
1804 struct event_filter *filter = NULL;
1805 struct filter_parse_state *ps = NULL;
1806 int err;
1807
1808 err = create_filter_start(filter_str, set_str, &ps, &filter);
1809 if (!err) {
1810 err = replace_preds(call, filter, ps, filter_str, false);
1811 if (err && set_str)
1812 append_filter_err(ps, filter);
1813 }
1814 create_filter_finish(ps);
1815
1816 *filterp = filter;
1817 return err;
1818}
1819
1820/**
1821 * create_system_filter - create a filter for an event_subsystem
1822 * @system: event_subsystem to create a filter for
1823 * @filter_str: filter string
1824 * @filterp: out param for created filter (always updated on return)
1825 *
1826 * Identical to create_filter() except that it creates a subsystem filter
1827 * and always remembers @filter_str.
1828 */
1829static int create_system_filter(struct event_subsystem *system,
1830 char *filter_str, struct event_filter **filterp)
1831{
1832 struct event_filter *filter = NULL;
1833 struct filter_parse_state *ps = NULL;
1834 int err;
1835
1836 err = create_filter_start(filter_str, true, &ps, &filter);
1837 if (!err) {
1838 err = replace_system_preds(system, ps, filter_str);
1839 if (!err) {
1840 /* System filters just show a default message */
1841 kfree(filter->filter_string);
1842 filter->filter_string = NULL;
1843 } else {
1844 append_filter_err(ps, filter);
1845 }
1846 }
1847 create_filter_finish(ps);
1848
1849 *filterp = filter;
1850 return err;
1851}
1852
1730int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1853int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1731{ 1854{
1732 struct filter_parse_state *ps;
1733 struct event_filter *filter; 1855 struct event_filter *filter;
1734 struct event_filter *tmp;
1735 int err = 0; 1856 int err = 0;
1736 1857
1737 mutex_lock(&event_mutex); 1858 mutex_lock(&event_mutex);
@@ -1741,56 +1862,37 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1741 filter = call->filter; 1862 filter = call->filter;
1742 if (!filter) 1863 if (!filter)
1743 goto out_unlock; 1864 goto out_unlock;
1744 call->filter = NULL; 1865 RCU_INIT_POINTER(call->filter, NULL);
1745 /* Make sure the filter is not being used */ 1866 /* Make sure the filter is not being used */
1746 synchronize_sched(); 1867 synchronize_sched();
1747 __free_filter(filter); 1868 __free_filter(filter);
1748 goto out_unlock; 1869 goto out_unlock;
1749 } 1870 }
1750 1871
1751 err = -ENOMEM; 1872 err = create_filter(call, filter_string, true, &filter);
1752 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1753 if (!ps)
1754 goto out_unlock;
1755
1756 filter = __alloc_filter();
1757 if (!filter) {
1758 kfree(ps);
1759 goto out_unlock;
1760 }
1761 1873
1762 replace_filter_string(filter, filter_string);
1763
1764 parse_init(ps, filter_ops, filter_string);
1765 err = filter_parse(ps);
1766 if (err) {
1767 append_filter_err(ps, filter);
1768 goto out;
1769 }
1770
1771 err = replace_preds(call, filter, ps, filter_string, false);
1772 if (err) {
1773 filter_disable(call);
1774 append_filter_err(ps, filter);
1775 } else
1776 call->flags |= TRACE_EVENT_FL_FILTERED;
1777out:
1778 /* 1874 /*
1779 * Always swap the call filter with the new filter 1875 * Always swap the call filter with the new filter
1780 * even if there was an error. If there was an error 1876 * even if there was an error. If there was an error
1781 * in the filter, we disable the filter and show the error 1877 * in the filter, we disable the filter and show the error
1782 * string 1878 * string
1783 */ 1879 */
1784 tmp = call->filter; 1880 if (filter) {
1785 call->filter = filter; 1881 struct event_filter *tmp = call->filter;
1786 if (tmp) { 1882
1787 /* Make sure the call is done with the filter */ 1883 if (!err)
1788 synchronize_sched(); 1884 call->flags |= TRACE_EVENT_FL_FILTERED;
1789 __free_filter(tmp); 1885 else
1886 filter_disable(call);
1887
1888 rcu_assign_pointer(call->filter, filter);
1889
1890 if (tmp) {
1891 /* Make sure the call is done with the filter */
1892 synchronize_sched();
1893 __free_filter(tmp);
1894 }
1790 } 1895 }
1791 filter_opstack_clear(ps);
1792 postfix_clear(ps);
1793 kfree(ps);
1794out_unlock: 1896out_unlock:
1795 mutex_unlock(&event_mutex); 1897 mutex_unlock(&event_mutex);
1796 1898
@@ -1800,7 +1902,6 @@ out_unlock:
1800int apply_subsystem_event_filter(struct event_subsystem *system, 1902int apply_subsystem_event_filter(struct event_subsystem *system,
1801 char *filter_string) 1903 char *filter_string)
1802{ 1904{
1803 struct filter_parse_state *ps;
1804 struct event_filter *filter; 1905 struct event_filter *filter;
1805 int err = 0; 1906 int err = 0;
1806 1907
@@ -1824,38 +1925,15 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1824 goto out_unlock; 1925 goto out_unlock;
1825 } 1926 }
1826 1927
1827 err = -ENOMEM; 1928 err = create_system_filter(system, filter_string, &filter);
1828 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1929 if (filter) {
1829 if (!ps) 1930 /*
1830 goto out_unlock; 1931 * No event actually uses the system filter
1831 1932 * we can free it without synchronize_sched().
1832 filter = __alloc_filter(); 1933 */
1833 if (!filter) 1934 __free_filter(system->filter);
1834 goto out; 1935 system->filter = filter;
1835
1836 replace_filter_string(filter, filter_string);
1837 /*
1838 * No event actually uses the system filter
1839 * we can free it without synchronize_sched().
1840 */
1841 __free_filter(system->filter);
1842 system->filter = filter;
1843
1844 parse_init(ps, filter_ops, filter_string);
1845 err = filter_parse(ps);
1846 if (err) {
1847 append_filter_err(ps, system->filter);
1848 goto out;
1849 } 1936 }
1850
1851 err = replace_system_preds(system, ps, filter_string);
1852 if (err)
1853 append_filter_err(ps, system->filter);
1854
1855out:
1856 filter_opstack_clear(ps);
1857 postfix_clear(ps);
1858 kfree(ps);
1859out_unlock: 1937out_unlock:
1860 mutex_unlock(&event_mutex); 1938 mutex_unlock(&event_mutex);
1861 1939
@@ -1877,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1877{ 1955{
1878 int err; 1956 int err;
1879 struct event_filter *filter; 1957 struct event_filter *filter;
1880 struct filter_parse_state *ps;
1881 struct ftrace_event_call *call; 1958 struct ftrace_event_call *call;
1882 1959
1883 mutex_lock(&event_mutex); 1960 mutex_lock(&event_mutex);
@@ -1892,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1892 if (event->filter) 1969 if (event->filter)
1893 goto out_unlock; 1970 goto out_unlock;
1894 1971
1895 filter = __alloc_filter(); 1972 err = create_filter(call, filter_str, false, &filter);
1896 if (!filter) {
1897 err = PTR_ERR(filter);
1898 goto out_unlock;
1899 }
1900
1901 err = -ENOMEM;
1902 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1903 if (!ps)
1904 goto free_filter;
1905
1906 parse_init(ps, filter_ops, filter_str);
1907 err = filter_parse(ps);
1908 if (err)
1909 goto free_ps;
1910
1911 err = replace_preds(call, filter, ps, filter_str, false);
1912 if (!err) 1973 if (!err)
1913 event->filter = filter; 1974 event->filter = filter;
1914 1975 else
1915free_ps:
1916 filter_opstack_clear(ps);
1917 postfix_clear(ps);
1918 kfree(ps);
1919
1920free_filter:
1921 if (err)
1922 __free_filter(filter); 1976 __free_filter(filter);
1923 1977
1924out_unlock: 1978out_unlock:
@@ -1937,43 +1991,6 @@ out_unlock:
1937#define CREATE_TRACE_POINTS 1991#define CREATE_TRACE_POINTS
1938#include "trace_events_filter_test.h" 1992#include "trace_events_filter_test.h"
1939 1993
1940static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1941 struct event_filter **pfilter)
1942{
1943 struct event_filter *filter;
1944 struct filter_parse_state *ps;
1945 int err = -ENOMEM;
1946
1947 filter = __alloc_filter();
1948 if (!filter)
1949 goto out;
1950
1951 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1952 if (!ps)
1953 goto free_filter;
1954
1955 parse_init(ps, filter_ops, filter_str);
1956 err = filter_parse(ps);
1957 if (err)
1958 goto free_ps;
1959
1960 err = replace_preds(call, filter, ps, filter_str, false);
1961 if (!err)
1962 *pfilter = filter;
1963
1964 free_ps:
1965 filter_opstack_clear(ps);
1966 postfix_clear(ps);
1967 kfree(ps);
1968
1969 free_filter:
1970 if (err)
1971 __free_filter(filter);
1972
1973 out:
1974 return err;
1975}
1976
1977#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ 1994#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1978{ \ 1995{ \
1979 .filter = FILTER, \ 1996 .filter = FILTER, \
@@ -2092,12 +2109,13 @@ static __init int ftrace_test_event_filter(void)
2092 struct test_filter_data_t *d = &test_filter_data[i]; 2109 struct test_filter_data_t *d = &test_filter_data[i];
2093 int err; 2110 int err;
2094 2111
2095 err = test_get_filter(d->filter, &event_ftrace_test_filter, 2112 err = create_filter(&event_ftrace_test_filter, d->filter,
2096 &filter); 2113 false, &filter);
2097 if (err) { 2114 if (err) {
2098 printk(KERN_INFO 2115 printk(KERN_INFO
2099 "Failed to get filter for '%s', err %d\n", 2116 "Failed to get filter for '%s', err %d\n",
2100 d->filter, err); 2117 d->filter, err);
2118 __free_filter(filter);
2101 break; 2119 break;
2102 } 2120 }
2103 2121
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 20dad0d7a163..99d20e920368 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 283static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 284static void irqsoff_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void irqsoff_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void irqsoff_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6cf..0d6ff3555942 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
627 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t; 628 unsigned long secs = (unsigned long)t;
629 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
630 int ret;
630 631
631 trace_find_cmdline(entry->pid, comm); 632 trace_find_cmdline(entry->pid, comm);
632 633
633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", 634 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
634 comm, entry->pid, iter->cpu, secs, usec_rem); 635 comm, entry->pid, iter->cpu);
636 if (!ret)
637 return 0;
638
639 if (trace_flags & TRACE_ITER_IRQ_INFO) {
640 ret = trace_print_lat_fmt(s, entry);
641 if (!ret)
642 return 0;
643 }
644
645 return trace_seq_printf(s, " %5lu.%06lu: ",
646 secs, usec_rem);
635} 647}
636 648
637int trace_print_lat_context(struct trace_iterator *iter) 649int trace_print_lat_context(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b6..ff791ea48b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 283static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 284static void wakeup_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void wakeup_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void wakeup_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 77575b386d97..d4545f49242e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,9 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16
17#include <asm/setup.h>
18
16#include "trace.h" 19#include "trace.h"
17 20
18#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
@@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
133static struct ftrace_ops trace_ops __read_mostly = 136static struct ftrace_ops trace_ops __read_mostly =
134{ 137{
135 .func = stack_trace_call, 138 .func = stack_trace_call,
136 .flags = FTRACE_OPS_FL_GLOBAL,
137}; 139};
138 140
139static ssize_t 141static ssize_t
@@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = {
311 .release = seq_release, 313 .release = seq_release,
312}; 314};
313 315
316static int
317stack_trace_filter_open(struct inode *inode, struct file *file)
318{
319 return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
320 inode, file);
321}
322
323static const struct file_operations stack_trace_filter_fops = {
324 .open = stack_trace_filter_open,
325 .read = seq_read,
326 .write = ftrace_filter_write,
327 .llseek = ftrace_regex_lseek,
328 .release = ftrace_regex_release,
329};
330
314int 331int
315stack_trace_sysctl(struct ctl_table *table, int write, 332stack_trace_sysctl(struct ctl_table *table, int write,
316 void __user *buffer, size_t *lenp, 333 void __user *buffer, size_t *lenp,
@@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
338 return ret; 355 return ret;
339} 356}
340 357
358static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
359
341static __init int enable_stacktrace(char *str) 360static __init int enable_stacktrace(char *str)
342{ 361{
362 if (strncmp(str, "_filter=", 8) == 0)
363 strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
364
343 stack_tracer_enabled = 1; 365 stack_tracer_enabled = 1;
344 last_stack_tracer_enabled = 1; 366 last_stack_tracer_enabled = 1;
345 return 1; 367 return 1;
@@ -358,6 +380,12 @@ static __init int stack_trace_init(void)
358 trace_create_file("stack_trace", 0444, d_tracer, 380 trace_create_file("stack_trace", 0444, d_tracer,
359 NULL, &stack_trace_fops); 381 NULL, &stack_trace_fops);
360 382
383 trace_create_file("stack_trace_filter", 0444, d_tracer,
384 NULL, &stack_trace_filter_fops);
385
386 if (stack_trace_filter_buf[0])
387 ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
388
361 if (stack_tracer_enabled) 389 if (stack_tracer_enabled)
362 register_ftrace_function(&trace_ops); 390 register_ftrace_function(&trace_ops);
363 391
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index db110b8ae030..f1539decd99d 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod)
634 int ret = 0; 634 int ret = 0;
635 635
636 /* 636 /*
637 * We skip modules that tain the kernel, especially those with different 637 * We skip modules that taint the kernel, especially those with different
638 * module header (for forced load), to make sure we don't cause a crash. 638 * module headers (for forced load), to make sure we don't cause a crash.
639 * Staging and out-of-tree GPL modules are fine.
639 */ 640 */
640 if (mod->taints) 641 if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
641 return 0; 642 return 0;
642 mutex_lock(&tracepoints_mutex); 643 mutex_lock(&tracepoints_mutex);
643 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); 644 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
127 127
128 local_irq_save(flags); 128 local_irq_save(flags);
129 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
130 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = time - tsk->acct_timexpd;
131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
132 delta = value.tv_sec; 132 delta = value.tv_sec;
133 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
diff --git a/kernel/wait.c b/kernel/wait.c
index 26fa7797f90f..7fdd9eaca2c3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,10 +10,10 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key); 16 lockdep_set_class_and_name(&q->lock, key, name);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1d7bca7f4f52..d117262deba3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
296 if (__this_cpu_read(soft_watchdog_warn) == true) 296 if (__this_cpu_read(soft_watchdog_warn) == true)
297 return HRTIMER_RESTART; 297 return HRTIMER_RESTART;
298 298
299 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 299 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
300 smp_processor_id(), duration, 300 smp_processor_id(), duration,
301 current->comm, task_pid_nr(current)); 301 current->comm, task_pid_nr(current));
302 print_modules(); 302 print_modules();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a810..bec7b5b53e03 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
242 242
243 int nr_drainers; /* W: drain in progress */ 243 int nr_drainers; /* W: drain in progress */
244 int saved_max_active; /* W: saved cwq max_active */ 244 int saved_max_active; /* W: saved cwq max_active */
245 const char *name; /* I: workqueue name */
246#ifdef CONFIG_LOCKDEP 245#ifdef CONFIG_LOCKDEP
247 struct lockdep_map lockdep_map; 246 struct lockdep_map lockdep_map;
248#endif 247#endif
248 char name[]; /* I: workqueue name */
249}; 249};
250 250
251struct workqueue_struct *system_wq __read_mostly; 251struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
2954 return clamp_val(max_active, 1, lim); 2954 return clamp_val(max_active, 1, lim);
2955} 2955}
2956 2956
2957struct workqueue_struct *__alloc_workqueue_key(const char *name, 2957struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
2958 unsigned int flags, 2958 unsigned int flags,
2959 int max_active, 2959 int max_active,
2960 struct lock_class_key *key, 2960 struct lock_class_key *key,
2961 const char *lock_name) 2961 const char *lock_name, ...)
2962{ 2962{
2963 va_list args, args1;
2963 struct workqueue_struct *wq; 2964 struct workqueue_struct *wq;
2964 unsigned int cpu; 2965 unsigned int cpu;
2966 size_t namelen;
2967
2968 /* determine namelen, allocate wq and format name */
2969 va_start(args, lock_name);
2970 va_copy(args1, args);
2971 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
2972
2973 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
2974 if (!wq)
2975 goto err;
2976
2977 vsnprintf(wq->name, namelen, fmt, args1);
2978 va_end(args);
2979 va_end(args1);
2965 2980
2966 /* 2981 /*
2967 * Workqueues which may be used during memory reclaim should 2982 * Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2978 flags |= WQ_HIGHPRI; 2993 flags |= WQ_HIGHPRI;
2979 2994
2980 max_active = max_active ?: WQ_DFL_ACTIVE; 2995 max_active = max_active ?: WQ_DFL_ACTIVE;
2981 max_active = wq_clamp_max_active(max_active, flags, name); 2996 max_active = wq_clamp_max_active(max_active, flags, wq->name);
2982
2983 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
2984 if (!wq)
2985 goto err;
2986 2997
2998 /* init wq */
2987 wq->flags = flags; 2999 wq->flags = flags;
2988 wq->saved_max_active = max_active; 3000 wq->saved_max_active = max_active;
2989 mutex_init(&wq->flush_mutex); 3001 mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
2991 INIT_LIST_HEAD(&wq->flusher_queue); 3003 INIT_LIST_HEAD(&wq->flusher_queue);
2992 INIT_LIST_HEAD(&wq->flusher_overflow); 3004 INIT_LIST_HEAD(&wq->flusher_overflow);
2993 3005
2994 wq->name = name;
2995 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 3006 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
2996 INIT_LIST_HEAD(&wq->list); 3007 INIT_LIST_HEAD(&wq->list);
2997 3008
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
3020 if (!rescuer) 3031 if (!rescuer)
3021 goto err; 3032 goto err;
3022 3033
3023 rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); 3034 rescuer->task = kthread_create(rescuer_thread, wq, "%s",
3035 wq->name);
3024 if (IS_ERR(rescuer->task)) 3036 if (IS_ERR(rescuer->task))
3025 goto err; 3037 goto err;
3026 3038