Diffstat (limited to 'kernel')
166 files changed, 11304 insertions, 5942 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eca595e2fd52..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@ | |||
2 | # Makefile for the linux kernel. | 2 | # Makefile for the linux kernel. |
3 | # | 3 | # |
4 | 4 | ||
5 | obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ | 5 | obj-y = fork.o exec_domain.o panic.o printk.o \ |
6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ | 6 | cpu.o exit.o itimer.o time.o softirq.o resource.o \ |
7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ | 7 | sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ |
8 | signal.o sys.o kmod.o workqueue.o pid.o \ | 8 | signal.o sys.o kmod.o workqueue.o pid.o \ |
9 | rcupdate.o extable.o params.o posix-timers.o \ | 9 | rcupdate.o extable.o params.o posix-timers.o \ |
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
12 | notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
13 | async.o range.o | 13 | async.o range.o groups.o |
14 | obj-y += groups.o | ||
15 | 14 | ||
16 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
17 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg | |||
20 | CFLAGS_REMOVE_mutex-debug.o = -pg | 19 | CFLAGS_REMOVE_mutex-debug.o = -pg |
21 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | 20 | CFLAGS_REMOVE_rtmutex-debug.o = -pg |
22 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 21 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
23 | CFLAGS_REMOVE_sched_clock.o = -pg | ||
24 | CFLAGS_REMOVE_irq_work.o = -pg | 22 | CFLAGS_REMOVE_irq_work.o = -pg |
25 | endif | 23 | endif |
26 | 24 | ||
25 | obj-y += sched/ | ||
26 | |||
27 | obj-$(CONFIG_FREEZER) += freezer.o | 27 | obj-$(CONFIG_FREEZER) += freezer.o |
28 | obj-$(CONFIG_PROFILING) += profile.o | 28 | obj-$(CONFIG_PROFILING) += profile.o |
29 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o | 29 | obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o |
@@ -99,8 +99,8 @@ obj-$(CONFIG_TRACING) += trace/ | |||
99 | obj-$(CONFIG_X86_DS) += trace/ | 99 | obj-$(CONFIG_X86_DS) += trace/ |
100 | obj-$(CONFIG_RING_BUFFER) += trace/ | 100 | obj-$(CONFIG_RING_BUFFER) += trace/ |
101 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 101 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
102 | obj-$(CONFIG_SMP) += sched_cpupri.o | ||
103 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 102 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
103 | obj-$(CONFIG_CPU_PM) += cpu_pm.o | ||
104 | 104 | ||
105 | obj-$(CONFIG_PERF_EVENTS) += events/ | 105 | obj-$(CONFIG_PERF_EVENTS) += events/ |
106 | 106 | ||
@@ -109,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o | |||
109 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 109 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
110 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o | 110 | obj-$(CONFIG_JUMP_LABEL) += jump_label.o |
111 | 111 | ||
112 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
113 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
114 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
115 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
116 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
117 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
118 | CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer | ||
119 | endif | ||
120 | |||
121 | $(obj)/configs.o: $(obj)/config_data.h | 112 | $(obj)/configs.o: $(obj)/config_data.h |
122 | 113 | ||
123 | # config_data.h contains the same information as ikconfig.h but gzipped. | 114 | # config_data.h contains the same information as ikconfig.h but gzipped. |
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..02e6167a53b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
84 | * the cache line to have the data after getting the lock. | 84 | * the cache line to have the data after getting the lock. |
85 | */ | 85 | */ |
86 | struct bsd_acct_struct { | 86 | struct bsd_acct_struct { |
87 | volatile int active; | 87 | int active; |
88 | volatile int needcheck; | 88 | unsigned long needcheck; |
89 | struct file *file; | 89 | struct file *file; |
90 | struct pid_namespace *ns; | 90 | struct pid_namespace *ns; |
91 | struct timer_list timer; | ||
92 | struct list_head list; | 91 | struct list_head list; |
93 | }; | 92 | }; |
94 | 93 | ||
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock); | |||
96 | static LIST_HEAD(acct_list); | 95 | static LIST_HEAD(acct_list); |
97 | 96 | ||
98 | /* | 97 | /* |
99 | * Called whenever the timer says to check the free space. | ||
100 | */ | ||
101 | static void acct_timeout(unsigned long x) | ||
102 | { | ||
103 | struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; | ||
104 | acct->needcheck = 1; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Check the amount of free space and suspend/resume accordingly. | 98 | * Check the amount of free space and suspend/resume accordingly. |
109 | */ | 99 | */ |
110 | static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | 100 | static int check_free_space(struct bsd_acct_struct *acct, struct file *file) |
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
112 | struct kstatfs sbuf; | 102 | struct kstatfs sbuf; |
113 | int res; | 103 | int res; |
114 | int act; | 104 | int act; |
115 | sector_t resume; | 105 | u64 resume; |
116 | sector_t suspend; | 106 | u64 suspend; |
117 | 107 | ||
118 | spin_lock(&acct_lock); | 108 | spin_lock(&acct_lock); |
119 | res = acct->active; | 109 | res = acct->active; |
120 | if (!file || !acct->needcheck) | 110 | if (!file || time_is_before_jiffies(acct->needcheck)) |
121 | goto out; | 111 | goto out; |
122 | spin_unlock(&acct_lock); | 112 | spin_unlock(&acct_lock); |
123 | 113 | ||
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
127 | suspend = sbuf.f_blocks * SUSPEND; | 117 | suspend = sbuf.f_blocks * SUSPEND; |
128 | resume = sbuf.f_blocks * RESUME; | 118 | resume = sbuf.f_blocks * RESUME; |
129 | 119 | ||
130 | sector_div(suspend, 100); | 120 | do_div(suspend, 100); |
131 | sector_div(resume, 100); | 121 | do_div(resume, 100); |
132 | 122 | ||
133 | if (sbuf.f_bavail <= suspend) | 123 | if (sbuf.f_bavail <= suspend) |
134 | act = -1; | 124 | act = -1; |
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) | |||
160 | } | 150 | } |
161 | } | 151 | } |
162 | 152 | ||
163 | del_timer(&acct->timer); | 153 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; |
164 | acct->needcheck = 0; | ||
165 | acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; | ||
166 | add_timer(&acct->timer); | ||
167 | res = acct->active; | 154 | res = acct->active; |
168 | out: | 155 | out: |
169 | spin_unlock(&acct_lock); | 156 | spin_unlock(&acct_lock); |
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, | |||
185 | if (acct->file) { | 172 | if (acct->file) { |
186 | old_acct = acct->file; | 173 | old_acct = acct->file; |
187 | old_ns = acct->ns; | 174 | old_ns = acct->ns; |
188 | del_timer(&acct->timer); | ||
189 | acct->active = 0; | 175 | acct->active = 0; |
190 | acct->needcheck = 0; | ||
191 | acct->file = NULL; | 176 | acct->file = NULL; |
192 | acct->ns = NULL; | 177 | acct->ns = NULL; |
193 | list_del(&acct->list); | 178 | list_del(&acct->list); |
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, | |||
195 | if (file) { | 180 | if (file) { |
196 | acct->file = file; | 181 | acct->file = file; |
197 | acct->ns = ns; | 182 | acct->ns = ns; |
198 | acct->needcheck = 0; | 183 | acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; |
199 | acct->active = 1; | 184 | acct->active = 1; |
200 | list_add(&acct->list, &acct_list); | 185 | list_add(&acct->list, &acct_list); |
201 | /* It's been deleted if it was used before so this is safe */ | ||
202 | setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); | ||
203 | acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; | ||
204 | add_timer(&acct->timer); | ||
205 | } | 186 | } |
206 | if (old_acct) { | 187 | if (old_acct) { |
207 | mnt_unpin(old_acct->f_path.mnt); | 188 | mnt_unpin(old_acct->f_path.mnt); |
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb) | |||
334 | spin_lock(&acct_lock); | 315 | spin_lock(&acct_lock); |
335 | restart: | 316 | restart: |
336 | list_for_each_entry(acct, &acct_list, list) | 317 | list_for_each_entry(acct, &acct_list, list) |
337 | if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { | 318 | if (acct->file && acct->file->f_path.dentry->d_sb == sb) { |
338 | acct_file_reopen(acct, NULL, NULL); | 319 | acct_file_reopen(acct, NULL, NULL); |
339 | goto restart; | 320 | goto restart; |
340 | } | 321 | } |
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns) | |||
348 | if (acct == NULL) | 329 | if (acct == NULL) |
349 | return; | 330 | return; |
350 | 331 | ||
351 | del_timer_sync(&acct->timer); | ||
352 | spin_lock(&acct_lock); | 332 | spin_lock(&acct_lock); |
353 | if (acct->file != NULL) | 333 | if (acct->file != NULL) |
354 | acct_file_reopen(acct, NULL, NULL); | 334 | acct_file_reopen(acct, NULL, NULL); |
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
498 | * Fill the accounting struct with the needed info as recorded | 478 | * Fill the accounting struct with the needed info as recorded |
499 | * by the different kernel functions. | 479 | * by the different kernel functions. |
500 | */ | 480 | */ |
501 | memset((caddr_t)&ac, 0, sizeof(acct_t)); | 481 | memset(&ac, 0, sizeof(acct_t)); |
502 | 482 | ||
503 | ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; | 483 | ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; |
504 | strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); | 484 | strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); |
@@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead) | |||
613 | pacct->ac_flag |= ACORE; | 593 | pacct->ac_flag |= ACORE; |
614 | if (current->flags & PF_SIGNALED) | 594 | if (current->flags & PF_SIGNALED) |
615 | pacct->ac_flag |= AXSIG; | 595 | pacct->ac_flag |= AXSIG; |
616 | pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); | 596 | pacct->ac_utime += current->utime; |
617 | pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); | 597 | pacct->ac_stime += current->stime; |
618 | pacct->ac_minflt += current->min_flt; | 598 | pacct->ac_minflt += current->min_flt; |
619 | pacct->ac_majflt += current->maj_flt; | 599 | pacct->ac_majflt += current->maj_flt; |
620 | spin_unlock_irq(&current->sighand->siglock); | 600 | spin_unlock_irq(&current->sighand->siglock); |
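
The acct.c hunks above replace the per-namespace timer and the volatile flags with a plain jiffies deadline that is checked lazily under acct_lock on the next accounting write. A minimal kernel-style sketch of that idiom, with hypothetical names (struct lazy_check, RECHECK_INTERVAL) standing in for bsd_acct_struct and ACCT_TIMEOUT*HZ:

#include <linux/jiffies.h>
#include <linux/types.h>

#define RECHECK_INTERVAL	(30 * HZ)	/* hypothetical; plays the role of ACCT_TIMEOUT*HZ */

struct lazy_check {
	unsigned long needcheck;	/* absolute jiffies value of the next allowed check */
};

static bool deadline_passed(struct lazy_check *lc)
{
	/* time_after() compares jiffies values safely across wrap-around */
	if (!time_after(jiffies, lc->needcheck))
		return false;		/* too early: skip the expensive free-space check */

	lc->needcheck = jiffies + RECHECK_INTERVAL;	/* re-arm for the next period */
	return true;
}

Because the field is now only touched under the spinlock from process context, there is no timer callback left to cancel on close or namespace exit, which is what lets the del_timer()/del_timer_sync() calls go away.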
diff --git a/kernel/async.c b/kernel/async.c
index d5fe7af0de2e..bd0c168a3bbe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel. | |||
51 | #include <linux/async.h> | 51 | #include <linux/async.h> |
52 | #include <linux/atomic.h> | 52 | #include <linux/atomic.h> |
53 | #include <linux/ktime.h> | 53 | #include <linux/ktime.h> |
54 | #include <linux/module.h> | 54 | #include <linux/export.h> |
55 | #include <linux/wait.h> | 55 | #include <linux/wait.h> |
56 | #include <linux/sched.h> | 56 | #include <linux/sched.h> |
57 | #include <linux/slab.h> | 57 | #include <linux/slab.h> |
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); | |||
78 | 78 | ||
79 | static atomic_t entry_count; | 79 | static atomic_t entry_count; |
80 | 80 | ||
81 | extern int initcall_debug; | ||
82 | |||
83 | 81 | ||
84 | /* | 82 | /* |
85 | * MUST be called with the lock held! | 83 | * MUST be called with the lock held! |
@@ -120,7 +118,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
120 | struct async_entry *entry = | 118 | struct async_entry *entry = |
121 | container_of(work, struct async_entry, work); | 119 | container_of(work, struct async_entry, work); |
122 | unsigned long flags; | 120 | unsigned long flags; |
123 | ktime_t calltime, delta, rettime; | 121 | ktime_t uninitialized_var(calltime), delta, rettime; |
124 | 122 | ||
125 | /* 1) move self to the running queue */ | 123 | /* 1) move self to the running queue */ |
126 | spin_lock_irqsave(&async_lock, flags); | 124 | spin_lock_irqsave(&async_lock, flags); |
@@ -269,7 +267,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); | |||
269 | void async_synchronize_cookie_domain(async_cookie_t cookie, | 267 | void async_synchronize_cookie_domain(async_cookie_t cookie, |
270 | struct list_head *running) | 268 | struct list_head *running) |
271 | { | 269 | { |
272 | ktime_t starttime, delta, endtime; | 270 | ktime_t uninitialized_var(starttime), delta, endtime; |
273 | 271 | ||
274 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 272 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
275 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); | 273 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); |
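
Apart from trading module.h for the lighter export.h and dropping the stray extern (initcall_debug is declared in a header), the async.c hunks mark the conditionally-initialized ktime_t locals with uninitialized_var(). A kernel-style sketch of that pattern with hypothetical names; the annotation only quiets a false "may be used uninitialized" warning, since the variable is written and read on the same debug-only path:

#include <linux/kernel.h>
#include <linux/ktime.h>

static int debug_timing;	/* stand-in for initcall_debug */

static void run_and_time(void (*fn)(void))
{
	ktime_t uninitialized_var(calltime), rettime, delta;

	if (debug_timing)
		calltime = ktime_get();		/* only written on the debug path */

	fn();

	if (debug_timing) {
		rettime = ktime_get();
		delta = ktime_sub(rettime, calltime);	/* only read on that same path */
		printk(KERN_DEBUG "callback took %lld usecs\n",
		       (long long)ktime_to_ns(delta) >> 10);
	}
}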
diff --git a/kernel/audit.c b/kernel/audit.c
index f3ba55fa0b70..57e3f5107937 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@ | |||
45 | #include <asm/types.h> | 45 | #include <asm/types.h> |
46 | #include <linux/atomic.h> | 46 | #include <linux/atomic.h> |
47 | #include <linux/mm.h> | 47 | #include <linux/mm.h> |
48 | #include <linux/module.h> | 48 | #include <linux/export.h> |
49 | #include <linux/slab.h> | 49 | #include <linux/slab.h> |
50 | #include <linux/err.h> | 50 | #include <linux/err.h> |
51 | #include <linux/kthread.h> | 51 | #include <linux/kthread.h> |
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt, | |||
1260 | avail = audit_expand(ab, | 1260 | avail = audit_expand(ab, |
1261 | max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); | 1261 | max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail)); |
1262 | if (!avail) | 1262 | if (!avail) |
1263 | goto out; | 1263 | goto out_va_end; |
1264 | len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); | 1264 | len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); |
1265 | } | 1265 | } |
1266 | va_end(args2); | ||
1267 | if (len > 0) | 1266 | if (len > 0) |
1268 | skb_put(skb, len); | 1267 | skb_put(skb, len); |
1268 | out_va_end: | ||
1269 | va_end(args2); | ||
1269 | out: | 1270 | out: |
1270 | return; | 1271 | return; |
1271 | } | 1272 | } |
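
The audit_log_vformat() change retargets the allocation-failure path from the old out label to a new out_va_end label so the va_copy()'d argument list is released on every exit. A standalone, standard-C sketch of the same shape, using a hypothetical function in which vsnprintf() stands in for the skb expansion logic:

#include <stdarg.h>
#include <stdio.h>

static int format_retry(char *buf, size_t len, const char *fmt, ...)
{
	va_list args, args2;
	int n;

	va_start(args, fmt);
	va_copy(args2, args);		/* keep a second copy for the retry pass */

	n = vsnprintf(buf, len, fmt, args);
	if (n < 0)
		goto out_va_end;	/* failure path still reaches va_end(args2) */

	if ((size_t)n >= len)		/* truncated: a real caller would grow buf here */
		n = vsnprintf(buf, len, fmt, args2);

out_va_end:
	va_end(args2);
	va_end(args);
	return n;
}

Skipping va_end() on a copied list, as the old goto out did, is undefined behaviour and can leak state on ABIs where va_list is more than a plain pointer.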
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce4b054acee5..e7fe2b0d29b3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@ | |||
48 | #include <linux/fs.h> | 48 | #include <linux/fs.h> |
49 | #include <linux/namei.h> | 49 | #include <linux/namei.h> |
50 | #include <linux/mm.h> | 50 | #include <linux/mm.h> |
51 | #include <linux/module.h> | 51 | #include <linux/export.h> |
52 | #include <linux/slab.h> | 52 | #include <linux/slab.h> |
53 | #include <linux/mount.h> | 53 | #include <linux/mount.h> |
54 | #include <linux/socket.h> | 54 | #include <linux/socket.h> |
@@ -210,12 +210,12 @@ struct audit_context { | |||
210 | struct { | 210 | struct { |
211 | uid_t uid; | 211 | uid_t uid; |
212 | gid_t gid; | 212 | gid_t gid; |
213 | mode_t mode; | 213 | umode_t mode; |
214 | u32 osid; | 214 | u32 osid; |
215 | int has_perm; | 215 | int has_perm; |
216 | uid_t perm_uid; | 216 | uid_t perm_uid; |
217 | gid_t perm_gid; | 217 | gid_t perm_gid; |
218 | mode_t perm_mode; | 218 | umode_t perm_mode; |
219 | unsigned long qbytes; | 219 | unsigned long qbytes; |
220 | } ipc; | 220 | } ipc; |
221 | struct { | 221 | struct { |
@@ -234,7 +234,7 @@ struct audit_context { | |||
234 | } mq_sendrecv; | 234 | } mq_sendrecv; |
235 | struct { | 235 | struct { |
236 | int oflag; | 236 | int oflag; |
237 | mode_t mode; | 237 | umode_t mode; |
238 | struct mq_attr attr; | 238 | struct mq_attr attr; |
239 | } mq_open; | 239 | } mq_open; |
240 | struct { | 240 | struct { |
@@ -308,7 +308,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) | |||
308 | static int audit_match_filetype(struct audit_context *ctx, int which) | 308 | static int audit_match_filetype(struct audit_context *ctx, int which) |
309 | { | 309 | { |
310 | unsigned index = which & ~S_IFMT; | 310 | unsigned index = which & ~S_IFMT; |
311 | mode_t mode = which & S_IFMT; | 311 | umode_t mode = which & S_IFMT; |
312 | 312 | ||
313 | if (unlikely(!ctx)) | 313 | if (unlikely(!ctx)) |
314 | return 0; | 314 | return 0; |
@@ -1249,7 +1249,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1249 | case AUDIT_IPC: { | 1249 | case AUDIT_IPC: { |
1250 | u32 osid = context->ipc.osid; | 1250 | u32 osid = context->ipc.osid; |
1251 | 1251 | ||
1252 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", | 1252 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", |
1253 | context->ipc.uid, context->ipc.gid, context->ipc.mode); | 1253 | context->ipc.uid, context->ipc.gid, context->ipc.mode); |
1254 | if (osid) { | 1254 | if (osid) { |
1255 | char *ctx = NULL; | 1255 | char *ctx = NULL; |
@@ -1267,7 +1267,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1267 | ab = audit_log_start(context, GFP_KERNEL, | 1267 | ab = audit_log_start(context, GFP_KERNEL, |
1268 | AUDIT_IPC_SET_PERM); | 1268 | AUDIT_IPC_SET_PERM); |
1269 | audit_log_format(ab, | 1269 | audit_log_format(ab, |
1270 | "qbytes=%lx ouid=%u ogid=%u mode=%#o", | 1270 | "qbytes=%lx ouid=%u ogid=%u mode=%#ho", |
1271 | context->ipc.qbytes, | 1271 | context->ipc.qbytes, |
1272 | context->ipc.perm_uid, | 1272 | context->ipc.perm_uid, |
1273 | context->ipc.perm_gid, | 1273 | context->ipc.perm_gid, |
@@ -1278,7 +1278,7 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
1278 | break; } | 1278 | break; } |
1279 | case AUDIT_MQ_OPEN: { | 1279 | case AUDIT_MQ_OPEN: { |
1280 | audit_log_format(ab, | 1280 | audit_log_format(ab, |
1281 | "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " | 1281 | "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld " |
1282 | "mq_msgsize=%ld mq_curmsgs=%ld", | 1282 | "mq_msgsize=%ld mq_curmsgs=%ld", |
1283 | context->mq_open.oflag, context->mq_open.mode, | 1283 | context->mq_open.oflag, context->mq_open.mode, |
1284 | context->mq_open.attr.mq_flags, | 1284 | context->mq_open.attr.mq_flags, |
@@ -1502,7 +1502,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1502 | 1502 | ||
1503 | if (n->ino != (unsigned long)-1) { | 1503 | if (n->ino != (unsigned long)-1) { |
1504 | audit_log_format(ab, " inode=%lu" | 1504 | audit_log_format(ab, " inode=%lu" |
1505 | " dev=%02x:%02x mode=%#o" | 1505 | " dev=%02x:%02x mode=%#ho" |
1506 | " ouid=%u ogid=%u rdev=%02x:%02x", | 1506 | " ouid=%u ogid=%u rdev=%02x:%02x", |
1507 | n->ino, | 1507 | n->ino, |
1508 | MAJOR(n->dev), | 1508 | MAJOR(n->dev), |
@@ -2160,7 +2160,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | |||
2160 | * @attr: queue attributes | 2160 | * @attr: queue attributes |
2161 | * | 2161 | * |
2162 | */ | 2162 | */ |
2163 | void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) | 2163 | void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) |
2164 | { | 2164 | { |
2165 | struct audit_context *context = current->audit_context; | 2165 | struct audit_context *context = current->audit_context; |
2166 | 2166 | ||
@@ -2260,7 +2260,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
2260 | * | 2260 | * |
2261 | * Called only after audit_ipc_obj(). | 2261 | * Called only after audit_ipc_obj(). |
2262 | */ | 2262 | */ |
2263 | void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) | 2263 | void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) |
2264 | { | 2264 | { |
2265 | struct audit_context *context = current->audit_context; | 2265 | struct audit_context *context = current->audit_context; |
2266 | 2266 | ||
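
The auditsc.c hunks convert the stored modes from mode_t to umode_t, which the kernel defines as an unsigned short, so the matching audit_log_format() strings gain an h length modifier. A small userspace illustration of the conversion spec, with plain unsigned short standing in for umode_t:

#include <stdio.h>

int main(void)
{
	unsigned short mode = 0100644;	/* e.g. S_IFREG | 0644 for a regular file */

	/* '#' keeps the leading 0 of octal output; 'h' tells printf the
	 * promoted argument originated as an unsigned short */
	printf("mode=%#ho\n", mode);	/* prints: mode=0100644 */
	return 0;
}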
diff --git a/kernel/capability.c b/kernel/capability.c
index 74fb3b603045..0fcf1c14a297 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@ | |||
10 | #include <linux/audit.h> | 10 | #include <linux/audit.h> |
11 | #include <linux/capability.h> | 11 | #include <linux/capability.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 13 | #include <linux/export.h> |
14 | #include <linux/security.h> | 14 | #include <linux/security.h> |
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/pid_namespace.h> | 16 | #include <linux/pid_namespace.h> |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95d..a5d3b5325f77 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@ | |||
63 | 63 | ||
64 | #include <linux/atomic.h> | 64 | #include <linux/atomic.h> |
65 | 65 | ||
66 | /* | ||
67 | * cgroup_mutex is the master lock. Any modification to cgroup or its | ||
68 | * hierarchy must be performed while holding it. | ||
69 | * | ||
70 | * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify | ||
71 | * cgroupfs_root of any cgroup hierarchy - subsys list, flags, | ||
72 | * release_agent_path and so on. Modifying requires both cgroup_mutex and | ||
73 | * cgroup_root_mutex. Readers can acquire either of the two. This is to | ||
74 | * break the following locking order cycle. | ||
75 | * | ||
76 | * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem | ||
77 | * B. namespace_sem -> cgroup_mutex | ||
78 | * | ||
79 | * B happens only through cgroup_show_options() and using cgroup_root_mutex | ||
80 | * breaks it. | ||
81 | */ | ||
66 | static DEFINE_MUTEX(cgroup_mutex); | 82 | static DEFINE_MUTEX(cgroup_mutex); |
83 | static DEFINE_MUTEX(cgroup_root_mutex); | ||
67 | 84 | ||
68 | /* | 85 | /* |
69 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 86 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
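
The comment block introduced above documents the lock split that the rest of the cgroup.c diff implements: writers nest cgroup_root_mutex inside cgroup_mutex in that fixed order, while a reader such as cgroup_show_options() takes only the inner mutex, so the namespace_sem -> cgroup_mutex leg of the cycle never forms. An illustrative sketch of the rule (local example_ mutexes, not the ones defined in cgroup.c):

#include <linux/mutex.h>

static DEFINE_MUTEX(example_cgroup_mutex);	/* outer lock */
static DEFINE_MUTEX(example_root_mutex);	/* nested lock for root fields */

static void example_modify_root(void)
{
	mutex_lock(&example_cgroup_mutex);	/* always the outer lock first ... */
	mutex_lock(&example_root_mutex);	/* ... then the nested one */
	/* ... update subsys list, flags, release_agent_path ... */
	mutex_unlock(&example_root_mutex);
	mutex_unlock(&example_cgroup_mutex);
}

static void example_show_options(void)
{
	/* a reader of the root's fields needs only the inner mutex */
	mutex_lock(&example_root_mutex);
	/* ... read name and flags for the mount options output ... */
	mutex_unlock(&example_root_mutex);
}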
@@ -265,7 +282,7 @@ list_for_each_entry(_root, &roots, root_list) | |||
265 | /* the list of cgroups eligible for automatic release. Protected by | 282 | /* the list of cgroups eligible for automatic release. Protected by |
266 | * release_list_lock */ | 283 | * release_list_lock */ |
267 | static LIST_HEAD(release_list); | 284 | static LIST_HEAD(release_list); |
268 | static DEFINE_SPINLOCK(release_list_lock); | 285 | static DEFINE_RAW_SPINLOCK(release_list_lock); |
269 | static void cgroup_release_agent(struct work_struct *work); | 286 | static void cgroup_release_agent(struct work_struct *work); |
270 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); | 287 | static DECLARE_WORK(release_agent_work, cgroup_release_agent); |
271 | static void check_for_release(struct cgroup *cgrp); | 288 | static void check_for_release(struct cgroup *cgrp); |
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
760 | * -> cgroup_mkdir. | 777 | * -> cgroup_mkdir. |
761 | */ | 778 | */ |
762 | 779 | ||
763 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); | 780 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
764 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); | 781 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); |
765 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 782 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
766 | static int cgroup_populate_dir(struct cgroup *cgrp); | 783 | static int cgroup_populate_dir(struct cgroup *cgrp); |
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { | |||
775 | static int alloc_css_id(struct cgroup_subsys *ss, | 792 | static int alloc_css_id(struct cgroup_subsys *ss, |
776 | struct cgroup *parent, struct cgroup *child); | 793 | struct cgroup *parent, struct cgroup *child); |
777 | 794 | ||
778 | static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | 795 | static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) |
779 | { | 796 | { |
780 | struct inode *inode = new_inode(sb); | 797 | struct inode *inode = new_inode(sb); |
781 | 798 | ||
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) | |||
921 | * | 938 | * |
922 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; | 939 | * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; |
923 | */ | 940 | */ |
924 | DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); | 941 | static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); |
925 | 942 | ||
926 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) | 943 | static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) |
927 | { | 944 | { |
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
953 | int i; | 970 | int i; |
954 | 971 | ||
955 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 972 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
973 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | ||
956 | 974 | ||
957 | removed_bits = root->actual_subsys_bits & ~final_bits; | 975 | removed_bits = root->actual_subsys_bits & ~final_bits; |
958 | added_bits = final_bits & ~root->actual_subsys_bits; | 976 | added_bits = final_bits & ~root->actual_subsys_bits; |
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
1038 | return 0; | 1056 | return 0; |
1039 | } | 1057 | } |
1040 | 1058 | ||
1041 | static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | 1059 | static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) |
1042 | { | 1060 | { |
1043 | struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; | 1061 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; |
1044 | struct cgroup_subsys *ss; | 1062 | struct cgroup_subsys *ss; |
1045 | 1063 | ||
1046 | mutex_lock(&cgroup_mutex); | 1064 | mutex_lock(&cgroup_root_mutex); |
1047 | for_each_subsys(root, ss) | 1065 | for_each_subsys(root, ss) |
1048 | seq_printf(seq, ",%s", ss->name); | 1066 | seq_printf(seq, ",%s", ss->name); |
1049 | if (test_bit(ROOT_NOPREFIX, &root->flags)) | 1067 | if (test_bit(ROOT_NOPREFIX, &root->flags)) |
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1054 | seq_puts(seq, ",clone_children"); | 1072 | seq_puts(seq, ",clone_children"); |
1055 | if (strlen(root->name)) | 1073 | if (strlen(root->name)) |
1056 | seq_printf(seq, ",name=%s", root->name); | 1074 | seq_printf(seq, ",name=%s", root->name); |
1057 | mutex_unlock(&cgroup_mutex); | 1075 | mutex_unlock(&cgroup_root_mutex); |
1058 | return 0; | 1076 | return 0; |
1059 | } | 1077 | } |
1060 | 1078 | ||
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
1175 | 1193 | ||
1176 | /* | 1194 | /* |
1177 | * If the 'all' option was specified select all the subsystems, | 1195 | * If the 'all' option was specified select all the subsystems, |
1178 | * otherwise 'all, 'none' and a subsystem name options were not | 1196 | * otherwise if 'none', 'name=' and a subsystem name options |
1179 | * specified, let's default to 'all' | 1197 | * were not specified, let's default to 'all' |
1180 | */ | 1198 | */ |
1181 | if (all_ss || (!all_ss && !one_ss && !opts->none)) { | 1199 | if (all_ss || (!one_ss && !opts->none && !opts->name)) { |
1182 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1200 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
1183 | struct cgroup_subsys *ss = subsys[i]; | 1201 | struct cgroup_subsys *ss = subsys[i]; |
1184 | if (ss == NULL) | 1202 | if (ss == NULL) |
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1269 | 1287 | ||
1270 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1288 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
1271 | mutex_lock(&cgroup_mutex); | 1289 | mutex_lock(&cgroup_mutex); |
1290 | mutex_lock(&cgroup_root_mutex); | ||
1272 | 1291 | ||
1273 | /* See what subsystems are wanted */ | 1292 | /* See what subsystems are wanted */ |
1274 | ret = parse_cgroupfs_options(data, &opts); | 1293 | ret = parse_cgroupfs_options(data, &opts); |
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
1297 | out_unlock: | 1316 | out_unlock: |
1298 | kfree(opts.release_agent); | 1317 | kfree(opts.release_agent); |
1299 | kfree(opts.name); | 1318 | kfree(opts.name); |
1319 | mutex_unlock(&cgroup_root_mutex); | ||
1300 | mutex_unlock(&cgroup_mutex); | 1320 | mutex_unlock(&cgroup_mutex); |
1301 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); | 1321 | mutex_unlock(&cgrp->dentry->d_inode->i_mutex); |
1302 | return ret; | 1322 | return ret; |
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1481 | int ret = 0; | 1501 | int ret = 0; |
1482 | struct super_block *sb; | 1502 | struct super_block *sb; |
1483 | struct cgroupfs_root *new_root; | 1503 | struct cgroupfs_root *new_root; |
1504 | struct inode *inode; | ||
1484 | 1505 | ||
1485 | /* First find the desired set of subsystems */ | 1506 | /* First find the desired set of subsystems */ |
1486 | mutex_lock(&cgroup_mutex); | 1507 | mutex_lock(&cgroup_mutex); |
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1514 | /* We used the new root structure, so this is a new hierarchy */ | 1535 | /* We used the new root structure, so this is a new hierarchy */ |
1515 | struct list_head tmp_cg_links; | 1536 | struct list_head tmp_cg_links; |
1516 | struct cgroup *root_cgrp = &root->top_cgroup; | 1537 | struct cgroup *root_cgrp = &root->top_cgroup; |
1517 | struct inode *inode; | ||
1518 | struct cgroupfs_root *existing_root; | 1538 | struct cgroupfs_root *existing_root; |
1519 | const struct cred *cred; | 1539 | const struct cred *cred; |
1520 | int i; | 1540 | int i; |
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1528 | 1548 | ||
1529 | mutex_lock(&inode->i_mutex); | 1549 | mutex_lock(&inode->i_mutex); |
1530 | mutex_lock(&cgroup_mutex); | 1550 | mutex_lock(&cgroup_mutex); |
1551 | mutex_lock(&cgroup_root_mutex); | ||
1531 | 1552 | ||
1532 | if (strlen(root->name)) { | 1553 | /* Check for name clashes with existing mounts */ |
1533 | /* Check for name clashes with existing mounts */ | 1554 | ret = -EBUSY; |
1534 | for_each_active_root(existing_root) { | 1555 | if (strlen(root->name)) |
1535 | if (!strcmp(existing_root->name, root->name)) { | 1556 | for_each_active_root(existing_root) |
1536 | ret = -EBUSY; | 1557 | if (!strcmp(existing_root->name, root->name)) |
1537 | mutex_unlock(&cgroup_mutex); | 1558 | goto unlock_drop; |
1538 | mutex_unlock(&inode->i_mutex); | ||
1539 | goto drop_new_super; | ||
1540 | } | ||
1541 | } | ||
1542 | } | ||
1543 | 1559 | ||
1544 | /* | 1560 | /* |
1545 | * We're accessing css_set_count without locking | 1561 | * We're accessing css_set_count without locking |
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1549 | * have some link structures left over | 1565 | * have some link structures left over |
1550 | */ | 1566 | */ |
1551 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); | 1567 | ret = allocate_cg_links(css_set_count, &tmp_cg_links); |
1552 | if (ret) { | 1568 | if (ret) |
1553 | mutex_unlock(&cgroup_mutex); | 1569 | goto unlock_drop; |
1554 | mutex_unlock(&inode->i_mutex); | ||
1555 | goto drop_new_super; | ||
1556 | } | ||
1557 | 1570 | ||
1558 | ret = rebind_subsystems(root, root->subsys_bits); | 1571 | ret = rebind_subsystems(root, root->subsys_bits); |
1559 | if (ret == -EBUSY) { | 1572 | if (ret == -EBUSY) { |
1560 | mutex_unlock(&cgroup_mutex); | ||
1561 | mutex_unlock(&inode->i_mutex); | ||
1562 | free_cg_links(&tmp_cg_links); | 1573 | free_cg_links(&tmp_cg_links); |
1563 | goto drop_new_super; | 1574 | goto unlock_drop; |
1564 | } | 1575 | } |
1565 | /* | 1576 | /* |
1566 | * There must be no failure case after here, since rebinding | 1577 | * There must be no failure case after here, since rebinding |
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1599 | cred = override_creds(&init_cred); | 1610 | cred = override_creds(&init_cred); |
1600 | cgroup_populate_dir(root_cgrp); | 1611 | cgroup_populate_dir(root_cgrp); |
1601 | revert_creds(cred); | 1612 | revert_creds(cred); |
1613 | mutex_unlock(&cgroup_root_mutex); | ||
1602 | mutex_unlock(&cgroup_mutex); | 1614 | mutex_unlock(&cgroup_mutex); |
1603 | mutex_unlock(&inode->i_mutex); | 1615 | mutex_unlock(&inode->i_mutex); |
1604 | } else { | 1616 | } else { |
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
1615 | kfree(opts.name); | 1627 | kfree(opts.name); |
1616 | return dget(sb->s_root); | 1628 | return dget(sb->s_root); |
1617 | 1629 | ||
1630 | unlock_drop: | ||
1631 | mutex_unlock(&cgroup_root_mutex); | ||
1632 | mutex_unlock(&cgroup_mutex); | ||
1633 | mutex_unlock(&inode->i_mutex); | ||
1618 | drop_new_super: | 1634 | drop_new_super: |
1619 | deactivate_locked_super(sb); | 1635 | deactivate_locked_super(sb); |
1620 | drop_modules: | 1636 | drop_modules: |
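
With cgroup_root_mutex in the picture, cgroup_mount() holds three locks during hierarchy setup, and the hunk above funnels every failure after that point through a single unlock_drop label instead of unlocking by hand at each error site. A schematic, self-contained sketch of that unwind shape (the locks and step functions are hypothetical placeholders):

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);		/* stands in for inode->i_mutex      */
static DEFINE_MUTEX(lock_b);		/* stands in for cgroup_mutex        */
static DEFINE_MUTEX(lock_c);		/* stands in for cgroup_root_mutex   */

static int step_one(void) { return 0; }	/* hypothetical setup steps */
static int step_two(void) { return 0; }

static int example_setup(void)
{
	int ret;

	mutex_lock(&lock_a);
	mutex_lock(&lock_b);
	mutex_lock(&lock_c);

	ret = step_one();
	if (ret)
		goto unlock_drop;
	ret = step_two();

unlock_drop:				/* all exits drop the locks in reverse order */
	mutex_unlock(&lock_c);
	mutex_unlock(&lock_b);
	mutex_unlock(&lock_a);
	return ret;
}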
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1639 | BUG_ON(!list_empty(&cgrp->sibling)); | 1655 | BUG_ON(!list_empty(&cgrp->sibling)); |
1640 | 1656 | ||
1641 | mutex_lock(&cgroup_mutex); | 1657 | mutex_lock(&cgroup_mutex); |
1658 | mutex_lock(&cgroup_root_mutex); | ||
1642 | 1659 | ||
1643 | /* Rebind all subsystems back to the default hierarchy */ | 1660 | /* Rebind all subsystems back to the default hierarchy */ |
1644 | ret = rebind_subsystems(root, 0); | 1661 | ret = rebind_subsystems(root, 0); |
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
1664 | root_count--; | 1681 | root_count--; |
1665 | } | 1682 | } |
1666 | 1683 | ||
1684 | mutex_unlock(&cgroup_root_mutex); | ||
1667 | mutex_unlock(&cgroup_mutex); | 1685 | mutex_unlock(&cgroup_mutex); |
1668 | 1686 | ||
1669 | kill_litter_super(sb); | 1687 | kill_litter_super(sb); |
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
1740 | EXPORT_SYMBOL_GPL(cgroup_path); | 1758 | EXPORT_SYMBOL_GPL(cgroup_path); |
1741 | 1759 | ||
1742 | /* | 1760 | /* |
1761 | * Control Group taskset | ||
1762 | */ | ||
1763 | struct task_and_cgroup { | ||
1764 | struct task_struct *task; | ||
1765 | struct cgroup *cgrp; | ||
1766 | }; | ||
1767 | |||
1768 | struct cgroup_taskset { | ||
1769 | struct task_and_cgroup single; | ||
1770 | struct flex_array *tc_array; | ||
1771 | int tc_array_len; | ||
1772 | int idx; | ||
1773 | struct cgroup *cur_cgrp; | ||
1774 | }; | ||
1775 | |||
1776 | /** | ||
1777 | * cgroup_taskset_first - reset taskset and return the first task | ||
1778 | * @tset: taskset of interest | ||
1779 | * | ||
1780 | * @tset iteration is initialized and the first task is returned. | ||
1781 | */ | ||
1782 | struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) | ||
1783 | { | ||
1784 | if (tset->tc_array) { | ||
1785 | tset->idx = 0; | ||
1786 | return cgroup_taskset_next(tset); | ||
1787 | } else { | ||
1788 | tset->cur_cgrp = tset->single.cgrp; | ||
1789 | return tset->single.task; | ||
1790 | } | ||
1791 | } | ||
1792 | EXPORT_SYMBOL_GPL(cgroup_taskset_first); | ||
1793 | |||
1794 | /** | ||
1795 | * cgroup_taskset_next - iterate to the next task in taskset | ||
1796 | * @tset: taskset of interest | ||
1797 | * | ||
1798 | * Return the next task in @tset. Iteration must have been initialized | ||
1799 | * with cgroup_taskset_first(). | ||
1800 | */ | ||
1801 | struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) | ||
1802 | { | ||
1803 | struct task_and_cgroup *tc; | ||
1804 | |||
1805 | if (!tset->tc_array || tset->idx >= tset->tc_array_len) | ||
1806 | return NULL; | ||
1807 | |||
1808 | tc = flex_array_get(tset->tc_array, tset->idx++); | ||
1809 | tset->cur_cgrp = tc->cgrp; | ||
1810 | return tc->task; | ||
1811 | } | ||
1812 | EXPORT_SYMBOL_GPL(cgroup_taskset_next); | ||
1813 | |||
1814 | /** | ||
1815 | * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task | ||
1816 | * @tset: taskset of interest | ||
1817 | * | ||
1818 | * Return the cgroup for the current (last returned) task of @tset. This | ||
1819 | * function must be preceded by either cgroup_taskset_first() or | ||
1820 | * cgroup_taskset_next(). | ||
1821 | */ | ||
1822 | struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) | ||
1823 | { | ||
1824 | return tset->cur_cgrp; | ||
1825 | } | ||
1826 | EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); | ||
1827 | |||
1828 | /** | ||
1829 | * cgroup_taskset_size - return the number of tasks in taskset | ||
1830 | * @tset: taskset of interest | ||
1831 | */ | ||
1832 | int cgroup_taskset_size(struct cgroup_taskset *tset) | ||
1833 | { | ||
1834 | return tset->tc_array ? tset->tc_array_len : 1; | ||
1835 | } | ||
1836 | EXPORT_SYMBOL_GPL(cgroup_taskset_size); | ||
1837 | |||
1838 | |||
1839 | /* | ||
1743 | * cgroup_task_migrate - move a task from one cgroup to another. | 1840 | * cgroup_task_migrate - move a task from one cgroup to another. |
1744 | * | 1841 | * |
1745 | * 'guarantee' is set if the caller promises that a new css_set for the task | 1842 | * 'guarantee' is set if the caller promises that a new css_set for the task |
1746 | * will already exist. If not set, this function might sleep, and can fail with | 1843 | * will already exist. If not set, this function might sleep, and can fail with |
1747 | * -ENOMEM. Otherwise, it can only fail with -ESRCH. | 1844 | * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. |
1748 | */ | 1845 | */ |
1749 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | 1846 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, |
1750 | struct task_struct *tsk, bool guarantee) | 1847 | struct task_struct *tsk, bool guarantee) |
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1753 | struct css_set *newcg; | 1850 | struct css_set *newcg; |
1754 | 1851 | ||
1755 | /* | 1852 | /* |
1756 | * get old css_set. we need to take task_lock and refcount it, because | 1853 | * We are synchronized through threadgroup_lock() against PF_EXITING |
1757 | * an exiting task can change its css_set to init_css_set and drop its | 1854 | * setting such that we can't race against cgroup_exit() changing the |
1758 | * old one without taking cgroup_mutex. | 1855 | * css_set to init_css_set and dropping the old one. |
1759 | */ | 1856 | */ |
1760 | task_lock(tsk); | 1857 | WARN_ON_ONCE(tsk->flags & PF_EXITING); |
1761 | oldcg = tsk->cgroups; | 1858 | oldcg = tsk->cgroups; |
1762 | get_css_set(oldcg); | ||
1763 | task_unlock(tsk); | ||
1764 | 1859 | ||
1765 | /* locate or allocate a new css_set for this task. */ | 1860 | /* locate or allocate a new css_set for this task. */ |
1766 | if (guarantee) { | 1861 | if (guarantee) { |
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1775 | might_sleep(); | 1870 | might_sleep(); |
1776 | /* find_css_set will give us newcg already referenced. */ | 1871 | /* find_css_set will give us newcg already referenced. */ |
1777 | newcg = find_css_set(oldcg, cgrp); | 1872 | newcg = find_css_set(oldcg, cgrp); |
1778 | if (!newcg) { | 1873 | if (!newcg) |
1779 | put_css_set(oldcg); | ||
1780 | return -ENOMEM; | 1874 | return -ENOMEM; |
1781 | } | ||
1782 | } | 1875 | } |
1783 | put_css_set(oldcg); | ||
1784 | 1876 | ||
1785 | /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ | ||
1786 | task_lock(tsk); | 1877 | task_lock(tsk); |
1787 | if (tsk->flags & PF_EXITING) { | ||
1788 | task_unlock(tsk); | ||
1789 | put_css_set(newcg); | ||
1790 | return -ESRCH; | ||
1791 | } | ||
1792 | rcu_assign_pointer(tsk->cgroups, newcg); | 1878 | rcu_assign_pointer(tsk->cgroups, newcg); |
1793 | task_unlock(tsk); | 1879 | task_unlock(tsk); |
1794 | 1880 | ||
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | |||
1814 | * @cgrp: the cgroup the task is attaching to | 1900 | * @cgrp: the cgroup the task is attaching to |
1815 | * @tsk: the task to be attached | 1901 | * @tsk: the task to be attached |
1816 | * | 1902 | * |
1817 | * Call holding cgroup_mutex. May take task_lock of | 1903 | * Call with cgroup_mutex and threadgroup locked. May take task_lock of |
1818 | * the task 'tsk' during call. | 1904 | * @tsk during call. |
1819 | */ | 1905 | */ |
1820 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1906 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
1821 | { | 1907 | { |
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1823 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1909 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1824 | struct cgroup *oldcgrp; | 1910 | struct cgroup *oldcgrp; |
1825 | struct cgroupfs_root *root = cgrp->root; | 1911 | struct cgroupfs_root *root = cgrp->root; |
1912 | struct cgroup_taskset tset = { }; | ||
1913 | |||
1914 | /* @tsk either already exited or can't exit until the end */ | ||
1915 | if (tsk->flags & PF_EXITING) | ||
1916 | return -ESRCH; | ||
1826 | 1917 | ||
1827 | /* Nothing to do if the task is already in that cgroup */ | 1918 | /* Nothing to do if the task is already in that cgroup */ |
1828 | oldcgrp = task_cgroup_from_root(tsk, root); | 1919 | oldcgrp = task_cgroup_from_root(tsk, root); |
1829 | if (cgrp == oldcgrp) | 1920 | if (cgrp == oldcgrp) |
1830 | return 0; | 1921 | return 0; |
1831 | 1922 | ||
1923 | tset.single.task = tsk; | ||
1924 | tset.single.cgrp = oldcgrp; | ||
1925 | |||
1832 | for_each_subsys(root, ss) { | 1926 | for_each_subsys(root, ss) { |
1833 | if (ss->can_attach) { | 1927 | if (ss->can_attach) { |
1834 | retval = ss->can_attach(ss, cgrp, tsk); | 1928 | retval = ss->can_attach(ss, cgrp, &tset); |
1835 | if (retval) { | 1929 | if (retval) { |
1836 | /* | 1930 | /* |
1837 | * Remember on which subsystem the can_attach() | 1931 | * Remember on which subsystem the can_attach() |
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1843 | goto out; | 1937 | goto out; |
1844 | } | 1938 | } |
1845 | } | 1939 | } |
1846 | if (ss->can_attach_task) { | ||
1847 | retval = ss->can_attach_task(cgrp, tsk); | ||
1848 | if (retval) { | ||
1849 | failed_ss = ss; | ||
1850 | goto out; | ||
1851 | } | ||
1852 | } | ||
1853 | } | 1940 | } |
1854 | 1941 | ||
1855 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); | 1942 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); |
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
1857 | goto out; | 1944 | goto out; |
1858 | 1945 | ||
1859 | for_each_subsys(root, ss) { | 1946 | for_each_subsys(root, ss) { |
1860 | if (ss->pre_attach) | ||
1861 | ss->pre_attach(cgrp); | ||
1862 | if (ss->attach_task) | ||
1863 | ss->attach_task(cgrp, tsk); | ||
1864 | if (ss->attach) | 1947 | if (ss->attach) |
1865 | ss->attach(ss, cgrp, oldcgrp, tsk); | 1948 | ss->attach(ss, cgrp, &tset); |
1866 | } | 1949 | } |
1867 | 1950 | ||
1868 | synchronize_rcu(); | 1951 | synchronize_rcu(); |
@@ -1884,7 +1967,7 @@ out: | |||
1884 | */ | 1967 | */ |
1885 | break; | 1968 | break; |
1886 | if (ss->cancel_attach) | 1969 | if (ss->cancel_attach) |
1887 | ss->cancel_attach(ss, cgrp, tsk); | 1970 | ss->cancel_attach(ss, cgrp, &tset); |
1888 | } | 1971 | } |
1889 | } | 1972 | } |
1890 | return retval; | 1973 | return retval; |
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp, | |||
1935 | 2018 | ||
1936 | read_lock(&css_set_lock); | 2019 | read_lock(&css_set_lock); |
1937 | newcg = find_existing_css_set(cg, cgrp, template); | 2020 | newcg = find_existing_css_set(cg, cgrp, template); |
1938 | if (newcg) | ||
1939 | get_css_set(newcg); | ||
1940 | read_unlock(&css_set_lock); | 2021 | read_unlock(&css_set_lock); |
1941 | 2022 | ||
1942 | /* doesn't exist at all? */ | 2023 | /* doesn't exist at all? */ |
1943 | if (!newcg) | 2024 | if (!newcg) |
1944 | return false; | 2025 | return false; |
1945 | /* see if it's already in the list */ | 2026 | /* see if it's already in the list */ |
1946 | list_for_each_entry(cg_entry, newcg_list, links) { | 2027 | list_for_each_entry(cg_entry, newcg_list, links) |
1947 | if (cg_entry->cg == newcg) { | 2028 | if (cg_entry->cg == newcg) |
1948 | put_css_set(newcg); | ||
1949 | return true; | 2029 | return true; |
1950 | } | ||
1951 | } | ||
1952 | 2030 | ||
1953 | /* not found */ | 2031 | /* not found */ |
1954 | put_css_set(newcg); | ||
1955 | return false; | 2032 | return false; |
1956 | } | 2033 | } |
1957 | 2034 | ||
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, | |||
1985 | * @cgrp: the cgroup to attach to | 2062 | * @cgrp: the cgroup to attach to |
1986 | * @leader: the threadgroup leader task_struct of the group to be attached | 2063 | * @leader: the threadgroup leader task_struct of the group to be attached |
1987 | * | 2064 | * |
1988 | * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will | 2065 | * Call holding cgroup_mutex and the group_rwsem of the leader. Will take |
1989 | * take task_lock of each thread in leader's threadgroup individually in turn. | 2066 | * task_lock of each thread in leader's threadgroup individually in turn. |
1990 | */ | 2067 | */ |
1991 | int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | 2068 | static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) |
1992 | { | 2069 | { |
1993 | int retval, i, group_size; | 2070 | int retval, i, group_size; |
1994 | struct cgroup_subsys *ss, *failed_ss = NULL; | 2071 | struct cgroup_subsys *ss, *failed_ss = NULL; |
1995 | bool cancel_failed_ss = false; | ||
1996 | /* guaranteed to be initialized later, but the compiler needs this */ | 2072 | /* guaranteed to be initialized later, but the compiler needs this */ |
1997 | struct cgroup *oldcgrp = NULL; | ||
1998 | struct css_set *oldcg; | 2073 | struct css_set *oldcg; |
1999 | struct cgroupfs_root *root = cgrp->root; | 2074 | struct cgroupfs_root *root = cgrp->root; |
2000 | /* threadgroup list cursor and array */ | 2075 | /* threadgroup list cursor and array */ |
2001 | struct task_struct *tsk; | 2076 | struct task_struct *tsk; |
2077 | struct task_and_cgroup *tc; | ||
2002 | struct flex_array *group; | 2078 | struct flex_array *group; |
2079 | struct cgroup_taskset tset = { }; | ||
2003 | /* | 2080 | /* |
2004 | * we need to make sure we have css_sets for all the tasks we're | 2081 | * we need to make sure we have css_sets for all the tasks we're |
2005 | * going to move -before- we actually start moving them, so that in | 2082 | * going to move -before- we actually start moving them, so that in |
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2012 | * step 0: in order to do expensive, possibly blocking operations for | 2089 | * step 0: in order to do expensive, possibly blocking operations for |
2013 | * every thread, we cannot iterate the thread group list, since it needs | 2090 | * every thread, we cannot iterate the thread group list, since it needs |
2014 | * rcu or tasklist locked. instead, build an array of all threads in the | 2091 | * rcu or tasklist locked. instead, build an array of all threads in the |
2015 | * group - threadgroup_fork_lock prevents new threads from appearing, | 2092 | * group - group_rwsem prevents new threads from appearing, and if |
2016 | * and if threads exit, this will just be an over-estimate. | 2093 | * threads exit, this will just be an over-estimate. |
2017 | */ | 2094 | */ |
2018 | group_size = get_nr_threads(leader); | 2095 | group_size = get_nr_threads(leader); |
2019 | /* flex_array supports very large thread-groups better than kmalloc. */ | 2096 | /* flex_array supports very large thread-groups better than kmalloc. */ |
2020 | group = flex_array_alloc(sizeof(struct task_struct *), group_size, | 2097 | group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); |
2021 | GFP_KERNEL); | ||
2022 | if (!group) | 2098 | if (!group) |
2023 | return -ENOMEM; | 2099 | return -ENOMEM; |
2024 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ | 2100 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ |
@@ -2027,7 +2103,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2027 | goto out_free_group_list; | 2103 | goto out_free_group_list; |
2028 | 2104 | ||
2029 | /* prevent changes to the threadgroup list while we take a snapshot. */ | 2105 | /* prevent changes to the threadgroup list while we take a snapshot. */ |
2030 | rcu_read_lock(); | 2106 | read_lock(&tasklist_lock); |
2031 | if (!thread_group_leader(leader)) { | 2107 | if (!thread_group_leader(leader)) { |
2032 | /* | 2108 | /* |
2033 | * a race with de_thread from another thread's exec() may strip | 2109 | * a race with de_thread from another thread's exec() may strip |
@@ -2036,53 +2112,57 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2036 | * throw this task away and try again (from cgroup_procs_write); | 2112 | * throw this task away and try again (from cgroup_procs_write); |
2037 | * this is "double-double-toil-and-trouble-check locking". | 2113 | * this is "double-double-toil-and-trouble-check locking". |
2038 | */ | 2114 | */ |
2039 | rcu_read_unlock(); | 2115 | read_unlock(&tasklist_lock); |
2040 | retval = -EAGAIN; | 2116 | retval = -EAGAIN; |
2041 | goto out_free_group_list; | 2117 | goto out_free_group_list; |
2042 | } | 2118 | } |
2043 | /* take a reference on each task in the group to go in the array. */ | 2119 | |
2044 | tsk = leader; | 2120 | tsk = leader; |
2045 | i = 0; | 2121 | i = 0; |
2046 | do { | 2122 | do { |
2123 | struct task_and_cgroup ent; | ||
2124 | |||
2125 | /* @tsk either already exited or can't exit until the end */ | ||
2126 | if (tsk->flags & PF_EXITING) | ||
2127 | continue; | ||
2128 | |||
2047 | /* as per above, nr_threads may decrease, but not increase. */ | 2129 | /* as per above, nr_threads may decrease, but not increase. */ |
2048 | BUG_ON(i >= group_size); | 2130 | BUG_ON(i >= group_size); |
2049 | get_task_struct(tsk); | ||
2050 | /* | 2131 | /* |
2051 | * saying GFP_ATOMIC has no effect here because we did prealloc | 2132 | * saying GFP_ATOMIC has no effect here because we did prealloc |
2052 | * earlier, but it's good form to communicate our expectations. | 2133 | * earlier, but it's good form to communicate our expectations. |
2053 | */ | 2134 | */ |
2054 | retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); | 2135 | ent.task = tsk; |
2136 | ent.cgrp = task_cgroup_from_root(tsk, root); | ||
2137 | /* nothing to do if this task is already in the cgroup */ | ||
2138 | if (ent.cgrp == cgrp) | ||
2139 | continue; | ||
2140 | retval = flex_array_put(group, i, &ent, GFP_ATOMIC); | ||
2055 | BUG_ON(retval != 0); | 2141 | BUG_ON(retval != 0); |
2056 | i++; | 2142 | i++; |
2057 | } while_each_thread(leader, tsk); | 2143 | } while_each_thread(leader, tsk); |
2058 | /* remember the number of threads in the array for later. */ | 2144 | /* remember the number of threads in the array for later. */ |
2059 | group_size = i; | 2145 | group_size = i; |
2060 | rcu_read_unlock(); | 2146 | tset.tc_array = group; |
2147 | tset.tc_array_len = group_size; | ||
2148 | read_unlock(&tasklist_lock); | ||
2149 | |||
2150 | /* methods shouldn't be called if no task is actually migrating */ | ||
2151 | retval = 0; | ||
2152 | if (!group_size) | ||
2153 | goto out_free_group_list; | ||
2061 | 2154 | ||
2062 | /* | 2155 | /* |
2063 | * step 1: check that we can legitimately attach to the cgroup. | 2156 | * step 1: check that we can legitimately attach to the cgroup. |
2064 | */ | 2157 | */ |
2065 | for_each_subsys(root, ss) { | 2158 | for_each_subsys(root, ss) { |
2066 | if (ss->can_attach) { | 2159 | if (ss->can_attach) { |
2067 | retval = ss->can_attach(ss, cgrp, leader); | 2160 | retval = ss->can_attach(ss, cgrp, &tset); |
2068 | if (retval) { | 2161 | if (retval) { |
2069 | failed_ss = ss; | 2162 | failed_ss = ss; |
2070 | goto out_cancel_attach; | 2163 | goto out_cancel_attach; |
2071 | } | 2164 | } |
2072 | } | 2165 | } |
2073 | /* a callback to be run on every thread in the threadgroup. */ | ||
2074 | if (ss->can_attach_task) { | ||
2075 | /* run on each task in the threadgroup. */ | ||
2076 | for (i = 0; i < group_size; i++) { | ||
2077 | tsk = flex_array_get_ptr(group, i); | ||
2078 | retval = ss->can_attach_task(cgrp, tsk); | ||
2079 | if (retval) { | ||
2080 | failed_ss = ss; | ||
2081 | cancel_failed_ss = true; | ||
2082 | goto out_cancel_attach; | ||
2083 | } | ||
2084 | } | ||
2085 | } | ||
2086 | } | 2166 | } |
2087 | 2167 | ||
2088 | /* | 2168 | /* |
@@ -2091,69 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | |||
2091 | */ | 2171 | */ |
2092 | INIT_LIST_HEAD(&newcg_list); | 2172 | INIT_LIST_HEAD(&newcg_list); |
2093 | for (i = 0; i < group_size; i++) { | 2173 | for (i = 0; i < group_size; i++) { |
2094 | tsk = flex_array_get_ptr(group, i); | 2174 | tc = flex_array_get(group, i); |
2095 | /* nothing to do if this task is already in the cgroup */ | 2175 | oldcg = tc->task->cgroups; |
2096 | oldcgrp = task_cgroup_from_root(tsk, root); | 2176 | |
2097 | if (cgrp == oldcgrp) | 2177 | /* if we don't already have it in the list get a new one */ |
2098 | continue; | 2178 | if (!css_set_check_fetched(cgrp, tc->task, oldcg, |
2099 | /* get old css_set pointer */ | 2179 | &newcg_list)) { |
2100 | task_lock(tsk); | ||
2101 | if (tsk->flags & PF_EXITING) { | ||
2102 | /* ignore this task if it's going away */ | ||
2103 | task_unlock(tsk); | ||
2104 | continue; | ||
2105 | } | ||
2106 | oldcg = tsk->cgroups; | ||
2107 | get_css_set(oldcg); | ||
2108 | task_unlock(tsk); | ||
2109 | /* see if the new one for us is already in the list? */ | ||
2110 | if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { | ||
2111 | /* was already there, nothing to do. */ | ||
2112 | put_css_set(oldcg); | ||
2113 | } else { | ||
2114 | /* we don't already have it. get new one. */ | ||
2115 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); | 2180 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); |
2116 | put_css_set(oldcg); | ||
2117 | if (retval) | 2181 | if (retval) |
2118 | goto out_list_teardown; | 2182 | goto out_list_teardown; |
2119 | } | 2183 | } |
2120 | } | 2184 | } |
2121 | 2185 | ||
2122 | /* | 2186 | /* |
2123 | * step 3: now that we're guaranteed success wrt the css_sets, proceed | 2187 | * step 3: now that we're guaranteed success wrt the css_sets, |
2124 | * to move all tasks to the new cgroup, calling ss->attach_task for each | 2188 | * proceed to move all tasks to the new cgroup. There are no |
2125 | * one along the way. there are no failure cases after here, so this is | 2189 | * failure cases after here, so this is the commit point. |
2126 | * the commit point. | ||
2127 | */ | 2190 | */ |
2128 | for_each_subsys(root, ss) { | ||
2129 | if (ss->pre_attach) | ||
2130 | ss->pre_attach(cgrp); | ||
2131 | } | ||
2132 | for (i = 0; i < group_size; i++) { | 2191 | for (i = 0; i < group_size; i++) { |
2133 | tsk = flex_array_get_ptr(group, i); | 2192 | tc = flex_array_get(group, i); |
2134 | /* leave current thread as it is if it's already there */ | 2193 | retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); |
2135 | oldcgrp = task_cgroup_from_root(tsk, root); | 2194 | BUG_ON(retval); |
2136 | if (cgrp == oldcgrp) | ||
2137 | continue; | ||
2138 | /* attach each task to each subsystem */ | ||
2139 | for_each_subsys(root, ss) { | ||
2140 | if (ss->attach_task) | ||
2141 | ss->attach_task(cgrp, tsk); | ||
2142 | } | ||
2143 | /* if the thread is PF_EXITING, it can just get skipped. */ | ||
2144 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); | ||
2145 | BUG_ON(retval != 0 && retval != -ESRCH); | ||
2146 | } | 2195 | } |
2147 | /* nothing is sensitive to fork() after this point. */ | 2196 | /* nothing is sensitive to fork() after this point. */ |
2148 | 2197 | ||
2149 | /* | 2198 | /* |
2150 | * step 4: do expensive, non-thread-specific subsystem callbacks. | 2199 | * step 4: do subsystem attach callbacks. |
2151 | * TODO: if ever a subsystem needs to know the oldcgrp for each task | ||
2152 | * being moved, this call will need to be reworked to communicate that. | ||
2153 | */ | 2200 | */ |
2154 | for_each_subsys(root, ss) { | 2201 | for_each_subsys(root, ss) { |
2155 | if (ss->attach) | 2202 | if (ss->attach) |
2156 | ss->attach(ss, cgrp, oldcgrp, leader); | 2203 | ss->attach(ss, cgrp, &tset); |
2157 | } | 2204 | } |
2158 | 2205 | ||
2159 | /* | 2206 | /* |
@@ -2173,20 +2220,12 @@ out_cancel_attach: | |||
2173 | /* same deal as in cgroup_attach_task */ | 2220 | /* same deal as in cgroup_attach_task */ |
2174 | if (retval) { | 2221 | if (retval) { |
2175 | for_each_subsys(root, ss) { | 2222 | for_each_subsys(root, ss) { |
2176 | if (ss == failed_ss) { | 2223 | if (ss == failed_ss) |
2177 | if (cancel_failed_ss && ss->cancel_attach) | ||
2178 | ss->cancel_attach(ss, cgrp, leader); | ||
2179 | break; | 2224 | break; |
2180 | } | ||
2181 | if (ss->cancel_attach) | 2225 | if (ss->cancel_attach) |
2182 | ss->cancel_attach(ss, cgrp, leader); | 2226 | ss->cancel_attach(ss, cgrp, &tset); |
2183 | } | 2227 | } |
2184 | } | 2228 | } |
2185 | /* clean up the array of referenced threads in the group. */ | ||
2186 | for (i = 0; i < group_size; i++) { | ||
2187 | tsk = flex_array_get_ptr(group, i); | ||
2188 | put_task_struct(tsk); | ||
2189 | } | ||
2190 | out_free_group_list: | 2229 | out_free_group_list: |
2191 | flex_array_free(group); | 2230 | flex_array_free(group); |
2192 | return retval; | 2231 | return retval; |
@@ -2194,8 +2233,8 @@ out_free_group_list: | |||
2194 | 2233 | ||
2195 | /* | 2234 | /* |
2196 | * Find the task_struct of the task to attach by vpid and pass it along to the | 2235 | * Find the task_struct of the task to attach by vpid and pass it along to the |
2197 | * function to attach either it or all tasks in its threadgroup. Will take | 2236 | * function to attach either it or all tasks in its threadgroup. Will lock |
2198 | * cgroup_mutex; may take task_lock of task. | 2237 | * cgroup_mutex and threadgroup; may take task_lock of task. |
2199 | */ | 2238 | */ |
2200 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | 2239 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) |
2201 | { | 2240 | { |
@@ -2222,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | |||
2222 | * detect it later. | 2261 | * detect it later. |
2223 | */ | 2262 | */ |
2224 | tsk = tsk->group_leader; | 2263 | tsk = tsk->group_leader; |
2225 | } else if (tsk->flags & PF_EXITING) { | ||
2226 | /* optimization for the single-task-only case */ | ||
2227 | rcu_read_unlock(); | ||
2228 | cgroup_unlock(); | ||
2229 | return -ESRCH; | ||
2230 | } | 2264 | } |
2231 | |||
2232 | /* | 2265 | /* |
2233 | * even if we're attaching all tasks in the thread group, we | 2266 | * even if we're attaching all tasks in the thread group, we |
2234 | * only need to check permissions on one of them. | 2267 | * only need to check permissions on one of them. |
@@ -2251,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) | |||
2251 | get_task_struct(tsk); | 2284 | get_task_struct(tsk); |
2252 | } | 2285 | } |
2253 | 2286 | ||
2254 | if (threadgroup) { | 2287 | threadgroup_lock(tsk); |
2255 | threadgroup_fork_write_lock(tsk); | 2288 | |
2289 | if (threadgroup) | ||
2256 | ret = cgroup_attach_proc(cgrp, tsk); | 2290 | ret = cgroup_attach_proc(cgrp, tsk); |
2257 | threadgroup_fork_write_unlock(tsk); | 2291 | else |
2258 | } else { | ||
2259 | ret = cgroup_attach_task(cgrp, tsk); | 2292 | ret = cgroup_attach_task(cgrp, tsk); |
2260 | } | 2293 | |
2294 | threadgroup_unlock(tsk); | ||
2295 | |||
2261 | put_task_struct(tsk); | 2296 | put_task_struct(tsk); |
2262 | cgroup_unlock(); | 2297 | cgroup_unlock(); |
2263 | return ret; | 2298 | return ret; |
@@ -2308,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, | |||
2308 | return -EINVAL; | 2343 | return -EINVAL; |
2309 | if (!cgroup_lock_live_group(cgrp)) | 2344 | if (!cgroup_lock_live_group(cgrp)) |
2310 | return -ENODEV; | 2345 | return -ENODEV; |
2346 | mutex_lock(&cgroup_root_mutex); | ||
2311 | strcpy(cgrp->root->release_agent_path, buffer); | 2347 | strcpy(cgrp->root->release_agent_path, buffer); |
2348 | mutex_unlock(&cgroup_root_mutex); | ||
2312 | cgroup_unlock(); | 2349 | cgroup_unlock(); |
2313 | return 0; | 2350 | return 0; |
2314 | } | 2351 | } |
@@ -2587,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file) | |||
2587 | return __d_cft(file->f_dentry); | 2624 | return __d_cft(file->f_dentry); |
2588 | } | 2625 | } |
2589 | 2626 | ||
2590 | static int cgroup_create_file(struct dentry *dentry, mode_t mode, | 2627 | static int cgroup_create_file(struct dentry *dentry, umode_t mode, |
2591 | struct super_block *sb) | 2628 | struct super_block *sb) |
2592 | { | 2629 | { |
2593 | struct inode *inode; | 2630 | struct inode *inode; |
@@ -2628,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode, | |||
2628 | * @mode: mode to set on new directory. | 2665 | * @mode: mode to set on new directory. |
2629 | */ | 2666 | */ |
2630 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | 2667 | static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, |
2631 | mode_t mode) | 2668 | umode_t mode) |
2632 | { | 2669 | { |
2633 | struct dentry *parent; | 2670 | struct dentry *parent; |
2634 | int error = 0; | 2671 | int error = 0; |
@@ -2655,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, | |||
2655 | * returns S_IRUGO if it has only a read handler | 2692 | * returns S_IRUGO if it has only a read handler |
2656 | * returns S_IWUSR if it has only a write handler | 2693 | * returns S_IWUSR if it has only a write handler |
2657 | */ | 2694 | */ |
2658 | static mode_t cgroup_file_mode(const struct cftype *cft) | 2695 | static umode_t cgroup_file_mode(const struct cftype *cft) |
2659 | { | 2696 | { |
2660 | mode_t mode = 0; | 2697 | umode_t mode = 0; |
2661 | 2698 | ||
2662 | if (cft->mode) | 2699 | if (cft->mode) |
2663 | return cft->mode; | 2700 | return cft->mode; |
@@ -2680,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp, | |||
2680 | struct dentry *dir = cgrp->dentry; | 2717 | struct dentry *dir = cgrp->dentry; |
2681 | struct dentry *dentry; | 2718 | struct dentry *dentry; |
2682 | int error; | 2719 | int error; |
2683 | mode_t mode; | 2720 | umode_t mode; |
2684 | 2721 | ||
2685 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2722 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
2686 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { | 2723 | if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { |
@@ -2791,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void) | |||
2791 | } | 2828 | } |
2792 | 2829 | ||
2793 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 2830 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
2831 | __acquires(css_set_lock) | ||
2794 | { | 2832 | { |
2795 | /* | 2833 | /* |
2796 | * The first time anyone tries to iterate across a cgroup, | 2834 | * The first time anyone tries to iterate across a cgroup, |
@@ -2830,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, | |||
2830 | } | 2868 | } |
2831 | 2869 | ||
2832 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | 2870 | void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) |
2871 | __releases(css_set_lock) | ||
2833 | { | 2872 | { |
2834 | read_unlock(&css_set_lock); | 2873 | read_unlock(&css_set_lock); |
2835 | } | 2874 | } |
@@ -3754,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) | |||
3754 | * Must be called with the mutex on the parent inode held | 3793 | * Must be called with the mutex on the parent inode held |
3755 | */ | 3794 | */ |
3756 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | 3795 | static long cgroup_create(struct cgroup *parent, struct dentry *dentry, |
3757 | mode_t mode) | 3796 | umode_t mode) |
3758 | { | 3797 | { |
3759 | struct cgroup *cgrp; | 3798 | struct cgroup *cgrp; |
3760 | struct cgroupfs_root *root = parent->root; | 3799 | struct cgroupfs_root *root = parent->root; |
@@ -3848,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
3848 | return err; | 3887 | return err; |
3849 | } | 3888 | } |
3850 | 3889 | ||
3851 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) | 3890 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) |
3852 | { | 3891 | { |
3853 | struct cgroup *c_parent = dentry->d_parent->d_fsdata; | 3892 | struct cgroup *c_parent = dentry->d_parent->d_fsdata; |
3854 | 3893 | ||
@@ -4014,11 +4053,11 @@ again: | |||
4014 | finish_wait(&cgroup_rmdir_waitq, &wait); | 4053 | finish_wait(&cgroup_rmdir_waitq, &wait); |
4015 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); | 4054 | clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); |
4016 | 4055 | ||
4017 | spin_lock(&release_list_lock); | 4056 | raw_spin_lock(&release_list_lock); |
4018 | set_bit(CGRP_REMOVED, &cgrp->flags); | 4057 | set_bit(CGRP_REMOVED, &cgrp->flags); |
4019 | if (!list_empty(&cgrp->release_list)) | 4058 | if (!list_empty(&cgrp->release_list)) |
4020 | list_del_init(&cgrp->release_list); | 4059 | list_del_init(&cgrp->release_list); |
4021 | spin_unlock(&release_list_lock); | 4060 | raw_spin_unlock(&release_list_lock); |
4022 | 4061 | ||
4023 | cgroup_lock_hierarchy(cgrp->root); | 4062 | cgroup_lock_hierarchy(cgrp->root); |
4024 | /* delete this cgroup from parent->children */ | 4063 | /* delete this cgroup from parent->children */ |
@@ -4493,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = { | |||
4493 | * | 4532 | * |
4494 | * A pointer to the shared css_set was automatically copied in | 4533 | * A pointer to the shared css_set was automatically copied in |
4495 | * fork.c by dup_task_struct(). However, we ignore that copy, since | 4534 | * fork.c by dup_task_struct(). However, we ignore that copy, since |
4496 | * it was not made under the protection of RCU or cgroup_mutex, so | 4535 | * it was not made under the protection of RCU, cgroup_mutex or |
4497 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might | 4536 | * threadgroup_change_begin(), so it might no longer be a valid |
4498 | * have already changed current->cgroups, allowing the previously | 4537 | * cgroup pointer. cgroup_attach_task() might have already changed |
4499 | * referenced cgroup group to be removed and freed. | 4538 | * current->cgroups, allowing the previously referenced cgroup |
4539 | * group to be removed and freed. | ||
4540 | * | ||
4541 | * Beyond pointer validity, the css_set inheritance must also happen | ||
4542 | * between threadgroup_change_begin() and threadgroup_change_end(), | ||
4543 | * so that a process-wide migration performed by | ||
4544 | * cgroup_attach_proc() cannot miss a thread because it is too early | ||
4545 | * or too late in the fork stage. | ||
4500 | * | 4546 | * |
4501 | * At the point that cgroup_fork() is called, 'current' is the parent | 4547 | * At the point that cgroup_fork() is called, 'current' is the parent |
4502 | * task, and the passed argument 'child' points to the child task. | 4548 | * task, and the passed argument 'child' points to the child task. |
4503 | */ | 4549 | */ |
4504 | void cgroup_fork(struct task_struct *child) | 4550 | void cgroup_fork(struct task_struct *child) |
4505 | { | 4551 | { |
4506 | task_lock(current); | 4552 | /* |
4553 | * We don't need to task_lock() current because current->cgroups | ||
4554 | * can't be changed concurrently here. The parent obviously hasn't | ||
4555 | * exited and called cgroup_exit(), and we are synchronized against | ||
4556 | * cgroup migration through threadgroup_change_begin(). | ||
4557 | */ | ||
4507 | child->cgroups = current->cgroups; | 4558 | child->cgroups = current->cgroups; |
4508 | get_css_set(child->cgroups); | 4559 | get_css_set(child->cgroups); |
4509 | task_unlock(current); | ||
4510 | INIT_LIST_HEAD(&child->cg_list); | 4560 | INIT_LIST_HEAD(&child->cg_list); |
4511 | } | 4561 | } |
4512 | 4562 | ||
@@ -4548,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child) | |||
4548 | { | 4598 | { |
4549 | if (use_task_css_set_links) { | 4599 | if (use_task_css_set_links) { |
4550 | write_lock(&css_set_lock); | 4600 | write_lock(&css_set_lock); |
4551 | task_lock(child); | 4601 | if (list_empty(&child->cg_list)) { |
4552 | if (list_empty(&child->cg_list)) | 4602 | /* |
4603 | * It's safe to use child->cgroups without task_lock() | ||
4604 | * here because we are protected through | ||
4605 | * threadgroup_change_begin() against concurrent | ||
4606 | * css_set change in cgroup_task_migrate(). Also | ||
4607 | * the task can't exit at that point until | ||
4608 | * wake_up_new_task() is called, so we are protected | ||
4609 | * against cgroup_exit() setting child->cgroup to | ||
4610 | * init_css_set. | ||
4611 | */ | ||
4553 | list_add(&child->cg_list, &child->cgroups->tasks); | 4612 | list_add(&child->cg_list, &child->cgroups->tasks); |
4554 | task_unlock(child); | 4613 | } |
4555 | write_unlock(&css_set_lock); | 4614 | write_unlock(&css_set_lock); |
4556 | } | 4615 | } |
4557 | } | 4616 | } |
@@ -4671,13 +4730,13 @@ static void check_for_release(struct cgroup *cgrp) | |||
4671 | * already queued for a userspace notification, queue | 4730 | * already queued for a userspace notification, queue |
4672 | * it now */ | 4731 | * it now */ |
4673 | int need_schedule_work = 0; | 4732 | int need_schedule_work = 0; |
4674 | spin_lock(&release_list_lock); | 4733 | raw_spin_lock(&release_list_lock); |
4675 | if (!cgroup_is_removed(cgrp) && | 4734 | if (!cgroup_is_removed(cgrp) && |
4676 | list_empty(&cgrp->release_list)) { | 4735 | list_empty(&cgrp->release_list)) { |
4677 | list_add(&cgrp->release_list, &release_list); | 4736 | list_add(&cgrp->release_list, &release_list); |
4678 | need_schedule_work = 1; | 4737 | need_schedule_work = 1; |
4679 | } | 4738 | } |
4680 | spin_unlock(&release_list_lock); | 4739 | raw_spin_unlock(&release_list_lock); |
4681 | if (need_schedule_work) | 4740 | if (need_schedule_work) |
4682 | schedule_work(&release_agent_work); | 4741 | schedule_work(&release_agent_work); |
4683 | } | 4742 | } |
@@ -4729,7 +4788,7 @@ static void cgroup_release_agent(struct work_struct *work) | |||
4729 | { | 4788 | { |
4730 | BUG_ON(work != &release_agent_work); | 4789 | BUG_ON(work != &release_agent_work); |
4731 | mutex_lock(&cgroup_mutex); | 4790 | mutex_lock(&cgroup_mutex); |
4732 | spin_lock(&release_list_lock); | 4791 | raw_spin_lock(&release_list_lock); |
4733 | while (!list_empty(&release_list)) { | 4792 | while (!list_empty(&release_list)) { |
4734 | char *argv[3], *envp[3]; | 4793 | char *argv[3], *envp[3]; |
4735 | int i; | 4794 | int i; |
@@ -4738,7 +4797,7 @@ static void cgroup_release_agent(struct work_struct *work) | |||
4738 | struct cgroup, | 4797 | struct cgroup, |
4739 | release_list); | 4798 | release_list); |
4740 | list_del_init(&cgrp->release_list); | 4799 | list_del_init(&cgrp->release_list); |
4741 | spin_unlock(&release_list_lock); | 4800 | raw_spin_unlock(&release_list_lock); |
4742 | pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 4801 | pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
4743 | if (!pathbuf) | 4802 | if (!pathbuf) |
4744 | goto continue_free; | 4803 | goto continue_free; |
@@ -4768,9 +4827,9 @@ static void cgroup_release_agent(struct work_struct *work) | |||
4768 | continue_free: | 4827 | continue_free: |
4769 | kfree(pathbuf); | 4828 | kfree(pathbuf); |
4770 | kfree(agentbuf); | 4829 | kfree(agentbuf); |
4771 | spin_lock(&release_list_lock); | 4830 | raw_spin_lock(&release_list_lock); |
4772 | } | 4831 | } |
4773 | spin_unlock(&release_list_lock); | 4832 | raw_spin_unlock(&release_list_lock); |
4774 | mutex_unlock(&cgroup_mutex); | 4833 | mutex_unlock(&cgroup_mutex); |
4775 | } | 4834 | } |
4776 | 4835 | ||
@@ -4880,9 +4939,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
4880 | 4939 | ||
4881 | rcu_assign_pointer(id->css, NULL); | 4940 | rcu_assign_pointer(id->css, NULL); |
4882 | rcu_assign_pointer(css->id, NULL); | 4941 | rcu_assign_pointer(css->id, NULL); |
4883 | spin_lock(&ss->id_lock); | 4942 | write_lock(&ss->id_lock); |
4884 | idr_remove(&ss->idr, id->id); | 4943 | idr_remove(&ss->idr, id->id); |
4885 | spin_unlock(&ss->id_lock); | 4944 | write_unlock(&ss->id_lock); |
4886 | kfree_rcu(id, rcu_head); | 4945 | kfree_rcu(id, rcu_head); |
4887 | } | 4946 | } |
4888 | EXPORT_SYMBOL_GPL(free_css_id); | 4947 | EXPORT_SYMBOL_GPL(free_css_id); |
@@ -4908,10 +4967,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
4908 | error = -ENOMEM; | 4967 | error = -ENOMEM; |
4909 | goto err_out; | 4968 | goto err_out; |
4910 | } | 4969 | } |
4911 | spin_lock(&ss->id_lock); | 4970 | write_lock(&ss->id_lock); |
4912 | /* Don't use 0. allocates an ID of 1-65535 */ | 4971 | /* Don't use 0. allocates an ID of 1-65535 */ |
4913 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); | 4972 | error = idr_get_new_above(&ss->idr, newid, 1, &myid); |
4914 | spin_unlock(&ss->id_lock); | 4973 | write_unlock(&ss->id_lock); |
4915 | 4974 | ||
4916 | /* Returns error when there are no free spaces for new ID.*/ | 4975 | /* Returns error when there are no free spaces for new ID.*/ |
4917 | if (error) { | 4976 | if (error) { |
@@ -4926,9 +4985,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) | |||
4926 | return newid; | 4985 | return newid; |
4927 | remove_idr: | 4986 | remove_idr: |
4928 | error = -ENOSPC; | 4987 | error = -ENOSPC; |
4929 | spin_lock(&ss->id_lock); | 4988 | write_lock(&ss->id_lock); |
4930 | idr_remove(&ss->idr, myid); | 4989 | idr_remove(&ss->idr, myid); |
4931 | spin_unlock(&ss->id_lock); | 4990 | write_unlock(&ss->id_lock); |
4932 | err_out: | 4991 | err_out: |
4933 | kfree(newid); | 4992 | kfree(newid); |
4934 | return ERR_PTR(error); | 4993 | return ERR_PTR(error); |
@@ -4940,7 +4999,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, | |||
4940 | { | 4999 | { |
4941 | struct css_id *newid; | 5000 | struct css_id *newid; |
4942 | 5001 | ||
4943 | spin_lock_init(&ss->id_lock); | 5002 | rwlock_init(&ss->id_lock); |
4944 | idr_init(&ss->idr); | 5003 | idr_init(&ss->idr); |
4945 | 5004 | ||
4946 | newid = get_new_cssid(ss, 0); | 5005 | newid = get_new_cssid(ss, 0); |
@@ -5035,9 +5094,9 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
5035 | * scan next entry from bitmap(tree), tmpid is updated after | 5094 | * scan next entry from bitmap(tree), tmpid is updated after |
5036 | * idr_get_next(). | 5095 | * idr_get_next(). |
5037 | */ | 5096 | */ |
5038 | spin_lock(&ss->id_lock); | 5097 | read_lock(&ss->id_lock); |
5039 | tmp = idr_get_next(&ss->idr, &tmpid); | 5098 | tmp = idr_get_next(&ss->idr, &tmpid); |
5040 | spin_unlock(&ss->id_lock); | 5099 | read_unlock(&ss->id_lock); |
5041 | 5100 | ||
5042 | if (!tmp) | 5101 | if (!tmp) |
5043 | break; | 5102 | break; |
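
The css_id hunks just above convert ss->id_lock from a spinlock to an rwlock so that lookups in css_get_next() can run concurrently while idr insertion and removal still serialize. A minimal sketch of the same pattern, using hypothetical names in place of the cgroup structures and the era's idr_pre_get()/idr_get_new_above() API:

#include <linux/idr.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static DEFINE_RWLOCK(obj_id_lock);	/* stands in for ss->id_lock */
static DEFINE_IDR(obj_idr);		/* stands in for ss->idr */

/* writers: id allocation and removal still exclude each other */
static int obj_id_insert(void *obj, int *out_id)
{
	int err;

	if (!idr_pre_get(&obj_idr, GFP_KERNEL))
		return -ENOMEM;
	write_lock(&obj_id_lock);
	err = idr_get_new_above(&obj_idr, obj, 1, out_id);
	write_unlock(&obj_id_lock);
	return err;
}

/* readers: scans such as css_get_next() may now run in parallel */
static void *obj_id_next(int *idp)
{
	void *obj;

	read_lock(&obj_id_lock);
	obj = idr_get_next(&obj_idr, idp);
	read_unlock(&obj_id_lock);
	return obj;
}
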
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e691818d7e45..fc0646b78a64 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
@@ -14,7 +14,7 @@ | |||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/module.h> | 17 | #include <linux/export.h> |
18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include <linux/cgroup.h> | 19 | #include <linux/cgroup.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task) | |||
48 | struct freezer, css); | 48 | struct freezer, css); |
49 | } | 49 | } |
50 | 50 | ||
51 | static inline int __cgroup_freezing_or_frozen(struct task_struct *task) | 51 | bool cgroup_freezing(struct task_struct *task) |
52 | { | 52 | { |
53 | enum freezer_state state = task_freezer(task)->state; | 53 | enum freezer_state state; |
54 | return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); | 54 | bool ret; |
55 | } | ||
56 | 55 | ||
57 | int cgroup_freezing_or_frozen(struct task_struct *task) | 56 | rcu_read_lock(); |
58 | { | 57 | state = task_freezer(task)->state; |
59 | int result; | 58 | ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; |
60 | task_lock(task); | 59 | rcu_read_unlock(); |
61 | result = __cgroup_freezing_or_frozen(task); | 60 | |
62 | task_unlock(task); | 61 | return ret; |
63 | return result; | ||
64 | } | 62 | } |
65 | 63 | ||
66 | /* | 64 | /* |
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys; | |||
102 | * freezer_can_attach(): | 100 | * freezer_can_attach(): |
103 | * cgroup_mutex (held by caller of can_attach) | 101 | * cgroup_mutex (held by caller of can_attach) |
104 | * | 102 | * |
105 | * cgroup_freezing_or_frozen(): | ||
106 | * task->alloc_lock (to get task's cgroup) | ||
107 | * | ||
108 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): | 103 | * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): |
109 | * freezer->lock | 104 | * freezer->lock |
110 | * sighand->siglock (if the cgroup is freezing) | 105 | * sighand->siglock (if the cgroup is freezing) |
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys; | |||
130 | * write_lock css_set_lock (cgroup iterator start) | 125 | * write_lock css_set_lock (cgroup iterator start) |
131 | * task->alloc_lock | 126 | * task->alloc_lock |
132 | * read_lock css_set_lock (cgroup iterator start) | 127 | * read_lock css_set_lock (cgroup iterator start) |
133 | * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) | 128 | * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) |
134 | * sighand->siglock | 129 | * sighand->siglock |
135 | */ | 130 | */ |
136 | static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, | 131 | static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, |
@@ -150,7 +145,18 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, | |||
150 | static void freezer_destroy(struct cgroup_subsys *ss, | 145 | static void freezer_destroy(struct cgroup_subsys *ss, |
151 | struct cgroup *cgroup) | 146 | struct cgroup *cgroup) |
152 | { | 147 | { |
153 | kfree(cgroup_freezer(cgroup)); | 148 | struct freezer *freezer = cgroup_freezer(cgroup); |
149 | |||
150 | if (freezer->state != CGROUP_THAWED) | ||
151 | atomic_dec(&system_freezing_cnt); | ||
152 | kfree(freezer); | ||
153 | } | ||
154 | |||
155 | /* task is frozen or will freeze immediately when next it gets woken */ | ||
156 | static bool is_task_frozen_enough(struct task_struct *task) | ||
157 | { | ||
158 | return frozen(task) || | ||
159 | (task_is_stopped_or_traced(task) && freezing(task)); | ||
154 | } | 160 | } |
155 | 161 | ||
156 | /* | 162 | /* |
@@ -160,13 +166,17 @@ static void freezer_destroy(struct cgroup_subsys *ss, | |||
160 | */ | 166 | */ |
161 | static int freezer_can_attach(struct cgroup_subsys *ss, | 167 | static int freezer_can_attach(struct cgroup_subsys *ss, |
162 | struct cgroup *new_cgroup, | 168 | struct cgroup *new_cgroup, |
163 | struct task_struct *task) | 169 | struct cgroup_taskset *tset) |
164 | { | 170 | { |
165 | struct freezer *freezer; | 171 | struct freezer *freezer; |
172 | struct task_struct *task; | ||
166 | 173 | ||
167 | /* | 174 | /* |
168 | * Anything frozen can't move or be moved to/from. | 175 | * Anything frozen can't move or be moved to/from. |
169 | */ | 176 | */ |
177 | cgroup_taskset_for_each(task, new_cgroup, tset) | ||
178 | if (cgroup_freezing(task)) | ||
179 | return -EBUSY; | ||
170 | 180 | ||
171 | freezer = cgroup_freezer(new_cgroup); | 181 | freezer = cgroup_freezer(new_cgroup); |
172 | if (freezer->state != CGROUP_THAWED) | 182 | if (freezer->state != CGROUP_THAWED) |
@@ -175,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss, | |||
175 | return 0; | 185 | return 0; |
176 | } | 186 | } |
177 | 187 | ||
178 | static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | ||
179 | { | ||
180 | rcu_read_lock(); | ||
181 | if (__cgroup_freezing_or_frozen(tsk)) { | ||
182 | rcu_read_unlock(); | ||
183 | return -EBUSY; | ||
184 | } | ||
185 | rcu_read_unlock(); | ||
186 | return 0; | ||
187 | } | ||
188 | |||
189 | static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | 188 | static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) |
190 | { | 189 | { |
191 | struct freezer *freezer; | 190 | struct freezer *freezer; |
@@ -213,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) | |||
213 | 212 | ||
214 | /* Locking avoids race with FREEZING -> THAWED transitions. */ | 213 | /* Locking avoids race with FREEZING -> THAWED transitions. */ |
215 | if (freezer->state == CGROUP_FREEZING) | 214 | if (freezer->state == CGROUP_FREEZING) |
216 | freeze_task(task, true); | 215 | freeze_task(task); |
217 | spin_unlock_irq(&freezer->lock); | 216 | spin_unlock_irq(&freezer->lock); |
218 | } | 217 | } |
219 | 218 | ||
@@ -231,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup, | |||
231 | cgroup_iter_start(cgroup, &it); | 230 | cgroup_iter_start(cgroup, &it); |
232 | while ((task = cgroup_iter_next(cgroup, &it))) { | 231 | while ((task = cgroup_iter_next(cgroup, &it))) { |
233 | ntotal++; | 232 | ntotal++; |
234 | if (frozen(task)) | 233 | if (freezing(task) && is_task_frozen_enough(task)) |
235 | nfrozen++; | 234 | nfrozen++; |
236 | } | 235 | } |
237 | 236 | ||
@@ -279,12 +278,11 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
279 | struct task_struct *task; | 278 | struct task_struct *task; |
280 | unsigned int num_cant_freeze_now = 0; | 279 | unsigned int num_cant_freeze_now = 0; |
281 | 280 | ||
282 | freezer->state = CGROUP_FREEZING; | ||
283 | cgroup_iter_start(cgroup, &it); | 281 | cgroup_iter_start(cgroup, &it); |
284 | while ((task = cgroup_iter_next(cgroup, &it))) { | 282 | while ((task = cgroup_iter_next(cgroup, &it))) { |
285 | if (!freeze_task(task, true)) | 283 | if (!freeze_task(task)) |
286 | continue; | 284 | continue; |
287 | if (frozen(task)) | 285 | if (is_task_frozen_enough(task)) |
288 | continue; | 286 | continue; |
289 | if (!freezing(task) && !freezer_should_skip(task)) | 287 | if (!freezing(task) && !freezer_should_skip(task)) |
290 | num_cant_freeze_now++; | 288 | num_cant_freeze_now++; |
@@ -300,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) | |||
300 | struct task_struct *task; | 298 | struct task_struct *task; |
301 | 299 | ||
302 | cgroup_iter_start(cgroup, &it); | 300 | cgroup_iter_start(cgroup, &it); |
303 | while ((task = cgroup_iter_next(cgroup, &it))) { | 301 | while ((task = cgroup_iter_next(cgroup, &it))) |
304 | thaw_process(task); | 302 | __thaw_task(task); |
305 | } | ||
306 | cgroup_iter_end(cgroup, &it); | 303 | cgroup_iter_end(cgroup, &it); |
307 | |||
308 | freezer->state = CGROUP_THAWED; | ||
309 | } | 304 | } |
310 | 305 | ||
311 | static int freezer_change_state(struct cgroup *cgroup, | 306 | static int freezer_change_state(struct cgroup *cgroup, |
@@ -319,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup, | |||
319 | spin_lock_irq(&freezer->lock); | 314 | spin_lock_irq(&freezer->lock); |
320 | 315 | ||
321 | update_if_frozen(cgroup, freezer); | 316 | update_if_frozen(cgroup, freezer); |
322 | if (goal_state == freezer->state) | ||
323 | goto out; | ||
324 | 317 | ||
325 | switch (goal_state) { | 318 | switch (goal_state) { |
326 | case CGROUP_THAWED: | 319 | case CGROUP_THAWED: |
320 | if (freezer->state != CGROUP_THAWED) | ||
321 | atomic_dec(&system_freezing_cnt); | ||
322 | freezer->state = CGROUP_THAWED; | ||
327 | unfreeze_cgroup(cgroup, freezer); | 323 | unfreeze_cgroup(cgroup, freezer); |
328 | break; | 324 | break; |
329 | case CGROUP_FROZEN: | 325 | case CGROUP_FROZEN: |
326 | if (freezer->state == CGROUP_THAWED) | ||
327 | atomic_inc(&system_freezing_cnt); | ||
328 | freezer->state = CGROUP_FREEZING; | ||
330 | retval = try_to_freeze_cgroup(cgroup, freezer); | 329 | retval = try_to_freeze_cgroup(cgroup, freezer); |
331 | break; | 330 | break; |
332 | default: | 331 | default: |
333 | BUG(); | 332 | BUG(); |
334 | } | 333 | } |
335 | out: | 334 | |
336 | spin_unlock_irq(&freezer->lock); | 335 | spin_unlock_irq(&freezer->lock); |
337 | 336 | ||
338 | return retval; | 337 | return retval; |
@@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = { | |||
381 | .populate = freezer_populate, | 380 | .populate = freezer_populate, |
382 | .subsys_id = freezer_subsys_id, | 381 | .subsys_id = freezer_subsys_id, |
383 | .can_attach = freezer_can_attach, | 382 | .can_attach = freezer_can_attach, |
384 | .can_attach_task = freezer_can_attach_task, | ||
385 | .pre_attach = NULL, | ||
386 | .attach_task = NULL, | ||
387 | .attach = NULL, | ||
388 | .fork = freezer_fork, | 383 | .fork = freezer_fork, |
389 | .exit = NULL, | ||
390 | }; | 384 | }; |
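
For reference, the taskset-based ->can_attach() shown above replaces the old per-task ->can_attach_task() callback: the subsystem now receives the whole set of tasks being migrated and walks it itself. A hedged sketch of a minimal callback under this interface (the subsystem and its rejection test are invented for illustration):

static int example_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			      struct cgroup_taskset *tset)
{
	struct task_struct *task;

	/* veto the whole migration if any task in the set fails the check */
	cgroup_taskset_for_each(task, cgrp, tset)
		if (task->flags & PF_KTHREAD)	/* illustrative condition only */
			return -EINVAL;
	return 0;
}
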
diff --git a/kernel/compat.c b/kernel/compat.c index e2435ee9993a..f346cedfe24d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/unistd.h> | 21 | #include <linux/unistd.h> |
22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
23 | #include <linux/timex.h> | 23 | #include <linux/timex.h> |
24 | #include <linux/export.h> | ||
24 | #include <linux/migrate.h> | 25 | #include <linux/migrate.h> |
25 | #include <linux/posix-timers.h> | 26 | #include <linux/posix-timers.h> |
26 | #include <linux/times.h> | 27 | #include <linux/times.h> |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 12b7458f23b1..2060c6e57027 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -10,11 +10,12 @@ | |||
10 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
11 | #include <linux/unistd.h> | 11 | #include <linux/unistd.h> |
12 | #include <linux/cpu.h> | 12 | #include <linux/cpu.h> |
13 | #include <linux/module.h> | 13 | #include <linux/export.h> |
14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/gfp.h> | 17 | #include <linux/gfp.h> |
18 | #include <linux/suspend.h> | ||
18 | 19 | ||
19 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
20 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ | 21 | /* Serializes the updates to cpu_online_mask, cpu_present_mask */ |
@@ -177,8 +178,7 @@ static inline void check_for_tasks(int cpu) | |||
177 | write_lock_irq(&tasklist_lock); | 178 | write_lock_irq(&tasklist_lock); |
178 | for_each_process(p) { | 179 | for_each_process(p) { |
179 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && | 180 | if (task_cpu(p) == cpu && p->state == TASK_RUNNING && |
180 | (!cputime_eq(p->utime, cputime_zero) || | 181 | (p->utime || p->stime)) |
181 | !cputime_eq(p->stime, cputime_zero))) | ||
182 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " | 182 | printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " |
183 | "(state = %ld, flags = %x)\n", | 183 | "(state = %ld, flags = %x)\n", |
184 | p->comm, task_pid_nr(p), cpu, | 184 | p->comm, task_pid_nr(p), cpu, |
@@ -379,6 +379,7 @@ out: | |||
379 | cpu_maps_update_done(); | 379 | cpu_maps_update_done(); |
380 | return err; | 380 | return err; |
381 | } | 381 | } |
382 | EXPORT_SYMBOL_GPL(cpu_up); | ||
382 | 383 | ||
383 | #ifdef CONFIG_PM_SLEEP_SMP | 384 | #ifdef CONFIG_PM_SLEEP_SMP |
384 | static cpumask_var_t frozen_cpus; | 385 | static cpumask_var_t frozen_cpus; |
@@ -469,13 +470,86 @@ out: | |||
469 | cpu_maps_update_done(); | 470 | cpu_maps_update_done(); |
470 | } | 471 | } |
471 | 472 | ||
472 | static int alloc_frozen_cpus(void) | 473 | static int __init alloc_frozen_cpus(void) |
473 | { | 474 | { |
474 | if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) | 475 | if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO)) |
475 | return -ENOMEM; | 476 | return -ENOMEM; |
476 | return 0; | 477 | return 0; |
477 | } | 478 | } |
478 | core_initcall(alloc_frozen_cpus); | 479 | core_initcall(alloc_frozen_cpus); |
480 | |||
481 | /* | ||
482 | * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU | ||
483 | * hotplug when tasks are about to be frozen. Also, don't allow the freezer | ||
484 | * to continue until any currently running CPU hotplug operation gets | ||
485 | * completed. | ||
486 | * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the | ||
487 | * 'cpu_add_remove_lock'. And this same lock is also taken by the regular | ||
488 | * CPU hotplug path and released only after it is complete. Thus, we | ||
489 | * (and hence the freezer) will block here until any currently running CPU | ||
490 | * hotplug operation gets completed. | ||
491 | */ | ||
492 | void cpu_hotplug_disable_before_freeze(void) | ||
493 | { | ||
494 | cpu_maps_update_begin(); | ||
495 | cpu_hotplug_disabled = 1; | ||
496 | cpu_maps_update_done(); | ||
497 | } | ||
498 | |||
499 | |||
500 | /* | ||
501 | * When tasks have been thawed, re-enable regular CPU hotplug (which had been | ||
502 | * disabled while beginning to freeze tasks). | ||
503 | */ | ||
504 | void cpu_hotplug_enable_after_thaw(void) | ||
505 | { | ||
506 | cpu_maps_update_begin(); | ||
507 | cpu_hotplug_disabled = 0; | ||
508 | cpu_maps_update_done(); | ||
509 | } | ||
510 | |||
511 | /* | ||
512 | * When callbacks for CPU hotplug notifications are being executed, we must | ||
513 | * ensure that the state of the system with respect to the tasks being frozen | ||
514 | * or not, as reported by the notification, remains unchanged *throughout the | ||
515 | * duration* of the execution of the callbacks. | ||
516 | * Hence we need to prevent the freezer from racing with regular CPU hotplug. | ||
517 | * | ||
518 | * This synchronization is implemented by mutually excluding regular CPU | ||
519 | * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/ | ||
520 | * Hibernate notifications. | ||
521 | */ | ||
522 | static int | ||
523 | cpu_hotplug_pm_callback(struct notifier_block *nb, | ||
524 | unsigned long action, void *ptr) | ||
525 | { | ||
526 | switch (action) { | ||
527 | |||
528 | case PM_SUSPEND_PREPARE: | ||
529 | case PM_HIBERNATION_PREPARE: | ||
530 | cpu_hotplug_disable_before_freeze(); | ||
531 | break; | ||
532 | |||
533 | case PM_POST_SUSPEND: | ||
534 | case PM_POST_HIBERNATION: | ||
535 | cpu_hotplug_enable_after_thaw(); | ||
536 | break; | ||
537 | |||
538 | default: | ||
539 | return NOTIFY_DONE; | ||
540 | } | ||
541 | |||
542 | return NOTIFY_OK; | ||
543 | } | ||
544 | |||
545 | |||
546 | static int __init cpu_hotplug_pm_sync_init(void) | ||
547 | { | ||
548 | pm_notifier(cpu_hotplug_pm_callback, 0); | ||
549 | return 0; | ||
550 | } | ||
551 | core_initcall(cpu_hotplug_pm_sync_init); | ||
552 | |||
479 | #endif /* CONFIG_PM_SLEEP_SMP */ | 553 | #endif /* CONFIG_PM_SLEEP_SMP */ |
480 | 554 | ||
481 | /** | 555 | /** |
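
The cpu_hotplug_pm_callback() added above is a plain PM notifier; any code that needs the same kind of mutual exclusion with suspend/hibernate can hook the identical events. A rough sketch of the pattern, with placeholder bodies:

#include <linux/suspend.h>
#include <linux/notifier.h>
#include <linux/init.h>

static int example_pm_callback(struct notifier_block *nb,
			       unsigned long action, void *ptr)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		/* quiesce before tasks are frozen */
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		/* resume normal operation after tasks are thawed */
		break;
	default:
		return NOTIFY_DONE;
	}
	return NOTIFY_OK;
}

static int __init example_pm_sync_init(void)
{
	pm_notifier(example_pm_callback, 0);
	return 0;
}
core_initcall(example_pm_sync_init);
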
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c new file mode 100644 index 000000000000..249152e15308 --- /dev/null +++ b/kernel/cpu_pm.c | |||
@@ -0,0 +1,233 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2011 Google, Inc. | ||
3 | * | ||
4 | * Author: | ||
5 | * Colin Cross <ccross@android.com> | ||
6 | * | ||
7 | * This software is licensed under the terms of the GNU General Public | ||
8 | * License version 2, as published by the Free Software Foundation, and | ||
9 | * may be copied, distributed, and modified under those terms. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #include <linux/kernel.h> | ||
19 | #include <linux/cpu_pm.h> | ||
20 | #include <linux/module.h> | ||
21 | #include <linux/notifier.h> | ||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/syscore_ops.h> | ||
24 | |||
25 | static DEFINE_RWLOCK(cpu_pm_notifier_lock); | ||
26 | static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain); | ||
27 | |||
28 | static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls) | ||
29 | { | ||
30 | int ret; | ||
31 | |||
32 | ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL, | ||
33 | nr_to_call, nr_calls); | ||
34 | |||
35 | return notifier_to_errno(ret); | ||
36 | } | ||
37 | |||
38 | /** | ||
39 | * cpu_pm_register_notifier - register a driver with cpu_pm | ||
40 | * @nb: notifier block to register | ||
41 | * | ||
42 | * Add a driver to a list of drivers that are notified about | ||
43 | * CPU and CPU cluster low power entry and exit. | ||
44 | * | ||
45 | * This function may sleep, and has the same return conditions as | ||
46 | * raw_notifier_chain_register. | ||
47 | */ | ||
48 | int cpu_pm_register_notifier(struct notifier_block *nb) | ||
49 | { | ||
50 | unsigned long flags; | ||
51 | int ret; | ||
52 | |||
53 | write_lock_irqsave(&cpu_pm_notifier_lock, flags); | ||
54 | ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb); | ||
55 | write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); | ||
56 | |||
57 | return ret; | ||
58 | } | ||
59 | EXPORT_SYMBOL_GPL(cpu_pm_register_notifier); | ||
60 | |||
61 | /** | ||
62 | * cpu_pm_unregister_notifier - unregister a driver with cpu_pm | ||
63 | * @nb: notifier block to be unregistered | ||
64 | * | ||
65 | * Remove a driver from the CPU PM notifier list. | ||
66 | * | ||
67 | * This function may sleep, and has the same return conditions as | ||
68 | * raw_notifier_chain_unregister. | ||
69 | */ | ||
70 | int cpu_pm_unregister_notifier(struct notifier_block *nb) | ||
71 | { | ||
72 | unsigned long flags; | ||
73 | int ret; | ||
74 | |||
75 | write_lock_irqsave(&cpu_pm_notifier_lock, flags); | ||
76 | ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb); | ||
77 | write_unlock_irqrestore(&cpu_pm_notifier_lock, flags); | ||
78 | |||
79 | return ret; | ||
80 | } | ||
81 | EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier); | ||
82 | |||
83 | /** | ||
84 | * cpu_pm_enter - CPU low power entry notifier | ||
85 | * | ||
86 | * Notifies listeners that a single CPU is entering a low power state that may | ||
87 | * cause some blocks in the same power domain as the cpu to reset. | ||
88 | * | ||
89 | * Must be called on the affected CPU with interrupts disabled. Platform is | ||
90 | * responsible for ensuring that cpu_pm_enter is not called twice on the same | ||
91 | * CPU before cpu_pm_exit is called. Notified drivers can include VFP | ||
92 | * co-processor, interrupt controller and its PM extensions, local CPU | ||
93 | * timers context save/restore which shouldn't be interrupted. Hence it | ||
94 | * must be called with interrupts disabled. | ||
95 | * | ||
96 | * Return conditions are same as __raw_notifier_call_chain. | ||
97 | */ | ||
98 | int cpu_pm_enter(void) | ||
99 | { | ||
100 | int nr_calls; | ||
101 | int ret = 0; | ||
102 | |||
103 | read_lock(&cpu_pm_notifier_lock); | ||
104 | ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls); | ||
105 | if (ret) | ||
106 | /* | ||
107 | * Inform the listeners (nr_calls - 1) that were notified earlier | ||
108 | * that CPU PM entry failed, so they can undo their preparations. | ||
109 | */ | ||
110 | cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL); | ||
111 | read_unlock(&cpu_pm_notifier_lock); | ||
112 | |||
113 | return ret; | ||
114 | } | ||
115 | EXPORT_SYMBOL_GPL(cpu_pm_enter); | ||
116 | |||
117 | /** | ||
118 | * cpu_pm_exit - CPU low power exit notifier | ||
119 | * | ||
120 | * Notifies listeners that a single CPU is exiting a low power state that may | ||
121 | * have caused some blocks in the same power domain as the cpu to reset. | ||
122 | * | ||
123 | * Notified drivers can include VFP co-processor, interrupt controller | ||
124 | * and its PM extensions, local CPU timers context save/restore which | ||
125 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | ||
126 | * | ||
127 | * Return conditions are same as __raw_notifier_call_chain. | ||
128 | */ | ||
129 | int cpu_pm_exit(void) | ||
130 | { | ||
131 | int ret; | ||
132 | |||
133 | read_lock(&cpu_pm_notifier_lock); | ||
134 | ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL); | ||
135 | read_unlock(&cpu_pm_notifier_lock); | ||
136 | |||
137 | return ret; | ||
138 | } | ||
139 | EXPORT_SYMBOL_GPL(cpu_pm_exit); | ||
140 | |||
141 | /** | ||
142 | * cpu_cluster_pm_enter - CPU cluster low power entry notifier | ||
143 | * | ||
144 | * Notifies listeners that all cpus in a power domain are entering a low power | ||
145 | * state that may cause some blocks in the same power domain to reset. | ||
146 | * | ||
147 | * Must be called after cpu_pm_enter has been called on all cpus in the power | ||
148 | * domain, and before cpu_pm_exit has been called on any cpu in the power | ||
149 | * domain. Notified drivers can include VFP co-processor, interrupt controller | ||
150 | * and its PM extensions, local CPU timers context save/restore which | ||
151 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | ||
152 | * | ||
153 | * Must be called with interrupts disabled. | ||
154 | * | ||
155 | * Return conditions are same as __raw_notifier_call_chain. | ||
156 | */ | ||
157 | int cpu_cluster_pm_enter(void) | ||
158 | { | ||
159 | int nr_calls; | ||
160 | int ret = 0; | ||
161 | |||
162 | read_lock(&cpu_pm_notifier_lock); | ||
163 | ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls); | ||
164 | if (ret) | ||
165 | /* | ||
166 | * Inform the listeners (nr_calls - 1) that were notified earlier | ||
167 | * that CPU cluster PM entry failed, so they can undo their preparations. | ||
168 | */ | ||
169 | cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL); | ||
170 | read_unlock(&cpu_pm_notifier_lock); | ||
171 | |||
172 | return ret; | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter); | ||
175 | |||
176 | /** | ||
177 | * cpu_cluster_pm_exit - CPU cluster low power exit notifier | ||
178 | * | ||
179 | * Notifies listeners that all cpus in a power domain are exiting from a | ||
180 | * low power state that may have caused some blocks in the same power domain | ||
181 | * to reset. | ||
182 | * | ||
183 | * Must be called after cpu_cluster_pm_enter has been called for the power | ||
184 | * domain, and before cpu_pm_exit has been called on any cpu in the power | ||
185 | * domain. Notified drivers can include VFP co-processor, interrupt controller | ||
186 | * and its PM extensions, local CPU timers context save/restore which | ||
187 | * shouldn't be interrupted. Hence it must be called with interrupts disabled. | ||
188 | * | ||
189 | * Return conditions are same as __raw_notifier_call_chain. | ||
190 | */ | ||
191 | int cpu_cluster_pm_exit(void) | ||
192 | { | ||
193 | int ret; | ||
194 | |||
195 | read_lock(&cpu_pm_notifier_lock); | ||
196 | ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL); | ||
197 | read_unlock(&cpu_pm_notifier_lock); | ||
198 | |||
199 | return ret; | ||
200 | } | ||
201 | EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit); | ||
202 | |||
203 | #ifdef CONFIG_PM | ||
204 | static int cpu_pm_suspend(void) | ||
205 | { | ||
206 | int ret; | ||
207 | |||
208 | ret = cpu_pm_enter(); | ||
209 | if (ret) | ||
210 | return ret; | ||
211 | |||
212 | ret = cpu_cluster_pm_enter(); | ||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | static void cpu_pm_resume(void) | ||
217 | { | ||
218 | cpu_cluster_pm_exit(); | ||
219 | cpu_pm_exit(); | ||
220 | } | ||
221 | |||
222 | static struct syscore_ops cpu_pm_syscore_ops = { | ||
223 | .suspend = cpu_pm_suspend, | ||
224 | .resume = cpu_pm_resume, | ||
225 | }; | ||
226 | |||
227 | static int cpu_pm_init(void) | ||
228 | { | ||
229 | register_syscore_ops(&cpu_pm_syscore_ops); | ||
230 | return 0; | ||
231 | } | ||
232 | core_initcall(cpu_pm_init); | ||
233 | #endif | ||
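
kernel/cpu_pm.c above only provides the notifier plumbing; the consumers are platform drivers (VFP, interrupt controller, per-CPU timers and the like). A sketch of how such a driver might use it, where the context save/restore helpers are hypothetical:

#include <linux/cpu_pm.h>
#include <linux/notifier.h>
#include <linux/init.h>

static int example_cpu_pm_notify(struct notifier_block *nb,
				 unsigned long cmd, void *v)
{
	switch (cmd) {
	case CPU_PM_ENTER:
		/* example_save_context(); -- hypothetical helper */
		break;
	case CPU_PM_ENTER_FAILED:
	case CPU_PM_EXIT:
		/* example_restore_context(); -- hypothetical helper */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
	.notifier_call = example_cpu_pm_notify,
};

static int __init example_cpu_pm_init(void)
{
	return cpu_pm_register_notifier(&example_cpu_pm_nb);
}
arch_initcall(example_cpu_pm_init);
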
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10131fdaff70..a09ac2b9a661 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -37,7 +37,7 @@ | |||
37 | #include <linux/mempolicy.h> | 37 | #include <linux/mempolicy.h> |
38 | #include <linux/mm.h> | 38 | #include <linux/mm.h> |
39 | #include <linux/memory.h> | 39 | #include <linux/memory.h> |
40 | #include <linux/module.h> | 40 | #include <linux/export.h> |
41 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
42 | #include <linux/namei.h> | 42 | #include <linux/namei.h> |
43 | #include <linux/pagemap.h> | 43 | #include <linux/pagemap.h> |
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
123 | struct cpuset, css); | 123 | struct cpuset, css); |
124 | } | 124 | } |
125 | 125 | ||
126 | #ifdef CONFIG_NUMA | ||
127 | static inline bool task_has_mempolicy(struct task_struct *task) | ||
128 | { | ||
129 | return task->mempolicy; | ||
130 | } | ||
131 | #else | ||
132 | static inline bool task_has_mempolicy(struct task_struct *task) | ||
133 | { | ||
134 | return false; | ||
135 | } | ||
136 | #endif | ||
137 | |||
138 | |||
126 | /* bits in struct cpuset flags field */ | 139 | /* bits in struct cpuset flags field */ |
127 | typedef enum { | 140 | typedef enum { |
128 | CS_CPU_EXCLUSIVE, | 141 | CS_CPU_EXCLUSIVE, |
@@ -949,6 +962,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
949 | static void cpuset_change_task_nodemask(struct task_struct *tsk, | 962 | static void cpuset_change_task_nodemask(struct task_struct *tsk, |
950 | nodemask_t *newmems) | 963 | nodemask_t *newmems) |
951 | { | 964 | { |
965 | bool need_loop; | ||
966 | |||
952 | repeat: | 967 | repeat: |
953 | /* | 968 | /* |
954 | * Allow tasks that have access to memory reserves because they have | 969 | * Allow tasks that have access to memory reserves because they have |
@@ -960,10 +975,17 @@ repeat: | |||
960 | return; | 975 | return; |
961 | 976 | ||
962 | task_lock(tsk); | 977 | task_lock(tsk); |
978 | /* | ||
979 | * Determine if a loop is necessary if another thread is doing | ||
980 | * get_mems_allowed(). If at least one node remains unchanged and | ||
981 | * tsk does not have a mempolicy, then an empty nodemask will not be | ||
982 | * possible when mems_allowed is larger than a word. | ||
983 | */ | ||
984 | need_loop = task_has_mempolicy(tsk) || | ||
985 | !nodes_intersects(*newmems, tsk->mems_allowed); | ||
963 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); | 986 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
964 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); | 987 | mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); |
965 | 988 | ||
966 | |||
967 | /* | 989 | /* |
968 | * ensure checking ->mems_allowed_change_disable after setting all new | 990 | * ensure checking ->mems_allowed_change_disable after setting all new |
969 | * allowed nodes. | 991 | * allowed nodes. |
@@ -982,7 +1004,7 @@ repeat: | |||
982 | * Allocation of memory is very fast, we needn't sleep when waiting | 1004 | * Allocation of memory is very fast, we needn't sleep when waiting |
983 | * for the read-side. | 1005 | * for the read-side. |
984 | */ | 1006 | */ |
985 | while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { | 1007 | while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { |
986 | task_unlock(tsk); | 1008 | task_unlock(tsk); |
987 | if (!task_curr(tsk)) | 1009 | if (!task_curr(tsk)) |
988 | yield(); | 1010 | yield(); |
@@ -1367,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1367 | return val; | 1389 | return val; |
1368 | } | 1390 | } |
1369 | 1391 | ||
1370 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | ||
1371 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | ||
1372 | struct task_struct *tsk) | ||
1373 | { | ||
1374 | struct cpuset *cs = cgroup_cs(cont); | ||
1375 | |||
1376 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
1377 | return -ENOSPC; | ||
1378 | |||
1379 | /* | ||
1380 | * Kthreads bound to specific cpus cannot be moved to a new cpuset; we | ||
1381 | * cannot change their cpu affinity and isolating such threads by their | ||
1382 | * set of allowed nodes is unnecessary. Thus, cpusets are not | ||
1383 | * applicable for such threads. This prevents checking for success of | ||
1384 | * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may | ||
1385 | * be changed. | ||
1386 | */ | ||
1387 | if (tsk->flags & PF_THREAD_BOUND) | ||
1388 | return -EINVAL; | ||
1389 | |||
1390 | return 0; | ||
1391 | } | ||
1392 | |||
1393 | static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) | ||
1394 | { | ||
1395 | return security_task_setscheduler(task); | ||
1396 | } | ||
1397 | |||
1398 | /* | 1392 | /* |
1399 | * Protected by cgroup_lock. The nodemasks must be stored globally because | 1393 | * Protected by cgroup_lock. The nodemasks must be stored globally because |
1400 | * dynamically allocating them is not allowed in pre_attach, and they must | 1394 | * dynamically allocating them is not allowed in can_attach, and they must |
1401 | * persist among pre_attach, attach_task, and attach. | 1395 | * persist until attach. |
1402 | */ | 1396 | */ |
1403 | static cpumask_var_t cpus_attach; | 1397 | static cpumask_var_t cpus_attach; |
1404 | static nodemask_t cpuset_attach_nodemask_from; | 1398 | static nodemask_t cpuset_attach_nodemask_from; |
1405 | static nodemask_t cpuset_attach_nodemask_to; | 1399 | static nodemask_t cpuset_attach_nodemask_to; |
1406 | 1400 | ||
1407 | /* Set-up work for before attaching each task. */ | 1401 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ |
1408 | static void cpuset_pre_attach(struct cgroup *cont) | 1402 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
1403 | struct cgroup_taskset *tset) | ||
1409 | { | 1404 | { |
1410 | struct cpuset *cs = cgroup_cs(cont); | 1405 | struct cpuset *cs = cgroup_cs(cgrp); |
1406 | struct task_struct *task; | ||
1407 | int ret; | ||
1408 | |||
1409 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
1410 | return -ENOSPC; | ||
1411 | |||
1412 | cgroup_taskset_for_each(task, cgrp, tset) { | ||
1413 | /* | ||
1414 | * Kthreads bound to specific cpus cannot be moved to a new | ||
1415 | * cpuset; we cannot change their cpu affinity and | ||
1416 | * isolating such threads by their set of allowed nodes is | ||
1417 | * unnecessary. Thus, cpusets are not applicable for such | ||
1418 | * threads. This prevents checking for success of | ||
1419 | * set_cpus_allowed_ptr() on all attached tasks before | ||
1420 | * cpus_allowed may be changed. | ||
1421 | */ | ||
1422 | if (task->flags & PF_THREAD_BOUND) | ||
1423 | return -EINVAL; | ||
1424 | if ((ret = security_task_setscheduler(task))) | ||
1425 | return ret; | ||
1426 | } | ||
1411 | 1427 | ||
1428 | /* prepare for attach */ | ||
1412 | if (cs == &top_cpuset) | 1429 | if (cs == &top_cpuset) |
1413 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1430 | cpumask_copy(cpus_attach, cpu_possible_mask); |
1414 | else | 1431 | else |
1415 | guarantee_online_cpus(cs, cpus_attach); | 1432 | guarantee_online_cpus(cs, cpus_attach); |
1416 | 1433 | ||
1417 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 1434 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); |
1418 | } | ||
1419 | 1435 | ||
1420 | /* Per-thread attachment work. */ | 1436 | return 0; |
1421 | static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) | ||
1422 | { | ||
1423 | int err; | ||
1424 | struct cpuset *cs = cgroup_cs(cont); | ||
1425 | |||
1426 | /* | ||
1427 | * can_attach beforehand should guarantee that this doesn't fail. | ||
1428 | * TODO: have a better way to handle failure here | ||
1429 | */ | ||
1430 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | ||
1431 | WARN_ON_ONCE(err); | ||
1432 | |||
1433 | cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); | ||
1434 | cpuset_update_task_spread_flag(cs, tsk); | ||
1435 | } | 1437 | } |
1436 | 1438 | ||
1437 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | 1439 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
1438 | struct cgroup *oldcont, struct task_struct *tsk) | 1440 | struct cgroup_taskset *tset) |
1439 | { | 1441 | { |
1440 | struct mm_struct *mm; | 1442 | struct mm_struct *mm; |
1441 | struct cpuset *cs = cgroup_cs(cont); | 1443 | struct task_struct *task; |
1442 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1444 | struct task_struct *leader = cgroup_taskset_first(tset); |
1445 | struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); | ||
1446 | struct cpuset *cs = cgroup_cs(cgrp); | ||
1447 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | ||
1448 | |||
1449 | cgroup_taskset_for_each(task, cgrp, tset) { | ||
1450 | /* | ||
1451 | * can_attach beforehand should guarantee that this doesn't | ||
1452 | * fail. TODO: have a better way to handle failure here | ||
1453 | */ | ||
1454 | WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); | ||
1455 | |||
1456 | cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); | ||
1457 | cpuset_update_task_spread_flag(cs, task); | ||
1458 | } | ||
1443 | 1459 | ||
1444 | /* | 1460 | /* |
1445 | * Change mm, possibly for multiple threads in a threadgroup. This is | 1461 | * Change mm, possibly for multiple threads in a threadgroup. This is |
@@ -1447,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
1447 | */ | 1463 | */ |
1448 | cpuset_attach_nodemask_from = oldcs->mems_allowed; | 1464 | cpuset_attach_nodemask_from = oldcs->mems_allowed; |
1449 | cpuset_attach_nodemask_to = cs->mems_allowed; | 1465 | cpuset_attach_nodemask_to = cs->mems_allowed; |
1450 | mm = get_task_mm(tsk); | 1466 | mm = get_task_mm(leader); |
1451 | if (mm) { | 1467 | if (mm) { |
1452 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); | 1468 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
1453 | if (is_memory_migrate(cs)) | 1469 | if (is_memory_migrate(cs)) |
@@ -1903,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = { | |||
1903 | .create = cpuset_create, | 1919 | .create = cpuset_create, |
1904 | .destroy = cpuset_destroy, | 1920 | .destroy = cpuset_destroy, |
1905 | .can_attach = cpuset_can_attach, | 1921 | .can_attach = cpuset_can_attach, |
1906 | .can_attach_task = cpuset_can_attach_task, | ||
1907 | .pre_attach = cpuset_pre_attach, | ||
1908 | .attach_task = cpuset_attach_task, | ||
1909 | .attach = cpuset_attach, | 1922 | .attach = cpuset_attach, |
1910 | .populate = cpuset_populate, | 1923 | .populate = cpuset_populate, |
1911 | .post_clone = cpuset_post_clone, | 1924 | .post_clone = cpuset_post_clone, |
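
The cpuset_attach() conversion shows the other half of the new interface: ->attach() recovers the group leader and the source cgroup from the taskset instead of receiving them as arguments. A reduced sketch with an invented subsystem:

static void example_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
			   struct cgroup_taskset *tset)
{
	struct task_struct *leader = cgroup_taskset_first(tset);
	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
	struct task_struct *task;

	/* per-task work that used to live in ->attach_task() */
	cgroup_taskset_for_each(task, cgrp, tset)
		pr_debug("example: moving pid %d\n", task_pid_nr(task));

	/* group-wide work that only needs the leader and the old cgroup */
	pr_debug("example: leader pid %d left cgroup %p\n",
		 task_pid_nr(leader), oldcgrp);
}
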
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 5f85690285d4..c766ee54c0b1 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c | |||
@@ -2,7 +2,7 @@ | |||
2 | #include <linux/crash_dump.h> | 2 | #include <linux/crash_dump.h> |
3 | #include <linux/init.h> | 3 | #include <linux/init.h> |
4 | #include <linux/errno.h> | 4 | #include <linux/errno.h> |
5 | #include <linux/module.h> | 5 | #include <linux/export.h> |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * If we have booted due to a crash, max_pfn will be a very low value. We need | 8 | * If we have booted due to a crash, max_pfn will be a very low value. We need |
@@ -20,8 +20,15 @@ unsigned long saved_max_pfn; | |||
20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | 20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; |
21 | 21 | ||
22 | /* | 22 | /* |
23 | * stores the size of elf header of crash image | ||
24 | */ | ||
25 | unsigned long long elfcorehdr_size; | ||
26 | |||
27 | /* | ||
23 | * elfcorehdr= specifies the location of elf core header stored by the crashed | 28 | * elfcorehdr= specifies the location of elf core header stored by the crashed |
24 | * kernel. This option will be passed by kexec loader to the capture kernel. | 29 | * kernel. This option will be passed by kexec loader to the capture kernel. |
30 | * | ||
31 | * Syntax: elfcorehdr=[size[KMG]@]offset[KMG] | ||
25 | */ | 32 | */ |
26 | static int __init setup_elfcorehdr(char *arg) | 33 | static int __init setup_elfcorehdr(char *arg) |
27 | { | 34 | { |
@@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg) | |||
29 | if (!arg) | 36 | if (!arg) |
30 | return -EINVAL; | 37 | return -EINVAL; |
31 | elfcorehdr_addr = memparse(arg, &end); | 38 | elfcorehdr_addr = memparse(arg, &end); |
39 | if (*end == '@') { | ||
40 | elfcorehdr_size = elfcorehdr_addr; | ||
41 | elfcorehdr_addr = memparse(end + 1, &end); | ||
42 | } | ||
32 | return end > arg ? 0 : -EINVAL; | 43 | return end > arg ? 0 : -EINVAL; |
33 | } | 44 | } |
34 | early_param("elfcorehdr", setup_elfcorehdr); | 45 | early_param("elfcorehdr", setup_elfcorehdr); |
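The crash_dump.c change above extends the elfcorehdr= parameter to the syntax documented in the new comment, elfcorehdr=[size[KMG]@]offset[KMG]: memparse() consumes one number plus an optional K/M/G suffix and leaves *end pointing at the next character, so a trailing '@' means the first number was actually the size. A standalone, hedged re-creation of that parse (my_memparse is a simplified stand-in for the kernel helper, not the real implementation):

#include <stdio.h>
#include <stdlib.h>

static unsigned long long my_memparse(const char *p, char **end)
{
	unsigned long long v = strtoull(p, end, 0);

	switch (**end) {		/* optional binary suffix */
	case 'G': v <<= 10;		/* fall through */
	case 'M': v <<= 10;		/* fall through */
	case 'K': v <<= 10; (*end)++; break;
	}
	return v;
}

int main(void)
{
	const char *arg = "64K@0x2000000";	/* hypothetical elfcorehdr= value */
	char *end;
	unsigned long long addr, size = 0;

	addr = my_memparse(arg, &end);
	if (*end == '@') {			/* the size came first */
		size = addr;
		addr = my_memparse(end + 1, &end);
	}
	printf("elfcorehdr: addr=%#llx size=%llu\n", addr, size);
	return 0;
}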
diff --git a/kernel/cred.c b/kernel/cred.c index 8ef31f53c44c..5791612a4045 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
@@ -8,7 +8,7 @@ | |||
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the Licence, or (at your option) any later version. | 9 | * 2 of the Licence, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | #include <linux/module.h> | 11 | #include <linux/export.h> |
12 | #include <linux/cred.h> | 12 | #include <linux/cred.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
@@ -644,6 +644,9 @@ void __init cred_init(void) | |||
644 | */ | 644 | */ |
645 | struct cred *prepare_kernel_cred(struct task_struct *daemon) | 645 | struct cred *prepare_kernel_cred(struct task_struct *daemon) |
646 | { | 646 | { |
647 | #ifdef CONFIG_KEYS | ||
648 | struct thread_group_cred *tgcred; | ||
649 | #endif | ||
647 | const struct cred *old; | 650 | const struct cred *old; |
648 | struct cred *new; | 651 | struct cred *new; |
649 | 652 | ||
@@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
651 | if (!new) | 654 | if (!new) |
652 | return NULL; | 655 | return NULL; |
653 | 656 | ||
657 | #ifdef CONFIG_KEYS | ||
658 | tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); | ||
659 | if (!tgcred) { | ||
660 | kmem_cache_free(cred_jar, new); | ||
661 | return NULL; | ||
662 | } | ||
663 | #endif | ||
664 | |||
654 | kdebug("prepare_kernel_cred() alloc %p", new); | 665 | kdebug("prepare_kernel_cred() alloc %p", new); |
655 | 666 | ||
656 | if (daemon) | 667 | if (daemon) |
@@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) | |||
667 | get_group_info(new->group_info); | 678 | get_group_info(new->group_info); |
668 | 679 | ||
669 | #ifdef CONFIG_KEYS | 680 | #ifdef CONFIG_KEYS |
670 | atomic_inc(&init_tgcred.usage); | 681 | atomic_set(&tgcred->usage, 1); |
671 | new->tgcred = &init_tgcred; | 682 | spin_lock_init(&tgcred->lock); |
683 | tgcred->process_keyring = NULL; | ||
684 | tgcred->session_keyring = NULL; | ||
685 | new->tgcred = tgcred; | ||
672 | new->request_key_auth = NULL; | 686 | new->request_key_auth = NULL; |
673 | new->thread_keyring = NULL; | 687 | new->thread_keyring = NULL; |
674 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; | 688 | new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; |
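With the cred.c change above, prepare_kernel_cred() no longer shares the static init_tgcred but allocates a fresh thread_group_cred per call when CONFIG_KEYS is set. The usual calling pattern around it is unchanged; a hedged kernel-side sketch, with minimal error handling and a made-up function name:

#include <linux/cred.h>
#include <linux/errno.h>

static int do_work_as_kernel(void)
{
	struct cred *kcred;
	const struct cred *old;

	kcred = prepare_kernel_cred(NULL);	/* clone of init_cred, now with its own tgcred */
	if (!kcred)
		return -ENOMEM;

	old = override_creds(kcred);		/* temporarily assume kernel credentials */
	/* ... privileged work, e.g. opening a file on the kernel's behalf ... */
	revert_creds(old);			/* restore the caller's credentials */

	put_cred(kcred);			/* drop the reference taken at allocation */
	return 0;
}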
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 34872482315e..c22d8c28ad84 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len) | |||
217 | 217 | ||
218 | /* Pack in hex chars */ | 218 | /* Pack in hex chars */ |
219 | for (i = 0; i < wcount; i++) | 219 | for (i = 0; i < wcount; i++) |
220 | bufptr = pack_hex_byte(bufptr, s[i]); | 220 | bufptr = hex_byte_pack(bufptr, s[i]); |
221 | *bufptr = '\0'; | 221 | *bufptr = '\0'; |
222 | 222 | ||
223 | /* Move up */ | 223 | /* Move up */ |
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count) | |||
249 | if (err) | 249 | if (err) |
250 | return NULL; | 250 | return NULL; |
251 | while (count > 0) { | 251 | while (count > 0) { |
252 | buf = pack_hex_byte(buf, *tmp); | 252 | buf = hex_byte_pack(buf, *tmp); |
253 | tmp++; | 253 | tmp++; |
254 | count--; | 254 | count--; |
255 | } | 255 | } |
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id) | |||
411 | limit = id + (BUF_THREAD_ID_SIZE / 2); | 411 | limit = id + (BUF_THREAD_ID_SIZE / 2); |
412 | while (id < limit) { | 412 | while (id < limit) { |
413 | if (!lzero || *id != 0) { | 413 | if (!lzero || *id != 0) { |
414 | pkt = pack_hex_byte(pkt, *id); | 414 | pkt = hex_byte_pack(pkt, *id); |
415 | lzero = 0; | 415 | lzero = 0; |
416 | } | 416 | } |
417 | id++; | 417 | id++; |
418 | } | 418 | } |
419 | 419 | ||
420 | if (lzero) | 420 | if (lzero) |
421 | pkt = pack_hex_byte(pkt, 0); | 421 | pkt = hex_byte_pack(pkt, 0); |
422 | 422 | ||
423 | return pkt; | 423 | return pkt; |
424 | } | 424 | } |
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) | |||
486 | dbg_remove_all_break(); | 486 | dbg_remove_all_break(); |
487 | 487 | ||
488 | remcom_out_buffer[0] = 'S'; | 488 | remcom_out_buffer[0] = 'S'; |
489 | pack_hex_byte(&remcom_out_buffer[1], ks->signo); | 489 | hex_byte_pack(&remcom_out_buffer[1], ks->signo); |
490 | } | 490 | } |
491 | 491 | ||
492 | static void gdb_get_regs_helper(struct kgdb_state *ks) | 492 | static void gdb_get_regs_helper(struct kgdb_state *ks) |
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
954 | /* Reply to host that an exception has occurred */ | 954 | /* Reply to host that an exception has occurred */ |
955 | ptr = remcom_out_buffer; | 955 | ptr = remcom_out_buffer; |
956 | *ptr++ = 'T'; | 956 | *ptr++ = 'T'; |
957 | ptr = pack_hex_byte(ptr, ks->signo); | 957 | ptr = hex_byte_pack(ptr, ks->signo); |
958 | ptr += strlen(strcpy(ptr, "thread:")); | 958 | ptr += strlen(strcpy(ptr, "thread:")); |
959 | int_to_threadref(thref, shadow_pid(current->pid)); | 959 | int_to_threadref(thref, shadow_pid(current->pid)); |
960 | ptr = pack_threadid(ptr, thref); | 960 | ptr = pack_threadid(ptr, thref); |
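The gdbstub.c hunks above are a rename from pack_hex_byte() to hex_byte_pack(); the helper writes the high nibble, then the low nibble, as lowercase ASCII hex and returns the advanced buffer pointer, which is what lets the stub chain calls in a loop and terminate with a single NUL. A standalone, hedged illustration of the same behaviour (userspace, not the kernel definition):

#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

static char *hex_byte_pack(char *buf, unsigned char byte)
{
	*buf++ = hex_asc[byte >> 4];	/* high nibble first */
	*buf++ = hex_asc[byte & 0x0f];	/* then low nibble */
	return buf;
}

int main(void)
{
	const unsigned char msg[] = { 0x4f, 0x4b };	/* "OK" */
	char out[2 * sizeof(msg) + 1], *p = out;
	size_t i;

	for (i = 0; i < sizeof(msg); i++)
		p = hex_byte_pack(p, msg[i]);
	*p = '\0';
	printf("%s\n", out);	/* prints "4f4b" */
	return 0;
}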
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index d9ca9aa481ec..8b68ce78ff17 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/kgdb.h> | 11 | #include <linux/kgdb.h> |
12 | #include <linux/kdb.h> | 12 | #include <linux/kdb.h> |
13 | #include <linux/kdebug.h> | 13 | #include <linux/kdebug.h> |
14 | #include <linux/export.h> | ||
14 | #include "kdb_private.h" | 15 | #include "kdb_private.h" |
15 | #include "../debug_core.h" | 16 | #include "../debug_core.h" |
16 | 17 | ||
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 63786e71a3cd..e2ae7349437f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv) | |||
1982 | kdb_printf("%-20s%8u 0x%p ", mod->name, | 1982 | kdb_printf("%-20s%8u 0x%p ", mod->name, |
1983 | mod->core_size, (void *)mod); | 1983 | mod->core_size, (void *)mod); |
1984 | #ifdef CONFIG_MODULE_UNLOAD | 1984 | #ifdef CONFIG_MODULE_UNLOAD |
1985 | kdb_printf("%4d ", module_refcount(mod)); | 1985 | kdb_printf("%4ld ", module_refcount(mod)); |
1986 | #endif | 1986 | #endif |
1987 | if (mod->state == MODULE_STATE_GOING) | 1987 | if (mod->state == MODULE_STATE_GOING) |
1988 | kdb_printf(" (Unloading)"); | 1988 | kdb_printf(" (Unloading)"); |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 5532dd37aa86..7d6fb40d2188 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p) | |||
636 | (p->exit_state & EXIT_ZOMBIE) ? 'Z' : | 636 | (p->exit_state & EXIT_ZOMBIE) ? 'Z' : |
637 | (p->exit_state & EXIT_DEAD) ? 'E' : | 637 | (p->exit_state & EXIT_DEAD) ? 'E' : |
638 | (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; | 638 | (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; |
639 | if (p->pid == 0) { | 639 | if (is_idle_task(p)) { |
640 | /* Idle task. Is it really idle, apart from the kdb | 640 | /* Idle task. Is it really idle, apart from the kdb |
641 | * interrupt? */ | 641 | * interrupt? */ |
642 | if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { | 642 | if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { |
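kdb_task_state_char() above stops open-coding the PID test and uses is_idle_task() instead. For reference, a hedged sketch of what that helper amounts to at this point in time (the authoritative definition lives in <linux/sched.h> and may differ in detail):

static inline bool is_idle_task(const struct task_struct *p)
{
	/* the per-CPU idle ("swapper") threads are the only tasks with PID 0 */
	return p->pid == 0;
}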
diff --git a/kernel/dma.c b/kernel/dma.c index f903189c5304..68a2306522c8 100644 --- a/kernel/dma.c +++ b/kernel/dma.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * [It also happened to remove the sizeof(char *) == sizeof(int) | 9 | * [It also happened to remove the sizeof(char *) == sizeof(int) |
10 | * assumption introduced because of those /proc/dma patches. -- Hennus] | 10 | * assumption introduced because of those /proc/dma patches. -- Hennus] |
11 | */ | 11 | */ |
12 | #include <linux/module.h> | 12 | #include <linux/export.h> |
13 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
14 | #include <linux/errno.h> | 14 | #include <linux/errno.h> |
15 | #include <linux/spinlock.h> | 15 | #include <linux/spinlock.h> |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 89e5e8aa4c36..22d901f9caf4 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile | |||
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER | |||
2 | CFLAGS_REMOVE_core.o = -pg | 2 | CFLAGS_REMOVE_core.o = -pg |
3 | endif | 3 | endif |
4 | 4 | ||
5 | obj-y := core.o ring_buffer.o | 5 | obj-y := core.o ring_buffer.o callchain.o |
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c new file mode 100644 index 000000000000..057e24b665cf --- /dev/null +++ b/kernel/events/callchain.c | |||
@@ -0,0 +1,191 @@ | |||
1 | /* | ||
2 | * Performance events callchain code, extracted from core.c: | ||
3 | * | ||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||

8 | * | ||
9 | * For licensing details see kernel-base/COPYING | ||
10 | */ | ||
11 | |||
12 | #include <linux/perf_event.h> | ||
13 | #include <linux/slab.h> | ||
14 | #include "internal.h" | ||
15 | |||
16 | struct callchain_cpus_entries { | ||
17 | struct rcu_head rcu_head; | ||
18 | struct perf_callchain_entry *cpu_entries[0]; | ||
19 | }; | ||
20 | |||
21 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
22 | static atomic_t nr_callchain_events; | ||
23 | static DEFINE_MUTEX(callchain_mutex); | ||
24 | static struct callchain_cpus_entries *callchain_cpus_entries; | ||
25 | |||
26 | |||
27 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
28 | struct pt_regs *regs) | ||
29 | { | ||
30 | } | ||
31 | |||
32 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
33 | struct pt_regs *regs) | ||
34 | { | ||
35 | } | ||
36 | |||
37 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
38 | { | ||
39 | struct callchain_cpus_entries *entries; | ||
40 | int cpu; | ||
41 | |||
42 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
43 | |||
44 | for_each_possible_cpu(cpu) | ||
45 | kfree(entries->cpu_entries[cpu]); | ||
46 | |||
47 | kfree(entries); | ||
48 | } | ||
49 | |||
50 | static void release_callchain_buffers(void) | ||
51 | { | ||
52 | struct callchain_cpus_entries *entries; | ||
53 | |||
54 | entries = callchain_cpus_entries; | ||
55 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
56 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
57 | } | ||
58 | |||
59 | static int alloc_callchain_buffers(void) | ||
60 | { | ||
61 | int cpu; | ||
62 | int size; | ||
63 | struct callchain_cpus_entries *entries; | ||
64 | |||
65 | /* | ||
66 | * We can't use the percpu allocation API for data that can be | ||
67 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
68 | * until that gets sorted out. | ||
69 | */ | ||
70 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); | ||
71 | |||
72 | entries = kzalloc(size, GFP_KERNEL); | ||
73 | if (!entries) | ||
74 | return -ENOMEM; | ||
75 | |||
76 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
77 | |||
78 | for_each_possible_cpu(cpu) { | ||
79 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
80 | cpu_to_node(cpu)); | ||
81 | if (!entries->cpu_entries[cpu]) | ||
82 | goto fail; | ||
83 | } | ||
84 | |||
85 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
86 | |||
87 | return 0; | ||
88 | |||
89 | fail: | ||
90 | for_each_possible_cpu(cpu) | ||
91 | kfree(entries->cpu_entries[cpu]); | ||
92 | kfree(entries); | ||
93 | |||
94 | return -ENOMEM; | ||
95 | } | ||
96 | |||
97 | int get_callchain_buffers(void) | ||
98 | { | ||
99 | int err = 0; | ||
100 | int count; | ||
101 | |||
102 | mutex_lock(&callchain_mutex); | ||
103 | |||
104 | count = atomic_inc_return(&nr_callchain_events); | ||
105 | if (WARN_ON_ONCE(count < 1)) { | ||
106 | err = -EINVAL; | ||
107 | goto exit; | ||
108 | } | ||
109 | |||
110 | if (count > 1) { | ||
111 | /* If the allocation failed, give up */ | ||
112 | if (!callchain_cpus_entries) | ||
113 | err = -ENOMEM; | ||
114 | goto exit; | ||
115 | } | ||
116 | |||
117 | err = alloc_callchain_buffers(); | ||
118 | if (err) | ||
119 | release_callchain_buffers(); | ||
120 | exit: | ||
121 | mutex_unlock(&callchain_mutex); | ||
122 | |||
123 | return err; | ||
124 | } | ||
125 | |||
126 | void put_callchain_buffers(void) | ||
127 | { | ||
128 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
129 | release_callchain_buffers(); | ||
130 | mutex_unlock(&callchain_mutex); | ||
131 | } | ||
132 | } | ||
133 | |||
134 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
135 | { | ||
136 | int cpu; | ||
137 | struct callchain_cpus_entries *entries; | ||
138 | |||
139 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
140 | if (*rctx == -1) | ||
141 | return NULL; | ||
142 | |||
143 | entries = rcu_dereference(callchain_cpus_entries); | ||
144 | if (!entries) | ||
145 | return NULL; | ||
146 | |||
147 | cpu = smp_processor_id(); | ||
148 | |||
149 | return &entries->cpu_entries[cpu][*rctx]; | ||
150 | } | ||
151 | |||
152 | static void | ||
153 | put_callchain_entry(int rctx) | ||
154 | { | ||
155 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
156 | } | ||
157 | |||
158 | struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
159 | { | ||
160 | int rctx; | ||
161 | struct perf_callchain_entry *entry; | ||
162 | |||
163 | |||
164 | entry = get_callchain_entry(&rctx); | ||
165 | if (rctx == -1) | ||
166 | return NULL; | ||
167 | |||
168 | if (!entry) | ||
169 | goto exit_put; | ||
170 | |||
171 | entry->nr = 0; | ||
172 | |||
173 | if (!user_mode(regs)) { | ||
174 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
175 | perf_callchain_kernel(entry, regs); | ||
176 | if (current->mm) | ||
177 | regs = task_pt_regs(current); | ||
178 | else | ||
179 | regs = NULL; | ||
180 | } | ||
181 | |||
182 | if (regs) { | ||
183 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
184 | perf_callchain_user(entry, regs); | ||
185 | } | ||
186 | |||
187 | exit_put: | ||
188 | put_callchain_entry(rctx); | ||
189 | |||
190 | return entry; | ||
191 | } | ||
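The new callchain.c keeps the per-cpu callchain buffers behind a simple refcount: the first caller of get_callchain_buffers() allocates them, the last put_callchain_buffers() frees them via RCU, and perf_callchain() is what samplers call with the recursion guard held. A hedged sketch (not from this diff) of how the event setup and teardown paths are expected to pair these helpers; the my_* names are invented:

#include <linux/perf_event.h>
#include "internal.h"

static bool my_event_wants_callchain(struct perf_event *event)
{
	return !!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN);
}

static int my_event_init_callchain(struct perf_event *event)
{
	if (!my_event_wants_callchain(event))
		return 0;
	return get_callchain_buffers();		/* refcounted; -ENOMEM if allocation failed */
}

static void my_event_free_callchain(struct perf_event *event)
{
	if (my_event_wants_callchain(event))
		put_callchain_buffers();	/* releases the buffers when the last user goes away */
}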
diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f857782d06f..a8f4ac001a00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * | 8 | * |
9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
10 | */ | 10 | */ |
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/reboot.h> | 25 | #include <linux/reboot.h> |
26 | #include <linux/vmstat.h> | 26 | #include <linux/vmstat.h> |
27 | #include <linux/device.h> | 27 | #include <linux/device.h> |
28 | #include <linux/export.h> | ||
28 | #include <linux/vmalloc.h> | 29 | #include <linux/vmalloc.h> |
29 | #include <linux/hardirq.h> | 30 | #include <linux/hardirq.h> |
30 | #include <linux/rculist.h> | 31 | #include <linux/rculist.h> |
@@ -127,7 +128,7 @@ enum event_type_t { | |||
127 | * perf_sched_events : >0 events exist | 128 | * perf_sched_events : >0 events exist |
128 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | 129 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu |
129 | */ | 130 | */ |
130 | struct jump_label_key perf_sched_events __read_mostly; | 131 | struct jump_label_key_deferred perf_sched_events __read_mostly; |
131 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 132 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
132 | 133 | ||
133 | static atomic_t nr_mmap_events __read_mostly; | 134 | static atomic_t nr_mmap_events __read_mostly; |
@@ -184,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
184 | static void update_context_time(struct perf_event_context *ctx); | 185 | static void update_context_time(struct perf_event_context *ctx); |
185 | static u64 perf_event_time(struct perf_event *event); | 186 | static u64 perf_event_time(struct perf_event *event); |
186 | 187 | ||
188 | static void ring_buffer_attach(struct perf_event *event, | ||
189 | struct ring_buffer *rb); | ||
190 | |||
187 | void __weak perf_event_print_debug(void) { } | 191 | void __weak perf_event_print_debug(void) { } |
188 | 192 | ||
189 | extern __weak const char *perf_pmu_name(void) | 193 | extern __weak const char *perf_pmu_name(void) |
@@ -1126,6 +1130,8 @@ event_sched_out(struct perf_event *event, | |||
1126 | if (!is_software_event(event)) | 1130 | if (!is_software_event(event)) |
1127 | cpuctx->active_oncpu--; | 1131 | cpuctx->active_oncpu--; |
1128 | ctx->nr_active--; | 1132 | ctx->nr_active--; |
1133 | if (event->attr.freq && event->attr.sample_freq) | ||
1134 | ctx->nr_freq--; | ||
1129 | if (event->attr.exclusive || !cpuctx->active_oncpu) | 1135 | if (event->attr.exclusive || !cpuctx->active_oncpu) |
1130 | cpuctx->exclusive = 0; | 1136 | cpuctx->exclusive = 0; |
1131 | } | 1137 | } |
@@ -1321,6 +1327,7 @@ retry: | |||
1321 | } | 1327 | } |
1322 | raw_spin_unlock_irq(&ctx->lock); | 1328 | raw_spin_unlock_irq(&ctx->lock); |
1323 | } | 1329 | } |
1330 | EXPORT_SYMBOL_GPL(perf_event_disable); | ||
1324 | 1331 | ||
1325 | static void perf_set_shadow_time(struct perf_event *event, | 1332 | static void perf_set_shadow_time(struct perf_event *event, |
1326 | struct perf_event_context *ctx, | 1333 | struct perf_event_context *ctx, |
@@ -1402,6 +1409,8 @@ event_sched_in(struct perf_event *event, | |||
1402 | if (!is_software_event(event)) | 1409 | if (!is_software_event(event)) |
1403 | cpuctx->active_oncpu++; | 1410 | cpuctx->active_oncpu++; |
1404 | ctx->nr_active++; | 1411 | ctx->nr_active++; |
1412 | if (event->attr.freq && event->attr.sample_freq) | ||
1413 | ctx->nr_freq++; | ||
1405 | 1414 | ||
1406 | if (event->attr.exclusive) | 1415 | if (event->attr.exclusive) |
1407 | cpuctx->exclusive = 1; | 1416 | cpuctx->exclusive = 1; |
@@ -1658,8 +1667,7 @@ retry: | |||
1658 | * Note: this works for group members as well as group leaders | 1667 | * Note: this works for group members as well as group leaders |
1659 | * since the non-leader members' sibling_lists will be empty. | 1668 | * since the non-leader members' sibling_lists will be empty. |
1660 | */ | 1669 | */ |
1661 | static void __perf_event_mark_enabled(struct perf_event *event, | 1670 | static void __perf_event_mark_enabled(struct perf_event *event) |
1662 | struct perf_event_context *ctx) | ||
1663 | { | 1671 | { |
1664 | struct perf_event *sub; | 1672 | struct perf_event *sub; |
1665 | u64 tstamp = perf_event_time(event); | 1673 | u64 tstamp = perf_event_time(event); |
@@ -1697,7 +1705,7 @@ static int __perf_event_enable(void *info) | |||
1697 | */ | 1705 | */ |
1698 | perf_cgroup_set_timestamp(current, ctx); | 1706 | perf_cgroup_set_timestamp(current, ctx); |
1699 | 1707 | ||
1700 | __perf_event_mark_enabled(event, ctx); | 1708 | __perf_event_mark_enabled(event); |
1701 | 1709 | ||
1702 | if (!event_filter_match(event)) { | 1710 | if (!event_filter_match(event)) { |
1703 | if (is_cgroup_event(event)) | 1711 | if (is_cgroup_event(event)) |
@@ -1778,7 +1786,7 @@ void perf_event_enable(struct perf_event *event) | |||
1778 | 1786 | ||
1779 | retry: | 1787 | retry: |
1780 | if (!ctx->is_active) { | 1788 | if (!ctx->is_active) { |
1781 | __perf_event_mark_enabled(event, ctx); | 1789 | __perf_event_mark_enabled(event); |
1782 | goto out; | 1790 | goto out; |
1783 | } | 1791 | } |
1784 | 1792 | ||
@@ -1805,6 +1813,7 @@ retry: | |||
1805 | out: | 1813 | out: |
1806 | raw_spin_unlock_irq(&ctx->lock); | 1814 | raw_spin_unlock_irq(&ctx->lock); |
1807 | } | 1815 | } |
1816 | EXPORT_SYMBOL_GPL(perf_event_enable); | ||
1808 | 1817 | ||
1809 | int perf_event_refresh(struct perf_event *event, int refresh) | 1818 | int perf_event_refresh(struct perf_event *event, int refresh) |
1810 | { | 1819 | { |
@@ -2170,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
2170 | */ | 2179 | */ |
2171 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2180 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2172 | 2181 | ||
2173 | perf_event_sched_in(cpuctx, ctx, task); | 2182 | if (ctx->nr_events) |
2183 | cpuctx->task_ctx = ctx; | ||
2174 | 2184 | ||
2175 | cpuctx->task_ctx = ctx; | 2185 | perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); |
2176 | 2186 | ||
2177 | perf_pmu_enable(ctx->pmu); | 2187 | perf_pmu_enable(ctx->pmu); |
2178 | perf_ctx_unlock(cpuctx, ctx); | 2188 | perf_ctx_unlock(cpuctx, ctx); |
@@ -2322,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
2322 | u64 interrupts, now; | 2332 | u64 interrupts, now; |
2323 | s64 delta; | 2333 | s64 delta; |
2324 | 2334 | ||
2335 | if (!ctx->nr_freq) | ||
2336 | return; | ||
2337 | |||
2325 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 2338 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
2326 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2339 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
2327 | continue; | 2340 | continue; |
@@ -2377,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2377 | { | 2390 | { |
2378 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; | 2391 | u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; |
2379 | struct perf_event_context *ctx = NULL; | 2392 | struct perf_event_context *ctx = NULL; |
2380 | int rotate = 0, remove = 1; | 2393 | int rotate = 0, remove = 1, freq = 0; |
2381 | 2394 | ||
2382 | if (cpuctx->ctx.nr_events) { | 2395 | if (cpuctx->ctx.nr_events) { |
2383 | remove = 0; | 2396 | remove = 0; |
2384 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 2397 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) |
2385 | rotate = 1; | 2398 | rotate = 1; |
2399 | if (cpuctx->ctx.nr_freq) | ||
2400 | freq = 1; | ||
2386 | } | 2401 | } |
2387 | 2402 | ||
2388 | ctx = cpuctx->task_ctx; | 2403 | ctx = cpuctx->task_ctx; |
@@ -2390,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2390 | remove = 0; | 2405 | remove = 0; |
2391 | if (ctx->nr_events != ctx->nr_active) | 2406 | if (ctx->nr_events != ctx->nr_active) |
2392 | rotate = 1; | 2407 | rotate = 1; |
2408 | if (ctx->nr_freq) | ||
2409 | freq = 1; | ||
2393 | } | 2410 | } |
2394 | 2411 | ||
2412 | if (!rotate && !freq) | ||
2413 | goto done; | ||
2414 | |||
2395 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | 2415 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); |
2396 | perf_pmu_disable(cpuctx->ctx.pmu); | 2416 | perf_pmu_disable(cpuctx->ctx.pmu); |
2397 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | ||
2398 | if (ctx) | ||
2399 | perf_ctx_adjust_freq(ctx, interval); | ||
2400 | 2417 | ||
2401 | if (!rotate) | 2418 | if (freq) { |
2402 | goto done; | 2419 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); |
2420 | if (ctx) | ||
2421 | perf_ctx_adjust_freq(ctx, interval); | ||
2422 | } | ||
2403 | 2423 | ||
2404 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2424 | if (rotate) { |
2405 | if (ctx) | 2425 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
2406 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | 2426 | if (ctx) |
2427 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | ||
2407 | 2428 | ||
2408 | rotate_ctx(&cpuctx->ctx); | 2429 | rotate_ctx(&cpuctx->ctx); |
2409 | if (ctx) | 2430 | if (ctx) |
2410 | rotate_ctx(ctx); | 2431 | rotate_ctx(ctx); |
2432 | |||
2433 | perf_event_sched_in(cpuctx, ctx, current); | ||
2434 | } | ||
2411 | 2435 | ||
2412 | perf_event_sched_in(cpuctx, ctx, current); | 2436 | perf_pmu_enable(cpuctx->ctx.pmu); |
2437 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2413 | 2438 | ||
2414 | done: | 2439 | done: |
2415 | if (remove) | 2440 | if (remove) |
2416 | list_del_init(&cpuctx->rotation_list); | 2441 | list_del_init(&cpuctx->rotation_list); |
2417 | |||
2418 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
2419 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
2420 | } | 2442 | } |
2421 | 2443 | ||
2422 | void perf_event_task_tick(void) | 2444 | void perf_event_task_tick(void) |
@@ -2443,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event, | |||
2443 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 2465 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
2444 | return 0; | 2466 | return 0; |
2445 | 2467 | ||
2446 | __perf_event_mark_enabled(event, ctx); | 2468 | __perf_event_mark_enabled(event); |
2447 | 2469 | ||
2448 | return 1; | 2470 | return 1; |
2449 | } | 2471 | } |
@@ -2475,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
2475 | raw_spin_lock(&ctx->lock); | 2497 | raw_spin_lock(&ctx->lock); |
2476 | task_ctx_sched_out(ctx); | 2498 | task_ctx_sched_out(ctx); |
2477 | 2499 | ||
2478 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 2500 | list_for_each_entry(event, &ctx->event_list, event_entry) { |
2479 | ret = event_enable_on_exec(event, ctx); | ||
2480 | if (ret) | ||
2481 | enabled = 1; | ||
2482 | } | ||
2483 | |||
2484 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) { | ||
2485 | ret = event_enable_on_exec(event, ctx); | 2501 | ret = event_enable_on_exec(event, ctx); |
2486 | if (ret) | 2502 | if (ret) |
2487 | enabled = 1; | 2503 | enabled = 1; |
@@ -2569,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event) | |||
2569 | } | 2585 | } |
2570 | 2586 | ||
2571 | /* | 2587 | /* |
2572 | * Callchain support | ||
2573 | */ | ||
2574 | |||
2575 | struct callchain_cpus_entries { | ||
2576 | struct rcu_head rcu_head; | ||
2577 | struct perf_callchain_entry *cpu_entries[0]; | ||
2578 | }; | ||
2579 | |||
2580 | static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); | ||
2581 | static atomic_t nr_callchain_events; | ||
2582 | static DEFINE_MUTEX(callchain_mutex); | ||
2583 | struct callchain_cpus_entries *callchain_cpus_entries; | ||
2584 | |||
2585 | |||
2586 | __weak void perf_callchain_kernel(struct perf_callchain_entry *entry, | ||
2587 | struct pt_regs *regs) | ||
2588 | { | ||
2589 | } | ||
2590 | |||
2591 | __weak void perf_callchain_user(struct perf_callchain_entry *entry, | ||
2592 | struct pt_regs *regs) | ||
2593 | { | ||
2594 | } | ||
2595 | |||
2596 | static void release_callchain_buffers_rcu(struct rcu_head *head) | ||
2597 | { | ||
2598 | struct callchain_cpus_entries *entries; | ||
2599 | int cpu; | ||
2600 | |||
2601 | entries = container_of(head, struct callchain_cpus_entries, rcu_head); | ||
2602 | |||
2603 | for_each_possible_cpu(cpu) | ||
2604 | kfree(entries->cpu_entries[cpu]); | ||
2605 | |||
2606 | kfree(entries); | ||
2607 | } | ||
2608 | |||
2609 | static void release_callchain_buffers(void) | ||
2610 | { | ||
2611 | struct callchain_cpus_entries *entries; | ||
2612 | |||
2613 | entries = callchain_cpus_entries; | ||
2614 | rcu_assign_pointer(callchain_cpus_entries, NULL); | ||
2615 | call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); | ||
2616 | } | ||
2617 | |||
2618 | static int alloc_callchain_buffers(void) | ||
2619 | { | ||
2620 | int cpu; | ||
2621 | int size; | ||
2622 | struct callchain_cpus_entries *entries; | ||
2623 | |||
2624 | /* | ||
2625 | * We can't use the percpu allocation API for data that can be | ||
2626 | * accessed from NMI. Use a temporary manual per cpu allocation | ||
2627 | * until that gets sorted out. | ||
2628 | */ | ||
2629 | size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); | ||
2630 | |||
2631 | entries = kzalloc(size, GFP_KERNEL); | ||
2632 | if (!entries) | ||
2633 | return -ENOMEM; | ||
2634 | |||
2635 | size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; | ||
2636 | |||
2637 | for_each_possible_cpu(cpu) { | ||
2638 | entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, | ||
2639 | cpu_to_node(cpu)); | ||
2640 | if (!entries->cpu_entries[cpu]) | ||
2641 | goto fail; | ||
2642 | } | ||
2643 | |||
2644 | rcu_assign_pointer(callchain_cpus_entries, entries); | ||
2645 | |||
2646 | return 0; | ||
2647 | |||
2648 | fail: | ||
2649 | for_each_possible_cpu(cpu) | ||
2650 | kfree(entries->cpu_entries[cpu]); | ||
2651 | kfree(entries); | ||
2652 | |||
2653 | return -ENOMEM; | ||
2654 | } | ||
2655 | |||
2656 | static int get_callchain_buffers(void) | ||
2657 | { | ||
2658 | int err = 0; | ||
2659 | int count; | ||
2660 | |||
2661 | mutex_lock(&callchain_mutex); | ||
2662 | |||
2663 | count = atomic_inc_return(&nr_callchain_events); | ||
2664 | if (WARN_ON_ONCE(count < 1)) { | ||
2665 | err = -EINVAL; | ||
2666 | goto exit; | ||
2667 | } | ||
2668 | |||
2669 | if (count > 1) { | ||
2670 | /* If the allocation failed, give up */ | ||
2671 | if (!callchain_cpus_entries) | ||
2672 | err = -ENOMEM; | ||
2673 | goto exit; | ||
2674 | } | ||
2675 | |||
2676 | err = alloc_callchain_buffers(); | ||
2677 | if (err) | ||
2678 | release_callchain_buffers(); | ||
2679 | exit: | ||
2680 | mutex_unlock(&callchain_mutex); | ||
2681 | |||
2682 | return err; | ||
2683 | } | ||
2684 | |||
2685 | static void put_callchain_buffers(void) | ||
2686 | { | ||
2687 | if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { | ||
2688 | release_callchain_buffers(); | ||
2689 | mutex_unlock(&callchain_mutex); | ||
2690 | } | ||
2691 | } | ||
2692 | |||
2693 | static int get_recursion_context(int *recursion) | ||
2694 | { | ||
2695 | int rctx; | ||
2696 | |||
2697 | if (in_nmi()) | ||
2698 | rctx = 3; | ||
2699 | else if (in_irq()) | ||
2700 | rctx = 2; | ||
2701 | else if (in_softirq()) | ||
2702 | rctx = 1; | ||
2703 | else | ||
2704 | rctx = 0; | ||
2705 | |||
2706 | if (recursion[rctx]) | ||
2707 | return -1; | ||
2708 | |||
2709 | recursion[rctx]++; | ||
2710 | barrier(); | ||
2711 | |||
2712 | return rctx; | ||
2713 | } | ||
2714 | |||
2715 | static inline void put_recursion_context(int *recursion, int rctx) | ||
2716 | { | ||
2717 | barrier(); | ||
2718 | recursion[rctx]--; | ||
2719 | } | ||
2720 | |||
2721 | static struct perf_callchain_entry *get_callchain_entry(int *rctx) | ||
2722 | { | ||
2723 | int cpu; | ||
2724 | struct callchain_cpus_entries *entries; | ||
2725 | |||
2726 | *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); | ||
2727 | if (*rctx == -1) | ||
2728 | return NULL; | ||
2729 | |||
2730 | entries = rcu_dereference(callchain_cpus_entries); | ||
2731 | if (!entries) | ||
2732 | return NULL; | ||
2733 | |||
2734 | cpu = smp_processor_id(); | ||
2735 | |||
2736 | return &entries->cpu_entries[cpu][*rctx]; | ||
2737 | } | ||
2738 | |||
2739 | static void | ||
2740 | put_callchain_entry(int rctx) | ||
2741 | { | ||
2742 | put_recursion_context(__get_cpu_var(callchain_recursion), rctx); | ||
2743 | } | ||
2744 | |||
2745 | static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) | ||
2746 | { | ||
2747 | int rctx; | ||
2748 | struct perf_callchain_entry *entry; | ||
2749 | |||
2750 | |||
2751 | entry = get_callchain_entry(&rctx); | ||
2752 | if (rctx == -1) | ||
2753 | return NULL; | ||
2754 | |||
2755 | if (!entry) | ||
2756 | goto exit_put; | ||
2757 | |||
2758 | entry->nr = 0; | ||
2759 | |||
2760 | if (!user_mode(regs)) { | ||
2761 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | ||
2762 | perf_callchain_kernel(entry, regs); | ||
2763 | if (current->mm) | ||
2764 | regs = task_pt_regs(current); | ||
2765 | else | ||
2766 | regs = NULL; | ||
2767 | } | ||
2768 | |||
2769 | if (regs) { | ||
2770 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
2771 | perf_callchain_user(entry, regs); | ||
2772 | } | ||
2773 | |||
2774 | exit_put: | ||
2775 | put_callchain_entry(rctx); | ||
2776 | |||
2777 | return entry; | ||
2778 | } | ||
2779 | |||
2780 | /* | ||
2781 | * Initialize the perf_event context in a task_struct: | 2588 | * Initialize the perf_event context in a task_struct: |
2782 | */ | 2589 | */ |
2783 | static void __perf_event_init_context(struct perf_event_context *ctx) | 2590 | static void __perf_event_init_context(struct perf_event_context *ctx) |
@@ -2941,7 +2748,7 @@ static void free_event(struct perf_event *event) | |||
2941 | 2748 | ||
2942 | if (!event->parent) { | 2749 | if (!event->parent) { |
2943 | if (event->attach_state & PERF_ATTACH_TASK) | 2750 | if (event->attach_state & PERF_ATTACH_TASK) |
2944 | jump_label_dec(&perf_sched_events); | 2751 | jump_label_dec_deferred(&perf_sched_events); |
2945 | if (event->attr.mmap || event->attr.mmap_data) | 2752 | if (event->attr.mmap || event->attr.mmap_data) |
2946 | atomic_dec(&nr_mmap_events); | 2753 | atomic_dec(&nr_mmap_events); |
2947 | if (event->attr.comm) | 2754 | if (event->attr.comm) |
@@ -2952,7 +2759,7 @@ static void free_event(struct perf_event *event) | |||
2952 | put_callchain_buffers(); | 2759 | put_callchain_buffers(); |
2953 | if (is_cgroup_event(event)) { | 2760 | if (is_cgroup_event(event)) { |
2954 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | 2761 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); |
2955 | jump_label_dec(&perf_sched_events); | 2762 | jump_label_dec_deferred(&perf_sched_events); |
2956 | } | 2763 | } |
2957 | } | 2764 | } |
2958 | 2765 | ||
@@ -3189,12 +2996,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
3189 | struct ring_buffer *rb; | 2996 | struct ring_buffer *rb; |
3190 | unsigned int events = POLL_HUP; | 2997 | unsigned int events = POLL_HUP; |
3191 | 2998 | ||
2999 | /* | ||
3000 | * Race between perf_event_set_output() and perf_poll(): perf_poll() | ||
3001 | * grabs the rb reference but perf_event_set_output() overrides it. | ||
3002 | * Here is the timeline for two threads T1, T2: | ||
3003 | * t0: T1, rb = rcu_dereference(event->rb) | ||
3004 | * t1: T2, old_rb = event->rb | ||
3005 | * t2: T2, event->rb = new rb | ||
3006 | * t3: T2, ring_buffer_detach(old_rb) | ||
3007 | * t4: T1, ring_buffer_attach(rb1) | ||
3008 | * t5: T1, poll_wait(event->waitq) | ||
3009 | * | ||
3010 | * To avoid this problem, we grab mmap_mutex in perf_poll() | ||
3011 | * thereby ensuring that the assignment of the new ring buffer | ||
3012 | * and the detachment of the old buffer appear atomic to perf_poll() | ||
3013 | */ | ||
3014 | mutex_lock(&event->mmap_mutex); | ||
3015 | |||
3192 | rcu_read_lock(); | 3016 | rcu_read_lock(); |
3193 | rb = rcu_dereference(event->rb); | 3017 | rb = rcu_dereference(event->rb); |
3194 | if (rb) | 3018 | if (rb) { |
3019 | ring_buffer_attach(event, rb); | ||
3195 | events = atomic_xchg(&rb->poll, 0); | 3020 | events = atomic_xchg(&rb->poll, 0); |
3021 | } | ||
3196 | rcu_read_unlock(); | 3022 | rcu_read_unlock(); |
3197 | 3023 | ||
3024 | mutex_unlock(&event->mmap_mutex); | ||
3025 | |||
3198 | poll_wait(file, &event->waitq, wait); | 3026 | poll_wait(file, &event->waitq, wait); |
3199 | 3027 | ||
3200 | return events; | 3028 | return events; |
@@ -3495,6 +3323,53 @@ unlock: | |||
3495 | return ret; | 3323 | return ret; |
3496 | } | 3324 | } |
3497 | 3325 | ||
3326 | static void ring_buffer_attach(struct perf_event *event, | ||
3327 | struct ring_buffer *rb) | ||
3328 | { | ||
3329 | unsigned long flags; | ||
3330 | |||
3331 | if (!list_empty(&event->rb_entry)) | ||
3332 | return; | ||
3333 | |||
3334 | spin_lock_irqsave(&rb->event_lock, flags); | ||
3335 | if (!list_empty(&event->rb_entry)) | ||
3336 | goto unlock; | ||
3337 | |||
3338 | list_add(&event->rb_entry, &rb->event_list); | ||
3339 | unlock: | ||
3340 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3341 | } | ||
3342 | |||
3343 | static void ring_buffer_detach(struct perf_event *event, | ||
3344 | struct ring_buffer *rb) | ||
3345 | { | ||
3346 | unsigned long flags; | ||
3347 | |||
3348 | if (list_empty(&event->rb_entry)) | ||
3349 | return; | ||
3350 | |||
3351 | spin_lock_irqsave(&rb->event_lock, flags); | ||
3352 | list_del_init(&event->rb_entry); | ||
3353 | wake_up_all(&event->waitq); | ||
3354 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3355 | } | ||
3356 | |||
3357 | static void ring_buffer_wakeup(struct perf_event *event) | ||
3358 | { | ||
3359 | struct ring_buffer *rb; | ||
3360 | |||
3361 | rcu_read_lock(); | ||
3362 | rb = rcu_dereference(event->rb); | ||
3363 | if (!rb) | ||
3364 | goto unlock; | ||
3365 | |||
3366 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) | ||
3367 | wake_up_all(&event->waitq); | ||
3368 | |||
3369 | unlock: | ||
3370 | rcu_read_unlock(); | ||
3371 | } | ||
3372 | |||
3498 | static void rb_free_rcu(struct rcu_head *rcu_head) | 3373 | static void rb_free_rcu(struct rcu_head *rcu_head) |
3499 | { | 3374 | { |
3500 | struct ring_buffer *rb; | 3375 | struct ring_buffer *rb; |
@@ -3520,9 +3395,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
3520 | 3395 | ||
3521 | static void ring_buffer_put(struct ring_buffer *rb) | 3396 | static void ring_buffer_put(struct ring_buffer *rb) |
3522 | { | 3397 | { |
3398 | struct perf_event *event, *n; | ||
3399 | unsigned long flags; | ||
3400 | |||
3523 | if (!atomic_dec_and_test(&rb->refcount)) | 3401 | if (!atomic_dec_and_test(&rb->refcount)) |
3524 | return; | 3402 | return; |
3525 | 3403 | ||
3404 | spin_lock_irqsave(&rb->event_lock, flags); | ||
3405 | list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { | ||
3406 | list_del_init(&event->rb_entry); | ||
3407 | wake_up_all(&event->waitq); | ||
3408 | } | ||
3409 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3410 | |||
3526 | call_rcu(&rb->rcu_head, rb_free_rcu); | 3411 | call_rcu(&rb->rcu_head, rb_free_rcu); |
3527 | } | 3412 | } |
3528 | 3413 | ||
@@ -3543,8 +3428,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
3543 | struct ring_buffer *rb = event->rb; | 3428 | struct ring_buffer *rb = event->rb; |
3544 | 3429 | ||
3545 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 3430 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
3546 | vma->vm_mm->locked_vm -= event->mmap_locked; | 3431 | vma->vm_mm->pinned_vm -= event->mmap_locked; |
3547 | rcu_assign_pointer(event->rb, NULL); | 3432 | rcu_assign_pointer(event->rb, NULL); |
3433 | ring_buffer_detach(event, rb); | ||
3548 | mutex_unlock(&event->mmap_mutex); | 3434 | mutex_unlock(&event->mmap_mutex); |
3549 | 3435 | ||
3550 | ring_buffer_put(rb); | 3436 | ring_buffer_put(rb); |
@@ -3624,7 +3510,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3624 | 3510 | ||
3625 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 3511 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
3626 | lock_limit >>= PAGE_SHIFT; | 3512 | lock_limit >>= PAGE_SHIFT; |
3627 | locked = vma->vm_mm->locked_vm + extra; | 3513 | locked = vma->vm_mm->pinned_vm + extra; |
3628 | 3514 | ||
3629 | if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && | 3515 | if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && |
3630 | !capable(CAP_IPC_LOCK)) { | 3516 | !capable(CAP_IPC_LOCK)) { |
@@ -3650,7 +3536,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3650 | atomic_long_add(user_extra, &user->locked_vm); | 3536 | atomic_long_add(user_extra, &user->locked_vm); |
3651 | event->mmap_locked = extra; | 3537 | event->mmap_locked = extra; |
3652 | event->mmap_user = get_current_user(); | 3538 | event->mmap_user = get_current_user(); |
3653 | vma->vm_mm->locked_vm += event->mmap_locked; | 3539 | vma->vm_mm->pinned_vm += event->mmap_locked; |
3654 | 3540 | ||
3655 | unlock: | 3541 | unlock: |
3656 | if (!ret) | 3542 | if (!ret) |
@@ -3699,7 +3585,7 @@ static const struct file_operations perf_fops = { | |||
3699 | 3585 | ||
3700 | void perf_event_wakeup(struct perf_event *event) | 3586 | void perf_event_wakeup(struct perf_event *event) |
3701 | { | 3587 | { |
3702 | wake_up_all(&event->waitq); | 3588 | ring_buffer_wakeup(event); |
3703 | 3589 | ||
3704 | if (event->pending_kill) { | 3590 | if (event->pending_kill) { |
3705 | kill_fasync(&event->fasync, SIGIO, event->pending_kill); | 3591 | kill_fasync(&event->fasync, SIGIO, event->pending_kill); |
@@ -4736,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
4736 | struct hw_perf_event *hwc = &event->hw; | 4622 | struct hw_perf_event *hwc = &event->hw; |
4737 | int throttle = 0; | 4623 | int throttle = 0; |
4738 | 4624 | ||
4739 | data->period = event->hw.last_period; | ||
4740 | if (!overflow) | 4625 | if (!overflow) |
4741 | overflow = perf_swevent_set_period(event); | 4626 | overflow = perf_swevent_set_period(event); |
4742 | 4627 | ||
@@ -4770,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
4770 | if (!is_sampling_event(event)) | 4655 | if (!is_sampling_event(event)) |
4771 | return; | 4656 | return; |
4772 | 4657 | ||
4658 | if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { | ||
4659 | data->period = nr; | ||
4660 | return perf_swevent_overflow(event, 1, data, regs); | ||
4661 | } else | ||
4662 | data->period = event->hw.last_period; | ||
4663 | |||
4773 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4664 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
4774 | return perf_swevent_overflow(event, 1, data, regs); | 4665 | return perf_swevent_overflow(event, 1, data, regs); |
4775 | 4666 | ||
@@ -5282,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5282 | regs = get_irq_regs(); | 5173 | regs = get_irq_regs(); |
5283 | 5174 | ||
5284 | if (regs && !perf_exclude_event(event, regs)) { | 5175 | if (regs && !perf_exclude_event(event, regs)) { |
5285 | if (!(event->attr.exclude_idle && current->pid == 0)) | 5176 | if (!(event->attr.exclude_idle && is_idle_task(current))) |
5286 | if (perf_event_overflow(event, &data, regs)) | 5177 | if (perf_event_overflow(event, &data, regs)) |
5287 | ret = HRTIMER_NORESTART; | 5178 | ret = HRTIMER_NORESTART; |
5288 | } | 5179 | } |
@@ -5758,6 +5649,7 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5758 | pmu = idr_find(&pmu_idr, event->attr.type); | 5649 | pmu = idr_find(&pmu_idr, event->attr.type); |
5759 | rcu_read_unlock(); | 5650 | rcu_read_unlock(); |
5760 | if (pmu) { | 5651 | if (pmu) { |
5652 | event->pmu = pmu; | ||
5761 | ret = pmu->event_init(event); | 5653 | ret = pmu->event_init(event); |
5762 | if (ret) | 5654 | if (ret) |
5763 | pmu = ERR_PTR(ret); | 5655 | pmu = ERR_PTR(ret); |
@@ -5765,6 +5657,7 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5765 | } | 5657 | } |
5766 | 5658 | ||
5767 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 5659 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5660 | event->pmu = pmu; | ||
5768 | ret = pmu->event_init(event); | 5661 | ret = pmu->event_init(event); |
5769 | if (!ret) | 5662 | if (!ret) |
5770 | goto unlock; | 5663 | goto unlock; |
@@ -5819,6 +5712,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
5819 | INIT_LIST_HEAD(&event->group_entry); | 5712 | INIT_LIST_HEAD(&event->group_entry); |
5820 | INIT_LIST_HEAD(&event->event_entry); | 5713 | INIT_LIST_HEAD(&event->event_entry); |
5821 | INIT_LIST_HEAD(&event->sibling_list); | 5714 | INIT_LIST_HEAD(&event->sibling_list); |
5715 | INIT_LIST_HEAD(&event->rb_entry); | ||
5716 | |||
5822 | init_waitqueue_head(&event->waitq); | 5717 | init_waitqueue_head(&event->waitq); |
5823 | init_irq_work(&event->pending, perf_pending_event); | 5718 | init_irq_work(&event->pending, perf_pending_event); |
5824 | 5719 | ||
@@ -5891,11 +5786,9 @@ done: | |||
5891 | return ERR_PTR(err); | 5786 | return ERR_PTR(err); |
5892 | } | 5787 | } |
5893 | 5788 | ||
5894 | event->pmu = pmu; | ||
5895 | |||
5896 | if (!event->parent) { | 5789 | if (!event->parent) { |
5897 | if (event->attach_state & PERF_ATTACH_TASK) | 5790 | if (event->attach_state & PERF_ATTACH_TASK) |
5898 | jump_label_inc(&perf_sched_events); | 5791 | jump_label_inc(&perf_sched_events.key); |
5899 | if (event->attr.mmap || event->attr.mmap_data) | 5792 | if (event->attr.mmap || event->attr.mmap_data) |
5900 | atomic_inc(&nr_mmap_events); | 5793 | atomic_inc(&nr_mmap_events); |
5901 | if (event->attr.comm) | 5794 | if (event->attr.comm) |
@@ -6027,6 +5920,8 @@ set: | |||
6027 | 5920 | ||
6028 | old_rb = event->rb; | 5921 | old_rb = event->rb; |
6029 | rcu_assign_pointer(event->rb, rb); | 5922 | rcu_assign_pointer(event->rb, rb); |
5923 | if (old_rb) | ||
5924 | ring_buffer_detach(event, old_rb); | ||
6030 | ret = 0; | 5925 | ret = 0; |
6031 | unlock: | 5926 | unlock: |
6032 | mutex_unlock(&event->mmap_mutex); | 5927 | mutex_unlock(&event->mmap_mutex); |
@@ -6131,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6131 | * - that may need work on context switch | 6026 | * - that may need work on context switch |
6132 | */ | 6027 | */ |
6133 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | 6028 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); |
6134 | jump_label_inc(&perf_sched_events); | 6029 | jump_label_inc(&perf_sched_events.key); |
6135 | } | 6030 | } |
6136 | 6031 | ||
6137 | /* | 6032 | /* |
@@ -6977,6 +6872,9 @@ void __init perf_event_init(void) | |||
6977 | 6872 | ||
6978 | ret = init_hw_breakpoint(); | 6873 | ret = init_hw_breakpoint(); |
6979 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); | 6874 | WARN(ret, "hw_breakpoint initialization failed with: %d", ret); |
6875 | |||
6876 | /* do not patch jump label more than once per second */ | ||
6877 | jump_label_rate_limit(&perf_sched_events, HZ); | ||
6980 | } | 6878 | } |
6981 | 6879 | ||
6982 | static int __init perf_event_sysfs_init(void) | 6880 | static int __init perf_event_sysfs_init(void) |
@@ -7043,10 +6941,13 @@ static int __perf_cgroup_move(void *info) | |||
7043 | return 0; | 6941 | return 0; |
7044 | } | 6942 | } |
7045 | 6943 | ||
7046 | static void | 6944 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
7047 | perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) | 6945 | struct cgroup_taskset *tset) |
7048 | { | 6946 | { |
7049 | task_function_call(task, __perf_cgroup_move, task); | 6947 | struct task_struct *task; |
6948 | |||
6949 | cgroup_taskset_for_each(task, cgrp, tset) | ||
6950 | task_function_call(task, __perf_cgroup_move, task); | ||
7050 | } | 6951 | } |
7051 | 6952 | ||
7052 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | 6953 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
@@ -7060,7 +6961,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7060 | if (!(task->flags & PF_EXITING)) | 6961 | if (!(task->flags & PF_EXITING)) |
7061 | return; | 6962 | return; |
7062 | 6963 | ||
7063 | perf_cgroup_attach_task(cgrp, task); | 6964 | task_function_call(task, __perf_cgroup_move, task); |
7064 | } | 6965 | } |
7065 | 6966 | ||
7066 | struct cgroup_subsys perf_subsys = { | 6967 | struct cgroup_subsys perf_subsys = { |
@@ -7069,6 +6970,6 @@ struct cgroup_subsys perf_subsys = { | |||
7069 | .create = perf_cgroup_create, | 6970 | .create = perf_cgroup_create, |
7070 | .destroy = perf_cgroup_destroy, | 6971 | .destroy = perf_cgroup_destroy, |
7071 | .exit = perf_cgroup_exit, | 6972 | .exit = perf_cgroup_exit, |
7072 | .attach_task = perf_cgroup_attach_task, | 6973 | .attach = perf_cgroup_attach, |
7073 | }; | 6974 | }; |
7074 | #endif /* CONFIG_CGROUP_PERF */ | 6975 | #endif /* CONFIG_CGROUP_PERF */ |
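Among the core.c changes above, perf_sched_events becomes a jump_label_key_deferred and perf_event_init() calls jump_label_rate_limit(&perf_sched_events, HZ) so the static branch is repatched at most once per second: enabling still goes through jump_label_inc() on the embedded key, while disabling uses the deferred decrement. A hedged sketch of that pattern for a hypothetical key of the same kind:

#include <linux/jump_label.h>
#include <linux/kernel.h>

static struct jump_label_key_deferred my_key;

static void my_feature_init(void)
{
	/* defer key disabling so the text is patched at most once per second */
	jump_label_rate_limit(&my_key, HZ);
}

static void my_feature_get(void)
{
	jump_label_inc(&my_key.key);		/* enable immediately */
}

static void my_feature_put(void)
{
	jump_label_dec_deferred(&my_key);	/* disable lazily, honouring the rate limit */
}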
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116c..b0b107f90afc 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
@@ -1,6 +1,10 @@ | |||
1 | #ifndef _KERNEL_EVENTS_INTERNAL_H | 1 | #ifndef _KERNEL_EVENTS_INTERNAL_H |
2 | #define _KERNEL_EVENTS_INTERNAL_H | 2 | #define _KERNEL_EVENTS_INTERNAL_H |
3 | 3 | ||
4 | #include <linux/hardirq.h> | ||
5 | |||
6 | /* Buffer handling */ | ||
7 | |||
4 | #define RING_BUFFER_WRITABLE 0x01 | 8 | #define RING_BUFFER_WRITABLE 0x01 |
5 | 9 | ||
6 | struct ring_buffer { | 10 | struct ring_buffer { |
@@ -22,6 +26,9 @@ struct ring_buffer { | |||
22 | local_t lost; /* nr records lost */ | 26 | local_t lost; /* nr records lost */ |
23 | 27 | ||
24 | long watermark; /* wakeup watermark */ | 28 | long watermark; /* wakeup watermark */ |
29 | /* poll crap */ | ||
30 | spinlock_t event_lock; | ||
31 | struct list_head event_list; | ||
25 | 32 | ||
26 | struct perf_event_mmap_page *user_page; | 33 | struct perf_event_mmap_page *user_page; |
27 | void *data_pages[0]; | 34 | void *data_pages[0]; |
@@ -64,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb) | |||
64 | } | 71 | } |
65 | #endif | 72 | #endif |
66 | 73 | ||
67 | static unsigned long perf_data_size(struct ring_buffer *rb) | 74 | static inline unsigned long perf_data_size(struct ring_buffer *rb) |
68 | { | 75 | { |
69 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 76 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); |
70 | } | 77 | } |
@@ -93,4 +100,37 @@ __output_copy(struct perf_output_handle *handle, | |||
93 | } while (len); | 100 | } while (len); |
94 | } | 101 | } |
95 | 102 | ||
103 | /* Callchain handling */ | ||
104 | extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); | ||
105 | extern int get_callchain_buffers(void); | ||
106 | extern void put_callchain_buffers(void); | ||
107 | |||
108 | static inline int get_recursion_context(int *recursion) | ||
109 | { | ||
110 | int rctx; | ||
111 | |||
112 | if (in_nmi()) | ||
113 | rctx = 3; | ||
114 | else if (in_irq()) | ||
115 | rctx = 2; | ||
116 | else if (in_softirq()) | ||
117 | rctx = 1; | ||
118 | else | ||
119 | rctx = 0; | ||
120 | |||
121 | if (recursion[rctx]) | ||
122 | return -1; | ||
123 | |||
124 | recursion[rctx]++; | ||
125 | barrier(); | ||
126 | |||
127 | return rctx; | ||
128 | } | ||
129 | |||
130 | static inline void put_recursion_context(int *recursion, int rctx) | ||
131 | { | ||
132 | barrier(); | ||
133 | recursion[rctx]--; | ||
134 | } | ||
135 | |||
96 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ | 136 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ |
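get_recursion_context()/put_recursion_context(), moved into internal.h above, implement a per-context recursion guard: one slot each for task, softirq, hardirq and NMI context, and a nested entry into a slot that is already taken is refused with -1. A standalone, hedged illustration of that idea (the context level is passed in explicitly here instead of being derived from in_nmi()/in_irq()/in_softirq() as the kernel code does, and the compiler barriers are omitted):

#include <stdio.h>

#define NR_CONTEXTS 4	/* task, softirq, hardirq, NMI */

static int get_recursion_context(int *recursion, int level)
{
	if (recursion[level])
		return -1;		/* already inside this context: refuse */
	recursion[level]++;
	return level;
}

static void put_recursion_context(int *recursion, int rctx)
{
	recursion[rctx]--;
}

int main(void)
{
	int recursion[NR_CONTEXTS] = { 0 };
	int a = get_recursion_context(recursion, 0);	/* task level: granted */
	int b = get_recursion_context(recursion, 0);	/* nested entry: refused */

	printf("first=%d nested=%d\n", a, b);		/* prints "first=0 nested=-1" */
	if (a >= 0)
		put_recursion_context(recursion, a);
	return 0;
}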
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a29205cc0f..6ddaba43fb7a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * | 8 | * |
9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
10 | */ | 10 | */ |
@@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | |||
209 | rb->writable = 1; | 209 | rb->writable = 1; |
210 | 210 | ||
211 | atomic_set(&rb->refcount, 1); | 211 | atomic_set(&rb->refcount, 1); |
212 | |||
213 | INIT_LIST_HEAD(&rb->event_list); | ||
214 | spin_lock_init(&rb->event_lock); | ||
212 | } | 215 | } |
213 | 216 | ||
214 | #ifndef CONFIG_PERF_USE_VMALLOC | 217 | #ifndef CONFIG_PERF_USE_VMALLOC |
diff --git a/kernel/exit.c b/kernel/exit.c index 2913b3509d42..c44738267be7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <trace/events/sched.h> | 51 | #include <trace/events/sched.h> |
52 | #include <linux/hw_breakpoint.h> | 52 | #include <linux/hw_breakpoint.h> |
53 | #include <linux/oom.h> | 53 | #include <linux/oom.h> |
54 | #include <linux/writeback.h> | ||
54 | 55 | ||
55 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
56 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
@@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk) | |||
121 | * We won't ever get here for the group leader, since it | 122 | * We won't ever get here for the group leader, since it |
122 | * will have been the last reference on the signal_struct. | 123 | * will have been the last reference on the signal_struct. |
123 | */ | 124 | */ |
124 | sig->utime = cputime_add(sig->utime, tsk->utime); | 125 | sig->utime += tsk->utime; |
125 | sig->stime = cputime_add(sig->stime, tsk->stime); | 126 | sig->stime += tsk->stime; |
126 | sig->gtime = cputime_add(sig->gtime, tsk->gtime); | 127 | sig->gtime += tsk->gtime; |
127 | sig->min_flt += tsk->min_flt; | 128 | sig->min_flt += tsk->min_flt; |
128 | sig->maj_flt += tsk->maj_flt; | 129 | sig->maj_flt += tsk->maj_flt; |
129 | sig->nvcsw += tsk->nvcsw; | 130 | sig->nvcsw += tsk->nvcsw; |
@@ -679,10 +680,6 @@ static void exit_mm(struct task_struct * tsk) | |||
679 | tsk->mm = NULL; | 680 | tsk->mm = NULL; |
680 | up_read(&mm->mmap_sem); | 681 | up_read(&mm->mmap_sem); |
681 | enter_lazy_tlb(mm, current); | 682 | enter_lazy_tlb(mm, current); |
682 | /* We don't want this task to be frozen prematurely */ | ||
683 | clear_freeze_flag(tsk); | ||
684 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
685 | atomic_dec(&mm->oom_disable_count); | ||
686 | task_unlock(tsk); | 683 | task_unlock(tsk); |
687 | mm_update_next_owner(mm); | 684 | mm_update_next_owner(mm); |
688 | mmput(mm); | 685 | mmput(mm); |
@@ -890,7 +887,7 @@ static void check_stack_usage(void) | |||
890 | static inline void check_stack_usage(void) {} | 887 | static inline void check_stack_usage(void) {} |
891 | #endif | 888 | #endif |
892 | 889 | ||
893 | NORET_TYPE void do_exit(long code) | 890 | void do_exit(long code) |
894 | { | 891 | { |
895 | struct task_struct *tsk = current; | 892 | struct task_struct *tsk = current; |
896 | int group_dead; | 893 | int group_dead; |
@@ -1039,9 +1036,12 @@ NORET_TYPE void do_exit(long code) | |||
1039 | validate_creds_for_do_exit(tsk); | 1036 | validate_creds_for_do_exit(tsk); |
1040 | 1037 | ||
1041 | preempt_disable(); | 1038 | preempt_disable(); |
1039 | if (tsk->nr_dirtied) | ||
1040 | __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); | ||
1042 | exit_rcu(); | 1041 | exit_rcu(); |
1043 | /* causes final put_task_struct in finish_task_switch(). */ | 1042 | /* causes final put_task_struct in finish_task_switch(). */ |
1044 | tsk->state = TASK_DEAD; | 1043 | tsk->state = TASK_DEAD; |
1044 | tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | ||
1045 | schedule(); | 1045 | schedule(); |
1046 | BUG(); | 1046 | BUG(); |
1047 | /* Avoid "noreturn function does return". */ | 1047 | /* Avoid "noreturn function does return". */ |
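do_exit() now folds the dying task's unaccounted nr_dirtied pages into a per-CPU counter and sets PF_NOFREEZE so the freezer stops waiting on a task that can no longer be frozen. A minimal sketch of the per-CPU accumulation idiom, with a hypothetical counter standing in for dirty_throttle_leaks:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_leaked_pages);	/* hypothetical counter */

static void fold_task_residue(int nr_dirtied)
{
	/*
	 * The double-underscore variant skips the preemption check, so the
	 * caller must already prevent migration; do_exit() calls it between
	 * preempt_disable() and the final schedule().
	 */
	__this_cpu_add(my_leaked_pages, nr_dirtied);
}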
@@ -1051,7 +1051,7 @@ NORET_TYPE void do_exit(long code) | |||
1051 | 1051 | ||
1052 | EXPORT_SYMBOL_GPL(do_exit); | 1052 | EXPORT_SYMBOL_GPL(do_exit); |
1053 | 1053 | ||
1054 | NORET_TYPE void complete_and_exit(struct completion *comp, long code) | 1054 | void complete_and_exit(struct completion *comp, long code) |
1055 | { | 1055 | { |
1056 | if (comp) | 1056 | if (comp) |
1057 | complete(comp); | 1057 | complete(comp); |
@@ -1070,7 +1070,7 @@ SYSCALL_DEFINE1(exit, int, error_code) | |||
1070 | * Take down every thread in the group. This is called by fatal signals | 1070 | * Take down every thread in the group. This is called by fatal signals |
1071 | * as well as by sys_exit_group (below). | 1071 | * as well as by sys_exit_group (below). |
1072 | */ | 1072 | */ |
1073 | NORET_TYPE void | 1073 | void |
1074 | do_group_exit(int exit_code) | 1074 | do_group_exit(int exit_code) |
1075 | { | 1075 | { |
1076 | struct signal_struct *sig = current->signal; | 1076 | struct signal_struct *sig = current->signal; |
@@ -1257,19 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1257 | spin_lock_irq(&p->real_parent->sighand->siglock); | 1257 | spin_lock_irq(&p->real_parent->sighand->siglock); |
1258 | psig = p->real_parent->signal; | 1258 | psig = p->real_parent->signal; |
1259 | sig = p->signal; | 1259 | sig = p->signal; |
1260 | psig->cutime = | 1260 | psig->cutime += tgutime + sig->cutime; |
1261 | cputime_add(psig->cutime, | 1261 | psig->cstime += tgstime + sig->cstime; |
1262 | cputime_add(tgutime, | 1262 | psig->cgtime += p->gtime + sig->gtime + sig->cgtime; |
1263 | sig->cutime)); | ||
1264 | psig->cstime = | ||
1265 | cputime_add(psig->cstime, | ||
1266 | cputime_add(tgstime, | ||
1267 | sig->cstime)); | ||
1268 | psig->cgtime = | ||
1269 | cputime_add(psig->cgtime, | ||
1270 | cputime_add(p->gtime, | ||
1271 | cputime_add(sig->gtime, | ||
1272 | sig->cgtime))); | ||
1273 | psig->cmin_flt += | 1263 | psig->cmin_flt += |
1274 | p->min_flt + sig->min_flt + sig->cmin_flt; | 1264 | p->min_flt + sig->min_flt + sig->cmin_flt; |
1275 | psig->cmaj_flt += | 1265 | psig->cmaj_flt += |
@@ -1542,8 +1532,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
1542 | } | 1532 | } |
1543 | 1533 | ||
1544 | /* dead body doesn't have much to contribute */ | 1534 | /* dead body doesn't have much to contribute */ |
1545 | if (p->exit_state == EXIT_DEAD) | 1535 | if (unlikely(p->exit_state == EXIT_DEAD)) { |
1536 | /* | ||
1537 | * But do not ignore this task until the tracer does | ||
1538 | * wait_task_zombie()->do_notify_parent(). | ||
1539 | */ | ||
1540 | if (likely(!ptrace) && unlikely(ptrace_reparented(p))) | ||
1541 | wo->notask_error = 0; | ||
1546 | return 0; | 1542 | return 0; |
1543 | } | ||
1547 | 1544 | ||
1548 | /* slay zombie? */ | 1545 | /* slay zombie? */ |
1549 | if (p->exit_state == EXIT_ZOMBIE) { | 1546 | if (p->exit_state == EXIT_ZOMBIE) { |
diff --git a/kernel/fork.c b/kernel/fork.c index 8e6b6f4fb272..443f5125f11e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -76,6 +76,9 @@ | |||
76 | 76 | ||
77 | #include <trace/events/sched.h> | 77 | #include <trace/events/sched.h> |
78 | 78 | ||
79 | #define CREATE_TRACE_POINTS | ||
80 | #include <trace/events/task.h> | ||
81 | |||
79 | /* | 82 | /* |
80 | * Protected counters by write_lock_irq(&tasklist_lock) | 83 | * Protected counters by write_lock_irq(&tasklist_lock) |
81 | */ | 84 | */ |
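fork.c becomes the translation unit that instantiates the new task tracepoints: defining CREATE_TRACE_POINTS before including the event header emits the tracepoint bodies exactly once, while every other user includes the same header plainly and just calls the probe. Sketched under that convention:

/* in exactly one .c file (here kernel/fork.c for trace/events/task.h) */
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

/* any other consumer only includes the header and fires the probe:
 *	#include <trace/events/task.h>
 *	trace_task_newtask(p, clone_flags);
 */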
@@ -162,7 +165,6 @@ static void account_kernel_stack(struct thread_info *ti, int account) | |||
162 | 165 | ||
163 | void free_task(struct task_struct *tsk) | 166 | void free_task(struct task_struct *tsk) |
164 | { | 167 | { |
165 | prop_local_destroy_single(&tsk->dirties); | ||
166 | account_kernel_stack(tsk->stack, -1); | 168 | account_kernel_stack(tsk->stack, -1); |
167 | free_thread_info(tsk->stack); | 169 | free_thread_info(tsk->stack); |
168 | rt_mutex_debug_task_free(tsk); | 170 | rt_mutex_debug_task_free(tsk); |
@@ -274,10 +276,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
274 | 276 | ||
275 | tsk->stack = ti; | 277 | tsk->stack = ti; |
276 | 278 | ||
277 | err = prop_local_init_single(&tsk->dirties); | ||
278 | if (err) | ||
279 | goto out; | ||
280 | |||
281 | setup_thread_stack(tsk, orig); | 279 | setup_thread_stack(tsk, orig); |
282 | clear_user_return_notifier(tsk); | 280 | clear_user_return_notifier(tsk); |
283 | clear_tsk_need_resched(tsk); | 281 | clear_tsk_need_resched(tsk); |
@@ -501,7 +499,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) | |||
501 | mm->cached_hole_size = ~0UL; | 499 | mm->cached_hole_size = ~0UL; |
502 | mm_init_aio(mm); | 500 | mm_init_aio(mm); |
503 | mm_init_owner(mm, p); | 501 | mm_init_owner(mm, p); |
504 | atomic_set(&mm->oom_disable_count, 0); | ||
505 | 502 | ||
506 | if (likely(!mm_alloc_pgd(mm))) { | 503 | if (likely(!mm_alloc_pgd(mm))) { |
507 | mm->def_flags = 0; | 504 | mm->def_flags = 0; |
@@ -816,8 +813,6 @@ good_mm: | |||
816 | /* Initializing for Swap token stuff */ | 813 | /* Initializing for Swap token stuff */ |
817 | mm->token_priority = 0; | 814 | mm->token_priority = 0; |
818 | mm->last_interval = 0; | 815 | mm->last_interval = 0; |
819 | if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
820 | atomic_inc(&mm->oom_disable_count); | ||
821 | 816 | ||
822 | tsk->mm = mm; | 817 | tsk->mm = mm; |
823 | tsk->active_mm = mm; | 818 | tsk->active_mm = mm; |
@@ -980,7 +975,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
980 | sched_autogroup_fork(sig); | 975 | sched_autogroup_fork(sig); |
981 | 976 | ||
982 | #ifdef CONFIG_CGROUPS | 977 | #ifdef CONFIG_CGROUPS |
983 | init_rwsem(&sig->threadgroup_fork_lock); | 978 | init_rwsem(&sig->group_rwsem); |
984 | #endif | 979 | #endif |
985 | 980 | ||
986 | sig->oom_adj = current->signal->oom_adj; | 981 | sig->oom_adj = current->signal->oom_adj; |
@@ -1000,7 +995,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
1000 | new_flags |= PF_FORKNOEXEC; | 995 | new_flags |= PF_FORKNOEXEC; |
1001 | new_flags |= PF_STARTING; | 996 | new_flags |= PF_STARTING; |
1002 | p->flags = new_flags; | 997 | p->flags = new_flags; |
1003 | clear_freeze_flag(p); | ||
1004 | } | 998 | } |
1005 | 999 | ||
1006 | SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) | 1000 | SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) |
@@ -1031,8 +1025,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | |||
1031 | */ | 1025 | */ |
1032 | static void posix_cpu_timers_init(struct task_struct *tsk) | 1026 | static void posix_cpu_timers_init(struct task_struct *tsk) |
1033 | { | 1027 | { |
1034 | tsk->cputime_expires.prof_exp = cputime_zero; | 1028 | tsk->cputime_expires.prof_exp = 0; |
1035 | tsk->cputime_expires.virt_exp = cputime_zero; | 1029 | tsk->cputime_expires.virt_exp = 0; |
1036 | tsk->cputime_expires.sched_exp = 0; | 1030 | tsk->cputime_expires.sched_exp = 0; |
1037 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); | 1031 | INIT_LIST_HEAD(&tsk->cpu_timers[0]); |
1038 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); | 1032 | INIT_LIST_HEAD(&tsk->cpu_timers[1]); |
@@ -1140,14 +1134,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1140 | 1134 | ||
1141 | init_sigpending(&p->pending); | 1135 | init_sigpending(&p->pending); |
1142 | 1136 | ||
1143 | p->utime = cputime_zero; | 1137 | p->utime = p->stime = p->gtime = 0; |
1144 | p->stime = cputime_zero; | 1138 | p->utimescaled = p->stimescaled = 0; |
1145 | p->gtime = cputime_zero; | ||
1146 | p->utimescaled = cputime_zero; | ||
1147 | p->stimescaled = cputime_zero; | ||
1148 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 1139 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
1149 | p->prev_utime = cputime_zero; | 1140 | p->prev_utime = p->prev_stime = 0; |
1150 | p->prev_stime = cputime_zero; | ||
1151 | #endif | 1141 | #endif |
1152 | #if defined(SPLIT_RSS_COUNTING) | 1142 | #if defined(SPLIT_RSS_COUNTING) |
1153 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); | 1143 | memset(&p->rss_stat, 0, sizeof(p->rss_stat)); |
@@ -1166,7 +1156,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1166 | p->io_context = NULL; | 1156 | p->io_context = NULL; |
1167 | p->audit_context = NULL; | 1157 | p->audit_context = NULL; |
1168 | if (clone_flags & CLONE_THREAD) | 1158 | if (clone_flags & CLONE_THREAD) |
1169 | threadgroup_fork_read_lock(current); | 1159 | threadgroup_change_begin(current); |
1170 | cgroup_fork(p); | 1160 | cgroup_fork(p); |
1171 | #ifdef CONFIG_NUMA | 1161 | #ifdef CONFIG_NUMA |
1172 | p->mempolicy = mpol_dup(p->mempolicy); | 1162 | p->mempolicy = mpol_dup(p->mempolicy); |
@@ -1302,6 +1292,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1302 | p->pdeath_signal = 0; | 1292 | p->pdeath_signal = 0; |
1303 | p->exit_state = 0; | 1293 | p->exit_state = 0; |
1304 | 1294 | ||
1295 | p->nr_dirtied = 0; | ||
1296 | p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); | ||
1297 | p->dirty_paused_when = 0; | ||
1298 | |||
1305 | /* | 1299 | /* |
1306 | * Ok, make it visible to the rest of the system. | 1300 | * Ok, make it visible to the rest of the system. |
1307 | * We dont wake it up yet. | 1301 | * We dont wake it up yet. |
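copy_process() seeds the new dirty-throttling fields; the initial pause threshold is 128 KiB expressed in pages, so the shift term depends on the page size. The same line, annotated with worked values:

/* 128 >> (PAGE_SHIFT - 10) converts 128 KiB into pages:
 *	4 KiB pages  (PAGE_SHIFT = 12): 128 >> 2 = 32 pages
 *	64 KiB pages (PAGE_SHIFT = 16): 128 >> 6 =  2 pages
 */
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);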
@@ -1378,8 +1372,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1378 | proc_fork_connector(p); | 1372 | proc_fork_connector(p); |
1379 | cgroup_post_fork(p); | 1373 | cgroup_post_fork(p); |
1380 | if (clone_flags & CLONE_THREAD) | 1374 | if (clone_flags & CLONE_THREAD) |
1381 | threadgroup_fork_read_unlock(current); | 1375 | threadgroup_change_end(current); |
1382 | perf_event_fork(p); | 1376 | perf_event_fork(p); |
1377 | |||
1378 | trace_task_newtask(p, clone_flags); | ||
1379 | |||
1383 | return p; | 1380 | return p; |
1384 | 1381 | ||
1385 | bad_fork_free_pid: | 1382 | bad_fork_free_pid: |
@@ -1391,13 +1388,8 @@ bad_fork_cleanup_io: | |||
1391 | bad_fork_cleanup_namespaces: | 1388 | bad_fork_cleanup_namespaces: |
1392 | exit_task_namespaces(p); | 1389 | exit_task_namespaces(p); |
1393 | bad_fork_cleanup_mm: | 1390 | bad_fork_cleanup_mm: |
1394 | if (p->mm) { | 1391 | if (p->mm) |
1395 | task_lock(p); | ||
1396 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
1397 | atomic_dec(&p->mm->oom_disable_count); | ||
1398 | task_unlock(p); | ||
1399 | mmput(p->mm); | 1392 | mmput(p->mm); |
1400 | } | ||
1401 | bad_fork_cleanup_signal: | 1393 | bad_fork_cleanup_signal: |
1402 | if (!(clone_flags & CLONE_THREAD)) | 1394 | if (!(clone_flags & CLONE_THREAD)) |
1403 | free_signal_struct(p->signal); | 1395 | free_signal_struct(p->signal); |
@@ -1418,7 +1410,7 @@ bad_fork_cleanup_policy: | |||
1418 | bad_fork_cleanup_cgroup: | 1410 | bad_fork_cleanup_cgroup: |
1419 | #endif | 1411 | #endif |
1420 | if (clone_flags & CLONE_THREAD) | 1412 | if (clone_flags & CLONE_THREAD) |
1421 | threadgroup_fork_read_unlock(current); | 1413 | threadgroup_change_end(current); |
1422 | cgroup_exit(p, cgroup_callbacks_done); | 1414 | cgroup_exit(p, cgroup_callbacks_done); |
1423 | delayacct_tsk_free(p); | 1415 | delayacct_tsk_free(p); |
1424 | module_put(task_thread_info(p)->exec_domain->module); | 1416 | module_put(task_thread_info(p)->exec_domain->module); |
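Throughout fork.c the threadgroup_fork_read_lock()/unlock() pair becomes threadgroup_change_begin()/end(), matching the rename of the rwsem to sig->group_rwsem in copy_signal(); the bracket now documents that any thread-group change, not just fork, is serialised against cgroup attach. The call sites keep the same shape, annotated here (the down_read() mapping under CONFIG_CGROUPS is inferred from the rename, not shown in this excerpt):

if (clone_flags & CLONE_THREAD)
	threadgroup_change_begin(current);	/* presumably down_read(&current->signal->group_rwsem) */

/* ... cgroup_fork(), copy_*() and the rest of thread setup ... */

if (clone_flags & CLONE_THREAD)
	threadgroup_change_end(current);	/* paired on success and on every error path */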
diff --git a/kernel/freezer.c b/kernel/freezer.c index 7b01de98bb6a..9815b8d1eed5 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
@@ -6,104 +6,117 @@ | |||
6 | 6 | ||
7 | #include <linux/interrupt.h> | 7 | #include <linux/interrupt.h> |
8 | #include <linux/suspend.h> | 8 | #include <linux/suspend.h> |
9 | #include <linux/module.h> | 9 | #include <linux/export.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/freezer.h> | 11 | #include <linux/freezer.h> |
12 | #include <linux/kthread.h> | ||
12 | 13 | ||
13 | /* | 14 | /* total number of freezing conditions in effect */ |
14 | * freezing is complete, mark current process as frozen | 15 | atomic_t system_freezing_cnt = ATOMIC_INIT(0); |
16 | EXPORT_SYMBOL(system_freezing_cnt); | ||
17 | |||
18 | /* indicate whether PM freezing is in effect, protected by pm_mutex */ | ||
19 | bool pm_freezing; | ||
20 | bool pm_nosig_freezing; | ||
21 | |||
22 | /* protects freezing and frozen transitions */ | ||
23 | static DEFINE_SPINLOCK(freezer_lock); | ||
24 | |||
25 | /** | ||
26 | * freezing_slow_path - slow path for testing whether a task needs to be frozen | ||
27 | * @p: task to be tested | ||
28 | * | ||
29 | * This function is called by freezing() if system_freezing_cnt isn't zero | ||
30 | * and tests whether @p needs to enter and stay in frozen state. Can be | ||
31 | * called under any context. The freezers are responsible for ensuring the | ||
32 | * target tasks see the updated state. | ||
15 | */ | 33 | */ |
16 | static inline void frozen_process(void) | 34 | bool freezing_slow_path(struct task_struct *p) |
17 | { | 35 | { |
18 | if (!unlikely(current->flags & PF_NOFREEZE)) { | 36 | if (p->flags & PF_NOFREEZE) |
19 | current->flags |= PF_FROZEN; | 37 | return false; |
20 | smp_wmb(); | 38 | |
21 | } | 39 | if (pm_nosig_freezing || cgroup_freezing(p)) |
22 | clear_freeze_flag(current); | 40 | return true; |
41 | |||
42 | if (pm_freezing && !(p->flags & PF_KTHREAD)) | ||
43 | return true; | ||
44 | |||
45 | return false; | ||
23 | } | 46 | } |
47 | EXPORT_SYMBOL(freezing_slow_path); | ||
24 | 48 | ||
25 | /* Refrigerator is place where frozen processes are stored :-). */ | 49 | /* Refrigerator is place where frozen processes are stored :-). */ |
26 | void refrigerator(void) | 50 | bool __refrigerator(bool check_kthr_stop) |
27 | { | 51 | { |
28 | /* Hmm, should we be allowed to suspend when there are realtime | 52 | /* Hmm, should we be allowed to suspend when there are realtime |
29 | processes around? */ | 53 | processes around? */ |
30 | long save; | 54 | bool was_frozen = false; |
55 | long save = current->state; | ||
31 | 56 | ||
32 | task_lock(current); | ||
33 | if (freezing(current)) { | ||
34 | frozen_process(); | ||
35 | task_unlock(current); | ||
36 | } else { | ||
37 | task_unlock(current); | ||
38 | return; | ||
39 | } | ||
40 | save = current->state; | ||
41 | pr_debug("%s entered refrigerator\n", current->comm); | 57 | pr_debug("%s entered refrigerator\n", current->comm); |
42 | 58 | ||
43 | spin_lock_irq(¤t->sighand->siglock); | ||
44 | recalc_sigpending(); /* We sent fake signal, clean it up */ | ||
45 | spin_unlock_irq(¤t->sighand->siglock); | ||
46 | |||
47 | /* prevent accounting of that task to load */ | ||
48 | current->flags |= PF_FREEZING; | ||
49 | |||
50 | for (;;) { | 59 | for (;;) { |
51 | set_current_state(TASK_UNINTERRUPTIBLE); | 60 | set_current_state(TASK_UNINTERRUPTIBLE); |
52 | if (!frozen(current)) | 61 | |
62 | spin_lock_irq(&freezer_lock); | ||
63 | current->flags |= PF_FROZEN; | ||
64 | if (!freezing(current) || | ||
65 | (check_kthr_stop && kthread_should_stop())) | ||
66 | current->flags &= ~PF_FROZEN; | ||
67 | spin_unlock_irq(&freezer_lock); | ||
68 | |||
69 | if (!(current->flags & PF_FROZEN)) | ||
53 | break; | 70 | break; |
71 | was_frozen = true; | ||
54 | schedule(); | 72 | schedule(); |
55 | } | 73 | } |
56 | 74 | ||
57 | /* Remove the accounting blocker */ | ||
58 | current->flags &= ~PF_FREEZING; | ||
59 | |||
60 | pr_debug("%s left refrigerator\n", current->comm); | 75 | pr_debug("%s left refrigerator\n", current->comm); |
61 | __set_current_state(save); | 76 | |
77 | /* | ||
78 | * Restore saved task state before returning. The mb'd version | ||
79 | * needs to be used; otherwise, it might silently break | ||
80 | * synchronization which depends on ordered task state change. | ||
81 | */ | ||
82 | set_current_state(save); | ||
83 | |||
84 | return was_frozen; | ||
62 | } | 85 | } |
63 | EXPORT_SYMBOL(refrigerator); | 86 | EXPORT_SYMBOL(__refrigerator); |
64 | 87 | ||
65 | static void fake_signal_wake_up(struct task_struct *p) | 88 | static void fake_signal_wake_up(struct task_struct *p) |
66 | { | 89 | { |
67 | unsigned long flags; | 90 | unsigned long flags; |
68 | 91 | ||
69 | spin_lock_irqsave(&p->sighand->siglock, flags); | 92 | if (lock_task_sighand(p, &flags)) { |
70 | signal_wake_up(p, 0); | 93 | signal_wake_up(p, 0); |
71 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 94 | unlock_task_sighand(p, &flags); |
95 | } | ||
72 | } | 96 | } |
73 | 97 | ||
74 | /** | 98 | /** |
75 | * freeze_task - send a freeze request to given task | 99 | * freeze_task - send a freeze request to given task |
76 | * @p: task to send the request to | 100 | * @p: task to send the request to |
77 | * @sig_only: if set, the request will only be sent if the task has the | 101 | * |
78 | * PF_FREEZER_NOSIG flag unset | 102 | * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE |
79 | * Return value: 'false', if @sig_only is set and the task has | 103 | * flag and either sending a fake signal to it or waking it up, depending |
80 | * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise | 104 | * on whether it has %PF_FREEZER_NOSIG set. |
81 | * | 105 | * |
82 | * The freeze request is sent by setting the tasks's TIF_FREEZE flag and | 106 | * RETURNS: |
83 | * either sending a fake signal to it or waking it up, depending on whether | 107 | * %false, if @p is not freezing or already frozen; %true, otherwise |
84 | * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task | ||
85 | * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its | ||
86 | * TIF_FREEZE flag will not be set. | ||
87 | */ | 108 | */ |
88 | bool freeze_task(struct task_struct *p, bool sig_only) | 109 | bool freeze_task(struct task_struct *p) |
89 | { | 110 | { |
90 | /* | 111 | unsigned long flags; |
91 | * We first check if the task is freezing and next if it has already | 112 | |
92 | * been frozen to avoid the race with frozen_process() which first marks | 113 | spin_lock_irqsave(&freezer_lock, flags); |
93 | * the task as frozen and next clears its TIF_FREEZE. | 114 | if (!freezing(p) || frozen(p)) { |
94 | */ | 115 | spin_unlock_irqrestore(&freezer_lock, flags); |
95 | if (!freezing(p)) { | 116 | return false; |
96 | smp_rmb(); | ||
97 | if (frozen(p)) | ||
98 | return false; | ||
99 | |||
100 | if (!sig_only || should_send_signal(p)) | ||
101 | set_freeze_flag(p); | ||
102 | else | ||
103 | return false; | ||
104 | } | 117 | } |
105 | 118 | ||
106 | if (should_send_signal(p)) { | 119 | if (!(p->flags & PF_KTHREAD)) { |
107 | fake_signal_wake_up(p); | 120 | fake_signal_wake_up(p); |
108 | /* | 121 | /* |
109 | * fake_signal_wake_up() goes through p's scheduler | 122 | * fake_signal_wake_up() goes through p's scheduler |
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
111 | * TASK_RUNNING transition can't race with task state | 124 | * TASK_RUNNING transition can't race with task state |
112 | * testing in try_to_freeze_tasks(). | 125 | * testing in try_to_freeze_tasks(). |
113 | */ | 126 | */ |
114 | } else if (sig_only) { | ||
115 | return false; | ||
116 | } else { | 127 | } else { |
117 | wake_up_state(p, TASK_INTERRUPTIBLE); | 128 | wake_up_state(p, TASK_INTERRUPTIBLE); |
118 | } | 129 | } |
119 | 130 | ||
131 | spin_unlock_irqrestore(&freezer_lock, flags); | ||
120 | return true; | 132 | return true; |
121 | } | 133 | } |
122 | 134 | ||
123 | void cancel_freezing(struct task_struct *p) | 135 | void __thaw_task(struct task_struct *p) |
124 | { | 136 | { |
125 | unsigned long flags; | 137 | unsigned long flags; |
126 | 138 | ||
127 | if (freezing(p)) { | 139 | /* |
128 | pr_debug(" clean up: %s\n", p->comm); | 140 | * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to |
129 | clear_freeze_flag(p); | 141 | * be visible to @p as waking up implies wmb. Waking up inside |
130 | spin_lock_irqsave(&p->sighand->siglock, flags); | 142 | * freezer_lock also prevents wakeups from leaking outside |
131 | recalc_sigpending_and_wake(p); | 143 | * refrigerator. |
132 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 144 | */ |
133 | } | 145 | spin_lock_irqsave(&freezer_lock, flags); |
134 | } | 146 | if (frozen(p)) |
135 | 147 | wake_up_process(p); | |
136 | static int __thaw_process(struct task_struct *p) | 148 | spin_unlock_irqrestore(&freezer_lock, flags); |
137 | { | ||
138 | if (frozen(p)) { | ||
139 | p->flags &= ~PF_FROZEN; | ||
140 | return 1; | ||
141 | } | ||
142 | clear_freeze_flag(p); | ||
143 | return 0; | ||
144 | } | 149 | } |
145 | 150 | ||
146 | /* | 151 | /** |
147 | * Wake up a frozen process | 152 | * set_freezable - make %current freezable |
148 | * | 153 | * |
149 | * task_lock() is needed to prevent the race with refrigerator() which may | 154 | * Mark %current freezable and enter refrigerator if necessary. |
150 | * occur if the freezing of tasks fails. Namely, without the lock, if the | ||
151 | * freezing of tasks failed, thaw_tasks() might have run before a task in | ||
152 | * refrigerator() could call frozen_process(), in which case the task would be | ||
153 | * frozen and no one would thaw it. | ||
154 | */ | 155 | */ |
155 | int thaw_process(struct task_struct *p) | 156 | bool set_freezable(void) |
156 | { | 157 | { |
157 | task_lock(p); | 158 | might_sleep(); |
158 | if (__thaw_process(p) == 1) { | 159 | |
159 | task_unlock(p); | 160 | /* |
160 | wake_up_process(p); | 161 | * Modify flags while holding freezer_lock. This ensures the |
161 | return 1; | 162 | * freezer notices that we aren't frozen yet or the freezing |
162 | } | 163 | * condition is visible to try_to_freeze() below. |
163 | task_unlock(p); | 164 | */ |
164 | return 0; | 165 | spin_lock_irq(&freezer_lock); |
166 | current->flags &= ~PF_NOFREEZE; | ||
167 | spin_unlock_irq(&freezer_lock); | ||
168 | |||
169 | return try_to_freeze(); | ||
165 | } | 170 | } |
166 | EXPORT_SYMBOL(thaw_process); | 171 | EXPORT_SYMBOL(set_freezable); |
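After this rewrite, freezing is driven by system_freezing_cnt plus the per-task conditions tested in freezing_slow_path(), all transitions are serialised by freezer_lock, and kernel threads opt in through the new set_freezable() helper instead of fiddling with task flags themselves. A minimal sketch of a freezable kthread loop under those assumptions (my_thread_fn and the one-second sleep are illustrative):

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)
{
	set_freezable();	/* clears PF_NOFREEZE under freezer_lock; may freeze immediately */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* enters __refrigerator() whenever freezing(current) */
		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}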
diff --git a/kernel/futex.c b/kernel/futex.c index 11cbe052b2e8..1614be20173d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -55,7 +55,7 @@ | |||
55 | #include <linux/pagemap.h> | 55 | #include <linux/pagemap.h> |
56 | #include <linux/syscalls.h> | 56 | #include <linux/syscalls.h> |
57 | #include <linux/signal.h> | 57 | #include <linux/signal.h> |
58 | #include <linux/module.h> | 58 | #include <linux/export.h> |
59 | #include <linux/magic.h> | 59 | #include <linux/magic.h> |
60 | #include <linux/pid.h> | 60 | #include <linux/pid.h> |
61 | #include <linux/nsproxy.h> | 61 | #include <linux/nsproxy.h> |
@@ -314,17 +314,29 @@ again: | |||
314 | #endif | 314 | #endif |
315 | 315 | ||
316 | lock_page(page_head); | 316 | lock_page(page_head); |
317 | |||
318 | /* | ||
319 | * If page_head->mapping is NULL, then it cannot be a PageAnon | ||
320 | * page; but it might be the ZERO_PAGE or in the gate area or | ||
321 | * in a special mapping (all cases which we are happy to fail); | ||
322 | * or it may have been a good file page when get_user_pages_fast | ||
323 | * found it, but truncated or holepunched or subjected to | ||
324 | * invalidate_complete_page2 before we got the page lock (also | ||
325 | * cases which we are happy to fail). And we hold a reference, | ||
326 | * so refcount care in invalidate_complete_page's remove_mapping | ||
327 | * prevents drop_caches from setting mapping to NULL beneath us. | ||
328 | * | ||
329 | * The case we do have to guard against is when memory pressure made | ||
330 | * shmem_writepage move it from filecache to swapcache beneath us: | ||
331 | * an unlikely race, but we do need to retry for page_head->mapping. | ||
332 | */ | ||
317 | if (!page_head->mapping) { | 333 | if (!page_head->mapping) { |
334 | int shmem_swizzled = PageSwapCache(page_head); | ||
318 | unlock_page(page_head); | 335 | unlock_page(page_head); |
319 | put_page(page_head); | 336 | put_page(page_head); |
320 | /* | 337 | if (shmem_swizzled) |
321 | * ZERO_PAGE pages don't have a mapping. Avoid a busy loop | 338 | goto again; |
322 | * trying to find one. RW mapping would have COW'd (and thus | 339 | return -EFAULT; |
323 | * have a mapping) so this page is RO and won't ever change. | ||
324 | */ | ||
325 | if ((page_head == ZERO_PAGE(address))) | ||
326 | return -EFAULT; | ||
327 | goto again; | ||
328 | } | 340 | } |
329 | 341 | ||
330 | /* | 342 | /* |
@@ -854,7 +866,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
854 | { | 866 | { |
855 | struct task_struct *new_owner; | 867 | struct task_struct *new_owner; |
856 | struct futex_pi_state *pi_state = this->pi_state; | 868 | struct futex_pi_state *pi_state = this->pi_state; |
857 | u32 curval, newval; | 869 | u32 uninitialized_var(curval), newval; |
858 | 870 | ||
859 | if (!pi_state) | 871 | if (!pi_state) |
860 | return -EINVAL; | 872 | return -EINVAL; |
@@ -916,7 +928,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
916 | 928 | ||
917 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | 929 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) |
918 | { | 930 | { |
919 | u32 oldval; | 931 | u32 uninitialized_var(oldval); |
920 | 932 | ||
921 | /* | 933 | /* |
922 | * There is no waiter, so we unlock the futex. The owner died | 934 | * There is no waiter, so we unlock the futex. The owner died |
@@ -1576,7 +1588,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1576 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; | 1588 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
1577 | struct futex_pi_state *pi_state = q->pi_state; | 1589 | struct futex_pi_state *pi_state = q->pi_state; |
1578 | struct task_struct *oldowner = pi_state->owner; | 1590 | struct task_struct *oldowner = pi_state->owner; |
1579 | u32 uval, curval, newval; | 1591 | u32 uval, uninitialized_var(curval), newval; |
1580 | int ret; | 1592 | int ret; |
1581 | 1593 | ||
1582 | /* Owner died? */ | 1594 | /* Owner died? */ |
@@ -1793,7 +1805,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, | |||
1793 | * | 1805 | * |
1794 | * Returns: | 1806 | * Returns: |
1795 | * 0 - uaddr contains val and hb has been locked | 1807 | * 0 - uaddr contains val and hb has been locked |
1796 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked | 1808 | * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked |
1797 | */ | 1809 | */ |
1798 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | 1810 | static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, |
1799 | struct futex_q *q, struct futex_hash_bucket **hb) | 1811 | struct futex_q *q, struct futex_hash_bucket **hb) |
@@ -2481,7 +2493,7 @@ err_unlock: | |||
2481 | */ | 2493 | */ |
2482 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) | 2494 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) |
2483 | { | 2495 | { |
2484 | u32 uval, nval, mval; | 2496 | u32 uval, uninitialized_var(nval), mval; |
2485 | 2497 | ||
2486 | retry: | 2498 | retry: |
2487 | if (get_user(uval, uaddr)) | 2499 | if (get_user(uval, uaddr)) |
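Several futex locals gain uninitialized_var(): the values are always written before use, but gcc cannot always prove that across the fault/retry paths, so the annotation silences the false positive without changing behaviour. A tiny hedged sketch of where the annotation sits (demo_read() is illustrative, not futex code; the macro is roughly x = x in the compiler headers of this era):

#include <linux/types.h>
#include <linux/uaccess.h>

static int demo_read(u32 __user *uaddr)
{
	u32 uninitialized_var(val);	/* always written by get_user() before any read below */

	if (get_user(val, uaddr))
		return -EFAULT;		/* val is never consumed on this path */
	return val != 0;
}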
diff --git a/kernel/groups.c b/kernel/groups.c index 1cc476d52dd3..99b53d1eb7ea 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
@@ -2,7 +2,7 @@ | |||
2 | * Supplementary group IDs | 2 | * Supplementary group IDs |
3 | */ | 3 | */ |
4 | #include <linux/cred.h> | 4 | #include <linux/cred.h> |
5 | #include <linux/module.h> | 5 | #include <linux/export.h> |
6 | #include <linux/slab.h> | 6 | #include <linux/slab.h> |
7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
8 | #include <linux/syscalls.h> | 8 | #include <linux/syscalls.h> |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a9205e32a059..ae34bf51682b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -32,7 +32,7 @@ | |||
32 | */ | 32 | */ |
33 | 33 | ||
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/module.h> | 35 | #include <linux/export.h> |
36 | #include <linux/percpu.h> | 36 | #include <linux/percpu.h> |
37 | #include <linux/hrtimer.h> | 37 | #include <linux/hrtimer.h> |
38 | #include <linux/notifier.h> | 38 | #include <linux/notifier.h> |
@@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
885 | struct hrtimer_clock_base *base, | 885 | struct hrtimer_clock_base *base, |
886 | unsigned long newstate, int reprogram) | 886 | unsigned long newstate, int reprogram) |
887 | { | 887 | { |
888 | struct timerqueue_node *next_timer; | ||
888 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 889 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
889 | goto out; | 890 | goto out; |
890 | 891 | ||
891 | if (&timer->node == timerqueue_getnext(&base->active)) { | 892 | next_timer = timerqueue_getnext(&base->active); |
893 | timerqueue_del(&base->active, &timer->node); | ||
894 | if (&timer->node == next_timer) { | ||
892 | #ifdef CONFIG_HIGH_RES_TIMERS | 895 | #ifdef CONFIG_HIGH_RES_TIMERS |
893 | /* Reprogram the clock event device. if enabled */ | 896 | /* Reprogram the clock event device. if enabled */ |
894 | if (reprogram && hrtimer_hres_active()) { | 897 | if (reprogram && hrtimer_hres_active()) { |
@@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
901 | } | 904 | } |
902 | #endif | 905 | #endif |
903 | } | 906 | } |
904 | timerqueue_del(&base->active, &timer->node); | ||
905 | if (!timerqueue_getnext(&base->active)) | 907 | if (!timerqueue_getnext(&base->active)) |
906 | base->cpu_base->active_bases &= ~(1 << base->index); | 908 | base->cpu_base->active_bases &= ~(1 << base->index); |
907 | out: | 909 | out: |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ea640120ab86..2e48ec0c2e91 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
@@ -13,7 +13,7 @@ | |||
13 | #include <linux/freezer.h> | 13 | #include <linux/freezer.h> |
14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
15 | #include <linux/lockdep.h> | 15 | #include <linux/lockdep.h> |
16 | #include <linux/module.h> | 16 | #include <linux/export.h> |
17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
18 | 18 | ||
19 | /* | 19 | /* |
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) | |||
74 | 74 | ||
75 | /* | 75 | /* |
76 | * Ensure the task is not frozen. | 76 | * Ensure the task is not frozen. |
77 | * Also, when a freshly created task is scheduled once, changes | 77 | * Also, skip vfork and any other user process that freezer should skip. |
78 | * its state to TASK_UNINTERRUPTIBLE without having ever been | ||
79 | * switched out once, it musn't be checked. | ||
80 | */ | 78 | */ |
81 | if (unlikely(t->flags & PF_FROZEN || !switch_count)) | 79 | if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) |
80 | return; | ||
81 | |||
82 | /* | ||
83 | * When a freshly created task is scheduled once, changes its state to | ||
84 | * TASK_UNINTERRUPTIBLE without having ever been switched out once, it | ||
85 | * musn't be checked. | ||
86 | */ | ||
87 | if (unlikely(!switch_count)) | ||
82 | return; | 88 | return; |
83 | 89 | ||
84 | if (switch_count != t->last_switch_count) { | 90 | if (switch_count != t->last_switch_count) { |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc5114b4c16c..f7c543a801d9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -26,7 +26,7 @@ | |||
26 | int irq_set_chip(unsigned int irq, struct irq_chip *chip) | 26 | int irq_set_chip(unsigned int irq, struct irq_chip *chip) |
27 | { | 27 | { |
28 | unsigned long flags; | 28 | unsigned long flags; |
29 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | 29 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); |
30 | 30 | ||
31 | if (!desc) | 31 | if (!desc) |
32 | return -EINVAL; | 32 | return -EINVAL; |
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip); | |||
54 | int irq_set_irq_type(unsigned int irq, unsigned int type) | 54 | int irq_set_irq_type(unsigned int irq, unsigned int type) |
55 | { | 55 | { |
56 | unsigned long flags; | 56 | unsigned long flags; |
57 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | 57 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); |
58 | int ret = 0; | 58 | int ret = 0; |
59 | 59 | ||
60 | if (!desc) | 60 | if (!desc) |
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type); | |||
78 | int irq_set_handler_data(unsigned int irq, void *data) | 78 | int irq_set_handler_data(unsigned int irq, void *data) |
79 | { | 79 | { |
80 | unsigned long flags; | 80 | unsigned long flags; |
81 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | 81 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); |
82 | 82 | ||
83 | if (!desc) | 83 | if (!desc) |
84 | return -EINVAL; | 84 | return -EINVAL; |
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data); | |||
98 | int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) | 98 | int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) |
99 | { | 99 | { |
100 | unsigned long flags; | 100 | unsigned long flags; |
101 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | 101 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); |
102 | 102 | ||
103 | if (!desc) | 103 | if (!desc) |
104 | return -EINVAL; | 104 | return -EINVAL; |
@@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) | |||
119 | int irq_set_chip_data(unsigned int irq, void *data) | 119 | int irq_set_chip_data(unsigned int irq, void *data) |
120 | { | 120 | { |
121 | unsigned long flags; | 121 | unsigned long flags; |
122 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | 122 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); |
123 | 123 | ||
124 | if (!desc) | 124 | if (!desc) |
125 | return -EINVAL; | 125 | return -EINVAL; |
@@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc) | |||
204 | } | 204 | } |
205 | } | 205 | } |
206 | 206 | ||
207 | void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) | ||
208 | { | ||
209 | if (desc->irq_data.chip->irq_enable) | ||
210 | desc->irq_data.chip->irq_enable(&desc->irq_data); | ||
211 | else | ||
212 | desc->irq_data.chip->irq_unmask(&desc->irq_data); | ||
213 | cpumask_set_cpu(cpu, desc->percpu_enabled); | ||
214 | } | ||
215 | |||
216 | void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) | ||
217 | { | ||
218 | if (desc->irq_data.chip->irq_disable) | ||
219 | desc->irq_data.chip->irq_disable(&desc->irq_data); | ||
220 | else | ||
221 | desc->irq_data.chip->irq_mask(&desc->irq_data); | ||
222 | cpumask_clear_cpu(cpu, desc->percpu_enabled); | ||
223 | } | ||
224 | |||
207 | static inline void mask_ack_irq(struct irq_desc *desc) | 225 | static inline void mask_ack_irq(struct irq_desc *desc) |
208 | { | 226 | { |
209 | if (desc->irq_data.chip->irq_mask_ack) | 227 | if (desc->irq_data.chip->irq_mask_ack) |
@@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) | |||
544 | chip->irq_eoi(&desc->irq_data); | 562 | chip->irq_eoi(&desc->irq_data); |
545 | } | 563 | } |
546 | 564 | ||
565 | /** | ||
566 | * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids | ||
567 | * @irq: the interrupt number | ||
568 | * @desc: the interrupt description structure for this irq | ||
569 | * | ||
570 | * Per CPU interrupts on SMP machines without locking requirements. Same as | ||
571 | * handle_percpu_irq() above but with the following extras: | ||
572 | * | ||
573 | * action->percpu_dev_id is a pointer to percpu variables which | ||
574 | * contain the real device id for the cpu on which this handler is | ||
575 | * called | ||
576 | */ | ||
577 | void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) | ||
578 | { | ||
579 | struct irq_chip *chip = irq_desc_get_chip(desc); | ||
580 | struct irqaction *action = desc->action; | ||
581 | void *dev_id = __this_cpu_ptr(action->percpu_dev_id); | ||
582 | irqreturn_t res; | ||
583 | |||
584 | kstat_incr_irqs_this_cpu(irq, desc); | ||
585 | |||
586 | if (chip->irq_ack) | ||
587 | chip->irq_ack(&desc->irq_data); | ||
588 | |||
589 | trace_irq_handler_entry(irq, action); | ||
590 | res = action->handler(irq, dev_id); | ||
591 | trace_irq_handler_exit(irq, action, res); | ||
592 | |||
593 | if (chip->irq_eoi) | ||
594 | chip->irq_eoi(&desc->irq_data); | ||
595 | } | ||
596 | |||
547 | void | 597 | void |
548 | __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | 598 | __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, |
549 | const char *name) | 599 | const char *name) |
550 | { | 600 | { |
551 | unsigned long flags; | 601 | unsigned long flags; |
552 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | 602 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); |
553 | 603 | ||
554 | if (!desc) | 604 | if (!desc) |
555 | return; | 605 | return; |
@@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | |||
593 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | 643 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
594 | { | 644 | { |
595 | unsigned long flags; | 645 | unsigned long flags; |
596 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | 646 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); |
597 | 647 | ||
598 | if (!desc) | 648 | if (!desc) |
599 | return; | 649 | return; |
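handle_percpu_devid_irq() hands each CPU its own slice of action->percpu_dev_id, and enable/disable become per-CPU operations (see the enable_percpu_irq()/disable_percpu_irq() additions in manage.c below). A hedged sketch of how a driver is expected to use this, assuming the request_percpu_irq() registration helper from the same series (only the free and enable paths appear in this excerpt); names are illustrative:

#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/percpu.h>

struct my_pcpu { int count; };			/* hypothetical per-CPU device state */
static DEFINE_PER_CPU(struct my_pcpu, my_pcpu_state);

static irqreturn_t my_percpu_handler(int irq, void *dev_id)
{
	struct my_pcpu *p = dev_id;		/* already this CPU's instance */

	p->count++;
	return IRQ_HANDLED;
}

static int my_percpu_setup(unsigned int irq)
{
	int err;

	err = irq_set_percpu_devid(irq);	/* mark the descriptor per-CPU dev-id */
	if (err)
		return err;

	err = request_percpu_irq(irq, my_percpu_handler, "my-percpu", &my_pcpu_state);
	if (err)
		return err;

	/* each CPU enables its own copy, typically from a CPU-up notifier */
	enable_percpu_irq(irq, IRQ_TYPE_NONE);
	return 0;
}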
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index e38544dddb18..c89295a8f668 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/io.h> | 6 | #include <linux/io.h> |
7 | #include <linux/irq.h> | 7 | #include <linux/irq.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/export.h> | ||
9 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
10 | #include <linux/kernel_stat.h> | 11 | #include <linux/kernel_stat.h> |
11 | #include <linux/syscore_ops.h> | 12 | #include <linux/syscore_ops.h> |
@@ -211,6 +212,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, | |||
211 | } | 212 | } |
212 | return gc; | 213 | return gc; |
213 | } | 214 | } |
215 | EXPORT_SYMBOL_GPL(irq_alloc_generic_chip); | ||
214 | 216 | ||
215 | /* | 217 | /* |
216 | * Separate lockdep class for interrupt chip which can nest irq_desc | 218 | * Separate lockdep class for interrupt chip which can nest irq_desc |
@@ -258,6 +260,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
258 | } | 260 | } |
259 | gc->irq_cnt = i - gc->irq_base; | 261 | gc->irq_cnt = i - gc->irq_base; |
260 | } | 262 | } |
263 | EXPORT_SYMBOL_GPL(irq_setup_generic_chip); | ||
261 | 264 | ||
262 | /** | 265 | /** |
263 | * irq_setup_alt_chip - Switch to alternative chip | 266 | * irq_setup_alt_chip - Switch to alternative chip |
@@ -281,6 +284,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type) | |||
281 | } | 284 | } |
282 | return -EINVAL; | 285 | return -EINVAL; |
283 | } | 286 | } |
287 | EXPORT_SYMBOL_GPL(irq_setup_alt_chip); | ||
284 | 288 | ||
285 | /** | 289 | /** |
286 | * irq_remove_generic_chip - Remove a chip | 290 | * irq_remove_generic_chip - Remove a chip |
@@ -311,6 +315,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, | |||
311 | irq_modify_status(i, clr, set); | 315 | irq_modify_status(i, clr, set); |
312 | } | 316 | } |
313 | } | 317 | } |
318 | EXPORT_SYMBOL_GPL(irq_remove_generic_chip); | ||
314 | 319 | ||
315 | #ifdef CONFIG_PM | 320 | #ifdef CONFIG_PM |
316 | static int irq_gc_suspend(void) | 321 | static int irq_gc_suspend(void) |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6546431447d7..b7952316016a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
@@ -15,7 +15,7 @@ | |||
15 | 15 | ||
16 | #define istate core_internal_state__do_not_mess_with_it | 16 | #define istate core_internal_state__do_not_mess_with_it |
17 | 17 | ||
18 | extern int noirqdebug; | 18 | extern bool noirqdebug; |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * Bits used by threaded handlers: | 21 | * Bits used by threaded handlers: |
@@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc); | |||
71 | extern void irq_shutdown(struct irq_desc *desc); | 71 | extern void irq_shutdown(struct irq_desc *desc); |
72 | extern void irq_enable(struct irq_desc *desc); | 72 | extern void irq_enable(struct irq_desc *desc); |
73 | extern void irq_disable(struct irq_desc *desc); | 73 | extern void irq_disable(struct irq_desc *desc); |
74 | extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); | ||
75 | extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); | ||
74 | extern void mask_irq(struct irq_desc *desc); | 76 | extern void mask_irq(struct irq_desc *desc); |
75 | extern void unmask_irq(struct irq_desc *desc); | 77 | extern void unmask_irq(struct irq_desc *desc); |
76 | 78 | ||
@@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) | |||
114 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); | 116 | desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); |
115 | } | 117 | } |
116 | 118 | ||
119 | #define _IRQ_DESC_CHECK (1 << 0) | ||
120 | #define _IRQ_DESC_PERCPU (1 << 1) | ||
121 | |||
122 | #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) | ||
123 | #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) | ||
124 | |||
117 | struct irq_desc * | 125 | struct irq_desc * |
118 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); | 126 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, |
127 | unsigned int check); | ||
119 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); | 128 | void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); |
120 | 129 | ||
121 | static inline struct irq_desc * | 130 | static inline struct irq_desc * |
122 | irq_get_desc_buslock(unsigned int irq, unsigned long *flags) | 131 | irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) |
123 | { | 132 | { |
124 | return __irq_get_desc_lock(irq, flags, true); | 133 | return __irq_get_desc_lock(irq, flags, true, check); |
125 | } | 134 | } |
126 | 135 | ||
127 | static inline void | 136 | static inline void |
@@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) | |||
131 | } | 140 | } |
132 | 141 | ||
133 | static inline struct irq_desc * | 142 | static inline struct irq_desc * |
134 | irq_get_desc_lock(unsigned int irq, unsigned long *flags) | 143 | irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) |
135 | { | 144 | { |
136 | return __irq_get_desc_lock(irq, flags, false); | 145 | return __irq_get_desc_lock(irq, flags, false, check); |
137 | } | 146 | } |
138 | 147 | ||
139 | static inline void | 148 | static inline void |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 039b889ea053..d86e254b95eb 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
@@ -9,7 +9,7 @@ | |||
9 | */ | 9 | */ |
10 | #include <linux/irq.h> | 10 | #include <linux/irq.h> |
11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
12 | #include <linux/module.h> | 12 | #include <linux/export.h> |
13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
15 | #include <linux/radix-tree.h> | 15 | #include <linux/radix-tree.h> |
@@ -424,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset) | |||
424 | } | 424 | } |
425 | 425 | ||
426 | struct irq_desc * | 426 | struct irq_desc * |
427 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) | 427 | __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, |
428 | unsigned int check) | ||
428 | { | 429 | { |
429 | struct irq_desc *desc = irq_to_desc(irq); | 430 | struct irq_desc *desc = irq_to_desc(irq); |
430 | 431 | ||
431 | if (desc) { | 432 | if (desc) { |
433 | if (check & _IRQ_DESC_CHECK) { | ||
434 | if ((check & _IRQ_DESC_PERCPU) && | ||
435 | !irq_settings_is_per_cpu_devid(desc)) | ||
436 | return NULL; | ||
437 | |||
438 | if (!(check & _IRQ_DESC_PERCPU) && | ||
439 | irq_settings_is_per_cpu_devid(desc)) | ||
440 | return NULL; | ||
441 | } | ||
442 | |||
432 | if (bus) | 443 | if (bus) |
433 | chip_bus_lock(desc); | 444 | chip_bus_lock(desc); |
434 | raw_spin_lock_irqsave(&desc->lock, *flags); | 445 | raw_spin_lock_irqsave(&desc->lock, *flags); |
@@ -443,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) | |||
443 | chip_bus_sync_unlock(desc); | 454 | chip_bus_sync_unlock(desc); |
444 | } | 455 | } |
445 | 456 | ||
457 | int irq_set_percpu_devid(unsigned int irq) | ||
458 | { | ||
459 | struct irq_desc *desc = irq_to_desc(irq); | ||
460 | |||
461 | if (!desc) | ||
462 | return -EINVAL; | ||
463 | |||
464 | if (desc->percpu_enabled) | ||
465 | return -EINVAL; | ||
466 | |||
467 | desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); | ||
468 | |||
469 | if (!desc->percpu_enabled) | ||
470 | return -ENOMEM; | ||
471 | |||
472 | irq_set_percpu_devid_flags(irq); | ||
473 | return 0; | ||
474 | } | ||
475 | |||
446 | /** | 476 | /** |
447 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq | 477 | * dynamic_irq_cleanup - cleanup a dynamically allocated irq |
448 | * @irq: irq number to initialize | 478 | * @irq: irq number to initialize |
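irq_set_percpu_devid() sizes the enable-mask allocation from the pointee, kzalloc(sizeof(*desc->percpu_enabled), ...), rather than naming the type. A tiny sketch of that idiom with a hypothetical structure:

#include <linux/cpumask.h>
#include <linux/slab.h>

struct my_state { struct cpumask *enabled; };	/* hypothetical */

static int my_state_init(struct my_state *s)
{
	/* sizeof(*s->enabled) stays correct if the pointee type ever changes;
	 * kzalloc returns an all-zero (empty) mask */
	s->enabled = kzalloc(sizeof(*s->enabled), GFP_KERNEL);
	return s->enabled ? 0 : -ENOMEM;
}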
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index b57a3776de44..1f9e26526b69 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
@@ -20,15 +20,15 @@ static DEFINE_MUTEX(irq_domain_mutex); | |||
20 | void irq_domain_add(struct irq_domain *domain) | 20 | void irq_domain_add(struct irq_domain *domain) |
21 | { | 21 | { |
22 | struct irq_data *d; | 22 | struct irq_data *d; |
23 | int hwirq; | 23 | int hwirq, irq; |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * This assumes that the irq_domain owner has already allocated | 26 | * This assumes that the irq_domain owner has already allocated |
27 | * the irq_descs. This block will be removed when support for dynamic | 27 | * the irq_descs. This block will be removed when support for dynamic |
28 | * allocation of irq_descs is added to irq_domain. | 28 | * allocation of irq_descs is added to irq_domain. |
29 | */ | 29 | */ |
30 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { | 30 | irq_domain_for_each_irq(domain, hwirq, irq) { |
31 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); | 31 | d = irq_get_irq_data(irq); |
32 | if (!d) { | 32 | if (!d) { |
33 | WARN(1, "error: assigning domain to non existant irq_desc"); | 33 | WARN(1, "error: assigning domain to non existant irq_desc"); |
34 | return; | 34 | return; |
@@ -54,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain) | |||
54 | void irq_domain_del(struct irq_domain *domain) | 54 | void irq_domain_del(struct irq_domain *domain) |
55 | { | 55 | { |
56 | struct irq_data *d; | 56 | struct irq_data *d; |
57 | int hwirq; | 57 | int hwirq, irq; |
58 | 58 | ||
59 | mutex_lock(&irq_domain_mutex); | 59 | mutex_lock(&irq_domain_mutex); |
60 | list_del(&domain->list); | 60 | list_del(&domain->list); |
61 | mutex_unlock(&irq_domain_mutex); | 61 | mutex_unlock(&irq_domain_mutex); |
62 | 62 | ||
63 | /* Clear the irq_domain assignments */ | 63 | /* Clear the irq_domain assignments */ |
64 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { | 64 | irq_domain_for_each_irq(domain, hwirq, irq) { |
65 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); | 65 | d = irq_get_irq_data(irq); |
66 | d->domain = NULL; | 66 | d->domain = NULL; |
67 | } | 67 | } |
68 | } | 68 | } |
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, | |||
135 | return -EINVAL; | 135 | return -EINVAL; |
136 | if (intsize < 1) | 136 | if (intsize < 1) |
137 | return -EINVAL; | 137 | return -EINVAL; |
138 | if (d->nr_irq && ((intspec[0] < d->hwirq_base) || | ||
139 | (intspec[0] >= d->hwirq_base + d->nr_irq))) | ||
140 | return -EINVAL; | ||
138 | 141 | ||
139 | *out_hwirq = intspec[0]; | 142 | *out_hwirq = intspec[0]; |
140 | *out_type = IRQ_TYPE_NONE; | 143 | *out_type = IRQ_TYPE_NONE; |
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d, | |||
143 | return 0; | 146 | return 0; |
144 | } | 147 | } |
145 | 148 | ||
146 | struct irq_domain_ops irq_domain_simple_ops = { | ||
147 | .dt_translate = irq_domain_simple_dt_translate, | ||
148 | }; | ||
149 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
150 | |||
151 | /** | 149 | /** |
152 | * irq_domain_create_simple() - Set up a 'simple' translation range | 150 | * irq_domain_create_simple() - Set up a 'simple' translation range |
153 | */ | 151 | */ |
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match, | |||
182 | } | 180 | } |
183 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); | 181 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); |
184 | #endif /* CONFIG_OF_IRQ */ | 182 | #endif /* CONFIG_OF_IRQ */ |
183 | |||
184 | struct irq_domain_ops irq_domain_simple_ops = { | ||
185 | #ifdef CONFIG_OF_IRQ | ||
186 | .dt_translate = irq_domain_simple_dt_translate, | ||
187 | #endif /* CONFIG_OF_IRQ */ | ||
188 | }; | ||
189 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
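irq_domain_simple_dt_translate() now rejects specifiers outside the domain's hwirq window, and irq_domain_simple_ops moves out of the CONFIG_OF_IRQ block so non-DT builds still get the symbol. The new guard, annotated with example numbers (illustrative only):

/* with, say, hwirq_base = 32 and nr_irq = 96 the domain covers hwirqs
 * 32..127, so intspec[0] = 16 or 160 is rejected here */
if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
		  (intspec[0] >= d->hwirq_base + d->nr_irq)))
	return -EINVAL;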
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9b956fa20308..a9a9dbe49fea 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) | |||
195 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) | 195 | int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) |
196 | { | 196 | { |
197 | unsigned long flags; | 197 | unsigned long flags; |
198 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | 198 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); |
199 | 199 | ||
200 | if (!desc) | 200 | if (!desc) |
201 | return -EINVAL; | 201 | return -EINVAL; |
@@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) | |||
356 | static int __disable_irq_nosync(unsigned int irq) | 356 | static int __disable_irq_nosync(unsigned int irq) |
357 | { | 357 | { |
358 | unsigned long flags; | 358 | unsigned long flags; |
359 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | 359 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); |
360 | 360 | ||
361 | if (!desc) | 361 | if (!desc) |
362 | return -EINVAL; | 362 | return -EINVAL; |
@@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) | |||
448 | void enable_irq(unsigned int irq) | 448 | void enable_irq(unsigned int irq) |
449 | { | 449 | { |
450 | unsigned long flags; | 450 | unsigned long flags; |
451 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | 451 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); |
452 | 452 | ||
453 | if (!desc) | 453 | if (!desc) |
454 | return; | 454 | return; |
@@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
467 | struct irq_desc *desc = irq_to_desc(irq); | 467 | struct irq_desc *desc = irq_to_desc(irq); |
468 | int ret = -ENXIO; | 468 | int ret = -ENXIO; |
469 | 469 | ||
470 | if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE) | ||
471 | return 0; | ||
472 | |||
470 | if (desc->irq_data.chip->irq_set_wake) | 473 | if (desc->irq_data.chip->irq_set_wake) |
471 | ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); | 474 | ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); |
472 | 475 | ||
@@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) | |||
488 | int irq_set_irq_wake(unsigned int irq, unsigned int on) | 491 | int irq_set_irq_wake(unsigned int irq, unsigned int on) |
489 | { | 492 | { |
490 | unsigned long flags; | 493 | unsigned long flags; |
491 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | 494 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); |
492 | int ret = 0; | 495 | int ret = 0; |
493 | 496 | ||
494 | if (!desc) | 497 | if (!desc) |
@@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake); | |||
529 | int can_request_irq(unsigned int irq, unsigned long irqflags) | 532 | int can_request_irq(unsigned int irq, unsigned long irqflags) |
530 | { | 533 | { |
531 | unsigned long flags; | 534 | unsigned long flags; |
532 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags); | 535 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); |
533 | int canrequest = 0; | 536 | int canrequest = 0; |
534 | 537 | ||
535 | if (!desc) | 538 | if (!desc) |
@@ -620,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) | |||
620 | 623 | ||
621 | static int irq_wait_for_interrupt(struct irqaction *action) | 624 | static int irq_wait_for_interrupt(struct irqaction *action) |
622 | { | 625 | { |
626 | set_current_state(TASK_INTERRUPTIBLE); | ||
627 | |||
623 | while (!kthread_should_stop()) { | 628 | while (!kthread_should_stop()) { |
624 | set_current_state(TASK_INTERRUPTIBLE); | ||
625 | 629 | ||
626 | if (test_and_clear_bit(IRQTF_RUNTHREAD, | 630 | if (test_and_clear_bit(IRQTF_RUNTHREAD, |
627 | &action->thread_flags)) { | 631 | &action->thread_flags)) { |
@@ -629,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action) | |||
629 | return 0; | 633 | return 0; |
630 | } | 634 | } |
631 | schedule(); | 635 | schedule(); |
636 | set_current_state(TASK_INTERRUPTIBLE); | ||
632 | } | 637 | } |
638 | __set_current_state(TASK_RUNNING); | ||
633 | return -1; | 639 | return -1; |
634 | } | 640 | } |
635 | 641 | ||
@@ -1118,6 +1124,8 @@ int setup_irq(unsigned int irq, struct irqaction *act) | |||
1118 | int retval; | 1124 | int retval; |
1119 | struct irq_desc *desc = irq_to_desc(irq); | 1125 | struct irq_desc *desc = irq_to_desc(irq); |
1120 | 1126 | ||
1127 | if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) | ||
1128 | return -EINVAL; | ||
1121 | chip_bus_lock(desc); | 1129 | chip_bus_lock(desc); |
1122 | retval = __setup_irq(irq, desc, act); | 1130 | retval = __setup_irq(irq, desc, act); |
1123 | chip_bus_sync_unlock(desc); | 1131 | chip_bus_sync_unlock(desc); |
@@ -1126,7 +1134,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) | |||
1126 | } | 1134 | } |
1127 | EXPORT_SYMBOL_GPL(setup_irq); | 1135 | EXPORT_SYMBOL_GPL(setup_irq); |
1128 | 1136 | ||
1129 | /* | 1137 | /* |
1130 | * Internal function to unregister an irqaction - used to free | 1138 | * Internal function to unregister an irqaction - used to free |
1131 | * regular and special interrupts that are part of the architecture. | 1139 | * regular and special interrupts that are part of the architecture. |
1132 | */ | 1140 | */ |
@@ -1224,7 +1232,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) | |||
1224 | */ | 1232 | */ |
1225 | void remove_irq(unsigned int irq, struct irqaction *act) | 1233 | void remove_irq(unsigned int irq, struct irqaction *act) |
1226 | { | 1234 | { |
1227 | __free_irq(irq, act->dev_id); | 1235 | struct irq_desc *desc = irq_to_desc(irq); |
1236 | |||
1237 | if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) | ||
1238 | __free_irq(irq, act->dev_id); | ||
1228 | } | 1239 | } |
1229 | EXPORT_SYMBOL_GPL(remove_irq); | 1240 | EXPORT_SYMBOL_GPL(remove_irq); |
1230 | 1241 | ||
@@ -1246,7 +1257,7 @@ void free_irq(unsigned int irq, void *dev_id) | |||
1246 | { | 1257 | { |
1247 | struct irq_desc *desc = irq_to_desc(irq); | 1258 | struct irq_desc *desc = irq_to_desc(irq); |
1248 | 1259 | ||
1249 | if (!desc) | 1260 | if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) |
1250 | return; | 1261 | return; |
1251 | 1262 | ||
1252 | #ifdef CONFIG_SMP | 1263 | #ifdef CONFIG_SMP |
@@ -1281,7 +1292,7 @@ EXPORT_SYMBOL(free_irq); | |||
1281 | * and to set up the interrupt handler in the right order. | 1292 | * and to set up the interrupt handler in the right order. |
1282 | * | 1293 | * |
1283 | * If you want to set up a threaded irq handler for your device | 1294 | * If you want to set up a threaded irq handler for your device |
1284 | * then you need to supply @handler and @thread_fn. @handler ist | 1295 | * then you need to supply @handler and @thread_fn. @handler is |
1285 | * still called in hard interrupt context and has to check | 1296 | * still called in hard interrupt context and has to check |
1286 | * whether the interrupt originates from the device. If yes it | 1297 | * whether the interrupt originates from the device. If yes it |
1287 | * needs to disable the interrupt on the device and return | 1298 | * needs to disable the interrupt on the device and return |
@@ -1324,7 +1335,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, | |||
1324 | if (!desc) | 1335 | if (!desc) |
1325 | return -EINVAL; | 1336 | return -EINVAL; |
1326 | 1337 | ||
1327 | if (!irq_settings_can_request(desc)) | 1338 | if (!irq_settings_can_request(desc) || |
1339 | WARN_ON(irq_settings_is_per_cpu_devid(desc))) | ||
1328 | return -EINVAL; | 1340 | return -EINVAL; |
1329 | 1341 | ||
1330 | if (!handler) { | 1342 | if (!handler) { |
@@ -1409,3 +1421,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, | |||
1409 | return !ret ? IRQC_IS_HARDIRQ : ret; | 1421 | return !ret ? IRQC_IS_HARDIRQ : ret; |
1410 | } | 1422 | } |
1411 | EXPORT_SYMBOL_GPL(request_any_context_irq); | 1423 | EXPORT_SYMBOL_GPL(request_any_context_irq); |
1424 | |||
1425 | void enable_percpu_irq(unsigned int irq, unsigned int type) | ||
1426 | { | ||
1427 | unsigned int cpu = smp_processor_id(); | ||
1428 | unsigned long flags; | ||
1429 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); | ||
1430 | |||
1431 | if (!desc) | ||
1432 | return; | ||
1433 | |||
1434 | type &= IRQ_TYPE_SENSE_MASK; | ||
1435 | if (type != IRQ_TYPE_NONE) { | ||
1436 | int ret; | ||
1437 | |||
1438 | ret = __irq_set_trigger(desc, irq, type); | ||
1439 | |||
1440 | if (ret) { | ||
1441 | WARN(1, "failed to set type for IRQ%d\n", irq); | ||
1442 | goto out; | ||
1443 | } | ||
1444 | } | ||
1445 | |||
1446 | irq_percpu_enable(desc, cpu); | ||
1447 | out: | ||
1448 | irq_put_desc_unlock(desc, flags); | ||
1449 | } | ||
1450 | |||
1451 | void disable_percpu_irq(unsigned int irq) | ||
1452 | { | ||
1453 | unsigned int cpu = smp_processor_id(); | ||
1454 | unsigned long flags; | ||
1455 | struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); | ||
1456 | |||
1457 | if (!desc) | ||
1458 | return; | ||
1459 | |||
1460 | irq_percpu_disable(desc, cpu); | ||
1461 | irq_put_desc_unlock(desc, flags); | ||
1462 | } | ||
1463 | |||
1464 | /* | ||
1465 | * Internal function to unregister a percpu irqaction. | ||
1466 | */ | ||
1467 | static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) | ||
1468 | { | ||
1469 | struct irq_desc *desc = irq_to_desc(irq); | ||
1470 | struct irqaction *action; | ||
1471 | unsigned long flags; | ||
1472 | |||
1473 | WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); | ||
1474 | |||
1475 | if (!desc) | ||
1476 | return NULL; | ||
1477 | |||
1478 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
1479 | |||
1480 | action = desc->action; | ||
1481 | if (!action || action->percpu_dev_id != dev_id) { | ||
1482 | WARN(1, "Trying to free already-free IRQ %d\n", irq); | ||
1483 | goto bad; | ||
1484 | } | ||
1485 | |||
1486 | if (!cpumask_empty(desc->percpu_enabled)) { | ||
1487 | WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", | ||
1488 | irq, cpumask_first(desc->percpu_enabled)); | ||
1489 | goto bad; | ||
1490 | } | ||
1491 | |||
1492 | /* Found it - now remove it from the list of entries: */ | ||
1493 | desc->action = NULL; | ||
1494 | |||
1495 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
1496 | |||
1497 | unregister_handler_proc(irq, action); | ||
1498 | |||
1499 | module_put(desc->owner); | ||
1500 | return action; | ||
1501 | |||
1502 | bad: | ||
1503 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
1504 | return NULL; | ||
1505 | } | ||
1506 | |||
1507 | /** | ||
1508 | * remove_percpu_irq - free a per-cpu interrupt | ||
1509 | * @irq: Interrupt line to free | ||
1510 | * @act: irqaction for the interrupt | ||
1511 | * | ||
1512 | * Used to remove interrupts statically setup by the early boot process. | ||
1513 | */ | ||
1514 | void remove_percpu_irq(unsigned int irq, struct irqaction *act) | ||
1515 | { | ||
1516 | struct irq_desc *desc = irq_to_desc(irq); | ||
1517 | |||
1518 | if (desc && irq_settings_is_per_cpu_devid(desc)) | ||
1519 | __free_percpu_irq(irq, act->percpu_dev_id); | ||
1520 | } | ||
1521 | |||
1522 | /** | ||
1523 | * free_percpu_irq - free an interrupt allocated with request_percpu_irq | ||
1524 | * @irq: Interrupt line to free | ||
1525 | * @dev_id: Device identity to free | ||
1526 | * | ||
1527 | * Remove a percpu interrupt handler. The handler is removed, but | ||
1528 | * the interrupt line is not disabled. This must be done on each | ||
1529 | * CPU before calling this function. The function does not return | ||
1530 | * until any executing interrupts for this IRQ have completed. | ||
1531 | * | ||
1532 | * This function must not be called from interrupt context. | ||
1533 | */ | ||
1534 | void free_percpu_irq(unsigned int irq, void __percpu *dev_id) | ||
1535 | { | ||
1536 | struct irq_desc *desc = irq_to_desc(irq); | ||
1537 | |||
1538 | if (!desc || !irq_settings_is_per_cpu_devid(desc)) | ||
1539 | return; | ||
1540 | |||
1541 | chip_bus_lock(desc); | ||
1542 | kfree(__free_percpu_irq(irq, dev_id)); | ||
1543 | chip_bus_sync_unlock(desc); | ||
1544 | } | ||
1545 | |||
1546 | /** | ||
1547 | * setup_percpu_irq - setup a per-cpu interrupt | ||
1548 | * @irq: Interrupt line to setup | ||
1549 | * @act: irqaction for the interrupt | ||
1550 | * | ||
1551 | * Used to statically setup per-cpu interrupts in the early boot process. | ||
1552 | */ | ||
1553 | int setup_percpu_irq(unsigned int irq, struct irqaction *act) | ||
1554 | { | ||
1555 | struct irq_desc *desc = irq_to_desc(irq); | ||
1556 | int retval; | ||
1557 | |||
1558 | if (!desc || !irq_settings_is_per_cpu_devid(desc)) | ||
1559 | return -EINVAL; | ||
1560 | chip_bus_lock(desc); | ||
1561 | retval = __setup_irq(irq, desc, act); | ||
1562 | chip_bus_sync_unlock(desc); | ||
1563 | |||
1564 | return retval; | ||
1565 | } | ||
1566 | |||
1567 | /** | ||
1568 | * request_percpu_irq - allocate a percpu interrupt line | ||
1569 | * @irq: Interrupt line to allocate | ||
1570 | * @handler: Function to be called when the IRQ occurs. | ||
1571 | * @devname: An ascii name for the claiming device | ||
1572 | * @dev_id: A percpu cookie passed back to the handler function | ||
1573 | * | ||
1574 | * This call allocates interrupt resources, but doesn't | ||
1575 | * automatically enable the interrupt. It has to be done on each | ||
1576 | * CPU using enable_percpu_irq(). | ||
1577 | * | ||
1578 | * Dev_id must be globally unique. It is a per-cpu variable, and | ||
1579 | * the handler gets called with the interrupted CPU's instance of | ||
1580 | * that variable. | ||
1581 | */ | ||
1582 | int request_percpu_irq(unsigned int irq, irq_handler_t handler, | ||
1583 | const char *devname, void __percpu *dev_id) | ||
1584 | { | ||
1585 | struct irqaction *action; | ||
1586 | struct irq_desc *desc; | ||
1587 | int retval; | ||
1588 | |||
1589 | if (!dev_id) | ||
1590 | return -EINVAL; | ||
1591 | |||
1592 | desc = irq_to_desc(irq); | ||
1593 | if (!desc || !irq_settings_can_request(desc) || | ||
1594 | !irq_settings_is_per_cpu_devid(desc)) | ||
1595 | return -EINVAL; | ||
1596 | |||
1597 | action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); | ||
1598 | if (!action) | ||
1599 | return -ENOMEM; | ||
1600 | |||
1601 | action->handler = handler; | ||
1602 | action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND; | ||
1603 | action->name = devname; | ||
1604 | action->percpu_dev_id = dev_id; | ||
1605 | |||
1606 | chip_bus_lock(desc); | ||
1607 | retval = __setup_irq(irq, desc, action); | ||
1608 | chip_bus_sync_unlock(desc); | ||
1609 | |||
1610 | if (retval) | ||
1611 | kfree(action); | ||
1612 | |||
1613 | return retval; | ||
1614 | } | ||
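The block above introduces the per-CPU dev_id flavour of the request/free API for interrupts that are delivered to each CPU individually (local timers, PPIs and the like). A hedged sketch of how a driver might use it; the cookie structure, the names and the way the IRQ number is obtained are assumptions for illustration only.

#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/percpu.h>

struct my_pcpu_state {
        unsigned long count;
};

static DEFINE_PER_CPU(struct my_pcpu_state, my_state);

static irqreturn_t my_pcpu_handler(int irq, void *dev_id)
{
        /* For per-CPU dev_id interrupts the handler receives the
         * interrupted CPU's instance of the percpu cookie. */
        struct my_pcpu_state *s = dev_id;

        s->count++;
        return IRQ_HANDLED;
}

static int my_setup(unsigned int irq)                   /* irq number assumed */
{
        int err;

        err = request_percpu_irq(irq, my_pcpu_handler, "my-pcpu-dev",
                                 &my_state);
        if (err)
                return err;

        /* Must be called on every CPU that should receive the interrupt,
         * e.g. from each CPU's bring-up path or a CPU notifier. */
        enable_percpu_irq(irq, IRQ_TYPE_NONE);
        return 0;
}

static void my_teardown(unsigned int irq)
{
        disable_percpu_irq(irq);                        /* again, per CPU */
        free_percpu_irq(irq, &my_state);
}
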
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f76fc00c9877..15e53b1766a6 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/syscore_ops.h> | ||
12 | 13 | ||
13 | #include "internals.h" | 14 | #include "internals.h" |
14 | 15 | ||
@@ -39,25 +40,58 @@ void suspend_device_irqs(void) | |||
39 | } | 40 | } |
40 | EXPORT_SYMBOL_GPL(suspend_device_irqs); | 41 | EXPORT_SYMBOL_GPL(suspend_device_irqs); |
41 | 42 | ||
42 | /** | 43 | static void resume_irqs(bool want_early) |
43 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | ||
44 | * | ||
45 | * Enable all interrupt lines previously disabled by suspend_device_irqs() that | ||
46 | * have the IRQS_SUSPENDED flag set. | ||
47 | */ | ||
48 | void resume_device_irqs(void) | ||
49 | { | 44 | { |
50 | struct irq_desc *desc; | 45 | struct irq_desc *desc; |
51 | int irq; | 46 | int irq; |
52 | 47 | ||
53 | for_each_irq_desc(irq, desc) { | 48 | for_each_irq_desc(irq, desc) { |
54 | unsigned long flags; | 49 | unsigned long flags; |
50 | bool is_early = desc->action && | ||
51 | desc->action->flags & IRQF_EARLY_RESUME; | ||
52 | |||
53 | if (is_early != want_early) | ||
54 | continue; | ||
55 | 55 | ||
56 | raw_spin_lock_irqsave(&desc->lock, flags); | 56 | raw_spin_lock_irqsave(&desc->lock, flags); |
57 | __enable_irq(desc, irq, true); | 57 | __enable_irq(desc, irq, true); |
58 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 58 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
59 | } | 59 | } |
60 | } | 60 | } |
61 | |||
62 | /** | ||
63 | * irq_pm_syscore_ops - enable interrupt lines early | ||
64 | * | ||
65 | * Enable all interrupt lines with %IRQF_EARLY_RESUME set. | ||
66 | */ | ||
67 | static void irq_pm_syscore_resume(void) | ||
68 | { | ||
69 | resume_irqs(true); | ||
70 | } | ||
71 | |||
72 | static struct syscore_ops irq_pm_syscore_ops = { | ||
73 | .resume = irq_pm_syscore_resume, | ||
74 | }; | ||
75 | |||
76 | static int __init irq_pm_init_ops(void) | ||
77 | { | ||
78 | register_syscore_ops(&irq_pm_syscore_ops); | ||
79 | return 0; | ||
80 | } | ||
81 | |||
82 | device_initcall(irq_pm_init_ops); | ||
83 | |||
84 | /** | ||
85 | * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() | ||
86 | * | ||
87 | * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously | ||
88 | * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag | ||
89 | * set as well as those with %IRQF_FORCE_RESUME. | ||
90 | */ | ||
91 | void resume_device_irqs(void) | ||
92 | { | ||
93 | resume_irqs(false); | ||
94 | } | ||
61 | EXPORT_SYMBOL_GPL(resume_device_irqs); | 95 | EXPORT_SYMBOL_GPL(resume_device_irqs); |
62 | 96 | ||
63 | /** | 97 | /** |
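With the syscore hook above, any irqaction flagged IRQF_EARLY_RESUME is re-enabled during the syscore resume phase, before the ordinary device resume callbacks run; everything else still goes through resume_device_irqs(). From a driver's point of view it is just another request flag. A minimal, illustrative sketch (handler, name and dev cookie are assumptions):

        /* This interrupt must be live again before regular device resume
         * callbacks run, so request it with IRQF_EARLY_RESUME. */
        err = request_irq(irq, my_early_handler, IRQF_EARLY_RESUME,
                          "my-early-dev", dev);
        if (err)
                return err;
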
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index f1667833d444..1162f1030f18 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h | |||
@@ -13,6 +13,7 @@ enum { | |||
13 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, | 13 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, |
14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | 14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, |
15 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, | 15 | _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, |
16 | _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, | ||
16 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, | 17 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, |
17 | }; | 18 | }; |
18 | 19 | ||
@@ -24,6 +25,7 @@ enum { | |||
24 | #define IRQ_NOTHREAD GOT_YOU_MORON | 25 | #define IRQ_NOTHREAD GOT_YOU_MORON |
25 | #define IRQ_NOAUTOEN GOT_YOU_MORON | 26 | #define IRQ_NOAUTOEN GOT_YOU_MORON |
26 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | 27 | #define IRQ_NESTED_THREAD GOT_YOU_MORON |
28 | #define IRQ_PER_CPU_DEVID GOT_YOU_MORON | ||
27 | #undef IRQF_MODIFY_MASK | 29 | #undef IRQF_MODIFY_MASK |
28 | #define IRQF_MODIFY_MASK GOT_YOU_MORON | 30 | #define IRQF_MODIFY_MASK GOT_YOU_MORON |
29 | 31 | ||
@@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) | |||
39 | return desc->status_use_accessors & _IRQ_PER_CPU; | 41 | return desc->status_use_accessors & _IRQ_PER_CPU; |
40 | } | 42 | } |
41 | 43 | ||
44 | static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc) | ||
45 | { | ||
46 | return desc->status_use_accessors & _IRQ_PER_CPU_DEVID; | ||
47 | } | ||
48 | |||
42 | static inline void irq_settings_set_per_cpu(struct irq_desc *desc) | 49 | static inline void irq_settings_set_per_cpu(struct irq_desc *desc) |
43 | { | 50 | { |
44 | desc->status_use_accessors |= _IRQ_PER_CPU; | 51 | desc->status_use_accessors |= _IRQ_PER_CPU; |
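irq_settings_is_per_cpu_devid() only reads the new flag; the descriptor still has to be marked by irqchip/platform setup code. The sketch below assumes the companion helpers irq_set_percpu_devid() and handle_percpu_devid_irq() from the same series, which are not shown in this diff, so treat it as a rough outline rather than a verified recipe.

        /* Mark a PPI-style interrupt so that only the per-CPU request/free
         * paths in manage.c will accept it (assumed helpers, see above). */
        irq_set_percpu_devid(irq);
        irq_set_chip_and_handler(irq, &my_irq_chip, handle_percpu_devid_irq);
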
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index aa57d5da18c1..611cd6003c45 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) | |||
84 | */ | 84 | */ |
85 | action = desc->action; | 85 | action = desc->action; |
86 | if (!action || !(action->flags & IRQF_SHARED) || | 86 | if (!action || !(action->flags & IRQF_SHARED) || |
87 | (action->flags & __IRQF_TIMER) || !action->next) | 87 | (action->flags & __IRQF_TIMER) || |
88 | (action->handler(irq, action->dev_id) == IRQ_HANDLED) || | ||
89 | !action->next) | ||
88 | goto out; | 90 | goto out; |
89 | 91 | ||
90 | /* Already running on another processor */ | 92 | /* Already running on another processor */ |
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq) | |||
115 | struct irq_desc *desc; | 117 | struct irq_desc *desc; |
116 | int i, ok = 0; | 118 | int i, ok = 0; |
117 | 119 | ||
118 | if (atomic_inc_return(&irq_poll_active) == 1) | 120 | if (atomic_inc_return(&irq_poll_active) != 1) |
119 | goto out; | 121 | goto out; |
120 | 122 | ||
121 | irq_poll_cpu = smp_processor_id(); | 123 | irq_poll_cpu = smp_processor_id(); |
@@ -323,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
323 | desc->irqs_unhandled = 0; | 325 | desc->irqs_unhandled = 0; |
324 | } | 326 | } |
325 | 327 | ||
326 | int noirqdebug __read_mostly; | 328 | bool noirqdebug __read_mostly; |
327 | 329 | ||
328 | int noirqdebug_setup(char *str) | 330 | int noirqdebug_setup(char *str) |
329 | { | 331 | { |
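Two behavioural fixes in spurious.c: try_one_irq() now also pokes the primary handler while polling (an IRQ_HANDLED result counts as a successful poll), and the atomic_inc_return() test in misrouted_irq() is inverted: the poller must bail out when it is not the first one in, otherwise concurrent pollers would race on irq_poll_cpu. The guard pattern in isolation, with illustrative names:

#include <linux/atomic.h>

static atomic_t my_poll_active = ATOMIC_INIT(0);

static void my_poll_all(void)
{
        /* Only the first caller polls; nested or concurrent callers back off. */
        if (atomic_inc_return(&my_poll_active) != 1)
                goto out;

        /* ... walk and poll the interrupt descriptors ... */

out:
        atomic_dec(&my_poll_active);
}
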
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index c58fa7da8aef..c3c46c72046e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
@@ -6,9 +6,11 @@ | |||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/module.h> | 9 | #include <linux/export.h> |
10 | #include <linux/irq_work.h> | 10 | #include <linux/irq_work.h> |
11 | #include <linux/percpu.h> | ||
11 | #include <linux/hardirq.h> | 12 | #include <linux/hardirq.h> |
13 | #include <asm/processor.h> | ||
12 | 14 | ||
13 | /* | 15 | /* |
14 | * An entry can be in one of four states: | 16 | * An entry can be in one of four states: |
@@ -17,54 +19,34 @@ | |||
17 | * claimed NULL, 3 -> {pending} : claimed to be enqueued | 19 | * claimed NULL, 3 -> {pending} : claimed to be enqueued |
18 | * pending next, 3 -> {busy} : queued, pending callback | 20 | * pending next, 3 -> {busy} : queued, pending callback |
19 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed | 21 | * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed |
20 | * | ||
21 | * We use the lower two bits of the next pointer to keep PENDING and BUSY | ||
22 | * flags. | ||
23 | */ | 22 | */ |
24 | 23 | ||
25 | #define IRQ_WORK_PENDING 1UL | 24 | #define IRQ_WORK_PENDING 1UL |
26 | #define IRQ_WORK_BUSY 2UL | 25 | #define IRQ_WORK_BUSY 2UL |
27 | #define IRQ_WORK_FLAGS 3UL | 26 | #define IRQ_WORK_FLAGS 3UL |
28 | 27 | ||
29 | static inline bool irq_work_is_set(struct irq_work *entry, int flags) | 28 | static DEFINE_PER_CPU(struct llist_head, irq_work_list); |
30 | { | ||
31 | return (unsigned long)entry->next & flags; | ||
32 | } | ||
33 | |||
34 | static inline struct irq_work *irq_work_next(struct irq_work *entry) | ||
35 | { | ||
36 | unsigned long next = (unsigned long)entry->next; | ||
37 | next &= ~IRQ_WORK_FLAGS; | ||
38 | return (struct irq_work *)next; | ||
39 | } | ||
40 | |||
41 | static inline struct irq_work *next_flags(struct irq_work *entry, int flags) | ||
42 | { | ||
43 | unsigned long next = (unsigned long)entry; | ||
44 | next |= flags; | ||
45 | return (struct irq_work *)next; | ||
46 | } | ||
47 | |||
48 | static DEFINE_PER_CPU(struct irq_work *, irq_work_list); | ||
49 | 29 | ||
50 | /* | 30 | /* |
51 | * Claim the entry so that no one else will poke at it. | 31 | * Claim the entry so that no one else will poke at it. |
52 | */ | 32 | */ |
53 | static bool irq_work_claim(struct irq_work *entry) | 33 | static bool irq_work_claim(struct irq_work *work) |
54 | { | 34 | { |
55 | struct irq_work *next, *nflags; | 35 | unsigned long flags, nflags; |
56 | 36 | ||
57 | do { | 37 | for (;;) { |
58 | next = entry->next; | 38 | flags = work->flags; |
59 | if ((unsigned long)next & IRQ_WORK_PENDING) | 39 | if (flags & IRQ_WORK_PENDING) |
60 | return false; | 40 | return false; |
61 | nflags = next_flags(next, IRQ_WORK_FLAGS); | 41 | nflags = flags | IRQ_WORK_FLAGS; |
62 | } while (cmpxchg(&entry->next, next, nflags) != next); | 42 | if (cmpxchg(&work->flags, flags, nflags) == flags) |
43 | break; | ||
44 | cpu_relax(); | ||
45 | } | ||
63 | 46 | ||
64 | return true; | 47 | return true; |
65 | } | 48 | } |
66 | 49 | ||
67 | |||
68 | void __weak arch_irq_work_raise(void) | 50 | void __weak arch_irq_work_raise(void) |
69 | { | 51 | { |
70 | /* | 52 | /* |
@@ -75,20 +57,15 @@ void __weak arch_irq_work_raise(void) | |||
75 | /* | 57 | /* |
76 | * Queue the entry and raise the IPI if needed. | 58 | * Queue the entry and raise the IPI if needed. |
77 | */ | 59 | */ |
78 | static void __irq_work_queue(struct irq_work *entry) | 60 | static void __irq_work_queue(struct irq_work *work) |
79 | { | 61 | { |
80 | struct irq_work *next; | 62 | bool empty; |
81 | 63 | ||
82 | preempt_disable(); | 64 | preempt_disable(); |
83 | 65 | ||
84 | do { | 66 | empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); |
85 | next = __this_cpu_read(irq_work_list); | ||
86 | /* Can assign non-atomic because we keep the flags set. */ | ||
87 | entry->next = next_flags(next, IRQ_WORK_FLAGS); | ||
88 | } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); | ||
89 | |||
90 | /* The list was empty, raise self-interrupt to start processing. */ | 67 | /* The list was empty, raise self-interrupt to start processing. */ |
91 | if (!irq_work_next(entry)) | 68 | if (empty) |
92 | arch_irq_work_raise(); | 69 | arch_irq_work_raise(); |
93 | 70 | ||
94 | preempt_enable(); | 71 | preempt_enable(); |
@@ -100,16 +77,16 @@ static void __irq_work_queue(struct irq_work *entry) | |||
100 | * | 77 | * |
101 | * Can be re-enqueued while the callback is still in progress. | 78 | * Can be re-enqueued while the callback is still in progress. |
102 | */ | 79 | */ |
103 | bool irq_work_queue(struct irq_work *entry) | 80 | bool irq_work_queue(struct irq_work *work) |
104 | { | 81 | { |
105 | if (!irq_work_claim(entry)) { | 82 | if (!irq_work_claim(work)) { |
106 | /* | 83 | /* |
107 | * Already enqueued, can't do! | 84 | * Already enqueued, can't do! |
108 | */ | 85 | */ |
109 | return false; | 86 | return false; |
110 | } | 87 | } |
111 | 88 | ||
112 | __irq_work_queue(entry); | 89 | __irq_work_queue(work); |
113 | return true; | 90 | return true; |
114 | } | 91 | } |
115 | EXPORT_SYMBOL_GPL(irq_work_queue); | 92 | EXPORT_SYMBOL_GPL(irq_work_queue); |
@@ -120,34 +97,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue); | |||
120 | */ | 97 | */ |
121 | void irq_work_run(void) | 98 | void irq_work_run(void) |
122 | { | 99 | { |
123 | struct irq_work *list; | 100 | struct irq_work *work; |
101 | struct llist_head *this_list; | ||
102 | struct llist_node *llnode; | ||
124 | 103 | ||
125 | if (this_cpu_read(irq_work_list) == NULL) | 104 | this_list = &__get_cpu_var(irq_work_list); |
105 | if (llist_empty(this_list)) | ||
126 | return; | 106 | return; |
127 | 107 | ||
128 | BUG_ON(!in_irq()); | 108 | BUG_ON(!in_irq()); |
129 | BUG_ON(!irqs_disabled()); | 109 | BUG_ON(!irqs_disabled()); |
130 | 110 | ||
131 | list = this_cpu_xchg(irq_work_list, NULL); | 111 | llnode = llist_del_all(this_list); |
132 | 112 | while (llnode != NULL) { | |
133 | while (list != NULL) { | 113 | work = llist_entry(llnode, struct irq_work, llnode); |
134 | struct irq_work *entry = list; | ||
135 | 114 | ||
136 | list = irq_work_next(list); | 115 | llnode = llist_next(llnode); |
137 | 116 | ||
138 | /* | 117 | /* |
139 | * Clear the PENDING bit, after this point the @entry | 118 | * Clear the PENDING bit, after this point the @work |
140 | * can be re-used. | 119 | * can be re-used. |
141 | */ | 120 | */ |
142 | entry->next = next_flags(NULL, IRQ_WORK_BUSY); | 121 | work->flags = IRQ_WORK_BUSY; |
143 | entry->func(entry); | 122 | work->func(work); |
144 | /* | 123 | /* |
145 | * Clear the BUSY bit and return to the free state if | 124 | * Clear the BUSY bit and return to the free state if |
146 | * no-one else claimed it meanwhile. | 125 | * no-one else claimed it meanwhile. |
147 | */ | 126 | */ |
148 | (void)cmpxchg(&entry->next, | 127 | (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0); |
149 | next_flags(NULL, IRQ_WORK_BUSY), | ||
150 | NULL); | ||
151 | } | 128 | } |
152 | } | 129 | } |
153 | EXPORT_SYMBOL_GPL(irq_work_run); | 130 | EXPORT_SYMBOL_GPL(irq_work_run); |
@@ -156,11 +133,11 @@ EXPORT_SYMBOL_GPL(irq_work_run); | |||
156 | * Synchronize against the irq_work @entry, ensures the entry is not | 133 | * Synchronize against the irq_work @entry, ensures the entry is not |
157 | * currently in use. | 134 | * currently in use. |
158 | */ | 135 | */ |
159 | void irq_work_sync(struct irq_work *entry) | 136 | void irq_work_sync(struct irq_work *work) |
160 | { | 137 | { |
161 | WARN_ON_ONCE(irqs_disabled()); | 138 | WARN_ON_ONCE(irqs_disabled()); |
162 | 139 | ||
163 | while (irq_work_is_set(entry, IRQ_WORK_BUSY)) | 140 | while (work->flags & IRQ_WORK_BUSY) |
164 | cpu_relax(); | 141 | cpu_relax(); |
165 | } | 142 | } |
166 | EXPORT_SYMBOL_GPL(irq_work_sync); | 143 | EXPORT_SYMBOL_GPL(irq_work_sync); |
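irq_work now keeps the per-CPU pending list on the generic llist primitives instead of an open-coded cmpxchg chain, and tracks PENDING/BUSY in a separate flags word. For reference, the lock-less list pattern used above reduces to roughly the following sketch; my_item, my_push() and my_drain() are placeholders.

#include <linux/llist.h>
#include <linux/percpu.h>

struct my_item {
        struct llist_node llnode;
        int payload;
};

static DEFINE_PER_CPU(struct llist_head, my_list);

/* Producer, safe from IRQ context: returns true if the list was empty,
 * i.e. the consumer needs to be kicked (cf. arch_irq_work_raise()). */
static bool my_push(struct my_item *item)
{
        return llist_add(&item->llnode, &__get_cpu_var(my_list));
}

/* Consumer: detach the whole list in one go, then walk it. */
static void my_drain(void)
{
        struct llist_node *llnode = llist_del_all(&__get_cpu_var(my_list));

        while (llnode) {
                struct my_item *item =
                        llist_entry(llnode, struct my_item, llnode);

                llnode = llist_next(llnode);
                /* ... process item->payload ... */
        }
}
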
diff --git a/kernel/itimer.c b/kernel/itimer.c index d802883153da..22000c3db0dd 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
52 | 52 | ||
53 | cval = it->expires; | 53 | cval = it->expires; |
54 | cinterval = it->incr; | 54 | cinterval = it->incr; |
55 | if (!cputime_eq(cval, cputime_zero)) { | 55 | if (cval) { |
56 | struct task_cputime cputime; | 56 | struct task_cputime cputime; |
57 | cputime_t t; | 57 | cputime_t t; |
58 | 58 | ||
59 | thread_group_cputimer(tsk, &cputime); | 59 | thread_group_cputimer(tsk, &cputime); |
60 | if (clock_id == CPUCLOCK_PROF) | 60 | if (clock_id == CPUCLOCK_PROF) |
61 | t = cputime_add(cputime.utime, cputime.stime); | 61 | t = cputime.utime + cputime.stime; |
62 | else | 62 | else |
63 | /* CPUCLOCK_VIRT */ | 63 | /* CPUCLOCK_VIRT */ |
64 | t = cputime.utime; | 64 | t = cputime.utime; |
65 | 65 | ||
66 | if (cputime_le(cval, t)) | 66 | if (cval < t) |
67 | /* about to fire */ | 67 | /* about to fire */ |
68 | cval = cputime_one_jiffy; | 68 | cval = cputime_one_jiffy; |
69 | else | 69 | else |
70 | cval = cputime_sub(cval, t); | 70 | cval = cval - t; |
71 | } | 71 | } |
72 | 72 | ||
73 | spin_unlock_irq(&tsk->sighand->siglock); | 73 | spin_unlock_irq(&tsk->sighand->siglock); |
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
161 | 161 | ||
162 | cval = it->expires; | 162 | cval = it->expires; |
163 | cinterval = it->incr; | 163 | cinterval = it->incr; |
164 | if (!cputime_eq(cval, cputime_zero) || | 164 | if (cval || nval) { |
165 | !cputime_eq(nval, cputime_zero)) { | 165 | if (nval > 0) |
166 | if (cputime_gt(nval, cputime_zero)) | 166 | nval += cputime_one_jiffy; |
167 | nval = cputime_add(nval, cputime_one_jiffy); | ||
168 | set_process_cpu_timer(tsk, clock_id, &nval, &cval); | 167 | set_process_cpu_timer(tsk, clock_id, &nval, &cval); |
169 | } | 168 | } |
170 | it->expires = nval; | 169 | it->expires = nval; |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index a8ce45097f3d..01d3b70fc98a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -66,19 +66,53 @@ void jump_label_inc(struct jump_label_key *key) | |||
66 | return; | 66 | return; |
67 | 67 | ||
68 | jump_label_lock(); | 68 | jump_label_lock(); |
69 | if (atomic_add_return(1, &key->enabled) == 1) | 69 | if (atomic_read(&key->enabled) == 0) |
70 | jump_label_update(key, JUMP_LABEL_ENABLE); | 70 | jump_label_update(key, JUMP_LABEL_ENABLE); |
71 | atomic_inc(&key->enabled); | ||
71 | jump_label_unlock(); | 72 | jump_label_unlock(); |
72 | } | 73 | } |
74 | EXPORT_SYMBOL_GPL(jump_label_inc); | ||
73 | 75 | ||
74 | void jump_label_dec(struct jump_label_key *key) | 76 | static void __jump_label_dec(struct jump_label_key *key, |
77 | unsigned long rate_limit, struct delayed_work *work) | ||
75 | { | 78 | { |
76 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) | 79 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) |
77 | return; | 80 | return; |
78 | 81 | ||
79 | jump_label_update(key, JUMP_LABEL_DISABLE); | 82 | if (rate_limit) { |
83 | atomic_inc(&key->enabled); | ||
84 | schedule_delayed_work(work, rate_limit); | ||
85 | } else | ||
86 | jump_label_update(key, JUMP_LABEL_DISABLE); | ||
87 | |||
80 | jump_label_unlock(); | 88 | jump_label_unlock(); |
81 | } | 89 | } |
90 | EXPORT_SYMBOL_GPL(jump_label_dec); | ||
91 | |||
92 | static void jump_label_update_timeout(struct work_struct *work) | ||
93 | { | ||
94 | struct jump_label_key_deferred *key = | ||
95 | container_of(work, struct jump_label_key_deferred, work.work); | ||
96 | __jump_label_dec(&key->key, 0, NULL); | ||
97 | } | ||
98 | |||
99 | void jump_label_dec(struct jump_label_key *key) | ||
100 | { | ||
101 | __jump_label_dec(key, 0, NULL); | ||
102 | } | ||
103 | |||
104 | void jump_label_dec_deferred(struct jump_label_key_deferred *key) | ||
105 | { | ||
106 | __jump_label_dec(&key->key, key->timeout, &key->work); | ||
107 | } | ||
108 | |||
109 | |||
110 | void jump_label_rate_limit(struct jump_label_key_deferred *key, | ||
111 | unsigned long rl) | ||
112 | { | ||
113 | key->timeout = rl; | ||
114 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); | ||
115 | } | ||
82 | 116 | ||
83 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | 117 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) |
84 | { | 118 | { |
@@ -104,6 +138,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, | |||
104 | return 0; | 138 | return 0; |
105 | } | 139 | } |
106 | 140 | ||
141 | /* | ||
142 | * Update code which is definitely not currently executing. | ||
143 | * Architectures which need heavyweight synchronization to modify | ||
144 | * running code can override this to make the non-live update case | ||
145 | * cheaper. | ||
146 | */ | ||
147 | void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, | ||
148 | enum jump_label_type type) | ||
149 | { | ||
150 | arch_jump_label_transform(entry, type); | ||
151 | } | ||
152 | |||
107 | static void __jump_label_update(struct jump_label_key *key, | 153 | static void __jump_label_update(struct jump_label_key *key, |
108 | struct jump_entry *entry, | 154 | struct jump_entry *entry, |
109 | struct jump_entry *stop, int enable) | 155 | struct jump_entry *stop, int enable) |
@@ -121,14 +167,7 @@ static void __jump_label_update(struct jump_label_key *key, | |||
121 | } | 167 | } |
122 | } | 168 | } |
123 | 169 | ||
124 | /* | 170 | void __init jump_label_init(void) |
125 | * Not all archs need this. | ||
126 | */ | ||
127 | void __weak arch_jump_label_text_poke_early(jump_label_t addr) | ||
128 | { | ||
129 | } | ||
130 | |||
131 | static __init int jump_label_init(void) | ||
132 | { | 171 | { |
133 | struct jump_entry *iter_start = __start___jump_table; | 172 | struct jump_entry *iter_start = __start___jump_table; |
134 | struct jump_entry *iter_stop = __stop___jump_table; | 173 | struct jump_entry *iter_stop = __stop___jump_table; |
@@ -139,22 +178,22 @@ static __init int jump_label_init(void) | |||
139 | jump_label_sort_entries(iter_start, iter_stop); | 178 | jump_label_sort_entries(iter_start, iter_stop); |
140 | 179 | ||
141 | for (iter = iter_start; iter < iter_stop; iter++) { | 180 | for (iter = iter_start; iter < iter_stop; iter++) { |
142 | arch_jump_label_text_poke_early(iter->code); | 181 | struct jump_label_key *iterk; |
143 | if (iter->key == (jump_label_t)(unsigned long)key) | 182 | |
183 | iterk = (struct jump_label_key *)(unsigned long)iter->key; | ||
184 | arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? | ||
185 | JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); | ||
186 | if (iterk == key) | ||
144 | continue; | 187 | continue; |
145 | 188 | ||
146 | key = (struct jump_label_key *)(unsigned long)iter->key; | 189 | key = iterk; |
147 | atomic_set(&key->enabled, 0); | ||
148 | key->entries = iter; | 190 | key->entries = iter; |
149 | #ifdef CONFIG_MODULES | 191 | #ifdef CONFIG_MODULES |
150 | key->next = NULL; | 192 | key->next = NULL; |
151 | #endif | 193 | #endif |
152 | } | 194 | } |
153 | jump_label_unlock(); | 195 | jump_label_unlock(); |
154 | |||
155 | return 0; | ||
156 | } | 196 | } |
157 | early_initcall(jump_label_init); | ||
158 | 197 | ||
159 | #ifdef CONFIG_MODULES | 198 | #ifdef CONFIG_MODULES |
160 | 199 | ||
@@ -211,8 +250,13 @@ void jump_label_apply_nops(struct module *mod) | |||
211 | if (iter_start == iter_stop) | 250 | if (iter_start == iter_stop) |
212 | return; | 251 | return; |
213 | 252 | ||
214 | for (iter = iter_start; iter < iter_stop; iter++) | 253 | for (iter = iter_start; iter < iter_stop; iter++) { |
215 | arch_jump_label_text_poke_early(iter->code); | 254 | struct jump_label_key *iterk; |
255 | |||
256 | iterk = (struct jump_label_key *)(unsigned long)iter->key; | ||
257 | arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? | ||
258 | JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); | ||
259 | } | ||
216 | } | 260 | } |
217 | 261 | ||
218 | static int jump_label_add_module(struct module *mod) | 262 | static int jump_label_add_module(struct module *mod) |
@@ -252,8 +296,7 @@ static int jump_label_add_module(struct module *mod) | |||
252 | key->next = jlm; | 296 | key->next = jlm; |
253 | 297 | ||
254 | if (jump_label_enabled(key)) | 298 | if (jump_label_enabled(key)) |
255 | __jump_label_update(key, iter, iter_stop, | 299 | __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); |
256 | JUMP_LABEL_ENABLE); | ||
257 | } | 300 | } |
258 | 301 | ||
259 | return 0; | 302 | return 0; |
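jump_label_dec_deferred() and jump_label_rate_limit() let a user batch the disable transition so that rapid inc/dec cycles (short-lived perf events, for instance) do not hammer the text-patching machinery on every decrement. A hedged usage sketch, assuming the matching declarations in <linux/jump_label.h> from the same series; the key name and call sites are made up.

#include <linux/jiffies.h>
#include <linux/jump_label.h>

static struct jump_label_key_deferred my_feature_key;

static int __init my_feature_init(void)
{
        /* Defer the actual patch-out by up to HZ after the last user left. */
        jump_label_rate_limit(&my_feature_key, HZ);
        return 0;
}

static void my_feature_get(void)
{
        jump_label_inc(&my_feature_key.key);            /* enable the branch */
}

static void my_feature_put(void)
{
        jump_label_dec_deferred(&my_feature_key);       /* may patch out later */
}

static void my_hot_path(void)
{
        if (static_branch(&my_feature_key.key))
                my_expensive_accounting();              /* placeholder */
}
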
diff --git a/kernel/kexec.c b/kernel/kexec.c index 296fbc84d659..7b0886786701 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -32,7 +32,6 @@ | |||
32 | #include <linux/console.h> | 32 | #include <linux/console.h> |
33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/kmsg_dump.h> | ||
36 | #include <linux/syscore_ops.h> | 35 | #include <linux/syscore_ops.h> |
37 | 36 | ||
38 | #include <asm/page.h> | 37 | #include <asm/page.h> |
@@ -498,7 +497,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | |||
498 | while (hole_end <= crashk_res.end) { | 497 | while (hole_end <= crashk_res.end) { |
499 | unsigned long i; | 498 | unsigned long i; |
500 | 499 | ||
501 | if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) | 500 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) |
502 | break; | 501 | break; |
503 | if (hole_end > crashk_res.end) | 502 | if (hole_end > crashk_res.end) |
504 | break; | 503 | break; |
@@ -999,6 +998,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
999 | kimage_free(xchg(&kexec_crash_image, NULL)); | 998 | kimage_free(xchg(&kexec_crash_image, NULL)); |
1000 | result = kimage_crash_alloc(&image, entry, | 999 | result = kimage_crash_alloc(&image, entry, |
1001 | nr_segments, segments); | 1000 | nr_segments, segments); |
1001 | crash_map_reserved_pages(); | ||
1002 | } | 1002 | } |
1003 | if (result) | 1003 | if (result) |
1004 | goto out; | 1004 | goto out; |
@@ -1015,6 +1015,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, | |||
1015 | goto out; | 1015 | goto out; |
1016 | } | 1016 | } |
1017 | kimage_terminate(image); | 1017 | kimage_terminate(image); |
1018 | if (flags & KEXEC_ON_CRASH) | ||
1019 | crash_unmap_reserved_pages(); | ||
1018 | } | 1020 | } |
1019 | /* Install the new kernel, and Uninstall the old */ | 1021 | /* Install the new kernel, and Uninstall the old */ |
1020 | image = xchg(dest_image, image); | 1022 | image = xchg(dest_image, image); |
@@ -1026,6 +1028,18 @@ out: | |||
1026 | return result; | 1028 | return result; |
1027 | } | 1029 | } |
1028 | 1030 | ||
1031 | /* | ||
1032 | * Add and remove page tables for crashkernel memory | ||
1033 | * | ||
1034 | * Provide an empty default implementation here -- architecture | ||
1035 | * code may override this | ||
1036 | */ | ||
1037 | void __weak crash_map_reserved_pages(void) | ||
1038 | {} | ||
1039 | |||
1040 | void __weak crash_unmap_reserved_pages(void) | ||
1041 | {} | ||
1042 | |||
1029 | #ifdef CONFIG_COMPAT | 1043 | #ifdef CONFIG_COMPAT |
1030 | asmlinkage long compat_sys_kexec_load(unsigned long entry, | 1044 | asmlinkage long compat_sys_kexec_load(unsigned long entry, |
1031 | unsigned long nr_segments, | 1045 | unsigned long nr_segments, |
@@ -1079,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs) | |||
1079 | if (kexec_crash_image) { | 1093 | if (kexec_crash_image) { |
1080 | struct pt_regs fixed_regs; | 1094 | struct pt_regs fixed_regs; |
1081 | 1095 | ||
1082 | kmsg_dump(KMSG_DUMP_KEXEC); | ||
1083 | |||
1084 | crash_setup_regs(&fixed_regs, regs); | 1096 | crash_setup_regs(&fixed_regs, regs); |
1085 | crash_save_vmcoreinfo(); | 1097 | crash_save_vmcoreinfo(); |
1086 | machine_crash_shutdown(&fixed_regs); | 1098 | machine_crash_shutdown(&fixed_regs); |
@@ -1117,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size) | |||
1117 | { | 1129 | { |
1118 | int ret = 0; | 1130 | int ret = 0; |
1119 | unsigned long start, end; | 1131 | unsigned long start, end; |
1132 | unsigned long old_size; | ||
1133 | struct resource *ram_res; | ||
1120 | 1134 | ||
1121 | mutex_lock(&kexec_mutex); | 1135 | mutex_lock(&kexec_mutex); |
1122 | 1136 | ||
@@ -1126,23 +1140,37 @@ int crash_shrink_memory(unsigned long new_size) | |||
1126 | } | 1140 | } |
1127 | start = crashk_res.start; | 1141 | start = crashk_res.start; |
1128 | end = crashk_res.end; | 1142 | end = crashk_res.end; |
1143 | old_size = (end == 0) ? 0 : end - start + 1; | ||
1144 | if (new_size >= old_size) { | ||
1145 | ret = (new_size == old_size) ? 0 : -EINVAL; | ||
1146 | goto unlock; | ||
1147 | } | ||
1129 | 1148 | ||
1130 | if (new_size >= end - start + 1) { | 1149 | ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); |
1131 | ret = -EINVAL; | 1150 | if (!ram_res) { |
1132 | if (new_size == end - start + 1) | 1151 | ret = -ENOMEM; |
1133 | ret = 0; | ||
1134 | goto unlock; | 1152 | goto unlock; |
1135 | } | 1153 | } |
1136 | 1154 | ||
1137 | start = roundup(start, PAGE_SIZE); | 1155 | start = roundup(start, KEXEC_CRASH_MEM_ALIGN); |
1138 | end = roundup(start + new_size, PAGE_SIZE); | 1156 | end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); |
1139 | 1157 | ||
1158 | crash_map_reserved_pages(); | ||
1140 | crash_free_reserved_phys_range(end, crashk_res.end); | 1159 | crash_free_reserved_phys_range(end, crashk_res.end); |
1141 | 1160 | ||
1142 | if ((start == end) && (crashk_res.parent != NULL)) | 1161 | if ((start == end) && (crashk_res.parent != NULL)) |
1143 | release_resource(&crashk_res); | 1162 | release_resource(&crashk_res); |
1163 | |||
1164 | ram_res->start = end; | ||
1165 | ram_res->end = crashk_res.end; | ||
1166 | ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; | ||
1167 | ram_res->name = "System RAM"; | ||
1168 | |||
1144 | crashk_res.end = end - 1; | 1169 | crashk_res.end = end - 1; |
1145 | 1170 | ||
1171 | insert_resource(&iomem_resource, ram_res); | ||
1172 | crash_unmap_reserved_pages(); | ||
1173 | |||
1146 | unlock: | 1174 | unlock: |
1147 | mutex_unlock(&kexec_mutex); | 1175 | mutex_unlock(&kexec_mutex); |
1148 | return ret; | 1176 | return ret; |
@@ -1380,24 +1408,23 @@ int __init parse_crashkernel(char *cmdline, | |||
1380 | } | 1408 | } |
1381 | 1409 | ||
1382 | 1410 | ||
1383 | 1411 | static void update_vmcoreinfo_note(void) | |
1384 | void crash_save_vmcoreinfo(void) | ||
1385 | { | 1412 | { |
1386 | u32 *buf; | 1413 | u32 *buf = vmcoreinfo_note; |
1387 | 1414 | ||
1388 | if (!vmcoreinfo_size) | 1415 | if (!vmcoreinfo_size) |
1389 | return; | 1416 | return; |
1390 | |||
1391 | vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); | ||
1392 | |||
1393 | buf = (u32 *)vmcoreinfo_note; | ||
1394 | |||
1395 | buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, | 1417 | buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, |
1396 | vmcoreinfo_size); | 1418 | vmcoreinfo_size); |
1397 | |||
1398 | final_note(buf); | 1419 | final_note(buf); |
1399 | } | 1420 | } |
1400 | 1421 | ||
1422 | void crash_save_vmcoreinfo(void) | ||
1423 | { | ||
1424 | vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); | ||
1425 | update_vmcoreinfo_note(); | ||
1426 | } | ||
1427 | |||
1401 | void vmcoreinfo_append_str(const char *fmt, ...) | 1428 | void vmcoreinfo_append_str(const char *fmt, ...) |
1402 | { | 1429 | { |
1403 | va_list args; | 1430 | va_list args; |
@@ -1483,6 +1510,7 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1483 | VMCOREINFO_NUMBER(PG_swapcache); | 1510 | VMCOREINFO_NUMBER(PG_swapcache); |
1484 | 1511 | ||
1485 | arch_crash_save_vmcoreinfo(); | 1512 | arch_crash_save_vmcoreinfo(); |
1513 | update_vmcoreinfo_note(); | ||
1486 | 1514 | ||
1487 | return 0; | 1515 | return 0; |
1488 | } | 1516 | } |
@@ -1506,7 +1534,7 @@ int kernel_kexec(void) | |||
1506 | 1534 | ||
1507 | #ifdef CONFIG_KEXEC_JUMP | 1535 | #ifdef CONFIG_KEXEC_JUMP |
1508 | if (kexec_image->preserve_context) { | 1536 | if (kexec_image->preserve_context) { |
1509 | mutex_lock(&pm_mutex); | 1537 | lock_system_sleep(); |
1510 | pm_prepare_console(); | 1538 | pm_prepare_console(); |
1511 | error = freeze_processes(); | 1539 | error = freeze_processes(); |
1512 | if (error) { | 1540 | if (error) { |
@@ -1559,7 +1587,7 @@ int kernel_kexec(void) | |||
1559 | thaw_processes(); | 1587 | thaw_processes(); |
1560 | Restore_console: | 1588 | Restore_console: |
1561 | pm_restore_console(); | 1589 | pm_restore_console(); |
1562 | mutex_unlock(&pm_mutex); | 1590 | unlock_system_sleep(); |
1563 | } | 1591 | } |
1564 | #endif | 1592 | #endif |
1565 | 1593 | ||
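crash_map_reserved_pages()/crash_unmap_reserved_pages() are weak no-ops so that an architecture which keeps the crashkernel region unmapped for protection can map it only around kexec load and crashkernel shrink operations. An architecture override would look roughly like the sketch below; my_arch_map_range()/my_arch_unmap_range() are placeholders for whatever page-table helpers the architecture provides.

#include <linux/kexec.h>

void crash_map_reserved_pages(void)
{
        /* Make the reserved range accessible while segments are copied in. */
        my_arch_map_range(crashk_res.start, crashk_res.end);
}

void crash_unmap_reserved_pages(void)
{
        /* Hide it again once the load/shrink operation is finished. */
        my_arch_unmap_range(crashk_res.start, crashk_res.end);
}
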
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 01a0700e873f..c744b88c44e2 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -20,7 +20,7 @@ | |||
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/kernel.h> | 22 | #include <linux/kernel.h> |
23 | #include <linux/module.h> | 23 | #include <linux/export.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/log2.h> | 26 | #include <linux/log2.h> |
diff --git a/kernel/kmod.c b/kernel/kmod.c index ddc7644c1305..a0a88543934e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/resource.h> | 36 | #include <linux/resource.h> |
37 | #include <linux/notifier.h> | 37 | #include <linux/notifier.h> |
38 | #include <linux/suspend.h> | 38 | #include <linux/suspend.h> |
39 | #include <linux/rwsem.h> | ||
39 | #include <asm/uaccess.h> | 40 | #include <asm/uaccess.h> |
40 | 41 | ||
41 | #include <trace/events/module.h> | 42 | #include <trace/events/module.h> |
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq; | |||
50 | static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; | 51 | static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; |
51 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; | 52 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; |
52 | static DEFINE_SPINLOCK(umh_sysctl_lock); | 53 | static DEFINE_SPINLOCK(umh_sysctl_lock); |
54 | static DECLARE_RWSEM(umhelper_sem); | ||
53 | 55 | ||
54 | #ifdef CONFIG_MODULES | 56 | #ifdef CONFIG_MODULES |
55 | 57 | ||
@@ -114,10 +116,12 @@ int __request_module(bool wait, const char *fmt, ...) | |||
114 | atomic_inc(&kmod_concurrent); | 116 | atomic_inc(&kmod_concurrent); |
115 | if (atomic_read(&kmod_concurrent) > max_modprobes) { | 117 | if (atomic_read(&kmod_concurrent) > max_modprobes) { |
116 | /* We may be blaming an innocent here, but unlikely */ | 118 | /* We may be blaming an innocent here, but unlikely */ |
117 | if (kmod_loop_msg++ < 5) | 119 | if (kmod_loop_msg < 5) { |
118 | printk(KERN_ERR | 120 | printk(KERN_ERR |
119 | "request_module: runaway loop modprobe %s\n", | 121 | "request_module: runaway loop modprobe %s\n", |
120 | module_name); | 122 | module_name); |
123 | kmod_loop_msg++; | ||
124 | } | ||
121 | atomic_dec(&kmod_concurrent); | 125 | atomic_dec(&kmod_concurrent); |
122 | return -ENOMEM; | 126 | return -ENOMEM; |
123 | } | 127 | } |
@@ -273,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work) | |||
273 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | 277 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY |
274 | * (used for preventing user land processes from being created after the user | 278 | * (used for preventing user land processes from being created after the user |
275 | * land has been frozen during a system-wide hibernation or suspend operation). | 279 | * land has been frozen during a system-wide hibernation or suspend operation). |
280 | * Should always be manipulated under umhelper_sem acquired for write. | ||
276 | */ | 281 | */ |
277 | static int usermodehelper_disabled = 1; | 282 | static int usermodehelper_disabled = 1; |
278 | 283 | ||
@@ -280,17 +285,29 @@ static int usermodehelper_disabled = 1; | |||
280 | static atomic_t running_helpers = ATOMIC_INIT(0); | 285 | static atomic_t running_helpers = ATOMIC_INIT(0); |
281 | 286 | ||
282 | /* | 287 | /* |
283 | * Wait queue head used by usermodehelper_pm_callback() to wait for all running | 288 | * Wait queue head used by usermodehelper_disable() to wait for all running |
284 | * helpers to finish. | 289 | * helpers to finish. |
285 | */ | 290 | */ |
286 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); | 291 | static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); |
287 | 292 | ||
288 | /* | 293 | /* |
289 | * Time to wait for running_helpers to become zero before the setting of | 294 | * Time to wait for running_helpers to become zero before the setting of |
290 | * usermodehelper_disabled in usermodehelper_pm_callback() fails | 295 | * usermodehelper_disabled in usermodehelper_disable() fails |
291 | */ | 296 | */ |
292 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) | 297 | #define RUNNING_HELPERS_TIMEOUT (5 * HZ) |
293 | 298 | ||
299 | void read_lock_usermodehelper(void) | ||
300 | { | ||
301 | down_read(&umhelper_sem); | ||
302 | } | ||
303 | EXPORT_SYMBOL_GPL(read_lock_usermodehelper); | ||
304 | |||
305 | void read_unlock_usermodehelper(void) | ||
306 | { | ||
307 | up_read(&umhelper_sem); | ||
308 | } | ||
309 | EXPORT_SYMBOL_GPL(read_unlock_usermodehelper); | ||
310 | |||
294 | /** | 311 | /** |
295 | * usermodehelper_disable - prevent new helpers from being started | 312 | * usermodehelper_disable - prevent new helpers from being started |
296 | */ | 313 | */ |
@@ -298,8 +315,10 @@ int usermodehelper_disable(void) | |||
298 | { | 315 | { |
299 | long retval; | 316 | long retval; |
300 | 317 | ||
318 | down_write(&umhelper_sem); | ||
301 | usermodehelper_disabled = 1; | 319 | usermodehelper_disabled = 1; |
302 | smp_mb(); | 320 | up_write(&umhelper_sem); |
321 | |||
303 | /* | 322 | /* |
304 | * From now on call_usermodehelper_exec() won't start any new | 323 | * From now on call_usermodehelper_exec() won't start any new |
305 | * helpers, so it is sufficient if running_helpers turns out to | 324 | * helpers, so it is sufficient if running_helpers turns out to |
@@ -312,7 +331,9 @@ int usermodehelper_disable(void) | |||
312 | if (retval) | 331 | if (retval) |
313 | return 0; | 332 | return 0; |
314 | 333 | ||
334 | down_write(&umhelper_sem); | ||
315 | usermodehelper_disabled = 0; | 335 | usermodehelper_disabled = 0; |
336 | up_write(&umhelper_sem); | ||
316 | return -EAGAIN; | 337 | return -EAGAIN; |
317 | } | 338 | } |
318 | 339 | ||
@@ -321,7 +342,9 @@ int usermodehelper_disable(void) | |||
321 | */ | 342 | */ |
322 | void usermodehelper_enable(void) | 343 | void usermodehelper_enable(void) |
323 | { | 344 | { |
345 | down_write(&umhelper_sem); | ||
324 | usermodehelper_disabled = 0; | 346 | usermodehelper_disabled = 0; |
347 | up_write(&umhelper_sem); | ||
325 | } | 348 | } |
326 | 349 | ||
327 | /** | 350 | /** |
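read_lock_usermodehelper()/read_unlock_usermodehelper() let a subsystem check the disabled state and start a helper without racing against usermodehelper_disable() during suspend, now that the flag is updated under umhelper_sem. A sketch of the intended call pattern; it assumes the existing usermodehelper_is_disabled() query helper, and the caller, argv/envp and helper path are illustrative.

#include <linux/kmod.h>

static int my_run_helper(char **my_argv, char **my_envp)
{
        int ret;

        read_lock_usermodehelper();
        if (usermodehelper_is_disabled()) {             /* suspend in progress */
                ret = -EBUSY;
                goto out;
        }
        ret = call_usermodehelper("/sbin/my-helper", my_argv, my_envp,
                                  UMH_WAIT_PROC);
out:
        read_unlock_usermodehelper();
        return ret;
}
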
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b30fd54eb985..95dd7212e610 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -36,7 +36,7 @@ | |||
36 | #include <linux/init.h> | 36 | #include <linux/init.h> |
37 | #include <linux/slab.h> | 37 | #include <linux/slab.h> |
38 | #include <linux/stddef.h> | 38 | #include <linux/stddef.h> |
39 | #include <linux/module.h> | 39 | #include <linux/export.h> |
40 | #include <linux/moduleloader.h> | 40 | #include <linux/moduleloader.h> |
41 | #include <linux/kallsyms.h> | 41 | #include <linux/kallsyms.h> |
42 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed; | |||
78 | static DEFINE_MUTEX(kprobe_mutex); | 78 | static DEFINE_MUTEX(kprobe_mutex); |
79 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 79 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
80 | static struct { | 80 | static struct { |
81 | spinlock_t lock ____cacheline_aligned_in_smp; | 81 | raw_spinlock_t lock ____cacheline_aligned_in_smp; |
82 | } kretprobe_table_locks[KPROBE_TABLE_SIZE]; | 82 | } kretprobe_table_locks[KPROBE_TABLE_SIZE]; |
83 | 83 | ||
84 | static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) | 84 | static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) |
85 | { | 85 | { |
86 | return &(kretprobe_table_locks[hash].lock); | 86 | return &(kretprobe_table_locks[hash].lock); |
87 | } | 87 | } |
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, | |||
1013 | hlist_del(&ri->hlist); | 1013 | hlist_del(&ri->hlist); |
1014 | INIT_HLIST_NODE(&ri->hlist); | 1014 | INIT_HLIST_NODE(&ri->hlist); |
1015 | if (likely(rp)) { | 1015 | if (likely(rp)) { |
1016 | spin_lock(&rp->lock); | 1016 | raw_spin_lock(&rp->lock); |
1017 | hlist_add_head(&ri->hlist, &rp->free_instances); | 1017 | hlist_add_head(&ri->hlist, &rp->free_instances); |
1018 | spin_unlock(&rp->lock); | 1018 | raw_spin_unlock(&rp->lock); |
1019 | } else | 1019 | } else |
1020 | /* Unregistering */ | 1020 | /* Unregistering */ |
1021 | hlist_add_head(&ri->hlist, head); | 1021 | hlist_add_head(&ri->hlist, head); |
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, | |||
1026 | __acquires(hlist_lock) | 1026 | __acquires(hlist_lock) |
1027 | { | 1027 | { |
1028 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 1028 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
1029 | spinlock_t *hlist_lock; | 1029 | raw_spinlock_t *hlist_lock; |
1030 | 1030 | ||
1031 | *head = &kretprobe_inst_table[hash]; | 1031 | *head = &kretprobe_inst_table[hash]; |
1032 | hlist_lock = kretprobe_table_lock_ptr(hash); | 1032 | hlist_lock = kretprobe_table_lock_ptr(hash); |
1033 | spin_lock_irqsave(hlist_lock, *flags); | 1033 | raw_spin_lock_irqsave(hlist_lock, *flags); |
1034 | } | 1034 | } |
1035 | 1035 | ||
1036 | static void __kprobes kretprobe_table_lock(unsigned long hash, | 1036 | static void __kprobes kretprobe_table_lock(unsigned long hash, |
1037 | unsigned long *flags) | 1037 | unsigned long *flags) |
1038 | __acquires(hlist_lock) | 1038 | __acquires(hlist_lock) |
1039 | { | 1039 | { |
1040 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1040 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
1041 | spin_lock_irqsave(hlist_lock, *flags); | 1041 | raw_spin_lock_irqsave(hlist_lock, *flags); |
1042 | } | 1042 | } |
1043 | 1043 | ||
1044 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | 1044 | void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, |
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, | |||
1046 | __releases(hlist_lock) | 1046 | __releases(hlist_lock) |
1047 | { | 1047 | { |
1048 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); | 1048 | unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); |
1049 | spinlock_t *hlist_lock; | 1049 | raw_spinlock_t *hlist_lock; |
1050 | 1050 | ||
1051 | hlist_lock = kretprobe_table_lock_ptr(hash); | 1051 | hlist_lock = kretprobe_table_lock_ptr(hash); |
1052 | spin_unlock_irqrestore(hlist_lock, *flags); | 1052 | raw_spin_unlock_irqrestore(hlist_lock, *flags); |
1053 | } | 1053 | } |
1054 | 1054 | ||
1055 | static void __kprobes kretprobe_table_unlock(unsigned long hash, | 1055 | static void __kprobes kretprobe_table_unlock(unsigned long hash, |
1056 | unsigned long *flags) | 1056 | unsigned long *flags) |
1057 | __releases(hlist_lock) | 1057 | __releases(hlist_lock) |
1058 | { | 1058 | { |
1059 | spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); | 1059 | raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); |
1060 | spin_unlock_irqrestore(hlist_lock, *flags); | 1060 | raw_spin_unlock_irqrestore(hlist_lock, *flags); |
1061 | } | 1061 | } |
1062 | 1062 | ||
1063 | /* | 1063 | /* |
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
1663 | 1663 | ||
1664 | /*TODO: consider to only swap the RA after the last pre_handler fired */ | 1664 | /*TODO: consider to only swap the RA after the last pre_handler fired */ |
1665 | hash = hash_ptr(current, KPROBE_HASH_BITS); | 1665 | hash = hash_ptr(current, KPROBE_HASH_BITS); |
1666 | spin_lock_irqsave(&rp->lock, flags); | 1666 | raw_spin_lock_irqsave(&rp->lock, flags); |
1667 | if (!hlist_empty(&rp->free_instances)) { | 1667 | if (!hlist_empty(&rp->free_instances)) { |
1668 | ri = hlist_entry(rp->free_instances.first, | 1668 | ri = hlist_entry(rp->free_instances.first, |
1669 | struct kretprobe_instance, hlist); | 1669 | struct kretprobe_instance, hlist); |
1670 | hlist_del(&ri->hlist); | 1670 | hlist_del(&ri->hlist); |
1671 | spin_unlock_irqrestore(&rp->lock, flags); | 1671 | raw_spin_unlock_irqrestore(&rp->lock, flags); |
1672 | 1672 | ||
1673 | ri->rp = rp; | 1673 | ri->rp = rp; |
1674 | ri->task = current; | 1674 | ri->task = current; |
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
1685 | kretprobe_table_unlock(hash, &flags); | 1685 | kretprobe_table_unlock(hash, &flags); |
1686 | } else { | 1686 | } else { |
1687 | rp->nmissed++; | 1687 | rp->nmissed++; |
1688 | spin_unlock_irqrestore(&rp->lock, flags); | 1688 | raw_spin_unlock_irqrestore(&rp->lock, flags); |
1689 | } | 1689 | } |
1690 | return 0; | 1690 | return 0; |
1691 | } | 1691 | } |
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
1721 | rp->maxactive = num_possible_cpus(); | 1721 | rp->maxactive = num_possible_cpus(); |
1722 | #endif | 1722 | #endif |
1723 | } | 1723 | } |
1724 | spin_lock_init(&rp->lock); | 1724 | raw_spin_lock_init(&rp->lock); |
1725 | INIT_HLIST_HEAD(&rp->free_instances); | 1725 | INIT_HLIST_HEAD(&rp->free_instances); |
1726 | for (i = 0; i < rp->maxactive; i++) { | 1726 | for (i = 0; i < rp->maxactive; i++) { |
1727 | inst = kmalloc(sizeof(struct kretprobe_instance) + | 1727 | inst = kmalloc(sizeof(struct kretprobe_instance) + |
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void) | |||
1959 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 1959 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
1960 | INIT_HLIST_HEAD(&kprobe_table[i]); | 1960 | INIT_HLIST_HEAD(&kprobe_table[i]); |
1961 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 1961 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
1962 | spin_lock_init(&(kretprobe_table_locks[i].lock)); | 1962 | raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); |
1963 | } | 1963 | } |
1964 | 1964 | ||
1965 | /* | 1965 | /* |
@@ -2198,7 +2198,7 @@ static ssize_t write_enabled_file_bool(struct file *file, | |||
2198 | const char __user *user_buf, size_t count, loff_t *ppos) | 2198 | const char __user *user_buf, size_t count, loff_t *ppos) |
2199 | { | 2199 | { |
2200 | char buf[32]; | 2200 | char buf[32]; |
2201 | int buf_size; | 2201 | size_t buf_size; |
2202 | 2202 | ||
2203 | buf_size = min(count, (sizeof(buf)-1)); | 2203 | buf_size = min(count, (sizeof(buf)-1)); |
2204 | if (copy_from_user(buf, user_buf, buf_size)) | 2204 | if (copy_from_user(buf, user_buf, buf_size)) |
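The kretprobe bookkeeping (the hashed kretprobe_table_locks and each rp->lock) becomes raw_spinlock_t so it remains a real spinlock on preempt-rt kernels; these locks are taken on every hit of a kretprobe such as the minimal example below. The probed symbol and handler are assumptions for illustration.

#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
        /* Runs when the probed function returns. */
        return 0;
}

static struct kretprobe my_kretprobe = {
        .kp.symbol_name = "do_fork",                    /* assumed target */
        .handler        = my_ret_handler,
        .maxactive      = 16,                           /* concurrent instances */
};

static int __init my_probe_init(void)
{
        return register_kretprobe(&my_kretprobe);
}

static void __exit my_probe_exit(void)
{
        unregister_kretprobe(&my_kretprobe);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
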
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3b053c04dd86..4e316e1acf58 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -11,10 +11,11 @@ | |||
11 | #include <linux/kobject.h> | 11 | #include <linux/kobject.h> |
12 | #include <linux/string.h> | 12 | #include <linux/string.h> |
13 | #include <linux/sysfs.h> | 13 | #include <linux/sysfs.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
17 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
18 | #include <linux/stat.h> | ||
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/capability.h> | 20 | #include <linux/capability.h> |
20 | 21 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ba7cccb4994..3d3de633702e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/cpuset.h> | 12 | #include <linux/cpuset.h> |
13 | #include <linux/unistd.h> | 13 | #include <linux/unistd.h> |
14 | #include <linux/file.h> | 14 | #include <linux/file.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/freezer.h> | 18 | #include <linux/freezer.h> |
@@ -59,6 +59,31 @@ int kthread_should_stop(void) | |||
59 | EXPORT_SYMBOL(kthread_should_stop); | 59 | EXPORT_SYMBOL(kthread_should_stop); |
60 | 60 | ||
61 | /** | 61 | /** |
62 | * kthread_freezable_should_stop - should this freezable kthread return now? | ||
63 | * @was_frozen: optional out parameter, indicates whether %current was frozen | ||
64 | * | ||
65 | * kthread_should_stop() for freezable kthreads, which will enter | ||
66 | * refrigerator if necessary. This function is safe from kthread_stop() / | ||
67 | * freezer deadlock and freezable kthreads should use this function instead | ||
68 | * of calling try_to_freeze() directly. | ||
69 | */ | ||
70 | bool kthread_freezable_should_stop(bool *was_frozen) | ||
71 | { | ||
72 | bool frozen = false; | ||
73 | |||
74 | might_sleep(); | ||
75 | |||
76 | if (unlikely(freezing(current))) | ||
77 | frozen = __refrigerator(true); | ||
78 | |||
79 | if (was_frozen) | ||
80 | *was_frozen = frozen; | ||
81 | |||
82 | return kthread_should_stop(); | ||
83 | } | ||
84 | EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); | ||
85 | |||
86 | /** | ||
62 | * kthread_data - return data value specified on kthread creation | 87 | * kthread_data - return data value specified on kthread creation |
63 | * @task: kthread task in question | 88 | * @task: kthread task in question |
64 | * | 89 | * |
@@ -257,7 +282,7 @@ int kthreadd(void *unused) | |||
257 | set_cpus_allowed_ptr(tsk, cpu_all_mask); | 282 | set_cpus_allowed_ptr(tsk, cpu_all_mask); |
258 | set_mems_allowed(node_states[N_HIGH_MEMORY]); | 283 | set_mems_allowed(node_states[N_HIGH_MEMORY]); |
259 | 284 | ||
260 | current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; | 285 | current->flags |= PF_NOFREEZE; |
261 | 286 | ||
262 | for (;;) { | 287 | for (;;) { |
263 | set_current_state(TASK_INTERRUPTIBLE); | 288 | set_current_state(TASK_INTERRUPTIBLE); |
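A minimal sketch of how a freezable kthread would use the helper added above; the thread function and its names are invented for illustration and are not part of this patch:

	#include <linux/kthread.h>
	#include <linux/freezer.h>
	#include <linux/delay.h>

	/* Hypothetical worker thread, not from this diff. */
	static int example_thread_fn(void *data)
	{
		bool was_frozen;

		set_freezable();	/* opt this kthread into the freezer */

		while (!kthread_freezable_should_stop(&was_frozen)) {
			if (was_frozen)
				pr_debug("thawed, rescanning state\n");

			/* do one unit of work, then sleep */
			msleep_interruptible(1000);
		}
		return 0;
	}

The point of the new helper is that freezable kthreads call it instead of try_to_freeze(), which is what avoids the kthread_stop()/freezer deadlock mentioned in the kerneldoc; dropping PF_FREEZER_NOSIG from kthreadd above is part of the same freezer rework.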
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 376066e10413..a462b317f9a0 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -53,12 +53,12 @@ | |||
53 | #include <linux/notifier.h> | 53 | #include <linux/notifier.h> |
54 | #include <linux/spinlock.h> | 54 | #include <linux/spinlock.h> |
55 | #include <linux/proc_fs.h> | 55 | #include <linux/proc_fs.h> |
56 | #include <linux/module.h> | 56 | #include <linux/export.h> |
57 | #include <linux/sched.h> | 57 | #include <linux/sched.h> |
58 | #include <linux/list.h> | 58 | #include <linux/list.h> |
59 | #include <linux/stacktrace.h> | 59 | #include <linux/stacktrace.h> |
60 | 60 | ||
61 | static DEFINE_SPINLOCK(latency_lock); | 61 | static DEFINE_RAW_SPINLOCK(latency_lock); |
62 | 62 | ||
63 | #define MAXLR 128 | 63 | #define MAXLR 128 |
64 | static struct latency_record latency_record[MAXLR]; | 64 | static struct latency_record latency_record[MAXLR]; |
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p) | |||
72 | if (!latencytop_enabled) | 72 | if (!latencytop_enabled) |
73 | return; | 73 | return; |
74 | 74 | ||
75 | spin_lock_irqsave(&latency_lock, flags); | 75 | raw_spin_lock_irqsave(&latency_lock, flags); |
76 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | 76 | memset(&p->latency_record, 0, sizeof(p->latency_record)); |
77 | p->latency_record_count = 0; | 77 | p->latency_record_count = 0; |
78 | spin_unlock_irqrestore(&latency_lock, flags); | 78 | raw_spin_unlock_irqrestore(&latency_lock, flags); |
79 | } | 79 | } |
80 | 80 | ||
81 | static void clear_global_latency_tracing(void) | 81 | static void clear_global_latency_tracing(void) |
82 | { | 82 | { |
83 | unsigned long flags; | 83 | unsigned long flags; |
84 | 84 | ||
85 | spin_lock_irqsave(&latency_lock, flags); | 85 | raw_spin_lock_irqsave(&latency_lock, flags); |
86 | memset(&latency_record, 0, sizeof(latency_record)); | 86 | memset(&latency_record, 0, sizeof(latency_record)); |
87 | spin_unlock_irqrestore(&latency_lock, flags); | 87 | raw_spin_unlock_irqrestore(&latency_lock, flags); |
88 | } | 88 | } |
89 | 89 | ||
90 | static void __sched | 90 | static void __sched |
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
190 | lat.max = usecs; | 190 | lat.max = usecs; |
191 | store_stacktrace(tsk, &lat); | 191 | store_stacktrace(tsk, &lat); |
192 | 192 | ||
193 | spin_lock_irqsave(&latency_lock, flags); | 193 | raw_spin_lock_irqsave(&latency_lock, flags); |
194 | 194 | ||
195 | account_global_scheduler_latency(tsk, &lat); | 195 | account_global_scheduler_latency(tsk, &lat); |
196 | 196 | ||
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | |||
231 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | 231 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); |
232 | 232 | ||
233 | out_unlock: | 233 | out_unlock: |
234 | spin_unlock_irqrestore(&latency_lock, flags); | 234 | raw_spin_unlock_irqrestore(&latency_lock, flags); |
235 | } | 235 | } |
236 | 236 | ||
237 | static int lstats_show(struct seq_file *m, void *v) | 237 | static int lstats_show(struct seq_file *m, void *v) |
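The latencytop conversion above follows the standard pattern for a lock that must remain a true spinning lock (for example under PREEMPT_RT): declare it with DEFINE_RAW_SPINLOCK and use the raw_spin_* variants everywhere. A small sketch with an invented lock name:

	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(example_lock);	/* hypothetical */

	static void example_update(void)
	{
		unsigned long flags;

		raw_spin_lock_irqsave(&example_lock, flags);
		/* keep this short and non-sleeping: raw spinlocks never
		 * become sleeping locks, even on RT */
		raw_spin_unlock_irqrestore(&example_lock, flags);
	}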
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 91d67ce3a8d5..8889f7dd7c46 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/stringify.h> | 44 | #include <linux/stringify.h> |
45 | #include <linux/bitops.h> | 45 | #include <linux/bitops.h> |
46 | #include <linux/gfp.h> | 46 | #include <linux/gfp.h> |
47 | #include <linux/kmemcheck.h> | ||
47 | 48 | ||
48 | #include <asm/sections.h> | 49 | #include <asm/sections.h> |
49 | 50 | ||
@@ -96,8 +97,13 @@ static int graph_lock(void) | |||
96 | 97 | ||
97 | static inline int graph_unlock(void) | 98 | static inline int graph_unlock(void) |
98 | { | 99 | { |
99 | if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) | 100 | if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { |
101 | /* | ||
102 | * The lockdep graph lock isn't locked while we expect it to | ||
103 | * be, we're confused now, bye! | ||
104 | */ | ||
100 | return DEBUG_LOCKS_WARN_ON(1); | 105 | return DEBUG_LOCKS_WARN_ON(1); |
106 | } | ||
101 | 107 | ||
102 | current->lockdep_recursion--; | 108 | current->lockdep_recursion--; |
103 | arch_spin_unlock(&lockdep_lock); | 109 | arch_spin_unlock(&lockdep_lock); |
@@ -134,6 +140,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | |||
134 | static inline struct lock_class *hlock_class(struct held_lock *hlock) | 140 | static inline struct lock_class *hlock_class(struct held_lock *hlock) |
135 | { | 141 | { |
136 | if (!hlock->class_idx) { | 142 | if (!hlock->class_idx) { |
143 | /* | ||
144 | * Someone passed in garbage, we give up. | ||
145 | */ | ||
137 | DEBUG_LOCKS_WARN_ON(1); | 146 | DEBUG_LOCKS_WARN_ON(1); |
138 | return NULL; | 147 | return NULL; |
139 | } | 148 | } |
@@ -422,6 +431,7 @@ unsigned int max_lockdep_depth; | |||
422 | * about it later on, in lockdep_info(). | 431 | * about it later on, in lockdep_info(). |
423 | */ | 432 | */ |
424 | static int lockdep_init_error; | 433 | static int lockdep_init_error; |
434 | static const char *lock_init_error; | ||
425 | static unsigned long lockdep_init_trace_data[20]; | 435 | static unsigned long lockdep_init_trace_data[20]; |
426 | static struct stack_trace lockdep_init_trace = { | 436 | static struct stack_trace lockdep_init_trace = { |
427 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), | 437 | .max_entries = ARRAY_SIZE(lockdep_init_trace_data), |
@@ -490,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) | |||
490 | usage[i] = '\0'; | 500 | usage[i] = '\0'; |
491 | } | 501 | } |
492 | 502 | ||
493 | static int __print_lock_name(struct lock_class *class) | 503 | static void __print_lock_name(struct lock_class *class) |
494 | { | 504 | { |
495 | char str[KSYM_NAME_LEN]; | 505 | char str[KSYM_NAME_LEN]; |
496 | const char *name; | 506 | const char *name; |
497 | 507 | ||
498 | name = class->name; | 508 | name = class->name; |
499 | if (!name) | ||
500 | name = __get_key_name(class->key, str); | ||
501 | |||
502 | return printk("%s", name); | ||
503 | } | ||
504 | |||
505 | static void print_lock_name(struct lock_class *class) | ||
506 | { | ||
507 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; | ||
508 | const char *name; | ||
509 | |||
510 | get_usage_chars(class, usage); | ||
511 | |||
512 | name = class->name; | ||
513 | if (!name) { | 509 | if (!name) { |
514 | name = __get_key_name(class->key, str); | 510 | name = __get_key_name(class->key, str); |
515 | printk(" (%s", name); | 511 | printk("%s", name); |
516 | } else { | 512 | } else { |
517 | printk(" (%s", name); | 513 | printk("%s", name); |
518 | if (class->name_version > 1) | 514 | if (class->name_version > 1) |
519 | printk("#%d", class->name_version); | 515 | printk("#%d", class->name_version); |
520 | if (class->subclass) | 516 | if (class->subclass) |
521 | printk("/%d", class->subclass); | 517 | printk("/%d", class->subclass); |
522 | } | 518 | } |
519 | } | ||
520 | |||
521 | static void print_lock_name(struct lock_class *class) | ||
522 | { | ||
523 | char usage[LOCK_USAGE_CHARS]; | ||
524 | |||
525 | get_usage_chars(class, usage); | ||
526 | |||
527 | printk(" ("); | ||
528 | __print_lock_name(class); | ||
523 | printk("){%s}", usage); | 529 | printk("){%s}", usage); |
524 | } | 530 | } |
525 | 531 | ||
@@ -559,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr) | |||
559 | } | 565 | } |
560 | } | 566 | } |
561 | 567 | ||
562 | static void print_kernel_version(void) | 568 | static void print_kernel_ident(void) |
563 | { | 569 | { |
564 | printk("%s %.*s\n", init_utsname()->release, | 570 | printk("%s %.*s %s\n", init_utsname()->release, |
565 | (int)strcspn(init_utsname()->version, " "), | 571 | (int)strcspn(init_utsname()->version, " "), |
566 | init_utsname()->version); | 572 | init_utsname()->version, |
573 | print_tainted()); | ||
567 | } | 574 | } |
568 | 575 | ||
569 | static int very_verbose(struct lock_class *class) | 576 | static int very_verbose(struct lock_class *class) |
@@ -647,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
647 | if (unlikely(!lockdep_initialized)) { | 654 | if (unlikely(!lockdep_initialized)) { |
648 | lockdep_init(); | 655 | lockdep_init(); |
649 | lockdep_init_error = 1; | 656 | lockdep_init_error = 1; |
657 | lock_init_error = lock->name; | ||
650 | save_stack_trace(&lockdep_init_trace); | 658 | save_stack_trace(&lockdep_init_trace); |
651 | } | 659 | } |
652 | #endif | 660 | #endif |
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
687 | */ | 695 | */ |
688 | list_for_each_entry(class, hash_head, hash_entry) { | 696 | list_for_each_entry(class, hash_head, hash_entry) { |
689 | if (class->key == key) { | 697 | if (class->key == key) { |
698 | /* | ||
699 | * Huh! same key, different name? Did someone trample | ||
700 | * on some memory? We're most confused. | ||
701 | */ | ||
690 | WARN_ON_ONCE(class->name != lock->name); | 702 | WARN_ON_ONCE(class->name != lock->name); |
691 | return class; | 703 | return class; |
692 | } | 704 | } |
@@ -710,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
710 | 722 | ||
711 | class = look_up_lock_class(lock, subclass); | 723 | class = look_up_lock_class(lock, subclass); |
712 | if (likely(class)) | 724 | if (likely(class)) |
713 | return class; | 725 | goto out_set_class_cache; |
714 | 726 | ||
715 | /* | 727 | /* |
716 | * Debug-check: all keys must be persistent! | 728 | * Debug-check: all keys must be persistent! |
@@ -795,11 +807,16 @@ out_unlock_set: | |||
795 | graph_unlock(); | 807 | graph_unlock(); |
796 | raw_local_irq_restore(flags); | 808 | raw_local_irq_restore(flags); |
797 | 809 | ||
810 | out_set_class_cache: | ||
798 | if (!subclass || force) | 811 | if (!subclass || force) |
799 | lock->class_cache[0] = class; | 812 | lock->class_cache[0] = class; |
800 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) | 813 | else if (subclass < NR_LOCKDEP_CACHING_CLASSES) |
801 | lock->class_cache[subclass] = class; | 814 | lock->class_cache[subclass] = class; |
802 | 815 | ||
816 | /* | ||
817 | * Hash collision, did we smoke some? We found a class with a matching | ||
818 | * hash but the subclass -- which is hashed in -- didn't match. | ||
819 | */ | ||
803 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) | 820 | if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) |
804 | return NULL; | 821 | return NULL; |
805 | 822 | ||
@@ -926,7 +943,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, | |||
926 | unsigned long nr; | 943 | unsigned long nr; |
927 | 944 | ||
928 | nr = lock - list_entries; | 945 | nr = lock - list_entries; |
929 | WARN_ON(nr >= nr_list_entries); | 946 | WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ |
930 | lock->parent = parent; | 947 | lock->parent = parent; |
931 | lock->class->dep_gen_id = lockdep_dependency_gen_id; | 948 | lock->class->dep_gen_id = lockdep_dependency_gen_id; |
932 | } | 949 | } |
@@ -936,7 +953,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) | |||
936 | unsigned long nr; | 953 | unsigned long nr; |
937 | 954 | ||
938 | nr = lock - list_entries; | 955 | nr = lock - list_entries; |
939 | WARN_ON(nr >= nr_list_entries); | 956 | WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ |
940 | return lock->class->dep_gen_id == lockdep_dependency_gen_id; | 957 | return lock->class->dep_gen_id == lockdep_dependency_gen_id; |
941 | } | 958 | } |
942 | 959 | ||
@@ -1129,10 +1146,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
1129 | if (debug_locks_silent) | 1146 | if (debug_locks_silent) |
1130 | return 0; | 1147 | return 0; |
1131 | 1148 | ||
1132 | printk("\n=======================================================\n"); | 1149 | printk("\n"); |
1133 | printk( "[ INFO: possible circular locking dependency detected ]\n"); | 1150 | printk("======================================================\n"); |
1134 | print_kernel_version(); | 1151 | printk("[ INFO: possible circular locking dependency detected ]\n"); |
1135 | printk( "-------------------------------------------------------\n"); | 1152 | print_kernel_ident(); |
1153 | printk("-------------------------------------------------------\n"); | ||
1136 | printk("%s/%d is trying to acquire lock:\n", | 1154 | printk("%s/%d is trying to acquire lock:\n", |
1137 | curr->comm, task_pid_nr(curr)); | 1155 | curr->comm, task_pid_nr(curr)); |
1138 | print_lock(check_src); | 1156 | print_lock(check_src); |
@@ -1196,6 +1214,9 @@ static noinline int print_bfs_bug(int ret) | |||
1196 | if (!debug_locks_off_graph_unlock()) | 1214 | if (!debug_locks_off_graph_unlock()) |
1197 | return 0; | 1215 | return 0; |
1198 | 1216 | ||
1217 | /* | ||
1218 | * Breadth-first-search failed, graph got corrupted? | ||
1219 | */ | ||
1199 | WARN(1, "lockdep bfs error:%d\n", ret); | 1220 | WARN(1, "lockdep bfs error:%d\n", ret); |
1200 | 1221 | ||
1201 | return 0; | 1222 | return 0; |
@@ -1463,11 +1484,12 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1463 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1484 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
1464 | return 0; | 1485 | return 0; |
1465 | 1486 | ||
1466 | printk("\n======================================================\n"); | 1487 | printk("\n"); |
1467 | printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | 1488 | printk("======================================================\n"); |
1489 | printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", | ||
1468 | irqclass, irqclass); | 1490 | irqclass, irqclass); |
1469 | print_kernel_version(); | 1491 | print_kernel_ident(); |
1470 | printk( "------------------------------------------------------\n"); | 1492 | printk("------------------------------------------------------\n"); |
1471 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", | 1493 | printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", |
1472 | curr->comm, task_pid_nr(curr), | 1494 | curr->comm, task_pid_nr(curr), |
1473 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, | 1495 | curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, |
@@ -1692,10 +1714,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1692 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1714 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
1693 | return 0; | 1715 | return 0; |
1694 | 1716 | ||
1695 | printk("\n=============================================\n"); | 1717 | printk("\n"); |
1696 | printk( "[ INFO: possible recursive locking detected ]\n"); | 1718 | printk("=============================================\n"); |
1697 | print_kernel_version(); | 1719 | printk("[ INFO: possible recursive locking detected ]\n"); |
1698 | printk( "---------------------------------------------\n"); | 1720 | print_kernel_ident(); |
1721 | printk("---------------------------------------------\n"); | ||
1699 | printk("%s/%d is trying to acquire lock:\n", | 1722 | printk("%s/%d is trying to acquire lock:\n", |
1700 | curr->comm, task_pid_nr(curr)); | 1723 | curr->comm, task_pid_nr(curr)); |
1701 | print_lock(next); | 1724 | print_lock(next); |
@@ -1944,6 +1967,11 @@ out_bug: | |||
1944 | if (!debug_locks_off_graph_unlock()) | 1967 | if (!debug_locks_off_graph_unlock()) |
1945 | return 0; | 1968 | return 0; |
1946 | 1969 | ||
1970 | /* | ||
1971 | * Clearly we all shouldn't be here, but since we made it we | ||
1972 | * can reliable say we messed up our state. See the above two | ||
1973 | * gotos for reasons why we could possibly end up here. | ||
1974 | */ | ||
1947 | WARN_ON(1); | 1975 | WARN_ON(1); |
1948 | 1976 | ||
1949 | return 0; | 1977 | return 0; |
@@ -1975,6 +2003,11 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
1975 | struct held_lock *hlock_curr, *hlock_next; | 2003 | struct held_lock *hlock_curr, *hlock_next; |
1976 | int i, j; | 2004 | int i, j; |
1977 | 2005 | ||
2006 | /* | ||
2007 | * We might need to take the graph lock, ensure we've got IRQs | ||
2008 | * disabled to make this an IRQ-safe lock.. for recursion reasons | ||
2009 | * lockdep won't complain about its own locking errors. | ||
2010 | */ | ||
1978 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2011 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
1979 | return 0; | 2012 | return 0; |
1980 | /* | 2013 | /* |
@@ -2126,6 +2159,10 @@ static void check_chain_key(struct task_struct *curr) | |||
2126 | hlock = curr->held_locks + i; | 2159 | hlock = curr->held_locks + i; |
2127 | if (chain_key != hlock->prev_chain_key) { | 2160 | if (chain_key != hlock->prev_chain_key) { |
2128 | debug_locks_off(); | 2161 | debug_locks_off(); |
2162 | /* | ||
2163 | * We got mighty confused, our chain keys don't match | ||
2164 | * with what we expect, someone trample on our task state? | ||
2165 | */ | ||
2129 | WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", | 2166 | WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", |
2130 | curr->lockdep_depth, i, | 2167 | curr->lockdep_depth, i, |
2131 | (unsigned long long)chain_key, | 2168 | (unsigned long long)chain_key, |
@@ -2133,6 +2170,9 @@ static void check_chain_key(struct task_struct *curr) | |||
2133 | return; | 2170 | return; |
2134 | } | 2171 | } |
2135 | id = hlock->class_idx - 1; | 2172 | id = hlock->class_idx - 1; |
2173 | /* | ||
2174 | * Whoops ran out of static storage again? | ||
2175 | */ | ||
2136 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) | 2176 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) |
2137 | return; | 2177 | return; |
2138 | 2178 | ||
@@ -2144,6 +2184,10 @@ static void check_chain_key(struct task_struct *curr) | |||
2144 | } | 2184 | } |
2145 | if (chain_key != curr->curr_chain_key) { | 2185 | if (chain_key != curr->curr_chain_key) { |
2146 | debug_locks_off(); | 2186 | debug_locks_off(); |
2187 | /* | ||
2188 | * More smoking hash instead of calculating it, damn see these | ||
2189 | * numbers float.. I bet that a pink elephant stepped on my memory. | ||
2190 | */ | ||
2147 | WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", | 2191 | WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", |
2148 | curr->lockdep_depth, i, | 2192 | curr->lockdep_depth, i, |
2149 | (unsigned long long)chain_key, | 2193 | (unsigned long long)chain_key, |
@@ -2177,10 +2221,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2177 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2221 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
2178 | return 0; | 2222 | return 0; |
2179 | 2223 | ||
2180 | printk("\n=================================\n"); | 2224 | printk("\n"); |
2181 | printk( "[ INFO: inconsistent lock state ]\n"); | 2225 | printk("=================================\n"); |
2182 | print_kernel_version(); | 2226 | printk("[ INFO: inconsistent lock state ]\n"); |
2183 | printk( "---------------------------------\n"); | 2227 | print_kernel_ident(); |
2228 | printk("---------------------------------\n"); | ||
2184 | 2229 | ||
2185 | printk("inconsistent {%s} -> {%s} usage.\n", | 2230 | printk("inconsistent {%s} -> {%s} usage.\n", |
2186 | usage_str[prev_bit], usage_str[new_bit]); | 2231 | usage_str[prev_bit], usage_str[new_bit]); |
@@ -2241,10 +2286,11 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2241 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2286 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
2242 | return 0; | 2287 | return 0; |
2243 | 2288 | ||
2244 | printk("\n=========================================================\n"); | 2289 | printk("\n"); |
2245 | printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); | 2290 | printk("=========================================================\n"); |
2246 | print_kernel_version(); | 2291 | printk("[ INFO: possible irq lock inversion dependency detected ]\n"); |
2247 | printk( "---------------------------------------------------------\n"); | 2292 | print_kernel_ident(); |
2293 | printk("---------------------------------------------------------\n"); | ||
2248 | printk("%s/%d just changed the state of lock:\n", | 2294 | printk("%s/%d just changed the state of lock:\n", |
2249 | curr->comm, task_pid_nr(curr)); | 2295 | curr->comm, task_pid_nr(curr)); |
2250 | print_lock(this); | 2296 | print_lock(this); |
@@ -2525,12 +2571,24 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
2525 | return; | 2571 | return; |
2526 | } | 2572 | } |
2527 | 2573 | ||
2574 | /* | ||
2575 | * We're enabling irqs and according to our state above irqs weren't | ||
2576 | * already enabled, yet we find the hardware thinks they are in fact | ||
2577 | * enabled.. someone messed up their IRQ state tracing. | ||
2578 | */ | ||
2528 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2579 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2529 | return; | 2580 | return; |
2530 | 2581 | ||
2582 | /* | ||
2583 | * See the fine text that goes along with this variable definition. | ||
2584 | */ | ||
2531 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) | 2585 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) |
2532 | return; | 2586 | return; |
2533 | 2587 | ||
2588 | /* | ||
2589 | * Can't allow enabling interrupts while in an interrupt handler, | ||
2590 | * that's general bad form and such. Recursion, limited stack etc.. | ||
2591 | */ | ||
2534 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | 2592 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) |
2535 | return; | 2593 | return; |
2536 | 2594 | ||
@@ -2558,6 +2616,10 @@ void trace_hardirqs_off_caller(unsigned long ip) | |||
2558 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2616 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2559 | return; | 2617 | return; |
2560 | 2618 | ||
2619 | /* | ||
2620 | * So we're supposed to get called after you mask local IRQs, but for | ||
2621 | * some reason the hardware doesn't quite think you did a proper job. | ||
2622 | */ | ||
2561 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2623 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2562 | return; | 2624 | return; |
2563 | 2625 | ||
@@ -2590,6 +2652,10 @@ void trace_softirqs_on(unsigned long ip) | |||
2590 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2652 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2591 | return; | 2653 | return; |
2592 | 2654 | ||
2655 | /* | ||
2656 | * We fancy IRQs being disabled here, see softirq.c, avoids | ||
2657 | * funny state and nesting things. | ||
2658 | */ | ||
2593 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2659 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2594 | return; | 2660 | return; |
2595 | 2661 | ||
@@ -2626,6 +2692,9 @@ void trace_softirqs_off(unsigned long ip) | |||
2626 | if (unlikely(!debug_locks || current->lockdep_recursion)) | 2692 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
2627 | return; | 2693 | return; |
2628 | 2694 | ||
2695 | /* | ||
2696 | * We fancy IRQs being disabled here, see softirq.c | ||
2697 | */ | ||
2629 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2698 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2630 | return; | 2699 | return; |
2631 | 2700 | ||
@@ -2637,6 +2706,9 @@ void trace_softirqs_off(unsigned long ip) | |||
2637 | curr->softirq_disable_ip = ip; | 2706 | curr->softirq_disable_ip = ip; |
2638 | curr->softirq_disable_event = ++curr->irq_events; | 2707 | curr->softirq_disable_event = ++curr->irq_events; |
2639 | debug_atomic_inc(softirqs_off_events); | 2708 | debug_atomic_inc(softirqs_off_events); |
2709 | /* | ||
2710 | * Whoops, we wanted softirqs off, so why aren't they? | ||
2711 | */ | ||
2640 | DEBUG_LOCKS_WARN_ON(!softirq_count()); | 2712 | DEBUG_LOCKS_WARN_ON(!softirq_count()); |
2641 | } else | 2713 | } else |
2642 | debug_atomic_inc(redundant_softirqs_off); | 2714 | debug_atomic_inc(redundant_softirqs_off); |
@@ -2661,6 +2733,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) | |||
2661 | if (!(gfp_mask & __GFP_FS)) | 2733 | if (!(gfp_mask & __GFP_FS)) |
2662 | return; | 2734 | return; |
2663 | 2735 | ||
2736 | /* | ||
2737 | * Oi! Can't be having __GFP_FS allocations with IRQs disabled. | ||
2738 | */ | ||
2664 | if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) | 2739 | if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) |
2665 | return; | 2740 | return; |
2666 | 2741 | ||
@@ -2773,13 +2848,13 @@ static int separate_irq_context(struct task_struct *curr, | |||
2773 | return 0; | 2848 | return 0; |
2774 | } | 2849 | } |
2775 | 2850 | ||
2776 | #else | 2851 | #else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ |
2777 | 2852 | ||
2778 | static inline | 2853 | static inline |
2779 | int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | 2854 | int mark_lock_irq(struct task_struct *curr, struct held_lock *this, |
2780 | enum lock_usage_bit new_bit) | 2855 | enum lock_usage_bit new_bit) |
2781 | { | 2856 | { |
2782 | WARN_ON(1); | 2857 | WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ |
2783 | return 1; | 2858 | return 1; |
2784 | } | 2859 | } |
2785 | 2860 | ||
@@ -2799,7 +2874,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask) | |||
2799 | { | 2874 | { |
2800 | } | 2875 | } |
2801 | 2876 | ||
2802 | #endif | 2877 | #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ |
2803 | 2878 | ||
2804 | /* | 2879 | /* |
2805 | * Mark a lock with a usage bit, and validate the state transition: | 2880 | * Mark a lock with a usage bit, and validate the state transition: |
@@ -2874,12 +2949,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2874 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2949 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
2875 | struct lock_class_key *key, int subclass) | 2950 | struct lock_class_key *key, int subclass) |
2876 | { | 2951 | { |
2877 | memset(lock, 0, sizeof(*lock)); | 2952 | int i; |
2953 | |||
2954 | kmemcheck_mark_initialized(lock, sizeof(*lock)); | ||
2955 | |||
2956 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | ||
2957 | lock->class_cache[i] = NULL; | ||
2878 | 2958 | ||
2879 | #ifdef CONFIG_LOCK_STAT | 2959 | #ifdef CONFIG_LOCK_STAT |
2880 | lock->cpu = raw_smp_processor_id(); | 2960 | lock->cpu = raw_smp_processor_id(); |
2881 | #endif | 2961 | #endif |
2882 | 2962 | ||
2963 | /* | ||
2964 | * Can't be having no nameless bastards around this place! | ||
2965 | */ | ||
2883 | if (DEBUG_LOCKS_WARN_ON(!name)) { | 2966 | if (DEBUG_LOCKS_WARN_ON(!name)) { |
2884 | lock->name = "NULL"; | 2967 | lock->name = "NULL"; |
2885 | return; | 2968 | return; |
@@ -2887,6 +2970,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2887 | 2970 | ||
2888 | lock->name = name; | 2971 | lock->name = name; |
2889 | 2972 | ||
2973 | /* | ||
2974 | * No key, no joy, we need to hash something. | ||
2975 | */ | ||
2890 | if (DEBUG_LOCKS_WARN_ON(!key)) | 2976 | if (DEBUG_LOCKS_WARN_ON(!key)) |
2891 | return; | 2977 | return; |
2892 | /* | 2978 | /* |
@@ -2894,6 +2980,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
2894 | */ | 2980 | */ |
2895 | if (!static_obj(key)) { | 2981 | if (!static_obj(key)) { |
2896 | printk("BUG: key %p not in .data!\n", key); | 2982 | printk("BUG: key %p not in .data!\n", key); |
2983 | /* | ||
2984 | * What it says above ^^^^^, I suggest you read it. | ||
2985 | */ | ||
2897 | DEBUG_LOCKS_WARN_ON(1); | 2986 | DEBUG_LOCKS_WARN_ON(1); |
2898 | return; | 2987 | return; |
2899 | } | 2988 | } |
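The static_obj() check above (and its "key %p not in .data!" message) is why lock class keys must live in static storage. A sketch of the usual way callers satisfy that, with hypothetical driver names that are not part of this patch:

	#include <linux/spinlock.h>
	#include <linux/lockdep.h>

	/* static => passes the static_obj() test in lockdep_init_map() */
	static struct lock_class_key mydrv_lock_key;

	struct mydrv {
		spinlock_t lock;
	};

	static void mydrv_init_locks(struct mydrv *d)
	{
		spin_lock_init(&d->lock);
		/* give this lock its own lockdep class */
		lockdep_set_class(&d->lock, &mydrv_lock_key);
	}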
@@ -2932,6 +3021,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2932 | if (unlikely(!debug_locks)) | 3021 | if (unlikely(!debug_locks)) |
2933 | return 0; | 3022 | return 0; |
2934 | 3023 | ||
3024 | /* | ||
3025 | * Lockdep should run with IRQs disabled, otherwise we could | ||
3026 | * get an interrupt which would want to take locks, which would | ||
3027 | * end up in lockdep and have you got a head-ache already? | ||
3028 | */ | ||
2935 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 3029 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
2936 | return 0; | 3030 | return 0; |
2937 | 3031 | ||
@@ -2963,6 +3057,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2963 | * dependency checks are done) | 3057 | * dependency checks are done) |
2964 | */ | 3058 | */ |
2965 | depth = curr->lockdep_depth; | 3059 | depth = curr->lockdep_depth; |
3060 | /* | ||
3061 | * Ran out of static storage for our per-task lock stack again have we? | ||
3062 | */ | ||
2966 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) | 3063 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) |
2967 | return 0; | 3064 | return 0; |
2968 | 3065 | ||
@@ -2981,6 +3078,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2981 | } | 3078 | } |
2982 | 3079 | ||
2983 | hlock = curr->held_locks + depth; | 3080 | hlock = curr->held_locks + depth; |
3081 | /* | ||
3082 | * Plain impossible, we just registered it and checked it weren't no | ||
3083 | * NULL like.. I bet this mushroom I ate was good! | ||
3084 | */ | ||
2984 | if (DEBUG_LOCKS_WARN_ON(!class)) | 3085 | if (DEBUG_LOCKS_WARN_ON(!class)) |
2985 | return 0; | 3086 | return 0; |
2986 | hlock->class_idx = class_idx; | 3087 | hlock->class_idx = class_idx; |
@@ -3015,11 +3116,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
3015 | * the hash, not class->key. | 3116 | * the hash, not class->key. |
3016 | */ | 3117 | */ |
3017 | id = class - lock_classes; | 3118 | id = class - lock_classes; |
3119 | /* | ||
3120 | * Whoops, we did it again.. ran straight out of our static allocation. | ||
3121 | */ | ||
3018 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) | 3122 | if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) |
3019 | return 0; | 3123 | return 0; |
3020 | 3124 | ||
3021 | chain_key = curr->curr_chain_key; | 3125 | chain_key = curr->curr_chain_key; |
3022 | if (!depth) { | 3126 | if (!depth) { |
3127 | /* | ||
3128 | * How can we have a chain hash when we ain't got no keys?! | ||
3129 | */ | ||
3023 | if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) | 3130 | if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) |
3024 | return 0; | 3131 | return 0; |
3025 | chain_head = 1; | 3132 | chain_head = 1; |
@@ -3065,9 +3172,11 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3065 | if (debug_locks_silent) | 3172 | if (debug_locks_silent) |
3066 | return 0; | 3173 | return 0; |
3067 | 3174 | ||
3068 | printk("\n=====================================\n"); | 3175 | printk("\n"); |
3069 | printk( "[ BUG: bad unlock balance detected! ]\n"); | 3176 | printk("=====================================\n"); |
3070 | printk( "-------------------------------------\n"); | 3177 | printk("[ BUG: bad unlock balance detected! ]\n"); |
3178 | print_kernel_ident(); | ||
3179 | printk("-------------------------------------\n"); | ||
3071 | printk("%s/%d is trying to release lock (", | 3180 | printk("%s/%d is trying to release lock (", |
3072 | curr->comm, task_pid_nr(curr)); | 3181 | curr->comm, task_pid_nr(curr)); |
3073 | print_lockdep_cache(lock); | 3182 | print_lockdep_cache(lock); |
@@ -3091,6 +3200,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, | |||
3091 | { | 3200 | { |
3092 | if (unlikely(!debug_locks)) | 3201 | if (unlikely(!debug_locks)) |
3093 | return 0; | 3202 | return 0; |
3203 | /* | ||
3204 | * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. | ||
3205 | */ | ||
3094 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 3206 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
3095 | return 0; | 3207 | return 0; |
3096 | 3208 | ||
@@ -3120,6 +3232,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) | |||
3120 | if (!class) | 3232 | if (!class) |
3121 | return 0; | 3233 | return 0; |
3122 | 3234 | ||
3235 | /* | ||
3236 | * References, but not a lock we're actually ref-counting? | ||
3237 | * State got messed up, follow the sites that change ->references | ||
3238 | * and try to make sense of it. | ||
3239 | */ | ||
3123 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) | 3240 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) |
3124 | return 0; | 3241 | return 0; |
3125 | 3242 | ||
@@ -3142,6 +3259,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
3142 | int i; | 3259 | int i; |
3143 | 3260 | ||
3144 | depth = curr->lockdep_depth; | 3261 | depth = curr->lockdep_depth; |
3262 | /* | ||
3263 | * This function is about (re)setting the class of a held lock, | ||
3264 | * yet we're not actually holding any locks. Naughty user! | ||
3265 | */ | ||
3145 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3266 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3146 | return 0; | 3267 | return 0; |
3147 | 3268 | ||
@@ -3177,6 +3298,10 @@ found_it: | |||
3177 | return 0; | 3298 | return 0; |
3178 | } | 3299 | } |
3179 | 3300 | ||
3301 | /* | ||
3302 | * I took it apart and put it back together again, except now I have | ||
3303 | * these 'spare' parts.. where shall I put them. | ||
3304 | */ | ||
3180 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) | 3305 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) |
3181 | return 0; | 3306 | return 0; |
3182 | return 1; | 3307 | return 1; |
@@ -3201,6 +3326,10 @@ lock_release_non_nested(struct task_struct *curr, | |||
3201 | * of held locks: | 3326 | * of held locks: |
3202 | */ | 3327 | */ |
3203 | depth = curr->lockdep_depth; | 3328 | depth = curr->lockdep_depth; |
3329 | /* | ||
3330 | * So we're all set to release this lock.. wait what lock? We don't | ||
3331 | * own any locks, you've been drinking again? | ||
3332 | */ | ||
3204 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3333 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3205 | return 0; | 3334 | return 0; |
3206 | 3335 | ||
@@ -3253,6 +3382,10 @@ found_it: | |||
3253 | return 0; | 3382 | return 0; |
3254 | } | 3383 | } |
3255 | 3384 | ||
3385 | /* | ||
3386 | * We had N bottles of beer on the wall, we drank one, but now | ||
3387 | * there's not N-1 bottles of beer left on the wall... | ||
3388 | */ | ||
3256 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) | 3389 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) |
3257 | return 0; | 3390 | return 0; |
3258 | return 1; | 3391 | return 1; |
@@ -3283,6 +3416,9 @@ static int lock_release_nested(struct task_struct *curr, | |||
3283 | return lock_release_non_nested(curr, lock, ip); | 3416 | return lock_release_non_nested(curr, lock, ip); |
3284 | curr->lockdep_depth--; | 3417 | curr->lockdep_depth--; |
3285 | 3418 | ||
3419 | /* | ||
3420 | * No more locks, but somehow we've got hash left over, who left it? | ||
3421 | */ | ||
3286 | if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) | 3422 | if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) |
3287 | return 0; | 3423 | return 0; |
3288 | 3424 | ||
@@ -3365,10 +3501,13 @@ static void check_flags(unsigned long flags) | |||
3365 | * check if not in hardirq contexts: | 3501 | * check if not in hardirq contexts: |
3366 | */ | 3502 | */ |
3367 | if (!hardirq_count()) { | 3503 | if (!hardirq_count()) { |
3368 | if (softirq_count()) | 3504 | if (softirq_count()) { |
3505 | /* like the above, but with softirqs */ | ||
3369 | DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); | 3506 | DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); |
3370 | else | 3507 | } else { |
3508 | /* lick the above, does it taste good? */ | ||
3371 | DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); | 3509 | DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); |
3510 | } | ||
3372 | } | 3511 | } |
3373 | 3512 | ||
3374 | if (!debug_locks) | 3513 | if (!debug_locks) |
@@ -3478,9 +3617,11 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
3478 | if (debug_locks_silent) | 3617 | if (debug_locks_silent) |
3479 | return 0; | 3618 | return 0; |
3480 | 3619 | ||
3481 | printk("\n=================================\n"); | 3620 | printk("\n"); |
3482 | printk( "[ BUG: bad contention detected! ]\n"); | 3621 | printk("=================================\n"); |
3483 | printk( "---------------------------------\n"); | 3622 | printk("[ BUG: bad contention detected! ]\n"); |
3623 | print_kernel_ident(); | ||
3624 | printk("---------------------------------\n"); | ||
3484 | printk("%s/%d is trying to contend lock (", | 3625 | printk("%s/%d is trying to contend lock (", |
3485 | curr->comm, task_pid_nr(curr)); | 3626 | curr->comm, task_pid_nr(curr)); |
3486 | print_lockdep_cache(lock); | 3627 | print_lockdep_cache(lock); |
@@ -3506,6 +3647,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) | |||
3506 | int i, contention_point, contending_point; | 3647 | int i, contention_point, contending_point; |
3507 | 3648 | ||
3508 | depth = curr->lockdep_depth; | 3649 | depth = curr->lockdep_depth; |
3650 | /* | ||
3651 | * Whee, we contended on this lock, except it seems we're not | ||
3652 | * actually trying to acquire anything much at all.. | ||
3653 | */ | ||
3509 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3654 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3510 | return; | 3655 | return; |
3511 | 3656 | ||
@@ -3555,6 +3700,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) | |||
3555 | int i, cpu; | 3700 | int i, cpu; |
3556 | 3701 | ||
3557 | depth = curr->lockdep_depth; | 3702 | depth = curr->lockdep_depth; |
3703 | /* | ||
3704 | * Yay, we acquired ownership of this lock we didn't try to | ||
3705 | * acquire, how the heck did that happen? | ||
3706 | */ | ||
3558 | if (DEBUG_LOCKS_WARN_ON(!depth)) | 3707 | if (DEBUG_LOCKS_WARN_ON(!depth)) |
3559 | return; | 3708 | return; |
3560 | 3709 | ||
@@ -3759,8 +3908,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
3759 | match |= class == lock->class_cache[j]; | 3908 | match |= class == lock->class_cache[j]; |
3760 | 3909 | ||
3761 | if (unlikely(match)) { | 3910 | if (unlikely(match)) { |
3762 | if (debug_locks_off_graph_unlock()) | 3911 | if (debug_locks_off_graph_unlock()) { |
3912 | /* | ||
3913 | * We all just reset everything, how did it match? | ||
3914 | */ | ||
3763 | WARN_ON(1); | 3915 | WARN_ON(1); |
3916 | } | ||
3764 | goto out_restore; | 3917 | goto out_restore; |
3765 | } | 3918 | } |
3766 | } | 3919 | } |
@@ -3823,7 +3976,8 @@ void __init lockdep_info(void) | |||
3823 | 3976 | ||
3824 | #ifdef CONFIG_DEBUG_LOCKDEP | 3977 | #ifdef CONFIG_DEBUG_LOCKDEP |
3825 | if (lockdep_init_error) { | 3978 | if (lockdep_init_error) { |
3826 | printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); | 3979 | printk("WARNING: lockdep init error! lock-%s was acquired" |
3980 | "before lockdep_init\n", lock_init_error); | ||
3827 | printk("Call stack leading to lockdep invocation was:\n"); | 3981 | printk("Call stack leading to lockdep invocation was:\n"); |
3828 | print_stack_trace(&lockdep_init_trace, 0); | 3982 | print_stack_trace(&lockdep_init_trace, 0); |
3829 | } | 3983 | } |
@@ -3839,9 +3993,11 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
3839 | if (debug_locks_silent) | 3993 | if (debug_locks_silent) |
3840 | return; | 3994 | return; |
3841 | 3995 | ||
3842 | printk("\n=========================\n"); | 3996 | printk("\n"); |
3843 | printk( "[ BUG: held lock freed! ]\n"); | 3997 | printk("=========================\n"); |
3844 | printk( "-------------------------\n"); | 3998 | printk("[ BUG: held lock freed! ]\n"); |
3999 | print_kernel_ident(); | ||
4000 | printk("-------------------------\n"); | ||
3845 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", | 4001 | printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", |
3846 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); | 4002 | curr->comm, task_pid_nr(curr), mem_from, mem_to-1); |
3847 | print_lock(hlock); | 4003 | print_lock(hlock); |
@@ -3895,9 +4051,11 @@ static void print_held_locks_bug(struct task_struct *curr) | |||
3895 | if (debug_locks_silent) | 4051 | if (debug_locks_silent) |
3896 | return; | 4052 | return; |
3897 | 4053 | ||
3898 | printk("\n=====================================\n"); | 4054 | printk("\n"); |
3899 | printk( "[ BUG: lock held at task exit time! ]\n"); | 4055 | printk("=====================================\n"); |
3900 | printk( "-------------------------------------\n"); | 4056 | printk("[ BUG: lock held at task exit time! ]\n"); |
4057 | print_kernel_ident(); | ||
4058 | printk("-------------------------------------\n"); | ||
3901 | printk("%s/%d is exiting with locks still held!\n", | 4059 | printk("%s/%d is exiting with locks still held!\n", |
3902 | curr->comm, task_pid_nr(curr)); | 4060 | curr->comm, task_pid_nr(curr)); |
3903 | lockdep_print_held_locks(curr); | 4061 | lockdep_print_held_locks(curr); |
@@ -3991,16 +4149,18 @@ void lockdep_sys_exit(void) | |||
3991 | if (unlikely(curr->lockdep_depth)) { | 4149 | if (unlikely(curr->lockdep_depth)) { |
3992 | if (!debug_locks_off()) | 4150 | if (!debug_locks_off()) |
3993 | return; | 4151 | return; |
3994 | printk("\n================================================\n"); | 4152 | printk("\n"); |
3995 | printk( "[ BUG: lock held when returning to user space! ]\n"); | 4153 | printk("================================================\n"); |
3996 | printk( "------------------------------------------------\n"); | 4154 | printk("[ BUG: lock held when returning to user space! ]\n"); |
4155 | print_kernel_ident(); | ||
4156 | printk("------------------------------------------------\n"); | ||
3997 | printk("%s/%d is leaving the kernel with locks still held!\n", | 4157 | printk("%s/%d is leaving the kernel with locks still held!\n", |
3998 | curr->comm, curr->pid); | 4158 | curr->comm, curr->pid); |
3999 | lockdep_print_held_locks(curr); | 4159 | lockdep_print_held_locks(curr); |
4000 | } | 4160 | } |
4001 | } | 4161 | } |
4002 | 4162 | ||
4003 | void lockdep_rcu_dereference(const char *file, const int line) | 4163 | void lockdep_rcu_suspicious(const char *file, const int line, const char *s) |
4004 | { | 4164 | { |
4005 | struct task_struct *curr = current; | 4165 | struct task_struct *curr = current; |
4006 | 4166 | ||
@@ -4009,15 +4169,38 @@ void lockdep_rcu_dereference(const char *file, const int line) | |||
4009 | return; | 4169 | return; |
4010 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ | 4170 | #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ |
4011 | /* Note: the following can be executed concurrently, so be careful. */ | 4171 | /* Note: the following can be executed concurrently, so be careful. */ |
4012 | printk("\n===================================================\n"); | 4172 | printk("\n"); |
4013 | printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); | 4173 | printk("===============================\n"); |
4014 | printk( "---------------------------------------------------\n"); | 4174 | printk("[ INFO: suspicious RCU usage. ]\n"); |
4015 | printk("%s:%d invoked rcu_dereference_check() without protection!\n", | 4175 | print_kernel_ident(); |
4016 | file, line); | 4176 | printk("-------------------------------\n"); |
4177 | printk("%s:%d %s!\n", file, line, s); | ||
4017 | printk("\nother info that might help us debug this:\n\n"); | 4178 | printk("\nother info that might help us debug this:\n\n"); |
4018 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); | 4179 | printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); |
4180 | |||
4181 | /* | ||
4182 | * If a CPU is in the RCU-free window in idle (ie: in the section | ||
4183 | * between rcu_idle_enter() and rcu_idle_exit(), then RCU | ||
4184 | * considers that CPU to be in an "extended quiescent state", | ||
4185 | * which means that RCU will be completely ignoring that CPU. | ||
4186 | * Therefore, rcu_read_lock() and friends have absolutely no | ||
4187 | * effect on a CPU running in that state. In other words, even if | ||
4188 | * such an RCU-idle CPU has called rcu_read_lock(), RCU might well | ||
4189 | * delete data structures out from under it. RCU really has no | ||
4190 | * choice here: we need to keep an RCU-free window in idle where | ||
4191 | * the CPU may possibly enter into low power mode. This way we can | ||
4192 | * notice an extended quiescent state to other CPUs that started a grace | ||
4193 | * period. Otherwise we would delay any grace period as long as we run | ||
4194 | * in the idle task. | ||
4195 | * | ||
4196 | * So complain bitterly if someone does call rcu_read_lock(), | ||
4197 | * rcu_read_lock_bh() and so on from extended quiescent states. | ||
4198 | */ | ||
4199 | if (rcu_is_cpu_idle()) | ||
4200 | printk("RCU used illegally from extended quiescent state!\n"); | ||
4201 | |||
4019 | lockdep_print_held_locks(curr); | 4202 | lockdep_print_held_locks(curr); |
4020 | printk("\nstack backtrace:\n"); | 4203 | printk("\nstack backtrace:\n"); |
4021 | dump_stack(); | 4204 | dump_stack(); |
4022 | } | 4205 | } |
4023 | EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); | 4206 | EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); |
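lockdep_rcu_suspicious() is what fires when the condition passed to rcu_dereference_check() (or one of its relatives) is false, including the RCU-from-idle case described in the long comment above. A hedged sketch of a caller; struct foo, gp and my_lock are invented names:

	#include <linux/rcupdate.h>
	#include <linux/spinlock.h>

	struct foo { int val; };

	static DEFINE_SPINLOCK(my_lock);	/* hypothetical */
	static struct foo __rcu *gp;		/* hypothetical RCU-protected pointer */

	static int read_val(void)
	{
		struct foo *p;
		int val;

		rcu_read_lock();
		/* legal under rcu_read_lock() or with my_lock held;
		 * anything else triggers the splat printed above */
		p = rcu_dereference_check(gp,
					  rcu_read_lock_held() ||
					  lockdep_is_held(&my_lock));
		val = p ? p->val : -1;
		rcu_read_unlock();

		return val;
	}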
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 71edd2f60c02..91c32a0b612c 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
@@ -11,7 +11,7 @@ | |||
11 | * Code for /proc/lockdep and /proc/lockdep_stats: | 11 | * Code for /proc/lockdep and /proc/lockdep_stats: |
12 | * | 12 | * |
13 | */ | 13 | */ |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/proc_fs.h> | 15 | #include <linux/proc_fs.h> |
16 | #include <linux/seq_file.h> | 16 | #include <linux/seq_file.h> |
17 | #include <linux/kallsyms.h> | 17 | #include <linux/kallsyms.h> |
diff --git a/kernel/module.c b/kernel/module.c index 04379f92f843..acf6ed3ebe81 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -16,7 +16,7 @@ | |||
16 | along with this program; if not, write to the Free Software | 16 | along with this program; if not, write to the Free Software |
17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
18 | */ | 18 | */ |
19 | #include <linux/module.h> | 19 | #include <linux/export.h> |
20 | #include <linux/moduleloader.h> | 20 | #include <linux/moduleloader.h> |
21 | #include <linux/ftrace_event.h> | 21 | #include <linux/ftrace_event.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
@@ -62,12 +62,6 @@ | |||
62 | #define CREATE_TRACE_POINTS | 62 | #define CREATE_TRACE_POINTS |
63 | #include <trace/events/module.h> | 63 | #include <trace/events/module.h> |
64 | 64 | ||
65 | #if 0 | ||
66 | #define DEBUGP printk | ||
67 | #else | ||
68 | #define DEBUGP(fmt , a...) | ||
69 | #endif | ||
70 | |||
71 | #ifndef ARCH_SHF_SMALL | 65 | #ifndef ARCH_SHF_SMALL |
72 | #define ARCH_SHF_SMALL 0 | 66 | #define ARCH_SHF_SMALL 0 |
73 | #endif | 67 | #endif |
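The DEBUGP macro removed above is replaced with pr_debug() throughout module.c. pr_debug() compiles to nothing unless DEBUG is defined for the file or CONFIG_DYNAMIC_DEBUG is enabled; a minimal sketch, with an invented function name:

	#define DEBUG			/* per-file opt-in; or rely on dynamic debug */
	#include <linux/printk.h>

	static void example_lookup(const char *name)
	{
		pr_debug("Failed to find symbol %s\n", name);
	}

With CONFIG_DYNAMIC_DEBUG the same call sites can instead be enabled at runtime, e.g. echo 'file module.c +p' > /sys/kernel/debug/dynamic_debug/control.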
@@ -138,7 +132,6 @@ struct load_info { | |||
138 | unsigned long len; | 132 | unsigned long len; |
139 | Elf_Shdr *sechdrs; | 133 | Elf_Shdr *sechdrs; |
140 | char *secstrings, *strtab; | 134 | char *secstrings, *strtab; |
141 | unsigned long *strmap; | ||
142 | unsigned long symoffs, stroffs; | 135 | unsigned long symoffs, stroffs; |
143 | struct _ddebug *debug; | 136 | struct _ddebug *debug; |
144 | unsigned int num_debug; | 137 | unsigned int num_debug; |
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
410 | return fsa.sym; | 403 | return fsa.sym; |
411 | } | 404 | } |
412 | 405 | ||
413 | DEBUGP("Failed to find symbol %s\n", name); | 406 | pr_debug("Failed to find symbol %s\n", name); |
414 | return NULL; | 407 | return NULL; |
415 | } | 408 | } |
416 | EXPORT_SYMBOL_GPL(find_symbol); | 409 | EXPORT_SYMBOL_GPL(find_symbol); |
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b) | |||
600 | 593 | ||
601 | list_for_each_entry(use, &b->source_list, source_list) { | 594 | list_for_each_entry(use, &b->source_list, source_list) { |
602 | if (use->source == a) { | 595 | if (use->source == a) { |
603 | DEBUGP("%s uses %s!\n", a->name, b->name); | 596 | pr_debug("%s uses %s!\n", a->name, b->name); |
604 | return 1; | 597 | return 1; |
605 | } | 598 | } |
606 | } | 599 | } |
607 | DEBUGP("%s does not use %s!\n", a->name, b->name); | 600 | pr_debug("%s does not use %s!\n", a->name, b->name); |
608 | return 0; | 601 | return 0; |
609 | } | 602 | } |
610 | 603 | ||
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b) | |||
619 | { | 612 | { |
620 | struct module_use *use; | 613 | struct module_use *use; |
621 | 614 | ||
622 | DEBUGP("Allocating new usage for %s.\n", a->name); | 615 | pr_debug("Allocating new usage for %s.\n", a->name); |
623 | use = kmalloc(sizeof(*use), GFP_ATOMIC); | 616 | use = kmalloc(sizeof(*use), GFP_ATOMIC); |
624 | if (!use) { | 617 | if (!use) { |
625 | printk(KERN_WARNING "%s: out of memory loading\n", a->name); | 618 | printk(KERN_WARNING "%s: out of memory loading\n", a->name); |
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod) | |||
663 | mutex_lock(&module_mutex); | 656 | mutex_lock(&module_mutex); |
664 | list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { | 657 | list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { |
665 | struct module *i = use->target; | 658 | struct module *i = use->target; |
666 | DEBUGP("%s unusing %s\n", mod->name, i->name); | 659 | pr_debug("%s unusing %s\n", mod->name, i->name); |
667 | module_put(i); | 660 | module_put(i); |
668 | list_del(&use->source_list); | 661 | list_del(&use->source_list); |
669 | list_del(&use->target_list); | 662 | list_del(&use->target_list); |
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced) | |||
726 | } | 719 | } |
727 | } | 720 | } |
728 | 721 | ||
729 | unsigned int module_refcount(struct module *mod) | 722 | unsigned long module_refcount(struct module *mod) |
730 | { | 723 | { |
731 | unsigned int incs = 0, decs = 0; | 724 | unsigned long incs = 0, decs = 0; |
732 | int cpu; | 725 | int cpu; |
733 | 726 | ||
734 | for_each_possible_cpu(cpu) | 727 | for_each_possible_cpu(cpu) |
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod) | |||
761 | /* Since we might sleep for some time, release the mutex first */ | 754 | /* Since we might sleep for some time, release the mutex first */ |
762 | mutex_unlock(&module_mutex); | 755 | mutex_unlock(&module_mutex); |
763 | for (;;) { | 756 | for (;;) { |
764 | DEBUGP("Looking at refcount...\n"); | 757 | pr_debug("Looking at refcount...\n"); |
765 | set_current_state(TASK_UNINTERRUPTIBLE); | 758 | set_current_state(TASK_UNINTERRUPTIBLE); |
766 | if (module_refcount(mod) == 0) | 759 | if (module_refcount(mod) == 0) |
767 | break; | 760 | break; |
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
804 | if (mod->state != MODULE_STATE_LIVE) { | 797 | if (mod->state != MODULE_STATE_LIVE) { |
805 | /* FIXME: if (force), slam module count and wake up | 798 | /* FIXME: if (force), slam module count and wake up |
806 | waiter --RR */ | 799 | waiter --RR */ |
807 | DEBUGP("%s already dying\n", mod->name); | 800 | pr_debug("%s already dying\n", mod->name); |
808 | ret = -EBUSY; | 801 | ret = -EBUSY; |
809 | goto out; | 802 | goto out; |
810 | } | 803 | } |
@@ -849,12 +842,32 @@ out: | |||
849 | return ret; | 842 | return ret; |
850 | } | 843 | } |
851 | 844 | ||
845 | static size_t module_flags_taint(struct module *mod, char *buf) | ||
846 | { | ||
847 | size_t l = 0; | ||
848 | |||
849 | if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) | ||
850 | buf[l++] = 'P'; | ||
851 | if (mod->taints & (1 << TAINT_OOT_MODULE)) | ||
852 | buf[l++] = 'O'; | ||
853 | if (mod->taints & (1 << TAINT_FORCED_MODULE)) | ||
854 | buf[l++] = 'F'; | ||
855 | if (mod->taints & (1 << TAINT_CRAP)) | ||
856 | buf[l++] = 'C'; | ||
857 | /* | ||
858 | * TAINT_FORCED_RMMOD: could be added. | ||
859 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | ||
860 | * apply to modules. | ||
861 | */ | ||
862 | return l; | ||
863 | } | ||
864 | |||
852 | static inline void print_unload_info(struct seq_file *m, struct module *mod) | 865 | static inline void print_unload_info(struct seq_file *m, struct module *mod) |
853 | { | 866 | { |
854 | struct module_use *use; | 867 | struct module_use *use; |
855 | int printed_something = 0; | 868 | int printed_something = 0; |
856 | 869 | ||
857 | seq_printf(m, " %u ", module_refcount(mod)); | 870 | seq_printf(m, " %lu ", module_refcount(mod)); |
858 | 871 | ||
859 | /* Always include a trailing , so userspace can differentiate | 872 | /* Always include a trailing , so userspace can differentiate |
860 | between this and the old multi-field proc format. */ | 873 | between this and the old multi-field proc format. */ |
@@ -904,13 +917,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); | |||
904 | static ssize_t show_refcnt(struct module_attribute *mattr, | 917 | static ssize_t show_refcnt(struct module_attribute *mattr, |
905 | struct module_kobject *mk, char *buffer) | 918 | struct module_kobject *mk, char *buffer) |
906 | { | 919 | { |
907 | return sprintf(buffer, "%u\n", module_refcount(mk->mod)); | 920 | return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); |
908 | } | 921 | } |
909 | 922 | ||
910 | static struct module_attribute refcnt = { | 923 | static struct module_attribute modinfo_refcnt = |
911 | .attr = { .name = "refcnt", .mode = 0444 }, | 924 | __ATTR(refcnt, 0444, show_refcnt, NULL); |
912 | .show = show_refcnt, | ||
913 | }; | ||
914 | 925 | ||
915 | void module_put(struct module *module) | 926 | void module_put(struct module *module) |
916 | { | 927 | { |
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, | |||
970 | return sprintf(buffer, "%s\n", state); | 981 | return sprintf(buffer, "%s\n", state); |
971 | } | 982 | } |
972 | 983 | ||
973 | static struct module_attribute initstate = { | 984 | static struct module_attribute modinfo_initstate = |
974 | .attr = { .name = "initstate", .mode = 0444 }, | 985 | __ATTR(initstate, 0444, show_initstate, NULL); |
975 | .show = show_initstate, | ||
976 | }; | ||
977 | 986 | ||
978 | static ssize_t store_uevent(struct module_attribute *mattr, | 987 | static ssize_t store_uevent(struct module_attribute *mattr, |
979 | struct module_kobject *mk, | 988 | struct module_kobject *mk, |
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr, | |||
986 | return count; | 995 | return count; |
987 | } | 996 | } |
988 | 997 | ||
989 | struct module_attribute module_uevent = { | 998 | struct module_attribute module_uevent = |
990 | .attr = { .name = "uevent", .mode = 0200 }, | 999 | __ATTR(uevent, 0200, NULL, store_uevent); |
991 | .store = store_uevent, | 1000 | |
992 | }; | 1001 | static ssize_t show_coresize(struct module_attribute *mattr, |
1002 | struct module_kobject *mk, char *buffer) | ||
1003 | { | ||
1004 | return sprintf(buffer, "%u\n", mk->mod->core_size); | ||
1005 | } | ||
1006 | |||
1007 | static struct module_attribute modinfo_coresize = | ||
1008 | __ATTR(coresize, 0444, show_coresize, NULL); | ||
1009 | |||
1010 | static ssize_t show_initsize(struct module_attribute *mattr, | ||
1011 | struct module_kobject *mk, char *buffer) | ||
1012 | { | ||
1013 | return sprintf(buffer, "%u\n", mk->mod->init_size); | ||
1014 | } | ||
1015 | |||
1016 | static struct module_attribute modinfo_initsize = | ||
1017 | __ATTR(initsize, 0444, show_initsize, NULL); | ||
1018 | |||
1019 | static ssize_t show_taint(struct module_attribute *mattr, | ||
1020 | struct module_kobject *mk, char *buffer) | ||
1021 | { | ||
1022 | size_t l; | ||
1023 | |||
1024 | l = module_flags_taint(mk->mod, buffer); | ||
1025 | buffer[l++] = '\n'; | ||
1026 | return l; | ||
1027 | } | ||
1028 | |||
1029 | static struct module_attribute modinfo_taint = | ||
1030 | __ATTR(taint, 0444, show_taint, NULL); | ||
993 | 1031 | ||
994 | static struct module_attribute *modinfo_attrs[] = { | 1032 | static struct module_attribute *modinfo_attrs[] = { |
1033 | &module_uevent, | ||
995 | &modinfo_version, | 1034 | &modinfo_version, |
996 | &modinfo_srcversion, | 1035 | &modinfo_srcversion, |
997 | &initstate, | 1036 | &modinfo_initstate, |
998 | &module_uevent, | 1037 | &modinfo_coresize, |
1038 | &modinfo_initsize, | ||
1039 | &modinfo_taint, | ||
999 | #ifdef CONFIG_MODULE_UNLOAD | 1040 | #ifdef CONFIG_MODULE_UNLOAD |
1000 | &refcnt, | 1041 | &modinfo_refcnt, |
1001 | #endif | 1042 | #endif |
1002 | NULL, | 1043 | NULL, |
1003 | }; | 1044 | }; |
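Editor's note: the hunk above collapses the open-coded struct initializers into the generic __ATTR() helper and grows the per-module sysfs directory with coresize, initsize and taint files alongside the existing refcnt/initstate/uevent entries. A minimal sketch of the pattern being applied (the attribute name and show routine below are illustrative, not part of this patch):

    #include <linux/kernel.h>
    #include <linux/module.h>

    /* Sketch only: a read-only module attribute built with __ATTR(). */
    static ssize_t show_example(struct module_attribute *mattr,
                                struct module_kobject *mk, char *buffer)
    {
            /* mk->mod is the owning struct module; print any per-module value */
            return sprintf(buffer, "%u\n", mk->mod->num_syms);
    }

    static struct module_attribute modinfo_example =
            __ATTR(example, 0444, show_example, NULL);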
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs, | |||
1057 | 1098 | ||
1058 | if (versions[i].crc == maybe_relocated(*crc, crc_owner)) | 1099 | if (versions[i].crc == maybe_relocated(*crc, crc_owner)) |
1059 | return 1; | 1100 | return 1; |
1060 | DEBUGP("Found checksum %lX vs module %lX\n", | 1101 | pr_debug("Found checksum %lX vs module %lX\n", |
1061 | maybe_relocated(*crc, crc_owner), versions[i].crc); | 1102 | maybe_relocated(*crc, crc_owner), versions[i].crc); |
1062 | goto bad_version; | 1103 | goto bad_version; |
1063 | } | 1104 | } |
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
1834 | case SHN_COMMON: | 1875 | case SHN_COMMON: |
1835 | /* We compiled with -fno-common. These are not | 1876 | /* We compiled with -fno-common. These are not |
1836 | supposed to happen. */ | 1877 | supposed to happen. */ |
1837 | DEBUGP("Common symbol: %s\n", name); | 1878 | pr_debug("Common symbol: %s\n", name); |
1838 | printk("%s: please compile with -fno-common\n", | 1879 | printk("%s: please compile with -fno-common\n", |
1839 | mod->name); | 1880 | mod->name); |
1840 | ret = -ENOEXEC; | 1881 | ret = -ENOEXEC; |
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
1842 | 1883 | ||
1843 | case SHN_ABS: | 1884 | case SHN_ABS: |
1844 | /* Don't need to do anything */ | 1885 | /* Don't need to do anything */ |
1845 | DEBUGP("Absolute symbol: 0x%08lx\n", | 1886 | pr_debug("Absolute symbol: 0x%08lx\n", |
1846 | (long)sym[i].st_value); | 1887 | (long)sym[i].st_value); |
1847 | break; | 1888 | break; |
1848 | 1889 | ||
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1966 | for (i = 0; i < info->hdr->e_shnum; i++) | 2007 | for (i = 0; i < info->hdr->e_shnum; i++) |
1967 | info->sechdrs[i].sh_entsize = ~0UL; | 2008 | info->sechdrs[i].sh_entsize = ~0UL; |
1968 | 2009 | ||
1969 | DEBUGP("Core section allocation order:\n"); | 2010 | pr_debug("Core section allocation order:\n"); |
1970 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 2011 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
1971 | for (i = 0; i < info->hdr->e_shnum; ++i) { | 2012 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
1972 | Elf_Shdr *s = &info->sechdrs[i]; | 2013 | Elf_Shdr *s = &info->sechdrs[i]; |
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1978 | || strstarts(sname, ".init")) | 2019 | || strstarts(sname, ".init")) |
1979 | continue; | 2020 | continue; |
1980 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); | 2021 | s->sh_entsize = get_offset(mod, &mod->core_size, s, i); |
1981 | DEBUGP("\t%s\n", name); | 2022 | pr_debug("\t%s\n", sname); |
1982 | } | 2023 | } |
1983 | switch (m) { | 2024 | switch (m) { |
1984 | case 0: /* executable */ | 2025 | case 0: /* executable */ |
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
1995 | } | 2036 | } |
1996 | } | 2037 | } |
1997 | 2038 | ||
1998 | DEBUGP("Init section allocation order:\n"); | 2039 | pr_debug("Init section allocation order:\n"); |
1999 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { | 2040 | for (m = 0; m < ARRAY_SIZE(masks); ++m) { |
2000 | for (i = 0; i < info->hdr->e_shnum; ++i) { | 2041 | for (i = 0; i < info->hdr->e_shnum; ++i) { |
2001 | Elf_Shdr *s = &info->sechdrs[i]; | 2042 | Elf_Shdr *s = &info->sechdrs[i]; |
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info) | |||
2008 | continue; | 2049 | continue; |
2009 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | 2050 | s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) |
2010 | | INIT_OFFSET_MASK); | 2051 | | INIT_OFFSET_MASK); |
2011 | DEBUGP("\t%s\n", sname); | 2052 | pr_debug("\t%s\n", sname); |
2012 | } | 2053 | } |
2013 | switch (m) { | 2054 | switch (m) { |
2014 | case 0: /* executable */ | 2055 | case 0: /* executable */ |
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, | |||
2178 | return true; | 2219 | return true; |
2179 | } | 2220 | } |
2180 | 2221 | ||
2222 | /* | ||
2223 | * We only allocate and copy the strings needed by the parts of symtab | ||
2224 | * we keep. This is simple, but has the effect of making multiple | ||
2225 | * copies of duplicates. We could be more sophisticated, see | ||
2226 | * linux-kernel thread starting with | ||
2227 | * <73defb5e4bca04a6431392cc341112b1@localhost>. | ||
2228 | */ | ||
2181 | static void layout_symtab(struct module *mod, struct load_info *info) | 2229 | static void layout_symtab(struct module *mod, struct load_info *info) |
2182 | { | 2230 | { |
2183 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; | 2231 | Elf_Shdr *symsect = info->sechdrs + info->index.sym; |
2184 | Elf_Shdr *strsect = info->sechdrs + info->index.str; | 2232 | Elf_Shdr *strsect = info->sechdrs + info->index.str; |
2185 | const Elf_Sym *src; | 2233 | const Elf_Sym *src; |
2186 | unsigned int i, nsrc, ndst; | 2234 | unsigned int i, nsrc, ndst, strtab_size; |
2187 | 2235 | ||
2188 | /* Put symbol section at end of init part of module. */ | 2236 | /* Put symbol section at end of init part of module. */ |
2189 | symsect->sh_flags |= SHF_ALLOC; | 2237 | symsect->sh_flags |= SHF_ALLOC; |
2190 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, | 2238 | symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, |
2191 | info->index.sym) | INIT_OFFSET_MASK; | 2239 | info->index.sym) | INIT_OFFSET_MASK; |
2192 | DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); | 2240 | pr_debug("\t%s\n", info->secstrings + symsect->sh_name); |
2193 | 2241 | ||
2194 | src = (void *)info->hdr + symsect->sh_offset; | 2242 | src = (void *)info->hdr + symsect->sh_offset; |
2195 | nsrc = symsect->sh_size / sizeof(*src); | 2243 | nsrc = symsect->sh_size / sizeof(*src); |
2196 | for (ndst = i = 1; i < nsrc; ++i, ++src) | ||
2197 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { | ||
2198 | unsigned int j = src->st_name; | ||
2199 | 2244 | ||
2200 | while (!__test_and_set_bit(j, info->strmap) | 2245 | /* Compute total space required for the core symbols' strtab. */ |
2201 | && info->strtab[j]) | 2246 | for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) |
2202 | ++j; | 2247 | if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { |
2203 | ++ndst; | 2248 | strtab_size += strlen(&info->strtab[src->st_name]) + 1; |
2249 | ndst++; | ||
2204 | } | 2250 | } |
2205 | 2251 | ||
2206 | /* Append room for core symbols at end of core part. */ | 2252 | /* Append room for core symbols at end of core part. */ |
2207 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); | 2253 | info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); |
2208 | mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); | 2254 | info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); |
2255 | mod->core_size += strtab_size; | ||
2209 | 2256 | ||
2210 | /* Put string table section at end of init part of module. */ | 2257 | /* Put string table section at end of init part of module. */ |
2211 | strsect->sh_flags |= SHF_ALLOC; | 2258 | strsect->sh_flags |= SHF_ALLOC; |
2212 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, | 2259 | strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, |
2213 | info->index.str) | INIT_OFFSET_MASK; | 2260 | info->index.str) | INIT_OFFSET_MASK; |
2214 | DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); | 2261 | pr_debug("\t%s\n", info->secstrings + strsect->sh_name); |
2215 | |||
2216 | /* Append room for core symbols' strings at end of core part. */ | ||
2217 | info->stroffs = mod->core_size; | ||
2218 | __set_bit(0, info->strmap); | ||
2219 | mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); | ||
2220 | } | 2262 | } |
2221 | 2263 | ||
2222 | static void add_kallsyms(struct module *mod, const struct load_info *info) | 2264 | static void add_kallsyms(struct module *mod, const struct load_info *info) |
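Editor's note: the rewritten layout_symtab() above sizes the core string table directly from the kept symbol names instead of the old per-byte bitmap. The accounting rule is easy to state on its own; a small userspace illustration (symbol names invented for the example):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* invented names standing in for the "core" symbols that survive */
            const char *kept[] = { "example_init", "example_read", "example_ops" };
            size_t strtab_size = 1;          /* offset 0 is the mandatory "" entry */
            size_t i;

            for (i = 0; i < sizeof(kept) / sizeof(kept[0]); i++)
                    strtab_size += strlen(kept[i]) + 1;   /* name plus its NUL */

            printf("core strtab needs %zu bytes\n", strtab_size);
            return 0;
    }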
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) | |||
2237 | mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); | 2279 | mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); |
2238 | 2280 | ||
2239 | mod->core_symtab = dst = mod->module_core + info->symoffs; | 2281 | mod->core_symtab = dst = mod->module_core + info->symoffs; |
2282 | mod->core_strtab = s = mod->module_core + info->stroffs; | ||
2240 | src = mod->symtab; | 2283 | src = mod->symtab; |
2241 | *dst = *src; | 2284 | *dst = *src; |
2285 | *s++ = 0; | ||
2242 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { | 2286 | for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { |
2243 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) | 2287 | if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) |
2244 | continue; | 2288 | continue; |
2289 | |||
2245 | dst[ndst] = *src; | 2290 | dst[ndst] = *src; |
2246 | dst[ndst].st_name = bitmap_weight(info->strmap, | 2291 | dst[ndst++].st_name = s - mod->core_strtab; |
2247 | dst[ndst].st_name); | 2292 | s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; |
2248 | ++ndst; | ||
2249 | } | 2293 | } |
2250 | mod->core_num_syms = ndst; | 2294 | mod->core_num_syms = ndst; |
2251 | |||
2252 | mod->core_strtab = s = mod->module_core + info->stroffs; | ||
2253 | for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) | ||
2254 | if (test_bit(i, info->strmap)) | ||
2255 | *++s = mod->strtab[i]; | ||
2256 | } | 2295 | } |
2257 | #else | 2296 | #else |
2258 | static inline void layout_symtab(struct module *mod, struct load_info *info) | 2297 | static inline void layout_symtab(struct module *mod, struct load_info *info) |
@@ -2487,6 +2526,9 @@ static int check_modinfo(struct module *mod, struct load_info *info) | |||
2487 | return -ENOEXEC; | 2526 | return -ENOEXEC; |
2488 | } | 2527 | } |
2489 | 2528 | ||
2529 | if (!get_modinfo(info, "intree")) | ||
2530 | add_taint_module(mod, TAINT_OOT_MODULE); | ||
2531 | |||
2490 | if (get_modinfo(info, "staging")) { | 2532 | if (get_modinfo(info, "staging")) { |
2491 | add_taint_module(mod, TAINT_CRAP); | 2533 | add_taint_module(mod, TAINT_CRAP); |
2492 | printk(KERN_WARNING "%s: module is from the staging directory," | 2534 | printk(KERN_WARNING "%s: module is from the staging directory," |
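Editor's note: the new check above marks any module lacking an "intree" modinfo tag with the fresh TAINT_OOT_MODULE flag. The tag is just a modinfo string; for in-tree builds modpost emits the equivalent of the following, so only out-of-tree modules miss it (sketch, not taken from this patch):

    #include <linux/module.h>

    /* Generated automatically for modules built from the kernel tree;
     * its absence is what get_modinfo(info, "intree") detects above. */
    MODULE_INFO(intree, "Y");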
@@ -2618,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info) | |||
2618 | mod->module_init = ptr; | 2660 | mod->module_init = ptr; |
2619 | 2661 | ||
2620 | /* Transfer each section which specifies SHF_ALLOC */ | 2662 | /* Transfer each section which specifies SHF_ALLOC */ |
2621 | DEBUGP("final section addresses:\n"); | 2663 | pr_debug("final section addresses:\n"); |
2622 | for (i = 0; i < info->hdr->e_shnum; i++) { | 2664 | for (i = 0; i < info->hdr->e_shnum; i++) { |
2623 | void *dest; | 2665 | void *dest; |
2624 | Elf_Shdr *shdr = &info->sechdrs[i]; | 2666 | Elf_Shdr *shdr = &info->sechdrs[i]; |
@@ -2636,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info) | |||
2636 | memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); | 2678 | memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); |
2637 | /* Update sh_addr to point to copy in image. */ | 2679 | /* Update sh_addr to point to copy in image. */ |
2638 | shdr->sh_addr = (unsigned long)dest; | 2680 | shdr->sh_addr = (unsigned long)dest; |
2639 | DEBUGP("\t0x%lx %s\n", | 2681 | pr_debug("\t0x%lx %s\n", |
2640 | shdr->sh_addr, info->secstrings + shdr->sh_name); | 2682 | (long)shdr->sh_addr, info->secstrings + shdr->sh_name); |
2641 | } | 2683 | } |
2642 | 2684 | ||
2643 | return 0; | 2685 | return 0; |
@@ -2739,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info) | |||
2739 | this is done generically; there doesn't appear to be any | 2781 | this is done generically; there doesn't appear to be any |
2740 | special cases for the architectures. */ | 2782 | special cases for the architectures. */ |
2741 | layout_sections(mod, info); | 2783 | layout_sections(mod, info); |
2742 | |||
2743 | info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) | ||
2744 | * sizeof(long), GFP_KERNEL); | ||
2745 | if (!info->strmap) { | ||
2746 | err = -ENOMEM; | ||
2747 | goto free_percpu; | ||
2748 | } | ||
2749 | layout_symtab(mod, info); | 2784 | layout_symtab(mod, info); |
2750 | 2785 | ||
2751 | /* Allocate and move to the final place */ | 2786 | /* Allocate and move to the final place */ |
2752 | err = move_module(mod, info); | 2787 | err = move_module(mod, info); |
2753 | if (err) | 2788 | if (err) |
2754 | goto free_strmap; | 2789 | goto free_percpu; |
2755 | 2790 | ||
2756 | /* Module has been copied to its final place now: return it. */ | 2791 | /* Module has been copied to its final place now: return it. */ |
2757 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; | 2792 | mod = (void *)info->sechdrs[info->index.mod].sh_addr; |
2758 | kmemleak_load_module(mod, info); | 2793 | kmemleak_load_module(mod, info); |
2759 | return mod; | 2794 | return mod; |
2760 | 2795 | ||
2761 | free_strmap: | ||
2762 | kfree(info->strmap); | ||
2763 | free_percpu: | 2796 | free_percpu: |
2764 | percpu_modfree(mod); | 2797 | percpu_modfree(mod); |
2765 | out: | 2798 | out: |
@@ -2769,7 +2802,6 @@ out: | |||
2769 | /* mod is no longer valid after this! */ | 2802 | /* mod is no longer valid after this! */ |
2770 | static void module_deallocate(struct module *mod, struct load_info *info) | 2803 | static void module_deallocate(struct module *mod, struct load_info *info) |
2771 | { | 2804 | { |
2772 | kfree(info->strmap); | ||
2773 | percpu_modfree(mod); | 2805 | percpu_modfree(mod); |
2774 | module_free(mod, mod->module_init); | 2806 | module_free(mod, mod->module_init); |
2775 | module_free(mod, mod->module_core); | 2807 | module_free(mod, mod->module_core); |
@@ -2808,7 +2840,7 @@ static struct module *load_module(void __user *umod, | |||
2808 | struct module *mod; | 2840 | struct module *mod; |
2809 | long err; | 2841 | long err; |
2810 | 2842 | ||
2811 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | 2843 | pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n", |
2812 | umod, len, uargs); | 2844 | umod, len, uargs); |
2813 | 2845 | ||
2814 | /* Copy in the blobs from userspace, check they are vaguely sane. */ | 2846 | /* Copy in the blobs from userspace, check they are vaguely sane. */ |
@@ -2878,8 +2910,7 @@ static struct module *load_module(void __user *umod, | |||
2878 | } | 2910 | } |
2879 | 2911 | ||
2880 | /* This has to be done once we're sure module name is unique. */ | 2912 | /* This has to be done once we're sure module name is unique. */ |
2881 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) | 2913 | dynamic_debug_setup(info.debug, info.num_debug); |
2882 | dynamic_debug_setup(info.debug, info.num_debug); | ||
2883 | 2914 | ||
2884 | /* Find duplicate symbols */ | 2915 | /* Find duplicate symbols */ |
2885 | err = verify_export_symbols(mod); | 2916 | err = verify_export_symbols(mod); |
@@ -2900,8 +2931,7 @@ static struct module *load_module(void __user *umod, | |||
2900 | if (err < 0) | 2931 | if (err < 0) |
2901 | goto unlink; | 2932 | goto unlink; |
2902 | 2933 | ||
2903 | /* Get rid of temporary copy and strmap. */ | 2934 | /* Get rid of temporary copy. */ |
2904 | kfree(info.strmap); | ||
2905 | free_copy(&info); | 2935 | free_copy(&info); |
2906 | 2936 | ||
2907 | /* Done! */ | 2937 | /* Done! */ |
@@ -2915,8 +2945,7 @@ static struct module *load_module(void __user *umod, | |||
2915 | module_bug_cleanup(mod); | 2945 | module_bug_cleanup(mod); |
2916 | 2946 | ||
2917 | ddebug: | 2947 | ddebug: |
2918 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) | 2948 | dynamic_debug_remove(info.debug); |
2919 | dynamic_debug_remove(info.debug); | ||
2920 | unlock: | 2949 | unlock: |
2921 | mutex_unlock(&module_mutex); | 2950 | mutex_unlock(&module_mutex); |
2922 | synchronize_sched(); | 2951 | synchronize_sched(); |
@@ -3255,18 +3284,7 @@ static char *module_flags(struct module *mod, char *buf) | |||
3255 | mod->state == MODULE_STATE_GOING || | 3284 | mod->state == MODULE_STATE_GOING || |
3256 | mod->state == MODULE_STATE_COMING) { | 3285 | mod->state == MODULE_STATE_COMING) { |
3257 | buf[bx++] = '('; | 3286 | buf[bx++] = '('; |
3258 | if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) | 3287 | bx += module_flags_taint(mod, buf + bx); |
3259 | buf[bx++] = 'P'; | ||
3260 | if (mod->taints & (1 << TAINT_FORCED_MODULE)) | ||
3261 | buf[bx++] = 'F'; | ||
3262 | if (mod->taints & (1 << TAINT_CRAP)) | ||
3263 | buf[bx++] = 'C'; | ||
3264 | /* | ||
3265 | * TAINT_FORCED_RMMOD: could be added. | ||
3266 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | ||
3267 | * apply to modules. | ||
3268 | */ | ||
3269 | |||
3270 | /* Show a - for module-is-being-unloaded */ | 3288 | /* Show a - for module-is-being-unloaded */ |
3271 | if (mod->state == MODULE_STATE_GOING) | 3289 | if (mod->state == MODULE_STATE_GOING) |
3272 | buf[bx++] = '-'; | 3290 | buf[bx++] = '-'; |
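Editor's note: with the open-coded letter list gone, the /proc/modules flags shown here and the new per-module sysfs "taint" file are both produced by the same module_flags_taint() helper. A userspace spot-check might look like this (module name and path are illustrative):

    #include <stdio.h>

    int main(void)
    {
            char flags[16] = "";
            FILE *f = fopen("/sys/module/example_mod/taint", "r");

            if (f) {
                    if (fgets(flags, sizeof(flags), f))
                            printf("taint flags: %s", flags);
                    fclose(f);
            }
            return 0;
    }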
@@ -3487,50 +3505,3 @@ void module_layout(struct module *mod, | |||
3487 | } | 3505 | } |
3488 | EXPORT_SYMBOL(module_layout); | 3506 | EXPORT_SYMBOL(module_layout); |
3489 | #endif | 3507 | #endif |
3490 | |||
3491 | #ifdef CONFIG_TRACEPOINTS | ||
3492 | void module_update_tracepoints(void) | ||
3493 | { | ||
3494 | struct module *mod; | ||
3495 | |||
3496 | mutex_lock(&module_mutex); | ||
3497 | list_for_each_entry(mod, &modules, list) | ||
3498 | if (!mod->taints) | ||
3499 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | ||
3500 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
3501 | mutex_unlock(&module_mutex); | ||
3502 | } | ||
3503 | |||
3504 | /* | ||
3505 | * Returns 0 if current not found. | ||
3506 | * Returns 1 if current found. | ||
3507 | */ | ||
3508 | int module_get_iter_tracepoints(struct tracepoint_iter *iter) | ||
3509 | { | ||
3510 | struct module *iter_mod; | ||
3511 | int found = 0; | ||
3512 | |||
3513 | mutex_lock(&module_mutex); | ||
3514 | list_for_each_entry(iter_mod, &modules, list) { | ||
3515 | if (!iter_mod->taints) { | ||
3516 | /* | ||
3517 | * Sorted module list | ||
3518 | */ | ||
3519 | if (iter_mod < iter->module) | ||
3520 | continue; | ||
3521 | else if (iter_mod > iter->module) | ||
3522 | iter->tracepoint = NULL; | ||
3523 | found = tracepoint_get_iter_range(&iter->tracepoint, | ||
3524 | iter_mod->tracepoints_ptrs, | ||
3525 | iter_mod->tracepoints_ptrs | ||
3526 | + iter_mod->num_tracepoints); | ||
3527 | if (found) { | ||
3528 | iter->module = iter_mod; | ||
3529 | break; | ||
3530 | } | ||
3531 | } | ||
3532 | } | ||
3533 | mutex_unlock(&module_mutex); | ||
3534 | return found; | ||
3535 | } | ||
3536 | #endif | ||
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 73da83aff418..7e3443fe1f48 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -14,7 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <linux/delay.h> | 16 | #include <linux/delay.h> |
17 | #include <linux/module.h> | 17 | #include <linux/export.h> |
18 | #include <linux/poison.h> | 18 | #include <linux/poison.h> |
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/spinlock.h> | 20 | #include <linux/spinlock.h> |
diff --git a/kernel/mutex.c b/kernel/mutex.c index d607ed5dd441..89096dd8786f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -19,7 +19,7 @@ | |||
19 | */ | 19 | */ |
20 | #include <linux/mutex.h> | 20 | #include <linux/mutex.h> |
21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
22 | #include <linux/module.h> | 22 | #include <linux/export.h> |
23 | #include <linux/spinlock.h> | 23 | #include <linux/spinlock.h> |
24 | #include <linux/interrupt.h> | 24 | #include <linux/interrupt.h> |
25 | #include <linux/debug_locks.h> | 25 | #include <linux/debug_locks.h> |
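Editor's note: these one-line include swaps (here and in the files that follow) belong to the module.h/export.h split: a file that only exports symbols no longer pulls in the full module machinery. The resulting pattern, sketched with an invented helper:

    #include <linux/export.h>     /* EXPORT_SYMBOL*() without all of module.h */

    int example_helper(void)      /* illustrative; not from this patch */
    {
            return 0;
    }
    EXPORT_SYMBOL_GPL(example_helper);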
diff --git a/kernel/notifier.c b/kernel/notifier.c index 8d7b435806c9..2d5cc4ccff7f 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
@@ -1,6 +1,6 @@ | |||
1 | #include <linux/kdebug.h> | 1 | #include <linux/kdebug.h> |
2 | #include <linux/kprobes.h> | 2 | #include <linux/kprobes.h> |
3 | #include <linux/module.h> | 3 | #include <linux/export.h> |
4 | #include <linux/notifier.h> | 4 | #include <linux/notifier.h> |
5 | #include <linux/rcupdate.h> | 5 | #include <linux/rcupdate.h> |
6 | #include <linux/vmalloc.h> | 6 | #include <linux/vmalloc.h> |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 9aeab4b98c64..b576f7f14bc6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
@@ -14,7 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/module.h> | 17 | #include <linux/export.h> |
18 | #include <linux/nsproxy.h> | 18 | #include <linux/nsproxy.h> |
19 | #include <linux/init_task.h> | 19 | #include <linux/init_task.h> |
20 | #include <linux/mnt_namespace.h> | 20 | #include <linux/mnt_namespace.h> |
diff --git a/kernel/padata.c b/kernel/padata.c index b91941df5e63..b45259931512 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -18,7 +18,7 @@ | |||
18 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | 18 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/module.h> | 21 | #include <linux/export.h> |
22 | #include <linux/cpumask.h> | 22 | #include <linux/cpumask.h> |
23 | #include <linux/err.h> | 23 | #include <linux/err.h> |
24 | #include <linux/cpu.h> | 24 | #include <linux/cpu.h> |
diff --git a/kernel/panic.c b/kernel/panic.c index d7bb6974efb5..80aed44e345a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -49,6 +49,15 @@ static long no_blink(int state) | |||
49 | long (*panic_blink)(int state); | 49 | long (*panic_blink)(int state); |
50 | EXPORT_SYMBOL(panic_blink); | 50 | EXPORT_SYMBOL(panic_blink); |
51 | 51 | ||
52 | /* | ||
53 | * Stop ourself in panic -- architecture code may override this | ||
54 | */ | ||
55 | void __weak panic_smp_self_stop(void) | ||
56 | { | ||
57 | while (1) | ||
58 | cpu_relax(); | ||
59 | } | ||
60 | |||
52 | /** | 61 | /** |
53 | * panic - halt the system | 62 | * panic - halt the system |
54 | * @fmt: The text string to print | 63 | * @fmt: The text string to print |
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink); | |||
57 | * | 66 | * |
58 | * This function never returns. | 67 | * This function never returns. |
59 | */ | 68 | */ |
60 | NORET_TYPE void panic(const char * fmt, ...) | 69 | void panic(const char *fmt, ...) |
61 | { | 70 | { |
71 | static DEFINE_SPINLOCK(panic_lock); | ||
62 | static char buf[1024]; | 72 | static char buf[1024]; |
63 | va_list args; | 73 | va_list args; |
64 | long i, i_next = 0; | 74 | long i, i_next = 0; |
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
68 | * It's possible to come here directly from a panic-assertion and | 78 | * It's possible to come here directly from a panic-assertion and |
69 | * not have preempt disabled. Some functions called from here want | 79 | * not have preempt disabled. Some functions called from here want |
70 | * preempt to be disabled. No point enabling it later though... | 80 | * preempt to be disabled. No point enabling it later though... |
81 | * | ||
82 | * Only one CPU is allowed to execute the panic code from here. For | ||
83 | * multiple parallel invocations of panic, all other CPUs either | ||
84 | * stop themself or will wait until they are stopped by the 1st CPU | ||
85 | * with smp_send_stop(). | ||
71 | */ | 86 | */ |
72 | preempt_disable(); | 87 | if (!spin_trylock(&panic_lock)) |
88 | panic_smp_self_stop(); | ||
73 | 89 | ||
74 | console_verbose(); | 90 | console_verbose(); |
75 | bust_spinlocks(1); | 91 | bust_spinlocks(1); |
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
78 | va_end(args); | 94 | va_end(args); |
79 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); | 95 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); |
80 | #ifdef CONFIG_DEBUG_BUGVERBOSE | 96 | #ifdef CONFIG_DEBUG_BUGVERBOSE |
81 | dump_stack(); | 97 | /* |
98 | * Avoid nested stack-dumping if a panic occurs during oops processing | ||
99 | */ | ||
100 | if (!oops_in_progress) | ||
101 | dump_stack(); | ||
82 | #endif | 102 | #endif |
83 | 103 | ||
84 | /* | 104 | /* |
@@ -177,6 +197,7 @@ static const struct tnt tnts[] = { | |||
177 | { TAINT_WARN, 'W', ' ' }, | 197 | { TAINT_WARN, 'W', ' ' }, |
178 | { TAINT_CRAP, 'C', ' ' }, | 198 | { TAINT_CRAP, 'C', ' ' }, |
179 | { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, | 199 | { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, |
200 | { TAINT_OOT_MODULE, 'O', ' ' }, | ||
180 | }; | 201 | }; |
181 | 202 | ||
182 | /** | 203 | /** |
@@ -194,6 +215,7 @@ static const struct tnt tnts[] = { | |||
194 | * 'W' - Taint on warning. | 215 | * 'W' - Taint on warning. |
195 | * 'C' - modules from drivers/staging are loaded. | 216 | * 'C' - modules from drivers/staging are loaded. |
196 | * 'I' - Working around severe firmware bug. | 217 | * 'I' - Working around severe firmware bug. |
218 | * 'O' - Out-of-tree module has been loaded. | ||
197 | * | 219 | * |
198 | * The string is overwritten by the next call to print_tainted(). | 220 | * The string is overwritten by the next call to print_tainted(). |
199 | */ | 221 | */ |
@@ -235,11 +257,20 @@ void add_taint(unsigned flag) | |||
235 | * Can't trust the integrity of the kernel anymore. | 257 | * Can't trust the integrity of the kernel anymore. |
236 | * We don't call directly debug_locks_off() because the issue | 258 | * We don't call directly debug_locks_off() because the issue |
237 | * is not necessarily serious enough to set oops_in_progress to 1 | 259 | * is not necessarily serious enough to set oops_in_progress to 1 |
238 | * Also we want to keep up lockdep for staging development and | 260 | * Also we want to keep up lockdep for staging/out-of-tree |
239 | * post-warning case. | 261 | * development and post-warning case. |
240 | */ | 262 | */ |
241 | if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) | 263 | switch (flag) { |
242 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | 264 | case TAINT_CRAP: |
265 | case TAINT_OOT_MODULE: | ||
266 | case TAINT_WARN: | ||
267 | case TAINT_FIRMWARE_WORKAROUND: | ||
268 | break; | ||
269 | |||
270 | default: | ||
271 | if (__debug_locks_off()) | ||
272 | printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); | ||
273 | } | ||
243 | 274 | ||
244 | set_bit(flag, &tainted_mask); | 275 | set_bit(flag, &tainted_mask); |
245 | } | 276 | } |
diff --git a/kernel/params.c b/kernel/params.c index 22df3e0d142a..32ee04308285 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -15,7 +15,7 @@ | |||
15 | along with this program; if not, write to the Free Software | 15 | along with this program; if not, write to the Free Software |
16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 | */ | 17 | */ |
18 | #include <linux/moduleparam.h> | 18 | #include <linux/module.h> |
19 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
20 | #include <linux/string.h> | 20 | #include <linux/string.h> |
21 | #include <linux/errno.h> | 21 | #include <linux/errno.h> |
@@ -25,12 +25,6 @@ | |||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/ctype.h> | 26 | #include <linux/ctype.h> |
27 | 27 | ||
28 | #if 0 | ||
29 | #define DEBUGP printk | ||
30 | #else | ||
31 | #define DEBUGP(fmt, a...) | ||
32 | #endif | ||
33 | |||
34 | /* Protects all parameters, and incidentally kmalloced_param list. */ | 28 | /* Protects all parameters, and incidentally kmalloced_param list. */ |
35 | static DEFINE_MUTEX(param_lock); | 29 | static DEFINE_MUTEX(param_lock); |
36 | 30 | ||
@@ -67,20 +61,27 @@ static void maybe_kfree_parameter(void *param) | |||
67 | } | 61 | } |
68 | } | 62 | } |
69 | 63 | ||
70 | static inline char dash2underscore(char c) | 64 | static char dash2underscore(char c) |
71 | { | 65 | { |
72 | if (c == '-') | 66 | if (c == '-') |
73 | return '_'; | 67 | return '_'; |
74 | return c; | 68 | return c; |
75 | } | 69 | } |
76 | 70 | ||
77 | static inline int parameq(const char *input, const char *paramname) | 71 | bool parameqn(const char *a, const char *b, size_t n) |
78 | { | 72 | { |
79 | unsigned int i; | 73 | size_t i; |
80 | for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) | 74 | |
81 | if (input[i] == '\0') | 75 | for (i = 0; i < n; i++) { |
82 | return 1; | 76 | if (dash2underscore(a[i]) != dash2underscore(b[i])) |
83 | return 0; | 77 | return false; |
78 | } | ||
79 | return true; | ||
80 | } | ||
81 | |||
82 | bool parameq(const char *a, const char *b) | ||
83 | { | ||
84 | return parameqn(a, b, strlen(a)+1); | ||
84 | } | 85 | } |
85 | 86 | ||
86 | static int parse_one(char *param, | 87 | static int parse_one(char *param, |
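Editor's note: parameq() and its new bounded sibling parameqn() treat '-' and '_' as interchangeable, which is why "no-hz" and "no_hz" name the same parameter on the command line. A userspace restatement of the comparison, for illustration only:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    static char dash2underscore(char c)
    {
            return c == '-' ? '_' : c;
    }

    static bool parameqn(const char *a, const char *b, size_t n)
    {
            size_t i;

            for (i = 0; i < n; i++)
                    if (dash2underscore(a[i]) != dash2underscore(b[i]))
                            return false;
            return true;
    }

    static bool parameq(const char *a, const char *b)
    {
            return parameqn(a, b, strlen(a) + 1);
    }

    int main(void)
    {
            printf("%d\n", parameq("no-hz", "no_hz"));   /* prints 1 */
            return 0;
    }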
@@ -98,7 +99,7 @@ static int parse_one(char *param, | |||
98 | /* No one handled NULL, so do it here. */ | 99 | /* No one handled NULL, so do it here. */ |
99 | if (!val && params[i].ops->set != param_set_bool) | 100 | if (!val && params[i].ops->set != param_set_bool) |
100 | return -EINVAL; | 101 | return -EINVAL; |
101 | DEBUGP("They are equal! Calling %p\n", | 102 | pr_debug("They are equal! Calling %p\n", |
102 | params[i].ops->set); | 103 | params[i].ops->set); |
103 | mutex_lock(¶m_lock); | 104 | mutex_lock(¶m_lock); |
104 | err = params[i].ops->set(val, ¶ms[i]); | 105 | err = params[i].ops->set(val, ¶ms[i]); |
@@ -108,11 +109,11 @@ static int parse_one(char *param, | |||
108 | } | 109 | } |
109 | 110 | ||
110 | if (handle_unknown) { | 111 | if (handle_unknown) { |
111 | DEBUGP("Unknown argument: calling %p\n", handle_unknown); | 112 | pr_debug("Unknown argument: calling %p\n", handle_unknown); |
112 | return handle_unknown(param, val); | 113 | return handle_unknown(param, val); |
113 | } | 114 | } |
114 | 115 | ||
115 | DEBUGP("Unknown argument `%s'\n", param); | 116 | pr_debug("Unknown argument `%s'\n", param); |
116 | return -ENOENT; | 117 | return -ENOENT; |
117 | } | 118 | } |
118 | 119 | ||
@@ -177,7 +178,7 @@ int parse_args(const char *name, | |||
177 | { | 178 | { |
178 | char *param, *val; | 179 | char *param, *val; |
179 | 180 | ||
180 | DEBUGP("Parsing ARGS: %s\n", args); | 181 | pr_debug("Parsing ARGS: %s\n", args); |
181 | 182 | ||
182 | /* Chew leading spaces */ | 183 | /* Chew leading spaces */ |
183 | args = skip_spaces(args); | 184 | args = skip_spaces(args); |
@@ -362,6 +363,30 @@ struct kernel_param_ops param_ops_invbool = { | |||
362 | }; | 363 | }; |
363 | EXPORT_SYMBOL(param_ops_invbool); | 364 | EXPORT_SYMBOL(param_ops_invbool); |
364 | 365 | ||
366 | int param_set_bint(const char *val, const struct kernel_param *kp) | ||
367 | { | ||
368 | struct kernel_param boolkp; | ||
369 | bool v; | ||
370 | int ret; | ||
371 | |||
372 | /* Match bool exactly, by re-using it. */ | ||
373 | boolkp = *kp; | ||
374 | boolkp.arg = &v; | ||
375 | boolkp.flags |= KPARAM_ISBOOL; | ||
376 | |||
377 | ret = param_set_bool(val, &boolkp); | ||
378 | if (ret == 0) | ||
379 | *(int *)kp->arg = v; | ||
380 | return ret; | ||
381 | } | ||
382 | EXPORT_SYMBOL(param_set_bint); | ||
383 | |||
384 | struct kernel_param_ops param_ops_bint = { | ||
385 | .set = param_set_bint, | ||
386 | .get = param_get_int, | ||
387 | }; | ||
388 | EXPORT_SYMBOL(param_ops_bint); | ||
389 | |||
365 | /* We break the rule and mangle the string. */ | 390 | /* We break the rule and mangle the string. */ |
366 | static int param_array(const char *name, | 391 | static int param_array(const char *name, |
367 | const char *val, | 392 | const char *val, |
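Editor's note: param_set_bint() gives drivers an int-backed parameter that still parses bool syntax, and param_ops_bint plugs it into the normal module_param machinery. A sketch of how a module might declare one (names are invented):

    #include <linux/module.h>

    static int example_modeset = 1;
    /* accepts y/n/Y/N/1/0 on the command line, but the variable stays an int */
    module_param_named(modeset, example_modeset, bint, 0400);
    MODULE_PARM_DESC(modeset, "illustrative bint parameter");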
diff --git a/kernel/pid.c b/kernel/pid.c index e432057f3b21..ce8e00deaccb 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -27,7 +27,7 @@ | |||
27 | */ | 27 | */ |
28 | 28 | ||
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 30 | #include <linux/export.h> |
31 | #include <linux/slab.h> | 31 | #include <linux/slab.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/rculist.h> | 33 | #include <linux/rculist.h> |
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b) | |||
137 | } | 137 | } |
138 | 138 | ||
139 | /* | 139 | /* |
140 | * We might be racing with someone else trying to set pid_ns->last_pid. | 140 | * We might be racing with someone else trying to set pid_ns->last_pid |
141 | * at the pid allocation time (there's also a sysctl for this, but racing | ||
142 | * with this one is OK, see comment in kernel/pid_namespace.c about it). | ||
141 | * We want the winner to have the "later" value, because if the | 143 | * We want the winner to have the "later" value, because if the |
142 | * "earlier" value prevails, then a pid may get reused immediately. | 144 | * "earlier" value prevails, then a pid may get reused immediately. |
143 | * | 145 | * |
@@ -418,7 +420,9 @@ EXPORT_SYMBOL(pid_task); | |||
418 | */ | 420 | */ |
419 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) | 421 | struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) |
420 | { | 422 | { |
421 | rcu_lockdep_assert(rcu_read_lock_held()); | 423 | rcu_lockdep_assert(rcu_read_lock_held(), |
424 | "find_task_by_pid_ns() needs rcu_read_lock()" | ||
425 | " protection"); | ||
422 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); | 426 | return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); |
423 | } | 427 | } |
424 | 428 | ||
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index e9c9adc84ca6..a8968396046d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) | |||
191 | return; | 191 | return; |
192 | } | 192 | } |
193 | 193 | ||
194 | static int pid_ns_ctl_handler(struct ctl_table *table, int write, | ||
195 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
196 | { | ||
197 | struct ctl_table tmp = *table; | ||
198 | |||
199 | if (write && !capable(CAP_SYS_ADMIN)) | ||
200 | return -EPERM; | ||
201 | |||
202 | /* | ||
203 | * Writing directly to ns' last_pid field is OK, since this field | ||
204 | * is volatile in a living namespace anyway and a code writing to | ||
205 | * it should synchronize its usage with external means. | ||
206 | */ | ||
207 | |||
208 | tmp.data = ¤t->nsproxy->pid_ns->last_pid; | ||
209 | return proc_dointvec(&tmp, write, buffer, lenp, ppos); | ||
210 | } | ||
211 | |||
212 | static struct ctl_table pid_ns_ctl_table[] = { | ||
213 | { | ||
214 | .procname = "ns_last_pid", | ||
215 | .maxlen = sizeof(int), | ||
216 | .mode = 0666, /* permissions are checked in the handler */ | ||
217 | .proc_handler = pid_ns_ctl_handler, | ||
218 | }, | ||
219 | { } | ||
220 | }; | ||
221 | |||
222 | static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; | ||
223 | |||
194 | static __init int pid_namespaces_init(void) | 224 | static __init int pid_namespaces_init(void) |
195 | { | 225 | { |
196 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); | 226 | pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); |
227 | register_sysctl_paths(kern_path, pid_ns_ctl_table); | ||
197 | return 0; | 228 | return 0; |
198 | } | 229 | } |
199 | 230 | ||
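Editor's note: the new kernel.ns_last_pid sysctl lets a sufficiently privileged task (CAP_SYS_ADMIN, as enforced in the handler above) set the point from which the next PID in its namespace is allocated, which is what checkpoint/restore tooling needs. A minimal userspace sketch, ignoring the race the in-kernel comment mentions:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/ns_last_pid", "w");

            if (!f)
                    return 1;                   /* needs CAP_SYS_ADMIN to write */
            fprintf(f, "%d", 9999);             /* next fork() should yield 10000 */
            fclose(f);

            pid_t child = fork();
            if (child == 0)
                    _exit(0);
            printf("child pid: %d\n", child);   /* expect 10000 if nothing raced */
            waitpid(child, NULL, 0);
            return 0;
    }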
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 640ded8f5c48..125cb67daa21 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock, | |||
78 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 78 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
79 | return now.sched < then.sched; | 79 | return now.sched < then.sched; |
80 | } else { | 80 | } else { |
81 | return cputime_lt(now.cpu, then.cpu); | 81 | return now.cpu < then.cpu; |
82 | } | 82 | } |
83 | } | 83 | } |
84 | static inline void cpu_time_add(const clockid_t which_clock, | 84 | static inline void cpu_time_add(const clockid_t which_clock, |
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock, | |||
88 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 88 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
89 | acc->sched += val.sched; | 89 | acc->sched += val.sched; |
90 | } else { | 90 | } else { |
91 | acc->cpu = cputime_add(acc->cpu, val.cpu); | 91 | acc->cpu += val.cpu; |
92 | } | 92 | } |
93 | } | 93 | } |
94 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | 94 | static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, |
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, | |||
98 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 98 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
99 | a.sched -= b.sched; | 99 | a.sched -= b.sched; |
100 | } else { | 100 | } else { |
101 | a.cpu = cputime_sub(a.cpu, b.cpu); | 101 | a.cpu -= b.cpu; |
102 | } | 102 | } |
103 | return a; | 103 | return a; |
104 | } | 104 | } |
105 | 105 | ||
106 | /* | 106 | /* |
107 | * Divide and limit the result to res >= 1 | ||
108 | * | ||
109 | * This is necessary to prevent signal delivery starvation, when the result of | ||
110 | * the division would be rounded down to 0. | ||
111 | */ | ||
112 | static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) | ||
113 | { | ||
114 | cputime_t res = cputime_div(time, div); | ||
115 | |||
116 | return max_t(cputime_t, res, 1); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Update expiry time from increment, and increase overrun count, | 107 | * Update expiry time from increment, and increase overrun count, |
121 | * given the current clock sample. | 108 | * given the current clock sample. |
122 | */ | 109 | */ |
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer, | |||
148 | } else { | 135 | } else { |
149 | cputime_t delta, incr; | 136 | cputime_t delta, incr; |
150 | 137 | ||
151 | if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) | 138 | if (now.cpu < timer->it.cpu.expires.cpu) |
152 | return; | 139 | return; |
153 | incr = timer->it.cpu.incr.cpu; | 140 | incr = timer->it.cpu.incr.cpu; |
154 | delta = cputime_sub(cputime_add(now.cpu, incr), | 141 | delta = now.cpu + incr - timer->it.cpu.expires.cpu; |
155 | timer->it.cpu.expires.cpu); | ||
156 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ | 142 | /* Don't use (incr*2 < delta), incr*2 might overflow. */ |
157 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) | 143 | for (i = 0; incr < delta - incr; i++) |
158 | incr = cputime_add(incr, incr); | 144 | incr += incr; |
159 | for (; i >= 0; incr = cputime_halve(incr), i--) { | 145 | for (; i >= 0; incr = incr >> 1, i--) { |
160 | if (cputime_lt(delta, incr)) | 146 | if (delta < incr) |
161 | continue; | 147 | continue; |
162 | timer->it.cpu.expires.cpu = | 148 | timer->it.cpu.expires.cpu += incr; |
163 | cputime_add(timer->it.cpu.expires.cpu, incr); | ||
164 | timer->it_overrun += 1 << i; | 149 | timer->it_overrun += 1 << i; |
165 | delta = cputime_sub(delta, incr); | 150 | delta -= incr; |
166 | } | 151 | } |
167 | } | 152 | } |
168 | } | 153 | } |
169 | 154 | ||
170 | static inline cputime_t prof_ticks(struct task_struct *p) | 155 | static inline cputime_t prof_ticks(struct task_struct *p) |
171 | { | 156 | { |
172 | return cputime_add(p->utime, p->stime); | 157 | return p->utime + p->stime; |
173 | } | 158 | } |
174 | static inline cputime_t virt_ticks(struct task_struct *p) | 159 | static inline cputime_t virt_ticks(struct task_struct *p) |
175 | { | 160 | { |
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
248 | 233 | ||
249 | t = tsk; | 234 | t = tsk; |
250 | do { | 235 | do { |
251 | times->utime = cputime_add(times->utime, t->utime); | 236 | times->utime += t->utime; |
252 | times->stime = cputime_add(times->stime, t->stime); | 237 | times->stime += t->stime; |
253 | times->sum_exec_runtime += task_sched_runtime(t); | 238 | times->sum_exec_runtime += task_sched_runtime(t); |
254 | } while_each_thread(tsk, t); | 239 | } while_each_thread(tsk, t); |
255 | out: | 240 | out: |
@@ -258,10 +243,10 @@ out: | |||
258 | 243 | ||
259 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) | 244 | static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) |
260 | { | 245 | { |
261 | if (cputime_gt(b->utime, a->utime)) | 246 | if (b->utime > a->utime) |
262 | a->utime = b->utime; | 247 | a->utime = b->utime; |
263 | 248 | ||
264 | if (cputime_gt(b->stime, a->stime)) | 249 | if (b->stime > a->stime) |
265 | a->stime = b->stime; | 250 | a->stime = b->stime; |
266 | 251 | ||
267 | if (b->sum_exec_runtime > a->sum_exec_runtime) | 252 | if (b->sum_exec_runtime > a->sum_exec_runtime) |
@@ -282,13 +267,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
282 | * it. | 267 | * it. |
283 | */ | 268 | */ |
284 | thread_group_cputime(tsk, &sum); | 269 | thread_group_cputime(tsk, &sum); |
285 | spin_lock_irqsave(&cputimer->lock, flags); | 270 | raw_spin_lock_irqsave(&cputimer->lock, flags); |
286 | cputimer->running = 1; | 271 | cputimer->running = 1; |
287 | update_gt_cputime(&cputimer->cputime, &sum); | 272 | update_gt_cputime(&cputimer->cputime, &sum); |
288 | } else | 273 | } else |
289 | spin_lock_irqsave(&cputimer->lock, flags); | 274 | raw_spin_lock_irqsave(&cputimer->lock, flags); |
290 | *times = cputimer->cputime; | 275 | *times = cputimer->cputime; |
291 | spin_unlock_irqrestore(&cputimer->lock, flags); | 276 | raw_spin_unlock_irqrestore(&cputimer->lock, flags); |
292 | } | 277 | } |
293 | 278 | ||
294 | /* | 279 | /* |
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
306 | return -EINVAL; | 291 | return -EINVAL; |
307 | case CPUCLOCK_PROF: | 292 | case CPUCLOCK_PROF: |
308 | thread_group_cputime(p, &cputime); | 293 | thread_group_cputime(p, &cputime); |
309 | cpu->cpu = cputime_add(cputime.utime, cputime.stime); | 294 | cpu->cpu = cputime.utime + cputime.stime; |
310 | break; | 295 | break; |
311 | case CPUCLOCK_VIRT: | 296 | case CPUCLOCK_VIRT: |
312 | thread_group_cputime(p, &cputime); | 297 | thread_group_cputime(p, &cputime); |
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head, | |||
470 | unsigned long long sum_exec_runtime) | 455 | unsigned long long sum_exec_runtime) |
471 | { | 456 | { |
472 | struct cpu_timer_list *timer, *next; | 457 | struct cpu_timer_list *timer, *next; |
473 | cputime_t ptime = cputime_add(utime, stime); | 458 | cputime_t ptime = utime + stime; |
474 | 459 | ||
475 | list_for_each_entry_safe(timer, next, head, entry) { | 460 | list_for_each_entry_safe(timer, next, head, entry) { |
476 | list_del_init(&timer->entry); | 461 | list_del_init(&timer->entry); |
477 | if (cputime_lt(timer->expires.cpu, ptime)) { | 462 | if (timer->expires.cpu < ptime) { |
478 | timer->expires.cpu = cputime_zero; | 463 | timer->expires.cpu = 0; |
479 | } else { | 464 | } else { |
480 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | 465 | timer->expires.cpu -= ptime; |
481 | ptime); | ||
482 | } | 466 | } |
483 | } | 467 | } |
484 | 468 | ||
485 | ++head; | 469 | ++head; |
486 | list_for_each_entry_safe(timer, next, head, entry) { | 470 | list_for_each_entry_safe(timer, next, head, entry) { |
487 | list_del_init(&timer->entry); | 471 | list_del_init(&timer->entry); |
488 | if (cputime_lt(timer->expires.cpu, utime)) { | 472 | if (timer->expires.cpu < utime) { |
489 | timer->expires.cpu = cputime_zero; | 473 | timer->expires.cpu = 0; |
490 | } else { | 474 | } else { |
491 | timer->expires.cpu = cputime_sub(timer->expires.cpu, | 475 | timer->expires.cpu -= utime; |
492 | utime); | ||
493 | } | 476 | } |
494 | } | 477 | } |
495 | 478 | ||
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) | |||
520 | struct signal_struct *const sig = tsk->signal; | 503 | struct signal_struct *const sig = tsk->signal; |
521 | 504 | ||
522 | cleanup_timers(tsk->signal->cpu_timers, | 505 | cleanup_timers(tsk->signal->cpu_timers, |
523 | cputime_add(tsk->utime, sig->utime), | 506 | tsk->utime + sig->utime, tsk->stime + sig->stime, |
524 | cputime_add(tsk->stime, sig->stime), | ||
525 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); | 507 | tsk->se.sum_exec_runtime + sig->sum_sched_runtime); |
526 | } | 508 | } |
527 | 509 | ||
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) | |||
540 | 522 | ||
541 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) | 523 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) |
542 | { | 524 | { |
543 | return cputime_eq(expires, cputime_zero) || | 525 | return expires == 0 || expires > new_exp; |
544 | cputime_gt(expires, new_exp); | ||
545 | } | 526 | } |
546 | 527 | ||
547 | /* | 528 | /* |
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
651 | default: | 632 | default: |
652 | return -EINVAL; | 633 | return -EINVAL; |
653 | case CPUCLOCK_PROF: | 634 | case CPUCLOCK_PROF: |
654 | cpu->cpu = cputime_add(cputime.utime, cputime.stime); | 635 | cpu->cpu = cputime.utime + cputime.stime; |
655 | break; | 636 | break; |
656 | case CPUCLOCK_VIRT: | 637 | case CPUCLOCK_VIRT: |
657 | cpu->cpu = cputime.utime; | 638 | cpu->cpu = cputime.utime; |
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk, | |||
918 | unsigned long soft; | 899 | unsigned long soft; |
919 | 900 | ||
920 | maxfire = 20; | 901 | maxfire = 20; |
921 | tsk->cputime_expires.prof_exp = cputime_zero; | 902 | tsk->cputime_expires.prof_exp = 0; |
922 | while (!list_empty(timers)) { | 903 | while (!list_empty(timers)) { |
923 | struct cpu_timer_list *t = list_first_entry(timers, | 904 | struct cpu_timer_list *t = list_first_entry(timers, |
924 | struct cpu_timer_list, | 905 | struct cpu_timer_list, |
925 | entry); | 906 | entry); |
926 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | 907 | if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { |
927 | tsk->cputime_expires.prof_exp = t->expires.cpu; | 908 | tsk->cputime_expires.prof_exp = t->expires.cpu; |
928 | break; | 909 | break; |
929 | } | 910 | } |
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk, | |||
933 | 914 | ||
934 | ++timers; | 915 | ++timers; |
935 | maxfire = 20; | 916 | maxfire = 20; |
936 | tsk->cputime_expires.virt_exp = cputime_zero; | 917 | tsk->cputime_expires.virt_exp = 0; |
937 | while (!list_empty(timers)) { | 918 | while (!list_empty(timers)) { |
938 | struct cpu_timer_list *t = list_first_entry(timers, | 919 | struct cpu_timer_list *t = list_first_entry(timers, |
939 | struct cpu_timer_list, | 920 | struct cpu_timer_list, |
940 | entry); | 921 | entry); |
941 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | 922 | if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { |
942 | tsk->cputime_expires.virt_exp = t->expires.cpu; | 923 | tsk->cputime_expires.virt_exp = t->expires.cpu; |
943 | break; | 924 | break; |
944 | } | 925 | } |
@@ -999,9 +980,9 @@ static void stop_process_timers(struct signal_struct *sig) | |||
999 | struct thread_group_cputimer *cputimer = &sig->cputimer; | 980 | struct thread_group_cputimer *cputimer = &sig->cputimer; |
1000 | unsigned long flags; | 981 | unsigned long flags; |
1001 | 982 | ||
1002 | spin_lock_irqsave(&cputimer->lock, flags); | 983 | raw_spin_lock_irqsave(&cputimer->lock, flags); |
1003 | cputimer->running = 0; | 984 | cputimer->running = 0; |
1004 | spin_unlock_irqrestore(&cputimer->lock, flags); | 985 | raw_spin_unlock_irqrestore(&cputimer->lock, flags); |
1005 | } | 986 | } |
1006 | 987 | ||
1007 | static u32 onecputick; | 988 | static u32 onecputick; |
@@ -1009,20 +990,19 @@ static u32 onecputick; | |||
1009 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | 990 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, |
1010 | cputime_t *expires, cputime_t cur_time, int signo) | 991 | cputime_t *expires, cputime_t cur_time, int signo) |
1011 | { | 992 | { |
1012 | if (cputime_eq(it->expires, cputime_zero)) | 993 | if (!it->expires) |
1013 | return; | 994 | return; |
1014 | 995 | ||
1015 | if (cputime_ge(cur_time, it->expires)) { | 996 | if (cur_time >= it->expires) { |
1016 | if (!cputime_eq(it->incr, cputime_zero)) { | 997 | if (it->incr) { |
1017 | it->expires = cputime_add(it->expires, it->incr); | 998 | it->expires += it->incr; |
1018 | it->error += it->incr_error; | 999 | it->error += it->incr_error; |
1019 | if (it->error >= onecputick) { | 1000 | if (it->error >= onecputick) { |
1020 | it->expires = cputime_sub(it->expires, | 1001 | it->expires -= cputime_one_jiffy; |
1021 | cputime_one_jiffy); | ||
1022 | it->error -= onecputick; | 1002 | it->error -= onecputick; |
1023 | } | 1003 | } |
1024 | } else { | 1004 | } else { |
1025 | it->expires = cputime_zero; | 1005 | it->expires = 0; |
1026 | } | 1006 | } |
1027 | 1007 | ||
1028 | trace_itimer_expire(signo == SIGPROF ? | 1008 | trace_itimer_expire(signo == SIGPROF ? |
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1031 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); | 1011 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); |
1032 | } | 1012 | } |
1033 | 1013 | ||
1034 | if (!cputime_eq(it->expires, cputime_zero) && | 1014 | if (it->expires && (!*expires || it->expires < *expires)) { |
1035 | (cputime_eq(*expires, cputime_zero) || | ||
1036 | cputime_lt(it->expires, *expires))) { | ||
1037 | *expires = it->expires; | 1015 | *expires = it->expires; |
1038 | } | 1016 | } |
1039 | } | 1017 | } |
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1048 | */ | 1026 | */ |
1049 | static inline int task_cputime_zero(const struct task_cputime *cputime) | 1027 | static inline int task_cputime_zero(const struct task_cputime *cputime) |
1050 | { | 1028 | { |
1051 | if (cputime_eq(cputime->utime, cputime_zero) && | 1029 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) |
1052 | cputime_eq(cputime->stime, cputime_zero) && | ||
1053 | cputime->sum_exec_runtime == 0) | ||
1054 | return 1; | 1030 | return 1; |
1055 | return 0; | 1031 | return 0; |
1056 | } | 1032 | } |
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk, | |||
1076 | */ | 1052 | */ |
1077 | thread_group_cputimer(tsk, &cputime); | 1053 | thread_group_cputimer(tsk, &cputime); |
1078 | utime = cputime.utime; | 1054 | utime = cputime.utime; |
1079 | ptime = cputime_add(utime, cputime.stime); | 1055 | ptime = utime + cputime.stime; |
1080 | sum_sched_runtime = cputime.sum_exec_runtime; | 1056 | sum_sched_runtime = cputime.sum_exec_runtime; |
1081 | maxfire = 20; | 1057 | maxfire = 20; |
1082 | prof_expires = cputime_zero; | 1058 | prof_expires = 0; |
1083 | while (!list_empty(timers)) { | 1059 | while (!list_empty(timers)) { |
1084 | struct cpu_timer_list *tl = list_first_entry(timers, | 1060 | struct cpu_timer_list *tl = list_first_entry(timers, |
1085 | struct cpu_timer_list, | 1061 | struct cpu_timer_list, |
1086 | entry); | 1062 | entry); |
1087 | if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { | 1063 | if (!--maxfire || ptime < tl->expires.cpu) { |
1088 | prof_expires = tl->expires.cpu; | 1064 | prof_expires = tl->expires.cpu; |
1089 | break; | 1065 | break; |
1090 | } | 1066 | } |
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk, | |||
1094 | 1070 | ||
1095 | ++timers; | 1071 | ++timers; |
1096 | maxfire = 20; | 1072 | maxfire = 20; |
1097 | virt_expires = cputime_zero; | 1073 | virt_expires = 0; |
1098 | while (!list_empty(timers)) { | 1074 | while (!list_empty(timers)) { |
1099 | struct cpu_timer_list *tl = list_first_entry(timers, | 1075 | struct cpu_timer_list *tl = list_first_entry(timers, |
1100 | struct cpu_timer_list, | 1076 | struct cpu_timer_list, |
1101 | entry); | 1077 | entry); |
1102 | if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { | 1078 | if (!--maxfire || utime < tl->expires.cpu) { |
1103 | virt_expires = tl->expires.cpu; | 1079 | virt_expires = tl->expires.cpu; |
1104 | break; | 1080 | break; |
1105 | } | 1081 | } |
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1154 | } | 1130 | } |
1155 | } | 1131 | } |
1156 | x = secs_to_cputime(soft); | 1132 | x = secs_to_cputime(soft); |
1157 | if (cputime_eq(prof_expires, cputime_zero) || | 1133 | if (!prof_expires || x < prof_expires) { |
1158 | cputime_lt(x, prof_expires)) { | ||
1159 | prof_expires = x; | 1134 | prof_expires = x; |
1160 | } | 1135 | } |
1161 | } | 1136 | } |
@@ -1249,12 +1224,9 @@ out: | |||
1249 | static inline int task_cputime_expired(const struct task_cputime *sample, | 1224 | static inline int task_cputime_expired(const struct task_cputime *sample, |
1250 | const struct task_cputime *expires) | 1225 | const struct task_cputime *expires) |
1251 | { | 1226 | { |
1252 | if (!cputime_eq(expires->utime, cputime_zero) && | 1227 | if (expires->utime && sample->utime >= expires->utime) |
1253 | cputime_ge(sample->utime, expires->utime)) | ||
1254 | return 1; | 1228 | return 1; |
1255 | if (!cputime_eq(expires->stime, cputime_zero) && | 1229 | if (expires->stime && sample->utime + sample->stime >= expires->stime) |
1256 | cputime_ge(cputime_add(sample->utime, sample->stime), | ||
1257 | expires->stime)) | ||
1258 | return 1; | 1230 | return 1; |
1259 | if (expires->sum_exec_runtime != 0 && | 1231 | if (expires->sum_exec_runtime != 0 && |
1260 | sample->sum_exec_runtime >= expires->sum_exec_runtime) | 1232 | sample->sum_exec_runtime >= expires->sum_exec_runtime) |
@@ -1291,9 +1263,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) | |||
1291 | if (sig->cputimer.running) { | 1263 | if (sig->cputimer.running) { |
1292 | struct task_cputime group_sample; | 1264 | struct task_cputime group_sample; |
1293 | 1265 | ||
1294 | spin_lock(&sig->cputimer.lock); | 1266 | raw_spin_lock(&sig->cputimer.lock); |
1295 | group_sample = sig->cputimer.cputime; | 1267 | group_sample = sig->cputimer.cputime; |
1296 | spin_unlock(&sig->cputimer.lock); | 1268 | raw_spin_unlock(&sig->cputimer.lock); |
1297 | 1269 | ||
1298 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) | 1270 | if (task_cputime_expired(&group_sample, &sig->cputime_expires)) |
1299 | return 1; | 1271 | return 1; |
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1389 | * it to be relative, *newval argument is relative and we update | 1361 | * it to be relative, *newval argument is relative and we update |
1390 | * it to be absolute. | 1362 | * it to be absolute. |
1391 | */ | 1363 | */ |
1392 | if (!cputime_eq(*oldval, cputime_zero)) { | 1364 | if (*oldval) { |
1393 | if (cputime_le(*oldval, now.cpu)) { | 1365 | if (*oldval <= now.cpu) { |
1394 | /* Just about to fire. */ | 1366 | /* Just about to fire. */ |
1395 | *oldval = cputime_one_jiffy; | 1367 | *oldval = cputime_one_jiffy; |
1396 | } else { | 1368 | } else { |
1397 | *oldval = cputime_sub(*oldval, now.cpu); | 1369 | *oldval -= now.cpu; |
1398 | } | 1370 | } |
1399 | } | 1371 | } |
1400 | 1372 | ||
1401 | if (cputime_eq(*newval, cputime_zero)) | 1373 | if (!*newval) |
1402 | return; | 1374 | return; |
1403 | *newval = cputime_add(*newval, now.cpu); | 1375 | *newval += now.cpu; |
1404 | } | 1376 | } |
1405 | 1377 | ||
1406 | /* | 1378 | /* |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4556182527f3..69185ae6b701 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -46,7 +46,7 @@ | |||
46 | #include <linux/syscalls.h> | 46 | #include <linux/syscalls.h> |
47 | #include <linux/wait.h> | 47 | #include <linux/wait.h> |
48 | #include <linux/workqueue.h> | 48 | #include <linux/workqueue.h> |
49 | #include <linux/module.h> | 49 | #include <linux/export.h> |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Management arrays for POSIX timers. Timers are kept in slab memory | 52 | * Management arrays for POSIX timers. Timers are kept in slab memory |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3744c594b19b..deb5461e3216 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -27,6 +27,7 @@ config HIBERNATION | |||
27 | select HIBERNATE_CALLBACKS | 27 | select HIBERNATE_CALLBACKS |
28 | select LZO_COMPRESS | 28 | select LZO_COMPRESS |
29 | select LZO_DECOMPRESS | 29 | select LZO_DECOMPRESS |
30 | select CRC32 | ||
30 | ---help--- | 31 | ---help--- |
31 | Enable the suspend to disk (STD) functionality, which is usually | 32 | Enable the suspend to disk (STD) functionality, which is usually |
32 | called "hibernation" in user interfaces. STD checkpoints the | 33 | called "hibernation" in user interfaces. STD checkpoints the |
@@ -65,6 +66,9 @@ config HIBERNATION | |||
65 | 66 | ||
66 | For more information take a look at <file:Documentation/power/swsusp.txt>. | 67 | For more information take a look at <file:Documentation/power/swsusp.txt>. |
67 | 68 | ||
69 | config ARCH_SAVE_PAGE_KEYS | ||
70 | bool | ||
71 | |||
68 | config PM_STD_PARTITION | 72 | config PM_STD_PARTITION |
69 | string "Default resume partition" | 73 | string "Default resume partition" |
70 | depends on HIBERNATION | 74 | depends on HIBERNATION |
@@ -235,3 +239,7 @@ config PM_GENERIC_DOMAINS | |||
235 | config PM_GENERIC_DOMAINS_RUNTIME | 239 | config PM_GENERIC_DOMAINS_RUNTIME |
236 | def_bool y | 240 | def_bool y |
237 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS | 241 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS |
242 | |||
243 | config CPU_PM | ||
244 | bool | ||
245 | depends on SUSPEND || CPU_IDLE | ||
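The new CPU_PM symbol gates the CPU power-management notifier core that platform idle and suspend code can use to save and restore per-CPU state around low-power entry. A minimal consumer sketch, assuming the cpu_pm_register_notifier()/CPU_PM_ENTER/CPU_PM_EXIT interface from <linux/cpu_pm.h>; that interface is not shown in the hunk above and the callback body is hypothetical:

#include <linux/cpu_pm.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int example_cpu_pm_notify(struct notifier_block *nb,
                                 unsigned long action, void *data)
{
        switch (action) {
        case CPU_PM_ENTER:
                /* save per-CPU hardware context here (hypothetical) */
                break;
        case CPU_PM_EXIT:
                /* restore it on the way back up (hypothetical) */
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block example_cpu_pm_nb = {
        .notifier_call = example_cpu_pm_notify,
};

static int __init example_cpu_pm_init(void)
{
        return cpu_pm_register_notifier(&example_cpu_pm_nb);
}
core_initcall(example_cpu_pm_init);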
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c5ebc6a90643..07e0e28ffba7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -1,8 +1,8 @@ | |||
1 | 1 | ||
2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG | 2 | ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG |
3 | 3 | ||
4 | obj-$(CONFIG_PM) += main.o | 4 | obj-$(CONFIG_PM) += main.o qos.o |
5 | obj-$(CONFIG_PM_SLEEP) += console.o | 5 | obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o |
6 | obj-$(CONFIG_FREEZER) += process.o | 6 | obj-$(CONFIG_FREEZER) += process.o |
7 | obj-$(CONFIG_SUSPEND) += suspend.o | 7 | obj-$(CONFIG_SUSPEND) += suspend.o |
8 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o | 8 | obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o |
diff --git a/kernel/power/console.c b/kernel/power/console.c index 218e5af90156..b1dc456474b5 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * drivers/power/process.c - Functions for saving/restoring console. | 2 | * Functions for saving/restoring console. |
3 | * | 3 | * |
4 | * Originally from swsusp. | 4 | * Originally from swsusp. |
5 | */ | 5 | */ |
@@ -10,7 +10,6 @@ | |||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include "power.h" | 11 | #include "power.h" |
12 | 12 | ||
13 | #if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) | ||
14 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 13 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
15 | 14 | ||
16 | static int orig_fgconsole, orig_kmsg; | 15 | static int orig_fgconsole, orig_kmsg; |
@@ -32,4 +31,3 @@ void pm_restore_console(void) | |||
32 | vt_kmsg_redirect(orig_kmsg); | 31 | vt_kmsg_redirect(orig_kmsg); |
33 | } | 32 | } |
34 | } | 33 | } |
35 | #endif | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8f7b1db1ece1..6d6d28870335 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -9,11 +9,13 @@ | |||
9 | * This file is released under the GPLv2. | 9 | * This file is released under the GPLv2. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/export.h> | ||
12 | #include <linux/suspend.h> | 13 | #include <linux/suspend.h> |
13 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
14 | #include <linux/reboot.h> | 15 | #include <linux/reboot.h> |
15 | #include <linux/string.h> | 16 | #include <linux/string.h> |
16 | #include <linux/device.h> | 17 | #include <linux/device.h> |
18 | #include <linux/async.h> | ||
17 | #include <linux/kmod.h> | 19 | #include <linux/kmod.h> |
18 | #include <linux/delay.h> | 20 | #include <linux/delay.h> |
19 | #include <linux/fs.h> | 21 | #include <linux/fs.h> |
@@ -29,18 +31,18 @@ | |||
29 | #include "power.h" | 31 | #include "power.h" |
30 | 32 | ||
31 | 33 | ||
32 | static int nocompress = 0; | 34 | static int nocompress; |
33 | static int noresume = 0; | 35 | static int noresume; |
36 | static int resume_wait; | ||
37 | static int resume_delay; | ||
34 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; | 38 | static char resume_file[256] = CONFIG_PM_STD_PARTITION; |
35 | dev_t swsusp_resume_device; | 39 | dev_t swsusp_resume_device; |
36 | sector_t swsusp_resume_block; | 40 | sector_t swsusp_resume_block; |
37 | int in_suspend __nosavedata = 0; | 41 | int in_suspend __nosavedata; |
38 | 42 | ||
39 | enum { | 43 | enum { |
40 | HIBERNATION_INVALID, | 44 | HIBERNATION_INVALID, |
41 | HIBERNATION_PLATFORM, | 45 | HIBERNATION_PLATFORM, |
42 | HIBERNATION_TEST, | ||
43 | HIBERNATION_TESTPROC, | ||
44 | HIBERNATION_SHUTDOWN, | 46 | HIBERNATION_SHUTDOWN, |
45 | HIBERNATION_REBOOT, | 47 | HIBERNATION_REBOOT, |
46 | /* keep last */ | 48 | /* keep last */ |
@@ -51,6 +53,8 @@ enum { | |||
51 | 53 | ||
52 | static int hibernation_mode = HIBERNATION_SHUTDOWN; | 54 | static int hibernation_mode = HIBERNATION_SHUTDOWN; |
53 | 55 | ||
56 | bool freezer_test_done; | ||
57 | |||
54 | static const struct platform_hibernation_ops *hibernation_ops; | 58 | static const struct platform_hibernation_ops *hibernation_ops; |
55 | 59 | ||
56 | /** | 60 | /** |
@@ -65,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops) | |||
65 | WARN_ON(1); | 69 | WARN_ON(1); |
66 | return; | 70 | return; |
67 | } | 71 | } |
68 | mutex_lock(&pm_mutex); | 72 | lock_system_sleep(); |
69 | hibernation_ops = ops; | 73 | hibernation_ops = ops; |
70 | if (ops) | 74 | if (ops) |
71 | hibernation_mode = HIBERNATION_PLATFORM; | 75 | hibernation_mode = HIBERNATION_PLATFORM; |
72 | else if (hibernation_mode == HIBERNATION_PLATFORM) | 76 | else if (hibernation_mode == HIBERNATION_PLATFORM) |
73 | hibernation_mode = HIBERNATION_SHUTDOWN; | 77 | hibernation_mode = HIBERNATION_SHUTDOWN; |
74 | 78 | ||
75 | mutex_unlock(&pm_mutex); | 79 | unlock_system_sleep(); |
76 | } | 80 | } |
77 | 81 | ||
78 | static bool entering_platform_hibernation; | 82 | static bool entering_platform_hibernation; |
@@ -90,15 +94,6 @@ static void hibernation_debug_sleep(void) | |||
90 | mdelay(5000); | 94 | mdelay(5000); |
91 | } | 95 | } |
92 | 96 | ||
93 | static int hibernation_testmode(int mode) | ||
94 | { | ||
95 | if (hibernation_mode == mode) { | ||
96 | hibernation_debug_sleep(); | ||
97 | return 1; | ||
98 | } | ||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | static int hibernation_test(int level) | 97 | static int hibernation_test(int level) |
103 | { | 98 | { |
104 | if (pm_test_level == level) { | 99 | if (pm_test_level == level) { |
@@ -108,7 +103,6 @@ static int hibernation_test(int level) | |||
108 | return 0; | 103 | return 0; |
109 | } | 104 | } |
110 | #else /* !CONFIG_PM_DEBUG */ | 105 | #else /* !CONFIG_PM_DEBUG */ |
111 | static int hibernation_testmode(int mode) { return 0; } | ||
112 | static int hibernation_test(int level) { return 0; } | 106 | static int hibernation_test(int level) { return 0; } |
113 | #endif /* !CONFIG_PM_DEBUG */ | 107 | #endif /* !CONFIG_PM_DEBUG */ |
114 | 108 | ||
@@ -272,8 +266,7 @@ static int create_image(int platform_mode) | |||
272 | goto Platform_finish; | 266 | goto Platform_finish; |
273 | 267 | ||
274 | error = disable_nonboot_cpus(); | 268 | error = disable_nonboot_cpus(); |
275 | if (error || hibernation_test(TEST_CPUS) | 269 | if (error || hibernation_test(TEST_CPUS)) |
276 | || hibernation_testmode(HIBERNATION_TEST)) | ||
277 | goto Enable_cpus; | 270 | goto Enable_cpus; |
278 | 271 | ||
279 | local_irq_disable(); | 272 | local_irq_disable(); |
@@ -327,38 +320,54 @@ static int create_image(int platform_mode) | |||
327 | */ | 320 | */ |
328 | int hibernation_snapshot(int platform_mode) | 321 | int hibernation_snapshot(int platform_mode) |
329 | { | 322 | { |
330 | pm_message_t msg = PMSG_RECOVER; | 323 | pm_message_t msg; |
331 | int error; | 324 | int error; |
332 | 325 | ||
333 | error = platform_begin(platform_mode); | 326 | error = platform_begin(platform_mode); |
334 | if (error) | 327 | if (error) |
335 | goto Close; | 328 | goto Close; |
336 | 329 | ||
337 | error = dpm_prepare(PMSG_FREEZE); | ||
338 | if (error) | ||
339 | goto Complete_devices; | ||
340 | |||
341 | /* Preallocate image memory before shutting down devices. */ | 330 | /* Preallocate image memory before shutting down devices. */ |
342 | error = hibernate_preallocate_memory(); | 331 | error = hibernate_preallocate_memory(); |
343 | if (error) | 332 | if (error) |
344 | goto Complete_devices; | 333 | goto Close; |
334 | |||
335 | error = freeze_kernel_threads(); | ||
336 | if (error) | ||
337 | goto Cleanup; | ||
338 | |||
339 | if (hibernation_test(TEST_FREEZER)) { | ||
340 | |||
341 | /* | ||
342 | * Indicate to the caller that we are returning due to a | ||
343 | * successful freezer test. | ||
344 | */ | ||
345 | freezer_test_done = true; | ||
346 | goto Cleanup; | ||
347 | } | ||
348 | |||
349 | error = dpm_prepare(PMSG_FREEZE); | ||
350 | if (error) { | ||
351 | dpm_complete(PMSG_RECOVER); | ||
352 | goto Cleanup; | ||
353 | } | ||
345 | 354 | ||
346 | suspend_console(); | 355 | suspend_console(); |
347 | pm_restrict_gfp_mask(); | 356 | pm_restrict_gfp_mask(); |
357 | |||
348 | error = dpm_suspend(PMSG_FREEZE); | 358 | error = dpm_suspend(PMSG_FREEZE); |
349 | if (error) | ||
350 | goto Recover_platform; | ||
351 | 359 | ||
352 | if (hibernation_test(TEST_DEVICES)) | 360 | if (error || hibernation_test(TEST_DEVICES)) |
353 | goto Recover_platform; | 361 | platform_recover(platform_mode); |
362 | else | ||
363 | error = create_image(platform_mode); | ||
354 | 364 | ||
355 | error = create_image(platform_mode); | ||
356 | /* | 365 | /* |
357 | * Control returns here (1) after the image has been created or the | 366 | * In the case that we call create_image() above, the control |
367 | * returns here (1) after the image has been created or the | ||
358 | * image creation has failed and (2) after a successful restore. | 368 | * image creation has failed and (2) after a successful restore. |
359 | */ | 369 | */ |
360 | 370 | ||
361 | Resume_devices: | ||
362 | /* We may need to release the preallocated image pages here. */ | 371 | /* We may need to release the preallocated image pages here. */ |
363 | if (error || !in_suspend) | 372 | if (error || !in_suspend) |
364 | swsusp_free(); | 373 | swsusp_free(); |
@@ -370,17 +379,15 @@ int hibernation_snapshot(int platform_mode) | |||
370 | pm_restore_gfp_mask(); | 379 | pm_restore_gfp_mask(); |
371 | 380 | ||
372 | resume_console(); | 381 | resume_console(); |
373 | |||
374 | Complete_devices: | ||
375 | dpm_complete(msg); | 382 | dpm_complete(msg); |
376 | 383 | ||
377 | Close: | 384 | Close: |
378 | platform_end(platform_mode); | 385 | platform_end(platform_mode); |
379 | return error; | 386 | return error; |
380 | 387 | ||
381 | Recover_platform: | 388 | Cleanup: |
382 | platform_recover(platform_mode); | 389 | swsusp_free(); |
383 | goto Resume_devices; | 390 | goto Close; |
384 | } | 391 | } |
385 | 392 | ||
386 | /** | 393 | /** |
@@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode) | |||
463 | * @platform_mode: If set, use platform driver to prepare for the transition. | 470 | * @platform_mode: If set, use platform driver to prepare for the transition. |
464 | * | 471 | * |
465 | * This routine must be called with pm_mutex held. If it is successful, control | 472 | * This routine must be called with pm_mutex held. If it is successful, control |
466 | * reappears in the restored target kernel in hibernation_snaphot(). | 473 | * reappears in the restored target kernel in hibernation_snapshot(). |
467 | */ | 474 | */ |
468 | int hibernation_restore(int platform_mode) | 475 | int hibernation_restore(int platform_mode) |
469 | { | 476 | { |
@@ -565,9 +572,6 @@ int hibernation_platform_enter(void) | |||
565 | static void power_down(void) | 572 | static void power_down(void) |
566 | { | 573 | { |
567 | switch (hibernation_mode) { | 574 | switch (hibernation_mode) { |
568 | case HIBERNATION_TEST: | ||
569 | case HIBERNATION_TESTPROC: | ||
570 | break; | ||
571 | case HIBERNATION_REBOOT: | 575 | case HIBERNATION_REBOOT: |
572 | kernel_restart(NULL); | 576 | kernel_restart(NULL); |
573 | break; | 577 | break; |
@@ -586,17 +590,6 @@ static void power_down(void) | |||
586 | while(1); | 590 | while(1); |
587 | } | 591 | } |
588 | 592 | ||
589 | static int prepare_processes(void) | ||
590 | { | ||
591 | int error = 0; | ||
592 | |||
593 | if (freeze_processes()) { | ||
594 | error = -EBUSY; | ||
595 | thaw_processes(); | ||
596 | } | ||
597 | return error; | ||
598 | } | ||
599 | |||
600 | /** | 593 | /** |
601 | * hibernate - Carry out system hibernation, including saving the image. | 594 | * hibernate - Carry out system hibernation, including saving the image. |
602 | */ | 595 | */ |
@@ -604,7 +597,7 @@ int hibernate(void) | |||
604 | { | 597 | { |
605 | int error; | 598 | int error; |
606 | 599 | ||
607 | mutex_lock(&pm_mutex); | 600 | lock_system_sleep(); |
608 | /* The snapshot device should not be opened while we're running */ | 601 | /* The snapshot device should not be opened while we're running */ |
609 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { | 602 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
610 | error = -EBUSY; | 603 | error = -EBUSY; |
@@ -629,19 +622,17 @@ int hibernate(void) | |||
629 | sys_sync(); | 622 | sys_sync(); |
630 | printk("done.\n"); | 623 | printk("done.\n"); |
631 | 624 | ||
632 | error = prepare_processes(); | 625 | error = freeze_processes(); |
633 | if (error) | 626 | if (error) |
634 | goto Finish; | 627 | goto Finish; |
635 | 628 | ||
636 | if (hibernation_test(TEST_FREEZER)) | ||
637 | goto Thaw; | ||
638 | |||
639 | if (hibernation_testmode(HIBERNATION_TESTPROC)) | ||
640 | goto Thaw; | ||
641 | |||
642 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); | 629 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); |
643 | if (error) | 630 | if (error) |
644 | goto Thaw; | 631 | goto Thaw; |
632 | if (freezer_test_done) { | ||
633 | freezer_test_done = false; | ||
634 | goto Thaw; | ||
635 | } | ||
645 | 636 | ||
646 | if (in_suspend) { | 637 | if (in_suspend) { |
647 | unsigned int flags = 0; | 638 | unsigned int flags = 0; |
@@ -650,6 +641,9 @@ int hibernate(void) | |||
650 | flags |= SF_PLATFORM_MODE; | 641 | flags |= SF_PLATFORM_MODE; |
651 | if (nocompress) | 642 | if (nocompress) |
652 | flags |= SF_NOCOMPRESS_MODE; | 643 | flags |= SF_NOCOMPRESS_MODE; |
644 | else | ||
645 | flags |= SF_CRC32_MODE; | ||
646 | |||
653 | pr_debug("PM: writing image.\n"); | 647 | pr_debug("PM: writing image.\n"); |
654 | error = swsusp_write(flags); | 648 | error = swsusp_write(flags); |
655 | swsusp_free(); | 649 | swsusp_free(); |
@@ -671,7 +665,7 @@ int hibernate(void) | |||
671 | pm_restore_console(); | 665 | pm_restore_console(); |
672 | atomic_inc(&snapshot_device_available); | 666 | atomic_inc(&snapshot_device_available); |
673 | Unlock: | 667 | Unlock: |
674 | mutex_unlock(&pm_mutex); | 668 | unlock_system_sleep(); |
675 | return error; | 669 | return error; |
676 | } | 670 | } |
677 | 671 | ||
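With the "test" and "testproc" hibernation modes removed, a freezer-only dry run is requested through the generic pm_test facility instead: selecting the "freezer" level in /sys/power/pm_test and then triggering hibernation makes hibernation_snapshot() stop right after freezing tasks and report that through freezer_test_done, which hibernate() clears before thawing, as in the hunk above; no image is created or written in that case. (The level name "freezer" corresponds to TEST_FREEZER; the exact sysfs strings are not part of these hunks.)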
@@ -724,6 +718,12 @@ static int software_resume(void) | |||
724 | 718 | ||
725 | pr_debug("PM: Checking hibernation image partition %s\n", resume_file); | 719 | pr_debug("PM: Checking hibernation image partition %s\n", resume_file); |
726 | 720 | ||
721 | if (resume_delay) { | ||
722 | printk(KERN_INFO "Waiting %dsec before reading resume device...\n", | ||
723 | resume_delay); | ||
724 | ssleep(resume_delay); | ||
725 | } | ||
726 | |||
727 | /* Check if the device is there */ | 727 | /* Check if the device is there */ |
728 | swsusp_resume_device = name_to_dev_t(resume_file); | 728 | swsusp_resume_device = name_to_dev_t(resume_file); |
729 | if (!swsusp_resume_device) { | 729 | if (!swsusp_resume_device) { |
@@ -732,6 +732,13 @@ static int software_resume(void) | |||
732 | * to wait for this to finish. | 732 | * to wait for this to finish. |
733 | */ | 733 | */ |
734 | wait_for_device_probe(); | 734 | wait_for_device_probe(); |
735 | |||
736 | if (resume_wait) { | ||
737 | while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) | ||
738 | msleep(10); | ||
739 | async_synchronize_full(); | ||
740 | } | ||
741 | |||
735 | /* | 742 | /* |
736 | * We can't depend on SCSI devices being available after loading | 743 | * We can't depend on SCSI devices being available after loading |
737 | * one of their modules until scsi_complete_async_scans() is | 744 | * one of their modules until scsi_complete_async_scans() is |
@@ -772,11 +779,13 @@ static int software_resume(void) | |||
772 | goto close_finish; | 779 | goto close_finish; |
773 | 780 | ||
774 | error = create_basic_memory_bitmaps(); | 781 | error = create_basic_memory_bitmaps(); |
775 | if (error) | 782 | if (error) { |
783 | usermodehelper_enable(); | ||
776 | goto close_finish; | 784 | goto close_finish; |
785 | } | ||
777 | 786 | ||
778 | pr_debug("PM: Preparing processes for restore.\n"); | 787 | pr_debug("PM: Preparing processes for restore.\n"); |
779 | error = prepare_processes(); | 788 | error = freeze_processes(); |
780 | if (error) { | 789 | if (error) { |
781 | swsusp_close(FMODE_READ); | 790 | swsusp_close(FMODE_READ); |
782 | goto Done; | 791 | goto Done; |
@@ -816,8 +825,6 @@ static const char * const hibernation_modes[] = { | |||
816 | [HIBERNATION_PLATFORM] = "platform", | 825 | [HIBERNATION_PLATFORM] = "platform", |
817 | [HIBERNATION_SHUTDOWN] = "shutdown", | 826 | [HIBERNATION_SHUTDOWN] = "shutdown", |
818 | [HIBERNATION_REBOOT] = "reboot", | 827 | [HIBERNATION_REBOOT] = "reboot", |
819 | [HIBERNATION_TEST] = "test", | ||
820 | [HIBERNATION_TESTPROC] = "testproc", | ||
821 | }; | 828 | }; |
822 | 829 | ||
823 | /* | 830 | /* |
@@ -826,17 +833,15 @@ static const char * const hibernation_modes[] = { | |||
826 | * Hibernation can be handled in several ways. There are a few different ways | 833 | * Hibernation can be handled in several ways. There are a few different ways |
827 | * to put the system into the sleep state: using the platform driver (e.g. ACPI | 834 | * to put the system into the sleep state: using the platform driver (e.g. ACPI |
828 | * or other hibernation_ops), powering it off or rebooting it (for testing | 835 | * or other hibernation_ops), powering it off or rebooting it (for testing |
829 | * mostly), or using one of the two available test modes. | 836 | * mostly). |
830 | * | 837 | * |
831 | * The sysfs file /sys/power/disk provides an interface for selecting the | 838 | * The sysfs file /sys/power/disk provides an interface for selecting the |
832 | * hibernation mode to use. Reading from this file causes the available modes | 839 | * hibernation mode to use. Reading from this file causes the available modes |
833 | * to be printed. There are 5 modes that can be supported: | 840 | * to be printed. There are 3 modes that can be supported: |
834 | * | 841 | * |
835 | * 'platform' | 842 | * 'platform' |
836 | * 'shutdown' | 843 | * 'shutdown' |
837 | * 'reboot' | 844 | * 'reboot' |
838 | * 'test' | ||
839 | * 'testproc' | ||
840 | * | 845 | * |
841 | * If a platform hibernation driver is in use, 'platform' will be supported | 846 | * If a platform hibernation driver is in use, 'platform' will be supported |
842 | * and will be used by default. Otherwise, 'shutdown' will be used by default. | 847 | * and will be used by default. Otherwise, 'shutdown' will be used by default. |
@@ -860,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
860 | switch (i) { | 865 | switch (i) { |
861 | case HIBERNATION_SHUTDOWN: | 866 | case HIBERNATION_SHUTDOWN: |
862 | case HIBERNATION_REBOOT: | 867 | case HIBERNATION_REBOOT: |
863 | case HIBERNATION_TEST: | ||
864 | case HIBERNATION_TESTPROC: | ||
865 | break; | 868 | break; |
866 | case HIBERNATION_PLATFORM: | 869 | case HIBERNATION_PLATFORM: |
867 | if (hibernation_ops) | 870 | if (hibernation_ops) |
@@ -890,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
890 | p = memchr(buf, '\n', n); | 893 | p = memchr(buf, '\n', n); |
891 | len = p ? p - buf : n; | 894 | len = p ? p - buf : n; |
892 | 895 | ||
893 | mutex_lock(&pm_mutex); | 896 | lock_system_sleep(); |
894 | for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { | 897 | for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { |
895 | if (len == strlen(hibernation_modes[i]) | 898 | if (len == strlen(hibernation_modes[i]) |
896 | && !strncmp(buf, hibernation_modes[i], len)) { | 899 | && !strncmp(buf, hibernation_modes[i], len)) { |
@@ -902,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
902 | switch (mode) { | 905 | switch (mode) { |
903 | case HIBERNATION_SHUTDOWN: | 906 | case HIBERNATION_SHUTDOWN: |
904 | case HIBERNATION_REBOOT: | 907 | case HIBERNATION_REBOOT: |
905 | case HIBERNATION_TEST: | ||
906 | case HIBERNATION_TESTPROC: | ||
907 | hibernation_mode = mode; | 908 | hibernation_mode = mode; |
908 | break; | 909 | break; |
909 | case HIBERNATION_PLATFORM: | 910 | case HIBERNATION_PLATFORM: |
@@ -918,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
918 | if (!error) | 919 | if (!error) |
919 | pr_debug("PM: Hibernation mode set to '%s'\n", | 920 | pr_debug("PM: Hibernation mode set to '%s'\n", |
920 | hibernation_modes[mode]); | 921 | hibernation_modes[mode]); |
921 | mutex_unlock(&pm_mutex); | 922 | unlock_system_sleep(); |
922 | return error ? error : n; | 923 | return error ? error : n; |
923 | } | 924 | } |
924 | 925 | ||
@@ -945,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
945 | if (maj != MAJOR(res) || min != MINOR(res)) | 946 | if (maj != MAJOR(res) || min != MINOR(res)) |
946 | goto out; | 947 | goto out; |
947 | 948 | ||
948 | mutex_lock(&pm_mutex); | 949 | lock_system_sleep(); |
949 | swsusp_resume_device = res; | 950 | swsusp_resume_device = res; |
950 | mutex_unlock(&pm_mutex); | 951 | unlock_system_sleep(); |
951 | printk(KERN_INFO "PM: Starting manual resume from disk\n"); | 952 | printk(KERN_INFO "PM: Starting manual resume from disk\n"); |
952 | noresume = 0; | 953 | noresume = 0; |
953 | software_resume(); | 954 | software_resume(); |
@@ -1060,7 +1061,21 @@ static int __init noresume_setup(char *str) | |||
1060 | return 1; | 1061 | return 1; |
1061 | } | 1062 | } |
1062 | 1063 | ||
1064 | static int __init resumewait_setup(char *str) | ||
1065 | { | ||
1066 | resume_wait = 1; | ||
1067 | return 1; | ||
1068 | } | ||
1069 | |||
1070 | static int __init resumedelay_setup(char *str) | ||
1071 | { | ||
1072 | resume_delay = simple_strtoul(str, NULL, 0); | ||
1073 | return 1; | ||
1074 | } | ||
1075 | |||
1063 | __setup("noresume", noresume_setup); | 1076 | __setup("noresume", noresume_setup); |
1064 | __setup("resume_offset=", resume_offset_setup); | 1077 | __setup("resume_offset=", resume_offset_setup); |
1065 | __setup("resume=", resume_setup); | 1078 | __setup("resume=", resume_setup); |
1066 | __setup("hibernate=", hibernate_setup); | 1079 | __setup("hibernate=", hibernate_setup); |
1080 | __setup("resumewait", resumewait_setup); | ||
1081 | __setup("resumedelay=", resumedelay_setup); | ||
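The two added boot options complement the existing resume= parameter: "resumewait" makes software_resume() poll name_to_dev_t() until the resume device appears (useful for devices that are probed asynchronously or show up late), and "resumedelay=<seconds>" inserts a fixed sleep before the device is looked up at all. An illustrative command line, with a hypothetical device name, would be "resume=/dev/mmcblk0p2 resumewait resumedelay=2".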
diff --git a/kernel/power/main.c b/kernel/power/main.c index 6c601f871964..9824b41e5a18 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -3,15 +3,18 @@ | |||
3 | * | 3 | * |
4 | * Copyright (c) 2003 Patrick Mochel | 4 | * Copyright (c) 2003 Patrick Mochel |
5 | * Copyright (c) 2003 Open Source Development Lab | 5 | * Copyright (c) 2003 Open Source Development Lab |
6 | * | 6 | * |
7 | * This file is released under the GPLv2 | 7 | * This file is released under the GPLv2 |
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/export.h> | ||
11 | #include <linux/kobject.h> | 12 | #include <linux/kobject.h> |
12 | #include <linux/string.h> | 13 | #include <linux/string.h> |
13 | #include <linux/resume-trace.h> | 14 | #include <linux/resume-trace.h> |
14 | #include <linux/workqueue.h> | 15 | #include <linux/workqueue.h> |
16 | #include <linux/debugfs.h> | ||
17 | #include <linux/seq_file.h> | ||
15 | 18 | ||
16 | #include "power.h" | 19 | #include "power.h" |
17 | 20 | ||
@@ -113,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
113 | p = memchr(buf, '\n', n); | 116 | p = memchr(buf, '\n', n); |
114 | len = p ? p - buf : n; | 117 | len = p ? p - buf : n; |
115 | 118 | ||
116 | mutex_lock(&pm_mutex); | 119 | lock_system_sleep(); |
117 | 120 | ||
118 | level = TEST_FIRST; | 121 | level = TEST_FIRST; |
119 | for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) | 122 | for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) |
@@ -123,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
123 | break; | 126 | break; |
124 | } | 127 | } |
125 | 128 | ||
126 | mutex_unlock(&pm_mutex); | 129 | unlock_system_sleep(); |
127 | 130 | ||
128 | return error ? error : n; | 131 | return error ? error : n; |
129 | } | 132 | } |
@@ -131,6 +134,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
131 | power_attr(pm_test); | 134 | power_attr(pm_test); |
132 | #endif /* CONFIG_PM_DEBUG */ | 135 | #endif /* CONFIG_PM_DEBUG */ |
133 | 136 | ||
137 | #ifdef CONFIG_DEBUG_FS | ||
138 | static char *suspend_step_name(enum suspend_stat_step step) | ||
139 | { | ||
140 | switch (step) { | ||
141 | case SUSPEND_FREEZE: | ||
142 | return "freeze"; | ||
143 | case SUSPEND_PREPARE: | ||
144 | return "prepare"; | ||
145 | case SUSPEND_SUSPEND: | ||
146 | return "suspend"; | ||
147 | case SUSPEND_SUSPEND_NOIRQ: | ||
148 | return "suspend_noirq"; | ||
149 | case SUSPEND_RESUME_NOIRQ: | ||
150 | return "resume_noirq"; | ||
151 | case SUSPEND_RESUME: | ||
152 | return "resume"; | ||
153 | default: | ||
154 | return ""; | ||
155 | } | ||
156 | } | ||
157 | |||
158 | static int suspend_stats_show(struct seq_file *s, void *unused) | ||
159 | { | ||
160 | int i, index, last_dev, last_errno, last_step; | ||
161 | |||
162 | last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; | ||
163 | last_dev %= REC_FAILED_NUM; | ||
164 | last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; | ||
165 | last_errno %= REC_FAILED_NUM; | ||
166 | last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; | ||
167 | last_step %= REC_FAILED_NUM; | ||
168 | seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" | ||
169 | "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", | ||
170 | "success", suspend_stats.success, | ||
171 | "fail", suspend_stats.fail, | ||
172 | "failed_freeze", suspend_stats.failed_freeze, | ||
173 | "failed_prepare", suspend_stats.failed_prepare, | ||
174 | "failed_suspend", suspend_stats.failed_suspend, | ||
175 | "failed_suspend_noirq", | ||
176 | suspend_stats.failed_suspend_noirq, | ||
177 | "failed_resume", suspend_stats.failed_resume, | ||
178 | "failed_resume_noirq", | ||
179 | suspend_stats.failed_resume_noirq); | ||
180 | seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", | ||
181 | suspend_stats.failed_devs[last_dev]); | ||
182 | for (i = 1; i < REC_FAILED_NUM; i++) { | ||
183 | index = last_dev + REC_FAILED_NUM - i; | ||
184 | index %= REC_FAILED_NUM; | ||
185 | seq_printf(s, "\t\t\t%-s\n", | ||
186 | suspend_stats.failed_devs[index]); | ||
187 | } | ||
188 | seq_printf(s, " last_failed_errno:\t%-d\n", | ||
189 | suspend_stats.errno[last_errno]); | ||
190 | for (i = 1; i < REC_FAILED_NUM; i++) { | ||
191 | index = last_errno + REC_FAILED_NUM - i; | ||
192 | index %= REC_FAILED_NUM; | ||
193 | seq_printf(s, "\t\t\t%-d\n", | ||
194 | suspend_stats.errno[index]); | ||
195 | } | ||
196 | seq_printf(s, " last_failed_step:\t%-s\n", | ||
197 | suspend_step_name( | ||
198 | suspend_stats.failed_steps[last_step])); | ||
199 | for (i = 1; i < REC_FAILED_NUM; i++) { | ||
200 | index = last_step + REC_FAILED_NUM - i; | ||
201 | index %= REC_FAILED_NUM; | ||
202 | seq_printf(s, "\t\t\t%-s\n", | ||
203 | suspend_step_name( | ||
204 | suspend_stats.failed_steps[index])); | ||
205 | } | ||
206 | |||
207 | return 0; | ||
208 | } | ||
209 | |||
210 | static int suspend_stats_open(struct inode *inode, struct file *file) | ||
211 | { | ||
212 | return single_open(file, suspend_stats_show, NULL); | ||
213 | } | ||
214 | |||
215 | static const struct file_operations suspend_stats_operations = { | ||
216 | .open = suspend_stats_open, | ||
217 | .read = seq_read, | ||
218 | .llseek = seq_lseek, | ||
219 | .release = single_release, | ||
220 | }; | ||
221 | |||
222 | static int __init pm_debugfs_init(void) | ||
223 | { | ||
224 | debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, | ||
225 | NULL, NULL, &suspend_stats_operations); | ||
226 | return 0; | ||
227 | } | ||
228 | |||
229 | late_initcall(pm_debugfs_init); | ||
230 | #endif /* CONFIG_DEBUG_FS */ | ||
231 | |||
134 | #endif /* CONFIG_PM_SLEEP */ | 232 | #endif /* CONFIG_PM_SLEEP */ |
135 | 233 | ||
136 | struct kobject *power_kobj; | 234 | struct kobject *power_kobj; |
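suspend_stats_show() exposes the suspend bookkeeping through a single read-only debugfs file, created at the debugfs root as "suspend_stats". Going by the seq_printf() format strings above, its output looks roughly like the following; every count, errno and device name here is invented purely for illustration:

success: 14
fail: 1
failed_freeze: 0
failed_prepare: 0
failed_suspend: 1
failed_suspend_noirq: 0
failed_resume: 0
failed_resume_noirq: 0
failures:
 last_failed_dev:	example-dev
			...
 last_failed_errno:	-16
			...
 last_failed_step:	suspend
			...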
@@ -142,7 +240,7 @@ struct kobject *power_kobj; | |||
142 | * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and | 240 | * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and |
143 | * 'disk' (Suspend-to-Disk). | 241 | * 'disk' (Suspend-to-Disk). |
144 | * | 242 | * |
145 | * store() accepts one of those strings, translates it into the | 243 | * store() accepts one of those strings, translates it into the |
146 | * proper enumerated value, and initiates a suspend transition. | 244 | * proper enumerated value, and initiates a suspend transition. |
147 | */ | 245 | */ |
148 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, | 246 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, |
@@ -184,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
184 | /* First, check if we are requested to hibernate */ | 282 | /* First, check if we are requested to hibernate */ |
185 | if (len == 4 && !strncmp(buf, "disk", len)) { | 283 | if (len == 4 && !strncmp(buf, "disk", len)) { |
186 | error = hibernate(); | 284 | error = hibernate(); |
187 | goto Exit; | 285 | goto Exit; |
188 | } | 286 | } |
189 | 287 | ||
190 | #ifdef CONFIG_SUSPEND | 288 | #ifdef CONFIG_SUSPEND |
@@ -192,8 +290,14 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
192 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) | 290 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) |
193 | break; | 291 | break; |
194 | } | 292 | } |
195 | if (state < PM_SUSPEND_MAX && *s) | 293 | if (state < PM_SUSPEND_MAX && *s) { |
196 | error = enter_state(state); | 294 | error = enter_state(state); |
295 | if (error) { | ||
296 | suspend_stats.fail++; | ||
297 | dpm_save_failed_errno(error); | ||
298 | } else | ||
299 | suspend_stats.success++; | ||
300 | } | ||
197 | #endif | 301 | #endif |
198 | 302 | ||
199 | Exit: | 303 | Exit: |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 9a00a0a26280..0c4defe6d3b8 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) | |||
50 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) | 50 | #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) |
51 | 51 | ||
52 | /* kernel/power/hibernate.c */ | 52 | /* kernel/power/hibernate.c */ |
53 | extern bool freezer_test_done; | ||
54 | |||
53 | extern int hibernation_snapshot(int platform_mode); | 55 | extern int hibernation_snapshot(int platform_mode); |
54 | extern int hibernation_restore(int platform_mode); | 56 | extern int hibernation_restore(int platform_mode); |
55 | extern int hibernation_platform_enter(void); | 57 | extern int hibernation_platform_enter(void); |
@@ -146,6 +148,7 @@ extern int swsusp_swap_in_use(void); | |||
146 | */ | 148 | */ |
147 | #define SF_PLATFORM_MODE 1 | 149 | #define SF_PLATFORM_MODE 1 |
148 | #define SF_NOCOMPRESS_MODE 2 | 150 | #define SF_NOCOMPRESS_MODE 2 |
151 | #define SF_CRC32_MODE 4 | ||
149 | 152 | ||
150 | /* kernel/power/hibernate.c */ | 153 | /* kernel/power/hibernate.c */ |
151 | extern int swsusp_check(void); | 154 | extern int swsusp_check(void); |
@@ -228,7 +231,8 @@ extern int pm_test_level; | |||
228 | #ifdef CONFIG_SUSPEND_FREEZER | 231 | #ifdef CONFIG_SUSPEND_FREEZER |
229 | static inline int suspend_freeze_processes(void) | 232 | static inline int suspend_freeze_processes(void) |
230 | { | 233 | { |
231 | return freeze_processes(); | 234 | int error = freeze_processes(); |
235 | return error ? : freeze_kernel_threads(); | ||
232 | } | 236 | } |
233 | 237 | ||
234 | static inline void suspend_thaw_processes(void) | 238 | static inline void suspend_thaw_processes(void) |
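suspend_freeze_processes() now freezes user space first and freezable kernel threads second; the "error ? : freeze_kernel_threads()" form is the GNU ternary with the middle operand omitted, meaning "return error if it is nonzero, otherwise go on". An equivalent, more explicit spelling of the same inline helper:

static inline int suspend_freeze_processes(void)
{
        int error = freeze_processes();

        if (error)
                return error;

        return freeze_kernel_threads();
}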
diff --git a/kernel/power/process.c b/kernel/power/process.c index 0cf3a27a6c9d..77274c9ba2f1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -22,16 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | #define TIMEOUT (20 * HZ) | 23 | #define TIMEOUT (20 * HZ) |
24 | 24 | ||
25 | static inline int freezable(struct task_struct * p) | 25 | static int try_to_freeze_tasks(bool user_only) |
26 | { | ||
27 | if ((p == current) || | ||
28 | (p->flags & PF_NOFREEZE) || | ||
29 | (p->exit_state != 0)) | ||
30 | return 0; | ||
31 | return 1; | ||
32 | } | ||
33 | |||
34 | static int try_to_freeze_tasks(bool sig_only) | ||
35 | { | 26 | { |
36 | struct task_struct *g, *p; | 27 | struct task_struct *g, *p; |
37 | unsigned long end_time; | 28 | unsigned long end_time; |
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only) | |||
46 | 37 | ||
47 | end_time = jiffies + TIMEOUT; | 38 | end_time = jiffies + TIMEOUT; |
48 | 39 | ||
49 | if (!sig_only) | 40 | if (!user_only) |
50 | freeze_workqueues_begin(); | 41 | freeze_workqueues_begin(); |
51 | 42 | ||
52 | while (true) { | 43 | while (true) { |
53 | todo = 0; | 44 | todo = 0; |
54 | read_lock(&tasklist_lock); | 45 | read_lock(&tasklist_lock); |
55 | do_each_thread(g, p) { | 46 | do_each_thread(g, p) { |
56 | if (frozen(p) || !freezable(p)) | 47 | if (p == current || !freeze_task(p)) |
57 | continue; | ||
58 | |||
59 | if (!freeze_task(p, sig_only)) | ||
60 | continue; | 48 | continue; |
61 | 49 | ||
62 | /* | 50 | /* |
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only) | |||
77 | } while_each_thread(g, p); | 65 | } while_each_thread(g, p); |
78 | read_unlock(&tasklist_lock); | 66 | read_unlock(&tasklist_lock); |
79 | 67 | ||
80 | if (!sig_only) { | 68 | if (!user_only) { |
81 | wq_busy = freeze_workqueues_busy(); | 69 | wq_busy = freeze_workqueues_busy(); |
82 | todo += wq_busy; | 70 | todo += wq_busy; |
83 | } | 71 | } |
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only) | |||
103 | elapsed_csecs = elapsed_csecs64; | 91 | elapsed_csecs = elapsed_csecs64; |
104 | 92 | ||
105 | if (todo) { | 93 | if (todo) { |
106 | /* This does not unfreeze processes that are already frozen | ||
107 | * (we have slightly ugly calling convention in that respect, | ||
108 | * and caller must call thaw_processes() if something fails), | ||
109 | * but it cleans up leftover PF_FREEZE requests. | ||
110 | */ | ||
111 | printk("\n"); | 94 | printk("\n"); |
112 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " | 95 | printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " |
113 | "(%d tasks refusing to freeze, wq_busy=%d):\n", | 96 | "(%d tasks refusing to freeze, wq_busy=%d):\n", |
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only) | |||
115 | elapsed_csecs / 100, elapsed_csecs % 100, | 98 | elapsed_csecs / 100, elapsed_csecs % 100, |
116 | todo - wq_busy, wq_busy); | 99 | todo - wq_busy, wq_busy); |
117 | 100 | ||
118 | thaw_workqueues(); | ||
119 | |||
120 | read_lock(&tasklist_lock); | 101 | read_lock(&tasklist_lock); |
121 | do_each_thread(g, p) { | 102 | do_each_thread(g, p) { |
122 | task_lock(p); | 103 | if (!wakeup && !freezer_should_skip(p) && |
123 | if (!wakeup && freezing(p) && !freezer_should_skip(p)) | 104 | p != current && freezing(p) && !frozen(p)) |
124 | sched_show_task(p); | 105 | sched_show_task(p); |
125 | cancel_freezing(p); | ||
126 | task_unlock(p); | ||
127 | } while_each_thread(g, p); | 106 | } while_each_thread(g, p); |
128 | read_unlock(&tasklist_lock); | 107 | read_unlock(&tasklist_lock); |
129 | } else { | 108 | } else { |
@@ -135,60 +114,76 @@ static int try_to_freeze_tasks(bool sig_only) | |||
135 | } | 114 | } |
136 | 115 | ||
137 | /** | 116 | /** |
138 | * freeze_processes - tell processes to enter the refrigerator | 117 | * freeze_processes - Signal user space processes to enter the refrigerator. |
118 | * | ||
119 | * On success, returns 0. On failure, -errno and system is fully thawed. | ||
139 | */ | 120 | */ |
140 | int freeze_processes(void) | 121 | int freeze_processes(void) |
141 | { | 122 | { |
142 | int error; | 123 | int error; |
143 | 124 | ||
125 | if (!pm_freezing) | ||
126 | atomic_inc(&system_freezing_cnt); | ||
127 | |||
144 | printk("Freezing user space processes ... "); | 128 | printk("Freezing user space processes ... "); |
129 | pm_freezing = true; | ||
145 | error = try_to_freeze_tasks(true); | 130 | error = try_to_freeze_tasks(true); |
131 | if (!error) { | ||
132 | printk("done."); | ||
133 | oom_killer_disable(); | ||
134 | } | ||
135 | printk("\n"); | ||
136 | BUG_ON(in_atomic()); | ||
137 | |||
146 | if (error) | 138 | if (error) |
147 | goto Exit; | 139 | thaw_processes(); |
148 | printk("done.\n"); | 140 | return error; |
141 | } | ||
142 | |||
143 | /** | ||
144 | * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. | ||
145 | * | ||
146 | * On success, returns 0. On failure, -errno and system is fully thawed. | ||
147 | */ | ||
148 | int freeze_kernel_threads(void) | ||
149 | { | ||
150 | int error; | ||
149 | 151 | ||
150 | printk("Freezing remaining freezable tasks ... "); | 152 | printk("Freezing remaining freezable tasks ... "); |
153 | pm_nosig_freezing = true; | ||
151 | error = try_to_freeze_tasks(false); | 154 | error = try_to_freeze_tasks(false); |
152 | if (error) | 155 | if (!error) |
153 | goto Exit; | 156 | printk("done."); |
154 | printk("done."); | ||
155 | 157 | ||
156 | oom_killer_disable(); | ||
157 | Exit: | ||
158 | BUG_ON(in_atomic()); | ||
159 | printk("\n"); | 158 | printk("\n"); |
159 | BUG_ON(in_atomic()); | ||
160 | 160 | ||
161 | if (error) | ||
162 | thaw_processes(); | ||
161 | return error; | 163 | return error; |
162 | } | 164 | } |
163 | 165 | ||
164 | static void thaw_tasks(bool nosig_only) | 166 | void thaw_processes(void) |
165 | { | 167 | { |
166 | struct task_struct *g, *p; | 168 | struct task_struct *g, *p; |
167 | 169 | ||
168 | read_lock(&tasklist_lock); | 170 | if (pm_freezing) |
169 | do_each_thread(g, p) { | 171 | atomic_dec(&system_freezing_cnt); |
170 | if (!freezable(p)) | 172 | pm_freezing = false; |
171 | continue; | 173 | pm_nosig_freezing = false; |
172 | 174 | ||
173 | if (nosig_only && should_send_signal(p)) | 175 | oom_killer_enable(); |
174 | continue; | ||
175 | 176 | ||
176 | if (cgroup_freezing_or_frozen(p)) | 177 | printk("Restarting tasks ... "); |
177 | continue; | ||
178 | 178 | ||
179 | thaw_process(p); | 179 | thaw_workqueues(); |
180 | |||
181 | read_lock(&tasklist_lock); | ||
182 | do_each_thread(g, p) { | ||
183 | __thaw_task(p); | ||
180 | } while_each_thread(g, p); | 184 | } while_each_thread(g, p); |
181 | read_unlock(&tasklist_lock); | 185 | read_unlock(&tasklist_lock); |
182 | } | ||
183 | |||
184 | void thaw_processes(void) | ||
185 | { | ||
186 | oom_killer_enable(); | ||
187 | 186 | ||
188 | printk("Restarting tasks ... "); | ||
189 | thaw_workqueues(); | ||
190 | thaw_tasks(true); | ||
191 | thaw_tasks(false); | ||
192 | schedule(); | 187 | schedule(); |
193 | printk("done.\n"); | 188 | printk("done.\n"); |
194 | } | 189 | } |
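The rewritten freezer core folds the old freezable()/cancel_freezing()/thaw_process() helpers into freeze_task()/__thaw_task() and tracks global state in pm_freezing, pm_nosig_freezing and system_freezing_cnt. The consumer-side pattern is unchanged: a kernel thread opts in with set_freezable() and parks itself in try_to_freeze() whenever the freezer is active. A minimal sketch using those long-standing <linux/freezer.h> helpers, none of which appear in the hunks above; the work loop is hypothetical:

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_thread(void *unused)
{
        set_freezable();        /* clear PF_NOFREEZE so this kthread can be frozen */

        while (!kthread_should_stop()) {
                try_to_freeze();        /* enter the refrigerator while freezing is in force */
                /* ... one unit of work (hypothetical) ... */
                schedule_timeout_interruptible(HZ);
        }
        return 0;
}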
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c index 37f05d0f0793..995e3bd3417b 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/power/qos.c | |||
@@ -29,7 +29,7 @@ | |||
29 | 29 | ||
30 | /*#define DEBUG*/ | 30 | /*#define DEBUG*/ |
31 | 31 | ||
32 | #include <linux/pm_qos_params.h> | 32 | #include <linux/pm_qos.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/spinlock.h> | 34 | #include <linux/spinlock.h> |
35 | #include <linux/slab.h> | 35 | #include <linux/slab.h> |
@@ -43,64 +43,61 @@ | |||
43 | #include <linux/kernel.h> | 43 | #include <linux/kernel.h> |
44 | 44 | ||
45 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
46 | #include <linux/export.h> | ||
46 | 47 | ||
47 | /* | 48 | /* |
48 | * locking rule: all changes to requests or notifiers lists | 49 | * locking rule: all changes to constraints or notifiers lists |
49 | * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock | 50 | * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock |
50 | * held, taken with _irqsave. One lock to rule them all | 51 | * held, taken with _irqsave. One lock to rule them all |
51 | */ | 52 | */ |
52 | enum pm_qos_type { | ||
53 | PM_QOS_MAX, /* return the largest value */ | ||
54 | PM_QOS_MIN /* return the smallest value */ | ||
55 | }; | ||
56 | |||
57 | /* | ||
58 | * Note: The lockless read path depends on the CPU accessing | ||
59 | * target_value atomically. Atomic access is only guaranteed on all CPU | ||
60 | * types linux supports for 32 bit quantites | ||
61 | */ | ||
62 | struct pm_qos_object { | 53 | struct pm_qos_object { |
63 | struct plist_head requests; | 54 | struct pm_qos_constraints *constraints; |
64 | struct blocking_notifier_head *notifiers; | ||
65 | struct miscdevice pm_qos_power_miscdev; | 55 | struct miscdevice pm_qos_power_miscdev; |
66 | char *name; | 56 | char *name; |
67 | s32 target_value; /* Do not change to 64 bit */ | ||
68 | s32 default_value; | ||
69 | enum pm_qos_type type; | ||
70 | }; | 57 | }; |
71 | 58 | ||
72 | static DEFINE_SPINLOCK(pm_qos_lock); | 59 | static DEFINE_SPINLOCK(pm_qos_lock); |
73 | 60 | ||
74 | static struct pm_qos_object null_pm_qos; | 61 | static struct pm_qos_object null_pm_qos; |
62 | |||
75 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); | 63 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); |
76 | static struct pm_qos_object cpu_dma_pm_qos = { | 64 | static struct pm_qos_constraints cpu_dma_constraints = { |
77 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), | 65 | .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), |
78 | .notifiers = &cpu_dma_lat_notifier, | ||
79 | .name = "cpu_dma_latency", | ||
80 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | 66 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
81 | .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | 67 | .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
82 | .type = PM_QOS_MIN, | 68 | .type = PM_QOS_MIN, |
69 | .notifiers = &cpu_dma_lat_notifier, | ||
70 | }; | ||
71 | static struct pm_qos_object cpu_dma_pm_qos = { | ||
72 | .constraints = &cpu_dma_constraints, | ||
73 | .name = "cpu_dma_latency", | ||
83 | }; | 74 | }; |
84 | 75 | ||
85 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); | 76 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); |
86 | static struct pm_qos_object network_lat_pm_qos = { | 77 | static struct pm_qos_constraints network_lat_constraints = { |
87 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), | 78 | .list = PLIST_HEAD_INIT(network_lat_constraints.list), |
88 | .notifiers = &network_lat_notifier, | ||
89 | .name = "network_latency", | ||
90 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | 79 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
91 | .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | 80 | .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
92 | .type = PM_QOS_MIN | 81 | .type = PM_QOS_MIN, |
82 | .notifiers = &network_lat_notifier, | ||
83 | }; | ||
84 | static struct pm_qos_object network_lat_pm_qos = { | ||
85 | .constraints = &network_lat_constraints, | ||
86 | .name = "network_latency", | ||
93 | }; | 87 | }; |
94 | 88 | ||
95 | 89 | ||
96 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); | 90 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); |
97 | static struct pm_qos_object network_throughput_pm_qos = { | 91 | static struct pm_qos_constraints network_tput_constraints = { |
98 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), | 92 | .list = PLIST_HEAD_INIT(network_tput_constraints.list), |
99 | .notifiers = &network_throughput_notifier, | ||
100 | .name = "network_throughput", | ||
101 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | 93 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
102 | .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | 94 | .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
103 | .type = PM_QOS_MAX, | 95 | .type = PM_QOS_MAX, |
96 | .notifiers = &network_throughput_notifier, | ||
97 | }; | ||
98 | static struct pm_qos_object network_throughput_pm_qos = { | ||
99 | .constraints = &network_tput_constraints, | ||
100 | .name = "network_throughput", | ||
104 | }; | 101 | }; |
105 | 102 | ||
106 | 103 | ||
@@ -127,17 +124,17 @@ static const struct file_operations pm_qos_power_fops = { | |||
127 | }; | 124 | }; |
128 | 125 | ||
129 | /* unlocked internal variant */ | 126 | /* unlocked internal variant */ |
130 | static inline int pm_qos_get_value(struct pm_qos_object *o) | 127 | static inline int pm_qos_get_value(struct pm_qos_constraints *c) |
131 | { | 128 | { |
132 | if (plist_head_empty(&o->requests)) | 129 | if (plist_head_empty(&c->list)) |
133 | return o->default_value; | 130 | return c->default_value; |
134 | 131 | ||
135 | switch (o->type) { | 132 | switch (c->type) { |
136 | case PM_QOS_MIN: | 133 | case PM_QOS_MIN: |
137 | return plist_first(&o->requests)->prio; | 134 | return plist_first(&c->list)->prio; |
138 | 135 | ||
139 | case PM_QOS_MAX: | 136 | case PM_QOS_MAX: |
140 | return plist_last(&o->requests)->prio; | 137 | return plist_last(&c->list)->prio; |
141 | 138 | ||
142 | default: | 139 | default: |
143 | /* runtime check for not using enum */ | 140 | /* runtime check for not using enum */ |
@@ -145,69 +142,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
145 | } | 142 | } |
146 | } | 143 | } |
147 | 144 | ||
148 | static inline s32 pm_qos_read_value(struct pm_qos_object *o) | 145 | s32 pm_qos_read_value(struct pm_qos_constraints *c) |
149 | { | 146 | { |
150 | return o->target_value; | 147 | return c->target_value; |
151 | } | 148 | } |
152 | 149 | ||
153 | static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) | 150 | static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) |
154 | { | 151 | { |
155 | o->target_value = value; | 152 | c->target_value = value; |
156 | } | 153 | } |
157 | 154 | ||
158 | static void update_target(struct pm_qos_object *o, struct plist_node *node, | 155 | /** |
159 | int del, int value) | 156 | * pm_qos_update_target - manages the constraints list and calls the notifiers |
157 | * if needed | ||
158 | * @c: constraints data struct | ||
159 | * @node: request to add to the list, to update or to remove | ||
160 | * @action: action to take on the constraints list | ||
161 | * @value: value of the request to add or update | ||
162 | * | ||
163 | * This function returns 1 if the aggregated constraint value has changed, 0 | ||
164 | * otherwise. | ||
165 | */ | ||
166 | int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, | ||
167 | enum pm_qos_req_action action, int value) | ||
160 | { | 168 | { |
161 | unsigned long flags; | 169 | unsigned long flags; |
162 | int prev_value, curr_value; | 170 | int prev_value, curr_value, new_value; |
163 | 171 | ||
164 | spin_lock_irqsave(&pm_qos_lock, flags); | 172 | spin_lock_irqsave(&pm_qos_lock, flags); |
165 | prev_value = pm_qos_get_value(o); | 173 | prev_value = pm_qos_get_value(c); |
166 | /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ | 174 | if (value == PM_QOS_DEFAULT_VALUE) |
167 | if (value != PM_QOS_DEFAULT_VALUE) { | 175 | new_value = c->default_value; |
176 | else | ||
177 | new_value = value; | ||
178 | |||
179 | switch (action) { | ||
180 | case PM_QOS_REMOVE_REQ: | ||
181 | plist_del(node, &c->list); | ||
182 | break; | ||
183 | case PM_QOS_UPDATE_REQ: | ||
168 | /* | 184 | /* |
169 | * to change the list, we atomically remove, reinit | 185 | * to change the list, we atomically remove, reinit |
170 | * with new value and add, then see if the extremal | 186 | * with new value and add, then see if the extremal |
171 | * changed | 187 | * changed |
172 | */ | 188 | */ |
173 | plist_del(node, &o->requests); | 189 | plist_del(node, &c->list); |
174 | plist_node_init(node, value); | 190 | case PM_QOS_ADD_REQ: |
175 | plist_add(node, &o->requests); | 191 | plist_node_init(node, new_value); |
176 | } else if (del) { | 192 | plist_add(node, &c->list); |
177 | plist_del(node, &o->requests); | 193 | break; |
178 | } else { | 194 | default: |
179 | plist_add(node, &o->requests); | 195 | /* no action */ |
196 | ; | ||
180 | } | 197 | } |
181 | curr_value = pm_qos_get_value(o); | 198 | |
182 | pm_qos_set_value(o, curr_value); | 199 | curr_value = pm_qos_get_value(c); |
200 | pm_qos_set_value(c, curr_value); | ||
201 | |||
183 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 202 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
184 | 203 | ||
185 | if (prev_value != curr_value) | 204 | if (prev_value != curr_value) { |
186 | blocking_notifier_call_chain(o->notifiers, | 205 | blocking_notifier_call_chain(c->notifiers, |
187 | (unsigned long)curr_value, | 206 | (unsigned long)curr_value, |
188 | NULL); | 207 | NULL); |
189 | } | 208 | return 1; |
190 | 209 | } else { | |
191 | static int register_pm_qos_misc(struct pm_qos_object *qos) | 210 | return 0; |
192 | { | ||
193 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; | ||
194 | qos->pm_qos_power_miscdev.name = qos->name; | ||
195 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; | ||
196 | |||
197 | return misc_register(&qos->pm_qos_power_miscdev); | ||
198 | } | ||
199 | |||
200 | static int find_pm_qos_object_by_minor(int minor) | ||
201 | { | ||
202 | int pm_qos_class; | ||
203 | |||
204 | for (pm_qos_class = 0; | ||
205 | pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { | ||
206 | if (minor == | ||
207 | pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) | ||
208 | return pm_qos_class; | ||
209 | } | 211 | } |
210 | return -1; | ||
211 | } | 212 | } |
212 | 213 | ||
213 | /** | 214 | /** |
@@ -218,11 +219,11 @@ static int find_pm_qos_object_by_minor(int minor) | |||
218 | */ | 219 | */ |
219 | int pm_qos_request(int pm_qos_class) | 220 | int pm_qos_request(int pm_qos_class) |
220 | { | 221 | { |
221 | return pm_qos_read_value(pm_qos_array[pm_qos_class]); | 222 | return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); |
222 | } | 223 | } |
223 | EXPORT_SYMBOL_GPL(pm_qos_request); | 224 | EXPORT_SYMBOL_GPL(pm_qos_request); |
224 | 225 | ||
225 | int pm_qos_request_active(struct pm_qos_request_list *req) | 226 | int pm_qos_request_active(struct pm_qos_request *req) |
226 | { | 227 | { |
227 | return req->pm_qos_class != 0; | 228 | return req->pm_qos_class != 0; |
228 | } | 229 | } |
@@ -230,40 +231,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); | |||
230 | 231 | ||
231 | /** | 232 | /** |
232 | * pm_qos_add_request - inserts new qos request into the list | 233 | * pm_qos_add_request - inserts new qos request into the list |
233 | * @dep: pointer to a preallocated handle | 234 | * @req: pointer to a preallocated handle |
234 | * @pm_qos_class: identifies which list of qos request to use | 235 | * @pm_qos_class: identifies which list of qos request to use |
235 | * @value: defines the qos request | 236 | * @value: defines the qos request |
236 | * | 237 | * |
237 | * This function inserts a new entry in the pm_qos_class list of requested qos | 238 | * This function inserts a new entry in the pm_qos_class list of requested qos |
238 | * performance characteristics. It recomputes the aggregate QoS expectations | 239 | * performance characteristics. It recomputes the aggregate QoS expectations |
239 | * for the pm_qos_class of parameters and initializes the pm_qos_request_list | 240 | * for the pm_qos_class of parameters and initializes the pm_qos_request |
240 | * handle. Caller needs to save this handle for later use in updates and | 241 | * handle. Caller needs to save this handle for later use in updates and |
241 | * removal. | 242 | * removal. |
242 | */ | 243 | */ |
243 | 244 | ||
244 | void pm_qos_add_request(struct pm_qos_request_list *dep, | 245 | void pm_qos_add_request(struct pm_qos_request *req, |
245 | int pm_qos_class, s32 value) | 246 | int pm_qos_class, s32 value) |
246 | { | 247 | { |
247 | struct pm_qos_object *o = pm_qos_array[pm_qos_class]; | 248 | if (!req) /*guard against callers passing in null */ |
248 | int new_value; | 249 | return; |
249 | 250 | ||
250 | if (pm_qos_request_active(dep)) { | 251 | if (pm_qos_request_active(req)) { |
251 | WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); | 252 | WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); |
252 | return; | 253 | return; |
253 | } | 254 | } |
254 | if (value == PM_QOS_DEFAULT_VALUE) | 255 | req->pm_qos_class = pm_qos_class; |
255 | new_value = o->default_value; | 256 | pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, |
256 | else | 257 | &req->node, PM_QOS_ADD_REQ, value); |
257 | new_value = value; | ||
258 | plist_node_init(&dep->list, new_value); | ||
259 | dep->pm_qos_class = pm_qos_class; | ||
260 | update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); | ||
261 | } | 258 | } |
262 | EXPORT_SYMBOL_GPL(pm_qos_add_request); | 259 | EXPORT_SYMBOL_GPL(pm_qos_add_request); |
263 | 260 | ||
264 | /** | 261 | /** |
265 | * pm_qos_update_request - modifies an existing qos request | 262 | * pm_qos_update_request - modifies an existing qos request |
266 | * @pm_qos_req : handle to list element holding a pm_qos request to use | 263 | * @req : handle to list element holding a pm_qos request to use |
267 | * @value: defines the qos request | 264 | * @value: defines the qos request |
268 | * | 265 | * |
269 | * Updates an existing qos request for the pm_qos_class of parameters along | 266 | * Updates an existing qos request for the pm_qos_class of parameters along |
@@ -271,56 +268,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); | |||
271 | * | 268 | * |
272 | * Attempts are made to make this code callable on hot code paths. | 269 | * Attempts are made to make this code callable on hot code paths. |
273 | */ | 270 | */ |
274 | void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, | 271 | void pm_qos_update_request(struct pm_qos_request *req, |
275 | s32 new_value) | 272 | s32 new_value) |
276 | { | 273 | { |
277 | s32 temp; | 274 | if (!req) /*guard against callers passing in null */ |
278 | struct pm_qos_object *o; | ||
279 | |||
280 | if (!pm_qos_req) /*guard against callers passing in null */ | ||
281 | return; | 275 | return; |
282 | 276 | ||
283 | if (!pm_qos_request_active(pm_qos_req)) { | 277 | if (!pm_qos_request_active(req)) { |
284 | WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); | 278 | WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); |
285 | return; | 279 | return; |
286 | } | 280 | } |
287 | 281 | ||
288 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | 282 | if (new_value != req->node.prio) |
289 | 283 | pm_qos_update_target( | |
290 | if (new_value == PM_QOS_DEFAULT_VALUE) | 284 | pm_qos_array[req->pm_qos_class]->constraints, |
291 | temp = o->default_value; | 285 | &req->node, PM_QOS_UPDATE_REQ, new_value); |
292 | else | ||
293 | temp = new_value; | ||
294 | |||
295 | if (temp != pm_qos_req->list.prio) | ||
296 | update_target(o, &pm_qos_req->list, 0, temp); | ||
297 | } | 286 | } |
298 | EXPORT_SYMBOL_GPL(pm_qos_update_request); | 287 | EXPORT_SYMBOL_GPL(pm_qos_update_request); |
299 | 288 | ||
300 | /** | 289 | /** |
301 | * pm_qos_remove_request - modifies an existing qos request | 290 | * pm_qos_remove_request - modifies an existing qos request |
302 | * @pm_qos_req: handle to request list element | 291 | * @req: handle to request list element |
303 | * | 292 | * |
304 | * Will remove pm qos request from the list of requests and | 293 | * Will remove pm qos request from the list of constraints and |
305 | * recompute the current target value for the pm_qos_class. Call this | 294 | * recompute the current target value for the pm_qos_class. Call this |
306 | * on slow code paths. | 295 | * on slow code paths. |
307 | */ | 296 | */ |
308 | void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) | 297 | void pm_qos_remove_request(struct pm_qos_request *req) |
309 | { | 298 | { |
310 | struct pm_qos_object *o; | 299 | if (!req) /*guard against callers passing in null */ |
311 | |||
312 | if (pm_qos_req == NULL) | ||
313 | return; | 300 | return; |
314 | /* silent return to keep pcm code cleaner */ | 301 | /* silent return to keep pcm code cleaner */ |
315 | 302 | ||
316 | if (!pm_qos_request_active(pm_qos_req)) { | 303 | if (!pm_qos_request_active(req)) { |
317 | WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); | 304 | WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); |
318 | return; | 305 | return; |
319 | } | 306 | } |
320 | 307 | ||
321 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | 308 | pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, |
322 | update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); | 309 | &req->node, PM_QOS_REMOVE_REQ, |
323 | memset(pm_qos_req, 0, sizeof(*pm_qos_req)); | 310 | PM_QOS_DEFAULT_VALUE); |
311 | memset(req, 0, sizeof(*req)); | ||
324 | } | 312 | } |
325 | EXPORT_SYMBOL_GPL(pm_qos_remove_request); | 313 | EXPORT_SYMBOL_GPL(pm_qos_remove_request); |
326 | 314 | ||
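The hunks above rename the request handle from struct pm_qos_request_list to struct pm_qos_request and route add/update/remove through pm_qos_update_target() on the per-class constraints object. A minimal driver-side sketch of the reworked API follows; the request name and the 20-usec value are illustrative, not taken from this patch, and the header name assumes the linux/pm_qos.h introduced by this series.

	#include <linux/pm_qos.h>

	static struct pm_qos_request demo_req;	/* hypothetical driver-private handle */

	static void demo_enter_latency_critical_mode(void)
	{
		/* ask for at most 20 usec of CPU/DMA wakeup latency */
		pm_qos_add_request(&demo_req, PM_QOS_CPU_DMA_LATENCY, 20);
	}

	static void demo_relax(void)
	{
		/* fall back to the class default without dropping the request */
		pm_qos_update_request(&demo_req, PM_QOS_DEFAULT_VALUE);
	}

	static void demo_exit(void)
	{
		/* removal recomputes the aggregate target and clears the handle */
		pm_qos_remove_request(&demo_req);
	}

The handle must stay allocated for as long as the constraint is registered, since the plist node lives inside it.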
@@ -337,7 +325,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) | |||
337 | int retval; | 325 | int retval; |
338 | 326 | ||
339 | retval = blocking_notifier_chain_register( | 327 | retval = blocking_notifier_chain_register( |
340 | pm_qos_array[pm_qos_class]->notifiers, notifier); | 328 | pm_qos_array[pm_qos_class]->constraints->notifiers, |
329 | notifier); | ||
341 | 330 | ||
342 | return retval; | 331 | return retval; |
343 | } | 332 | } |
@@ -356,34 +345,57 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) | |||
356 | int retval; | 345 | int retval; |
357 | 346 | ||
358 | retval = blocking_notifier_chain_unregister( | 347 | retval = blocking_notifier_chain_unregister( |
359 | pm_qos_array[pm_qos_class]->notifiers, notifier); | 348 | pm_qos_array[pm_qos_class]->constraints->notifiers, |
349 | notifier); | ||
360 | 350 | ||
361 | return retval; | 351 | return retval; |
362 | } | 352 | } |
363 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); | 353 | EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); |
364 | 354 | ||
355 | /* User space interface to PM QoS classes via misc devices */ | ||
356 | static int register_pm_qos_misc(struct pm_qos_object *qos) | ||
357 | { | ||
358 | qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; | ||
359 | qos->pm_qos_power_miscdev.name = qos->name; | ||
360 | qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; | ||
361 | |||
362 | return misc_register(&qos->pm_qos_power_miscdev); | ||
363 | } | ||
364 | |||
365 | static int find_pm_qos_object_by_minor(int minor) | ||
366 | { | ||
367 | int pm_qos_class; | ||
368 | |||
369 | for (pm_qos_class = 0; | ||
370 | pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { | ||
371 | if (minor == | ||
372 | pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) | ||
373 | return pm_qos_class; | ||
374 | } | ||
375 | return -1; | ||
376 | } | ||
377 | |||
365 | static int pm_qos_power_open(struct inode *inode, struct file *filp) | 378 | static int pm_qos_power_open(struct inode *inode, struct file *filp) |
366 | { | 379 | { |
367 | long pm_qos_class; | 380 | long pm_qos_class; |
368 | 381 | ||
369 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); | 382 | pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); |
370 | if (pm_qos_class >= 0) { | 383 | if (pm_qos_class >= 0) { |
371 | struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); | 384 | struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); |
372 | if (!req) | 385 | if (!req) |
373 | return -ENOMEM; | 386 | return -ENOMEM; |
374 | 387 | ||
375 | pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); | 388 | pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); |
376 | filp->private_data = req; | 389 | filp->private_data = req; |
377 | 390 | ||
378 | if (filp->private_data) | 391 | return 0; |
379 | return 0; | ||
380 | } | 392 | } |
381 | return -EPERM; | 393 | return -EPERM; |
382 | } | 394 | } |
383 | 395 | ||
384 | static int pm_qos_power_release(struct inode *inode, struct file *filp) | 396 | static int pm_qos_power_release(struct inode *inode, struct file *filp) |
385 | { | 397 | { |
386 | struct pm_qos_request_list *req; | 398 | struct pm_qos_request *req; |
387 | 399 | ||
388 | req = filp->private_data; | 400 | req = filp->private_data; |
389 | pm_qos_remove_request(req); | 401 | pm_qos_remove_request(req); |
@@ -398,17 +410,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | |||
398 | { | 410 | { |
399 | s32 value; | 411 | s32 value; |
400 | unsigned long flags; | 412 | unsigned long flags; |
401 | struct pm_qos_object *o; | 413 | struct pm_qos_request *req = filp->private_data; |
402 | struct pm_qos_request_list *pm_qos_req = filp->private_data; | ||
403 | 414 | ||
404 | if (!pm_qos_req) | 415 | if (!req) |
405 | return -EINVAL; | 416 | return -EINVAL; |
406 | if (!pm_qos_request_active(pm_qos_req)) | 417 | if (!pm_qos_request_active(req)) |
407 | return -EINVAL; | 418 | return -EINVAL; |
408 | 419 | ||
409 | o = pm_qos_array[pm_qos_req->pm_qos_class]; | ||
410 | spin_lock_irqsave(&pm_qos_lock, flags); | 420 | spin_lock_irqsave(&pm_qos_lock, flags); |
411 | value = pm_qos_get_value(o); | 421 | value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); |
412 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 422 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
413 | 423 | ||
414 | return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); | 424 | return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); |
@@ -418,7 +428,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
418 | size_t count, loff_t *f_pos) | 428 | size_t count, loff_t *f_pos) |
419 | { | 429 | { |
420 | s32 value; | 430 | s32 value; |
421 | struct pm_qos_request_list *pm_qos_req; | 431 | struct pm_qos_request *req; |
422 | 432 | ||
423 | if (count == sizeof(s32)) { | 433 | if (count == sizeof(s32)) { |
424 | if (copy_from_user(&value, buf, sizeof(s32))) | 434 | if (copy_from_user(&value, buf, sizeof(s32))) |
@@ -449,8 +459,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
449 | return -EINVAL; | 459 | return -EINVAL; |
450 | } | 460 | } |
451 | 461 | ||
452 | pm_qos_req = filp->private_data; | 462 | req = filp->private_data; |
453 | pm_qos_update_request(pm_qos_req, value); | 463 | pm_qos_update_request(req, value); |
454 | 464 | ||
455 | return count; | 465 | return count; |
456 | } | 466 | } |
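For completeness, the misc-device path shown above (pm_qos_power_open/write/release) is what a user-space client exercises: opening the device adds a request at PM_QOS_DEFAULT_VALUE, writing a binary s32 updates it, and closing the file drops it. A hedged sketch against the conventional /dev/cpu_dma_latency node (the device name comes from the class name, not from these hunks):

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	/* Returns an open fd that must stay open while the constraint is needed. */
	int hold_cpu_dma_latency(int32_t usec)
	{
		int fd = open("/dev/cpu_dma_latency", O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, &usec, sizeof(usec)) != sizeof(usec)) {
			close(fd);	/* release drops the request again */
			return -1;
		}
		return fd;
	}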
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 06efa54f93d6..1cf88900ec4f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) | |||
858 | PageReserved(page)) | 858 | PageReserved(page)) |
859 | return NULL; | 859 | return NULL; |
860 | 860 | ||
861 | if (page_is_guard(page)) | ||
862 | return NULL; | ||
863 | |||
861 | return page; | 864 | return page; |
862 | } | 865 | } |
863 | 866 | ||
@@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) | |||
920 | && (!kernel_page_present(page) || pfn_is_nosave(pfn))) | 923 | && (!kernel_page_present(page) || pfn_is_nosave(pfn))) |
921 | return NULL; | 924 | return NULL; |
922 | 925 | ||
926 | if (page_is_guard(page)) | ||
927 | return NULL; | ||
928 | |||
923 | return page; | 929 | return page; |
924 | } | 930 | } |
925 | 931 | ||
@@ -1339,6 +1345,9 @@ int hibernate_preallocate_memory(void) | |||
1339 | count += highmem; | 1345 | count += highmem; |
1340 | count -= totalreserve_pages; | 1346 | count -= totalreserve_pages; |
1341 | 1347 | ||
1348 | /* Add number of pages required for page keys (s390 only). */ | ||
1349 | size += page_key_additional_pages(saveable); | ||
1350 | |||
1342 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1351 | /* Compute the maximum number of saveable pages to leave in memory. */ |
1343 | max_size = (count - (size + PAGES_FOR_IO)) / 2 | 1352 | max_size = (count - (size + PAGES_FOR_IO)) / 2 |
1344 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); | 1353 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); |
@@ -1662,6 +1671,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
1662 | buf[j] = memory_bm_next_pfn(bm); | 1671 | buf[j] = memory_bm_next_pfn(bm); |
1663 | if (unlikely(buf[j] == BM_END_OF_MAP)) | 1672 | if (unlikely(buf[j] == BM_END_OF_MAP)) |
1664 | break; | 1673 | break; |
1674 | /* Save page key for data page (s390 only). */ | ||
1675 | page_key_read(buf + j); | ||
1665 | } | 1676 | } |
1666 | } | 1677 | } |
1667 | 1678 | ||
@@ -1821,6 +1832,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
1821 | if (unlikely(buf[j] == BM_END_OF_MAP)) | 1832 | if (unlikely(buf[j] == BM_END_OF_MAP)) |
1822 | break; | 1833 | break; |
1823 | 1834 | ||
1835 | /* Extract and buffer page key for data page (s390 only). */ | ||
1836 | page_key_memorize(buf + j); | ||
1837 | |||
1824 | if (memory_bm_pfn_present(bm, buf[j])) | 1838 | if (memory_bm_pfn_present(bm, buf[j])) |
1825 | memory_bm_set_bit(bm, buf[j]); | 1839 | memory_bm_set_bit(bm, buf[j]); |
1826 | else | 1840 | else |
@@ -2223,6 +2237,11 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2223 | if (error) | 2237 | if (error) |
2224 | return error; | 2238 | return error; |
2225 | 2239 | ||
2240 | /* Allocate buffer for page keys. */ | ||
2241 | error = page_key_alloc(nr_copy_pages); | ||
2242 | if (error) | ||
2243 | return error; | ||
2244 | |||
2226 | } else if (handle->cur <= nr_meta_pages + 1) { | 2245 | } else if (handle->cur <= nr_meta_pages + 1) { |
2227 | error = unpack_orig_pfns(buffer, ©_bm); | 2246 | error = unpack_orig_pfns(buffer, ©_bm); |
2228 | if (error) | 2247 | if (error) |
@@ -2243,6 +2262,8 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2243 | } | 2262 | } |
2244 | } else { | 2263 | } else { |
2245 | copy_last_highmem_page(); | 2264 | copy_last_highmem_page(); |
2265 | /* Restore page key for data page (s390 only). */ | ||
2266 | page_key_write(handle->buffer); | ||
2246 | handle->buffer = get_buffer(&orig_bm, &ca); | 2267 | handle->buffer = get_buffer(&orig_bm, &ca); |
2247 | if (IS_ERR(handle->buffer)) | 2268 | if (IS_ERR(handle->buffer)) |
2248 | return PTR_ERR(handle->buffer); | 2269 | return PTR_ERR(handle->buffer); |
@@ -2264,6 +2285,9 @@ int snapshot_write_next(struct snapshot_handle *handle) | |||
2264 | void snapshot_write_finalize(struct snapshot_handle *handle) | 2285 | void snapshot_write_finalize(struct snapshot_handle *handle) |
2265 | { | 2286 | { |
2266 | copy_last_highmem_page(); | 2287 | copy_last_highmem_page(); |
2288 | /* Restore page key for data page (s390 only). */ | ||
2289 | page_key_write(handle->buffer); | ||
2290 | page_key_free(); | ||
2267 | /* Free only if we have loaded the image entirely */ | 2291 | /* Free only if we have loaded the image entirely */ |
2268 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { | 2292 | if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { |
2269 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | 2293 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); |
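The page_key_* calls added above are only meaningful on s390, which keeps a storage key per page; on other architectures they are expected to compile away to no-ops. A sketch of the stub side, assuming the usual config guard (the exact symbol and header used by this series may differ):

	#ifndef CONFIG_ARCH_SAVE_PAGE_KEYS
	static inline unsigned long page_key_additional_pages(unsigned long pages)
	{
		return 0;
	}
	static inline int  page_key_alloc(unsigned long pages)		{ return 0; }
	static inline void page_key_free(void)				{}
	static inline void page_key_read(unsigned long *pfn)		{}
	static inline void page_key_memorize(unsigned long *pfn)	{}
	static inline void page_key_write(void *address)		{}
	#endif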
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b6b71ad2208f..4fd51beed879 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/delay.h> | 12 | #include <linux/delay.h> |
13 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/kmod.h> | ||
15 | #include <linux/console.h> | 16 | #include <linux/console.h> |
16 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
17 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
@@ -21,6 +22,7 @@ | |||
21 | #include <linux/list.h> | 22 | #include <linux/list.h> |
22 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/export.h> | ||
24 | #include <linux/suspend.h> | 26 | #include <linux/suspend.h> |
25 | #include <linux/syscore_ops.h> | 27 | #include <linux/syscore_ops.h> |
26 | #include <trace/events/power.h> | 28 | #include <trace/events/power.h> |
@@ -40,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops; | |||
40 | */ | 42 | */ |
41 | void suspend_set_ops(const struct platform_suspend_ops *ops) | 43 | void suspend_set_ops(const struct platform_suspend_ops *ops) |
42 | { | 44 | { |
43 | mutex_lock(&pm_mutex); | 45 | lock_system_sleep(); |
44 | suspend_ops = ops; | 46 | suspend_ops = ops; |
45 | mutex_unlock(&pm_mutex); | 47 | unlock_system_sleep(); |
46 | } | 48 | } |
47 | EXPORT_SYMBOL_GPL(suspend_set_ops); | 49 | EXPORT_SYMBOL_GPL(suspend_set_ops); |
48 | 50 | ||
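suspend_set_ops() now takes the system-sleep lock through lock_system_sleep() rather than grabbing pm_mutex directly. A simplified sketch of what the helper boils down to; the exact definition has varied across releases, and later kernels also flag the caller as skippable by the task freezer to avoid freezing deadlocks:

	/* simplified sketch, not the literal definition in this series */
	static inline void lock_system_sleep(void)
	{
		mutex_lock(&pm_mutex);
	}

	static inline void unlock_system_sleep(void)
	{
		mutex_unlock(&pm_mutex);
	}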
@@ -107,7 +109,8 @@ static int suspend_prepare(void) | |||
107 | if (!error) | 109 | if (!error) |
108 | return 0; | 110 | return 0; |
109 | 111 | ||
110 | suspend_thaw_processes(); | 112 | suspend_stats.failed_freeze++; |
113 | dpm_save_failed_step(SUSPEND_FREEZE); | ||
111 | usermodehelper_enable(); | 114 | usermodehelper_enable(); |
112 | Finish: | 115 | Finish: |
113 | pm_notifier_call_chain(PM_POST_SUSPEND); | 116 | pm_notifier_call_chain(PM_POST_SUSPEND); |
@@ -315,8 +318,16 @@ int enter_state(suspend_state_t state) | |||
315 | */ | 318 | */ |
316 | int pm_suspend(suspend_state_t state) | 319 | int pm_suspend(suspend_state_t state) |
317 | { | 320 | { |
318 | if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) | 321 | int ret; |
319 | return enter_state(state); | 322 | if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { |
323 | ret = enter_state(state); | ||
324 | if (ret) { | ||
325 | suspend_stats.fail++; | ||
326 | dpm_save_failed_errno(ret); | ||
327 | } else | ||
328 | suspend_stats.success++; | ||
329 | return ret; | ||
330 | } | ||
320 | return -EINVAL; | 331 | return -EINVAL; |
321 | } | 332 | } |
322 | EXPORT_SYMBOL(pm_suspend); | 333 | EXPORT_SYMBOL(pm_suspend); |
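pm_suspend() also tightens its range check (a state equal to PM_SUSPEND_MAX is now rejected) and feeds suspend_stats. An illustrative in-kernel caller; the function name is hypothetical:

	#include <linux/suspend.h>

	static int demo_trigger_mem_sleep(void)
	{
		int error = pm_suspend(PM_SUSPEND_MEM);

		/* on failure, suspend_stats.fail has been bumped and the errno
		 * recorded via dpm_save_failed_errno() for later inspection */
		return error;
	}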
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c97c3a0eee3..3739ecced085 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/bitops.h> | 18 | #include <linux/bitops.h> |
19 | #include <linux/genhd.h> | 19 | #include <linux/genhd.h> |
20 | #include <linux/device.h> | 20 | #include <linux/device.h> |
21 | #include <linux/buffer_head.h> | ||
22 | #include <linux/bio.h> | 21 | #include <linux/bio.h> |
23 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
24 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
@@ -27,6 +26,10 @@ | |||
27 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
28 | #include <linux/lzo.h> | 27 | #include <linux/lzo.h> |
29 | #include <linux/vmalloc.h> | 28 | #include <linux/vmalloc.h> |
29 | #include <linux/cpumask.h> | ||
30 | #include <linux/atomic.h> | ||
31 | #include <linux/kthread.h> | ||
32 | #include <linux/crc32.h> | ||
30 | 33 | ||
31 | #include "power.h" | 34 | #include "power.h" |
32 | 35 | ||
@@ -43,8 +46,7 @@ | |||
43 | * allocated and populated one at a time, so we only need one memory | 46 | * allocated and populated one at a time, so we only need one memory |
44 | * page to set up the entire structure. | 47 | * page to set up the entire structure. |
45 | * | 48 | * |
46 | * During resume we also only need to use one swap_map_page structure | 49 | * During resume we pick up all swap_map_page structures into a list. |
47 | * at a time. | ||
48 | */ | 50 | */ |
49 | 51 | ||
50 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) | 52 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) |
@@ -54,6 +56,11 @@ struct swap_map_page { | |||
54 | sector_t next_swap; | 56 | sector_t next_swap; |
55 | }; | 57 | }; |
56 | 58 | ||
59 | struct swap_map_page_list { | ||
60 | struct swap_map_page *map; | ||
61 | struct swap_map_page_list *next; | ||
62 | }; | ||
63 | |||
57 | /** | 64 | /** |
58 | * The swap_map_handle structure is used for handling swap in | 65 | * The swap_map_handle structure is used for handling swap in |
59 | * a file-alike way | 66 | * a file-alike way |
@@ -61,13 +68,18 @@ struct swap_map_page { | |||
61 | 68 | ||
62 | struct swap_map_handle { | 69 | struct swap_map_handle { |
63 | struct swap_map_page *cur; | 70 | struct swap_map_page *cur; |
71 | struct swap_map_page_list *maps; | ||
64 | sector_t cur_swap; | 72 | sector_t cur_swap; |
65 | sector_t first_sector; | 73 | sector_t first_sector; |
66 | unsigned int k; | 74 | unsigned int k; |
75 | unsigned long nr_free_pages, written; | ||
76 | u32 crc32; | ||
67 | }; | 77 | }; |
68 | 78 | ||
69 | struct swsusp_header { | 79 | struct swsusp_header { |
70 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; | 80 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - |
81 | sizeof(u32)]; | ||
82 | u32 crc32; | ||
71 | sector_t image; | 83 | sector_t image; |
72 | unsigned int flags; /* Flags to pass to the "boot" kernel */ | 84 | unsigned int flags; /* Flags to pass to the "boot" kernel */ |
73 | char orig_sig[10]; | 85 | char orig_sig[10]; |
@@ -199,6 +211,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) | |||
199 | memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); | 211 | memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); |
200 | swsusp_header->image = handle->first_sector; | 212 | swsusp_header->image = handle->first_sector; |
201 | swsusp_header->flags = flags; | 213 | swsusp_header->flags = flags; |
214 | if (flags & SF_CRC32_MODE) | ||
215 | swsusp_header->crc32 = handle->crc32; | ||
202 | error = hib_bio_write_page(swsusp_resume_block, | 216 | error = hib_bio_write_page(swsusp_resume_block, |
203 | swsusp_header, NULL); | 217 | swsusp_header, NULL); |
204 | } else { | 218 | } else { |
@@ -245,6 +259,7 @@ static int swsusp_swap_check(void) | |||
245 | static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | 259 | static int write_page(void *buf, sector_t offset, struct bio **bio_chain) |
246 | { | 260 | { |
247 | void *src; | 261 | void *src; |
262 | int ret; | ||
248 | 263 | ||
249 | if (!offset) | 264 | if (!offset) |
250 | return -ENOSPC; | 265 | return -ENOSPC; |
@@ -254,9 +269,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) | |||
254 | if (src) { | 269 | if (src) { |
255 | copy_page(src, buf); | 270 | copy_page(src, buf); |
256 | } else { | 271 | } else { |
257 | WARN_ON_ONCE(1); | 272 | ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ |
258 | bio_chain = NULL; /* Go synchronous */ | 273 | if (ret) |
259 | src = buf; | 274 | return ret; |
275 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
276 | if (src) { | ||
277 | copy_page(src, buf); | ||
278 | } else { | ||
279 | WARN_ON_ONCE(1); | ||
280 | bio_chain = NULL; /* Go synchronous */ | ||
281 | src = buf; | ||
282 | } | ||
260 | } | 283 | } |
261 | } else { | 284 | } else { |
262 | src = buf; | 285 | src = buf; |
@@ -293,6 +316,8 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
293 | goto err_rel; | 316 | goto err_rel; |
294 | } | 317 | } |
295 | handle->k = 0; | 318 | handle->k = 0; |
319 | handle->nr_free_pages = nr_free_pages() >> 1; | ||
320 | handle->written = 0; | ||
296 | handle->first_sector = handle->cur_swap; | 321 | handle->first_sector = handle->cur_swap; |
297 | return 0; | 322 | return 0; |
298 | err_rel: | 323 | err_rel: |
@@ -316,20 +341,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
316 | return error; | 341 | return error; |
317 | handle->cur->entries[handle->k++] = offset; | 342 | handle->cur->entries[handle->k++] = offset; |
318 | if (handle->k >= MAP_PAGE_ENTRIES) { | 343 | if (handle->k >= MAP_PAGE_ENTRIES) { |
319 | error = hib_wait_on_bio_chain(bio_chain); | ||
320 | if (error) | ||
321 | goto out; | ||
322 | offset = alloc_swapdev_block(root_swap); | 344 | offset = alloc_swapdev_block(root_swap); |
323 | if (!offset) | 345 | if (!offset) |
324 | return -ENOSPC; | 346 | return -ENOSPC; |
325 | handle->cur->next_swap = offset; | 347 | handle->cur->next_swap = offset; |
326 | error = write_page(handle->cur, handle->cur_swap, NULL); | 348 | error = write_page(handle->cur, handle->cur_swap, bio_chain); |
327 | if (error) | 349 | if (error) |
328 | goto out; | 350 | goto out; |
329 | clear_page(handle->cur); | 351 | clear_page(handle->cur); |
330 | handle->cur_swap = offset; | 352 | handle->cur_swap = offset; |
331 | handle->k = 0; | 353 | handle->k = 0; |
332 | } | 354 | } |
355 | if (bio_chain && ++handle->written > handle->nr_free_pages) { | ||
356 | error = hib_wait_on_bio_chain(bio_chain); | ||
357 | if (error) | ||
358 | goto out; | ||
359 | handle->written = 0; | ||
360 | } | ||
333 | out: | 361 | out: |
334 | return error; | 362 | return error; |
335 | } | 363 | } |
@@ -372,6 +400,13 @@ static int swap_writer_finish(struct swap_map_handle *handle, | |||
372 | LZO_HEADER, PAGE_SIZE) | 400 | LZO_HEADER, PAGE_SIZE) |
373 | #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) | 401 | #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) |
374 | 402 | ||
403 | /* Maximum number of threads for compression/decompression. */ | ||
404 | #define LZO_THREADS 3 | ||
405 | |||
406 | /* Maximum number of pages for read buffering. */ | ||
407 | #define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) | ||
408 | |||
409 | |||
375 | /** | 410 | /** |
376 | * save_image - save the suspend image data | 411 | * save_image - save the suspend image data |
377 | */ | 412 | */ |
@@ -419,6 +454,92 @@ static int save_image(struct swap_map_handle *handle, | |||
419 | return ret; | 454 | return ret; |
420 | } | 455 | } |
421 | 456 | ||
457 | /** | ||
458 | * Structure used for CRC32. | ||
459 | */ | ||
460 | struct crc_data { | ||
461 | struct task_struct *thr; /* thread */ | ||
462 | atomic_t ready; /* ready to start flag */ | ||
463 | atomic_t stop; /* ready to stop flag */ | ||
464 | unsigned run_threads; /* nr current threads */ | ||
465 | wait_queue_head_t go; /* start crc update */ | ||
466 | wait_queue_head_t done; /* crc update done */ | ||
467 | u32 *crc32; /* points to handle's crc32 */ | ||
468 | size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ | ||
469 | unsigned char *unc[LZO_THREADS]; /* uncompressed data */ | ||
470 | }; | ||
471 | |||
472 | /** | ||
473 | * CRC32 update function that runs in its own thread. | ||
474 | */ | ||
475 | static int crc32_threadfn(void *data) | ||
476 | { | ||
477 | struct crc_data *d = data; | ||
478 | unsigned i; | ||
479 | |||
480 | while (1) { | ||
481 | wait_event(d->go, atomic_read(&d->ready) || | ||
482 | kthread_should_stop()); | ||
483 | if (kthread_should_stop()) { | ||
484 | d->thr = NULL; | ||
485 | atomic_set(&d->stop, 1); | ||
486 | wake_up(&d->done); | ||
487 | break; | ||
488 | } | ||
489 | atomic_set(&d->ready, 0); | ||
490 | |||
491 | for (i = 0; i < d->run_threads; i++) | ||
492 | *d->crc32 = crc32_le(*d->crc32, | ||
493 | d->unc[i], *d->unc_len[i]); | ||
494 | atomic_set(&d->stop, 1); | ||
495 | wake_up(&d->done); | ||
496 | } | ||
497 | return 0; | ||
498 | } | ||
499 | /** | ||
500 | * Structure used for LZO data compression. | ||
501 | */ | ||
502 | struct cmp_data { | ||
503 | struct task_struct *thr; /* thread */ | ||
504 | atomic_t ready; /* ready to start flag */ | ||
505 | atomic_t stop; /* ready to stop flag */ | ||
506 | int ret; /* return code */ | ||
507 | wait_queue_head_t go; /* start compression */ | ||
508 | wait_queue_head_t done; /* compression done */ | ||
509 | size_t unc_len; /* uncompressed length */ | ||
510 | size_t cmp_len; /* compressed length */ | ||
511 | unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ | ||
512 | unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ | ||
513 | unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ | ||
514 | }; | ||
515 | |||
516 | /** | ||
517 | * Compression function that runs in its own thread. | ||
518 | */ | ||
519 | static int lzo_compress_threadfn(void *data) | ||
520 | { | ||
521 | struct cmp_data *d = data; | ||
522 | |||
523 | while (1) { | ||
524 | wait_event(d->go, atomic_read(&d->ready) || | ||
525 | kthread_should_stop()); | ||
526 | if (kthread_should_stop()) { | ||
527 | d->thr = NULL; | ||
528 | d->ret = -1; | ||
529 | atomic_set(&d->stop, 1); | ||
530 | wake_up(&d->done); | ||
531 | break; | ||
532 | } | ||
533 | atomic_set(&d->ready, 0); | ||
534 | |||
535 | d->ret = lzo1x_1_compress(d->unc, d->unc_len, | ||
536 | d->cmp + LZO_HEADER, &d->cmp_len, | ||
537 | d->wrk); | ||
538 | atomic_set(&d->stop, 1); | ||
539 | wake_up(&d->done); | ||
540 | } | ||
541 | return 0; | ||
542 | } | ||
422 | 543 | ||
423 | /** | 544 | /** |
424 | * save_image_lzo - Save the suspend image data compressed with LZO. | 545 | * save_image_lzo - Save the suspend image data compressed with LZO. |
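The crc_data and cmp_data structures above implement a simple producer/worker handshake: the producer sets ready and wakes go, the worker sets stop and wakes done. A condensed sketch of how save_image_lzo() drives one compression worker; it relies on the declarations above and is not code from the patch:

	static int demo_run_one_compressor(struct cmp_data *d,
					   const void *src, size_t len)
	{
		memcpy(d->unc, src, len);		/* hand over the uncompressed chunk */
		d->unc_len = len;
		atomic_set(&d->ready, 1);
		wake_up(&d->go);			/* kick lzo_compress_threadfn() */

		wait_event(d->done, atomic_read(&d->stop));
		atomic_set(&d->stop, 0);		/* re-arm for the next round */

		/* on success, d->cmp holds d->cmp_len bytes after LZO_HEADER */
		return d->ret < 0 ? d->ret : 0;
	}

The decompression path further below reuses the same ready/go and stop/done pairs through struct dec_data.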
@@ -437,42 +558,93 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
437 | struct bio *bio; | 558 | struct bio *bio; |
438 | struct timeval start; | 559 | struct timeval start; |
439 | struct timeval stop; | 560 | struct timeval stop; |
440 | size_t off, unc_len, cmp_len; | 561 | size_t off; |
441 | unsigned char *unc, *cmp, *wrk, *page; | 562 | unsigned thr, run_threads, nr_threads; |
563 | unsigned char *page = NULL; | ||
564 | struct cmp_data *data = NULL; | ||
565 | struct crc_data *crc = NULL; | ||
566 | |||
567 | /* | ||
568 | * We'll limit the number of threads for compression to limit memory | ||
569 | * footprint. | ||
570 | */ | ||
571 | nr_threads = num_online_cpus() - 1; | ||
572 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | ||
442 | 573 | ||
443 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 574 | page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
444 | if (!page) { | 575 | if (!page) { |
445 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 576 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); |
446 | return -ENOMEM; | 577 | ret = -ENOMEM; |
578 | goto out_clean; | ||
447 | } | 579 | } |
448 | 580 | ||
449 | wrk = vmalloc(LZO1X_1_MEM_COMPRESS); | 581 | data = vmalloc(sizeof(*data) * nr_threads); |
450 | if (!wrk) { | 582 | if (!data) { |
451 | printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); | 583 | printk(KERN_ERR "PM: Failed to allocate LZO data\n"); |
452 | free_page((unsigned long)page); | 584 | ret = -ENOMEM; |
453 | return -ENOMEM; | 585 | goto out_clean; |
454 | } | 586 | } |
587 | for (thr = 0; thr < nr_threads; thr++) | ||
588 | memset(&data[thr], 0, offsetof(struct cmp_data, go)); | ||
455 | 589 | ||
456 | unc = vmalloc(LZO_UNC_SIZE); | 590 | crc = kmalloc(sizeof(*crc), GFP_KERNEL); |
457 | if (!unc) { | 591 | if (!crc) { |
458 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | 592 | printk(KERN_ERR "PM: Failed to allocate crc\n"); |
459 | vfree(wrk); | 593 | ret = -ENOMEM; |
460 | free_page((unsigned long)page); | 594 | goto out_clean; |
461 | return -ENOMEM; | 595 | } |
596 | memset(crc, 0, offsetof(struct crc_data, go)); | ||
597 | |||
598 | /* | ||
599 | * Start the compression threads. | ||
600 | */ | ||
601 | for (thr = 0; thr < nr_threads; thr++) { | ||
602 | init_waitqueue_head(&data[thr].go); | ||
603 | init_waitqueue_head(&data[thr].done); | ||
604 | |||
605 | data[thr].thr = kthread_run(lzo_compress_threadfn, | ||
606 | &data[thr], | ||
607 | "image_compress/%u", thr); | ||
608 | if (IS_ERR(data[thr].thr)) { | ||
609 | data[thr].thr = NULL; | ||
610 | printk(KERN_ERR | ||
611 | "PM: Cannot start compression threads\n"); | ||
612 | ret = -ENOMEM; | ||
613 | goto out_clean; | ||
614 | } | ||
462 | } | 615 | } |
463 | 616 | ||
464 | cmp = vmalloc(LZO_CMP_SIZE); | 617 | /* |
465 | if (!cmp) { | 618 | * Adjust number of free pages after all allocations have been done. |
466 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | 619 | * We don't want to run out of pages when writing. |
467 | vfree(unc); | 620 | */ |
468 | vfree(wrk); | 621 | handle->nr_free_pages = nr_free_pages() >> 1; |
469 | free_page((unsigned long)page); | 622 | |
470 | return -ENOMEM; | 623 | /* |
624 | * Start the CRC32 thread. | ||
625 | */ | ||
626 | init_waitqueue_head(&crc->go); | ||
627 | init_waitqueue_head(&crc->done); | ||
628 | |||
629 | handle->crc32 = 0; | ||
630 | crc->crc32 = &handle->crc32; | ||
631 | for (thr = 0; thr < nr_threads; thr++) { | ||
632 | crc->unc[thr] = data[thr].unc; | ||
633 | crc->unc_len[thr] = &data[thr].unc_len; | ||
634 | } | ||
635 | |||
636 | crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); | ||
637 | if (IS_ERR(crc->thr)) { | ||
638 | crc->thr = NULL; | ||
639 | printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); | ||
640 | ret = -ENOMEM; | ||
641 | goto out_clean; | ||
471 | } | 642 | } |
472 | 643 | ||
473 | printk(KERN_INFO | 644 | printk(KERN_INFO |
645 | "PM: Using %u thread(s) for compression.\n" | ||
474 | "PM: Compressing and saving image data (%u pages) ... ", | 646 | "PM: Compressing and saving image data (%u pages) ... ", |
475 | nr_to_write); | 647 | nr_threads, nr_to_write); |
476 | m = nr_to_write / 100; | 648 | m = nr_to_write / 100; |
477 | if (!m) | 649 | if (!m) |
478 | m = 1; | 650 | m = 1; |
@@ -480,55 +652,83 @@ static int save_image_lzo(struct swap_map_handle *handle, | |||
480 | bio = NULL; | 652 | bio = NULL; |
481 | do_gettimeofday(&start); | 653 | do_gettimeofday(&start); |
482 | for (;;) { | 654 | for (;;) { |
483 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { | 655 | for (thr = 0; thr < nr_threads; thr++) { |
484 | ret = snapshot_read_next(snapshot); | 656 | for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { |
485 | if (ret < 0) | 657 | ret = snapshot_read_next(snapshot); |
486 | goto out_finish; | 658 | if (ret < 0) |
487 | 659 | goto out_finish; | |
488 | if (!ret) | 660 | |
661 | if (!ret) | ||
662 | break; | ||
663 | |||
664 | memcpy(data[thr].unc + off, | ||
665 | data_of(*snapshot), PAGE_SIZE); | ||
666 | |||
667 | if (!(nr_pages % m)) | ||
668 | printk(KERN_CONT "\b\b\b\b%3d%%", | ||
669 | nr_pages / m); | ||
670 | nr_pages++; | ||
671 | } | ||
672 | if (!off) | ||
489 | break; | 673 | break; |
490 | 674 | ||
491 | memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); | 675 | data[thr].unc_len = off; |
492 | 676 | ||
493 | if (!(nr_pages % m)) | 677 | atomic_set(&data[thr].ready, 1); |
494 | printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); | 678 | wake_up(&data[thr].go); |
495 | nr_pages++; | ||
496 | } | 679 | } |
497 | 680 | ||
498 | if (!off) | 681 | if (!thr) |
499 | break; | 682 | break; |
500 | 683 | ||
501 | unc_len = off; | 684 | crc->run_threads = thr; |
502 | ret = lzo1x_1_compress(unc, unc_len, | 685 | atomic_set(&crc->ready, 1); |
503 | cmp + LZO_HEADER, &cmp_len, wrk); | 686 | wake_up(&crc->go); |
504 | if (ret < 0) { | ||
505 | printk(KERN_ERR "PM: LZO compression failed\n"); | ||
506 | break; | ||
507 | } | ||
508 | 687 | ||
509 | if (unlikely(!cmp_len || | 688 | for (run_threads = thr, thr = 0; thr < run_threads; thr++) { |
510 | cmp_len > lzo1x_worst_compress(unc_len))) { | 689 | wait_event(data[thr].done, |
511 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | 690 | atomic_read(&data[thr].stop)); |
512 | ret = -1; | 691 | atomic_set(&data[thr].stop, 0); |
513 | break; | ||
514 | } | ||
515 | 692 | ||
516 | *(size_t *)cmp = cmp_len; | 693 | ret = data[thr].ret; |
517 | 694 | ||
518 | /* | 695 | if (ret < 0) { |
519 | * Given we are writing one page at a time to disk, we copy | 696 | printk(KERN_ERR "PM: LZO compression failed\n"); |
520 | * that much from the buffer, although the last bit will likely | 697 | goto out_finish; |
521 | * be smaller than full page. This is OK - we saved the length | 698 | } |
522 | * of the compressed data, so any garbage at the end will be | ||
523 | * discarded when we read it. | ||
524 | */ | ||
525 | for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { | ||
526 | memcpy(page, cmp + off, PAGE_SIZE); | ||
527 | 699 | ||
528 | ret = swap_write_page(handle, page, &bio); | 700 | if (unlikely(!data[thr].cmp_len || |
529 | if (ret) | 701 | data[thr].cmp_len > |
702 | lzo1x_worst_compress(data[thr].unc_len))) { | ||
703 | printk(KERN_ERR | ||
704 | "PM: Invalid LZO compressed length\n"); | ||
705 | ret = -1; | ||
530 | goto out_finish; | 706 | goto out_finish; |
707 | } | ||
708 | |||
709 | *(size_t *)data[thr].cmp = data[thr].cmp_len; | ||
710 | |||
711 | /* | ||
712 | * Given we are writing one page at a time to disk, we | ||
713 | * copy that much from the buffer, although the last | ||
714 | * bit will likely be smaller than full page. This is | ||
715 | * OK - we saved the length of the compressed data, so | ||
716 | * any garbage at the end will be discarded when we | ||
717 | * read it. | ||
718 | */ | ||
719 | for (off = 0; | ||
720 | off < LZO_HEADER + data[thr].cmp_len; | ||
721 | off += PAGE_SIZE) { | ||
722 | memcpy(page, data[thr].cmp + off, PAGE_SIZE); | ||
723 | |||
724 | ret = swap_write_page(handle, page, &bio); | ||
725 | if (ret) | ||
726 | goto out_finish; | ||
727 | } | ||
531 | } | 728 | } |
729 | |||
730 | wait_event(crc->done, atomic_read(&crc->stop)); | ||
731 | atomic_set(&crc->stop, 0); | ||
532 | } | 732 | } |
533 | 733 | ||
534 | out_finish: | 734 | out_finish: |
@@ -536,16 +736,25 @@ out_finish: | |||
536 | do_gettimeofday(&stop); | 736 | do_gettimeofday(&stop); |
537 | if (!ret) | 737 | if (!ret) |
538 | ret = err2; | 738 | ret = err2; |
539 | if (!ret) | 739 | if (!ret) { |
540 | printk(KERN_CONT "\b\b\b\bdone\n"); | 740 | printk(KERN_CONT "\b\b\b\bdone\n"); |
541 | else | 741 | } else { |
542 | printk(KERN_CONT "\n"); | 742 | printk(KERN_CONT "\n"); |
743 | } | ||
543 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); | 744 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); |
544 | 745 | out_clean: | |
545 | vfree(cmp); | 746 | if (crc) { |
546 | vfree(unc); | 747 | if (crc->thr) |
547 | vfree(wrk); | 748 | kthread_stop(crc->thr); |
548 | free_page((unsigned long)page); | 749 | kfree(crc); |
750 | } | ||
751 | if (data) { | ||
752 | for (thr = 0; thr < nr_threads; thr++) | ||
753 | if (data[thr].thr) | ||
754 | kthread_stop(data[thr].thr); | ||
755 | vfree(data); | ||
756 | } | ||
757 | if (page) free_page((unsigned long)page); | ||
549 | 758 | ||
550 | return ret; | 759 | return ret; |
551 | } | 760 | } |
@@ -625,8 +834,15 @@ out_finish: | |||
625 | 834 | ||
626 | static void release_swap_reader(struct swap_map_handle *handle) | 835 | static void release_swap_reader(struct swap_map_handle *handle) |
627 | { | 836 | { |
628 | if (handle->cur) | 837 | struct swap_map_page_list *tmp; |
629 | free_page((unsigned long)handle->cur); | 838 | |
839 | while (handle->maps) { | ||
840 | if (handle->maps->map) | ||
841 | free_page((unsigned long)handle->maps->map); | ||
842 | tmp = handle->maps; | ||
843 | handle->maps = handle->maps->next; | ||
844 | kfree(tmp); | ||
845 | } | ||
630 | handle->cur = NULL; | 846 | handle->cur = NULL; |
631 | } | 847 | } |
632 | 848 | ||
@@ -634,22 +850,46 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
634 | unsigned int *flags_p) | 850 | unsigned int *flags_p) |
635 | { | 851 | { |
636 | int error; | 852 | int error; |
853 | struct swap_map_page_list *tmp, *last; | ||
854 | sector_t offset; | ||
637 | 855 | ||
638 | *flags_p = swsusp_header->flags; | 856 | *flags_p = swsusp_header->flags; |
639 | 857 | ||
640 | if (!swsusp_header->image) /* how can this happen? */ | 858 | if (!swsusp_header->image) /* how can this happen? */ |
641 | return -EINVAL; | 859 | return -EINVAL; |
642 | 860 | ||
643 | handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); | 861 | handle->cur = NULL; |
644 | if (!handle->cur) | 862 | last = handle->maps = NULL; |
645 | return -ENOMEM; | 863 | offset = swsusp_header->image; |
864 | while (offset) { | ||
865 | tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); | ||
866 | if (!tmp) { | ||
867 | release_swap_reader(handle); | ||
868 | return -ENOMEM; | ||
869 | } | ||
870 | memset(tmp, 0, sizeof(*tmp)); | ||
871 | if (!handle->maps) | ||
872 | handle->maps = tmp; | ||
873 | if (last) | ||
874 | last->next = tmp; | ||
875 | last = tmp; | ||
876 | |||
877 | tmp->map = (struct swap_map_page *) | ||
878 | __get_free_page(__GFP_WAIT | __GFP_HIGH); | ||
879 | if (!tmp->map) { | ||
880 | release_swap_reader(handle); | ||
881 | return -ENOMEM; | ||
882 | } | ||
646 | 883 | ||
647 | error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); | 884 | error = hib_bio_read_page(offset, tmp->map, NULL); |
648 | if (error) { | 885 | if (error) { |
649 | release_swap_reader(handle); | 886 | release_swap_reader(handle); |
650 | return error; | 887 | return error; |
888 | } | ||
889 | offset = tmp->map->next_swap; | ||
651 | } | 890 | } |
652 | handle->k = 0; | 891 | handle->k = 0; |
892 | handle->cur = handle->maps->map; | ||
653 | return 0; | 893 | return 0; |
654 | } | 894 | } |
655 | 895 | ||
@@ -658,6 +898,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, | |||
658 | { | 898 | { |
659 | sector_t offset; | 899 | sector_t offset; |
660 | int error; | 900 | int error; |
901 | struct swap_map_page_list *tmp; | ||
661 | 902 | ||
662 | if (!handle->cur) | 903 | if (!handle->cur) |
663 | return -EINVAL; | 904 | return -EINVAL; |
@@ -668,13 +909,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, | |||
668 | if (error) | 909 | if (error) |
669 | return error; | 910 | return error; |
670 | if (++handle->k >= MAP_PAGE_ENTRIES) { | 911 | if (++handle->k >= MAP_PAGE_ENTRIES) { |
671 | error = hib_wait_on_bio_chain(bio_chain); | ||
672 | handle->k = 0; | 912 | handle->k = 0; |
673 | offset = handle->cur->next_swap; | 913 | free_page((unsigned long)handle->maps->map); |
674 | if (!offset) | 914 | tmp = handle->maps; |
915 | handle->maps = handle->maps->next; | ||
916 | kfree(tmp); | ||
917 | if (!handle->maps) | ||
675 | release_swap_reader(handle); | 918 | release_swap_reader(handle); |
676 | else if (!error) | 919 | else |
677 | error = hib_bio_read_page(offset, handle->cur, NULL); | 920 | handle->cur = handle->maps->map; |
678 | } | 921 | } |
679 | return error; | 922 | return error; |
680 | } | 923 | } |
@@ -697,7 +940,7 @@ static int load_image(struct swap_map_handle *handle, | |||
697 | unsigned int nr_to_read) | 940 | unsigned int nr_to_read) |
698 | { | 941 | { |
699 | unsigned int m; | 942 | unsigned int m; |
700 | int error = 0; | 943 | int ret = 0; |
701 | struct timeval start; | 944 | struct timeval start; |
702 | struct timeval stop; | 945 | struct timeval stop; |
703 | struct bio *bio; | 946 | struct bio *bio; |
@@ -713,15 +956,15 @@ static int load_image(struct swap_map_handle *handle, | |||
713 | bio = NULL; | 956 | bio = NULL; |
714 | do_gettimeofday(&start); | 957 | do_gettimeofday(&start); |
715 | for ( ; ; ) { | 958 | for ( ; ; ) { |
716 | error = snapshot_write_next(snapshot); | 959 | ret = snapshot_write_next(snapshot); |
717 | if (error <= 0) | 960 | if (ret <= 0) |
718 | break; | 961 | break; |
719 | error = swap_read_page(handle, data_of(*snapshot), &bio); | 962 | ret = swap_read_page(handle, data_of(*snapshot), &bio); |
720 | if (error) | 963 | if (ret) |
721 | break; | 964 | break; |
722 | if (snapshot->sync_read) | 965 | if (snapshot->sync_read) |
723 | error = hib_wait_on_bio_chain(&bio); | 966 | ret = hib_wait_on_bio_chain(&bio); |
724 | if (error) | 967 | if (ret) |
725 | break; | 968 | break; |
726 | if (!(nr_pages % m)) | 969 | if (!(nr_pages % m)) |
727 | printk("\b\b\b\b%3d%%", nr_pages / m); | 970 | printk("\b\b\b\b%3d%%", nr_pages / m); |
@@ -729,17 +972,61 @@ static int load_image(struct swap_map_handle *handle, | |||
729 | } | 972 | } |
730 | err2 = hib_wait_on_bio_chain(&bio); | 973 | err2 = hib_wait_on_bio_chain(&bio); |
731 | do_gettimeofday(&stop); | 974 | do_gettimeofday(&stop); |
732 | if (!error) | 975 | if (!ret) |
733 | error = err2; | 976 | ret = err2; |
734 | if (!error) { | 977 | if (!ret) { |
735 | printk("\b\b\b\bdone\n"); | 978 | printk("\b\b\b\bdone\n"); |
736 | snapshot_write_finalize(snapshot); | 979 | snapshot_write_finalize(snapshot); |
737 | if (!snapshot_image_loaded(snapshot)) | 980 | if (!snapshot_image_loaded(snapshot)) |
738 | error = -ENODATA; | 981 | ret = -ENODATA; |
739 | } else | 982 | } else |
740 | printk("\n"); | 983 | printk("\n"); |
741 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 984 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
742 | return error; | 985 | return ret; |
986 | } | ||
987 | |||
988 | /** | ||
989 | * Structure used for LZO data decompression. | ||
990 | */ | ||
991 | struct dec_data { | ||
992 | struct task_struct *thr; /* thread */ | ||
993 | atomic_t ready; /* ready to start flag */ | ||
994 | atomic_t stop; /* ready to stop flag */ | ||
995 | int ret; /* return code */ | ||
996 | wait_queue_head_t go; /* start decompression */ | ||
997 | wait_queue_head_t done; /* decompression done */ | ||
998 | size_t unc_len; /* uncompressed length */ | ||
999 | size_t cmp_len; /* compressed length */ | ||
1000 | unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ | ||
1001 | unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ | ||
1002 | }; | ||
1003 | |||
1004 | /** | ||
1005 | * Decompression function that runs in its own thread. | ||
1006 | */ | ||
1007 | static int lzo_decompress_threadfn(void *data) | ||
1008 | { | ||
1009 | struct dec_data *d = data; | ||
1010 | |||
1011 | while (1) { | ||
1012 | wait_event(d->go, atomic_read(&d->ready) || | ||
1013 | kthread_should_stop()); | ||
1014 | if (kthread_should_stop()) { | ||
1015 | d->thr = NULL; | ||
1016 | d->ret = -1; | ||
1017 | atomic_set(&d->stop, 1); | ||
1018 | wake_up(&d->done); | ||
1019 | break; | ||
1020 | } | ||
1021 | atomic_set(&d->ready, 0); | ||
1022 | |||
1023 | d->unc_len = LZO_UNC_SIZE; | ||
1024 | d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, | ||
1025 | d->unc, &d->unc_len); | ||
1026 | atomic_set(&d->stop, 1); | ||
1027 | wake_up(&d->done); | ||
1028 | } | ||
1029 | return 0; | ||
743 | } | 1030 | } |
744 | 1031 | ||
745 | /** | 1032 | /** |
@@ -753,50 +1040,120 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
753 | unsigned int nr_to_read) | 1040 | unsigned int nr_to_read) |
754 | { | 1041 | { |
755 | unsigned int m; | 1042 | unsigned int m; |
756 | int error = 0; | 1043 | int ret = 0; |
1044 | int eof = 0; | ||
757 | struct bio *bio; | 1045 | struct bio *bio; |
758 | struct timeval start; | 1046 | struct timeval start; |
759 | struct timeval stop; | 1047 | struct timeval stop; |
760 | unsigned nr_pages; | 1048 | unsigned nr_pages; |
761 | size_t i, off, unc_len, cmp_len; | 1049 | size_t off; |
762 | unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; | 1050 | unsigned i, thr, run_threads, nr_threads; |
763 | 1051 | unsigned ring = 0, pg = 0, ring_size = 0, | |
764 | for (i = 0; i < LZO_CMP_PAGES; i++) { | 1052 | have = 0, want, need, asked = 0; |
765 | page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); | 1053 | unsigned long read_pages; |
766 | if (!page[i]) { | 1054 | unsigned char **page = NULL; |
767 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | 1055 | struct dec_data *data = NULL; |
1056 | struct crc_data *crc = NULL; | ||
1057 | |||
1058 | /* | ||
1059 | * We'll limit the number of threads for decompression to limit memory | ||
1060 | * footprint. | ||
1061 | */ | ||
1062 | nr_threads = num_online_cpus() - 1; | ||
1063 | nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); | ||
1064 | |||
1065 | page = vmalloc(sizeof(*page) * LZO_READ_PAGES); | ||
1066 | if (!page) { | ||
1067 | printk(KERN_ERR "PM: Failed to allocate LZO page\n"); | ||
1068 | ret = -ENOMEM; | ||
1069 | goto out_clean; | ||
1070 | } | ||
768 | 1071 | ||
769 | while (i) | 1072 | data = vmalloc(sizeof(*data) * nr_threads); |
770 | free_page((unsigned long)page[--i]); | 1073 | if (!data) { |
1074 | printk(KERN_ERR "PM: Failed to allocate LZO data\n"); | ||
1075 | ret = -ENOMEM; | ||
1076 | goto out_clean; | ||
1077 | } | ||
1078 | for (thr = 0; thr < nr_threads; thr++) | ||
1079 | memset(&data[thr], 0, offsetof(struct dec_data, go)); | ||
771 | 1080 | ||
772 | return -ENOMEM; | 1081 | crc = kmalloc(sizeof(*crc), GFP_KERNEL); |
1082 | if (!crc) { | ||
1083 | printk(KERN_ERR "PM: Failed to allocate crc\n"); | ||
1084 | ret = -ENOMEM; | ||
1085 | goto out_clean; | ||
1086 | } | ||
1087 | memset(crc, 0, offsetof(struct crc_data, go)); | ||
1088 | |||
1089 | /* | ||
1090 | * Start the decompression threads. | ||
1091 | */ | ||
1092 | for (thr = 0; thr < nr_threads; thr++) { | ||
1093 | init_waitqueue_head(&data[thr].go); | ||
1094 | init_waitqueue_head(&data[thr].done); | ||
1095 | |||
1096 | data[thr].thr = kthread_run(lzo_decompress_threadfn, | ||
1097 | &data[thr], | ||
1098 | "image_decompress/%u", thr); | ||
1099 | if (IS_ERR(data[thr].thr)) { | ||
1100 | data[thr].thr = NULL; | ||
1101 | printk(KERN_ERR | ||
1102 | "PM: Cannot start decompression threads\n"); | ||
1103 | ret = -ENOMEM; | ||
1104 | goto out_clean; | ||
773 | } | 1105 | } |
774 | } | 1106 | } |
775 | 1107 | ||
776 | unc = vmalloc(LZO_UNC_SIZE); | 1108 | /* |
777 | if (!unc) { | 1109 | * Start the CRC32 thread. |
778 | printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); | 1110 | */ |
779 | 1111 | init_waitqueue_head(&crc->go); | |
780 | for (i = 0; i < LZO_CMP_PAGES; i++) | 1112 | init_waitqueue_head(&crc->done); |
781 | free_page((unsigned long)page[i]); | 1113 | |
782 | 1114 | handle->crc32 = 0; | |
783 | return -ENOMEM; | 1115 | crc->crc32 = &handle->crc32; |
1116 | for (thr = 0; thr < nr_threads; thr++) { | ||
1117 | crc->unc[thr] = data[thr].unc; | ||
1118 | crc->unc_len[thr] = &data[thr].unc_len; | ||
784 | } | 1119 | } |
785 | 1120 | ||
786 | cmp = vmalloc(LZO_CMP_SIZE); | 1121 | crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); |
787 | if (!cmp) { | 1122 | if (IS_ERR(crc->thr)) { |
788 | printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); | 1123 | crc->thr = NULL; |
1124 | printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); | ||
1125 | ret = -ENOMEM; | ||
1126 | goto out_clean; | ||
1127 | } | ||
789 | 1128 | ||
790 | vfree(unc); | 1129 | /* |
791 | for (i = 0; i < LZO_CMP_PAGES; i++) | 1130 | * Adjust number of pages for read buffering, in case we are short. |
792 | free_page((unsigned long)page[i]); | 1131 | */ |
1132 | read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; | ||
1133 | read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); | ||
793 | 1134 | ||
794 | return -ENOMEM; | 1135 | for (i = 0; i < read_pages; i++) { |
1136 | page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? | ||
1137 | __GFP_WAIT | __GFP_HIGH : | ||
1138 | __GFP_WAIT); | ||
1139 | if (!page[i]) { | ||
1140 | if (i < LZO_CMP_PAGES) { | ||
1141 | ring_size = i; | ||
1142 | printk(KERN_ERR | ||
1143 | "PM: Failed to allocate LZO pages\n"); | ||
1144 | ret = -ENOMEM; | ||
1145 | goto out_clean; | ||
1146 | } else { | ||
1147 | break; | ||
1148 | } | ||
1149 | } | ||
795 | } | 1150 | } |
1151 | want = ring_size = i; | ||
796 | 1152 | ||
797 | printk(KERN_INFO | 1153 | printk(KERN_INFO |
1154 | "PM: Using %u thread(s) for decompression.\n" | ||
798 | "PM: Loading and decompressing image data (%u pages) ... ", | 1155 | "PM: Loading and decompressing image data (%u pages) ... ", |
799 | nr_to_read); | 1156 | nr_threads, nr_to_read); |
800 | m = nr_to_read / 100; | 1157 | m = nr_to_read / 100; |
801 | if (!m) | 1158 | if (!m) |
802 | m = 1; | 1159 | m = 1; |
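On the resume side, the ring of read-ahead pages allocated above is sized from the memory still free once the image pages are accounted for, clamped so that at least one full compressed block always fits. The arithmetic from the hunk, pulled out as a standalone helper for clarity (hypothetical name):

	static unsigned long demo_read_ring_pages(void)
	{
		unsigned long pages =
			(nr_free_pages() - snapshot_get_image_size()) >> 1;

		return clamp_val(pages, LZO_CMP_PAGES, LZO_READ_PAGES);
	}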
@@ -804,85 +1161,189 @@ static int load_image_lzo(struct swap_map_handle *handle, | |||
804 | bio = NULL; | 1161 | bio = NULL; |
805 | do_gettimeofday(&start); | 1162 | do_gettimeofday(&start); |
806 | 1163 | ||
807 | error = snapshot_write_next(snapshot); | 1164 | ret = snapshot_write_next(snapshot); |
808 | if (error <= 0) | 1165 | if (ret <= 0) |
809 | goto out_finish; | 1166 | goto out_finish; |
810 | 1167 | ||
811 | for (;;) { | 1168 | for(;;) { |
812 | error = swap_read_page(handle, page[0], NULL); /* sync */ | 1169 | for (i = 0; !eof && i < want; i++) { |
813 | if (error) | 1170 | ret = swap_read_page(handle, page[ring], &bio); |
814 | break; | 1171 | if (ret) { |
815 | 1172 | /* | |
816 | cmp_len = *(size_t *)page[0]; | 1173 | * On real read error, finish. On end of data, |
817 | if (unlikely(!cmp_len || | 1174 | * set EOF flag and just exit the read loop. |
818 | cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { | 1175 | */ |
819 | printk(KERN_ERR "PM: Invalid LZO compressed length\n"); | 1176 | if (handle->cur && |
820 | error = -1; | 1177 | handle->cur->entries[handle->k]) { |
821 | break; | 1178 | goto out_finish; |
1179 | } else { | ||
1180 | eof = 1; | ||
1181 | break; | ||
1182 | } | ||
1183 | } | ||
1184 | if (++ring >= ring_size) | ||
1185 | ring = 0; | ||
822 | } | 1186 | } |
1187 | asked += i; | ||
1188 | want -= i; | ||
823 | 1189 | ||
824 | for (off = PAGE_SIZE, i = 1; | 1190 | /* |
825 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | 1191 | * We are out of data, wait for some more. |
826 | error = swap_read_page(handle, page[i], &bio); | 1192 | */ |
827 | if (error) | 1193 | if (!have) { |
1194 | if (!asked) | ||
1195 | break; | ||
1196 | |||
1197 | ret = hib_wait_on_bio_chain(&bio); | ||
1198 | if (ret) | ||
828 | goto out_finish; | 1199 | goto out_finish; |
1200 | have += asked; | ||
1201 | asked = 0; | ||
1202 | if (eof) | ||
1203 | eof = 2; | ||
829 | } | 1204 | } |
830 | 1205 | ||
831 | error = hib_wait_on_bio_chain(&bio); /* need all data now */ | 1206 | if (crc->run_threads) { |
832 | if (error) | 1207 | wait_event(crc->done, atomic_read(&crc->stop)); |
833 | goto out_finish; | 1208 | atomic_set(&crc->stop, 0); |
834 | 1209 | crc->run_threads = 0; | |
835 | for (off = 0, i = 0; | ||
836 | off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { | ||
837 | memcpy(cmp + off, page[i], PAGE_SIZE); | ||
838 | } | 1210 | } |
839 | 1211 | ||
840 | unc_len = LZO_UNC_SIZE; | 1212 | for (thr = 0; have && thr < nr_threads; thr++) { |
841 | error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, | 1213 | data[thr].cmp_len = *(size_t *)page[pg]; |
842 | unc, &unc_len); | 1214 | if (unlikely(!data[thr].cmp_len || |
843 | if (error < 0) { | 1215 | data[thr].cmp_len > |
844 | printk(KERN_ERR "PM: LZO decompression failed\n"); | 1216 | lzo1x_worst_compress(LZO_UNC_SIZE))) { |
845 | break; | 1217 | printk(KERN_ERR |
1218 | "PM: Invalid LZO compressed length\n"); | ||
1219 | ret = -1; | ||
1220 | goto out_finish; | ||
1221 | } | ||
1222 | |||
1223 | need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, | ||
1224 | PAGE_SIZE); | ||
1225 | if (need > have) { | ||
1226 | if (eof > 1) { | ||
1227 | ret = -1; | ||
1228 | goto out_finish; | ||
1229 | } | ||
1230 | break; | ||
1231 | } | ||
1232 | |||
1233 | for (off = 0; | ||
1234 | off < LZO_HEADER + data[thr].cmp_len; | ||
1235 | off += PAGE_SIZE) { | ||
1236 | memcpy(data[thr].cmp + off, | ||
1237 | page[pg], PAGE_SIZE); | ||
1238 | have--; | ||
1239 | want++; | ||
1240 | if (++pg >= ring_size) | ||
1241 | pg = 0; | ||
1242 | } | ||
1243 | |||
1244 | atomic_set(&data[thr].ready, 1); | ||
1245 | wake_up(&data[thr].go); | ||
846 | } | 1246 | } |
847 | 1247 | ||
848 | if (unlikely(!unc_len || | 1248 | /* |
849 | unc_len > LZO_UNC_SIZE || | 1249 | * Wait for more data while we are decompressing. |
850 | unc_len & (PAGE_SIZE - 1))) { | 1250 | */ |
851 | printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); | 1251 | if (have < LZO_CMP_PAGES && asked) { |
852 | error = -1; | 1252 | ret = hib_wait_on_bio_chain(&bio); |
853 | break; | 1253 | if (ret) |
1254 | goto out_finish; | ||
1255 | have += asked; | ||
1256 | asked = 0; | ||
1257 | if (eof) | ||
1258 | eof = 2; | ||
854 | } | 1259 | } |
855 | 1260 | ||
856 | for (off = 0; off < unc_len; off += PAGE_SIZE) { | 1261 | for (run_threads = thr, thr = 0; thr < run_threads; thr++) { |
857 | memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); | 1262 | wait_event(data[thr].done, |
1263 | atomic_read(&data[thr].stop)); | ||
1264 | atomic_set(&data[thr].stop, 0); | ||
1265 | |||
1266 | ret = data[thr].ret; | ||
858 | 1267 | ||
859 | if (!(nr_pages % m)) | 1268 | if (ret < 0) { |
860 | printk("\b\b\b\b%3d%%", nr_pages / m); | 1269 | printk(KERN_ERR |
861 | nr_pages++; | 1270 | "PM: LZO decompression failed\n"); |
1271 | goto out_finish; | ||
1272 | } | ||
862 | 1273 | ||
863 | error = snapshot_write_next(snapshot); | 1274 | if (unlikely(!data[thr].unc_len || |
864 | if (error <= 0) | 1275 | data[thr].unc_len > LZO_UNC_SIZE || |
1276 | data[thr].unc_len & (PAGE_SIZE - 1))) { | ||
1277 | printk(KERN_ERR | ||
1278 | "PM: Invalid LZO uncompressed length\n"); | ||
1279 | ret = -1; | ||
865 | goto out_finish; | 1280 | goto out_finish; |
1281 | } | ||
1282 | |||
1283 | for (off = 0; | ||
1284 | off < data[thr].unc_len; off += PAGE_SIZE) { | ||
1285 | memcpy(data_of(*snapshot), | ||
1286 | data[thr].unc + off, PAGE_SIZE); | ||
1287 | |||
1288 | if (!(nr_pages % m)) | ||
1289 | printk("\b\b\b\b%3d%%", nr_pages / m); | ||
1290 | nr_pages++; | ||
1291 | |||
1292 | ret = snapshot_write_next(snapshot); | ||
1293 | if (ret <= 0) { | ||
1294 | crc->run_threads = thr + 1; | ||
1295 | atomic_set(&crc->ready, 1); | ||
1296 | wake_up(&crc->go); | ||
1297 | goto out_finish; | ||
1298 | } | ||
1299 | } | ||
866 | } | 1300 | } |
1301 | |||
1302 | crc->run_threads = thr; | ||
1303 | atomic_set(&crc->ready, 1); | ||
1304 | wake_up(&crc->go); | ||
867 | } | 1305 | } |
868 | 1306 | ||
869 | out_finish: | 1307 | out_finish: |
1308 | if (crc->run_threads) { | ||
1309 | wait_event(crc->done, atomic_read(&crc->stop)); | ||
1310 | atomic_set(&crc->stop, 0); | ||
1311 | } | ||
870 | do_gettimeofday(&stop); | 1312 | do_gettimeofday(&stop); |
871 | if (!error) { | 1313 | if (!ret) { |
872 | printk("\b\b\b\bdone\n"); | 1314 | printk("\b\b\b\bdone\n"); |
873 | snapshot_write_finalize(snapshot); | 1315 | snapshot_write_finalize(snapshot); |
874 | if (!snapshot_image_loaded(snapshot)) | 1316 | if (!snapshot_image_loaded(snapshot)) |
875 | error = -ENODATA; | 1317 | ret = -ENODATA; |
1318 | if (!ret) { | ||
1319 | if (swsusp_header->flags & SF_CRC32_MODE) { | ||
1320 | if(handle->crc32 != swsusp_header->crc32) { | ||
1321 | printk(KERN_ERR | ||
1322 | "PM: Invalid image CRC32!\n"); | ||
1323 | ret = -ENODATA; | ||
1324 | } | ||
1325 | } | ||
1326 | } | ||
876 | } else | 1327 | } else |
877 | printk("\n"); | 1328 | printk("\n"); |
878 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); | 1329 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
879 | 1330 | out_clean: | |
880 | vfree(cmp); | 1331 | for (i = 0; i < ring_size; i++) |
881 | vfree(unc); | ||
882 | for (i = 0; i < LZO_CMP_PAGES; i++) | ||
883 | free_page((unsigned long)page[i]); | 1332 | free_page((unsigned long)page[i]); |
1333 | if (crc) { | ||
1334 | if (crc->thr) | ||
1335 | kthread_stop(crc->thr); | ||
1336 | kfree(crc); | ||
1337 | } | ||
1338 | if (data) { | ||
1339 | for (thr = 0; thr < nr_threads; thr++) | ||
1340 | if (data[thr].thr) | ||
1341 | kthread_stop(data[thr].thr); | ||
1342 | vfree(data); | ||
1343 | } | ||
1344 | if (page) vfree(page); | ||
884 | 1345 | ||
885 | return error; | 1346 | return ret; |
886 | } | 1347 | } |
887 | 1348 | ||
888 | /** | 1349 | /** |
diff --git a/kernel/power/user.c b/kernel/power/user.c index 42ddbc6f0de6..6b1ab7a88522 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
13 | #include <linux/syscalls.h> | 13 | #include <linux/syscalls.h> |
14 | #include <linux/reboot.h> | 14 | #include <linux/reboot.h> |
15 | #include <linux/kmod.h> | ||
15 | #include <linux/string.h> | 16 | #include <linux/string.h> |
16 | #include <linux/device.h> | 17 | #include <linux/device.h> |
17 | #include <linux/miscdevice.h> | 18 | #include <linux/miscdevice.h> |
@@ -20,6 +21,7 @@ | |||
20 | #include <linux/swapops.h> | 21 | #include <linux/swapops.h> |
21 | #include <linux/pm.h> | 22 | #include <linux/pm.h> |
22 | #include <linux/fs.h> | 23 | #include <linux/fs.h> |
24 | #include <linux/compat.h> | ||
23 | #include <linux/console.h> | 25 | #include <linux/console.h> |
24 | #include <linux/cpu.h> | 26 | #include <linux/cpu.h> |
25 | #include <linux/freezer.h> | 27 | #include <linux/freezer.h> |
@@ -29,28 +31,6 @@ | |||
29 | 31 | ||
30 | #include "power.h" | 32 | #include "power.h" |
31 | 33 | ||
32 | /* | ||
33 | * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and | ||
34 | * will be removed in the future. They are only preserved here for | ||
35 | * compatibility with existing userland utilities. | ||
36 | */ | ||
37 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | ||
38 | #define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) | ||
39 | |||
40 | #define PMOPS_PREPARE 1 | ||
41 | #define PMOPS_ENTER 2 | ||
42 | #define PMOPS_FINISH 3 | ||
43 | |||
44 | /* | ||
45 | * NOTE: The following ioctl definitions are wrong and have been replaced with | ||
46 | * correct ones. They are only preserved here for compatibility with existing | ||
47 | * userland utilities and will be removed in the future. | ||
48 | */ | ||
49 | #define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) | ||
50 | #define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) | ||
51 | #define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) | ||
52 | #define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) | ||
53 | |||
54 | 34 | ||
55 | #define SNAPSHOT_MINOR 231 | 35 | #define SNAPSHOT_MINOR 231 |
56 | 36 | ||
@@ -70,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
70 | struct snapshot_data *data; | 50 | struct snapshot_data *data; |
71 | int error; | 51 | int error; |
72 | 52 | ||
73 | mutex_lock(&pm_mutex); | 53 | lock_system_sleep(); |
74 | 54 | ||
75 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { | 55 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { |
76 | error = -EBUSY; | 56 | error = -EBUSY; |
@@ -122,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
122 | data->platform_support = 0; | 102 | data->platform_support = 0; |
123 | 103 | ||
124 | Unlock: | 104 | Unlock: |
125 | mutex_unlock(&pm_mutex); | 105 | unlock_system_sleep(); |
126 | 106 | ||
127 | return error; | 107 | return error; |
128 | } | 108 | } |
@@ -131,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
131 | { | 111 | { |
132 | struct snapshot_data *data; | 112 | struct snapshot_data *data; |
133 | 113 | ||
134 | mutex_lock(&pm_mutex); | 114 | lock_system_sleep(); |
135 | 115 | ||
136 | swsusp_free(); | 116 | swsusp_free(); |
137 | free_basic_memory_bitmaps(); | 117 | free_basic_memory_bitmaps(); |
@@ -145,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
145 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 125 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
146 | atomic_inc(&snapshot_device_available); | 126 | atomic_inc(&snapshot_device_available); |
147 | 127 | ||
148 | mutex_unlock(&pm_mutex); | 128 | unlock_system_sleep(); |
149 | 129 | ||
150 | return 0; | 130 | return 0; |
151 | } | 131 | } |
@@ -157,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, | |||
157 | ssize_t res; | 137 | ssize_t res; |
158 | loff_t pg_offp = *offp & ~PAGE_MASK; | 138 | loff_t pg_offp = *offp & ~PAGE_MASK; |
159 | 139 | ||
160 | mutex_lock(&pm_mutex); | 140 | lock_system_sleep(); |
161 | 141 | ||
162 | data = filp->private_data; | 142 | data = filp->private_data; |
163 | if (!data->ready) { | 143 | if (!data->ready) { |
@@ -178,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, | |||
178 | *offp += res; | 158 | *offp += res; |
179 | 159 | ||
180 | Unlock: | 160 | Unlock: |
181 | mutex_unlock(&pm_mutex); | 161 | unlock_system_sleep(); |
182 | 162 | ||
183 | return res; | 163 | return res; |
184 | } | 164 | } |
@@ -190,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
190 | ssize_t res; | 170 | ssize_t res; |
191 | loff_t pg_offp = *offp & ~PAGE_MASK; | 171 | loff_t pg_offp = *offp & ~PAGE_MASK; |
192 | 172 | ||
193 | mutex_lock(&pm_mutex); | 173 | lock_system_sleep(); |
194 | 174 | ||
195 | data = filp->private_data; | 175 | data = filp->private_data; |
196 | 176 | ||
@@ -207,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, | |||
207 | if (res > 0) | 187 | if (res > 0) |
208 | *offp += res; | 188 | *offp += res; |
209 | unlock: | 189 | unlock: |
210 | mutex_unlock(&pm_mutex); | 190 | unlock_system_sleep(); |
211 | 191 | ||
212 | return res; | 192 | return res; |
213 | } | 193 | } |
214 | 194 | ||
215 | static void snapshot_deprecated_ioctl(unsigned int cmd) | ||
216 | { | ||
217 | if (printk_ratelimit()) | ||
218 | printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " | ||
219 | "be removed soon, update your suspend-to-disk " | ||
220 | "utilities\n", | ||
221 | __builtin_return_address(0), cmd); | ||
222 | } | ||
223 | |||
224 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, | 195 | static long snapshot_ioctl(struct file *filp, unsigned int cmd, |
225 | unsigned long arg) | 196 | unsigned long arg) |
226 | { | 197 | { |
@@ -256,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
256 | break; | 227 | break; |
257 | 228 | ||
258 | error = freeze_processes(); | 229 | error = freeze_processes(); |
259 | if (error) { | 230 | if (error) |
260 | thaw_processes(); | ||
261 | usermodehelper_enable(); | 231 | usermodehelper_enable(); |
262 | } | 232 | else |
263 | if (!error) | ||
264 | data->frozen = 1; | 233 | data->frozen = 1; |
265 | break; | 234 | break; |
266 | 235 | ||
@@ -273,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
273 | data->frozen = 0; | 242 | data->frozen = 0; |
274 | break; | 243 | break; |
275 | 244 | ||
276 | case SNAPSHOT_ATOMIC_SNAPSHOT: | ||
277 | snapshot_deprecated_ioctl(cmd); | ||
278 | case SNAPSHOT_CREATE_IMAGE: | 245 | case SNAPSHOT_CREATE_IMAGE: |
279 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | 246 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { |
280 | error = -EPERM; | 247 | error = -EPERM; |
@@ -282,10 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
282 | } | 249 | } |
283 | pm_restore_gfp_mask(); | 250 | pm_restore_gfp_mask(); |
284 | error = hibernation_snapshot(data->platform_support); | 251 | error = hibernation_snapshot(data->platform_support); |
285 | if (!error) | 252 | if (!error) { |
286 | error = put_user(in_suspend, (int __user *)arg); | 253 | error = put_user(in_suspend, (int __user *)arg); |
287 | if (!error) | 254 | if (!error && !freezer_test_done) |
288 | data->ready = 1; | 255 | data->ready = 1; |
256 | if (freezer_test_done) { | ||
257 | freezer_test_done = false; | ||
258 | thaw_processes(); | ||
259 | } | ||
260 | } | ||
289 | break; | 261 | break; |
290 | 262 | ||
291 | case SNAPSHOT_ATOMIC_RESTORE: | 263 | case SNAPSHOT_ATOMIC_RESTORE: |
@@ -304,8 +276,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
304 | data->ready = 0; | 276 | data->ready = 0; |
305 | break; | 277 | break; |
306 | 278 | ||
307 | case SNAPSHOT_SET_IMAGE_SIZE: | ||
308 | snapshot_deprecated_ioctl(cmd); | ||
309 | case SNAPSHOT_PREF_IMAGE_SIZE: | 279 | case SNAPSHOT_PREF_IMAGE_SIZE: |
310 | image_size = arg; | 280 | image_size = arg; |
311 | break; | 281 | break; |
@@ -320,16 +290,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
320 | error = put_user(size, (loff_t __user *)arg); | 290 | error = put_user(size, (loff_t __user *)arg); |
321 | break; | 291 | break; |
322 | 292 | ||
323 | case SNAPSHOT_AVAIL_SWAP: | ||
324 | snapshot_deprecated_ioctl(cmd); | ||
325 | case SNAPSHOT_AVAIL_SWAP_SIZE: | 293 | case SNAPSHOT_AVAIL_SWAP_SIZE: |
326 | size = count_swap_pages(data->swap, 1); | 294 | size = count_swap_pages(data->swap, 1); |
327 | size <<= PAGE_SHIFT; | 295 | size <<= PAGE_SHIFT; |
328 | error = put_user(size, (loff_t __user *)arg); | 296 | error = put_user(size, (loff_t __user *)arg); |
329 | break; | 297 | break; |
330 | 298 | ||
331 | case SNAPSHOT_GET_SWAP_PAGE: | ||
332 | snapshot_deprecated_ioctl(cmd); | ||
333 | case SNAPSHOT_ALLOC_SWAP_PAGE: | 299 | case SNAPSHOT_ALLOC_SWAP_PAGE: |
334 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | 300 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { |
335 | error = -ENODEV; | 301 | error = -ENODEV; |
@@ -352,27 +318,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
352 | free_all_swap_pages(data->swap); | 318 | free_all_swap_pages(data->swap); |
353 | break; | 319 | break; |
354 | 320 | ||
355 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ | ||
356 | snapshot_deprecated_ioctl(cmd); | ||
357 | if (!swsusp_swap_in_use()) { | ||
358 | /* | ||
359 | * User space encodes device types as two-byte values, | ||
360 | * so we need to recode them | ||
361 | */ | ||
362 | if (old_decode_dev(arg)) { | ||
363 | data->swap = swap_type_of(old_decode_dev(arg), | ||
364 | 0, NULL); | ||
365 | if (data->swap < 0) | ||
366 | error = -ENODEV; | ||
367 | } else { | ||
368 | data->swap = -1; | ||
369 | error = -EINVAL; | ||
370 | } | ||
371 | } else { | ||
372 | error = -EPERM; | ||
373 | } | ||
374 | break; | ||
375 | |||
376 | case SNAPSHOT_S2RAM: | 321 | case SNAPSHOT_S2RAM: |
377 | if (!data->frozen) { | 322 | if (!data->frozen) { |
378 | error = -EPERM; | 323 | error = -EPERM; |
@@ -395,33 +340,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
395 | error = hibernation_platform_enter(); | 340 | error = hibernation_platform_enter(); |
396 | break; | 341 | break; |
397 | 342 | ||
398 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ | ||
399 | snapshot_deprecated_ioctl(cmd); | ||
400 | error = -EINVAL; | ||
401 | |||
402 | switch (arg) { | ||
403 | |||
404 | case PMOPS_PREPARE: | ||
405 | data->platform_support = 1; | ||
406 | error = 0; | ||
407 | break; | ||
408 | |||
409 | case PMOPS_ENTER: | ||
410 | if (data->platform_support) | ||
411 | error = hibernation_platform_enter(); | ||
412 | break; | ||
413 | |||
414 | case PMOPS_FINISH: | ||
415 | if (data->platform_support) | ||
416 | error = 0; | ||
417 | break; | ||
418 | |||
419 | default: | ||
420 | printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); | ||
421 | |||
422 | } | ||
423 | break; | ||
424 | |||
425 | case SNAPSHOT_SET_SWAP_AREA: | 343 | case SNAPSHOT_SET_SWAP_AREA: |
426 | if (swsusp_swap_in_use()) { | 344 | if (swsusp_swap_in_use()) { |
427 | error = -EPERM; | 345 | error = -EPERM; |
@@ -463,6 +381,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
463 | return error; | 381 | return error; |
464 | } | 382 | } |
465 | 383 | ||
384 | #ifdef CONFIG_COMPAT | ||
385 | |||
386 | struct compat_resume_swap_area { | ||
387 | compat_loff_t offset; | ||
388 | u32 dev; | ||
389 | } __packed; | ||
390 | |||
391 | static long | ||
392 | snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | ||
393 | { | ||
394 | BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); | ||
395 | |||
396 | switch (cmd) { | ||
397 | case SNAPSHOT_GET_IMAGE_SIZE: | ||
398 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
399 | case SNAPSHOT_ALLOC_SWAP_PAGE: { | ||
400 | compat_loff_t __user *uoffset = compat_ptr(arg); | ||
401 | loff_t offset; | ||
402 | mm_segment_t old_fs; | ||
403 | int err; | ||
404 | |||
405 | old_fs = get_fs(); | ||
406 | set_fs(KERNEL_DS); | ||
407 | err = snapshot_ioctl(file, cmd, (unsigned long) &offset); | ||
408 | set_fs(old_fs); | ||
409 | if (!err && put_user(offset, uoffset)) | ||
410 | err = -EFAULT; | ||
411 | return err; | ||
412 | } | ||
413 | |||
414 | case SNAPSHOT_CREATE_IMAGE: | ||
415 | return snapshot_ioctl(file, cmd, | ||
416 | (unsigned long) compat_ptr(arg)); | ||
417 | |||
418 | case SNAPSHOT_SET_SWAP_AREA: { | ||
419 | struct compat_resume_swap_area __user *u_swap_area = | ||
420 | compat_ptr(arg); | ||
421 | struct resume_swap_area swap_area; | ||
422 | mm_segment_t old_fs; | ||
423 | int err; | ||
424 | |||
425 | err = get_user(swap_area.offset, &u_swap_area->offset); | ||
426 | err |= get_user(swap_area.dev, &u_swap_area->dev); | ||
427 | if (err) | ||
428 | return -EFAULT; | ||
429 | old_fs = get_fs(); | ||
430 | set_fs(KERNEL_DS); | ||
431 | err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA, | ||
432 | (unsigned long) &swap_area); | ||
433 | set_fs(old_fs); | ||
434 | return err; | ||
435 | } | ||
436 | |||
437 | default: | ||
438 | return snapshot_ioctl(file, cmd, arg); | ||
439 | } | ||
440 | } | ||
441 | |||
442 | #endif /* CONFIG_COMPAT */ | ||
443 | |||
466 | static const struct file_operations snapshot_fops = { | 444 | static const struct file_operations snapshot_fops = { |
467 | .open = snapshot_open, | 445 | .open = snapshot_open, |
468 | .release = snapshot_release, | 446 | .release = snapshot_release, |
@@ -470,6 +448,9 @@ static const struct file_operations snapshot_fops = { | |||
470 | .write = snapshot_write, | 448 | .write = snapshot_write, |
471 | .llseek = no_llseek, | 449 | .llseek = no_llseek, |
472 | .unlocked_ioctl = snapshot_ioctl, | 450 | .unlocked_ioctl = snapshot_ioctl, |
451 | #ifdef CONFIG_COMPAT | ||
452 | .compat_ioctl = snapshot_compat_ioctl, | ||
453 | #endif | ||
473 | }; | 454 | }; |
474 | 455 | ||
475 | static struct miscdevice snapshot_device = { | 456 | static struct miscdevice snapshot_device = { |
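The snapshot_compat_ioctl() additions above exist because a 64-bit kernel and 32-bit userland disagree on the layout of resume_swap_area: loff_t is 8-byte aligned natively, while the compat variant is packed. A minimal layout check illustrating the mismatch; the struct contents mirror the diff, but this is a standalone sketch and the exact sizes depend on the ABI.

#include <stdio.h>
#include <stdint.h>

struct resume_swap_area {			/* native 64-bit layout */
	int64_t offset;				/* loff_t */
	uint32_t dev;				/* __u32 */
};						/* padded to 16 bytes */

struct compat_resume_swap_area {		/* 32-bit ABI layout */
	int64_t offset;				/* compat_loff_t */
	uint32_t dev;
} __attribute__((packed));			/* 12 bytes, no tail padding */

int main(void)
{
	printf("native: %zu bytes\n", sizeof(struct resume_swap_area));
	printf("compat: %zu bytes\n", sizeof(struct compat_resume_swap_area));
	return 0;
}

This is why the thunk in the hunk copies offset and dev out of the 32-bit user struct field by field into a native resume_swap_area before re-entering snapshot_ioctl() under KERNEL_DS.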
diff --git a/kernel/printk.c b/kernel/printk.c index 28a40d8171b8..13c0a1143f49 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -100,7 +100,7 @@ static int console_locked, console_suspended; | |||
100 | * It is also used in interesting ways to provide interlocking in | 100 | * It is also used in interesting ways to provide interlocking in |
101 | * console_unlock();. | 101 | * console_unlock();. |
102 | */ | 102 | */ |
103 | static DEFINE_SPINLOCK(logbuf_lock); | 103 | static DEFINE_RAW_SPINLOCK(logbuf_lock); |
104 | 104 | ||
105 | #define LOG_BUF_MASK (log_buf_len-1) | 105 | #define LOG_BUF_MASK (log_buf_len-1) |
106 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) | 106 | #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) |
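The LOG_BUF_MASK/LOG_BUF macros shown in context above rely on log_buf_len being a power of two, so an index wraps with a mask instead of a modulo and the start/end counters can run freely. A toy user-space version of the same indexing scheme; the buffer size and names are illustrative, not the kernel's.

#include <stdio.h>

#define BUF_LEN		8U			/* must be a power of two */
#define BUF_MASK	(BUF_LEN - 1)

static char buf[BUF_LEN];
static unsigned int log_start, log_end;		/* free-running counters */

static void emit_char(char c)
{
	buf[log_end++ & BUF_MASK] = c;
	if (log_end - log_start > BUF_LEN)	/* oldest data was overwritten */
		log_start = log_end - BUF_LEN;
}

int main(void)
{
	const char *msg = "hello, ring buffer";

	while (*msg)
		emit_char(*msg++);
	/* Only the last BUF_LEN characters survive: prints "g buffer". */
	while (log_start != log_end)
		putchar(buf[log_start++ & BUF_MASK]);
	putchar('\n');
	return 0;
}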
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early) | |||
199 | unsigned long mem; | 199 | unsigned long mem; |
200 | 200 | ||
201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); | 201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); |
202 | if (mem == MEMBLOCK_ERROR) | 202 | if (!mem) |
203 | return; | 203 | return; |
204 | new_log_buf = __va(mem); | 204 | new_log_buf = __va(mem); |
205 | } else { | 205 | } else { |
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early) | |||
212 | return; | 212 | return; |
213 | } | 213 | } |
214 | 214 | ||
215 | spin_lock_irqsave(&logbuf_lock, flags); | 215 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
216 | log_buf_len = new_log_buf_len; | 216 | log_buf_len = new_log_buf_len; |
217 | log_buf = new_log_buf; | 217 | log_buf = new_log_buf; |
218 | new_log_buf_len = 0; | 218 | new_log_buf_len = 0; |
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early) | |||
230 | log_start -= offset; | 230 | log_start -= offset; |
231 | con_start -= offset; | 231 | con_start -= offset; |
232 | log_end -= offset; | 232 | log_end -= offset; |
233 | spin_unlock_irqrestore(&logbuf_lock, flags); | 233 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
234 | 234 | ||
235 | pr_info("log_buf_len: %d\n", log_buf_len); | 235 | pr_info("log_buf_len: %d\n", log_buf_len); |
236 | pr_info("early log buf free: %d(%d%%)\n", | 236 | pr_info("early log buf free: %d(%d%%)\n", |
@@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
365 | if (error) | 365 | if (error) |
366 | goto out; | 366 | goto out; |
367 | i = 0; | 367 | i = 0; |
368 | spin_lock_irq(&logbuf_lock); | 368 | raw_spin_lock_irq(&logbuf_lock); |
369 | while (!error && (log_start != log_end) && i < len) { | 369 | while (!error && (log_start != log_end) && i < len) { |
370 | c = LOG_BUF(log_start); | 370 | c = LOG_BUF(log_start); |
371 | log_start++; | 371 | log_start++; |
372 | spin_unlock_irq(&logbuf_lock); | 372 | raw_spin_unlock_irq(&logbuf_lock); |
373 | error = __put_user(c,buf); | 373 | error = __put_user(c,buf); |
374 | buf++; | 374 | buf++; |
375 | i++; | 375 | i++; |
376 | cond_resched(); | 376 | cond_resched(); |
377 | spin_lock_irq(&logbuf_lock); | 377 | raw_spin_lock_irq(&logbuf_lock); |
378 | } | 378 | } |
379 | spin_unlock_irq(&logbuf_lock); | 379 | raw_spin_unlock_irq(&logbuf_lock); |
380 | if (!error) | 380 | if (!error) |
381 | error = i; | 381 | error = i; |
382 | break; | 382 | break; |
@@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
399 | count = len; | 399 | count = len; |
400 | if (count > log_buf_len) | 400 | if (count > log_buf_len) |
401 | count = log_buf_len; | 401 | count = log_buf_len; |
402 | spin_lock_irq(&logbuf_lock); | 402 | raw_spin_lock_irq(&logbuf_lock); |
403 | if (count > logged_chars) | 403 | if (count > logged_chars) |
404 | count = logged_chars; | 404 | count = logged_chars; |
405 | if (do_clear) | 405 | if (do_clear) |
@@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) | |||
416 | if (j + log_buf_len < log_end) | 416 | if (j + log_buf_len < log_end) |
417 | break; | 417 | break; |
418 | c = LOG_BUF(j); | 418 | c = LOG_BUF(j); |
419 | spin_unlock_irq(&logbuf_lock); | 419 | raw_spin_unlock_irq(&logbuf_lock); |
420 | error = __put_user(c,&buf[count-1-i]); | 420 | error = __put_user(c,&buf[count-1-i]); |
421 | cond_resched(); | 421 | cond_resched(); |
422 | spin_lock_irq(&logbuf_lock); | 422 | raw_spin_lock_irq(&logbuf_lock); |
423 | } | 423 | } |
424 | spin_unlock_irq(&logbuf_lock); | 424 | raw_spin_unlock_irq(&logbuf_lock); |
425 | if (error) | 425 | if (error) |
426 | break; | 426 | break; |
427 | error = i; | 427 | error = i; |
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end) | |||
521 | } | 521 | } |
522 | } | 522 | } |
523 | 523 | ||
524 | static int __read_mostly ignore_loglevel; | 524 | static bool __read_mostly ignore_loglevel; |
525 | 525 | ||
526 | static int __init ignore_loglevel_setup(char *str) | 526 | static int __init ignore_loglevel_setup(char *str) |
527 | { | 527 | { |
@@ -532,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str) | |||
532 | } | 532 | } |
533 | 533 | ||
534 | early_param("ignore_loglevel", ignore_loglevel_setup); | 534 | early_param("ignore_loglevel", ignore_loglevel_setup); |
535 | module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); | ||
536 | MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" | ||
537 | "print all kernel messages to the console."); | ||
535 | 538 | ||
536 | /* | 539 | /* |
537 | * Write out chars from start to end - 1 inclusive | 540 | * Write out chars from start to end - 1 inclusive |
@@ -592,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special) | |||
592 | /* multi digit including the level and facility number */ | 595 | /* multi digit including the level and facility number */ |
593 | char *endp = NULL; | 596 | char *endp = NULL; |
594 | 597 | ||
595 | if (p[1] < '0' && p[1] > '9') | ||
596 | return 0; | ||
597 | |||
598 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); | 598 | lev = (simple_strtoul(&p[1], &endp, 10) & 7); |
599 | if (endp == NULL || endp[0] != '>') | 599 | if (endp == NULL || endp[0] != '>') |
600 | return 0; | 600 | return 0; |
@@ -688,16 +688,17 @@ static void zap_locks(void) | |||
688 | 688 | ||
689 | oops_timestamp = jiffies; | 689 | oops_timestamp = jiffies; |
690 | 690 | ||
691 | debug_locks_off(); | ||
691 | /* If a crash is occurring, make sure we can't deadlock */ | 692 | /* If a crash is occurring, make sure we can't deadlock */ |
692 | spin_lock_init(&logbuf_lock); | 693 | raw_spin_lock_init(&logbuf_lock); |
693 | /* And make sure that we print immediately */ | 694 | /* And make sure that we print immediately */ |
694 | sema_init(&console_sem, 1); | 695 | sema_init(&console_sem, 1); |
695 | } | 696 | } |
696 | 697 | ||
697 | #if defined(CONFIG_PRINTK_TIME) | 698 | #if defined(CONFIG_PRINTK_TIME) |
698 | static int printk_time = 1; | 699 | static bool printk_time = 1; |
699 | #else | 700 | #else |
700 | static int printk_time = 0; | 701 | static bool printk_time = 0; |
701 | #endif | 702 | #endif |
702 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); | 703 | module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); |
703 | 704 | ||
@@ -802,9 +803,9 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
802 | } | 803 | } |
803 | } | 804 | } |
804 | printk_cpu = UINT_MAX; | 805 | printk_cpu = UINT_MAX; |
805 | spin_unlock(&logbuf_lock); | ||
806 | if (wake) | 806 | if (wake) |
807 | up(&console_sem); | 807 | up(&console_sem); |
808 | raw_spin_unlock(&logbuf_lock); | ||
808 | return retval; | 809 | return retval; |
809 | } | 810 | } |
810 | static const char recursion_bug_msg [] = | 811 | static const char recursion_bug_msg [] = |
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
840 | boot_delay_msec(); | 841 | boot_delay_msec(); |
841 | printk_delay(); | 842 | printk_delay(); |
842 | 843 | ||
843 | preempt_disable(); | ||
844 | /* This stops the holder of console_sem just where we want him */ | 844 | /* This stops the holder of console_sem just where we want him */ |
845 | raw_local_irq_save(flags); | 845 | local_irq_save(flags); |
846 | this_cpu = smp_processor_id(); | 846 | this_cpu = smp_processor_id(); |
847 | 847 | ||
848 | /* | 848 | /* |
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
856 | * recursion and return - but flag the recursion so that | 856 | * recursion and return - but flag the recursion so that |
857 | * it can be printed at the next appropriate moment: | 857 | * it can be printed at the next appropriate moment: |
858 | */ | 858 | */ |
859 | if (!oops_in_progress) { | 859 | if (!oops_in_progress && !lockdep_recursing(current)) { |
860 | recursion_bug = 1; | 860 | recursion_bug = 1; |
861 | goto out_restore_irqs; | 861 | goto out_restore_irqs; |
862 | } | 862 | } |
@@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
864 | } | 864 | } |
865 | 865 | ||
866 | lockdep_off(); | 866 | lockdep_off(); |
867 | spin_lock(&logbuf_lock); | 867 | raw_spin_lock(&logbuf_lock); |
868 | printk_cpu = this_cpu; | 868 | printk_cpu = this_cpu; |
869 | 869 | ||
870 | if (recursion_bug) { | 870 | if (recursion_bug) { |
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
962 | 962 | ||
963 | lockdep_on(); | 963 | lockdep_on(); |
964 | out_restore_irqs: | 964 | out_restore_irqs: |
965 | raw_local_irq_restore(flags); | 965 | local_irq_restore(flags); |
966 | 966 | ||
967 | preempt_enable(); | ||
968 | return printed_len; | 967 | return printed_len; |
969 | } | 968 | } |
970 | EXPORT_SYMBOL(printk); | 969 | EXPORT_SYMBOL(printk); |
@@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha | |||
1099 | return -1; | 1098 | return -1; |
1100 | } | 1099 | } |
1101 | 1100 | ||
1102 | int console_suspend_enabled = 1; | 1101 | bool console_suspend_enabled = 1; |
1103 | EXPORT_SYMBOL(console_suspend_enabled); | 1102 | EXPORT_SYMBOL(console_suspend_enabled); |
1104 | 1103 | ||
1105 | static int __init console_suspend_disable(char *str) | 1104 | static int __init console_suspend_disable(char *str) |
@@ -1108,6 +1107,10 @@ static int __init console_suspend_disable(char *str) | |||
1108 | return 1; | 1107 | return 1; |
1109 | } | 1108 | } |
1110 | __setup("no_console_suspend", console_suspend_disable); | 1109 | __setup("no_console_suspend", console_suspend_disable); |
1110 | module_param_named(console_suspend, console_suspend_enabled, | ||
1111 | bool, S_IRUGO | S_IWUSR); | ||
1112 | MODULE_PARM_DESC(console_suspend, "suspend console during suspend" | ||
1113 | " and hibernate operations"); | ||
1111 | 1114 | ||
1112 | /** | 1115 | /** |
1113 | * suspend_console - suspend the console subsystem | 1116 | * suspend_console - suspend the console subsystem |
@@ -1257,14 +1260,14 @@ void console_unlock(void) | |||
1257 | 1260 | ||
1258 | again: | 1261 | again: |
1259 | for ( ; ; ) { | 1262 | for ( ; ; ) { |
1260 | spin_lock_irqsave(&logbuf_lock, flags); | 1263 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1261 | wake_klogd |= log_start - log_end; | 1264 | wake_klogd |= log_start - log_end; |
1262 | if (con_start == log_end) | 1265 | if (con_start == log_end) |
1263 | break; /* Nothing to print */ | 1266 | break; /* Nothing to print */ |
1264 | _con_start = con_start; | 1267 | _con_start = con_start; |
1265 | _log_end = log_end; | 1268 | _log_end = log_end; |
1266 | con_start = log_end; /* Flush */ | 1269 | con_start = log_end; /* Flush */ |
1267 | spin_unlock(&logbuf_lock); | 1270 | raw_spin_unlock(&logbuf_lock); |
1268 | stop_critical_timings(); /* don't trace print latency */ | 1271 | stop_critical_timings(); /* don't trace print latency */ |
1269 | call_console_drivers(_con_start, _log_end); | 1272 | call_console_drivers(_con_start, _log_end); |
1270 | start_critical_timings(); | 1273 | start_critical_timings(); |
@@ -1276,7 +1279,7 @@ again: | |||
1276 | if (unlikely(exclusive_console)) | 1279 | if (unlikely(exclusive_console)) |
1277 | exclusive_console = NULL; | 1280 | exclusive_console = NULL; |
1278 | 1281 | ||
1279 | spin_unlock(&logbuf_lock); | 1282 | raw_spin_unlock(&logbuf_lock); |
1280 | 1283 | ||
1281 | up(&console_sem); | 1284 | up(&console_sem); |
1282 | 1285 | ||
@@ -1286,10 +1289,11 @@ again: | |||
1286 | * there's a new owner and the console_unlock() from them will do the | 1289 | * there's a new owner and the console_unlock() from them will do the |
1287 | * flush, no worries. | 1290 | * flush, no worries. |
1288 | */ | 1291 | */ |
1289 | spin_lock(&logbuf_lock); | 1292 | raw_spin_lock(&logbuf_lock); |
1290 | if (con_start != log_end) | 1293 | if (con_start != log_end) |
1291 | retry = 1; | 1294 | retry = 1; |
1292 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1295 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1296 | |||
1293 | if (retry && console_trylock()) | 1297 | if (retry && console_trylock()) |
1294 | goto again; | 1298 | goto again; |
1295 | 1299 | ||
@@ -1522,9 +1526,9 @@ void register_console(struct console *newcon) | |||
1522 | * console_unlock(); will print out the buffered messages | 1526 | * console_unlock(); will print out the buffered messages |
1523 | * for us. | 1527 | * for us. |
1524 | */ | 1528 | */ |
1525 | spin_lock_irqsave(&logbuf_lock, flags); | 1529 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1526 | con_start = log_start; | 1530 | con_start = log_start; |
1527 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1531 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1528 | /* | 1532 | /* |
1529 | * We're about to replay the log buffer. Only do this to the | 1533 | * We're about to replay the log buffer. Only do this to the |
1530 | * just-registered console to avoid excessive message spam to | 1534 | * just-registered console to avoid excessive message spam to |
@@ -1731,10 +1735,10 @@ void kmsg_dump(enum kmsg_dump_reason reason) | |||
1731 | /* Theoretically, the log could move on after we do this, but | 1735 | /* Theoretically, the log could move on after we do this, but |
1732 | there's not a lot we can do about that. The new messages | 1736 | there's not a lot we can do about that. The new messages |
1733 | will overwrite the start of what we dump. */ | 1737 | will overwrite the start of what we dump. */ |
1734 | spin_lock_irqsave(&logbuf_lock, flags); | 1738 | raw_spin_lock_irqsave(&logbuf_lock, flags); |
1735 | end = log_end & LOG_BUF_MASK; | 1739 | end = log_end & LOG_BUF_MASK; |
1736 | chars = logged_chars; | 1740 | chars = logged_chars; |
1737 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1741 | raw_spin_unlock_irqrestore(&logbuf_lock, flags); |
1738 | 1742 | ||
1739 | if (chars > end) { | 1743 | if (chars > end) { |
1740 | s1 = log_buf + log_buf_len - chars + end; | 1744 | s1 = log_buf + log_buf_len - chars + end; |
diff --git a/kernel/profile.c b/kernel/profile.c index 961b389fe52f..76b8e77773ee 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -13,7 +13,7 @@ | |||
13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 | 13 | * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 |
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/module.h> | 16 | #include <linux/export.h> |
17 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
18 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
19 | #include <linux/notifier.h> | 19 | #include <linux/notifier.h> |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c890ac9a7962..00ab2ca5ed11 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -8,7 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/capability.h> | 10 | #include <linux/capability.h> |
11 | #include <linux/module.h> | 11 | #include <linux/export.h> |
12 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
13 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child) | |||
96 | */ | 96 | */ |
97 | if (!(child->flags & PF_EXITING) && | 97 | if (!(child->flags & PF_EXITING) && |
98 | (child->signal->flags & SIGNAL_STOP_STOPPED || | 98 | (child->signal->flags & SIGNAL_STOP_STOPPED || |
99 | child->signal->group_stop_count)) | 99 | child->signal->group_stop_count)) { |
100 | child->jobctl |= JOBCTL_STOP_PENDING; | 100 | child->jobctl |= JOBCTL_STOP_PENDING; |
101 | 101 | ||
102 | /* | ||
103 | * This is only possible if this thread was cloned by the | ||
104 | * traced task running in the stopped group, set the signal | ||
105 | * for the future reports. | ||
106 | * FIXME: we should change ptrace_init_task() to handle this | ||
107 | * case. | ||
108 | */ | ||
109 | if (!(child->jobctl & JOBCTL_STOP_SIGMASK)) | ||
110 | child->jobctl |= SIGSTOP; | ||
111 | } | ||
112 | |||
102 | /* | 113 | /* |
103 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick | 114 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick |
104 | * @child in the butt. Note that @resume should be used iff @child | 115 | * @child in the butt. Note that @resume should be used iff @child |
diff --git a/kernel/range.c b/kernel/range.c index 37fa9b99ad58..9b8ae2d6ed68 100644 --- a/kernel/range.c +++ b/kernel/range.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Range add and subtract | 2 | * Range add and subtract |
3 | */ | 3 | */ |
4 | #include <linux/module.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/init.h> | 5 | #include <linux/init.h> |
6 | #include <linux/sort.h> | 6 | #include <linux/sort.h> |
7 | 7 | ||
diff --git a/kernel/rcu.h b/kernel/rcu.h new file mode 100644 index 000000000000..aa88baab5f78 --- /dev/null +++ b/kernel/rcu.h | |||
@@ -0,0 +1,92 @@ | |||
1 | /* | ||
2 | * Read-Copy Update definitions shared among RCU implementations. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2011 | ||
19 | * | ||
20 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
21 | */ | ||
22 | |||
23 | #ifndef __LINUX_RCU_H | ||
24 | #define __LINUX_RCU_H | ||
25 | |||
26 | #ifdef CONFIG_RCU_TRACE | ||
27 | #define RCU_TRACE(stmt) stmt | ||
28 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
29 | #define RCU_TRACE(stmt) | ||
30 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
31 | |||
32 | /* | ||
33 | * Process-level increment to ->dynticks_nesting field. This allows for | ||
34 | * architectures that use half-interrupts and half-exceptions from | ||
35 | * process context. | ||
36 | */ | ||
37 | #define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) | ||
38 | |||
39 | /* | ||
40 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | ||
41 | * by call_rcu() and rcu callback execution, and are therefore not part of the | ||
42 | * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. | ||
43 | */ | ||
44 | |||
45 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
46 | # define STATE_RCU_HEAD_READY 0 | ||
47 | # define STATE_RCU_HEAD_QUEUED 1 | ||
48 | |||
49 | extern struct debug_obj_descr rcuhead_debug_descr; | ||
50 | |||
51 | static inline void debug_rcu_head_queue(struct rcu_head *head) | ||
52 | { | ||
53 | WARN_ON_ONCE((unsigned long)head & 0x3); | ||
54 | debug_object_activate(head, &rcuhead_debug_descr); | ||
55 | debug_object_active_state(head, &rcuhead_debug_descr, | ||
56 | STATE_RCU_HEAD_READY, | ||
57 | STATE_RCU_HEAD_QUEUED); | ||
58 | } | ||
59 | |||
60 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | ||
61 | { | ||
62 | debug_object_active_state(head, &rcuhead_debug_descr, | ||
63 | STATE_RCU_HEAD_QUEUED, | ||
64 | STATE_RCU_HEAD_READY); | ||
65 | debug_object_deactivate(head, &rcuhead_debug_descr); | ||
66 | } | ||
67 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
68 | static inline void debug_rcu_head_queue(struct rcu_head *head) | ||
69 | { | ||
70 | } | ||
71 | |||
72 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | ||
73 | { | ||
74 | } | ||
75 | #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
76 | |||
77 | extern void kfree(const void *); | ||
78 | |||
79 | static inline void __rcu_reclaim(char *rn, struct rcu_head *head) | ||
80 | { | ||
81 | unsigned long offset = (unsigned long)head->func; | ||
82 | |||
83 | if (__is_kfree_rcu_offset(offset)) { | ||
84 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | ||
85 | kfree((void *)head - offset); | ||
86 | } else { | ||
87 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | ||
88 | head->func(head); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | #endif /* __LINUX_RCU_H */ | ||
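__rcu_reclaim() above treats small values of head->func as a byte offset rather than a callback address: kfree_rcu() records the offset of the rcu_head within its enclosing structure, and the reclaimer subtracts that offset to recover and free the original object. A standalone sketch of the trick under simplified assumptions; the 4096 threshold and all names below are illustrative, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>

struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};

struct foo {
	int payload;
	struct rcu_head rcu;			/* embedded callback head */
};

/* Small values can never be valid callback addresses, so they can be
 * reinterpreted as offsets (the role played by __is_kfree_rcu_offset()). */
static int is_kfree_offset(uintptr_t v)
{
	return v < 4096;
}

static void reclaim(struct rcu_head *head)
{
	uintptr_t offset = (uintptr_t)head->func;

	if (is_kfree_offset(offset))
		free((char *)head - offset);	/* kfree((void *)head - offset) */
	else
		head->func(head);		/* ordinary callback */
}

int main(void)
{
	struct foo *p = malloc(sizeof(*p));

	p->payload = 42;
	/* What a kfree_rcu(p, rcu)-style call would record in the head: */
	p->rcu.func = (void (*)(struct rcu_head *))
		      (uintptr_t)offsetof(struct foo, rcu);
	reclaim(&p->rcu);			/* frees the whole struct foo */
	puts("object reclaimed");
	return 0;
}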
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ddddb320be61..2bc4e135ff23 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -43,9 +43,14 @@ | |||
43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
44 | #include <linux/cpu.h> | 44 | #include <linux/cpu.h> |
45 | #include <linux/mutex.h> | 45 | #include <linux/mutex.h> |
46 | #include <linux/module.h> | 46 | #include <linux/export.h> |
47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
48 | 48 | ||
49 | #define CREATE_TRACE_POINTS | ||
50 | #include <trace/events/rcu.h> | ||
51 | |||
52 | #include "rcu.h" | ||
53 | |||
49 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
50 | static struct lock_class_key rcu_lock_key; | 55 | static struct lock_class_key rcu_lock_key; |
51 | struct lockdep_map rcu_lock_map = | 56 | struct lockdep_map rcu_lock_map = |
@@ -88,17 +93,24 @@ int rcu_read_lock_bh_held(void) | |||
88 | { | 93 | { |
89 | if (!debug_lockdep_rcu_enabled()) | 94 | if (!debug_lockdep_rcu_enabled()) |
90 | return 1; | 95 | return 1; |
96 | if (rcu_is_cpu_idle()) | ||
97 | return 0; | ||
91 | return in_softirq() || irqs_disabled(); | 98 | return in_softirq() || irqs_disabled(); |
92 | } | 99 | } |
93 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | 100 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); |
94 | 101 | ||
95 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 102 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
96 | 103 | ||
104 | struct rcu_synchronize { | ||
105 | struct rcu_head head; | ||
106 | struct completion completion; | ||
107 | }; | ||
108 | |||
97 | /* | 109 | /* |
98 | * Awaken the corresponding synchronize_rcu() instance now that a | 110 | * Awaken the corresponding synchronize_rcu() instance now that a |
99 | * grace period has elapsed. | 111 | * grace period has elapsed. |
100 | */ | 112 | */ |
101 | void wakeme_after_rcu(struct rcu_head *head) | 113 | static void wakeme_after_rcu(struct rcu_head *head) |
102 | { | 114 | { |
103 | struct rcu_synchronize *rcu; | 115 | struct rcu_synchronize *rcu; |
104 | 116 | ||
@@ -106,6 +118,20 @@ void wakeme_after_rcu(struct rcu_head *head) | |||
106 | complete(&rcu->completion); | 118 | complete(&rcu->completion); |
107 | } | 119 | } |
108 | 120 | ||
121 | void wait_rcu_gp(call_rcu_func_t crf) | ||
122 | { | ||
123 | struct rcu_synchronize rcu; | ||
124 | |||
125 | init_rcu_head_on_stack(&rcu.head); | ||
126 | init_completion(&rcu.completion); | ||
127 | /* Will wake me after RCU finished. */ | ||
128 | crf(&rcu.head, wakeme_after_rcu); | ||
129 | /* Wait for it. */ | ||
130 | wait_for_completion(&rcu.completion); | ||
131 | destroy_rcu_head_on_stack(&rcu.head); | ||
132 | } | ||
133 | EXPORT_SYMBOL_GPL(wait_rcu_gp); | ||
134 | |||
109 | #ifdef CONFIG_PROVE_RCU | 135 | #ifdef CONFIG_PROVE_RCU |
110 | /* | 136 | /* |
111 | * wrapper function to avoid #include problems. | 137 | * wrapper function to avoid #include problems. |
@@ -292,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = { | |||
292 | }; | 318 | }; |
293 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | 319 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); |
294 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | 320 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ |
321 | |||
322 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | ||
323 | void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp) | ||
324 | { | ||
325 | trace_rcu_torture_read(rcutorturename, rhp); | ||
326 | } | ||
327 | EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | ||
328 | #else | ||
329 | #define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0) | ||
330 | #endif | ||
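wait_rcu_gp() added above is a small adapter: it turns any asynchronous call_rcu()-style primitive into a synchronous wait by queueing a callback that fires a completion and then blocking on that completion. A user-space analogy with pthreads, with the grace period collapsed to a helper thread; every name here is illustrative rather than kernel API.

#include <pthread.h>
#include <stdio.h>

struct rcu_head;
typedef void (*rcu_callback_t)(struct rcu_head *);

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	int done;
};

struct rcu_head {
	rcu_callback_t func;
	struct completion *completion;
};

/* Stand-in for call_rcu(): a detached thread plays the grace period and
 * invokes the callback "later". */
static void *grace_period(void *arg)
{
	struct rcu_head *head = arg;

	head->func(head);
	return NULL;
}

static void fake_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
	pthread_t t;

	head->func = func;
	pthread_create(&t, NULL, grace_period, head);
	pthread_detach(t);
}

/* Counterpart of wakeme_after_rcu(): signal the embedded completion. */
static void wakeme_after_gp(struct rcu_head *head)
{
	struct completion *c = head->completion;

	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->wait);
	pthread_mutex_unlock(&c->lock);
}

/* Counterpart of wait_rcu_gp(): works with any call_rcu()-shaped crf. */
static void wait_gp(void (*crf)(struct rcu_head *, rcu_callback_t))
{
	struct completion c = { PTHREAD_MUTEX_INITIALIZER,
				PTHREAD_COND_INITIALIZER, 0 };
	struct rcu_head head = { .completion = &c };

	crf(&head, wakeme_after_gp);		/* will wake me when done */
	pthread_mutex_lock(&c.lock);
	while (!c.done)
		pthread_cond_wait(&c.wait, &c.lock);
	pthread_mutex_unlock(&c.lock);
}

int main(void)
{
	wait_gp(fake_call_rcu);
	puts("grace period elapsed");
	return 0;
}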
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 7bbac7d0f5ab..977296dca0a4 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -22,13 +22,12 @@ | |||
22 | * For detailed explanation of Read-Copy Update mechanism see - | 22 | * For detailed explanation of Read-Copy Update mechanism see - |
23 | * Documentation/RCU | 23 | * Documentation/RCU |
24 | */ | 24 | */ |
25 | #include <linux/moduleparam.h> | ||
26 | #include <linux/completion.h> | 25 | #include <linux/completion.h> |
27 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> |
28 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
29 | #include <linux/rcupdate.h> | 28 | #include <linux/rcupdate.h> |
30 | #include <linux/kernel.h> | 29 | #include <linux/kernel.h> |
31 | #include <linux/module.h> | 30 | #include <linux/export.h> |
32 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
33 | #include <linux/sched.h> | 32 | #include <linux/sched.h> |
34 | #include <linux/types.h> | 33 | #include <linux/types.h> |
@@ -37,47 +36,154 @@ | |||
37 | #include <linux/cpu.h> | 36 | #include <linux/cpu.h> |
38 | #include <linux/prefetch.h> | 37 | #include <linux/prefetch.h> |
39 | 38 | ||
40 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ | 39 | #ifdef CONFIG_RCU_TRACE |
41 | static struct task_struct *rcu_kthread_task; | 40 | #include <trace/events/rcu.h> |
42 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | 41 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
43 | static unsigned long have_rcu_kthread_work; | 42 | |
43 | #include "rcu.h" | ||
44 | 44 | ||
45 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
46 | struct rcu_ctrlblk; | 46 | struct rcu_ctrlblk; |
47 | static void invoke_rcu_kthread(void); | 47 | static void invoke_rcu_callbacks(void); |
48 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 48 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
49 | static int rcu_kthread(void *arg); | 49 | static void rcu_process_callbacks(struct softirq_action *unused); |
50 | static void __call_rcu(struct rcu_head *head, | 50 | static void __call_rcu(struct rcu_head *head, |
51 | void (*func)(struct rcu_head *rcu), | 51 | void (*func)(struct rcu_head *rcu), |
52 | struct rcu_ctrlblk *rcp); | 52 | struct rcu_ctrlblk *rcp); |
53 | 53 | ||
54 | #include "rcutiny_plugin.h" | 54 | #include "rcutiny_plugin.h" |
55 | 55 | ||
56 | #ifdef CONFIG_NO_HZ | 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; |
57 | 57 | ||
58 | static long rcu_dynticks_nesting = 1; | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
59 | static void rcu_idle_enter_common(long long oldval) | ||
60 | { | ||
61 | if (rcu_dynticks_nesting) { | ||
62 | RCU_TRACE(trace_rcu_dyntick("--=", | ||
63 | oldval, rcu_dynticks_nesting)); | ||
64 | return; | ||
65 | } | ||
66 | RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); | ||
67 | if (!is_idle_task(current)) { | ||
68 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
69 | |||
70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | ||
71 | oldval, rcu_dynticks_nesting)); | ||
72 | ftrace_dump(DUMP_ALL); | ||
73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
74 | current->pid, current->comm, | ||
75 | idle->pid, idle->comm); /* must be idle task! */ | ||
76 | } | ||
77 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | ||
78 | } | ||
59 | 79 | ||
60 | /* | 80 | /* |
61 | * Enter dynticks-idle mode, which is an extended quiescent state | 81 | * Enter idle, which is an extended quiescent state if we have fully |
62 | * if we have fully entered that mode (i.e., if the new value of | 82 | * entered that mode (i.e., if the new value of dynticks_nesting is zero). |
63 | * dynticks_nesting is zero). | ||
64 | */ | 83 | */ |
65 | void rcu_enter_nohz(void) | 84 | void rcu_idle_enter(void) |
66 | { | 85 | { |
67 | if (--rcu_dynticks_nesting == 0) | 86 | unsigned long flags; |
68 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | 87 | long long oldval; |
88 | |||
89 | local_irq_save(flags); | ||
90 | oldval = rcu_dynticks_nesting; | ||
91 | rcu_dynticks_nesting = 0; | ||
92 | rcu_idle_enter_common(oldval); | ||
93 | local_irq_restore(flags); | ||
69 | } | 94 | } |
70 | 95 | ||
71 | /* | 96 | /* |
72 | * Exit dynticks-idle mode, so that we are no longer in an extended | 97 | * Exit an interrupt handler towards idle. |
73 | * quiescent state. | 98 | */ |
99 | void rcu_irq_exit(void) | ||
100 | { | ||
101 | unsigned long flags; | ||
102 | long long oldval; | ||
103 | |||
104 | local_irq_save(flags); | ||
105 | oldval = rcu_dynticks_nesting; | ||
106 | rcu_dynticks_nesting--; | ||
107 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | ||
108 | rcu_idle_enter_common(oldval); | ||
109 | local_irq_restore(flags); | ||
110 | } | ||
111 | |||
112 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ | ||
113 | static void rcu_idle_exit_common(long long oldval) | ||
114 | { | ||
115 | if (oldval) { | ||
116 | RCU_TRACE(trace_rcu_dyntick("++=", | ||
117 | oldval, rcu_dynticks_nesting)); | ||
118 | return; | ||
119 | } | ||
120 | RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); | ||
121 | if (!is_idle_task(current)) { | ||
122 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
123 | |||
124 | RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", | ||
125 | oldval, rcu_dynticks_nesting)); | ||
126 | ftrace_dump(DUMP_ALL); | ||
127 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
128 | current->pid, current->comm, | ||
129 | idle->pid, idle->comm); /* must be idle task! */ | ||
130 | } | ||
131 | } | ||
132 | |||
133 | /* | ||
134 | * Exit idle, so that we are no longer in an extended quiescent state. | ||
74 | */ | 135 | */ |
75 | void rcu_exit_nohz(void) | 136 | void rcu_idle_exit(void) |
76 | { | 137 | { |
138 | unsigned long flags; | ||
139 | long long oldval; | ||
140 | |||
141 | local_irq_save(flags); | ||
142 | oldval = rcu_dynticks_nesting; | ||
143 | WARN_ON_ONCE(oldval != 0); | ||
144 | rcu_dynticks_nesting = DYNTICK_TASK_NESTING; | ||
145 | rcu_idle_exit_common(oldval); | ||
146 | local_irq_restore(flags); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Enter an interrupt handler, moving away from idle. | ||
151 | */ | ||
152 | void rcu_irq_enter(void) | ||
153 | { | ||
154 | unsigned long flags; | ||
155 | long long oldval; | ||
156 | |||
157 | local_irq_save(flags); | ||
158 | oldval = rcu_dynticks_nesting; | ||
77 | rcu_dynticks_nesting++; | 159 | rcu_dynticks_nesting++; |
160 | WARN_ON_ONCE(rcu_dynticks_nesting == 0); | ||
161 | rcu_idle_exit_common(oldval); | ||
162 | local_irq_restore(flags); | ||
163 | } | ||
164 | |||
165 | #ifdef CONFIG_PROVE_RCU | ||
166 | |||
167 | /* | ||
168 | * Test whether RCU thinks that the current CPU is idle. | ||
169 | */ | ||
170 | int rcu_is_cpu_idle(void) | ||
171 | { | ||
172 | return !rcu_dynticks_nesting; | ||
78 | } | 173 | } |
174 | EXPORT_SYMBOL(rcu_is_cpu_idle); | ||
79 | 175 | ||
80 | #endif /* #ifdef CONFIG_NO_HZ */ | 176 | #endif /* #ifdef CONFIG_PROVE_RCU */ |
177 | |||
178 | /* | ||
179 | * Test whether the current CPU was interrupted from idle. Nested | ||
180 | * interrupts don't count, we must be running at the first interrupt | ||
181 | * level. | ||
182 | */ | ||
183 | int rcu_is_cpu_rrupt_from_idle(void) | ||
184 | { | ||
185 | return rcu_dynticks_nesting <= 0; | ||
186 | } | ||
81 | 187 | ||
82 | /* | 188 | /* |
83 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). | 189 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
@@ -96,16 +202,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | |||
96 | } | 202 | } |
97 | 203 | ||
98 | /* | 204 | /* |
99 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
100 | * or to boost readers. | ||
101 | */ | ||
102 | static void invoke_rcu_kthread(void) | ||
103 | { | ||
104 | have_rcu_kthread_work = 1; | ||
105 | wake_up(&rcu_kthread_wq); | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we | 205 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we |
110 | * are at it, given that any rcu quiescent state is also an rcu_bh | 206 | * are at it, given that any rcu quiescent state is also an rcu_bh |
111 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 207 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
@@ -117,7 +213,7 @@ void rcu_sched_qs(int cpu) | |||
117 | local_irq_save(flags); | 213 | local_irq_save(flags); |
118 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 214 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
119 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 215 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
120 | invoke_rcu_kthread(); | 216 | invoke_rcu_callbacks(); |
121 | local_irq_restore(flags); | 217 | local_irq_restore(flags); |
122 | } | 218 | } |
123 | 219 | ||
@@ -130,20 +226,19 @@ void rcu_bh_qs(int cpu) | |||
130 | 226 | ||
131 | local_irq_save(flags); | 227 | local_irq_save(flags); |
132 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 228 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
133 | invoke_rcu_kthread(); | 229 | invoke_rcu_callbacks(); |
134 | local_irq_restore(flags); | 230 | local_irq_restore(flags); |
135 | } | 231 | } |
136 | 232 | ||
137 | /* | 233 | /* |
138 | * Check to see if the scheduling-clock interrupt came from an extended | 234 | * Check to see if the scheduling-clock interrupt came from an extended |
139 | * quiescent state, and, if so, tell RCU about it. | 235 | * quiescent state, and, if so, tell RCU about it. This function must |
236 | * be called from hardirq context. It is normally called from the | ||
237 | * scheduling-clock interrupt. | ||
140 | */ | 238 | */ |
141 | void rcu_check_callbacks(int cpu, int user) | 239 | void rcu_check_callbacks(int cpu, int user) |
142 | { | 240 | { |
143 | if (user || | 241 | if (user || rcu_is_cpu_rrupt_from_idle()) |
144 | (idle_cpu(cpu) && | ||
145 | !in_softirq() && | ||
146 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) | ||
147 | rcu_sched_qs(cpu); | 242 | rcu_sched_qs(cpu); |
148 | else if (!in_softirq()) | 243 | else if (!in_softirq()) |
149 | rcu_bh_qs(cpu); | 244 | rcu_bh_qs(cpu); |
@@ -154,18 +249,27 @@ void rcu_check_callbacks(int cpu, int user) | |||
154 | * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure | 249 | * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure |
155 | * whose grace period has elapsed. | 250 | * whose grace period has elapsed. |
156 | */ | 251 | */ |
157 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | 252 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) |
158 | { | 253 | { |
254 | char *rn = NULL; | ||
159 | struct rcu_head *next, *list; | 255 | struct rcu_head *next, *list; |
160 | unsigned long flags; | 256 | unsigned long flags; |
161 | RCU_TRACE(int cb_count = 0); | 257 | RCU_TRACE(int cb_count = 0); |
162 | 258 | ||
163 | /* If no RCU callbacks ready to invoke, just return. */ | 259 | /* If no RCU callbacks ready to invoke, just return. */ |
164 | if (&rcp->rcucblist == rcp->donetail) | 260 | if (&rcp->rcucblist == rcp->donetail) { |
261 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | ||
262 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | ||
263 | ACCESS_ONCE(rcp->rcucblist), | ||
264 | need_resched(), | ||
265 | is_idle_task(current), | ||
266 | rcu_is_callbacks_kthread())); | ||
165 | return; | 267 | return; |
268 | } | ||
166 | 269 | ||
167 | /* Move the ready-to-invoke callbacks to a local list. */ | 270 | /* Move the ready-to-invoke callbacks to a local list. */ |
168 | local_irq_save(flags); | 271 | local_irq_save(flags); |
272 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); | ||
169 | list = rcp->rcucblist; | 273 | list = rcp->rcucblist; |
170 | rcp->rcucblist = *rcp->donetail; | 274 | rcp->rcucblist = *rcp->donetail; |
171 | *rcp->donetail = NULL; | 275 | *rcp->donetail = NULL; |
@@ -176,49 +280,28 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
176 | local_irq_restore(flags); | 280 | local_irq_restore(flags); |
177 | 281 | ||
178 | /* Invoke the callbacks on the local list. */ | 282 | /* Invoke the callbacks on the local list. */ |
283 | RCU_TRACE(rn = rcp->name); | ||
179 | while (list) { | 284 | while (list) { |
180 | next = list->next; | 285 | next = list->next; |
181 | prefetch(next); | 286 | prefetch(next); |
182 | debug_rcu_head_unqueue(list); | 287 | debug_rcu_head_unqueue(list); |
183 | local_bh_disable(); | 288 | local_bh_disable(); |
184 | __rcu_reclaim(list); | 289 | __rcu_reclaim(rn, list); |
185 | local_bh_enable(); | 290 | local_bh_enable(); |
186 | list = next; | 291 | list = next; |
187 | RCU_TRACE(cb_count++); | 292 | RCU_TRACE(cb_count++); |
188 | } | 293 | } |
189 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | 294 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); |
295 | RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), | ||
296 | is_idle_task(current), | ||
297 | rcu_is_callbacks_kthread())); | ||
190 | } | 298 | } |
191 | 299 | ||
192 | /* | 300 | static void rcu_process_callbacks(struct softirq_action *unused) |
193 | * This kthread invokes RCU callbacks whose grace periods have | ||
194 | * elapsed. It is awakened as needed, and takes the place of the | ||
195 | * RCU_SOFTIRQ that was used previously for this purpose. | ||
196 | * This is a kthread, but it is never stopped, at least not until | ||
197 | * the system goes down. | ||
198 | */ | ||
199 | static int rcu_kthread(void *arg) | ||
200 | { | 301 | { |
201 | unsigned long work; | 302 | __rcu_process_callbacks(&rcu_sched_ctrlblk); |
202 | unsigned long morework; | 303 | __rcu_process_callbacks(&rcu_bh_ctrlblk); |
203 | unsigned long flags; | 304 | rcu_preempt_process_callbacks(); |
204 | |||
205 | for (;;) { | ||
206 | wait_event_interruptible(rcu_kthread_wq, | ||
207 | have_rcu_kthread_work != 0); | ||
208 | morework = rcu_boost(); | ||
209 | local_irq_save(flags); | ||
210 | work = have_rcu_kthread_work; | ||
211 | have_rcu_kthread_work = morework; | ||
212 | local_irq_restore(flags); | ||
213 | if (work) { | ||
214 | rcu_process_callbacks(&rcu_sched_ctrlblk); | ||
215 | rcu_process_callbacks(&rcu_bh_ctrlblk); | ||
216 | rcu_preempt_process_callbacks(); | ||
217 | } | ||
218 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
219 | } | ||
220 | |||
221 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
222 | } | 305 | } |
223 | 306 | ||
224 | /* | 307 | /* |
@@ -280,45 +363,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
280 | __call_rcu(head, func, &rcu_bh_ctrlblk); | 363 | __call_rcu(head, func, &rcu_bh_ctrlblk); |
281 | } | 364 | } |
282 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 365 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
283 | |||
284 | void rcu_barrier_bh(void) | ||
285 | { | ||
286 | struct rcu_synchronize rcu; | ||
287 | |||
288 | init_rcu_head_on_stack(&rcu.head); | ||
289 | init_completion(&rcu.completion); | ||
290 | /* Will wake me after RCU finished. */ | ||
291 | call_rcu_bh(&rcu.head, wakeme_after_rcu); | ||
292 | /* Wait for it. */ | ||
293 | wait_for_completion(&rcu.completion); | ||
294 | destroy_rcu_head_on_stack(&rcu.head); | ||
295 | } | ||
296 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); | ||
297 | |||
298 | void rcu_barrier_sched(void) | ||
299 | { | ||
300 | struct rcu_synchronize rcu; | ||
301 | |||
302 | init_rcu_head_on_stack(&rcu.head); | ||
303 | init_completion(&rcu.completion); | ||
304 | /* Will wake me after RCU finished. */ | ||
305 | call_rcu_sched(&rcu.head, wakeme_after_rcu); | ||
306 | /* Wait for it. */ | ||
307 | wait_for_completion(&rcu.completion); | ||
308 | destroy_rcu_head_on_stack(&rcu.head); | ||
309 | } | ||
310 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | ||
311 | |||
312 | /* | ||
313 | * Spawn the kthread that invokes RCU callbacks. | ||
314 | */ | ||
315 | static int __init rcu_spawn_kthreads(void) | ||
316 | { | ||
317 | struct sched_param sp; | ||
318 | |||
319 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
320 | sp.sched_priority = RCU_BOOST_PRIO; | ||
321 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
322 | return 0; | ||
323 | } | ||
324 | early_initcall(rcu_spawn_kthreads); | ||
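The rcu_idle_enter()/rcu_irq_enter() family introduced above keeps a single signed nesting counter: zero means the CPU is idle (an extended quiescent state), interrupts nest on top of it, and process context roughly starts from the large DYNTICK_TASK_NESTING bias so unbalanced interrupt-style exits from task context cannot drive it to zero. A toy model of that bookkeeping; the constant mirrors the diff, but the rest is illustrative and not kernel code.

#include <stdio.h>
#include <limits.h>

#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)

static long long nesting = DYNTICK_TASK_NESTING;	/* per-CPU in the kernel */

static int  cpu_is_idle(void) { return nesting == 0; }
static void idle_enter(void)  { nesting = 0; }
static void idle_exit(void)   { nesting = DYNTICK_TASK_NESTING; }
static void irq_enter(void)   { nesting++; }
static void irq_exit(void)    { nesting--; }

int main(void)
{
	printf("running:   idle=%d\n", cpu_is_idle());	/* 0 */
	idle_enter();
	printf("idle loop: idle=%d\n", cpu_is_idle());	/* 1 */
	irq_enter();					/* interrupt from idle */
	printf("in irq:    idle=%d\n", cpu_is_idle());	/* 0 */
	irq_exit();
	printf("back:      idle=%d\n", cpu_is_idle());	/* 1 */
	idle_exit();
	return 0;
}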
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f259c676195f..9cb1ae4aabdd 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -23,32 +23,30 @@ | |||
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/kthread.h> | 25 | #include <linux/kthread.h> |
26 | #include <linux/module.h> | ||
26 | #include <linux/debugfs.h> | 27 | #include <linux/debugfs.h> |
27 | #include <linux/seq_file.h> | 28 | #include <linux/seq_file.h> |
28 | 29 | ||
29 | #ifdef CONFIG_RCU_TRACE | ||
30 | #define RCU_TRACE(stmt) stmt | ||
31 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
32 | #define RCU_TRACE(stmt) | ||
33 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
34 | |||
35 | /* Global control variables for rcupdate callback mechanism. */ | 30 | /* Global control variables for rcupdate callback mechanism. */ |
36 | struct rcu_ctrlblk { | 31 | struct rcu_ctrlblk { |
37 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | 32 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ |
38 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | 33 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ |
39 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | 34 | struct rcu_head **curtail; /* ->next pointer of last CB. */ |
40 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | 35 | RCU_TRACE(long qlen); /* Number of pending CBs. */ |
36 | RCU_TRACE(char *name); /* Name of RCU type. */ | ||
41 | }; | 37 | }; |
42 | 38 | ||
43 | /* Definition for rcupdate control block. */ | 39 | /* Definition for rcupdate control block. */ |
44 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | 40 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { |
45 | .donetail = &rcu_sched_ctrlblk.rcucblist, | 41 | .donetail = &rcu_sched_ctrlblk.rcucblist, |
46 | .curtail = &rcu_sched_ctrlblk.rcucblist, | 42 | .curtail = &rcu_sched_ctrlblk.rcucblist, |
43 | RCU_TRACE(.name = "rcu_sched") | ||
47 | }; | 44 | }; |
48 | 45 | ||
49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | 46 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { |
50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | 47 | .donetail = &rcu_bh_ctrlblk.rcucblist, |
51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | 48 | .curtail = &rcu_bh_ctrlblk.rcucblist, |
49 | RCU_TRACE(.name = "rcu_bh") | ||
52 | }; | 50 | }; |
53 | 51 | ||
54 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 52 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
@@ -131,6 +129,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { | |||
131 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, | 129 | .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, |
132 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, | 130 | .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, |
133 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), | 131 | .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), |
132 | RCU_TRACE(.rcb.name = "rcu_preempt") | ||
134 | }; | 133 | }; |
135 | 134 | ||
136 | static int rcu_preempted_readers_exp(void); | 135 | static int rcu_preempted_readers_exp(void); |
@@ -247,6 +246,13 @@ static void show_tiny_preempt_stats(struct seq_file *m) | |||
247 | 246 | ||
248 | #include "rtmutex_common.h" | 247 | #include "rtmutex_common.h" |
249 | 248 | ||
249 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
250 | |||
251 | /* Controls for rcu_kthread() kthread. */ | ||
252 | static struct task_struct *rcu_kthread_task; | ||
253 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | ||
254 | static unsigned long have_rcu_kthread_work; | ||
255 | |||
250 | /* | 256 | /* |
251 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | 257 | * Carry out RCU priority boosting on the task indicated by ->boost_tasks, |
252 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | 258 | * and advance ->boost_tasks to the next task in the ->blkd_tasks list. |
@@ -306,8 +312,8 @@ static int rcu_boost(void) | |||
306 | rt_mutex_lock(&mtx); | 312 | rt_mutex_lock(&mtx); |
307 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 313 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
308 | 314 | ||
309 | return rcu_preempt_ctrlblk.boost_tasks != NULL || | 315 | return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL || |
310 | rcu_preempt_ctrlblk.exp_tasks != NULL; | 316 | ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL; |
311 | } | 317 | } |
312 | 318 | ||
313 | /* | 319 | /* |
@@ -334,7 +340,7 @@ static int rcu_initiate_boost(void) | |||
334 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) | 340 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) |
335 | rcu_preempt_ctrlblk.boost_tasks = | 341 | rcu_preempt_ctrlblk.boost_tasks = |
336 | rcu_preempt_ctrlblk.gp_tasks; | 342 | rcu_preempt_ctrlblk.gp_tasks; |
337 | invoke_rcu_kthread(); | 343 | invoke_rcu_callbacks(); |
338 | } else | 344 | } else |
339 | RCU_TRACE(rcu_initiate_boost_trace()); | 345 | RCU_TRACE(rcu_initiate_boost_trace()); |
340 | return 1; | 346 | return 1; |
@@ -353,14 +359,6 @@ static void rcu_preempt_boost_start_gp(void) | |||
353 | #else /* #ifdef CONFIG_RCU_BOOST */ | 359 | #else /* #ifdef CONFIG_RCU_BOOST */ |
354 | 360 | ||
355 | /* | 361 | /* |
356 | * If there is no RCU priority boosting, we don't boost. | ||
357 | */ | ||
358 | static int rcu_boost(void) | ||
359 | { | ||
360 | return 0; | ||
361 | } | ||
362 | |||
363 | /* | ||
364 | * If there is no RCU priority boosting, we don't initiate boosting, | 362 | * If there is no RCU priority boosting, we don't initiate boosting, |
365 | * but we do indicate whether there are blocked readers blocking the | 363 | * but we do indicate whether there are blocked readers blocking the |
366 | * current grace period. | 364 | * current grace period. |
@@ -427,7 +425,7 @@ static void rcu_preempt_cpu_qs(void) | |||
427 | 425 | ||
428 | /* If there are done callbacks, cause them to be invoked. */ | 426 | /* If there are done callbacks, cause them to be invoked. */ |
429 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) | 427 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) |
430 | invoke_rcu_kthread(); | 428 | invoke_rcu_callbacks(); |
431 | } | 429 | } |
432 | 430 | ||
433 | /* | 431 | /* |
@@ -648,7 +646,7 @@ static void rcu_preempt_check_callbacks(void) | |||
648 | rcu_preempt_cpu_qs(); | 646 | rcu_preempt_cpu_qs(); |
649 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != | 647 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != |
650 | rcu_preempt_ctrlblk.rcb.donetail) | 648 | rcu_preempt_ctrlblk.rcb.donetail) |
651 | invoke_rcu_kthread(); | 649 | invoke_rcu_callbacks(); |
652 | if (rcu_preempt_gp_in_progress() && | 650 | if (rcu_preempt_gp_in_progress() && |
653 | rcu_cpu_blocking_cur_gp() && | 651 | rcu_cpu_blocking_cur_gp() && |
654 | rcu_preempt_running_reader()) | 652 | rcu_preempt_running_reader()) |
@@ -674,7 +672,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) | |||
674 | */ | 672 | */ |
675 | static void rcu_preempt_process_callbacks(void) | 673 | static void rcu_preempt_process_callbacks(void) |
676 | { | 674 | { |
677 | rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | 675 | __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); |
678 | } | 676 | } |
679 | 677 | ||
680 | /* | 678 | /* |
@@ -697,20 +695,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | |||
697 | } | 695 | } |
698 | EXPORT_SYMBOL_GPL(call_rcu); | 696 | EXPORT_SYMBOL_GPL(call_rcu); |
699 | 697 | ||
700 | void rcu_barrier(void) | ||
701 | { | ||
702 | struct rcu_synchronize rcu; | ||
703 | |||
704 | init_rcu_head_on_stack(&rcu.head); | ||
705 | init_completion(&rcu.completion); | ||
706 | /* Will wake me after RCU finished. */ | ||
707 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
708 | /* Wait for it. */ | ||
709 | wait_for_completion(&rcu.completion); | ||
710 | destroy_rcu_head_on_stack(&rcu.head); | ||
711 | } | ||
712 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
713 | |||
714 | /* | 698 | /* |
715 | * synchronize_rcu - wait until a grace period has elapsed. | 699 | * synchronize_rcu - wait until a grace period has elapsed. |
716 | * | 700 | * |
@@ -864,15 +848,6 @@ static void show_tiny_preempt_stats(struct seq_file *m) | |||
864 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 848 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
865 | 849 | ||
866 | /* | 850 | /* |
867 | * Because preemptible RCU does not exist, it is never necessary to | ||
868 | * boost preempted RCU readers. | ||
869 | */ | ||
870 | static int rcu_boost(void) | ||
871 | { | ||
872 | return 0; | ||
873 | } | ||
874 | |||
875 | /* | ||
876 | * Because preemptible RCU does not exist, it never has any callbacks | 851 | * Because preemptible RCU does not exist, it never has any callbacks |
877 | * to check. | 852 | * to check. |
878 | */ | 853 | */ |
@@ -898,6 +873,103 @@ static void rcu_preempt_process_callbacks(void) | |||
898 | 873 | ||
899 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ | 874 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ |
900 | 875 | ||
876 | #ifdef CONFIG_RCU_BOOST | ||
877 | |||
878 | /* | ||
879 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
880 | * or to boost readers. | ||
881 | */ | ||
882 | static void invoke_rcu_callbacks(void) | ||
883 | { | ||
884 | have_rcu_kthread_work = 1; | ||
885 | wake_up(&rcu_kthread_wq); | ||
886 | } | ||
887 | |||
888 | #ifdef CONFIG_RCU_TRACE | ||
889 | |||
890 | /* | ||
891 | * Is the current CPU running the RCU-callbacks kthread? | ||
892 | * Caller must have preemption disabled. | ||
893 | */ | ||
894 | static bool rcu_is_callbacks_kthread(void) | ||
895 | { | ||
896 | return rcu_kthread_task == current; | ||
897 | } | ||
898 | |||
899 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
900 | |||
901 | /* | ||
902 | * This kthread invokes RCU callbacks whose grace periods have | ||
903 | * elapsed. It is awakened as needed, and takes the place of the | ||
904 | * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. | ||
905 | * This is a kthread, but it is never stopped, at least not until | ||
906 | * the system goes down. | ||
907 | */ | ||
908 | static int rcu_kthread(void *arg) | ||
909 | { | ||
910 | unsigned long work; | ||
911 | unsigned long morework; | ||
912 | unsigned long flags; | ||
913 | |||
914 | for (;;) { | ||
915 | wait_event_interruptible(rcu_kthread_wq, | ||
916 | have_rcu_kthread_work != 0); | ||
917 | morework = rcu_boost(); | ||
918 | local_irq_save(flags); | ||
919 | work = have_rcu_kthread_work; | ||
920 | have_rcu_kthread_work = morework; | ||
921 | local_irq_restore(flags); | ||
922 | if (work) | ||
923 | rcu_process_callbacks(NULL); | ||
924 | schedule_timeout_interruptible(1); /* Leave CPU for others. */ | ||
925 | } | ||
926 | |||
927 | return 0; /* Not reached, but needed to shut gcc up. */ | ||
928 | } | ||
929 | |||
930 | /* | ||
931 | * Spawn the kthread that invokes RCU callbacks. | ||
932 | */ | ||
933 | static int __init rcu_spawn_kthreads(void) | ||
934 | { | ||
935 | struct sched_param sp; | ||
936 | |||
937 | rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | ||
938 | sp.sched_priority = RCU_BOOST_PRIO; | ||
939 | sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | ||
940 | return 0; | ||
941 | } | ||
942 | early_initcall(rcu_spawn_kthreads); | ||
943 | |||
944 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
945 | |||
946 | /* | ||
947 | * Start up softirq processing of callbacks. | ||
948 | */ | ||
949 | void invoke_rcu_callbacks(void) | ||
950 | { | ||
951 | raise_softirq(RCU_SOFTIRQ); | ||
952 | } | ||
953 | |||
954 | #ifdef CONFIG_RCU_TRACE | ||
955 | |||
956 | /* | ||
957 | * There is no callback kthread, so this thread is never it. | ||
958 | */ | ||
959 | static bool rcu_is_callbacks_kthread(void) | ||
960 | { | ||
961 | return false; | ||
962 | } | ||
963 | |||
964 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
965 | |||
966 | void rcu_init(void) | ||
967 | { | ||
968 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
969 | } | ||
970 | |||
971 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
972 | |||
901 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 973 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
902 | #include <linux/kernel_stat.h> | 974 | #include <linux/kernel_stat.h> |
903 | 975 | ||
@@ -913,12 +985,6 @@ void __init rcu_scheduler_starting(void) | |||
913 | 985 | ||
914 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 986 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
915 | 987 | ||
916 | #ifdef CONFIG_RCU_BOOST | ||
917 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
918 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
919 | #define RCU_BOOST_PRIO 1 | ||
920 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
921 | |||
922 | #ifdef CONFIG_RCU_TRACE | 988 | #ifdef CONFIG_RCU_TRACE |
923 | 989 | ||
924 | #ifdef CONFIG_RCU_BOOST | 990 | #ifdef CONFIG_RCU_BOOST |
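A note on the ACCESS_ONCE() conversion in rcu_boost() above: the macro (from include/linux/compiler.h) casts through volatile so each pointer is loaded exactly once, preventing the compiler from re-reading ->boost_tasks or ->exp_tasks after the lock and irq protection have been dropped. A minimal illustration (struct foo and snapshot() are hypothetical, for illustration only):

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	struct foo;				/* hypothetical type */

	/* The pointer is read in a single load; without the volatile cast
	 * the compiler could legally re-fetch *pp later and observe a
	 * different value than the one checked here. */
	static struct foo *snapshot(struct foo **pp)
	{
		return ACCESS_ONCE(*pp);
	}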
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 98f51b13bb7e..88f17b8a3b1d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | |||
61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
63 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
64 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ | 64 | static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ |
65 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ | 65 | static int fqs_holdoff; /* Hold time within burst (us). */ |
66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ | 66 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
67 | static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ | ||
68 | static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ | ||
67 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | 69 | static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ |
68 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | 70 | static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ |
69 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | 71 | static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ |
@@ -73,7 +75,7 @@ module_param(nreaders, int, 0444); | |||
73 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 75 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
74 | module_param(nfakewriters, int, 0444); | 76 | module_param(nfakewriters, int, 0444); |
75 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | 77 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); |
76 | module_param(stat_interval, int, 0444); | 78 | module_param(stat_interval, int, 0644); |
77 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 79 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
78 | module_param(verbose, bool, 0444); | 80 | module_param(verbose, bool, 0444); |
79 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 81 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444); | |||
91 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | 93 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
92 | module_param(fqs_stutter, int, 0444); | 94 | module_param(fqs_stutter, int, 0444); |
93 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | 95 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
96 | module_param(onoff_interval, int, 0444); | ||
97 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
98 | module_param(shutdown_secs, int, 0444); | ||
99 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); | ||
94 | module_param(test_boost, int, 0444); | 100 | module_param(test_boost, int, 0444); |
95 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | 101 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); |
96 | module_param(test_boost_interval, int, 0444); | 102 | module_param(test_boost_interval, int, 0444); |
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task; | |||
119 | static struct task_struct *stutter_task; | 125 | static struct task_struct *stutter_task; |
120 | static struct task_struct *fqs_task; | 126 | static struct task_struct *fqs_task; |
121 | static struct task_struct *boost_tasks[NR_CPUS]; | 127 | static struct task_struct *boost_tasks[NR_CPUS]; |
128 | static struct task_struct *shutdown_task; | ||
129 | #ifdef CONFIG_HOTPLUG_CPU | ||
130 | static struct task_struct *onoff_task; | ||
131 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
122 | 132 | ||
123 | #define RCU_TORTURE_PIPE_LEN 10 | 133 | #define RCU_TORTURE_PIPE_LEN 10 |
124 | 134 | ||
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror; | |||
149 | static long n_rcu_torture_boost_failure; | 159 | static long n_rcu_torture_boost_failure; |
150 | static long n_rcu_torture_boosts; | 160 | static long n_rcu_torture_boosts; |
151 | static long n_rcu_torture_timers; | 161 | static long n_rcu_torture_timers; |
162 | static long n_offline_attempts; | ||
163 | static long n_offline_successes; | ||
164 | static long n_online_attempts; | ||
165 | static long n_online_successes; | ||
152 | static struct list_head rcu_torture_removed; | 166 | static struct list_head rcu_torture_removed; |
153 | static cpumask_var_t shuffle_tmp_mask; | 167 | static cpumask_var_t shuffle_tmp_mask; |
154 | 168 | ||
@@ -160,6 +174,8 @@ static int stutter_pause_test; | |||
160 | #define RCUTORTURE_RUNNABLE_INIT 0 | 174 | #define RCUTORTURE_RUNNABLE_INIT 0 |
161 | #endif | 175 | #endif |
162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 176 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
177 | module_param(rcutorture_runnable, int, 0444); | ||
178 | MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); | ||
163 | 179 | ||
164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | 180 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
165 | #define rcu_can_boost() 1 | 181 | #define rcu_can_boost() 1 |
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | |||
167 | #define rcu_can_boost() 0 | 183 | #define rcu_can_boost() 0 |
168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | 184 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
169 | 185 | ||
186 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | ||
170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 187 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 188 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
172 | /* and boost task create/destroy. */ | 189 | /* and boost task create/destroy. */ |
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD; | |||
182 | */ | 199 | */ |
183 | static DEFINE_MUTEX(fullstop_mutex); | 200 | static DEFINE_MUTEX(fullstop_mutex); |
184 | 201 | ||
202 | /* Forward reference. */ | ||
203 | static void rcu_torture_cleanup(void); | ||
204 | |||
185 | /* | 205 | /* |
186 | * Detect and respond to a system shutdown. | 206 | * Detect and respond to a system shutdown. |
187 | */ | 207 | */ |
@@ -480,30 +500,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | |||
480 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | 500 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); |
481 | } | 501 | } |
482 | 502 | ||
483 | struct rcu_bh_torture_synchronize { | ||
484 | struct rcu_head head; | ||
485 | struct completion completion; | ||
486 | }; | ||
487 | |||
488 | static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) | ||
489 | { | ||
490 | struct rcu_bh_torture_synchronize *rcu; | ||
491 | |||
492 | rcu = container_of(head, struct rcu_bh_torture_synchronize, head); | ||
493 | complete(&rcu->completion); | ||
494 | } | ||
495 | |||
496 | static void rcu_bh_torture_synchronize(void) | ||
497 | { | ||
498 | struct rcu_bh_torture_synchronize rcu; | ||
499 | |||
500 | init_rcu_head_on_stack(&rcu.head); | ||
501 | init_completion(&rcu.completion); | ||
502 | call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); | ||
503 | wait_for_completion(&rcu.completion); | ||
504 | destroy_rcu_head_on_stack(&rcu.head); | ||
505 | } | ||
506 | |||
507 | static struct rcu_torture_ops rcu_bh_ops = { | 503 | static struct rcu_torture_ops rcu_bh_ops = { |
508 | .init = NULL, | 504 | .init = NULL, |
509 | .cleanup = NULL, | 505 | .cleanup = NULL, |
@@ -512,7 +508,7 @@ static struct rcu_torture_ops rcu_bh_ops = { | |||
512 | .readunlock = rcu_bh_torture_read_unlock, | 508 | .readunlock = rcu_bh_torture_read_unlock, |
513 | .completed = rcu_bh_torture_completed, | 509 | .completed = rcu_bh_torture_completed, |
514 | .deferred_free = rcu_bh_torture_deferred_free, | 510 | .deferred_free = rcu_bh_torture_deferred_free, |
515 | .sync = rcu_bh_torture_synchronize, | 511 | .sync = synchronize_rcu_bh, |
516 | .cb_barrier = rcu_barrier_bh, | 512 | .cb_barrier = rcu_barrier_bh, |
517 | .fqs = rcu_bh_force_quiescent_state, | 513 | .fqs = rcu_bh_force_quiescent_state, |
518 | .stats = NULL, | 514 | .stats = NULL, |
@@ -528,7 +524,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
528 | .readunlock = rcu_bh_torture_read_unlock, | 524 | .readunlock = rcu_bh_torture_read_unlock, |
529 | .completed = rcu_bh_torture_completed, | 525 | .completed = rcu_bh_torture_completed, |
530 | .deferred_free = rcu_sync_torture_deferred_free, | 526 | .deferred_free = rcu_sync_torture_deferred_free, |
531 | .sync = rcu_bh_torture_synchronize, | 527 | .sync = synchronize_rcu_bh, |
532 | .cb_barrier = NULL, | 528 | .cb_barrier = NULL, |
533 | .fqs = rcu_bh_force_quiescent_state, | 529 | .fqs = rcu_bh_force_quiescent_state, |
534 | .stats = NULL, | 530 | .stats = NULL, |
@@ -536,6 +532,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { | |||
536 | .name = "rcu_bh_sync" | 532 | .name = "rcu_bh_sync" |
537 | }; | 533 | }; |
538 | 534 | ||
535 | static struct rcu_torture_ops rcu_bh_expedited_ops = { | ||
536 | .init = rcu_sync_torture_init, | ||
537 | .cleanup = NULL, | ||
538 | .readlock = rcu_bh_torture_read_lock, | ||
539 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
540 | .readunlock = rcu_bh_torture_read_unlock, | ||
541 | .completed = rcu_bh_torture_completed, | ||
542 | .deferred_free = rcu_sync_torture_deferred_free, | ||
543 | .sync = synchronize_rcu_bh_expedited, | ||
544 | .cb_barrier = NULL, | ||
545 | .fqs = rcu_bh_force_quiescent_state, | ||
546 | .stats = NULL, | ||
547 | .irq_capable = 1, | ||
548 | .name = "rcu_bh_expedited" | ||
549 | }; | ||
550 | |||
539 | /* | 551 | /* |
540 | * Definitions for srcu torture testing. | 552 | * Definitions for srcu torture testing. |
541 | */ | 553 | */ |
@@ -620,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = { | |||
620 | .name = "srcu" | 632 | .name = "srcu" |
621 | }; | 633 | }; |
622 | 634 | ||
635 | static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl) | ||
636 | { | ||
637 | return srcu_read_lock_raw(&srcu_ctl); | ||
638 | } | ||
639 | |||
640 | static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl) | ||
641 | { | ||
642 | srcu_read_unlock_raw(&srcu_ctl, idx); | ||
643 | } | ||
644 | |||
645 | static struct rcu_torture_ops srcu_raw_ops = { | ||
646 | .init = srcu_torture_init, | ||
647 | .cleanup = srcu_torture_cleanup, | ||
648 | .readlock = srcu_torture_read_lock_raw, | ||
649 | .read_delay = srcu_read_delay, | ||
650 | .readunlock = srcu_torture_read_unlock_raw, | ||
651 | .completed = srcu_torture_completed, | ||
652 | .deferred_free = rcu_sync_torture_deferred_free, | ||
653 | .sync = srcu_torture_synchronize, | ||
654 | .cb_barrier = NULL, | ||
655 | .stats = srcu_torture_stats, | ||
656 | .name = "srcu_raw" | ||
657 | }; | ||
658 | |||
623 | static void srcu_torture_synchronize_expedited(void) | 659 | static void srcu_torture_synchronize_expedited(void) |
624 | { | 660 | { |
625 | synchronize_srcu_expedited(&srcu_ctl); | 661 | synchronize_srcu_expedited(&srcu_ctl); |
@@ -659,11 +695,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | |||
659 | call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); | 695 | call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); |
660 | } | 696 | } |
661 | 697 | ||
662 | static void sched_torture_synchronize(void) | ||
663 | { | ||
664 | synchronize_sched(); | ||
665 | } | ||
666 | |||
667 | static struct rcu_torture_ops sched_ops = { | 698 | static struct rcu_torture_ops sched_ops = { |
668 | .init = rcu_sync_torture_init, | 699 | .init = rcu_sync_torture_init, |
669 | .cleanup = NULL, | 700 | .cleanup = NULL, |
@@ -672,7 +703,7 @@ static struct rcu_torture_ops sched_ops = { | |||
672 | .readunlock = sched_torture_read_unlock, | 703 | .readunlock = sched_torture_read_unlock, |
673 | .completed = rcu_no_completed, | 704 | .completed = rcu_no_completed, |
674 | .deferred_free = rcu_sched_torture_deferred_free, | 705 | .deferred_free = rcu_sched_torture_deferred_free, |
675 | .sync = sched_torture_synchronize, | 706 | .sync = synchronize_sched, |
676 | .cb_barrier = rcu_barrier_sched, | 707 | .cb_barrier = rcu_barrier_sched, |
677 | .fqs = rcu_sched_force_quiescent_state, | 708 | .fqs = rcu_sched_force_quiescent_state, |
678 | .stats = NULL, | 709 | .stats = NULL, |
@@ -688,7 +719,7 @@ static struct rcu_torture_ops sched_sync_ops = { | |||
688 | .readunlock = sched_torture_read_unlock, | 719 | .readunlock = sched_torture_read_unlock, |
689 | .completed = rcu_no_completed, | 720 | .completed = rcu_no_completed, |
690 | .deferred_free = rcu_sync_torture_deferred_free, | 721 | .deferred_free = rcu_sync_torture_deferred_free, |
691 | .sync = sched_torture_synchronize, | 722 | .sync = synchronize_sched, |
692 | .cb_barrier = NULL, | 723 | .cb_barrier = NULL, |
693 | .fqs = rcu_sched_force_quiescent_state, | 724 | .fqs = rcu_sched_force_quiescent_state, |
694 | .stats = NULL, | 725 | .stats = NULL, |
@@ -754,7 +785,7 @@ static int rcu_torture_boost(void *arg) | |||
754 | do { | 785 | do { |
755 | /* Wait for the next test interval. */ | 786 | /* Wait for the next test interval. */ |
756 | oldstarttime = boost_starttime; | 787 | oldstarttime = boost_starttime; |
757 | while (jiffies - oldstarttime > ULONG_MAX / 2) { | 788 | while (ULONG_CMP_LT(jiffies, oldstarttime)) { |
758 | schedule_timeout_uninterruptible(1); | 789 | schedule_timeout_uninterruptible(1); |
759 | rcu_stutter_wait("rcu_torture_boost"); | 790 | rcu_stutter_wait("rcu_torture_boost"); |
760 | if (kthread_should_stop() || | 791 | if (kthread_should_stop() || |
@@ -765,7 +796,7 @@ static int rcu_torture_boost(void *arg) | |||
765 | /* Do one boost-test interval. */ | 796 | /* Do one boost-test interval. */ |
766 | endtime = oldstarttime + test_boost_duration * HZ; | 797 | endtime = oldstarttime + test_boost_duration * HZ; |
767 | call_rcu_time = jiffies; | 798 | call_rcu_time = jiffies; |
768 | while (jiffies - endtime > ULONG_MAX / 2) { | 799 | while (ULONG_CMP_LT(jiffies, endtime)) { |
769 | /* If we don't have a callback in flight, post one. */ | 800 | /* If we don't have a callback in flight, post one. */ |
770 | if (!rbi.inflight) { | 801 | if (!rbi.inflight) { |
771 | smp_mb(); /* RCU core before ->inflight = 1. */ | 802 | smp_mb(); /* RCU core before ->inflight = 1. */ |
@@ -792,7 +823,8 @@ static int rcu_torture_boost(void *arg) | |||
792 | * interval. Besides, we are running at RT priority, | 823 | * interval. Besides, we are running at RT priority, |
793 | * so delays should be relatively rare. | 824 | * so delays should be relatively rare. |
794 | */ | 825 | */ |
795 | while (oldstarttime == boost_starttime) { | 826 | while (oldstarttime == boost_starttime && |
827 | !kthread_should_stop()) { | ||
796 | if (mutex_trylock(&boost_mutex)) { | 828 | if (mutex_trylock(&boost_mutex)) { |
797 | boost_starttime = jiffies + | 829 | boost_starttime = jiffies + |
798 | test_boost_interval * HZ; | 830 | test_boost_interval * HZ; |
@@ -809,11 +841,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); | |||
809 | 841 | ||
810 | /* Clean up and exit. */ | 842 | /* Clean up and exit. */ |
811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | 843 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); |
812 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
813 | rcutorture_shutdown_absorb("rcu_torture_boost"); | 844 | rcutorture_shutdown_absorb("rcu_torture_boost"); |
814 | while (!kthread_should_stop() || rbi.inflight) | 845 | while (!kthread_should_stop() || rbi.inflight) |
815 | schedule_timeout_uninterruptible(1); | 846 | schedule_timeout_uninterruptible(1); |
816 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | 847 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ |
848 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
817 | return 0; | 849 | return 0; |
818 | } | 850 | } |
819 | 851 | ||
@@ -831,11 +863,13 @@ rcu_torture_fqs(void *arg) | |||
831 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); | 863 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); |
832 | do { | 864 | do { |
833 | fqs_resume_time = jiffies + fqs_stutter * HZ; | 865 | fqs_resume_time = jiffies + fqs_stutter * HZ; |
834 | while (jiffies - fqs_resume_time > LONG_MAX) { | 866 | while (ULONG_CMP_LT(jiffies, fqs_resume_time) && |
867 | !kthread_should_stop()) { | ||
835 | schedule_timeout_interruptible(1); | 868 | schedule_timeout_interruptible(1); |
836 | } | 869 | } |
837 | fqs_burst_remaining = fqs_duration; | 870 | fqs_burst_remaining = fqs_duration; |
838 | while (fqs_burst_remaining > 0) { | 871 | while (fqs_burst_remaining > 0 && |
872 | !kthread_should_stop()) { | ||
839 | cur_ops->fqs(); | 873 | cur_ops->fqs(); |
840 | udelay(fqs_holdoff); | 874 | udelay(fqs_holdoff); |
841 | fqs_burst_remaining -= fqs_holdoff; | 875 | fqs_burst_remaining -= fqs_holdoff; |
@@ -923,6 +957,18 @@ rcu_torture_fakewriter(void *arg) | |||
923 | return 0; | 957 | return 0; |
924 | } | 958 | } |
925 | 959 | ||
960 | void rcutorture_trace_dump(void) | ||
961 | { | ||
962 | static atomic_t beenhere = ATOMIC_INIT(0); | ||
963 | |||
964 | if (atomic_read(&beenhere)) | ||
965 | return; | ||
966 | if (atomic_xchg(&beenhere, 1) != 0) | ||
967 | return; | ||
968 | do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL); | ||
969 | ftrace_dump(DUMP_ALL); | ||
970 | } | ||
971 | |||
926 | /* | 972 | /* |
927 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, | 973 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, |
928 | * incrementing the corresponding element of the pipeline array. The | 974 | * incrementing the corresponding element of the pipeline array. The |
@@ -944,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused) | |||
944 | rcu_read_lock_bh_held() || | 990 | rcu_read_lock_bh_held() || |
945 | rcu_read_lock_sched_held() || | 991 | rcu_read_lock_sched_held() || |
946 | srcu_read_lock_held(&srcu_ctl)); | 992 | srcu_read_lock_held(&srcu_ctl)); |
993 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
947 | if (p == NULL) { | 994 | if (p == NULL) { |
948 | /* Leave because rcu_torture_writer is not yet underway */ | 995 | /* Leave because rcu_torture_writer is not yet underway */ |
949 | cur_ops->readunlock(idx); | 996 | cur_ops->readunlock(idx); |
@@ -961,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused) | |||
961 | /* Should not happen, but... */ | 1008 | /* Should not happen, but... */ |
962 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1009 | pipe_count = RCU_TORTURE_PIPE_LEN; |
963 | } | 1010 | } |
1011 | if (pipe_count > 1) | ||
1012 | rcutorture_trace_dump(); | ||
964 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1013 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
965 | completed = cur_ops->completed() - completed; | 1014 | completed = cur_ops->completed() - completed; |
966 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1015 | if (completed > RCU_TORTURE_PIPE_LEN) { |
@@ -1004,6 +1053,7 @@ rcu_torture_reader(void *arg) | |||
1004 | rcu_read_lock_bh_held() || | 1053 | rcu_read_lock_bh_held() || |
1005 | rcu_read_lock_sched_held() || | 1054 | rcu_read_lock_sched_held() || |
1006 | srcu_read_lock_held(&srcu_ctl)); | 1055 | srcu_read_lock_held(&srcu_ctl)); |
1056 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); | ||
1007 | if (p == NULL) { | 1057 | if (p == NULL) { |
1008 | /* Wait for rcu_torture_writer to get underway */ | 1058 | /* Wait for rcu_torture_writer to get underway */ |
1009 | cur_ops->readunlock(idx); | 1059 | cur_ops->readunlock(idx); |
@@ -1019,6 +1069,8 @@ rcu_torture_reader(void *arg) | |||
1019 | /* Should not happen, but... */ | 1069 | /* Should not happen, but... */ |
1020 | pipe_count = RCU_TORTURE_PIPE_LEN; | 1070 | pipe_count = RCU_TORTURE_PIPE_LEN; |
1021 | } | 1071 | } |
1072 | if (pipe_count > 1) | ||
1073 | rcutorture_trace_dump(); | ||
1022 | __this_cpu_inc(rcu_torture_count[pipe_count]); | 1074 | __this_cpu_inc(rcu_torture_count[pipe_count]); |
1023 | completed = cur_ops->completed() - completed; | 1075 | completed = cur_ops->completed() - completed; |
1024 | if (completed > RCU_TORTURE_PIPE_LEN) { | 1076 | if (completed > RCU_TORTURE_PIPE_LEN) { |
@@ -1066,7 +1118,8 @@ rcu_torture_printk(char *page) | |||
1066 | cnt += sprintf(&page[cnt], | 1118 | cnt += sprintf(&page[cnt], |
1067 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " | 1119 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1068 | "rtmbe: %d rtbke: %ld rtbre: %ld " | 1120 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1069 | "rtbf: %ld rtb: %ld nt: %ld", | 1121 | "rtbf: %ld rtb: %ld nt: %ld " |
1122 | "onoff: %ld/%ld:%ld/%ld", | ||
1070 | rcu_torture_current, | 1123 | rcu_torture_current, |
1071 | rcu_torture_current_version, | 1124 | rcu_torture_current_version, |
1072 | list_empty(&rcu_torture_freelist), | 1125 | list_empty(&rcu_torture_freelist), |
@@ -1078,7 +1131,11 @@ rcu_torture_printk(char *page) | |||
1078 | n_rcu_torture_boost_rterror, | 1131 | n_rcu_torture_boost_rterror, |
1079 | n_rcu_torture_boost_failure, | 1132 | n_rcu_torture_boost_failure, |
1080 | n_rcu_torture_boosts, | 1133 | n_rcu_torture_boosts, |
1081 | n_rcu_torture_timers); | 1134 | n_rcu_torture_timers, |
1135 | n_online_successes, | ||
1136 | n_online_attempts, | ||
1137 | n_offline_successes, | ||
1138 | n_offline_attempts); | ||
1082 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1139 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1083 | n_rcu_torture_boost_ktrerror != 0 || | 1140 | n_rcu_torture_boost_ktrerror != 0 || |
1084 | n_rcu_torture_boost_rterror != 0 || | 1141 | n_rcu_torture_boost_rterror != 0 || |
@@ -1242,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | |||
1242 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1299 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1243 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1300 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
1244 | "test_boost=%d/%d test_boost_interval=%d " | 1301 | "test_boost=%d/%d test_boost_interval=%d " |
1245 | "test_boost_duration=%d\n", | 1302 | "test_boost_duration=%d shutdown_secs=%d " |
1303 | "onoff_interval=%d\n", | ||
1246 | torture_type, tag, nrealreaders, nfakewriters, | 1304 | torture_type, tag, nrealreaders, nfakewriters, |
1247 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1305 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1248 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1306 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
1249 | test_boost, cur_ops->can_boost, | 1307 | test_boost, cur_ops->can_boost, |
1250 | test_boost_interval, test_boost_duration); | 1308 | test_boost_interval, test_boost_duration, shutdown_secs, |
1309 | onoff_interval); | ||
1251 | } | 1310 | } |
1252 | 1311 | ||
1253 | static struct notifier_block rcutorture_shutdown_nb = { | 1312 | static struct notifier_block rcutorture_shutdown_nb = { |
@@ -1280,8 +1339,9 @@ static int rcutorture_booster_init(int cpu) | |||
1280 | /* Don't allow time recalculation while creating a new task. */ | 1339 | /* Don't allow time recalculation while creating a new task. */ |
1281 | mutex_lock(&boost_mutex); | 1340 | mutex_lock(&boost_mutex); |
1282 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | 1341 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); |
1283 | boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | 1342 | boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, |
1284 | "rcu_torture_boost"); | 1343 | cpu_to_node(cpu), |
1344 | "rcu_torture_boost"); | ||
1285 | if (IS_ERR(boost_tasks[cpu])) { | 1345 | if (IS_ERR(boost_tasks[cpu])) { |
1286 | retval = PTR_ERR(boost_tasks[cpu]); | 1346 | retval = PTR_ERR(boost_tasks[cpu]); |
1287 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | 1347 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); |
@@ -1296,6 +1356,131 @@ static int rcutorture_booster_init(int cpu) | |||
1296 | return 0; | 1356 | return 0; |
1297 | } | 1357 | } |
1298 | 1358 | ||
1359 | /* | ||
1360 | * Cause the rcutorture test to shutdown the system after the test has | ||
1361 | * run for the time specified by the shutdown_secs module parameter. | ||
1362 | */ | ||
1363 | static int | ||
1364 | rcu_torture_shutdown(void *arg) | ||
1365 | { | ||
1366 | long delta; | ||
1367 | unsigned long jiffies_snap; | ||
1368 | |||
1369 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); | ||
1370 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
1371 | while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && | ||
1372 | !kthread_should_stop()) { | ||
1373 | delta = shutdown_time - jiffies_snap; | ||
1374 | if (verbose) | ||
1375 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1376 | "rcu_torture_shutdown task: %lu " | ||
1377 | "jiffies remaining\n", | ||
1378 | torture_type, delta); | ||
1379 | schedule_timeout_interruptible(delta); | ||
1380 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
1381 | } | ||
1382 | if (kthread_should_stop()) { | ||
1383 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); | ||
1384 | return 0; | ||
1385 | } | ||
1386 | |||
1387 | /* OK, shut down the system. */ | ||
1388 | |||
1389 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); | ||
1390 | shutdown_task = NULL; /* Avoid self-kill deadlock. */ | ||
1391 | rcu_torture_cleanup(); /* Get the success/failure message. */ | ||
1392 | kernel_power_off(); /* Shut down the system. */ | ||
1393 | return 0; | ||
1394 | } | ||
1395 | |||
1396 | #ifdef CONFIG_HOTPLUG_CPU | ||
1397 | |||
1398 | /* | ||
1399 | * Execute random CPU-hotplug operations at the interval specified | ||
1400 | * by the onoff_interval. | ||
1401 | */ | ||
1402 | static int | ||
1403 | rcu_torture_onoff(void *arg) | ||
1404 | { | ||
1405 | int cpu; | ||
1406 | int maxcpu = -1; | ||
1407 | DEFINE_RCU_RANDOM(rand); | ||
1408 | |||
1409 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | ||
1410 | for_each_online_cpu(cpu) | ||
1411 | maxcpu = cpu; | ||
1412 | WARN_ON(maxcpu < 0); | ||
1413 | while (!kthread_should_stop()) { | ||
1414 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); | ||
1415 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | ||
1416 | if (verbose) | ||
1417 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1418 | "rcu_torture_onoff task: offlining %d\n", | ||
1419 | torture_type, cpu); | ||
1420 | n_offline_attempts++; | ||
1421 | if (cpu_down(cpu) == 0) { | ||
1422 | if (verbose) | ||
1423 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1424 | "rcu_torture_onoff task: " | ||
1425 | "offlined %d\n", | ||
1426 | torture_type, cpu); | ||
1427 | n_offline_successes++; | ||
1428 | } | ||
1429 | } else if (cpu_is_hotpluggable(cpu)) { | ||
1430 | if (verbose) | ||
1431 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1432 | "rcu_torture_onoff task: onlining %d\n", | ||
1433 | torture_type, cpu); | ||
1434 | n_online_attempts++; | ||
1435 | if (cpu_up(cpu) == 0) { | ||
1436 | if (verbose) | ||
1437 | printk(KERN_ALERT "%s" TORTURE_FLAG | ||
1438 | "rcu_torture_onoff task: " | ||
1439 | "onlined %d\n", | ||
1440 | torture_type, cpu); | ||
1441 | n_online_successes++; | ||
1442 | } | ||
1443 | } | ||
1444 | schedule_timeout_interruptible(onoff_interval * HZ); | ||
1445 | } | ||
1446 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); | ||
1447 | return 0; | ||
1448 | } | ||
1449 | |||
1450 | static int | ||
1451 | rcu_torture_onoff_init(void) | ||
1452 | { | ||
1453 | if (onoff_interval <= 0) | ||
1454 | return 0; | ||
1455 | onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); | ||
1456 | if (IS_ERR(onoff_task)) { | ||
1457 | onoff_task = NULL; | ||
1458 | return PTR_ERR(onoff_task); | ||
1459 | } | ||
1460 | return 0; | ||
1461 | } | ||
1462 | |||
1463 | static void rcu_torture_onoff_cleanup(void) | ||
1464 | { | ||
1465 | if (onoff_task == NULL) | ||
1466 | return; | ||
1467 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | ||
1468 | kthread_stop(onoff_task); | ||
1469 | } | ||
1470 | |||
1471 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1472 | |||
1473 | static void | ||
1474 | rcu_torture_onoff_init(void) | ||
1475 | { | ||
1476 | } | ||
1477 | |||
1478 | static void rcu_torture_onoff_cleanup(void) | ||
1479 | { | ||
1480 | } | ||
1481 | |||
1482 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
1483 | |||
1299 | static int rcutorture_cpu_notify(struct notifier_block *self, | 1484 | static int rcutorture_cpu_notify(struct notifier_block *self, |
1300 | unsigned long action, void *hcpu) | 1485 | unsigned long action, void *hcpu) |
1301 | { | 1486 | { |
@@ -1400,6 +1585,11 @@ rcu_torture_cleanup(void) | |||
1400 | for_each_possible_cpu(i) | 1585 | for_each_possible_cpu(i) |
1401 | rcutorture_booster_cleanup(i); | 1586 | rcutorture_booster_cleanup(i); |
1402 | } | 1587 | } |
1588 | if (shutdown_task != NULL) { | ||
1589 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | ||
1590 | kthread_stop(shutdown_task); | ||
1591 | } | ||
1592 | rcu_torture_onoff_cleanup(); | ||
1403 | 1593 | ||
1404 | /* Wait for all RCU callbacks to fire. */ | 1594 | /* Wait for all RCU callbacks to fire. */ |
1405 | 1595 | ||
@@ -1424,8 +1614,8 @@ rcu_torture_init(void) | |||
1424 | int firsterr = 0; | 1614 | int firsterr = 0; |
1425 | static struct rcu_torture_ops *torture_ops[] = | 1615 | static struct rcu_torture_ops *torture_ops[] = |
1426 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, | 1616 | { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, |
1427 | &rcu_bh_ops, &rcu_bh_sync_ops, | 1617 | &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, |
1428 | &srcu_ops, &srcu_expedited_ops, | 1618 | &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops, |
1429 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; | 1619 | &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; |
1430 | 1620 | ||
1431 | mutex_lock(&fullstop_mutex); | 1621 | mutex_lock(&fullstop_mutex); |
@@ -1616,6 +1806,18 @@ rcu_torture_init(void) | |||
1616 | } | 1806 | } |
1617 | } | 1807 | } |
1618 | } | 1808 | } |
1809 | if (shutdown_secs > 0) { | ||
1810 | shutdown_time = jiffies + shutdown_secs * HZ; | ||
1811 | shutdown_task = kthread_run(rcu_torture_shutdown, NULL, | ||
1812 | "rcu_torture_shutdown"); | ||
1813 | if (IS_ERR(shutdown_task)) { | ||
1814 | firsterr = PTR_ERR(shutdown_task); | ||
1815 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); | ||
1816 | shutdown_task = NULL; | ||
1817 | goto unwind; | ||
1818 | } | ||
1819 | } | ||
1820 | rcu_torture_onoff_init(); | ||
1619 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1821 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1620 | rcutorture_record_test_transition(); | 1822 | rcutorture_record_test_transition(); |
1621 | mutex_unlock(&fullstop_mutex); | 1823 | mutex_unlock(&fullstop_mutex); |
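On the timing loops converted above: the open-coded tests such as "jiffies - oldstarttime > ULONG_MAX / 2" (and the clearly broken "jiffies - fqs_resume_time > LONG_MAX" variant) are replaced by ULONG_CMP_LT(), which expresses "a is before b" in wraparound-safe modular arithmetic. The macro, as defined in include/linux/rcupdate.h, together with a hypothetical deadline loop:

	/* True iff a precedes b, modulo ULONG_MAX + 1 (wraparound-safe). */
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

	/* Hypothetical helper: sleeps until "deadline" (in jiffies) has
	 * passed, remaining correct even if jiffies wraps through zero. */
	static void wait_for_deadline(unsigned long deadline)
	{
		while (ULONG_CMP_LT(jiffies, deadline))
			schedule_timeout_interruptible(1);
	}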
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ba06207b1dd3..6c4a6722abfd 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -38,7 +38,7 @@ | |||
38 | #include <linux/nmi.h> | 38 | #include <linux/nmi.h> |
39 | #include <linux/atomic.h> | 39 | #include <linux/atomic.h> |
40 | #include <linux/bitops.h> | 40 | #include <linux/bitops.h> |
41 | #include <linux/module.h> | 41 | #include <linux/export.h> |
42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
43 | #include <linux/moduleparam.h> | 43 | #include <linux/moduleparam.h> |
44 | #include <linux/percpu.h> | 44 | #include <linux/percpu.h> |
@@ -52,13 +52,16 @@ | |||
52 | #include <linux/prefetch.h> | 52 | #include <linux/prefetch.h> |
53 | 53 | ||
54 | #include "rcutree.h" | 54 | #include "rcutree.h" |
55 | #include <trace/events/rcu.h> | ||
56 | |||
57 | #include "rcu.h" | ||
55 | 58 | ||
56 | /* Data structures. */ | 59 | /* Data structures. */ |
57 | 60 | ||
58 | static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | 61 | static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; |
59 | 62 | ||
60 | #define RCU_STATE_INITIALIZER(structname) { \ | 63 | #define RCU_STATE_INITIALIZER(structname) { \ |
61 | .level = { &structname.node[0] }, \ | 64 | .level = { &structname##_state.node[0] }, \ |
62 | .levelcnt = { \ | 65 | .levelcnt = { \ |
63 | NUM_RCU_LVL_0, /* root of hierarchy. */ \ | 66 | NUM_RCU_LVL_0, /* root of hierarchy. */ \ |
64 | NUM_RCU_LVL_1, \ | 67 | NUM_RCU_LVL_1, \ |
@@ -66,20 +69,20 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
66 | NUM_RCU_LVL_3, \ | 69 | NUM_RCU_LVL_3, \ |
67 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ | 70 | NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ |
68 | }, \ | 71 | }, \ |
69 | .signaled = RCU_GP_IDLE, \ | 72 | .fqs_state = RCU_GP_IDLE, \ |
70 | .gpnum = -300, \ | 73 | .gpnum = -300, \ |
71 | .completed = -300, \ | 74 | .completed = -300, \ |
72 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ | 75 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
73 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ | 76 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
74 | .n_force_qs = 0, \ | 77 | .n_force_qs = 0, \ |
75 | .n_force_qs_ngp = 0, \ | 78 | .n_force_qs_ngp = 0, \ |
76 | .name = #structname, \ | 79 | .name = #structname, \ |
77 | } | 80 | } |
78 | 81 | ||
79 | struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); | 82 | struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); |
80 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); |
81 | 84 | ||
82 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 85 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); |
83 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 86 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
84 | 87 | ||
85 | static struct rcu_state *rcu_state; | 88 | static struct rcu_state *rcu_state; |
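The RCU_STATE_INITIALIZER() rework above leans on the two preprocessor operators behaving differently: structname##_state token-pastes the short flavor name back into the real structure identifier, while #structname stringifies only the short name, so the .name fields (and hence the new rcu tracepoints) read "rcu_sched"/"rcu_bh" rather than "rcu_sched_state". A stand-alone user-space demonstration (names here are hypothetical stand-ins, not kernel code):

	#include <stdio.h>

	#define NAME_OF(x)	#x		/* stringify:   NAME_OF(rcu_sched)  -> "rcu_sched"      */
	#define STATE_OF(x)	x##_state	/* token-paste: STATE_OF(rcu_sched) -> rcu_sched_state  */

	static int rcu_sched_state = 42;	/* stand-in for the real per-flavor structure */

	int main(void)
	{
		printf("%s = %d\n", NAME_OF(rcu_sched), STATE_OF(rcu_sched));
		return 0;
	}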
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | |||
128 | static void invoke_rcu_core(void); | 131 | static void invoke_rcu_core(void); |
129 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 132 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
130 | 133 | ||
131 | #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ | ||
132 | |||
133 | /* | 134 | /* |
134 | * Track the rcutorture test sequence number and the update version | 135 | * Track the rcutorture test sequence number and the update version |
135 | * number within a given test. The rcutorture_testseq is incremented | 136 | * number within a given test. The rcutorture_testseq is incremented |
@@ -156,44 +157,50 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) | |||
156 | * Note a quiescent state. Because we do not need to know | 157 | * Note a quiescent state. Because we do not need to know |
157 | * how many quiescent states passed, just if there was at least | 158 | * how many quiescent states passed, just if there was at least |
158 | * one since the start of the grace period, this just sets a flag. | 159 | * one since the start of the grace period, this just sets a flag. |
160 | * The caller must have disabled preemption. | ||
159 | */ | 161 | */ |
160 | void rcu_sched_qs(int cpu) | 162 | void rcu_sched_qs(int cpu) |
161 | { | 163 | { |
162 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 164 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
163 | 165 | ||
164 | rdp->passed_quiesc_completed = rdp->gpnum - 1; | 166 | rdp->passed_quiesce_gpnum = rdp->gpnum; |
165 | barrier(); | 167 | barrier(); |
166 | rdp->passed_quiesc = 1; | 168 | if (rdp->passed_quiesce == 0) |
169 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | ||
170 | rdp->passed_quiesce = 1; | ||
167 | } | 171 | } |
168 | 172 | ||
169 | void rcu_bh_qs(int cpu) | 173 | void rcu_bh_qs(int cpu) |
170 | { | 174 | { |
171 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 175 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
172 | 176 | ||
173 | rdp->passed_quiesc_completed = rdp->gpnum - 1; | 177 | rdp->passed_quiesce_gpnum = rdp->gpnum; |
174 | barrier(); | 178 | barrier(); |
175 | rdp->passed_quiesc = 1; | 179 | if (rdp->passed_quiesce == 0) |
180 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | ||
181 | rdp->passed_quiesce = 1; | ||
176 | } | 182 | } |
177 | 183 | ||
178 | /* | 184 | /* |
179 | * Note a context switch. This is a quiescent state for RCU-sched, | 185 | * Note a context switch. This is a quiescent state for RCU-sched, |
180 | * and requires special handling for preemptible RCU. | 186 | * and requires special handling for preemptible RCU. |
187 | * The caller must have disabled preemption. | ||
181 | */ | 188 | */ |
182 | void rcu_note_context_switch(int cpu) | 189 | void rcu_note_context_switch(int cpu) |
183 | { | 190 | { |
191 | trace_rcu_utilization("Start context switch"); | ||
184 | rcu_sched_qs(cpu); | 192 | rcu_sched_qs(cpu); |
185 | rcu_preempt_note_context_switch(cpu); | 193 | rcu_preempt_note_context_switch(cpu); |
194 | trace_rcu_utilization("End context switch"); | ||
186 | } | 195 | } |
187 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 196 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
188 | 197 | ||
189 | #ifdef CONFIG_NO_HZ | ||
190 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 198 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
191 | .dynticks_nesting = 1, | 199 | .dynticks_nesting = DYNTICK_TASK_NESTING, |
192 | .dynticks = ATOMIC_INIT(1), | 200 | .dynticks = ATOMIC_INIT(1), |
193 | }; | 201 | }; |
194 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
195 | 202 | ||
196 | static int blimit = 10; /* Maximum callbacks per softirq. */ | 203 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
197 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 204 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ |
198 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ | 205 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ |
199 | 206 | ||
@@ -314,15 +321,16 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
314 | * trust its state not to change because interrupts are disabled. | 321 | * trust its state not to change because interrupts are disabled. |
315 | */ | 322 | */ |
316 | if (cpu_is_offline(rdp->cpu)) { | 323 | if (cpu_is_offline(rdp->cpu)) { |
324 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | ||
317 | rdp->offline_fqs++; | 325 | rdp->offline_fqs++; |
318 | return 1; | 326 | return 1; |
319 | } | 327 | } |
320 | 328 | ||
321 | /* If preemptible RCU, no point in sending reschedule IPI. */ | 329 | /* |
322 | if (rdp->preemptible) | 330 | * The CPU is online, so send it a reschedule IPI. This forces |
323 | return 0; | 331 | * it through the scheduler, and (inefficiently) also handles cases |
324 | 332 | * where idle loops fail to inform RCU about the CPU being idle. | |
325 | /* The CPU is online, so send it a reschedule IPI. */ | 333 | */ |
326 | if (rdp->cpu != smp_processor_id()) | 334 | if (rdp->cpu != smp_processor_id()) |
327 | smp_send_reschedule(rdp->cpu); | 335 | smp_send_reschedule(rdp->cpu); |
328 | else | 336 | else |
@@ -333,64 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
333 | 341 | ||
334 | #endif /* #ifdef CONFIG_SMP */ | 342 | #endif /* #ifdef CONFIG_SMP */ |
335 | 343 | ||
336 | #ifdef CONFIG_NO_HZ | 344 | /* |
345 | * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle | ||
346 | * | ||
347 | * If the new value of the ->dynticks_nesting counter now is zero, | ||
348 | * we really have entered idle, and must do the appropriate accounting. | ||
349 | * The caller must have disabled interrupts. | ||
350 | */ | ||
351 | static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | ||
352 | { | ||
353 | trace_rcu_dyntick("Start", oldval, 0); | ||
354 | if (!is_idle_task(current)) { | ||
355 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
356 | |||
357 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | ||
358 | ftrace_dump(DUMP_ALL); | ||
359 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
360 | current->pid, current->comm, | ||
361 | idle->pid, idle->comm); /* must be idle task! */ | ||
362 | } | ||
363 | rcu_prepare_for_idle(smp_processor_id()); | ||
364 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
365 | smp_mb__before_atomic_inc(); /* See above. */ | ||
366 | atomic_inc(&rdtp->dynticks); | ||
367 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
368 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
369 | } | ||
337 | 370 | ||
338 | /** | 371 | /** |
339 | * rcu_enter_nohz - inform RCU that current CPU is entering nohz | 372 | * rcu_idle_enter - inform RCU that current CPU is entering idle |
340 | * | 373 | * |
341 | * Enter nohz mode, in other words, -leave- the mode in which RCU | 374 | * Enter idle mode, in other words, -leave- the mode in which RCU |
342 | * read-side critical sections can occur. (Though RCU read-side | 375 | * read-side critical sections can occur. (Though RCU read-side |
343 | * critical sections can occur in irq handlers in nohz mode, a possibility | 376 | * critical sections can occur in irq handlers in idle, a possibility |
344 | * handled by rcu_irq_enter() and rcu_irq_exit()). | 377 | * handled by irq_enter() and irq_exit().) |
378 | * | ||
379 | * We crowbar the ->dynticks_nesting field to zero to allow for | ||
380 | * the possibility of usermode upcalls having messed up our count | ||
381 | * of interrupt nesting level during the prior busy period. | ||
345 | */ | 382 | */ |
346 | void rcu_enter_nohz(void) | 383 | void rcu_idle_enter(void) |
347 | { | 384 | { |
348 | unsigned long flags; | 385 | unsigned long flags; |
386 | long long oldval; | ||
349 | struct rcu_dynticks *rdtp; | 387 | struct rcu_dynticks *rdtp; |
350 | 388 | ||
351 | local_irq_save(flags); | 389 | local_irq_save(flags); |
352 | rdtp = &__get_cpu_var(rcu_dynticks); | 390 | rdtp = &__get_cpu_var(rcu_dynticks); |
353 | if (--rdtp->dynticks_nesting) { | 391 | oldval = rdtp->dynticks_nesting; |
354 | local_irq_restore(flags); | 392 | rdtp->dynticks_nesting = 0; |
355 | return; | 393 | rcu_idle_enter_common(rdtp, oldval); |
356 | } | ||
357 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
358 | smp_mb__before_atomic_inc(); /* See above. */ | ||
359 | atomic_inc(&rdtp->dynticks); | ||
360 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
361 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
362 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
363 | |||
364 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
365 | if (in_irq() && | ||
366 | (__get_cpu_var(rcu_sched_data).nxtlist || | ||
367 | __get_cpu_var(rcu_bh_data).nxtlist || | ||
368 | rcu_preempt_needs_cpu(smp_processor_id()))) | ||
369 | set_need_resched(); | ||
370 | } | 395 | } |
371 | 396 | ||
372 | /* | 397 | /** |
373 | * rcu_exit_nohz - inform RCU that current CPU is leaving nohz | 398 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle |
399 | * | ||
400 | * Exit from an interrupt handler, which might possibly result in entering | ||
401 | * idle mode, in other words, leaving the mode in which read-side critical | ||
402 | * sections can occur. | ||
403 | * | ||
404 | * This code assumes that the idle loop never does anything that might | ||
405 | * result in unbalanced calls to irq_enter() and irq_exit(). If your | ||
406 | * architecture violates this assumption, RCU will give you what you | ||
407 | * deserve, good and hard. But very infrequently and irreproducibly. | ||
374 | * | 408 | * |
375 | * Exit nohz mode, in other words, -enter- the mode in which RCU | 409 | * Use things like work queues to work around this limitation. |
376 | * read-side critical sections normally occur. | 410 | * |
411 | * You have been warned. | ||
377 | */ | 412 | */ |
378 | void rcu_exit_nohz(void) | 413 | void rcu_irq_exit(void) |
379 | { | 414 | { |
380 | unsigned long flags; | 415 | unsigned long flags; |
416 | long long oldval; | ||
381 | struct rcu_dynticks *rdtp; | 417 | struct rcu_dynticks *rdtp; |
382 | 418 | ||
383 | local_irq_save(flags); | 419 | local_irq_save(flags); |
384 | rdtp = &__get_cpu_var(rcu_dynticks); | 420 | rdtp = &__get_cpu_var(rcu_dynticks); |
385 | if (rdtp->dynticks_nesting++) { | 421 | oldval = rdtp->dynticks_nesting; |
386 | local_irq_restore(flags); | 422 | rdtp->dynticks_nesting--; |
387 | return; | 423 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); |
388 | } | 424 | if (rdtp->dynticks_nesting) |
425 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | ||
426 | else | ||
427 | rcu_idle_enter_common(rdtp, oldval); | ||
428 | local_irq_restore(flags); | ||
429 | } | ||
430 | |||
431 | /* | ||
432 | * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle | ||
433 | * | ||
434 | * If the new value of the ->dynticks_nesting counter was previously zero, | ||
435 | * we really have exited idle, and must do the appropriate accounting. | ||
436 | * The caller must have disabled interrupts. | ||
437 | */ | ||
438 | static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | ||
439 | { | ||
389 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | 440 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ |
390 | atomic_inc(&rdtp->dynticks); | 441 | atomic_inc(&rdtp->dynticks); |
391 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | 442 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ |
392 | smp_mb__after_atomic_inc(); /* See above. */ | 443 | smp_mb__after_atomic_inc(); /* See above. */ |
393 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 444 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
445 | rcu_cleanup_after_idle(smp_processor_id()); | ||
446 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | ||
447 | if (!is_idle_task(current)) { | ||
448 | struct task_struct *idle = idle_task(smp_processor_id()); | ||
449 | |||
450 | trace_rcu_dyntick("Error on exit: not idle task", | ||
451 | oldval, rdtp->dynticks_nesting); | ||
452 | ftrace_dump(DUMP_ALL); | ||
453 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
454 | current->pid, current->comm, | ||
455 | idle->pid, idle->comm); /* must be idle task! */ | ||
456 | } | ||
457 | } | ||
458 | |||
459 | /** | ||
460 | * rcu_idle_exit - inform RCU that current CPU is leaving idle | ||
461 | * | ||
462 | * Exit idle mode, in other words, -enter- the mode in which RCU | ||
463 | * read-side critical sections can occur. | ||
464 | * | ||
465 | * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to | ||
466 | * allow for the possibility of usermode upcalls messing up our count | ||
467 | * of interrupt nesting level during the busy period that is just | ||
468 | * now starting. | ||
469 | */ | ||
470 | void rcu_idle_exit(void) | ||
471 | { | ||
472 | unsigned long flags; | ||
473 | struct rcu_dynticks *rdtp; | ||
474 | long long oldval; | ||
475 | |||
476 | local_irq_save(flags); | ||
477 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
478 | oldval = rdtp->dynticks_nesting; | ||
479 | WARN_ON_ONCE(oldval != 0); | ||
480 | rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; | ||
481 | rcu_idle_exit_common(rdtp, oldval); | ||
482 | local_irq_restore(flags); | ||
483 | } | ||
484 | |||
485 | /** | ||
486 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | ||
487 | * | ||
488 | * Enter an interrupt handler, which might possibly result in exiting | ||
489 | * idle mode, in other words, entering the mode in which read-side critical | ||
490 | * sections can occur. | ||
491 | * | ||
492 | * Note that the Linux kernel is fully capable of entering an interrupt | ||
493 | * handler that it never exits, for example when doing upcalls to | ||
494 | * user mode! This code assumes that the idle loop never does upcalls to | ||
495 | * user mode. If your architecture does do upcalls from the idle loop (or | ||
496 | * does anything else that results in unbalanced calls to the irq_enter() | ||
497 | * and irq_exit() functions), RCU will give you what you deserve, good | ||
498 | * and hard. But very infrequently and irreproducibly. | ||
499 | * | ||
500 | * Use things like work queues to work around this limitation. | ||
501 | * | ||
502 | * You have been warned. | ||
503 | */ | ||
504 | void rcu_irq_enter(void) | ||
505 | { | ||
506 | unsigned long flags; | ||
507 | struct rcu_dynticks *rdtp; | ||
508 | long long oldval; | ||
509 | |||
510 | local_irq_save(flags); | ||
511 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
512 | oldval = rdtp->dynticks_nesting; | ||
513 | rdtp->dynticks_nesting++; | ||
514 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | ||
515 | if (oldval) | ||
516 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | ||
517 | else | ||
518 | rcu_idle_exit_common(rdtp, oldval); | ||
394 | local_irq_restore(flags); | 519 | local_irq_restore(flags); |
395 | } | 520 | } |
396 | 521 | ||
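All four entry/exit paths above share one piece of bookkeeping: an atomic ->dynticks counter whose parity tells RCU whether it may ignore the CPU (even means idle, odd means RCU is watching), plus a ->dynticks_nesting count that is crowbarred on idle entry and restored on idle exit. The stand-alone user-space sketch below models that convention so it can be compiled and stepped through outside the kernel; it is an illustration only, the memory barriers are omitted, and TASK_NESTING_BIAS merely stands in for the kernel's DYNTICK_TASK_NESTING.

#include <stdatomic.h>
#include <stdio.h>
#include <assert.h>

#define TASK_NESTING_BIAS 1              /* stand-in for DYNTICK_TASK_NESTING */

static atomic_int dynticks = 1;          /* odd: CPU busy, RCU watching */
static long long dynticks_nesting = TASK_NESTING_BIAS;

static void idle_enter(void)             /* models rcu_idle_enter() */
{
	dynticks_nesting = 0;                     /* crowbar the nesting count */
	atomic_fetch_add(&dynticks, 1);           /* odd -> even: RCU may ignore us */
	assert((atomic_load(&dynticks) & 0x1) == 0);
}

static void idle_exit(void)              /* models rcu_idle_exit() */
{
	dynticks_nesting = TASK_NESTING_BIAS;     /* back to process-level nesting */
	atomic_fetch_add(&dynticks, 1);           /* even -> odd: RCU watching again */
	assert(atomic_load(&dynticks) & 0x1);
}

static void irq_enter(void)              /* models rcu_irq_enter() */
{
	if (dynticks_nesting++ == 0)              /* interrupting the idle loop */
		atomic_fetch_add(&dynticks, 1);   /* counter goes odd for the handler */
}

static void irq_exit(void)               /* models rcu_irq_exit() */
{
	if (--dynticks_nesting == 0)              /* handler returns to the idle loop */
		atomic_fetch_add(&dynticks, 1);   /* counter even again */
}

int main(void)
{
	idle_enter();            /* CPU enters the idle loop */
	irq_enter();             /* interrupt arrives while idle */
	irq_exit();              /* handler done, back to idle */
	idle_exit();             /* a task becomes runnable */
	printf("dynticks=%d (odd => RCU watching)\n", atomic_load(&dynticks));
	return 0;
}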
@@ -437,27 +562,37 @@ void rcu_nmi_exit(void) | |||
437 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 562 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
438 | } | 563 | } |
439 | 564 | ||
565 | #ifdef CONFIG_PROVE_RCU | ||
566 | |||
440 | /** | 567 | /** |
441 | * rcu_irq_enter - inform RCU of entry to hard irq context | 568 | * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle |
442 | * | 569 | * |
443 | * If the CPU was idle with dynamic ticks active, this updates the | 570 | * If the current CPU is in its idle loop and is neither in an interrupt |
444 | * rdtp->dynticks to let the RCU handling know that the CPU is active. | 571 | * nor NMI handler, return true. |
445 | */ | 572 | */ |
446 | void rcu_irq_enter(void) | 573 | int rcu_is_cpu_idle(void) |
447 | { | 574 | { |
448 | rcu_exit_nohz(); | 575 | int ret; |
576 | |||
577 | preempt_disable(); | ||
578 | ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; | ||
579 | preempt_enable(); | ||
580 | return ret; | ||
449 | } | 581 | } |
582 | EXPORT_SYMBOL(rcu_is_cpu_idle); | ||
583 | |||
584 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
450 | 585 | ||
451 | /** | 586 | /** |
452 | * rcu_irq_exit - inform RCU of exit from hard irq context | 587 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle |
453 | * | 588 | * |
454 | * If the CPU was idle with dynamic ticks active, update the rdp->dynticks | 589 | * If the current CPU is idle or running at a first-level (not nested) |
455 | * to put let the RCU handling be aware that the CPU is going back to idle | 590 | * interrupt from idle, return true. The caller must have at least |
456 | * with no ticks. | 591 | * disabled preemption. |
457 | */ | 592 | */ |
458 | void rcu_irq_exit(void) | 593 | int rcu_is_cpu_rrupt_from_idle(void) |
459 | { | 594 | { |
460 | rcu_enter_nohz(); | 595 | return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; |
461 | } | 596 | } |
462 | 597 | ||
463 | #ifdef CONFIG_SMP | 598 | #ifdef CONFIG_SMP |
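rcu_is_cpu_rrupt_from_idle() boils down to a single comparison, dynticks_nesting <= 1, and later in this patch rcu_check_callbacks() uses it to decide whether the scheduler tick interrupted an idle CPU. The sketch below shows why the test works: the idle loop runs at nesting level 0, a first-level interrupt taken from idle runs at 1, and a non-idle CPU carries the large process-level bias, so the test fails there. This is a user-space illustration, not kernel code, and the bias value is an assumption.

#include <stdio.h>

#define TASK_NESTING_BIAS (1LL << 30)    /* large process-level bias, value assumed */

static long long dynticks_nesting;

/* Models rcu_is_cpu_rrupt_from_idle(): idle loop, or a first-level
 * interrupt that fired while the CPU was in the idle loop. */
static int rrupt_from_idle(void)
{
	return dynticks_nesting <= 1;
}

int main(void)
{
	dynticks_nesting = 0;                        /* idle loop */
	printf("idle loop:        %d\n", rrupt_from_idle());    /* 1 */

	dynticks_nesting = 1;                        /* tick interrupts the idle loop */
	printf("tick from idle:   %d\n", rrupt_from_idle());    /* 1: quiescent state */

	dynticks_nesting = TASK_NESTING_BIAS + 1;    /* tick interrupts a busy task */
	printf("tick from a task: %d\n", rrupt_from_idle());    /* 0 */
	return 0;
}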
@@ -470,7 +605,7 @@ void rcu_irq_exit(void) | |||
470 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 605 | static int dyntick_save_progress_counter(struct rcu_data *rdp) |
471 | { | 606 | { |
472 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | 607 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
473 | return 0; | 608 | return (rdp->dynticks_snap & 0x1) == 0; |
474 | } | 609 | } |
475 | 610 | ||
476 | /* | 611 | /* |
@@ -481,11 +616,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
481 | */ | 616 | */ |
482 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 617 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
483 | { | 618 | { |
484 | unsigned long curr; | 619 | unsigned int curr; |
485 | unsigned long snap; | 620 | unsigned int snap; |
486 | 621 | ||
487 | curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); | 622 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); |
488 | snap = (unsigned long)rdp->dynticks_snap; | 623 | snap = (unsigned int)rdp->dynticks_snap; |
489 | 624 | ||
490 | /* | 625 | /* |
491 | * If the CPU passed through or entered a dynticks idle phase with | 626 | * If the CPU passed through or entered a dynticks idle phase with |
@@ -495,7 +630,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
495 | * read-side critical section that started before the beginning | 630 | * read-side critical section that started before the beginning |
496 | * of the current RCU grace period. | 631 | * of the current RCU grace period. |
497 | */ | 632 | */ |
498 | if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { | 633 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { |
634 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); | ||
499 | rdp->dynticks_fqs++; | 635 | rdp->dynticks_fqs++; |
500 | return 1; | 636 | return 1; |
501 | } | 637 | } |
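The comparison above has to survive wraparound of the 32-bit dynticks snapshot, which is why the locals were narrowed to unsigned int and ULONG_CMP_GE became UINT_CMP_GE. The sketch below uses one common wraparound-safe formulation (the kernel's own macro may be spelled differently) to show the two ways a quiescent state is inferred: either the counter is even right now, or it advanced by at least two since the snapshot, meaning the CPU passed through dynticks idle in the meantime.

#include <stdio.h>

/* One wraparound-safe ">=": treat the 32-bit difference as signed.
 * Illustrative only; the kernel defines UINT_CMP_GE() its own way. */
#define UINT_CMP_GE(a, b)	((int)((a) - (b)) >= 0)

int main(void)
{
	unsigned int snap = 0xffffffffu;   /* odd: CPU was busy at snapshot time */
	unsigned int curr = snap + 2;      /* wrapped to 1: busy again now... */

	/* ...but the counter passed through an even value on the way,
	 * so the CPU visited dynticks idle and owes no further QS. */
	if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2))
		printf("dynticks quiescent state: yes\n");
	else
		printf("dynticks quiescent state: no\n");
	return 0;
}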
@@ -506,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
506 | 642 | ||
507 | #endif /* #ifdef CONFIG_SMP */ | 643 | #endif /* #ifdef CONFIG_SMP */ |
508 | 644 | ||
509 | #else /* #ifdef CONFIG_NO_HZ */ | ||
510 | |||
511 | #ifdef CONFIG_SMP | ||
512 | |||
513 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | ||
514 | { | ||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | ||
519 | { | ||
520 | return rcu_implicit_offline_qs(rdp); | ||
521 | } | ||
522 | |||
523 | #endif /* #ifdef CONFIG_SMP */ | ||
524 | |||
525 | #endif /* #else #ifdef CONFIG_NO_HZ */ | ||
526 | |||
527 | int rcu_cpu_stall_suppress __read_mostly; | ||
528 | |||
529 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 645 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
530 | { | 646 | { |
531 | rsp->gp_start = jiffies; | 647 | rsp->gp_start = jiffies; |
@@ -537,6 +653,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
537 | int cpu; | 653 | int cpu; |
538 | long delta; | 654 | long delta; |
539 | unsigned long flags; | 655 | unsigned long flags; |
656 | int ndetected; | ||
540 | struct rcu_node *rnp = rcu_get_root(rsp); | 657 | struct rcu_node *rnp = rcu_get_root(rsp); |
541 | 658 | ||
542 | /* Only let one CPU complain about others per time interval. */ | 659 | /* Only let one CPU complain about others per time interval. */ |
@@ -553,7 +670,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
553 | * Now rat on any tasks that got kicked up to the root rcu_node | 670 | * Now rat on any tasks that got kicked up to the root rcu_node |
554 | * due to CPU offlining. | 671 | * due to CPU offlining. |
555 | */ | 672 | */ |
556 | rcu_print_task_stall(rnp); | 673 | ndetected = rcu_print_task_stall(rnp); |
557 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 674 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
558 | 675 | ||
559 | /* | 676 | /* |
@@ -565,17 +682,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
565 | rsp->name); | 682 | rsp->name); |
566 | rcu_for_each_leaf_node(rsp, rnp) { | 683 | rcu_for_each_leaf_node(rsp, rnp) { |
567 | raw_spin_lock_irqsave(&rnp->lock, flags); | 684 | raw_spin_lock_irqsave(&rnp->lock, flags); |
568 | rcu_print_task_stall(rnp); | 685 | ndetected += rcu_print_task_stall(rnp); |
569 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 686 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
570 | if (rnp->qsmask == 0) | 687 | if (rnp->qsmask == 0) |
571 | continue; | 688 | continue; |
572 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | 689 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) |
573 | if (rnp->qsmask & (1UL << cpu)) | 690 | if (rnp->qsmask & (1UL << cpu)) { |
574 | printk(" %d", rnp->grplo + cpu); | 691 | printk(" %d", rnp->grplo + cpu); |
692 | ndetected++; | ||
693 | } | ||
575 | } | 694 | } |
576 | printk("} (detected by %d, t=%ld jiffies)\n", | 695 | printk("} (detected by %d, t=%ld jiffies)\n", |
577 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); | 696 | smp_processor_id(), (long)(jiffies - rsp->gp_start)); |
578 | trigger_all_cpu_backtrace(); | 697 | if (ndetected == 0) |
698 | printk(KERN_ERR "INFO: Stall ended before state dump start\n"); | ||
699 | else if (!trigger_all_cpu_backtrace()) | ||
700 | dump_stack(); | ||
579 | 701 | ||
580 | /* If so configured, complain about tasks blocking the grace period. */ | 702 | /* If so configured, complain about tasks blocking the grace period. */ |
581 | 703 | ||
@@ -596,7 +718,8 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
596 | */ | 718 | */ |
597 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", | 719 | printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", |
598 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); | 720 | rsp->name, smp_processor_id(), jiffies - rsp->gp_start); |
599 | trigger_all_cpu_backtrace(); | 721 | if (!trigger_all_cpu_backtrace()) |
722 | dump_stack(); | ||
600 | 723 | ||
601 | raw_spin_lock_irqsave(&rnp->lock, flags); | 724 | raw_spin_lock_irqsave(&rnp->lock, flags); |
602 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) | 725 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) |
@@ -678,9 +801,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct | |||
678 | * go looking for one. | 801 | * go looking for one. |
679 | */ | 802 | */ |
680 | rdp->gpnum = rnp->gpnum; | 803 | rdp->gpnum = rnp->gpnum; |
804 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | ||
681 | if (rnp->qsmask & rdp->grpmask) { | 805 | if (rnp->qsmask & rdp->grpmask) { |
682 | rdp->qs_pending = 1; | 806 | rdp->qs_pending = 1; |
683 | rdp->passed_quiesc = 0; | 807 | rdp->passed_quiesce = 0; |
684 | } else | 808 | } else |
685 | rdp->qs_pending = 0; | 809 | rdp->qs_pending = 0; |
686 | } | 810 | } |
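The rename from passed_quiesc to passed_quiesce lands in the code that notices a newly started grace period. Stripped of locking and tracing, the per-CPU bookkeeping is roughly the sketch below; the field names are taken from the patch, but the surrounding structures are simplified for illustration.

/* Sketch of __note_new_gpnum()'s effect on per-CPU state when grace
 * period 'gpnum' starts: a CPU whose bit is set in the leaf node's
 * qsmask owes the new grace period a quiescent state. */
struct rdp_sketch {
	unsigned long gpnum;       /* highest GP number this CPU knows of */
	int qs_pending;            /* core is waiting on this CPU */
	int passed_quiesce;        /* QS observed since the GP started */
};

static void note_new_gpnum(struct rdp_sketch *rdp, unsigned long gpnum,
			   unsigned long qsmask, unsigned long grpmask)
{
	rdp->gpnum = gpnum;
	if (qsmask & grpmask) {
		rdp->qs_pending = 1;
		rdp->passed_quiesce = 0;   /* any earlier QS is now stale */
	} else {
		rdp->qs_pending = 0;       /* nothing owed for this GP */
	}
}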
@@ -741,6 +865,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
741 | 865 | ||
742 | /* Remember that we saw this grace-period completion. */ | 866 | /* Remember that we saw this grace-period completion. */ |
743 | rdp->completed = rnp->completed; | 867 | rdp->completed = rnp->completed; |
868 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); | ||
744 | 869 | ||
745 | /* | 870 | /* |
746 | * If we were in an extended quiescent state, we may have | 871 | * If we were in an extended quiescent state, we may have |
@@ -826,33 +951,33 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
826 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 951 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
827 | struct rcu_node *rnp = rcu_get_root(rsp); | 952 | struct rcu_node *rnp = rcu_get_root(rsp); |
828 | 953 | ||
829 | if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { | 954 | if (!rcu_scheduler_fully_active || |
830 | if (cpu_needs_another_gp(rsp, rdp)) | 955 | !cpu_needs_another_gp(rsp, rdp)) { |
831 | rsp->fqs_need_gp = 1; | 956 | /* |
832 | if (rnp->completed == rsp->completed) { | 957 | * Either the scheduler hasn't yet spawned the first |
833 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 958 | * non-idle task or this CPU does not need another |
834 | return; | 959 | * grace period. Either way, don't start a new grace |
835 | } | 960 | * period. |
836 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 961 | */ |
962 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
963 | return; | ||
964 | } | ||
837 | 965 | ||
966 | if (rsp->fqs_active) { | ||
838 | /* | 967 | /* |
839 | * Propagate new ->completed value to rcu_node structures | 968 | * This CPU needs a grace period, but force_quiescent_state() |
840 | * so that other CPUs don't have to wait until the start | 969 | * is running. Tell it to start one on this CPU's behalf. |
841 | * of the next grace period to process their callbacks. | ||
842 | */ | 970 | */ |
843 | rcu_for_each_node_breadth_first(rsp, rnp) { | 971 | rsp->fqs_need_gp = 1; |
844 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 972 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
845 | rnp->completed = rsp->completed; | ||
846 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
847 | } | ||
848 | local_irq_restore(flags); | ||
849 | return; | 973 | return; |
850 | } | 974 | } |
851 | 975 | ||
852 | /* Advance to a new grace period and initialize state. */ | 976 | /* Advance to a new grace period and initialize state. */ |
853 | rsp->gpnum++; | 977 | rsp->gpnum++; |
854 | WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); | 978 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); |
855 | rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | 979 | WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); |
980 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | ||
856 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | 981 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; |
857 | record_gp_stall_check_time(rsp); | 982 | record_gp_stall_check_time(rsp); |
858 | 983 | ||
@@ -862,9 +987,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
862 | rnp->qsmask = rnp->qsmaskinit; | 987 | rnp->qsmask = rnp->qsmaskinit; |
863 | rnp->gpnum = rsp->gpnum; | 988 | rnp->gpnum = rsp->gpnum; |
864 | rnp->completed = rsp->completed; | 989 | rnp->completed = rsp->completed; |
865 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 990 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ |
866 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 991 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
867 | rcu_preempt_boost_start_gp(rnp); | 992 | rcu_preempt_boost_start_gp(rnp); |
993 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
994 | rnp->level, rnp->grplo, | ||
995 | rnp->grphi, rnp->qsmask); | ||
868 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 996 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
869 | return; | 997 | return; |
870 | } | 998 | } |
@@ -901,12 +1029,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
901 | if (rnp == rdp->mynode) | 1029 | if (rnp == rdp->mynode) |
902 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 1030 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
903 | rcu_preempt_boost_start_gp(rnp); | 1031 | rcu_preempt_boost_start_gp(rnp); |
1032 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
1033 | rnp->level, rnp->grplo, | ||
1034 | rnp->grphi, rnp->qsmask); | ||
904 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1035 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
905 | } | 1036 | } |
906 | 1037 | ||
907 | rnp = rcu_get_root(rsp); | 1038 | rnp = rcu_get_root(rsp); |
908 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1039 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ |
909 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | 1040 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ |
910 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1041 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
911 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1042 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
912 | } | 1043 | } |
@@ -922,6 +1053,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
922 | __releases(rcu_get_root(rsp)->lock) | 1053 | __releases(rcu_get_root(rsp)->lock) |
923 | { | 1054 | { |
924 | unsigned long gp_duration; | 1055 | unsigned long gp_duration; |
1056 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1057 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
925 | 1058 | ||
926 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 1059 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
927 | 1060 | ||
@@ -933,8 +1066,42 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
933 | gp_duration = jiffies - rsp->gp_start; | 1066 | gp_duration = jiffies - rsp->gp_start; |
934 | if (gp_duration > rsp->gp_max) | 1067 | if (gp_duration > rsp->gp_max) |
935 | rsp->gp_max = gp_duration; | 1068 | rsp->gp_max = gp_duration; |
936 | rsp->completed = rsp->gpnum; | 1069 | |
937 | rsp->signaled = RCU_GP_IDLE; | 1070 | /* |
1071 | * We know the grace period is complete, but to everyone else | ||
1072 | * it appears to still be ongoing. But it is also the case | ||
1073 | * that to everyone else it looks like there is nothing that | ||
1074 | * they can do to advance the grace period. It is therefore | ||
1075 | * safe for us to drop the lock in order to mark the grace | ||
1076 | * period as completed in all of the rcu_node structures. | ||
1077 | * | ||
1078 | * But if this CPU needs another grace period, it will take | ||
1079 | * care of this while initializing the next grace period. | ||
1080 | * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL | ||
1081 | * because the callbacks have not yet been advanced: Those | ||
1082 | * callbacks are waiting on the grace period that just now | ||
1083 | * completed. | ||
1084 | */ | ||
1085 | if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { | ||
1086 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1087 | |||
1088 | /* | ||
1089 | * Propagate new ->completed value to rcu_node structures | ||
1090 | * so that other CPUs don't have to wait until the start | ||
1091 | * of the next grace period to process their callbacks. | ||
1092 | */ | ||
1093 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
1094 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1095 | rnp->completed = rsp->gpnum; | ||
1096 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1097 | } | ||
1098 | rnp = rcu_get_root(rsp); | ||
1099 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1100 | } | ||
1101 | |||
1102 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ | ||
1103 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | ||
1104 | rsp->fqs_state = RCU_GP_IDLE; | ||
938 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 1105 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
939 | } | 1106 | } |
940 | 1107 | ||
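The long comment added above argues that, once the root node knows the grace period has ended, the root lock can be dropped and the new ->completed value published node by node, unless this CPU is about to start another grace period anyway. The toy sketch below captures just that publication step; the tree is flattened into an array and the per-node locking is left out, so treat it as an illustration of the idea rather than the kernel's data structures.

#include <stdio.h>

#define NUM_NODES 4                    /* toy rcu_node tree, node 0 is the root */

struct node_sketch {
	unsigned long gpnum;           /* last grace period started at this node */
	unsigned long completed;       /* last grace period known to be complete */
};

static struct node_sketch tree[NUM_NODES];

/* Models the tail of rcu_report_qs_rsp(): publish the just-ended grace
 * period breadth-first so other CPUs can advance their callbacks without
 * waiting for the next grace period to start. */
static void publish_gp_end(unsigned long gpnum)
{
	for (int i = 0; i < NUM_NODES; i++)   /* breadth-first == array order here */
		tree[i].completed = gpnum;    /* the kernel holds each node's lock */
}

int main(void)
{
	for (int i = 0; i < NUM_NODES; i++)
		tree[i].gpnum = 42;           /* grace period #42 in progress */
	publish_gp_end(42);
	printf("root: gpnum=%lu completed=%lu (equal => no GP in progress)\n",
	       tree[0].gpnum, tree[0].completed);
	return 0;
}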
@@ -962,6 +1129,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
962 | return; | 1129 | return; |
963 | } | 1130 | } |
964 | rnp->qsmask &= ~mask; | 1131 | rnp->qsmask &= ~mask; |
1132 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | ||
1133 | mask, rnp->qsmask, rnp->level, | ||
1134 | rnp->grplo, rnp->grphi, | ||
1135 | !!rnp->gp_tasks); | ||
965 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | 1136 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
966 | 1137 | ||
967 | /* Other bits still set at this level, so done. */ | 1138 | /* Other bits still set at this level, so done. */ |
@@ -1000,7 +1171,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
1000 | * based on quiescent states detected in an earlier grace period! | 1171 | * based on quiescent states detected in an earlier grace period! |
1001 | */ | 1172 | */ |
1002 | static void | 1173 | static void |
1003 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) | 1174 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) |
1004 | { | 1175 | { |
1005 | unsigned long flags; | 1176 | unsigned long flags; |
1006 | unsigned long mask; | 1177 | unsigned long mask; |
@@ -1008,17 +1179,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
1008 | 1179 | ||
1009 | rnp = rdp->mynode; | 1180 | rnp = rdp->mynode; |
1010 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1181 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1011 | if (lastcomp != rnp->completed) { | 1182 | if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { |
1012 | 1183 | ||
1013 | /* | 1184 | /* |
1014 | * Someone beat us to it for this grace period, so leave. | 1185 | * The grace period in which this quiescent state was |
1015 | * The race with GP start is resolved by the fact that we | 1186 | * recorded has ended, so don't report it upwards. |
1016 | * hold the leaf rcu_node lock, so that the per-CPU bits | 1187 | * We will instead need a new quiescent state that lies |
1017 | * cannot yet be initialized -- so we would simply find our | 1188 | * within the current grace period. |
1018 | * CPU's bit already cleared in rcu_report_qs_rnp() if this | ||
1019 | * race occurred. | ||
1020 | */ | 1189 | */ |
1021 | rdp->passed_quiesc = 0; /* try again later! */ | 1190 | rdp->passed_quiesce = 0; /* need qs for new gp. */ |
1022 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1191 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1023 | return; | 1192 | return; |
1024 | } | 1193 | } |
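The rewritten test above says: only report a quiescent state upward if it was observed during the grace period that is still in progress on this leaf node. Pulled out into a stand-alone predicate (simplified types, names assumed), the decision looks like this:

#include <stdio.h>
#include <stdbool.h>

/* Models the check in rcu_report_qs_rdp(): lastgp is the GP number recorded
 * when this CPU passed through its quiescent state (passed_quiesce_gpnum). */
static bool qs_still_usable(unsigned long lastgp,
			    unsigned long node_gpnum,
			    unsigned long node_completed)
{
	if (lastgp != node_gpnum || node_completed == node_gpnum)
		return false;    /* stale QS or no GP in progress: need a new one */
	return true;             /* report it up the rcu_node tree */
}

int main(void)
{
	printf("%d\n", qs_still_usable(7, 7, 6));   /* 1: QS belongs to current GP 7 */
	printf("%d\n", qs_still_usable(6, 7, 6));   /* 0: QS recorded during old GP 6 */
	printf("%d\n", qs_still_usable(7, 7, 7));   /* 0: GP 7 already completed */
	return 0;
}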
@@ -1062,14 +1231,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1062 | * Was there a quiescent state since the beginning of the grace | 1231 | * Was there a quiescent state since the beginning of the grace |
1063 | * period? If no, then exit and wait for the next call. | 1232 | * period? If no, then exit and wait for the next call. |
1064 | */ | 1233 | */ |
1065 | if (!rdp->passed_quiesc) | 1234 | if (!rdp->passed_quiesce) |
1066 | return; | 1235 | return; |
1067 | 1236 | ||
1068 | /* | 1237 | /* |
1069 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the | 1238 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the |
1070 | * judge of that). | 1239 | * judge of that). |
1071 | */ | 1240 | */ |
1072 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); | 1241 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); |
1073 | } | 1242 | } |
1074 | 1243 | ||
1075 | #ifdef CONFIG_HOTPLUG_CPU | 1244 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -1130,11 +1299,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1130 | if (rnp->qsmaskinit != 0) { | 1299 | if (rnp->qsmaskinit != 0) { |
1131 | if (rnp != rdp->mynode) | 1300 | if (rnp != rdp->mynode) |
1132 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1301 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1302 | else | ||
1303 | trace_rcu_grace_period(rsp->name, | ||
1304 | rnp->gpnum + 1 - | ||
1305 | !!(rnp->qsmask & mask), | ||
1306 | "cpuofl"); | ||
1133 | break; | 1307 | break; |
1134 | } | 1308 | } |
1135 | if (rnp == rdp->mynode) | 1309 | if (rnp == rdp->mynode) { |
1310 | trace_rcu_grace_period(rsp->name, | ||
1311 | rnp->gpnum + 1 - | ||
1312 | !!(rnp->qsmask & mask), | ||
1313 | "cpuofl"); | ||
1136 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | 1314 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); |
1137 | else | 1315 | } else |
1138 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1316 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1139 | mask = rnp->grpmask; | 1317 | mask = rnp->grpmask; |
1140 | rnp = rnp->parent; | 1318 | rnp = rnp->parent; |
@@ -1153,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1153 | else | 1331 | else |
1154 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1332 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1155 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1333 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1156 | rcu_report_exp_rnp(rsp, rnp); | 1334 | rcu_report_exp_rnp(rsp, rnp, true); |
1157 | rcu_node_kthread_setaffinity(rnp, -1); | 1335 | rcu_node_kthread_setaffinity(rnp, -1); |
1158 | } | 1336 | } |
1159 | 1337 | ||
@@ -1190,17 +1368,24 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1190 | { | 1368 | { |
1191 | unsigned long flags; | 1369 | unsigned long flags; |
1192 | struct rcu_head *next, *list, **tail; | 1370 | struct rcu_head *next, *list, **tail; |
1193 | int count; | 1371 | int bl, count; |
1194 | 1372 | ||
1195 | /* If no callbacks are ready, just return.*/ | 1373 | /* If no callbacks are ready, just return.*/ |
1196 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) | 1374 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { |
1375 | trace_rcu_batch_start(rsp->name, 0, 0); | ||
1376 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), | ||
1377 | need_resched(), is_idle_task(current), | ||
1378 | rcu_is_callbacks_kthread()); | ||
1197 | return; | 1379 | return; |
1380 | } | ||
1198 | 1381 | ||
1199 | /* | 1382 | /* |
1200 | * Extract the list of ready callbacks, disabling to prevent | 1383 | * Extract the list of ready callbacks, disabling to prevent |
1201 | * races with call_rcu() from interrupt handlers. | 1384 | * races with call_rcu() from interrupt handlers. |
1202 | */ | 1385 | */ |
1203 | local_irq_save(flags); | 1386 | local_irq_save(flags); |
1387 | bl = rdp->blimit; | ||
1388 | trace_rcu_batch_start(rsp->name, rdp->qlen, bl); | ||
1204 | list = rdp->nxtlist; | 1389 | list = rdp->nxtlist; |
1205 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | 1390 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; |
1206 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | 1391 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
@@ -1216,13 +1401,19 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1216 | next = list->next; | 1401 | next = list->next; |
1217 | prefetch(next); | 1402 | prefetch(next); |
1218 | debug_rcu_head_unqueue(list); | 1403 | debug_rcu_head_unqueue(list); |
1219 | __rcu_reclaim(list); | 1404 | __rcu_reclaim(rsp->name, list); |
1220 | list = next; | 1405 | list = next; |
1221 | if (++count >= rdp->blimit) | 1406 | /* Stop only if limit reached and CPU has something to do. */ |
1407 | if (++count >= bl && | ||
1408 | (need_resched() || | ||
1409 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | ||
1222 | break; | 1410 | break; |
1223 | } | 1411 | } |
1224 | 1412 | ||
1225 | local_irq_save(flags); | 1413 | local_irq_save(flags); |
1414 | trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), | ||
1415 | is_idle_task(current), | ||
1416 | rcu_is_callbacks_kthread()); | ||
1226 | 1417 | ||
1227 | /* Update count, and requeue any remaining callbacks. */ | 1418 | /* Update count, and requeue any remaining callbacks. */ |
1228 | rdp->qlen -= count; | 1419 | rdp->qlen -= count; |
@@ -1250,7 +1441,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1250 | 1441 | ||
1251 | local_irq_restore(flags); | 1442 | local_irq_restore(flags); |
1252 | 1443 | ||
1253 | /* Re-raise the RCU softirq if there are callbacks remaining. */ | 1444 | /* Re-invoke RCU core processing if there are callbacks remaining. */ |
1254 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1445 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1255 | invoke_rcu_core(); | 1446 | invoke_rcu_core(); |
1256 | } | 1447 | } |
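rcu_do_batch() above now reads the batch limit once into bl and keeps draining callbacks unless the limit is hit while the CPU actually has competing work (need_resched(), a non-idle task, and no dedicated callback kthread). The stand-alone sketch below reproduces that loop shape with a plain singly linked list; the 'busy' flag stands in for those three tests, and the kernel's segmented ->nxttail[] bookkeeping is omitted.

#include <stdio.h>
#include <stdbool.h>

struct cb {
	struct cb *next;
	void (*func)(struct cb *cb);
};

static void print_cb(struct cb *cb)
{
	printf("invoked callback %p\n", (void *)cb);
}

/* Models the core of rcu_do_batch(): invoke callbacks, stopping early only
 * if at least 'blimit' were done and the CPU has something else to do. */
static struct cb *do_batch(struct cb *list, int blimit, bool busy)
{
	int count = 0;

	while (list) {
		struct cb *next = list->next;

		list->func(list);
		list = next;
		if (++count >= blimit && busy)
			break;                 /* requeue the rest for later */
	}
	return list;                           /* remaining callbacks, if any */
}

int main(void)
{
	struct cb cbs[5];
	struct cb *head = &cbs[0];

	for (int i = 0; i < 5; i++) {
		cbs[i].next = (i < 4) ? &cbs[i + 1] : NULL;
		cbs[i].func = print_cb;
	}
	head = do_batch(head, 3, true);        /* busy CPU: stops after three */
	printf("callbacks requeued: %s\n", head ? "yes" : "no");
	return 0;
}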
@@ -1258,17 +1449,16 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1258 | /* | 1449 | /* |
1259 | * Check to see if this CPU is in a non-context-switch quiescent state | 1450 | * Check to see if this CPU is in a non-context-switch quiescent state |
1260 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). | 1451 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). |
1261 | * Also schedule the RCU softirq handler. | 1452 | * Also schedule RCU core processing. |
1262 | * | 1453 | * |
1263 | * This function must be called with hardirqs disabled. It is normally | 1454 | * This function must be called from hardirq context. It is normally |
1264 | * invoked from the scheduling-clock interrupt. If rcu_pending returns | 1455 | * invoked from the scheduling-clock interrupt. If rcu_pending returns |
1265 | * false, there is no point in invoking rcu_check_callbacks(). | 1456 | * false, there is no point in invoking rcu_check_callbacks(). |
1266 | */ | 1457 | */ |
1267 | void rcu_check_callbacks(int cpu, int user) | 1458 | void rcu_check_callbacks(int cpu, int user) |
1268 | { | 1459 | { |
1269 | if (user || | 1460 | trace_rcu_utilization("Start scheduler-tick"); |
1270 | (idle_cpu(cpu) && rcu_scheduler_active && | 1461 | if (user || rcu_is_cpu_rrupt_from_idle()) { |
1271 | !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
1272 | 1462 | ||
1273 | /* | 1463 | /* |
1274 | * Get here if this CPU took its interrupt from user | 1464 | * Get here if this CPU took its interrupt from user |
@@ -1299,6 +1489,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
1299 | rcu_preempt_check_callbacks(cpu); | 1489 | rcu_preempt_check_callbacks(cpu); |
1300 | if (rcu_pending(cpu)) | 1490 | if (rcu_pending(cpu)) |
1301 | invoke_rcu_core(); | 1491 | invoke_rcu_core(); |
1492 | trace_rcu_utilization("End scheduler-tick"); | ||
1302 | } | 1493 | } |
1303 | 1494 | ||
1304 | #ifdef CONFIG_SMP | 1495 | #ifdef CONFIG_SMP |
@@ -1360,10 +1551,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1360 | unsigned long flags; | 1551 | unsigned long flags; |
1361 | struct rcu_node *rnp = rcu_get_root(rsp); | 1552 | struct rcu_node *rnp = rcu_get_root(rsp); |
1362 | 1553 | ||
1363 | if (!rcu_gp_in_progress(rsp)) | 1554 | trace_rcu_utilization("Start fqs"); |
1555 | if (!rcu_gp_in_progress(rsp)) { | ||
1556 | trace_rcu_utilization("End fqs"); | ||
1364 | return; /* No grace period in progress, nothing to force. */ | 1557 | return; /* No grace period in progress, nothing to force. */ |
1558 | } | ||
1365 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { | 1559 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { |
1366 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ | 1560 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ |
1561 | trace_rcu_utilization("End fqs"); | ||
1367 | return; /* Someone else is already on the job. */ | 1562 | return; /* Someone else is already on the job. */ |
1368 | } | 1563 | } |
1369 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) | 1564 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) |
@@ -1377,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1377 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ | 1572 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ |
1378 | } | 1573 | } |
1379 | rsp->fqs_active = 1; | 1574 | rsp->fqs_active = 1; |
1380 | switch (rsp->signaled) { | 1575 | switch (rsp->fqs_state) { |
1381 | case RCU_GP_IDLE: | 1576 | case RCU_GP_IDLE: |
1382 | case RCU_GP_INIT: | 1577 | case RCU_GP_INIT: |
1383 | 1578 | ||
@@ -1393,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1393 | force_qs_rnp(rsp, dyntick_save_progress_counter); | 1588 | force_qs_rnp(rsp, dyntick_save_progress_counter); |
1394 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 1589 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
1395 | if (rcu_gp_in_progress(rsp)) | 1590 | if (rcu_gp_in_progress(rsp)) |
1396 | rsp->signaled = RCU_FORCE_QS; | 1591 | rsp->fqs_state = RCU_FORCE_QS; |
1397 | break; | 1592 | break; |
1398 | 1593 | ||
1399 | case RCU_FORCE_QS: | 1594 | case RCU_FORCE_QS: |
@@ -1412,11 +1607,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1412 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ | 1607 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ |
1413 | rsp->fqs_need_gp = 0; | 1608 | rsp->fqs_need_gp = 0; |
1414 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ | 1609 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ |
1610 | trace_rcu_utilization("End fqs"); | ||
1415 | return; | 1611 | return; |
1416 | } | 1612 | } |
1417 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 1613 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ |
1418 | unlock_fqs_ret: | 1614 | unlock_fqs_ret: |
1419 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); | 1615 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); |
1616 | trace_rcu_utilization("End fqs"); | ||
1420 | } | 1617 | } |
1421 | 1618 | ||
1422 | #else /* #ifdef CONFIG_SMP */ | 1619 | #else /* #ifdef CONFIG_SMP */ |
@@ -1429,9 +1626,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | |||
1429 | #endif /* #else #ifdef CONFIG_SMP */ | 1626 | #endif /* #else #ifdef CONFIG_SMP */ |
1430 | 1627 | ||
1431 | /* | 1628 | /* |
1432 | * This does the RCU processing work from softirq context for the | 1629 | * This does the RCU core processing work for the specified rcu_state |
1433 | * specified rcu_state and rcu_data structures. This may be called | 1630 | * and rcu_data structures. This may be called only from the CPU to |
1434 | * only from the CPU to whom the rdp belongs. | 1631 | * whom the rdp belongs. |
1435 | */ | 1632 | */ |
1436 | static void | 1633 | static void |
1437 | __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 1634 | __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) |
@@ -1468,24 +1665,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1468 | } | 1665 | } |
1469 | 1666 | ||
1470 | /* | 1667 | /* |
1471 | * Do softirq processing for the current CPU. | 1668 | * Do RCU core processing for the current CPU. |
1472 | */ | 1669 | */ |
1473 | static void rcu_process_callbacks(struct softirq_action *unused) | 1670 | static void rcu_process_callbacks(struct softirq_action *unused) |
1474 | { | 1671 | { |
1672 | trace_rcu_utilization("Start RCU core"); | ||
1475 | __rcu_process_callbacks(&rcu_sched_state, | 1673 | __rcu_process_callbacks(&rcu_sched_state, |
1476 | &__get_cpu_var(rcu_sched_data)); | 1674 | &__get_cpu_var(rcu_sched_data)); |
1477 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1675 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); |
1478 | rcu_preempt_process_callbacks(); | 1676 | rcu_preempt_process_callbacks(); |
1479 | 1677 | trace_rcu_utilization("End RCU core"); | |
1480 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | ||
1481 | rcu_needs_cpu_flush(); | ||
1482 | } | 1678 | } |
1483 | 1679 | ||
1484 | /* | 1680 | /* |
1485 | * Wake up the current CPU's kthread. This replaces raise_softirq() | 1681 | * Schedule RCU callback invocation. If the specified type of RCU |
1486 | * in earlier versions of RCU. Note that because we are running on | 1682 | * does not support RCU priority boosting, just do a direct call, |
1487 | * the current CPU with interrupts disabled, the rcu_cpu_kthread_task | 1683 | * otherwise wake up the per-CPU kernel kthread. Note that because we |
1488 | * cannot disappear out from under us. | 1684 | * are running on the current CPU with interrupts disabled, the |
1685 | * rcu_cpu_kthread_task cannot disappear out from under us. | ||
1489 | */ | 1686 | */ |
1490 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | 1687 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) |
1491 | { | 1688 | { |
@@ -1530,6 +1727,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1530 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1727 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1531 | rdp->qlen++; | 1728 | rdp->qlen++; |
1532 | 1729 | ||
1730 | if (__is_kfree_rcu_offset((unsigned long)func)) | ||
1731 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | ||
1732 | rdp->qlen); | ||
1733 | else | ||
1734 | trace_rcu_callback(rsp->name, head, rdp->qlen); | ||
1735 | |||
1533 | /* If interrupts were disabled, don't dive into RCU core. */ | 1736 | /* If interrupts were disabled, don't dive into RCU core. */ |
1534 | if (irqs_disabled_flags(flags)) { | 1737 | if (irqs_disabled_flags(flags)) { |
1535 | local_irq_restore(flags); | 1738 | local_irq_restore(flags); |
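The new trace calls above tell kfree_rcu()-style callbacks apart from ordinary ones with __is_kfree_rcu_offset(). The underlying trick, sketched below in user space, is that kfree_rcu() passes the offset of the embedded rcu_head within the enclosing object where a callback function pointer would normally go; no real function lives at such a small address, so a small value means "just kfree the enclosing object". The 4096 cutoff and the helper names in the sketch are illustrative assumptions, not the kernel's exact definitions.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct rcu_head_model {
	void *next;                        /* mirrors struct rcu_head's layout */
	unsigned long func_or_offset;
};

struct foo {
	int payload;
	struct rcu_head_model rh;          /* embedded at a small, known offset */
};

/* "Function pointers" below 4096 cannot be real code addresses, so treat
 * them as the offset of the rcu_head within the object to be freed. */
static int is_kfree_offset(unsigned long f)
{
	return f < 4096;
}

static void reclaim(struct rcu_head_model *rh, unsigned long f)
{
	if (is_kfree_offset(f)) {
		free((char *)rh - f);      /* back up to the enclosing object */
		printf("kfree'd enclosing object (offset %lu)\n", f);
	} else {
		printf("would invoke real callback at %#lx\n", f);
	}
}

int main(void)
{
	struct foo *p = malloc(sizeof(*p));

	if (!p)
		return 1;
	/* Models kfree_rcu(p, rh): queue the rcu_head with the offset of
	 * 'rh' inside 'struct foo' standing in for the callback pointer. */
	reclaim(&p->rh, (unsigned long)offsetof(struct foo, rh));
	return 0;
}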
@@ -1613,18 +1816,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); | |||
1613 | */ | 1816 | */ |
1614 | void synchronize_sched(void) | 1817 | void synchronize_sched(void) |
1615 | { | 1818 | { |
1616 | struct rcu_synchronize rcu; | ||
1617 | |||
1618 | if (rcu_blocking_is_gp()) | 1819 | if (rcu_blocking_is_gp()) |
1619 | return; | 1820 | return; |
1620 | 1821 | wait_rcu_gp(call_rcu_sched); | |
1621 | init_rcu_head_on_stack(&rcu.head); | ||
1622 | init_completion(&rcu.completion); | ||
1623 | /* Will wake me after RCU finished. */ | ||
1624 | call_rcu_sched(&rcu.head, wakeme_after_rcu); | ||
1625 | /* Wait for it. */ | ||
1626 | wait_for_completion(&rcu.completion); | ||
1627 | destroy_rcu_head_on_stack(&rcu.head); | ||
1628 | } | 1822 | } |
1629 | EXPORT_SYMBOL_GPL(synchronize_sched); | 1823 | EXPORT_SYMBOL_GPL(synchronize_sched); |
1630 | 1824 | ||
@@ -1639,18 +1833,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); | |||
1639 | */ | 1833 | */ |
1640 | void synchronize_rcu_bh(void) | 1834 | void synchronize_rcu_bh(void) |
1641 | { | 1835 | { |
1642 | struct rcu_synchronize rcu; | ||
1643 | |||
1644 | if (rcu_blocking_is_gp()) | 1836 | if (rcu_blocking_is_gp()) |
1645 | return; | 1837 | return; |
1646 | 1838 | wait_rcu_gp(call_rcu_bh); | |
1647 | init_rcu_head_on_stack(&rcu.head); | ||
1648 | init_completion(&rcu.completion); | ||
1649 | /* Will wake me after RCU finished. */ | ||
1650 | call_rcu_bh(&rcu.head, wakeme_after_rcu); | ||
1651 | /* Wait for it. */ | ||
1652 | wait_for_completion(&rcu.completion); | ||
1653 | destroy_rcu_head_on_stack(&rcu.head); | ||
1654 | } | 1839 | } |
1655 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | 1840 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); |
1656 | 1841 | ||
@@ -1671,7 +1856,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1671 | check_cpu_stall(rsp, rdp); | 1856 | check_cpu_stall(rsp, rdp); |
1672 | 1857 | ||
1673 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 1858 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
1674 | if (rdp->qs_pending && !rdp->passed_quiesc) { | 1859 | if (rcu_scheduler_fully_active && |
1860 | rdp->qs_pending && !rdp->passed_quiesce) { | ||
1675 | 1861 | ||
1676 | /* | 1862 | /* |
1677 | * If force_quiescent_state() coming soon and this CPU | 1863 | * If force_quiescent_state() coming soon and this CPU |
@@ -1683,7 +1869,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1683 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | 1869 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, |
1684 | jiffies)) | 1870 | jiffies)) |
1685 | set_need_resched(); | 1871 | set_need_resched(); |
1686 | } else if (rdp->qs_pending && rdp->passed_quiesc) { | 1872 | } else if (rdp->qs_pending && rdp->passed_quiesce) { |
1687 | rdp->n_rp_report_qs++; | 1873 | rdp->n_rp_report_qs++; |
1688 | return 1; | 1874 | return 1; |
1689 | } | 1875 | } |
@@ -1741,7 +1927,7 @@ static int rcu_pending(int cpu) | |||
1741 | * by the current CPU, even if none need be done immediately, returning | 1927 | * by the current CPU, even if none need be done immediately, returning |
1742 | * 1 if so. | 1928 | * 1 if so. |
1743 | */ | 1929 | */ |
1744 | static int rcu_needs_cpu_quick_check(int cpu) | 1930 | static int rcu_cpu_has_callbacks(int cpu) |
1745 | { | 1931 | { |
1746 | /* RCU callbacks either ready or pending? */ | 1932 | /* RCU callbacks either ready or pending? */ |
1747 | return per_cpu(rcu_sched_data, cpu).nxtlist || | 1933 | return per_cpu(rcu_sched_data, cpu).nxtlist || |
@@ -1842,10 +2028,11 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1842 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 2028 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1843 | rdp->nxttail[i] = &rdp->nxtlist; | 2029 | rdp->nxttail[i] = &rdp->nxtlist; |
1844 | rdp->qlen = 0; | 2030 | rdp->qlen = 0; |
1845 | #ifdef CONFIG_NO_HZ | ||
1846 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2031 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
1847 | #endif /* #ifdef CONFIG_NO_HZ */ | 2032 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); |
2033 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | ||
1848 | rdp->cpu = cpu; | 2034 | rdp->cpu = cpu; |
2035 | rdp->rsp = rsp; | ||
1849 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2036 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1850 | } | 2037 | } |
1851 | 2038 | ||
@@ -1865,13 +2052,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
1865 | 2052 | ||
1866 | /* Set up local state, ensuring consistent view of global state. */ | 2053 | /* Set up local state, ensuring consistent view of global state. */ |
1867 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2054 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1868 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | ||
1869 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | ||
1870 | rdp->beenonline = 1; /* We have now been online. */ | 2055 | rdp->beenonline = 1; /* We have now been online. */ |
1871 | rdp->preemptible = preemptible; | 2056 | rdp->preemptible = preemptible; |
1872 | rdp->qlen_last_fqs_check = 0; | 2057 | rdp->qlen_last_fqs_check = 0; |
1873 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2058 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1874 | rdp->blimit = blimit; | 2059 | rdp->blimit = blimit; |
2060 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; | ||
2061 | atomic_set(&rdp->dynticks->dynticks, | ||
2062 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | ||
2063 | rcu_prepare_for_idle_init(cpu); | ||
1875 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2064 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
1876 | 2065 | ||
1877 | /* | 2066 | /* |
@@ -1891,9 +2080,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
1891 | rnp->qsmaskinit |= mask; | 2080 | rnp->qsmaskinit |= mask; |
1892 | mask = rnp->grpmask; | 2081 | mask = rnp->grpmask; |
1893 | if (rnp == rdp->mynode) { | 2082 | if (rnp == rdp->mynode) { |
1894 | rdp->gpnum = rnp->completed; /* if GP in progress... */ | 2083 | /* |
2084 | * If there is a grace period in progress, we will | ||
2085 | * set up to wait for it next time we run the | ||
2086 | * RCU core code. | ||
2087 | */ | ||
2088 | rdp->gpnum = rnp->completed; | ||
1895 | rdp->completed = rnp->completed; | 2089 | rdp->completed = rnp->completed; |
1896 | rdp->passed_quiesc_completed = rnp->completed - 1; | 2090 | rdp->passed_quiesce = 0; |
2091 | rdp->qs_pending = 0; | ||
2092 | rdp->passed_quiesce_gpnum = rnp->gpnum - 1; | ||
2093 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | ||
1897 | } | 2094 | } |
1898 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 2095 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
1899 | rnp = rnp->parent; | 2096 | rnp = rnp->parent; |
@@ -1919,6 +2116,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1919 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 2116 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
1920 | struct rcu_node *rnp = rdp->mynode; | 2117 | struct rcu_node *rnp = rdp->mynode; |
1921 | 2118 | ||
2119 | trace_rcu_utilization("Start CPU hotplug"); | ||
1922 | switch (action) { | 2120 | switch (action) { |
1923 | case CPU_UP_PREPARE: | 2121 | case CPU_UP_PREPARE: |
1924 | case CPU_UP_PREPARE_FROZEN: | 2122 | case CPU_UP_PREPARE_FROZEN: |
@@ -1944,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1944 | rcu_send_cbs_to_online(&rcu_bh_state); | 2142 | rcu_send_cbs_to_online(&rcu_bh_state); |
1945 | rcu_send_cbs_to_online(&rcu_sched_state); | 2143 | rcu_send_cbs_to_online(&rcu_sched_state); |
1946 | rcu_preempt_send_cbs_to_online(); | 2144 | rcu_preempt_send_cbs_to_online(); |
2145 | rcu_cleanup_after_idle(cpu); | ||
1947 | break; | 2146 | break; |
1948 | case CPU_DEAD: | 2147 | case CPU_DEAD: |
1949 | case CPU_DEAD_FROZEN: | 2148 | case CPU_DEAD_FROZEN: |
@@ -1954,6 +2153,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1954 | default: | 2153 | default: |
1955 | break; | 2154 | break; |
1956 | } | 2155 | } |
2156 | trace_rcu_utilization("End CPU hotplug"); | ||
1957 | return NOTIFY_OK; | 2157 | return NOTIFY_OK; |
1958 | } | 2158 | } |
1959 | 2159 | ||
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 01b2ccda26fb..fddff92d6676 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -84,9 +84,10 @@ | |||
84 | * Dynticks per-CPU state. | 84 | * Dynticks per-CPU state. |
85 | */ | 85 | */ |
86 | struct rcu_dynticks { | 86 | struct rcu_dynticks { |
87 | int dynticks_nesting; /* Track irq/process nesting level. */ | 87 | long long dynticks_nesting; /* Track irq/process nesting level. */ |
88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 88 | /* Process level is worth LLONG_MAX/2. */ |
89 | atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ | 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
90 | atomic_t dynticks; /* Even value for idle, else odd. */ | ||
90 | }; | 91 | }; |
91 | 92 | ||
92 | /* RCU's kthread states for tracing. */ | 93 | /* RCU's kthread states for tracing. */ |
@@ -230,9 +231,9 @@ struct rcu_data { | |||
230 | /* in order to detect GP end. */ | 231 | /* in order to detect GP end. */ |
231 | unsigned long gpnum; /* Highest gp number that this CPU */ | 232 | unsigned long gpnum; /* Highest gp number that this CPU */ |
232 | /* is aware of having started. */ | 233 | /* is aware of having started. */ |
233 | unsigned long passed_quiesc_completed; | 234 | unsigned long passed_quiesce_gpnum; |
234 | /* Value of completed at time of qs. */ | 235 | /* gpnum at time of quiescent state. */ |
235 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 236 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
236 | bool qs_pending; /* Core waits for quiesc state. */ | 237 | bool qs_pending; /* Core waits for quiesc state. */ |
237 | bool beenonline; /* CPU online at least once. */ | 238 | bool beenonline; /* CPU online at least once. */ |
238 | bool preemptible; /* Preemptible RCU? */ | 239 | bool preemptible; /* Preemptible RCU? */ |
@@ -274,16 +275,12 @@ struct rcu_data { | |||
274 | /* did other CPU force QS recently? */ | 275 | /* did other CPU force QS recently? */ |
275 | long blimit; /* Upper limit on a processed batch */ | 276 | long blimit; /* Upper limit on a processed batch */ |
276 | 277 | ||
277 | #ifdef CONFIG_NO_HZ | ||
278 | /* 3) dynticks interface. */ | 278 | /* 3) dynticks interface. */ |
279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ | 279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ |
280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ | 280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ |
281 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
282 | 281 | ||
283 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 282 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
284 | #ifdef CONFIG_NO_HZ | ||
285 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ | 283 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ |
286 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
287 | unsigned long offline_fqs; /* Kicked due to being offline. */ | 284 | unsigned long offline_fqs; /* Kicked due to being offline. */ |
288 | unsigned long resched_ipi; /* Sent a resched IPI. */ | 285 | unsigned long resched_ipi; /* Sent a resched IPI. */ |
289 | 286 | ||
@@ -299,18 +296,15 @@ struct rcu_data { | |||
299 | unsigned long n_rp_need_nothing; | 296 | unsigned long n_rp_need_nothing; |
300 | 297 | ||
301 | int cpu; | 298 | int cpu; |
299 | struct rcu_state *rsp; | ||
302 | }; | 300 | }; |
303 | 301 | ||
304 | /* Values for signaled field in struct rcu_state. */ | 302 | /* Values for fqs_state field in struct rcu_state. */ |
305 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | 303 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ |
306 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | 304 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ |
307 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | 305 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ |
308 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | 306 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ |
309 | #ifdef CONFIG_NO_HZ | ||
310 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | 307 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK |
311 | #else /* #ifdef CONFIG_NO_HZ */ | ||
312 | #define RCU_SIGNAL_INIT RCU_FORCE_QS | ||
313 | #endif /* #else #ifdef CONFIG_NO_HZ */ | ||
314 | 308 | ||
315 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 309 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
316 | 310 | ||
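The field holding these values is renamed from 'signaled' to 'fqs_state' in the surrounding hunks, which makes its role as a small state machine more obvious. Based on the force_quiescent_state() cases visible in this patch plus the usual handling of RCU_FORCE_QS, the progression is roughly the sketch below; it is a summary for orientation, not the kernel function.

/* Sketch of the fqs_state progression driven by force_quiescent_state(). */
enum fqs_state_sketch { GP_IDLE, GP_INIT, SAVE_DYNTICK, FORCE_QS };

static enum fqs_state_sketch fqs_step(enum fqs_state_sketch state)
{
	switch (state) {
	case GP_IDLE:
	case GP_INIT:
		return state;         /* no GP, or GP still initializing: hold off */
	case SAVE_DYNTICK:
		/* snapshot each CPU's dynticks counter, then recheck later */
		return FORCE_QS;
	case FORCE_QS:
		/* compare counters against the snapshots; nag holdout CPUs */
		return FORCE_QS;
	}
	return state;
}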
@@ -360,7 +354,7 @@ struct rcu_state { | |||
360 | 354 | ||
361 | /* The following fields are guarded by the root rcu_node's lock. */ | 355 | /* The following fields are guarded by the root rcu_node's lock. */ |
362 | 356 | ||
363 | u8 signaled ____cacheline_internodealigned_in_smp; | 357 | u8 fqs_state ____cacheline_internodealigned_in_smp; |
364 | /* Force QS state. */ | 358 | /* Force QS state. */ |
365 | u8 fqs_active; /* force_quiescent_state() */ | 359 | u8 fqs_active; /* force_quiescent_state() */ |
366 | /* is running. */ | 360 | /* is running. */ |
@@ -417,6 +411,13 @@ extern struct rcu_state rcu_preempt_state; | |||
417 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | 411 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); |
418 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 412 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
419 | 413 | ||
414 | #ifdef CONFIG_RCU_BOOST | ||
415 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
416 | DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
417 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
418 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
419 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
420 | |||
420 | #ifndef RCU_TREE_NONCORE | 421 | #ifndef RCU_TREE_NONCORE |
421 | 422 | ||
422 | /* Forward declarations for rcutree_plugin.h */ | 423 | /* Forward declarations for rcutree_plugin.h */ |
@@ -430,7 +431,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | |||
430 | static void rcu_stop_cpu_kthread(int cpu); | 431 | static void rcu_stop_cpu_kthread(int cpu); |
431 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 432 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
432 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 433 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
433 | static void rcu_print_task_stall(struct rcu_node *rnp); | 434 | static int rcu_print_task_stall(struct rcu_node *rnp); |
434 | static void rcu_preempt_stall_reset(void); | 435 | static void rcu_preempt_stall_reset(void); |
435 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 436 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
436 | #ifdef CONFIG_HOTPLUG_CPU | 437 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -443,17 +444,18 @@ static void rcu_preempt_check_callbacks(int cpu); | |||
443 | static void rcu_preempt_process_callbacks(void); | 444 | static void rcu_preempt_process_callbacks(void); |
444 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | 445 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); |
445 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | 446 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) |
446 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); | 447 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
448 | bool wake); | ||
447 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | 449 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ |
448 | static int rcu_preempt_pending(int cpu); | 450 | static int rcu_preempt_pending(int cpu); |
449 | static int rcu_preempt_needs_cpu(int cpu); | 451 | static int rcu_preempt_needs_cpu(int cpu); |
450 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | 452 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
451 | static void rcu_preempt_send_cbs_to_online(void); | 453 | static void rcu_preempt_send_cbs_to_online(void); |
452 | static void __init __rcu_init_preempt(void); | 454 | static void __init __rcu_init_preempt(void); |
453 | static void rcu_needs_cpu_flush(void); | ||
454 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 455 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
455 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 456 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
456 | static void invoke_rcu_callbacks_kthread(void); | 457 | static void invoke_rcu_callbacks_kthread(void); |
458 | static bool rcu_is_callbacks_kthread(void); | ||
457 | #ifdef CONFIG_RCU_BOOST | 459 | #ifdef CONFIG_RCU_BOOST |
458 | static void rcu_preempt_do_callbacks(void); | 460 | static void rcu_preempt_do_callbacks(void); |
459 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | 461 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, |
@@ -466,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | |||
466 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 468 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
467 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | 469 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); |
468 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 470 | static void __cpuinit rcu_prepare_kthreads(int cpu); |
471 | static void rcu_prepare_for_idle_init(int cpu); | ||
472 | static void rcu_cleanup_after_idle(int cpu); | ||
473 | static void rcu_prepare_for_idle(int cpu); | ||
469 | 474 | ||
470 | #endif /* #ifndef RCU_TREE_NONCORE */ | 475 | #endif /* #ifndef RCU_TREE_NONCORE */ |
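The CONFIG_RCU_BOOST block above moves the per-CPU kthread declarations into rcutree.h so that rcutree_trace.c no longer has to repeat them (the matching removal appears in the rcutree_trace.c hunk below). As a rough user-space sketch of the same declare-once/define-once split, with made-up names standing in for DECLARE_PER_CPU/DEFINE_PER_CPU:

/* illustrative only; NR_CPUS and kthread_status are invented names */
#include <stdio.h>

#define NR_CPUS 4

/* what the shared header would carry: a declaration only */
extern unsigned int kthread_status[NR_CPUS];

/* what exactly one .c file would carry: the single definition */
unsigned int kthread_status[NR_CPUS];

int main(void)
{
        kthread_status[0] = 1;          /* any user of the header may touch it */
        printf("cpu0 status = %u\n", kthread_status[0]);
        return 0;
}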
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8aafbb80b8b0..8bb35d73e1f9 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -27,6 +27,14 @@ | |||
27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
28 | #include <linux/stop_machine.h> | 28 | #include <linux/stop_machine.h> |
29 | 29 | ||
30 | #define RCU_KTHREAD_PRIO 1 | ||
31 | |||
32 | #ifdef CONFIG_RCU_BOOST | ||
33 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
34 | #else | ||
35 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | ||
36 | #endif | ||
37 | |||
30 | /* | 38 | /* |
31 | * Check the RCU kernel configuration parameters and print informative | 39 | * Check the RCU kernel configuration parameters and print informative |
32 | * messages about anything out of the ordinary. If you like #ifdef, you | 40 | * messages about anything out of the ordinary. If you like #ifdef, you |
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
64 | 72 | ||
65 | #ifdef CONFIG_TREE_PREEMPT_RCU | 73 | #ifdef CONFIG_TREE_PREEMPT_RCU |
66 | 74 | ||
67 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); | 75 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); |
68 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 76 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
69 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 77 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
70 | 78 | ||
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu) | |||
122 | { | 130 | { |
123 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 131 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
124 | 132 | ||
125 | rdp->passed_quiesc_completed = rdp->gpnum - 1; | 133 | rdp->passed_quiesce_gpnum = rdp->gpnum; |
126 | barrier(); | 134 | barrier(); |
127 | rdp->passed_quiesc = 1; | 135 | if (rdp->passed_quiesce == 0) |
136 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | ||
137 | rdp->passed_quiesce = 1; | ||
128 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | 138 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
129 | } | 139 | } |
130 | 140 | ||
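The rcu_preempt_qs() change above records the grace-period number first, uses barrier() to keep that store ahead of the passed_quiesce flag, and emits the "cpuqs" trace only on the first quiescent state of a grace period. A compiler-barrier-only, user-space sketch of that ordering (printf stands in for the tracepoint; names are simplified):

#include <stdio.h>

static unsigned long gpnum = 42;          /* pretend current grace period */
static unsigned long passed_quiesce_gpnum;
static int passed_quiesce;

#define barrier() asm volatile("" ::: "memory")   /* compiler barrier only */

static void record_quiescent_state(void)
{
        passed_quiesce_gpnum = gpnum;     /* snapshot which GP this QS is for */
        barrier();                        /* keep the snapshot before the flag */
        if (passed_quiesce == 0)          /* trace only the 0 -> 1 transition */
                printf("trace: first quiescent state for gp %lu\n", gpnum);
        passed_quiesce = 1;
}

int main(void)
{
        record_quiescent_state();
        record_quiescent_state();         /* second call: no duplicate trace */
        return 0;
}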
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
190 | if (rnp->qsmask & rdp->grpmask) | 200 | if (rnp->qsmask & rdp->grpmask) |
191 | rnp->gp_tasks = &t->rcu_node_entry; | 201 | rnp->gp_tasks = &t->rcu_node_entry; |
192 | } | 202 | } |
203 | trace_rcu_preempt_task(rdp->rsp->name, | ||
204 | t->pid, | ||
205 | (rnp->qsmask & rdp->grpmask) | ||
206 | ? rnp->gpnum | ||
207 | : rnp->gpnum + 1); | ||
193 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 208 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
194 | } else if (t->rcu_read_lock_nesting < 0 && | 209 | } else if (t->rcu_read_lock_nesting < 0 && |
195 | t->rcu_read_unlock_special) { | 210 | t->rcu_read_unlock_special) { |
@@ -297,8 +312,12 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
297 | { | 312 | { |
298 | int empty; | 313 | int empty; |
299 | int empty_exp; | 314 | int empty_exp; |
315 | int empty_exp_now; | ||
300 | unsigned long flags; | 316 | unsigned long flags; |
301 | struct list_head *np; | 317 | struct list_head *np; |
318 | #ifdef CONFIG_RCU_BOOST | ||
319 | struct rt_mutex *rbmp = NULL; | ||
320 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
302 | struct rcu_node *rnp; | 321 | struct rcu_node *rnp; |
303 | int special; | 322 | int special; |
304 | 323 | ||
@@ -344,6 +363,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
344 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 363 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
345 | np = rcu_next_node_entry(t, rnp); | 364 | np = rcu_next_node_entry(t, rnp); |
346 | list_del_init(&t->rcu_node_entry); | 365 | list_del_init(&t->rcu_node_entry); |
366 | t->rcu_blocked_node = NULL; | ||
367 | trace_rcu_unlock_preempted_task("rcu_preempt", | ||
368 | rnp->gpnum, t->pid); | ||
347 | if (&t->rcu_node_entry == rnp->gp_tasks) | 369 | if (&t->rcu_node_entry == rnp->gp_tasks) |
348 | rnp->gp_tasks = np; | 370 | rnp->gp_tasks = np; |
349 | if (&t->rcu_node_entry == rnp->exp_tasks) | 371 | if (&t->rcu_node_entry == rnp->exp_tasks) |
@@ -351,38 +373,44 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) | |||
351 | #ifdef CONFIG_RCU_BOOST | 373 | #ifdef CONFIG_RCU_BOOST |
352 | if (&t->rcu_node_entry == rnp->boost_tasks) | 374 | if (&t->rcu_node_entry == rnp->boost_tasks) |
353 | rnp->boost_tasks = np; | 375 | rnp->boost_tasks = np; |
354 | /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ | 376 | /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ |
355 | if (t->rcu_boosted) { | 377 | if (t->rcu_boost_mutex) { |
356 | special |= RCU_READ_UNLOCK_BOOSTED; | 378 | rbmp = t->rcu_boost_mutex; |
357 | t->rcu_boosted = 0; | 379 | t->rcu_boost_mutex = NULL; |
358 | } | 380 | } |
359 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 381 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
360 | t->rcu_blocked_node = NULL; | ||
361 | 382 | ||
362 | /* | 383 | /* |
363 | * If this was the last task on the current list, and if | 384 | * If this was the last task on the current list, and if |
364 | * we aren't waiting on any CPUs, report the quiescent state. | 385 | * we aren't waiting on any CPUs, report the quiescent state. |
365 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. | 386 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, |
387 | * so we must take a snapshot of the expedited state. | ||
366 | */ | 388 | */ |
367 | if (empty) | 389 | empty_exp_now = !rcu_preempted_readers_exp(rnp); |
368 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 390 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { |
369 | else | 391 | trace_rcu_quiescent_state_report("preempt_rcu", |
392 | rnp->gpnum, | ||
393 | 0, rnp->qsmask, | ||
394 | rnp->level, | ||
395 | rnp->grplo, | ||
396 | rnp->grphi, | ||
397 | !!rnp->gp_tasks); | ||
370 | rcu_report_unblock_qs_rnp(rnp, flags); | 398 | rcu_report_unblock_qs_rnp(rnp, flags); |
399 | } else | ||
400 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
371 | 401 | ||
372 | #ifdef CONFIG_RCU_BOOST | 402 | #ifdef CONFIG_RCU_BOOST |
373 | /* Unboost if we were boosted. */ | 403 | /* Unboost if we were boosted. */ |
374 | if (special & RCU_READ_UNLOCK_BOOSTED) { | 404 | if (rbmp) |
375 | rt_mutex_unlock(t->rcu_boost_mutex); | 405 | rt_mutex_unlock(rbmp); |
376 | t->rcu_boost_mutex = NULL; | ||
377 | } | ||
378 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 406 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
379 | 407 | ||
380 | /* | 408 | /* |
381 | * If this was the last task on the expedited lists, | 409 | * If this was the last task on the expedited lists, |
382 | * then we need to report up the rcu_node hierarchy. | 410 | * then we need to report up the rcu_node hierarchy. |
383 | */ | 411 | */ |
384 | if (!empty_exp && !rcu_preempted_readers_exp(rnp)) | 412 | if (!empty_exp && empty_exp_now) |
385 | rcu_report_exp_rnp(&rcu_preempt_state, rnp); | 413 | rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); |
386 | } else { | 414 | } else { |
387 | local_irq_restore(flags); | 415 | local_irq_restore(flags); |
388 | } | 416 | } |
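The rcu_read_unlock_special() rework above replaces the rcu_boosted flag with a snapshot of t->rcu_boost_mutex taken while the rcu_node lock is held; the rt_mutex itself is released only after rnp->lock has been dropped. A minimal pthread sketch of that snapshot-under-lock pattern (the locks and fields here are invented stand-ins, not the kernel objects):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t boost_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t *boost_mutex_p = &boost_mutex;  /* set while "boosted" */

static void unlock_special(void)
{
        pthread_mutex_t *snap = NULL;

        pthread_mutex_lock(&node_lock);
        if (boost_mutex_p) {              /* snapshot and clear under the lock */
                snap = boost_mutex_p;
                boost_mutex_p = NULL;
        }
        pthread_mutex_unlock(&node_lock);

        if (snap)                         /* act on the snapshot, lock released */
                pthread_mutex_unlock(snap);
}

int main(void)
{
        pthread_mutex_lock(&boost_mutex); /* pretend we were boosted earlier */
        unlock_special();
        printf("unboosted outside the node lock\n");
        return 0;
}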
@@ -399,10 +427,10 @@ void __rcu_read_unlock(void) | |||
399 | { | 427 | { |
400 | struct task_struct *t = current; | 428 | struct task_struct *t = current; |
401 | 429 | ||
402 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ | ||
403 | if (t->rcu_read_lock_nesting != 1) | 430 | if (t->rcu_read_lock_nesting != 1) |
404 | --t->rcu_read_lock_nesting; | 431 | --t->rcu_read_lock_nesting; |
405 | else { | 432 | else { |
433 | barrier(); /* critical section before exit code. */ | ||
406 | t->rcu_read_lock_nesting = INT_MIN; | 434 | t->rcu_read_lock_nesting = INT_MIN; |
407 | barrier(); /* assign before ->rcu_read_unlock_special load */ | 435 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
408 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 436 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
@@ -466,16 +494,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
466 | * Scan the current list of tasks blocked within RCU read-side critical | 494 | * Scan the current list of tasks blocked within RCU read-side critical |
467 | * sections, printing out the tid of each. | 495 | * sections, printing out the tid of each. |
468 | */ | 496 | */ |
469 | static void rcu_print_task_stall(struct rcu_node *rnp) | 497 | static int rcu_print_task_stall(struct rcu_node *rnp) |
470 | { | 498 | { |
471 | struct task_struct *t; | 499 | struct task_struct *t; |
500 | int ndetected = 0; | ||
472 | 501 | ||
473 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | 502 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
474 | return; | 503 | return 0; |
475 | t = list_entry(rnp->gp_tasks, | 504 | t = list_entry(rnp->gp_tasks, |
476 | struct task_struct, rcu_node_entry); | 505 | struct task_struct, rcu_node_entry); |
477 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | 506 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { |
478 | printk(" P%d", t->pid); | 507 | printk(" P%d", t->pid); |
508 | ndetected++; | ||
509 | } | ||
510 | return ndetected; | ||
479 | } | 511 | } |
480 | 512 | ||
481 | /* | 513 | /* |
@@ -656,18 +688,9 @@ EXPORT_SYMBOL_GPL(call_rcu); | |||
656 | */ | 688 | */ |
657 | void synchronize_rcu(void) | 689 | void synchronize_rcu(void) |
658 | { | 690 | { |
659 | struct rcu_synchronize rcu; | ||
660 | |||
661 | if (!rcu_scheduler_active) | 691 | if (!rcu_scheduler_active) |
662 | return; | 692 | return; |
663 | 693 | wait_rcu_gp(call_rcu); | |
664 | init_rcu_head_on_stack(&rcu.head); | ||
665 | init_completion(&rcu.completion); | ||
666 | /* Will wake me after RCU finished. */ | ||
667 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
668 | /* Wait for it. */ | ||
669 | wait_for_completion(&rcu.completion); | ||
670 | destroy_rcu_head_on_stack(&rcu.head); | ||
671 | } | 694 | } |
672 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 695 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
673 | 696 | ||
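synchronize_rcu() above now delegates the open-coded callback-plus-completion dance to wait_rcu_gp(call_rcu), which takes the flavor's callback-registration function as a parameter. A user-space analogue of that shape, assuming a fake registration hook that fires the callback immediately instead of after a grace period:

#include <pthread.h>
#include <stdio.h>

struct waiter {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int             done;
};

static void wake_waiter(void *arg)        /* plays the role of wakeme_after_rcu */
{
        struct waiter *w = arg;

        pthread_mutex_lock(&w->lock);
        w->done = 1;
        pthread_cond_signal(&w->cond);
        pthread_mutex_unlock(&w->lock);
}

/* stand-in for call_rcu(): here the "grace period" is instantaneous */
static void fake_call_rcu(void (*func)(void *), void *arg)
{
        func(arg);
}

/* generic waiter, parameterized by the flavor's registration function */
static void wait_gp(void (*crf)(void (*)(void *), void *))
{
        struct waiter w = {
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .cond = PTHREAD_COND_INITIALIZER,
                .done = 0,
        };

        crf(wake_waiter, &w);
        pthread_mutex_lock(&w.lock);
        while (!w.done)
                pthread_cond_wait(&w.cond, &w.lock);
        pthread_mutex_unlock(&w.lock);
}

int main(void)
{
        wait_gp(fake_call_rcu);
        printf("grace period done\n");
        return 0;
}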
@@ -709,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | |||
709 | * recursively up the tree. (Calm down, calm down, we do the recursion | 732 | * recursively up the tree. (Calm down, calm down, we do the recursion |
710 | * iteratively!) | 733 | * iteratively!) |
711 | * | 734 | * |
735 | * Most callers will set the "wake" flag, but the task initiating the | ||
736 | * expedited grace period need not wake itself. | ||
737 | * | ||
712 | * Caller must hold sync_rcu_preempt_exp_mutex. | 738 | * Caller must hold sync_rcu_preempt_exp_mutex. |
713 | */ | 739 | */ |
714 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | 740 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
741 | bool wake) | ||
715 | { | 742 | { |
716 | unsigned long flags; | 743 | unsigned long flags; |
717 | unsigned long mask; | 744 | unsigned long mask; |
@@ -724,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
724 | } | 751 | } |
725 | if (rnp->parent == NULL) { | 752 | if (rnp->parent == NULL) { |
726 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 753 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
727 | wake_up(&sync_rcu_preempt_exp_wq); | 754 | if (wake) |
755 | wake_up(&sync_rcu_preempt_exp_wq); | ||
728 | break; | 756 | break; |
729 | } | 757 | } |
730 | mask = rnp->grpmask; | 758 | mask = rnp->grpmask; |
@@ -757,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
757 | must_wait = 1; | 785 | must_wait = 1; |
758 | } | 786 | } |
759 | if (!must_wait) | 787 | if (!must_wait) |
760 | rcu_report_exp_rnp(rsp, rnp); | 788 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ |
761 | } | 789 | } |
762 | 790 | ||
763 | /* | 791 | /* |
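The new wake argument lets sync_rcu_preempt_exp_init() report an already-quiescent subtree without waking the very task that is about to wait, while the read-side unlock path still passes true. A toy sketch of such a caller-controlled wakeup flag (printf stands in for wake_up(); nothing here is kernel code):

#include <stdbool.h>
#include <stdio.h>

static void report_done(bool wake)
{
        /* ... mark this subtree quiescent ... */
        if (wake)
                printf("wake_up(waiter)\n");          /* normal reporting path */
        else
                printf("initiator: skip self-wakeup\n");
}

int main(void)
{
        report_done(false);   /* called from the initialization path */
        report_done(true);    /* called when the last blocked reader finishes */
        return 0;
}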
@@ -968,8 +996,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
968 | * Because preemptible RCU does not exist, we never have to check for | 996 | * Because preemptible RCU does not exist, we never have to check for |
969 | * tasks blocked within RCU read-side critical sections. | 997 | * tasks blocked within RCU read-side critical sections. |
970 | */ | 998 | */ |
971 | static void rcu_print_task_stall(struct rcu_node *rnp) | 999 | static int rcu_print_task_stall(struct rcu_node *rnp) |
972 | { | 1000 | { |
1001 | return 0; | ||
973 | } | 1002 | } |
974 | 1003 | ||
975 | /* | 1004 | /* |
@@ -1048,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
1048 | * report on tasks preempted in RCU read-side critical sections during | 1077 | * report on tasks preempted in RCU read-side critical sections during |
1049 | * expedited RCU grace periods. | 1078 | * expedited RCU grace periods. |
1050 | */ | 1079 | */ |
1051 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | 1080 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, |
1081 | bool wake) | ||
1052 | { | 1082 | { |
1053 | return; | ||
1054 | } | 1083 | } |
1055 | 1084 | ||
1056 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1085 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
@@ -1199,12 +1228,12 @@ static int rcu_boost(struct rcu_node *rnp) | |||
1199 | t = container_of(tb, struct task_struct, rcu_node_entry); | 1228 | t = container_of(tb, struct task_struct, rcu_node_entry); |
1200 | rt_mutex_init_proxy_locked(&mtx, t); | 1229 | rt_mutex_init_proxy_locked(&mtx, t); |
1201 | t->rcu_boost_mutex = &mtx; | 1230 | t->rcu_boost_mutex = &mtx; |
1202 | t->rcu_boosted = 1; | ||
1203 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1231 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1204 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | 1232 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ |
1205 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 1233 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
1206 | 1234 | ||
1207 | return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; | 1235 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || |
1236 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | ||
1208 | } | 1237 | } |
1209 | 1238 | ||
1210 | /* | 1239 | /* |
@@ -1228,9 +1257,12 @@ static int rcu_boost_kthread(void *arg) | |||
1228 | int spincnt = 0; | 1257 | int spincnt = 0; |
1229 | int more2boost; | 1258 | int more2boost; |
1230 | 1259 | ||
1260 | trace_rcu_utilization("Start boost kthread@init"); | ||
1231 | for (;;) { | 1261 | for (;;) { |
1232 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | 1262 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; |
1263 | trace_rcu_utilization("End boost kthread@rcu_wait"); | ||
1233 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | 1264 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); |
1265 | trace_rcu_utilization("Start boost kthread@rcu_wait"); | ||
1234 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | 1266 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; |
1235 | more2boost = rcu_boost(rnp); | 1267 | more2boost = rcu_boost(rnp); |
1236 | if (more2boost) | 1268 | if (more2boost) |
@@ -1238,11 +1270,14 @@ static int rcu_boost_kthread(void *arg) | |||
1238 | else | 1270 | else |
1239 | spincnt = 0; | 1271 | spincnt = 0; |
1240 | if (spincnt > 10) { | 1272 | if (spincnt > 10) { |
1273 | trace_rcu_utilization("End boost kthread@rcu_yield"); | ||
1241 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | 1274 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); |
1275 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | ||
1242 | spincnt = 0; | 1276 | spincnt = 0; |
1243 | } | 1277 | } |
1244 | } | 1278 | } |
1245 | /* NOTREACHED */ | 1279 | /* NOTREACHED */ |
1280 | trace_rcu_utilization("End boost kthread@notreached"); | ||
1246 | return 0; | 1281 | return 0; |
1247 | } | 1282 | } |
1248 | 1283 | ||
@@ -1291,15 +1326,22 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1291 | 1326 | ||
1292 | local_irq_save(flags); | 1327 | local_irq_save(flags); |
1293 | __this_cpu_write(rcu_cpu_has_work, 1); | 1328 | __this_cpu_write(rcu_cpu_has_work, 1); |
1294 | if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { | 1329 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && |
1295 | local_irq_restore(flags); | 1330 | current != __this_cpu_read(rcu_cpu_kthread_task)) |
1296 | return; | 1331 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); |
1297 | } | ||
1298 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); | ||
1299 | local_irq_restore(flags); | 1332 | local_irq_restore(flags); |
1300 | } | 1333 | } |
1301 | 1334 | ||
1302 | /* | 1335 | /* |
1336 | * Is the current CPU running the RCU-callbacks kthread? | ||
1337 | * Caller must have preemption disabled. | ||
1338 | */ | ||
1339 | static bool rcu_is_callbacks_kthread(void) | ||
1340 | { | ||
1341 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | ||
1342 | } | ||
1343 | |||
1344 | /* | ||
1303 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | 1345 | * Set the affinity of the boost kthread. The CPU-hotplug locks are |
1304 | * held, so no one should be messing with the existence of the boost | 1346 | * held, so no one should be messing with the existence of the boost |
1305 | * kthread. | 1347 | * kthread. |
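invoke_rcu_callbacks_kthread() above now skips the wakeup when the caller already is this CPU's kthread, and rcu_is_callbacks_kthread() exposes that test to callers added elsewhere in this series. A pthread approximation, with pthread_self()/pthread_equal() standing in for the per-CPU task-pointer comparison:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_t cb_kthread;              /* the per-CPU callbacks thread */
static int cb_kthread_valid;

static bool is_callbacks_kthread(void)
{
        return cb_kthread_valid && pthread_equal(pthread_self(), cb_kthread);
}

static void invoke_callbacks_kthread(void)
{
        if (cb_kthread_valid && !is_callbacks_kthread())
                printf("wake_up_process(callbacks kthread)\n");
        /* if we *are* the kthread, it is already running: nothing to do */
}

int main(void)
{
        cb_kthread = pthread_self();      /* pretend this thread is the kthread */
        cb_kthread_valid = 1;
        invoke_callbacks_kthread();       /* prints nothing: no self-wakeup */
        printf("am I the callbacks kthread? %d\n", is_callbacks_kthread());
        return 0;
}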
@@ -1343,13 +1385,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
1343 | if (rnp->boost_kthread_task != NULL) | 1385 | if (rnp->boost_kthread_task != NULL) |
1344 | return 0; | 1386 | return 0; |
1345 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | 1387 | t = kthread_create(rcu_boost_kthread, (void *)rnp, |
1346 | "rcub%d", rnp_index); | 1388 | "rcub/%d", rnp_index); |
1347 | if (IS_ERR(t)) | 1389 | if (IS_ERR(t)) |
1348 | return PTR_ERR(t); | 1390 | return PTR_ERR(t); |
1349 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1391 | raw_spin_lock_irqsave(&rnp->lock, flags); |
1350 | rnp->boost_kthread_task = t; | 1392 | rnp->boost_kthread_task = t; |
1351 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1393 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1352 | sp.sched_priority = RCU_KTHREAD_PRIO; | 1394 | sp.sched_priority = RCU_BOOST_PRIO; |
1353 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | 1395 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); |
1354 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | 1396 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ |
1355 | return 0; | 1397 | return 0; |
@@ -1444,6 +1486,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | |||
1444 | { | 1486 | { |
1445 | struct sched_param sp; | 1487 | struct sched_param sp; |
1446 | struct timer_list yield_timer; | 1488 | struct timer_list yield_timer; |
1489 | int prio = current->rt_priority; | ||
1447 | 1490 | ||
1448 | setup_timer_on_stack(&yield_timer, f, arg); | 1491 | setup_timer_on_stack(&yield_timer, f, arg); |
1449 | mod_timer(&yield_timer, jiffies + 2); | 1492 | mod_timer(&yield_timer, jiffies + 2); |
@@ -1451,7 +1494,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | |||
1451 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | 1494 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); |
1452 | set_user_nice(current, 19); | 1495 | set_user_nice(current, 19); |
1453 | schedule(); | 1496 | schedule(); |
1454 | sp.sched_priority = RCU_KTHREAD_PRIO; | 1497 | set_user_nice(current, 0); |
1498 | sp.sched_priority = prio; | ||
1455 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | 1499 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
1456 | del_timer(&yield_timer); | 1500 | del_timer(&yield_timer); |
1457 | } | 1501 | } |
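rcu_yield() now remembers the caller's own RT priority and restores it afterwards, instead of assuming the fixed RCU_KTHREAD_PRIO (boost kthreads may be running at RCU_BOOST_PRIO). A rough user-space sketch of that save/drop/restore sequence; it needs CAP_SYS_NICE to have any real effect and uses only standard scheduler calls:

#include <sched.h>
#include <stdio.h>
#include <sys/resource.h>

static void yield_politely(void)
{
        struct sched_param sp;
        int policy = sched_getscheduler(0);
        int prio;

        sched_getparam(0, &sp);
        prio = sp.sched_priority;               /* save the caller's priority */

        sp.sched_priority = 0;
        sched_setscheduler(0, SCHED_OTHER, &sp);
        setpriority(PRIO_PROCESS, 0, 19);       /* be maximally nice */
        sched_yield();

        setpriority(PRIO_PROCESS, 0, 0);
        sp.sched_priority = prio;               /* restore the saved value,   */
        sched_setscheduler(0, policy, &sp);     /* not a hard-coded constant  */
}

int main(void)
{
        yield_politely();
        puts("yielded and restored priority");
        return 0;
}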
@@ -1489,7 +1533,8 @@ static int rcu_cpu_kthread_should_stop(int cpu) | |||
1489 | 1533 | ||
1490 | /* | 1534 | /* |
1491 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | 1535 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the |
1492 | * earlier RCU softirq. | 1536 | * RCU softirq used in flavors and configurations of RCU that do not |
1537 | * support RCU priority boosting. | ||
1493 | */ | 1538 | */ |
1494 | static int rcu_cpu_kthread(void *arg) | 1539 | static int rcu_cpu_kthread(void *arg) |
1495 | { | 1540 | { |
@@ -1500,9 +1545,12 @@ static int rcu_cpu_kthread(void *arg) | |||
1500 | char work; | 1545 | char work; |
1501 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | 1546 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); |
1502 | 1547 | ||
1548 | trace_rcu_utilization("Start CPU kthread@init"); | ||
1503 | for (;;) { | 1549 | for (;;) { |
1504 | *statusp = RCU_KTHREAD_WAITING; | 1550 | *statusp = RCU_KTHREAD_WAITING; |
1551 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | ||
1505 | rcu_wait(*workp != 0 || kthread_should_stop()); | 1552 | rcu_wait(*workp != 0 || kthread_should_stop()); |
1553 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | ||
1506 | local_bh_disable(); | 1554 | local_bh_disable(); |
1507 | if (rcu_cpu_kthread_should_stop(cpu)) { | 1555 | if (rcu_cpu_kthread_should_stop(cpu)) { |
1508 | local_bh_enable(); | 1556 | local_bh_enable(); |
@@ -1523,11 +1571,14 @@ static int rcu_cpu_kthread(void *arg) | |||
1523 | spincnt = 0; | 1571 | spincnt = 0; |
1524 | if (spincnt > 10) { | 1572 | if (spincnt > 10) { |
1525 | *statusp = RCU_KTHREAD_YIELDING; | 1573 | *statusp = RCU_KTHREAD_YIELDING; |
1574 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | ||
1526 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | 1575 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); |
1576 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | ||
1527 | spincnt = 0; | 1577 | spincnt = 0; |
1528 | } | 1578 | } |
1529 | } | 1579 | } |
1530 | *statusp = RCU_KTHREAD_STOPPED; | 1580 | *statusp = RCU_KTHREAD_STOPPED; |
1581 | trace_rcu_utilization("End CPU kthread@term"); | ||
1531 | return 0; | 1582 | return 0; |
1532 | } | 1583 | } |
1533 | 1584 | ||
@@ -1560,7 +1611,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | |||
1560 | if (!rcu_scheduler_fully_active || | 1611 | if (!rcu_scheduler_fully_active || |
1561 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | 1612 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) |
1562 | return 0; | 1613 | return 0; |
1563 | t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); | 1614 | t = kthread_create_on_node(rcu_cpu_kthread, |
1615 | (void *)(long)cpu, | ||
1616 | cpu_to_node(cpu), | ||
1617 | "rcuc/%d", cpu); | ||
1564 | if (IS_ERR(t)) | 1618 | if (IS_ERR(t)) |
1565 | return PTR_ERR(t); | 1619 | return PTR_ERR(t); |
1566 | if (cpu_online(cpu)) | 1620 | if (cpu_online(cpu)) |
@@ -1669,7 +1723,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | |||
1669 | return 0; | 1723 | return 0; |
1670 | if (rnp->node_kthread_task == NULL) { | 1724 | if (rnp->node_kthread_task == NULL) { |
1671 | t = kthread_create(rcu_node_kthread, (void *)rnp, | 1725 | t = kthread_create(rcu_node_kthread, (void *)rnp, |
1672 | "rcun%d", rnp_index); | 1726 | "rcun/%d", rnp_index); |
1673 | if (IS_ERR(t)) | 1727 | if (IS_ERR(t)) |
1674 | return PTR_ERR(t); | 1728 | return PTR_ERR(t); |
1675 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1729 | raw_spin_lock_irqsave(&rnp->lock, flags); |
@@ -1731,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void) | |||
1731 | WARN_ON_ONCE(1); | 1785 | WARN_ON_ONCE(1); |
1732 | } | 1786 | } |
1733 | 1787 | ||
1788 | static bool rcu_is_callbacks_kthread(void) | ||
1789 | { | ||
1790 | return false; | ||
1791 | } | ||
1792 | |||
1734 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | 1793 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) |
1735 | { | 1794 | { |
1736 | } | 1795 | } |
@@ -1866,7 +1925,7 @@ void synchronize_sched_expedited(void) | |||
1866 | * grace period works for us. | 1925 | * grace period works for us. |
1867 | */ | 1926 | */ |
1868 | get_online_cpus(); | 1927 | get_online_cpus(); |
1869 | snap = atomic_read(&sync_sched_expedited_started) - 1; | 1928 | snap = atomic_read(&sync_sched_expedited_started); |
1870 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | 1929 | smp_mb(); /* ensure read is before try_stop_cpus(). */ |
1871 | } | 1930 | } |
1872 | 1931 | ||
@@ -1898,113 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | |||
1898 | * 1 if so. This function is part of the RCU implementation; it is -not- | 1957 | * 1 if so. This function is part of the RCU implementation; it is -not- |
1899 | * an exported member of the RCU API. | 1958 | * an exported member of the RCU API. |
1900 | * | 1959 | * |
1901 | * Because we have preemptible RCU, just check whether this CPU needs | 1960 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs |
1902 | * any flavor of RCU. Do not chew up lots of CPU cycles with preemption | 1961 | * any flavor of RCU. |
1903 | * disabled in a most-likely vain attempt to cause RCU not to need this CPU. | ||
1904 | */ | 1962 | */ |
1905 | int rcu_needs_cpu(int cpu) | 1963 | int rcu_needs_cpu(int cpu) |
1906 | { | 1964 | { |
1907 | return rcu_needs_cpu_quick_check(cpu); | 1965 | return rcu_cpu_has_callbacks(cpu); |
1966 | } | ||
1967 | |||
1968 | /* | ||
1969 | * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. | ||
1970 | */ | ||
1971 | static void rcu_prepare_for_idle_init(int cpu) | ||
1972 | { | ||
1908 | } | 1973 | } |
1909 | 1974 | ||
1910 | /* | 1975 | /* |
1911 | * Check to see if we need to continue a callback-flush operations to | 1976 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up |
1912 | * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle | 1977 | * after it. |
1913 | * entry is not configured, so we never do need to. | ||
1914 | */ | 1978 | */ |
1915 | static void rcu_needs_cpu_flush(void) | 1979 | static void rcu_cleanup_after_idle(int cpu) |
1980 | { | ||
1981 | } | ||
1982 | |||
1983 | /* | ||
1984 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, | ||
1985 | * is nothing. | ||
1986 | */ | ||
1987 | static void rcu_prepare_for_idle(int cpu) | ||
1916 | { | 1988 | { |
1917 | } | 1989 | } |
1918 | 1990 | ||
1919 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1991 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
1920 | 1992 | ||
1921 | #define RCU_NEEDS_CPU_FLUSHES 5 | 1993 | /* |
1994 | * This code is invoked when a CPU goes idle, at which point we want | ||
1995 | * to have the CPU do everything required for RCU so that it can enter | ||
1996 | * the energy-efficient dyntick-idle mode. This is handled by a | ||
1997 | * state machine implemented by rcu_prepare_for_idle() below. | ||
1998 | * | ||
1999 | * The following three preprocessor symbols control this state machine: | ||
2000 | * | ||
2001 | * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt | ||
2002 | * to satisfy RCU. Beyond this point, it is better to incur a periodic | ||
2003 | * scheduling-clock interrupt than to loop through the state machine | ||
2004 | * at full power. | ||
2005 | * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are | ||
2006 | * optional if RCU does not need anything immediately from this | ||
2007 | * CPU, even if this CPU still has RCU callbacks queued. The first | ||
2008 | * few times through the state machine are mandatory: we need to give | ||
2009 | * the state machine a chance to communicate a quiescent state | ||
2010 | * to the RCU core. | ||
2011 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted | ||
2012 | * to sleep in dyntick-idle mode with RCU callbacks pending. This | ||
2013 | * is sized to be roughly one RCU grace period. Those energy-efficiency | ||
2014 | * benchmarkers who might otherwise be tempted to set this to a large | ||
2015 | * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your | ||
2016 | * system. And if you are -that- concerned about energy efficiency, | ||
2017 | * just power the system down and be done with it! | ||
2018 | * | ||
2019 | * The values below work well in practice. If future workloads require | ||
2020 | * adjustment, they can be converted into kernel config parameters, though | ||
2021 | * making the state machine smarter might be a better option. | ||
2022 | */ | ||
2023 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | ||
2024 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | ||
2025 | #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ | ||
2026 | |||
1922 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); | 2027 | static DEFINE_PER_CPU(int, rcu_dyntick_drain); |
1923 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | 2028 | static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); |
2029 | static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); | ||
2030 | static ktime_t rcu_idle_gp_wait; | ||
1924 | 2031 | ||
1925 | /* | 2032 | /* |
1926 | * Check to see if any future RCU-related work will need to be done | 2033 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no |
1927 | * by the current CPU, even if none need be done immediately, returning | 2034 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter |
1928 | * 1 if so. This function is part of the RCU implementation; it is -not- | 2035 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to |
1929 | * an exported member of the RCU API. | 2036 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed |
2037 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
2038 | * it is better to incur scheduling-clock interrupts than to spin | ||
2039 | * continuously for the same time duration! | ||
2040 | */ | ||
2041 | int rcu_needs_cpu(int cpu) | ||
2042 | { | ||
2043 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
2044 | if (!rcu_cpu_has_callbacks(cpu)) | ||
2045 | return 0; | ||
2046 | /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ | ||
2047 | return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; | ||
2048 | } | ||
2049 | |||
2050 | /* | ||
2051 | * Timer handler used to force CPU to start pushing its remaining RCU | ||
2052 | * callbacks in the case where it entered dyntick-idle mode with callbacks | ||
2053 | * pending. The handler doesn't really need to do anything because the | ||
2054 | * real work is done upon re-entry to idle, or by the next scheduling-clock | ||
2055 | * interrupt should idle not be re-entered. | ||
2056 | */ | ||
2057 | static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp) | ||
2058 | { | ||
2059 | trace_rcu_prep_idle("Timer"); | ||
2060 | return HRTIMER_NORESTART; | ||
2061 | } | ||
2062 | |||
2063 | /* | ||
2064 | * Initialize the timer used to pull CPUs out of dyntick-idle mode. | ||
2065 | */ | ||
2066 | static void rcu_prepare_for_idle_init(int cpu) | ||
2067 | { | ||
2068 | static int firsttime = 1; | ||
2069 | struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); | ||
2070 | |||
2071 | hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2072 | hrtp->function = rcu_idle_gp_timer_func; | ||
2073 | if (firsttime) { | ||
2074 | unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); | ||
2075 | |||
2076 | rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); | ||
2077 | firsttime = 0; | ||
2078 | } | ||
2079 | } | ||
2080 | |||
2081 | /* | ||
2082 | * Clean up for exit from idle. Because we are exiting from idle, there | ||
2083 | * is no longer any point to rcu_idle_gp_timer, so cancel it. This will | ||
2084 | * do nothing if this timer is not active, so just cancel it unconditionally. | ||
2085 | */ | ||
2086 | static void rcu_cleanup_after_idle(int cpu) | ||
2087 | { | ||
2088 | hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu)); | ||
2089 | } | ||
2090 | |||
2091 | /* | ||
2092 | * Check to see if any RCU-related work can be done by the current CPU, | ||
2093 | * and if so, schedule a softirq to get it done. This function is part | ||
2094 | * of the RCU implementation; it is -not- an exported member of the RCU API. | ||
1930 | * | 2095 | * |
1931 | * Because we are not supporting preemptible RCU, attempt to accelerate | 2096 | * The idea is for the current CPU to clear out all work required by the |
1932 | * any current grace periods so that RCU no longer needs this CPU, but | 2097 | * RCU core for the current grace period, so that this CPU can be permitted |
1933 | * only if all other CPUs are already in dynticks-idle mode. This will | 2098 | * to enter dyntick-idle mode. In some cases, it will need to be awakened |
1934 | * allow the CPU cores to be powered down immediately, as opposed to after | 2099 | * at the end of the grace period by whatever CPU ends the grace period. |
1935 | * waiting many milliseconds for grace periods to elapse. | 2100 | * This allows CPUs to go dyntick-idle more quickly, and to reduce the |
2101 | * number of wakeups by a modest integer factor. | ||
1936 | * | 2102 | * |
1937 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 2103 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
1938 | * disabled, we do one pass of force_quiescent_state(), then do a | 2104 | * disabled, we do one pass of force_quiescent_state(), then do a |
1939 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | 2105 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
1940 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. | 2106 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. |
2107 | * | ||
2108 | * The caller must have disabled interrupts. | ||
1941 | */ | 2109 | */ |
1942 | int rcu_needs_cpu(int cpu) | 2110 | static void rcu_prepare_for_idle(int cpu) |
1943 | { | 2111 | { |
1944 | int c = 0; | 2112 | unsigned long flags; |
1945 | int snap; | 2113 | |
1946 | int thatcpu; | 2114 | local_irq_save(flags); |
1947 | 2115 | ||
1948 | /* Check for being in the holdoff period. */ | 2116 | /* |
1949 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) | 2117 | * If there are no callbacks on this CPU, enter dyntick-idle mode. |
1950 | return rcu_needs_cpu_quick_check(cpu); | 2118 | * Also reset state to avoid prejudicing later attempts. |
1951 | 2119 | */ | |
1952 | /* Don't bother unless we are the last non-dyntick-idle CPU. */ | 2120 | if (!rcu_cpu_has_callbacks(cpu)) { |
1953 | for_each_online_cpu(thatcpu) { | 2121 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
1954 | if (thatcpu == cpu) | 2122 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
1955 | continue; | 2123 | local_irq_restore(flags); |
1956 | snap = atomic_add_return(0, &per_cpu(rcu_dynticks, | 2124 | trace_rcu_prep_idle("No callbacks"); |
1957 | thatcpu).dynticks); | 2125 | return; |
1958 | smp_mb(); /* Order sampling of snap with end of grace period. */ | 2126 | } |
1959 | if ((snap & 0x1) != 0) { | 2127 | |
1960 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 2128 | /* |
1961 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 2129 | * If in holdoff mode, just return. We will presumably have |
1962 | return rcu_needs_cpu_quick_check(cpu); | 2130 | * refrained from disabling the scheduling-clock tick. |
1963 | } | 2131 | */ |
2132 | if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { | ||
2133 | local_irq_restore(flags); | ||
2134 | trace_rcu_prep_idle("In holdoff"); | ||
2135 | return; | ||
1964 | } | 2136 | } |
1965 | 2137 | ||
1966 | /* Check and update the rcu_dyntick_drain sequencing. */ | 2138 | /* Check and update the rcu_dyntick_drain sequencing. */ |
1967 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2139 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
1968 | /* First time through, initialize the counter. */ | 2140 | /* First time through, initialize the counter. */ |
1969 | per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; | 2141 | per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; |
2142 | } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && | ||
2143 | !rcu_pending(cpu)) { | ||
2144 | /* Can we go dyntick-idle despite still having callbacks? */ | ||
2145 | trace_rcu_prep_idle("Dyntick with callbacks"); | ||
2146 | per_cpu(rcu_dyntick_drain, cpu) = 0; | ||
2147 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | ||
2148 | hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), | ||
2149 | rcu_idle_gp_wait, HRTIMER_MODE_REL); | ||
2150 | return; /* Nothing more to do immediately. */ | ||
1970 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { | 2151 | } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { |
1971 | /* We have hit the limit, so time to give up. */ | 2152 | /* We have hit the limit, so time to give up. */ |
1972 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; | 2153 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; |
1973 | return rcu_needs_cpu_quick_check(cpu); | 2154 | local_irq_restore(flags); |
2155 | trace_rcu_prep_idle("Begin holdoff"); | ||
2156 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | ||
2157 | return; | ||
1974 | } | 2158 | } |
1975 | 2159 | ||
1976 | /* Do one step pushing remaining RCU callbacks through. */ | 2160 | /* |
2161 | * Do one step of pushing the remaining RCU callbacks through | ||
2162 | * the RCU core state machine. | ||
2163 | */ | ||
2164 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
2165 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | ||
2166 | local_irq_restore(flags); | ||
2167 | rcu_preempt_qs(cpu); | ||
2168 | force_quiescent_state(&rcu_preempt_state, 0); | ||
2169 | local_irq_save(flags); | ||
2170 | } | ||
2171 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
1977 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | 2172 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { |
2173 | local_irq_restore(flags); | ||
1978 | rcu_sched_qs(cpu); | 2174 | rcu_sched_qs(cpu); |
1979 | force_quiescent_state(&rcu_sched_state, 0); | 2175 | force_quiescent_state(&rcu_sched_state, 0); |
1980 | c = c || per_cpu(rcu_sched_data, cpu).nxtlist; | 2176 | local_irq_save(flags); |
1981 | } | 2177 | } |
1982 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | 2178 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { |
2179 | local_irq_restore(flags); | ||
1983 | rcu_bh_qs(cpu); | 2180 | rcu_bh_qs(cpu); |
1984 | force_quiescent_state(&rcu_bh_state, 0); | 2181 | force_quiescent_state(&rcu_bh_state, 0); |
1985 | c = c || per_cpu(rcu_bh_data, cpu).nxtlist; | 2182 | local_irq_save(flags); |
1986 | } | 2183 | } |
1987 | 2184 | ||
1988 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | 2185 | /* |
1989 | if (c) | 2186 | * If RCU callbacks are still pending, RCU still needs this CPU. |
2187 | * So try forcing the callbacks through the grace period. | ||
2188 | */ | ||
2189 | if (rcu_cpu_has_callbacks(cpu)) { | ||
2190 | local_irq_restore(flags); | ||
2191 | trace_rcu_prep_idle("More callbacks"); | ||
1990 | invoke_rcu_core(); | 2192 | invoke_rcu_core(); |
1991 | return c; | 2193 | } else { |
1992 | } | 2194 | local_irq_restore(flags); |
1993 | 2195 | trace_rcu_prep_idle("Callbacks drained"); | |
1994 | /* | 2196 | } |
1995 | * Check to see if we need to continue a callback-flush operations to | ||
1996 | * allow the last CPU to enter dyntick-idle mode. | ||
1997 | */ | ||
1998 | static void rcu_needs_cpu_flush(void) | ||
1999 | { | ||
2000 | int cpu = smp_processor_id(); | ||
2001 | unsigned long flags; | ||
2002 | |||
2003 | if (per_cpu(rcu_dyntick_drain, cpu) <= 0) | ||
2004 | return; | ||
2005 | local_irq_save(flags); | ||
2006 | (void)rcu_needs_cpu(cpu); | ||
2007 | local_irq_restore(flags); | ||
2008 | } | 2197 | } |
2009 | 2198 | ||
2010 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 2199 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
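The CONFIG_RCU_FAST_NO_HZ half of the hunk above is essentially a small state machine: a drain counter bounds how many passes are spent flushing callbacks, a holdoff stamp prevents retries within the same jiffy, and a CPU whose remaining callbacks are not urgent is allowed into dyntick-idle anyway, with an hrtimer armed to pull it back out. The following is a condensed, user-space model of that control flow only (the constants and printfs are mine; no RCU machinery is involved):

#include <stdbool.h>
#include <stdio.h>

#define IDLE_FLUSHES     5      /* total tries before giving up */
#define IDLE_OPT_FLUSHES 3      /* at or below this, tries are optional */

static int drain, holdoff_jiffy = -1, jiffy;

static void prepare_for_idle(bool has_cbs, bool rcu_pending)
{
        if (!has_cbs) {                       /* nothing queued: go idle */
                holdoff_jiffy = -1;
                drain = 0;
                printf("j=%d: no callbacks, enter dyntick-idle\n", jiffy);
                return;
        }
        if (holdoff_jiffy == jiffy) {         /* already failed this jiffy */
                printf("j=%d: in holdoff, keep the tick\n", jiffy);
                return;
        }
        if (drain <= 0)
                drain = IDLE_FLUSHES;         /* first pass: arm the counter */
        else if (drain <= IDLE_OPT_FLUSHES && !rcu_pending) {
                drain = 0;                    /* idle despite queued callbacks */
                printf("j=%d: dyntick-idle with callbacks, arm timer\n", jiffy);
                return;
        } else if (--drain <= 0) {
                holdoff_jiffy = jiffy;        /* give up until the next jiffy */
                printf("j=%d: begin holdoff\n", jiffy);
                return;
        }
        printf("j=%d: push callbacks through the core (drain=%d)\n",
               jiffy, drain);
}

int main(void)
{
        for (jiffy = 0; jiffy < 3; jiffy++) {
                prepare_for_idle(true, jiffy == 0);  /* urgent work only at j=0 */
                prepare_for_idle(true, false);
        }
        prepare_for_idle(false, false);
        return 0;
}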
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3b0c0986afc0..654cfe67f0d1 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -48,11 +48,6 @@ | |||
48 | 48 | ||
49 | #ifdef CONFIG_RCU_BOOST | 49 | #ifdef CONFIG_RCU_BOOST |
50 | 50 | ||
51 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
52 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); | ||
53 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
54 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
55 | |||
56 | static char convert_kthread_status(unsigned int kthread_status) | 51 | static char convert_kthread_status(unsigned int kthread_status) |
57 | { | 52 | { |
58 | if (kthread_status > RCU_KTHREAD_MAX) | 53 | if (kthread_status > RCU_KTHREAD_MAX) |
@@ -66,19 +61,17 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
66 | { | 61 | { |
67 | if (!rdp->beenonline) | 62 | if (!rdp->beenonline) |
68 | return; | 63 | return; |
69 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", | 64 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", |
70 | rdp->cpu, | 65 | rdp->cpu, |
71 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 66 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
72 | rdp->completed, rdp->gpnum, | 67 | rdp->completed, rdp->gpnum, |
73 | rdp->passed_quiesc, rdp->passed_quiesc_completed, | 68 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
74 | rdp->qs_pending); | 69 | rdp->qs_pending); |
75 | #ifdef CONFIG_NO_HZ | 70 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
76 | seq_printf(m, " dt=%d/%d/%d df=%lu", | ||
77 | atomic_read(&rdp->dynticks->dynticks), | 71 | atomic_read(&rdp->dynticks->dynticks), |
78 | rdp->dynticks->dynticks_nesting, | 72 | rdp->dynticks->dynticks_nesting, |
79 | rdp->dynticks->dynticks_nmi_nesting, | 73 | rdp->dynticks->dynticks_nmi_nesting, |
80 | rdp->dynticks_fqs); | 74 | rdp->dynticks_fqs); |
81 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
82 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 75 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
83 | seq_printf(m, " ql=%ld qs=%c%c%c%c", | 76 | seq_printf(m, " ql=%ld qs=%c%c%c%c", |
84 | rdp->qlen, | 77 | rdp->qlen, |
@@ -144,15 +137,13 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
144 | rdp->cpu, | 137 | rdp->cpu, |
145 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | 138 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", |
146 | rdp->completed, rdp->gpnum, | 139 | rdp->completed, rdp->gpnum, |
147 | rdp->passed_quiesc, rdp->passed_quiesc_completed, | 140 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, |
148 | rdp->qs_pending); | 141 | rdp->qs_pending); |
149 | #ifdef CONFIG_NO_HZ | 142 | seq_printf(m, ",%d,%llx,%d,%lu", |
150 | seq_printf(m, ",%d,%d,%d,%lu", | ||
151 | atomic_read(&rdp->dynticks->dynticks), | 143 | atomic_read(&rdp->dynticks->dynticks), |
152 | rdp->dynticks->dynticks_nesting, | 144 | rdp->dynticks->dynticks_nesting, |
153 | rdp->dynticks->dynticks_nmi_nesting, | 145 | rdp->dynticks->dynticks_nmi_nesting, |
154 | rdp->dynticks_fqs); | 146 | rdp->dynticks_fqs); |
155 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
156 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 147 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
157 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, | 148 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, |
158 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | 149 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != |
@@ -175,10 +166,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
175 | 166 | ||
176 | static int show_rcudata_csv(struct seq_file *m, void *unused) | 167 | static int show_rcudata_csv(struct seq_file *m, void *unused) |
177 | { | 168 | { |
178 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); | 169 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); |
179 | #ifdef CONFIG_NO_HZ | ||
180 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 170 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
181 | #endif /* #ifdef CONFIG_NO_HZ */ | ||
182 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); | 171 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); |
183 | #ifdef CONFIG_RCU_BOOST | 172 | #ifdef CONFIG_RCU_BOOST |
184 | seq_puts(m, "\"kt\",\"ktl\""); | 173 | seq_puts(m, "\"kt\",\"ktl\""); |
@@ -283,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
283 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
284 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
285 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", |
286 | rsp->completed, gpnum, rsp->signaled, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
287 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
288 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
289 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
diff --git a/kernel/relay.c b/kernel/relay.c index 859ea5a9605f..4335e1d7ee2d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -15,7 +15,7 @@ | |||
15 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
16 | #include <linux/stddef.h> | 16 | #include <linux/stddef.h> |
17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
18 | #include <linux/module.h> | 18 | #include <linux/export.h> |
19 | #include <linux/string.h> | 19 | #include <linux/string.h> |
20 | #include <linux/relay.h> | 20 | #include <linux/relay.h> |
21 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
@@ -302,7 +302,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, | |||
302 | */ | 302 | */ |
303 | static struct dentry *create_buf_file_default_callback(const char *filename, | 303 | static struct dentry *create_buf_file_default_callback(const char *filename, |
304 | struct dentry *parent, | 304 | struct dentry *parent, |
305 | int mode, | 305 | umode_t mode, |
306 | struct rchan_buf *buf, | 306 | struct rchan_buf *buf, |
307 | int *is_global) | 307 | int *is_global) |
308 | { | 308 | { |
diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 34683efa2cce..6d269cce7aa1 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c | |||
@@ -159,8 +159,7 @@ int res_counter_memparse_write_strategy(const char *buf, | |||
159 | return 0; | 159 | return 0; |
160 | } | 160 | } |
161 | 161 | ||
162 | /* FIXME - make memparse() take const char* args */ | 162 | *res = memparse(buf, &end); |
163 | *res = memparse((char *)buf, &end); | ||
164 | if (*end != '\0') | 163 | if (*end != '\0') |
165 | return -EINVAL; | 164 | return -EINVAL; |
166 | 165 | ||
diff --git a/kernel/resource.c b/kernel/resource.c index c8dc249da5ce..7640b3a947d0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Arbitrary resource management. | 7 | * Arbitrary resource management. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/module.h> | 10 | #include <linux/export.h> |
11 | #include <linux/errno.h> | 11 | #include <linux/errno.h> |
12 | #include <linux/ioport.h> | 12 | #include <linux/ioport.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 3c7cbc2c33be..16502d3a71c8 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c | |||
@@ -18,7 +18,7 @@ | |||
18 | */ | 18 | */ |
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/delay.h> | 20 | #include <linux/delay.h> |
21 | #include <linux/module.h> | 21 | #include <linux/export.h> |
22 | #include <linux/spinlock.h> | 22 | #include <linux/spinlock.h> |
23 | #include <linux/kallsyms.h> | 23 | #include <linux/kallsyms.h> |
24 | #include <linux/syscalls.h> | 24 | #include <linux/syscalls.h> |
@@ -29,61 +29,6 @@ | |||
29 | 29 | ||
30 | #include "rtmutex_common.h" | 30 | #include "rtmutex_common.h" |
31 | 31 | ||
32 | # define TRACE_WARN_ON(x) WARN_ON(x) | ||
33 | # define TRACE_BUG_ON(x) BUG_ON(x) | ||
34 | |||
35 | # define TRACE_OFF() \ | ||
36 | do { \ | ||
37 | if (rt_trace_on) { \ | ||
38 | rt_trace_on = 0; \ | ||
39 | console_verbose(); \ | ||
40 | if (raw_spin_is_locked(¤t->pi_lock)) \ | ||
41 | raw_spin_unlock(¤t->pi_lock); \ | ||
42 | } \ | ||
43 | } while (0) | ||
44 | |||
45 | # define TRACE_OFF_NOLOCK() \ | ||
46 | do { \ | ||
47 | if (rt_trace_on) { \ | ||
48 | rt_trace_on = 0; \ | ||
49 | console_verbose(); \ | ||
50 | } \ | ||
51 | } while (0) | ||
52 | |||
53 | # define TRACE_BUG_LOCKED() \ | ||
54 | do { \ | ||
55 | TRACE_OFF(); \ | ||
56 | BUG(); \ | ||
57 | } while (0) | ||
58 | |||
59 | # define TRACE_WARN_ON_LOCKED(c) \ | ||
60 | do { \ | ||
61 | if (unlikely(c)) { \ | ||
62 | TRACE_OFF(); \ | ||
63 | WARN_ON(1); \ | ||
64 | } \ | ||
65 | } while (0) | ||
66 | |||
67 | # define TRACE_BUG_ON_LOCKED(c) \ | ||
68 | do { \ | ||
69 | if (unlikely(c)) \ | ||
70 | TRACE_BUG_LOCKED(); \ | ||
71 | } while (0) | ||
72 | |||
73 | #ifdef CONFIG_SMP | ||
74 | # define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) | ||
75 | #else | ||
76 | # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) | ||
77 | #endif | ||
78 | |||
79 | /* | ||
80 | * deadlock detection flag. We turn it off when we detect | ||
81 | * the first problem because we dont want to recurse back | ||
82 | * into the tracing code when doing error printk or | ||
83 | * executing a BUG(): | ||
84 | */ | ||
85 | static int rt_trace_on = 1; | ||
86 | |||
87 | static void printk_task(struct task_struct *p) | 32 | static void printk_task(struct task_struct *p) |
88 | { | 33 | { |
89 | if (p) | 34 | if (p) |
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) | |||
111 | 56 | ||
112 | void rt_mutex_debug_task_free(struct task_struct *task) | 57 | void rt_mutex_debug_task_free(struct task_struct *task) |
113 | { | 58 | { |
114 | WARN_ON(!plist_head_empty(&task->pi_waiters)); | 59 | DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); |
115 | WARN_ON(task->pi_blocked_on); | 60 | DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); |
116 | } | 61 | } |
117 | 62 | ||
118 | /* | 63 | /* |
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | |||
125 | { | 70 | { |
126 | struct task_struct *task; | 71 | struct task_struct *task; |
127 | 72 | ||
128 | if (!rt_trace_on || detect || !act_waiter) | 73 | if (!debug_locks || detect || !act_waiter) |
129 | return; | 74 | return; |
130 | 75 | ||
131 | task = rt_mutex_owner(act_waiter->lock); | 76 | task = rt_mutex_owner(act_waiter->lock); |
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
139 | { | 84 | { |
140 | struct task_struct *task; | 85 | struct task_struct *task; |
141 | 86 | ||
142 | if (!waiter->deadlock_lock || !rt_trace_on) | 87 | if (!waiter->deadlock_lock || !debug_locks) |
143 | return; | 88 | return; |
144 | 89 | ||
145 | rcu_read_lock(); | 90 | rcu_read_lock(); |
@@ -149,10 +94,14 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
149 | return; | 94 | return; |
150 | } | 95 | } |
151 | 96 | ||
152 | TRACE_OFF_NOLOCK(); | 97 | if (!debug_locks_off()) { |
98 | rcu_read_unlock(); | ||
99 | return; | ||
100 | } | ||
153 | 101 | ||
154 | printk("\n============================================\n"); | 102 | printk("\n============================================\n"); |
155 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | 103 | printk( "[ BUG: circular locking deadlock detected! ]\n"); |
104 | printk("%s\n", print_tainted()); | ||
156 | printk( "--------------------------------------------\n"); | 105 | printk( "--------------------------------------------\n"); |
157 | printk("%s/%d is deadlocking current task %s/%d\n\n", | 106 | printk("%s/%d is deadlocking current task %s/%d\n\n", |
158 | task->comm, task_pid_nr(task), | 107 | task->comm, task_pid_nr(task), |
@@ -180,7 +129,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | |||
180 | 129 | ||
181 | printk("[ turning off deadlock detection." | 130 | printk("[ turning off deadlock detection." |
182 | "Please report this trace. ]\n\n"); | 131 | "Please report this trace. ]\n\n"); |
183 | local_irq_disable(); | ||
184 | } | 132 | } |
185 | 133 | ||
186 | void debug_rt_mutex_lock(struct rt_mutex *lock) | 134 | void debug_rt_mutex_lock(struct rt_mutex *lock) |
@@ -189,7 +137,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock) | |||
189 | 137 | ||
190 | void debug_rt_mutex_unlock(struct rt_mutex *lock) | 138 | void debug_rt_mutex_unlock(struct rt_mutex *lock) |
191 | { | 139 | { |
192 | TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); | 140 | DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); |
193 | } | 141 | } |
194 | 142 | ||
195 | void | 143 | void |
@@ -199,7 +147,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) | |||
199 | 147 | ||
200 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | 148 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) |
201 | { | 149 | { |
202 | TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); | 150 | DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); |
203 | } | 151 | } |
204 | 152 | ||
205 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | 153 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) |
@@ -213,8 +161,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | |||
213 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | 161 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) |
214 | { | 162 | { |
215 | put_pid(waiter->deadlock_task_pid); | 163 | put_pid(waiter->deadlock_task_pid); |
216 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | 164 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); |
217 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | 165 | DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); |
218 | memset(waiter, 0x22, sizeof(*waiter)); | 166 | memset(waiter, 0x22, sizeof(*waiter)); |
219 | } | 167 | } |
220 | 168 | ||
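The rtmutex-debug.c changes above retire the file-private rt_trace_on flag and the TRACE_*() macros in favor of the shared debug_locks machinery, so the first detected problem disables further checks globally. A compact user-space sketch of that report-once-then-disable pattern, loosely modeled on (but much simpler than) lib/debug_locks.c:

#include <stdio.h>

static int debug_locks = 1;

/* atomically turn checking off; return nonzero only for the first caller */
static int debug_locks_off(void)
{
        return __atomic_exchange_n(&debug_locks, 0, __ATOMIC_SEQ_CST);
}

#define DEBUG_LOCKS_WARN_ON(cond)                                   \
        do {                                                        \
                if ((cond) && debug_locks_off())                    \
                        fprintf(stderr, "WARN: %s\n", #cond);       \
        } while (0)

int main(void)
{
        DEBUG_LOCKS_WARN_ON(1 + 1 == 3);   /* condition false: no warning */
        DEBUG_LOCKS_WARN_ON(2 + 2 == 4);   /* first real hit: prints once */
        DEBUG_LOCKS_WARN_ON(2 + 2 == 4);   /* suppressed: checks disabled */
        return 0;
}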
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 5c9ccd380966..98ec49475460 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -6,11 +6,11 @@ | |||
6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | 6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> |
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | #include <linux/device.h> | ||
9 | #include <linux/kthread.h> | 10 | #include <linux/kthread.h> |
10 | #include <linux/module.h> | 11 | #include <linux/export.h> |
11 | #include <linux/sched.h> | 12 | #include <linux/sched.h> |
12 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
13 | #include <linux/sysdev.h> | ||
14 | #include <linux/timer.h> | 14 | #include <linux/timer.h> |
15 | #include <linux/freezer.h> | 15 | #include <linux/freezer.h> |
16 | 16 | ||
@@ -27,7 +27,7 @@ struct test_thread_data { | |||
27 | int opdata; | 27 | int opdata; |
28 | int mutexes[MAX_RT_TEST_MUTEXES]; | 28 | int mutexes[MAX_RT_TEST_MUTEXES]; |
29 | int event; | 29 | int event; |
30 | struct sys_device sysdev; | 30 | struct device dev; |
31 | }; | 31 | }; |
32 | 32 | ||
33 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | 33 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; |
@@ -271,7 +271,7 @@ static int test_func(void *data) | |||
271 | * | 271 | * |
272 | * opcode:data | 272 | * opcode:data |
273 | */ | 273 | */ |
274 | static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, | 274 | static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, |
275 | const char *buf, size_t count) | 275 | const char *buf, size_t count) |
276 | { | 276 | { |
277 | struct sched_param schedpar; | 277 | struct sched_param schedpar; |
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut | |||
279 | char cmdbuf[32]; | 279 | char cmdbuf[32]; |
280 | int op, dat, tid, ret; | 280 | int op, dat, tid, ret; |
281 | 281 | ||
282 | td = container_of(dev, struct test_thread_data, sysdev); | 282 | td = container_of(dev, struct test_thread_data, dev); |
283 | tid = td->sysdev.id; | 283 | tid = td->dev.id; |
284 | 284 | ||
285 | /* strings from sysfs write are not 0 terminated! */ | 285 | /* strings from sysfs write are not 0 terminated! */ |
286 | if (count >= sizeof(cmdbuf)) | 286 | if (count >= sizeof(cmdbuf)) |
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut | |||
334 | * @dev: thread to query | 334 | * @dev: thread to query |
335 | * @buf: char buffer to be filled with thread status info | 335 | * @buf: char buffer to be filled with thread status info |
336 | */ | 336 | */ |
337 | static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, | 337 | static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, |
338 | char *buf) | 338 | char *buf) |
339 | { | 339 | { |
340 | struct test_thread_data *td; | 340 | struct test_thread_data *td; |
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute | |||
342 | char *curr = buf; | 342 | char *curr = buf; |
343 | int i; | 343 | int i; |
344 | 344 | ||
345 | td = container_of(dev, struct test_thread_data, sysdev); | 345 | td = container_of(dev, struct test_thread_data, dev); |
346 | tsk = threads[td->sysdev.id]; | 346 | tsk = threads[td->dev.id]; |
347 | 347 | ||
348 | spin_lock(&rttest_lock); | 348 | spin_lock(&rttest_lock); |
349 | 349 | ||
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute | |||
360 | spin_unlock(&rttest_lock); | 360 | spin_unlock(&rttest_lock); |
361 | 361 | ||
362 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | 362 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, |
363 | mutexes[td->sysdev.id].owner); | 363 | mutexes[td->dev.id].owner); |
364 | 364 | ||
365 | return curr - buf; | 365 | return curr - buf; |
366 | } | 366 | } |
367 | 367 | ||
368 | static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | 368 | static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); |
369 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | 369 | static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); |
370 | 370 | ||
371 | static struct sysdev_class rttest_sysclass = { | 371 | static struct bus_type rttest_subsys = { |
372 | .name = "rttest", | 372 | .name = "rttest", |
373 | .dev_name = "rttest", | ||
373 | }; | 374 | }; |
374 | 375 | ||
375 | static int init_test_thread(int id) | 376 | static int init_test_thread(int id) |
376 | { | 377 | { |
377 | thread_data[id].sysdev.cls = &rttest_sysclass; | 378 | thread_data[id].dev.bus = &rttest_subsys; |
378 | thread_data[id].sysdev.id = id; | 379 | thread_data[id].dev.id = id; |
379 | 380 | ||
380 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | 381 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); |
381 | if (IS_ERR(threads[id])) | 382 | if (IS_ERR(threads[id])) |
382 | return PTR_ERR(threads[id]); | 383 | return PTR_ERR(threads[id]); |
383 | 384 | ||
384 | return sysdev_register(&thread_data[id].sysdev); | 385 | return device_register(&thread_data[id].dev); |
385 | } | 386 | } |
386 | 387 | ||
387 | static int init_rttest(void) | 388 | static int init_rttest(void) |
@@ -393,7 +394,7 @@ static int init_rttest(void) | |||
393 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | 394 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) |
394 | rt_mutex_init(&mutexes[i]); | 395 | rt_mutex_init(&mutexes[i]); |
395 | 396 | ||
396 | ret = sysdev_class_register(&rttest_sysclass); | 397 | ret = subsys_system_register(&rttest_subsys, NULL); |
397 | if (ret) | 398 | if (ret) |
398 | return ret; | 399 | return ret; |
399 | 400 | ||
@@ -401,10 +402,10 @@ static int init_rttest(void) | |||
401 | ret = init_test_thread(i); | 402 | ret = init_test_thread(i); |
402 | if (ret) | 403 | if (ret) |
403 | break; | 404 | break; |
404 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); | 405 | ret = device_create_file(&thread_data[i].dev, &dev_attr_status); |
405 | if (ret) | 406 | if (ret) |
406 | break; | 407 | break; |
407 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); | 408 | ret = device_create_file(&thread_data[i].dev, &dev_attr_command); |
408 | if (ret) | 409 | if (ret) |
409 | break; | 410 | break; |
410 | } | 411 | } |
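The rtmutex-tester conversion above is part of the sysdev removal: struct sys_device / struct sysdev_class become a plain struct device hanging off a struct bus_type registered with subsys_system_register(), and SYSDEV_ATTR / sysdev_create_file become DEVICE_ATTR / device_create_file. A minimal sketch of the resulting pattern, with made-up names, not taken from the patch:

    #include <linux/device.h>
    #include <linux/init.h>

    static ssize_t value_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
    {
            return sprintf(buf, "%d\n", dev->id);
    }
    static DEVICE_ATTR(value, 0444, value_show, NULL);

    static struct bus_type example_subsys = {
            .name           = "example",
            .dev_name       = "example",
    };

    static struct device example_dev;

    static int __init example_init(void)
    {
            int ret = subsys_system_register(&example_subsys, NULL);
            if (ret)
                    return ret;

            example_dev.bus = &example_subsys;
            example_dev.id  = 0;
            ret = device_register(&example_dev);
            if (ret)
                    return ret;

            /* exposes /sys/devices/system/example/example0/value */
            return device_create_file(&example_dev, &dev_attr_value);
    }
    device_initcall(example_init);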
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 255e1662acdb..a242e691c993 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
@@ -11,7 +11,7 @@ | |||
11 | * See Documentation/rt-mutex-design.txt for details. | 11 | * See Documentation/rt-mutex-design.txt for details. |
12 | */ | 12 | */ |
13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/timer.h> | 16 | #include <linux/timer.h> |
17 | 17 | ||
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 9f48f3d82e9b..b152f74f02de 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -7,7 +7,7 @@ | |||
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/module.h> | 10 | #include <linux/export.h> |
11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
12 | 12 | ||
13 | #include <asm/system.h> | 13 | #include <asm/system.h> |
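rtmutex.c and rwsem.c (and sched/clock.c further down) only export symbols, so they move from the heavyweight <linux/module.h> to the slimmer <linux/export.h>, part of the module.h include cleanup. A sketch of the trimmed include set for a file that exports a symbol but is not itself a module (illustrative, hypothetical function name):

    #include <linux/export.h>

    int example_api(void)
    {
            return 0;
    }
    EXPORT_SYMBOL_GPL(example_api);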
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 000000000000..9a7dd35102a3 --- /dev/null +++ b/kernel/sched/Makefile | |||
@@ -0,0 +1,20 @@ | |||
1 | ifdef CONFIG_FUNCTION_TRACER | ||
2 | CFLAGS_REMOVE_clock.o = -pg | ||
3 | endif | ||
4 | |||
5 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | ||
6 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | ||
7 | # needed for x86 only. Why this used to be enabled for all architectures is beyond | ||
8 | # me. I suspect most platforms don't need this, but until we know that for sure | ||
9 | # I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k | ||
10 | # to get a correct value for the wait-channel (WCHAN in ps). --davidm | ||
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | ||
12 | endif | ||
13 | |||
14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | ||
15 | obj-$(CONFIG_SMP) += cpupri.o | ||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | ||
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | ||
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | ||
19 | |||
20 | |||
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c index 429242f3c484..e8a1f83ee0e7 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched/auto_group.c | |||
@@ -1,15 +1,19 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | #include "sched.h" | ||
4 | |||
3 | #include <linux/proc_fs.h> | 5 | #include <linux/proc_fs.h> |
4 | #include <linux/seq_file.h> | 6 | #include <linux/seq_file.h> |
5 | #include <linux/kallsyms.h> | 7 | #include <linux/kallsyms.h> |
6 | #include <linux/utsname.h> | 8 | #include <linux/utsname.h> |
9 | #include <linux/security.h> | ||
10 | #include <linux/export.h> | ||
7 | 11 | ||
8 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 12 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
9 | static struct autogroup autogroup_default; | 13 | static struct autogroup autogroup_default; |
10 | static atomic_t autogroup_seq_nr; | 14 | static atomic_t autogroup_seq_nr; |
11 | 15 | ||
12 | static void __init autogroup_init(struct task_struct *init_task) | 16 | void __init autogroup_init(struct task_struct *init_task) |
13 | { | 17 | { |
14 | autogroup_default.tg = &root_task_group; | 18 | autogroup_default.tg = &root_task_group; |
15 | kref_init(&autogroup_default.kref); | 19 | kref_init(&autogroup_default.kref); |
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task) | |||
17 | init_task->signal->autogroup = &autogroup_default; | 21 | init_task->signal->autogroup = &autogroup_default; |
18 | } | 22 | } |
19 | 23 | ||
20 | static inline void autogroup_free(struct task_group *tg) | 24 | void autogroup_free(struct task_group *tg) |
21 | { | 25 | { |
22 | kfree(tg->autogroup); | 26 | kfree(tg->autogroup); |
23 | } | 27 | } |
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) | |||
59 | return ag; | 63 | return ag; |
60 | } | 64 | } |
61 | 65 | ||
62 | #ifdef CONFIG_RT_GROUP_SCHED | ||
63 | static void free_rt_sched_group(struct task_group *tg); | ||
64 | #endif | ||
65 | |||
66 | static inline struct autogroup *autogroup_create(void) | 66 | static inline struct autogroup *autogroup_create(void) |
67 | { | 67 | { |
68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); | 68 | struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); |
@@ -108,8 +108,7 @@ out_fail: | |||
108 | return autogroup_kref_get(&autogroup_default); | 108 | return autogroup_kref_get(&autogroup_default); |
109 | } | 109 | } |
110 | 110 | ||
111 | static inline bool | 111 | bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) |
112 | task_wants_autogroup(struct task_struct *p, struct task_group *tg) | ||
113 | { | 112 | { |
114 | if (tg != &root_task_group) | 113 | if (tg != &root_task_group) |
115 | return false; | 114 | return false; |
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
127 | return true; | 126 | return true; |
128 | } | 127 | } |
129 | 128 | ||
130 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
131 | { | ||
132 | return !!tg->autogroup; | ||
133 | } | ||
134 | |||
135 | static inline struct task_group * | ||
136 | autogroup_task_group(struct task_struct *p, struct task_group *tg) | ||
137 | { | ||
138 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
139 | |||
140 | if (enabled && task_wants_autogroup(p, tg)) | ||
141 | return p->signal->autogroup->tg; | ||
142 | |||
143 | return tg; | ||
144 | } | ||
145 | |||
146 | static void | 129 | static void |
147 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) | 130 | autogroup_move_group(struct task_struct *p, struct autogroup *ag) |
148 | { | 131 | { |
@@ -263,7 +246,7 @@ out: | |||
263 | #endif /* CONFIG_PROC_FS */ | 246 | #endif /* CONFIG_PROC_FS */ |
264 | 247 | ||
265 | #ifdef CONFIG_SCHED_DEBUG | 248 | #ifdef CONFIG_SCHED_DEBUG |
266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 249 | int autogroup_path(struct task_group *tg, char *buf, int buflen) |
267 | { | 250 | { |
268 | if (!task_group_is_autogroup(tg)) | 251 | if (!task_group_is_autogroup(tg)) |
269 | return 0; | 252 | return 0; |
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h index c2f0e7248dca..8bd047142816 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched/auto_group.h | |||
@@ -1,5 +1,8 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | #include <linux/kref.h> | ||
4 | #include <linux/rwsem.h> | ||
5 | |||
3 | struct autogroup { | 6 | struct autogroup { |
4 | /* | 7 | /* |
5 | * reference doesn't mean how many thread attach to this | 8 | * reference doesn't mean how many thread attach to this |
@@ -13,9 +16,28 @@ struct autogroup { | |||
13 | int nice; | 16 | int nice; |
14 | }; | 17 | }; |
15 | 18 | ||
16 | static inline bool task_group_is_autogroup(struct task_group *tg); | 19 | extern void autogroup_init(struct task_struct *init_task); |
20 | extern void autogroup_free(struct task_group *tg); | ||
21 | |||
22 | static inline bool task_group_is_autogroup(struct task_group *tg) | ||
23 | { | ||
24 | return !!tg->autogroup; | ||
25 | } | ||
26 | |||
27 | extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); | ||
28 | |||
17 | static inline struct task_group * | 29 | static inline struct task_group * |
18 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | 30 | autogroup_task_group(struct task_struct *p, struct task_group *tg) |
31 | { | ||
32 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | ||
33 | |||
34 | if (enabled && task_wants_autogroup(p, tg)) | ||
35 | return p->signal->autogroup->tg; | ||
36 | |||
37 | return tg; | ||
38 | } | ||
39 | |||
40 | extern int autogroup_path(struct task_group *tg, char *buf, int buflen); | ||
19 | 41 | ||
20 | #else /* !CONFIG_SCHED_AUTOGROUP */ | 42 | #else /* !CONFIG_SCHED_AUTOGROUP */ |
21 | 43 | ||
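With the scheduler split into kernel/sched/, auto_group.c is no longer #included into one big sched.c translation unit, so its entry points lose static and are declared extern in auto_group.h, while the tiny helpers (and autogroup_task_group() itself) move into the header as inlines. A compressed sketch of that split, using the bodies already shown above:

    /* auto_group.h: small helper stays inline, larger one becomes extern */
    extern void autogroup_free(struct task_group *tg);

    static inline bool task_group_is_autogroup(struct task_group *tg)
    {
            return !!tg->autogroup;
    }

    /* auto_group.c: definition drops "static" so other sched/ objects can link to it */
    void autogroup_free(struct task_group *tg)
    {
            kfree(tg->autogroup);
    }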
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c index 9d8af0b3fb64..c685e31492df 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched/clock.c | |||
@@ -62,7 +62,7 @@ | |||
62 | */ | 62 | */ |
63 | #include <linux/spinlock.h> | 63 | #include <linux/spinlock.h> |
64 | #include <linux/hardirq.h> | 64 | #include <linux/hardirq.h> |
65 | #include <linux/module.h> | 65 | #include <linux/export.h> |
66 | #include <linux/percpu.h> | 66 | #include <linux/percpu.h> |
67 | #include <linux/ktime.h> | 67 | #include <linux/ktime.h> |
68 | #include <linux/sched.h> | 68 | #include <linux/sched.h> |
diff --git a/kernel/sched.c b/kernel/sched/core.c index 5670028a9c16..df00cb09263e 100644 --- a/kernel/sched.c +++ b/kernel/sched/core.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched/core.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
@@ -56,7 +56,6 @@ | |||
56 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
57 | #include <linux/proc_fs.h> | 57 | #include <linux/proc_fs.h> |
58 | #include <linux/seq_file.h> | 58 | #include <linux/seq_file.h> |
59 | #include <linux/stop_machine.h> | ||
60 | #include <linux/sysctl.h> | 59 | #include <linux/sysctl.h> |
61 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
62 | #include <linux/times.h> | 61 | #include <linux/times.h> |
@@ -71,593 +70,46 @@ | |||
71 | #include <linux/ctype.h> | 70 | #include <linux/ctype.h> |
72 | #include <linux/ftrace.h> | 71 | #include <linux/ftrace.h> |
73 | #include <linux/slab.h> | 72 | #include <linux/slab.h> |
73 | #include <linux/init_task.h> | ||
74 | 74 | ||
75 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
76 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
77 | #include <asm/mutex.h> | ||
78 | #ifdef CONFIG_PARAVIRT | 77 | #ifdef CONFIG_PARAVIRT |
79 | #include <asm/paravirt.h> | 78 | #include <asm/paravirt.h> |
80 | #endif | 79 | #endif |
81 | 80 | ||
82 | #include "sched_cpupri.h" | 81 | #include "sched.h" |
83 | #include "workqueue_sched.h" | 82 | #include "../workqueue_sched.h" |
84 | #include "sched_autogroup.h" | ||
85 | 83 | ||
86 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
87 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
88 | 86 | ||
89 | /* | 87 | void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) |
90 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
91 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
92 | * and back. | ||
93 | */ | ||
94 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
95 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
96 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
97 | |||
98 | /* | ||
99 | * 'User priority' is the nice value converted to something we | ||
100 | * can work with better when scaling various scheduler parameters, | ||
101 | * it's a [ 0 ... 39 ] range. | ||
102 | */ | ||
103 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
104 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
105 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
106 | |||
107 | /* | ||
108 | * Helpers for converting nanosecond timing to jiffy resolution | ||
109 | */ | ||
110 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
111 | |||
112 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
113 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
114 | |||
115 | /* | ||
116 | * These are the 'tuning knobs' of the scheduler: | ||
117 | * | ||
118 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
119 | * Timeslices get refilled after they expire. | ||
120 | */ | ||
121 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
122 | |||
123 | /* | ||
124 | * single value that denotes runtime == period, ie unlimited time. | ||
125 | */ | ||
126 | #define RUNTIME_INF ((u64)~0ULL) | ||
127 | |||
128 | static inline int rt_policy(int policy) | ||
129 | { | ||
130 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
131 | return 1; | ||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | static inline int task_has_rt_policy(struct task_struct *p) | ||
136 | { | ||
137 | return rt_policy(p->policy); | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * This is the priority-queue data structure of the RT scheduling class: | ||
142 | */ | ||
143 | struct rt_prio_array { | ||
144 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
145 | struct list_head queue[MAX_RT_PRIO]; | ||
146 | }; | ||
147 | |||
148 | struct rt_bandwidth { | ||
149 | /* nests inside the rq lock: */ | ||
150 | raw_spinlock_t rt_runtime_lock; | ||
151 | ktime_t rt_period; | ||
152 | u64 rt_runtime; | ||
153 | struct hrtimer rt_period_timer; | ||
154 | }; | ||
155 | |||
156 | static struct rt_bandwidth def_rt_bandwidth; | ||
157 | |||
158 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
159 | |||
160 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
161 | { | ||
162 | struct rt_bandwidth *rt_b = | ||
163 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
164 | ktime_t now; | ||
165 | int overrun; | ||
166 | int idle = 0; | ||
167 | |||
168 | for (;;) { | ||
169 | now = hrtimer_cb_get_time(timer); | ||
170 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
171 | |||
172 | if (!overrun) | ||
173 | break; | ||
174 | |||
175 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
176 | } | ||
177 | |||
178 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
179 | } | ||
180 | |||
181 | static | ||
182 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
183 | { | ||
184 | rt_b->rt_period = ns_to_ktime(period); | ||
185 | rt_b->rt_runtime = runtime; | ||
186 | |||
187 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
188 | |||
189 | hrtimer_init(&rt_b->rt_period_timer, | ||
190 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
191 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
192 | } | ||
193 | |||
194 | static inline int rt_bandwidth_enabled(void) | ||
195 | { | 88 | { |
196 | return sysctl_sched_rt_runtime >= 0; | 89 | unsigned long delta; |
197 | } | 90 | ktime_t soft, hard, now; |
198 | |||
199 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
200 | { | ||
201 | ktime_t now; | ||
202 | |||
203 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
204 | return; | ||
205 | 91 | ||
206 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
207 | return; | ||
208 | |||
209 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
210 | for (;;) { | 92 | for (;;) { |
211 | unsigned long delta; | 93 | if (hrtimer_active(period_timer)) |
212 | ktime_t soft, hard; | ||
213 | |||
214 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
215 | break; | 94 | break; |
216 | 95 | ||
217 | now = hrtimer_cb_get_time(&rt_b->rt_period_timer); | 96 | now = hrtimer_cb_get_time(period_timer); |
218 | hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); | 97 | hrtimer_forward(period_timer, now, period); |
219 | 98 | ||
220 | soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); | 99 | soft = hrtimer_get_softexpires(period_timer); |
221 | hard = hrtimer_get_expires(&rt_b->rt_period_timer); | 100 | hard = hrtimer_get_expires(period_timer); |
222 | delta = ktime_to_ns(ktime_sub(hard, soft)); | 101 | delta = ktime_to_ns(ktime_sub(hard, soft)); |
223 | __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, | 102 | __hrtimer_start_range_ns(period_timer, soft, delta, |
224 | HRTIMER_MODE_ABS_PINNED, 0); | 103 | HRTIMER_MODE_ABS_PINNED, 0); |
225 | } | 104 | } |
226 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
227 | } | 105 | } |
228 | 106 | ||
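The open-coded rt_bandwidth timer start is generalized here into start_bandwidth_timer(period_timer, period), so other period timers (the CFS bandwidth timer as well as the RT one) can share the forward-and-arm loop. The core of that loop, restated outside the scheduler purely as an illustration:

    /* Illustrative restatement: push the timer past "now" by whole periods,
     * then re-arm it on its (soft, hard) expiry window. */
    static void forward_and_start(struct hrtimer *timer, ktime_t period)
    {
            ktime_t now, soft, hard;
            unsigned long delta;

            now = hrtimer_cb_get_time(timer);
            hrtimer_forward(timer, now, period);

            soft  = hrtimer_get_softexpires(timer);
            hard  = hrtimer_get_expires(timer);
            delta = ktime_to_ns(ktime_sub(hard, soft));
            __hrtimer_start_range_ns(timer, soft, delta,
                                     HRTIMER_MODE_ABS_PINNED, 0);
    }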
229 | #ifdef CONFIG_RT_GROUP_SCHED | 107 | DEFINE_MUTEX(sched_domains_mutex); |
230 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | 108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
231 | { | ||
232 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
233 | } | ||
234 | #endif | ||
235 | |||
236 | /* | ||
237 | * sched_domains_mutex serializes calls to init_sched_domains, | ||
238 | * detach_destroy_domains and partition_sched_domains. | ||
239 | */ | ||
240 | static DEFINE_MUTEX(sched_domains_mutex); | ||
241 | |||
242 | #ifdef CONFIG_CGROUP_SCHED | ||
243 | |||
244 | #include <linux/cgroup.h> | ||
245 | |||
246 | struct cfs_rq; | ||
247 | |||
248 | static LIST_HEAD(task_groups); | ||
249 | |||
250 | /* task group related information */ | ||
251 | struct task_group { | ||
252 | struct cgroup_subsys_state css; | ||
253 | |||
254 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
255 | /* schedulable entities of this group on each cpu */ | ||
256 | struct sched_entity **se; | ||
257 | /* runqueue "owned" by this group on each cpu */ | ||
258 | struct cfs_rq **cfs_rq; | ||
259 | unsigned long shares; | ||
260 | |||
261 | atomic_t load_weight; | ||
262 | #endif | ||
263 | |||
264 | #ifdef CONFIG_RT_GROUP_SCHED | ||
265 | struct sched_rt_entity **rt_se; | ||
266 | struct rt_rq **rt_rq; | ||
267 | |||
268 | struct rt_bandwidth rt_bandwidth; | ||
269 | #endif | ||
270 | |||
271 | struct rcu_head rcu; | ||
272 | struct list_head list; | ||
273 | |||
274 | struct task_group *parent; | ||
275 | struct list_head siblings; | ||
276 | struct list_head children; | ||
277 | |||
278 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
279 | struct autogroup *autogroup; | ||
280 | #endif | ||
281 | }; | ||
282 | |||
283 | /* task_group_lock serializes the addition/removal of task groups */ | ||
284 | static DEFINE_SPINLOCK(task_group_lock); | ||
285 | |||
286 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
287 | |||
288 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
289 | |||
290 | /* | ||
291 | * A weight of 0 or 1 can cause arithmetics problems. | ||
292 | * A weight of a cfs_rq is the sum of weights of which entities | ||
293 | * are queued on this cfs_rq, so a weight of a entity should not be | ||
294 | * too large, so as the shares value of a task group. | ||
295 | * (The default weight is 1024 - so there's no practical | ||
296 | * limitation from this.) | ||
297 | */ | ||
298 | #define MIN_SHARES (1UL << 1) | ||
299 | #define MAX_SHARES (1UL << 18) | ||
300 | |||
301 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | ||
302 | #endif | ||
303 | |||
304 | /* Default task group. | ||
305 | * Every task in system belong to this group at bootup. | ||
306 | */ | ||
307 | struct task_group root_task_group; | ||
308 | |||
309 | #endif /* CONFIG_CGROUP_SCHED */ | ||
310 | |||
311 | /* CFS-related fields in a runqueue */ | ||
312 | struct cfs_rq { | ||
313 | struct load_weight load; | ||
314 | unsigned long nr_running; | ||
315 | |||
316 | u64 exec_clock; | ||
317 | u64 min_vruntime; | ||
318 | #ifndef CONFIG_64BIT | ||
319 | u64 min_vruntime_copy; | ||
320 | #endif | ||
321 | |||
322 | struct rb_root tasks_timeline; | ||
323 | struct rb_node *rb_leftmost; | ||
324 | |||
325 | struct list_head tasks; | ||
326 | struct list_head *balance_iterator; | ||
327 | |||
328 | /* | ||
329 | * 'curr' points to currently running entity on this cfs_rq. | ||
330 | * It is set to NULL otherwise (i.e when none are currently running). | ||
331 | */ | ||
332 | struct sched_entity *curr, *next, *last, *skip; | ||
333 | |||
334 | #ifdef CONFIG_SCHED_DEBUG | ||
335 | unsigned int nr_spread_over; | ||
336 | #endif | ||
337 | |||
338 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
339 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
340 | |||
341 | /* | ||
342 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
343 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
344 | * (like users, containers etc.) | ||
345 | * | ||
346 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
347 | * list is used during load balance. | ||
348 | */ | ||
349 | int on_list; | ||
350 | struct list_head leaf_cfs_rq_list; | ||
351 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
352 | |||
353 | #ifdef CONFIG_SMP | ||
354 | /* | ||
355 | * the part of load.weight contributed by tasks | ||
356 | */ | ||
357 | unsigned long task_weight; | ||
358 | |||
359 | /* | ||
360 | * h_load = weight * f(tg) | ||
361 | * | ||
362 | * Where f(tg) is the recursive weight fraction assigned to | ||
363 | * this group. | ||
364 | */ | ||
365 | unsigned long h_load; | ||
366 | |||
367 | /* | ||
368 | * Maintaining per-cpu shares distribution for group scheduling | ||
369 | * | ||
370 | * load_stamp is the last time we updated the load average | ||
371 | * load_last is the last time we updated the load average and saw load | ||
372 | * load_unacc_exec_time is currently unaccounted execution time | ||
373 | */ | ||
374 | u64 load_avg; | ||
375 | u64 load_period; | ||
376 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
377 | |||
378 | unsigned long load_contribution; | ||
379 | #endif | ||
380 | #endif | ||
381 | }; | ||
382 | |||
383 | /* Real-Time classes' related field in a runqueue: */ | ||
384 | struct rt_rq { | ||
385 | struct rt_prio_array active; | ||
386 | unsigned long rt_nr_running; | ||
387 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
388 | struct { | ||
389 | int curr; /* highest queued rt task prio */ | ||
390 | #ifdef CONFIG_SMP | ||
391 | int next; /* next highest */ | ||
392 | #endif | ||
393 | } highest_prio; | ||
394 | #endif | ||
395 | #ifdef CONFIG_SMP | ||
396 | unsigned long rt_nr_migratory; | ||
397 | unsigned long rt_nr_total; | ||
398 | int overloaded; | ||
399 | struct plist_head pushable_tasks; | ||
400 | #endif | ||
401 | int rt_throttled; | ||
402 | u64 rt_time; | ||
403 | u64 rt_runtime; | ||
404 | /* Nests inside the rq lock: */ | ||
405 | raw_spinlock_t rt_runtime_lock; | ||
406 | |||
407 | #ifdef CONFIG_RT_GROUP_SCHED | ||
408 | unsigned long rt_nr_boosted; | ||
409 | |||
410 | struct rq *rq; | ||
411 | struct list_head leaf_rt_rq_list; | ||
412 | struct task_group *tg; | ||
413 | #endif | ||
414 | }; | ||
415 | |||
416 | #ifdef CONFIG_SMP | ||
417 | |||
418 | /* | ||
419 | * We add the notion of a root-domain which will be used to define per-domain | ||
420 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
421 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
422 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
423 | * object. | ||
424 | * | ||
425 | */ | ||
426 | struct root_domain { | ||
427 | atomic_t refcount; | ||
428 | atomic_t rto_count; | ||
429 | struct rcu_head rcu; | ||
430 | cpumask_var_t span; | ||
431 | cpumask_var_t online; | ||
432 | |||
433 | /* | ||
434 | * The "RT overload" flag: it gets set if a CPU has more than | ||
435 | * one runnable RT task. | ||
436 | */ | ||
437 | cpumask_var_t rto_mask; | ||
438 | struct cpupri cpupri; | ||
439 | }; | ||
440 | |||
441 | /* | ||
442 | * By default the system creates a single root-domain with all cpus as | ||
443 | * members (mimicking the global state we have today). | ||
444 | */ | ||
445 | static struct root_domain def_root_domain; | ||
446 | |||
447 | #endif /* CONFIG_SMP */ | ||
448 | |||
449 | /* | ||
450 | * This is the main, per-CPU runqueue data structure. | ||
451 | * | ||
452 | * Locking rule: those places that want to lock multiple runqueues | ||
453 | * (such as the load balancing or the thread migration code), lock | ||
454 | * acquire operations must be ordered by ascending &runqueue. | ||
455 | */ | ||
456 | struct rq { | ||
457 | /* runqueue lock: */ | ||
458 | raw_spinlock_t lock; | ||
459 | |||
460 | /* | ||
461 | * nr_running and cpu_load should be in the same cacheline because | ||
462 | * remote CPUs use both these fields when doing load calculation. | ||
463 | */ | ||
464 | unsigned long nr_running; | ||
465 | #define CPU_LOAD_IDX_MAX 5 | ||
466 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
467 | unsigned long last_load_update_tick; | ||
468 | #ifdef CONFIG_NO_HZ | ||
469 | u64 nohz_stamp; | ||
470 | unsigned char nohz_balance_kick; | ||
471 | #endif | ||
472 | int skip_clock_update; | ||
473 | |||
474 | /* capture load from *all* tasks on this cpu: */ | ||
475 | struct load_weight load; | ||
476 | unsigned long nr_load_updates; | ||
477 | u64 nr_switches; | ||
478 | |||
479 | struct cfs_rq cfs; | ||
480 | struct rt_rq rt; | ||
481 | |||
482 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
483 | /* list of leaf cfs_rq on this cpu: */ | ||
484 | struct list_head leaf_cfs_rq_list; | ||
485 | #endif | ||
486 | #ifdef CONFIG_RT_GROUP_SCHED | ||
487 | struct list_head leaf_rt_rq_list; | ||
488 | #endif | ||
489 | |||
490 | /* | ||
491 | * This is part of a global counter where only the total sum | ||
492 | * over all CPUs matters. A task can increase this counter on | ||
493 | * one CPU and if it got migrated afterwards it may decrease | ||
494 | * it on another CPU. Always updated under the runqueue lock: | ||
495 | */ | ||
496 | unsigned long nr_uninterruptible; | ||
497 | |||
498 | struct task_struct *curr, *idle, *stop; | ||
499 | unsigned long next_balance; | ||
500 | struct mm_struct *prev_mm; | ||
501 | |||
502 | u64 clock; | ||
503 | u64 clock_task; | ||
504 | |||
505 | atomic_t nr_iowait; | ||
506 | |||
507 | #ifdef CONFIG_SMP | ||
508 | struct root_domain *rd; | ||
509 | struct sched_domain *sd; | ||
510 | |||
511 | unsigned long cpu_power; | ||
512 | |||
513 | unsigned char idle_at_tick; | ||
514 | /* For active balancing */ | ||
515 | int post_schedule; | ||
516 | int active_balance; | ||
517 | int push_cpu; | ||
518 | struct cpu_stop_work active_balance_work; | ||
519 | /* cpu of this runqueue: */ | ||
520 | int cpu; | ||
521 | int online; | ||
522 | |||
523 | unsigned long avg_load_per_task; | ||
524 | |||
525 | u64 rt_avg; | ||
526 | u64 age_stamp; | ||
527 | u64 idle_stamp; | ||
528 | u64 avg_idle; | ||
529 | #endif | ||
530 | |||
531 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
532 | u64 prev_irq_time; | ||
533 | #endif | ||
534 | #ifdef CONFIG_PARAVIRT | ||
535 | u64 prev_steal_time; | ||
536 | #endif | ||
537 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
538 | u64 prev_steal_time_rq; | ||
539 | #endif | ||
540 | |||
541 | /* calc_load related fields */ | ||
542 | unsigned long calc_load_update; | ||
543 | long calc_load_active; | ||
544 | |||
545 | #ifdef CONFIG_SCHED_HRTICK | ||
546 | #ifdef CONFIG_SMP | ||
547 | int hrtick_csd_pending; | ||
548 | struct call_single_data hrtick_csd; | ||
549 | #endif | ||
550 | struct hrtimer hrtick_timer; | ||
551 | #endif | ||
552 | |||
553 | #ifdef CONFIG_SCHEDSTATS | ||
554 | /* latency stats */ | ||
555 | struct sched_info rq_sched_info; | ||
556 | unsigned long long rq_cpu_time; | ||
557 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | ||
558 | |||
559 | /* sys_sched_yield() stats */ | ||
560 | unsigned int yld_count; | ||
561 | |||
562 | /* schedule() stats */ | ||
563 | unsigned int sched_switch; | ||
564 | unsigned int sched_count; | ||
565 | unsigned int sched_goidle; | ||
566 | |||
567 | /* try_to_wake_up() stats */ | ||
568 | unsigned int ttwu_count; | ||
569 | unsigned int ttwu_local; | ||
570 | #endif | ||
571 | |||
572 | #ifdef CONFIG_SMP | ||
573 | struct task_struct *wake_list; | ||
574 | #endif | ||
575 | }; | ||
576 | |||
577 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | ||
578 | |||
579 | |||
580 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
581 | |||
582 | static inline int cpu_of(struct rq *rq) | ||
583 | { | ||
584 | #ifdef CONFIG_SMP | ||
585 | return rq->cpu; | ||
586 | #else | ||
587 | return 0; | ||
588 | #endif | ||
589 | } | ||
590 | |||
591 | #define rcu_dereference_check_sched_domain(p) \ | ||
592 | rcu_dereference_check((p), \ | ||
593 | lockdep_is_held(&sched_domains_mutex)) | ||
594 | |||
595 | /* | ||
596 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
597 | * See detach_destroy_domains: synchronize_sched for details. | ||
598 | * | ||
599 | * The domain tree of any CPU may only be accessed from within | ||
600 | * preempt-disabled sections. | ||
601 | */ | ||
602 | #define for_each_domain(cpu, __sd) \ | ||
603 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | ||
604 | |||
605 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
606 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
607 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
608 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
609 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
610 | |||
611 | #ifdef CONFIG_CGROUP_SCHED | ||
612 | |||
613 | /* | ||
614 | * Return the group to which this tasks belongs. | ||
615 | * | ||
616 | * We use task_subsys_state_check() and extend the RCU verification with | ||
617 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | ||
618 | * task it moves into the cgroup. Therefore by holding either of those locks, | ||
619 | * we pin the task to the current cgroup. | ||
620 | */ | ||
621 | static inline struct task_group *task_group(struct task_struct *p) | ||
622 | { | ||
623 | struct task_group *tg; | ||
624 | struct cgroup_subsys_state *css; | ||
625 | |||
626 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
627 | lockdep_is_held(&p->pi_lock) || | ||
628 | lockdep_is_held(&task_rq(p)->lock)); | ||
629 | tg = container_of(css, struct task_group, css); | ||
630 | |||
631 | return autogroup_task_group(p, tg); | ||
632 | } | ||
633 | |||
634 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
635 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | ||
636 | { | ||
637 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
638 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | ||
639 | p->se.parent = task_group(p)->se[cpu]; | ||
640 | #endif | ||
641 | |||
642 | #ifdef CONFIG_RT_GROUP_SCHED | ||
643 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
644 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
645 | #endif | ||
646 | } | ||
647 | |||
648 | #else /* CONFIG_CGROUP_SCHED */ | ||
649 | |||
650 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | ||
651 | static inline struct task_group *task_group(struct task_struct *p) | ||
652 | { | ||
653 | return NULL; | ||
654 | } | ||
655 | |||
656 | #endif /* CONFIG_CGROUP_SCHED */ | ||
657 | 109 | ||
658 | static void update_rq_clock_task(struct rq *rq, s64 delta); | 110 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
659 | 111 | ||
660 | static void update_rq_clock(struct rq *rq) | 112 | void update_rq_clock(struct rq *rq) |
661 | { | 113 | { |
662 | s64 delta; | 114 | s64 delta; |
663 | 115 | ||
@@ -670,44 +122,14 @@ static void update_rq_clock(struct rq *rq) | |||
670 | } | 122 | } |
671 | 123 | ||
672 | /* | 124 | /* |
673 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
674 | */ | ||
675 | #ifdef CONFIG_SCHED_DEBUG | ||
676 | # define const_debug __read_mostly | ||
677 | #else | ||
678 | # define const_debug static const | ||
679 | #endif | ||
680 | |||
681 | /** | ||
682 | * runqueue_is_locked - Returns true if the current cpu runqueue is locked | ||
683 | * @cpu: the processor in question. | ||
684 | * | ||
685 | * This interface allows printk to be called with the runqueue lock | ||
686 | * held and know whether or not it is OK to wake up the klogd. | ||
687 | */ | ||
688 | int runqueue_is_locked(int cpu) | ||
689 | { | ||
690 | return raw_spin_is_locked(&cpu_rq(cpu)->lock); | ||
691 | } | ||
692 | |||
693 | /* | ||
694 | * Debugging: various feature bits | 125 | * Debugging: various feature bits |
695 | */ | 126 | */ |
696 | 127 | ||
697 | #define SCHED_FEAT(name, enabled) \ | 128 | #define SCHED_FEAT(name, enabled) \ |
698 | __SCHED_FEAT_##name , | ||
699 | |||
700 | enum { | ||
701 | #include "sched_features.h" | ||
702 | }; | ||
703 | |||
704 | #undef SCHED_FEAT | ||
705 | |||
706 | #define SCHED_FEAT(name, enabled) \ | ||
707 | (1UL << __SCHED_FEAT_##name) * enabled | | 129 | (1UL << __SCHED_FEAT_##name) * enabled | |
708 | 130 | ||
709 | const_debug unsigned int sysctl_sched_features = | 131 | const_debug unsigned int sysctl_sched_features = |
710 | #include "sched_features.h" | 132 | #include "features.h" |
711 | 0; | 133 | 0; |
712 | 134 | ||
713 | #undef SCHED_FEAT | 135 | #undef SCHED_FEAT |
@@ -717,7 +139,7 @@ const_debug unsigned int sysctl_sched_features = | |||
717 | #name , | 139 | #name , |
718 | 140 | ||
719 | static __read_mostly char *sched_feat_names[] = { | 141 | static __read_mostly char *sched_feat_names[] = { |
720 | #include "sched_features.h" | 142 | #include "features.h" |
721 | NULL | 143 | NULL |
722 | }; | 144 | }; |
723 | 145 | ||
@@ -727,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
727 | { | 149 | { |
728 | int i; | 150 | int i; |
729 | 151 | ||
730 | for (i = 0; sched_feat_names[i]; i++) { | 152 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
731 | if (!(sysctl_sched_features & (1UL << i))) | 153 | if (!(sysctl_sched_features & (1UL << i))) |
732 | seq_puts(m, "NO_"); | 154 | seq_puts(m, "NO_"); |
733 | seq_printf(m, "%s ", sched_feat_names[i]); | 155 | seq_printf(m, "%s ", sched_feat_names[i]); |
@@ -737,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v) | |||
737 | return 0; | 159 | return 0; |
738 | } | 160 | } |
739 | 161 | ||
162 | #ifdef HAVE_JUMP_LABEL | ||
163 | |||
164 | #define jump_label_key__true jump_label_key_enabled | ||
165 | #define jump_label_key__false jump_label_key_disabled | ||
166 | |||
167 | #define SCHED_FEAT(name, enabled) \ | ||
168 | jump_label_key__##enabled , | ||
169 | |||
170 | struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { | ||
171 | #include "features.h" | ||
172 | }; | ||
173 | |||
174 | #undef SCHED_FEAT | ||
175 | |||
176 | static void sched_feat_disable(int i) | ||
177 | { | ||
178 | if (jump_label_enabled(&sched_feat_keys[i])) | ||
179 | jump_label_dec(&sched_feat_keys[i]); | ||
180 | } | ||
181 | |||
182 | static void sched_feat_enable(int i) | ||
183 | { | ||
184 | if (!jump_label_enabled(&sched_feat_keys[i])) | ||
185 | jump_label_inc(&sched_feat_keys[i]); | ||
186 | } | ||
187 | #else | ||
188 | static void sched_feat_disable(int i) { }; | ||
189 | static void sched_feat_enable(int i) { }; | ||
190 | #endif /* HAVE_JUMP_LABEL */ | ||
191 | |||
740 | static ssize_t | 192 | static ssize_t |
741 | sched_feat_write(struct file *filp, const char __user *ubuf, | 193 | sched_feat_write(struct file *filp, const char __user *ubuf, |
742 | size_t cnt, loff_t *ppos) | 194 | size_t cnt, loff_t *ppos) |
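The HAVE_JUMP_LABEL block added above gives every scheduler feature bit a jump_label key, and sched_feat_enable()/sched_feat_disable() keep those keys in sync with sysctl_sched_features whenever a bit is written through the debugfs file. The payoff is on the read side, which is not in this hunk: with jump labels a feature test can compile down to a statically patched branch instead of a load and test. One plausible consumer-side definition, assumed rather than quoted from the patch:

    #ifdef HAVE_JUMP_LABEL
    /* patched no-op/jump, toggled by the jump_label_inc()/jump_label_dec() above */
    #define sched_feat(x) static_branch(&sched_feat_keys[__SCHED_FEAT_##x])
    #else
    /* fallback: ordinary bit test against the sysctl word */
    #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
    #endif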
@@ -760,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
760 | cmp += 3; | 212 | cmp += 3; |
761 | } | 213 | } |
762 | 214 | ||
763 | for (i = 0; sched_feat_names[i]; i++) { | 215 | for (i = 0; i < __SCHED_FEAT_NR; i++) { |
764 | if (strcmp(cmp, sched_feat_names[i]) == 0) { | 216 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
765 | if (neg) | 217 | if (neg) { |
766 | sysctl_sched_features &= ~(1UL << i); | 218 | sysctl_sched_features &= ~(1UL << i); |
767 | else | 219 | sched_feat_disable(i); |
220 | } else { | ||
768 | sysctl_sched_features |= (1UL << i); | 221 | sysctl_sched_features |= (1UL << i); |
222 | sched_feat_enable(i); | ||
223 | } | ||
769 | break; | 224 | break; |
770 | } | 225 | } |
771 | } | 226 | } |
772 | 227 | ||
773 | if (!sched_feat_names[i]) | 228 | if (i == __SCHED_FEAT_NR) |
774 | return -EINVAL; | 229 | return -EINVAL; |
775 | 230 | ||
776 | *ppos += cnt; | 231 | *ppos += cnt; |
@@ -799,10 +254,7 @@ static __init int sched_init_debug(void) | |||
799 | return 0; | 254 | return 0; |
800 | } | 255 | } |
801 | late_initcall(sched_init_debug); | 256 | late_initcall(sched_init_debug); |
802 | 257 | #endif /* CONFIG_SCHED_DEBUG */ | |
803 | #endif | ||
804 | |||
805 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
806 | 258 | ||
807 | /* | 259 | /* |
808 | * Number of tasks to iterate in a single balance run. | 260 | * Number of tasks to iterate in a single balance run. |
@@ -824,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | |||
824 | */ | 276 | */ |
825 | unsigned int sysctl_sched_rt_period = 1000000; | 277 | unsigned int sysctl_sched_rt_period = 1000000; |
826 | 278 | ||
827 | static __read_mostly int scheduler_running; | 279 | __read_mostly int scheduler_running; |
828 | 280 | ||
829 | /* | 281 | /* |
830 | * part of the period that we allow rt tasks to run in us. | 282 | * part of the period that we allow rt tasks to run in us. |
@@ -832,112 +284,7 @@ static __read_mostly int scheduler_running; | |||
832 | */ | 284 | */ |
833 | int sysctl_sched_rt_runtime = 950000; | 285 | int sysctl_sched_rt_runtime = 950000; |
834 | 286 | ||
835 | static inline u64 global_rt_period(void) | ||
836 | { | ||
837 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
838 | } | ||
839 | |||
840 | static inline u64 global_rt_runtime(void) | ||
841 | { | ||
842 | if (sysctl_sched_rt_runtime < 0) | ||
843 | return RUNTIME_INF; | ||
844 | |||
845 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
846 | } | ||
847 | |||
848 | #ifndef prepare_arch_switch | ||
849 | # define prepare_arch_switch(next) do { } while (0) | ||
850 | #endif | ||
851 | #ifndef finish_arch_switch | ||
852 | # define finish_arch_switch(prev) do { } while (0) | ||
853 | #endif | ||
854 | |||
855 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
856 | { | ||
857 | return rq->curr == p; | ||
858 | } | ||
859 | |||
860 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
861 | { | ||
862 | #ifdef CONFIG_SMP | ||
863 | return p->on_cpu; | ||
864 | #else | ||
865 | return task_current(rq, p); | ||
866 | #endif | ||
867 | } | ||
868 | |||
869 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
870 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
871 | { | ||
872 | #ifdef CONFIG_SMP | ||
873 | /* | ||
874 | * We can optimise this out completely for !SMP, because the | ||
875 | * SMP rebalancing from interrupt is the only thing that cares | ||
876 | * here. | ||
877 | */ | ||
878 | next->on_cpu = 1; | ||
879 | #endif | ||
880 | } | ||
881 | 287 | ||
882 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
883 | { | ||
884 | #ifdef CONFIG_SMP | ||
885 | /* | ||
886 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
887 | * We must ensure this doesn't happen until the switch is completely | ||
888 | * finished. | ||
889 | */ | ||
890 | smp_wmb(); | ||
891 | prev->on_cpu = 0; | ||
892 | #endif | ||
893 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
894 | /* this is a valid case when another task releases the spinlock */ | ||
895 | rq->lock.owner = current; | ||
896 | #endif | ||
897 | /* | ||
898 | * If we are tracking spinlock dependencies then we have to | ||
899 | * fix up the runqueue lock - which gets 'carried over' from | ||
900 | * prev into current: | ||
901 | */ | ||
902 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
903 | |||
904 | raw_spin_unlock_irq(&rq->lock); | ||
905 | } | ||
906 | |||
907 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
908 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
909 | { | ||
910 | #ifdef CONFIG_SMP | ||
911 | /* | ||
912 | * We can optimise this out completely for !SMP, because the | ||
913 | * SMP rebalancing from interrupt is the only thing that cares | ||
914 | * here. | ||
915 | */ | ||
916 | next->on_cpu = 1; | ||
917 | #endif | ||
918 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
919 | raw_spin_unlock_irq(&rq->lock); | ||
920 | #else | ||
921 | raw_spin_unlock(&rq->lock); | ||
922 | #endif | ||
923 | } | ||
924 | |||
925 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
926 | { | ||
927 | #ifdef CONFIG_SMP | ||
928 | /* | ||
929 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
930 | * We must ensure this doesn't happen until the switch is completely | ||
931 | * finished. | ||
932 | */ | ||
933 | smp_wmb(); | ||
934 | prev->on_cpu = 0; | ||
935 | #endif | ||
936 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
937 | local_irq_enable(); | ||
938 | #endif | ||
939 | } | ||
940 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
941 | 288 | ||
942 | /* | 289 | /* |
943 | * __task_rq_lock - lock the rq @p resides on. | 290 | * __task_rq_lock - lock the rq @p resides on. |
@@ -1020,20 +367,6 @@ static struct rq *this_rq_lock(void) | |||
1020 | * rq->lock. | 367 | * rq->lock. |
1021 | */ | 368 | */ |
1022 | 369 | ||
1023 | /* | ||
1024 | * Use hrtick when: | ||
1025 | * - enabled by features | ||
1026 | * - hrtimer is actually high res | ||
1027 | */ | ||
1028 | static inline int hrtick_enabled(struct rq *rq) | ||
1029 | { | ||
1030 | if (!sched_feat(HRTICK)) | ||
1031 | return 0; | ||
1032 | if (!cpu_active(cpu_of(rq))) | ||
1033 | return 0; | ||
1034 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
1035 | } | ||
1036 | |||
1037 | static void hrtick_clear(struct rq *rq) | 370 | static void hrtick_clear(struct rq *rq) |
1038 | { | 371 | { |
1039 | if (hrtimer_active(&rq->hrtick_timer)) | 372 | if (hrtimer_active(&rq->hrtick_timer)) |
@@ -1077,7 +410,7 @@ static void __hrtick_start(void *arg) | |||
1077 | * | 410 | * |
1078 | * called with rq->lock held and irqs disabled | 411 | * called with rq->lock held and irqs disabled |
1079 | */ | 412 | */ |
1080 | static void hrtick_start(struct rq *rq, u64 delay) | 413 | void hrtick_start(struct rq *rq, u64 delay) |
1081 | { | 414 | { |
1082 | struct hrtimer *timer = &rq->hrtick_timer; | 415 | struct hrtimer *timer = &rq->hrtick_timer; |
1083 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); | 416 | ktime_t time = ktime_add_ns(timer->base->get_time(), delay); |
@@ -1121,7 +454,7 @@ static __init void init_hrtick(void) | |||
1121 | * | 454 | * |
1122 | * called with rq->lock held and irqs disabled | 455 | * called with rq->lock held and irqs disabled |
1123 | */ | 456 | */ |
1124 | static void hrtick_start(struct rq *rq, u64 delay) | 457 | void hrtick_start(struct rq *rq, u64 delay) |
1125 | { | 458 | { |
1126 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, | 459 | __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, |
1127 | HRTIMER_MODE_REL_PINNED, 0); | 460 | HRTIMER_MODE_REL_PINNED, 0); |
@@ -1172,7 +505,7 @@ static inline void init_hrtick(void) | |||
1172 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 505 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
1173 | #endif | 506 | #endif |
1174 | 507 | ||
1175 | static void resched_task(struct task_struct *p) | 508 | void resched_task(struct task_struct *p) |
1176 | { | 509 | { |
1177 | int cpu; | 510 | int cpu; |
1178 | 511 | ||
@@ -1193,7 +526,7 @@ static void resched_task(struct task_struct *p) | |||
1193 | smp_send_reschedule(cpu); | 526 | smp_send_reschedule(cpu); |
1194 | } | 527 | } |
1195 | 528 | ||
1196 | static void resched_cpu(int cpu) | 529 | void resched_cpu(int cpu) |
1197 | { | 530 | { |
1198 | struct rq *rq = cpu_rq(cpu); | 531 | struct rq *rq = cpu_rq(cpu); |
1199 | unsigned long flags; | 532 | unsigned long flags; |
@@ -1272,14 +605,22 @@ void wake_up_idle_cpu(int cpu) | |||
1272 | smp_send_reschedule(cpu); | 605 | smp_send_reschedule(cpu); |
1273 | } | 606 | } |
1274 | 607 | ||
1275 | #endif /* CONFIG_NO_HZ */ | 608 | static inline bool got_nohz_idle_kick(void) |
609 | { | ||
610 | int cpu = smp_processor_id(); | ||
611 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | ||
612 | } | ||
1276 | 613 | ||
1277 | static u64 sched_avg_period(void) | 614 | #else /* CONFIG_NO_HZ */ |
615 | |||
616 | static inline bool got_nohz_idle_kick(void) | ||
1278 | { | 617 | { |
1279 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 618 | return false; |
1280 | } | 619 | } |
1281 | 620 | ||
1282 | static void sched_avg_update(struct rq *rq) | 621 | #endif /* CONFIG_NO_HZ */ |
622 | |||
623 | void sched_avg_update(struct rq *rq) | ||
1283 | { | 624 | { |
1284 | s64 period = sched_avg_period(); | 625 | s64 period = sched_avg_period(); |
1285 | 626 | ||
@@ -1295,200 +636,34 @@ static void sched_avg_update(struct rq *rq) | |||
1295 | } | 636 | } |
1296 | } | 637 | } |
1297 | 638 | ||
1298 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1299 | { | ||
1300 | rq->rt_avg += rt_delta; | ||
1301 | sched_avg_update(rq); | ||
1302 | } | ||
1303 | |||
1304 | #else /* !CONFIG_SMP */ | 639 | #else /* !CONFIG_SMP */ |
1305 | static void resched_task(struct task_struct *p) | 640 | void resched_task(struct task_struct *p) |
1306 | { | 641 | { |
1307 | assert_raw_spin_locked(&task_rq(p)->lock); | 642 | assert_raw_spin_locked(&task_rq(p)->lock); |
1308 | set_tsk_need_resched(p); | 643 | set_tsk_need_resched(p); |
1309 | } | 644 | } |
1310 | |||
1311 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1312 | { | ||
1313 | } | ||
1314 | |||
1315 | static void sched_avg_update(struct rq *rq) | ||
1316 | { | ||
1317 | } | ||
1318 | #endif /* CONFIG_SMP */ | 645 | #endif /* CONFIG_SMP */ |
1319 | 646 | ||
1320 | #if BITS_PER_LONG == 32 | 647 | #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ |
1321 | # define WMULT_CONST (~0UL) | 648 | (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) |
1322 | #else | ||
1323 | # define WMULT_CONST (1UL << 32) | ||
1324 | #endif | ||
1325 | |||
1326 | #define WMULT_SHIFT 32 | ||
1327 | |||
1328 | /* | ||
1329 | * Shift right and round: | ||
1330 | */ | ||
1331 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
1332 | |||
1333 | /* | ||
1334 | * delta *= weight / lw | ||
1335 | */ | ||
1336 | static unsigned long | ||
1337 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
1338 | struct load_weight *lw) | ||
1339 | { | ||
1340 | u64 tmp; | ||
1341 | |||
1342 | /* | ||
1343 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
1344 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
1345 | * 2^SCHED_LOAD_RESOLUTION. | ||
1346 | */ | ||
1347 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
1348 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
1349 | else | ||
1350 | tmp = (u64)delta_exec; | ||
1351 | |||
1352 | if (!lw->inv_weight) { | ||
1353 | unsigned long w = scale_load_down(lw->weight); | ||
1354 | |||
1355 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
1356 | lw->inv_weight = 1; | ||
1357 | else if (unlikely(!w)) | ||
1358 | lw->inv_weight = WMULT_CONST; | ||
1359 | else | ||
1360 | lw->inv_weight = WMULT_CONST / w; | ||
1361 | } | ||
1362 | |||
1363 | /* | ||
1364 | * Check whether we'd overflow the 64-bit multiplication: | ||
1365 | */ | ||
1366 | if (unlikely(tmp > WMULT_CONST)) | ||
1367 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
1368 | WMULT_SHIFT/2); | ||
1369 | else | ||
1370 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
1371 | |||
1372 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
1373 | } | ||
1374 | |||
1375 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
1376 | { | ||
1377 | lw->weight += inc; | ||
1378 | lw->inv_weight = 0; | ||
1379 | } | ||
1380 | |||
1381 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
1382 | { | ||
1383 | lw->weight -= dec; | ||
1384 | lw->inv_weight = 0; | ||
1385 | } | ||
1386 | |||
1387 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1388 | { | ||
1389 | lw->weight = w; | ||
1390 | lw->inv_weight = 0; | ||
1391 | } | ||
1392 | |||
1393 | /* | 649 | /* |
1394 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 650 | * Iterate task_group tree rooted at *from, calling @down when first entering a |
1395 | * of tasks with abnormal "nice" values across CPUs the contribution that | 651 | * node and @up when leaving it for the final time. |
1396 | * each task makes to its run queue's load is weighted according to its | ||
1397 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
1398 | * scaled version of the new time slice allocation that they receive on time | ||
1399 | * slice expiry etc. | ||
1400 | */ | ||
1401 | |||
1402 | #define WEIGHT_IDLEPRIO 3 | ||
1403 | #define WMULT_IDLEPRIO 1431655765 | ||
1404 | |||
1405 | /* | ||
1406 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
1407 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
1408 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
1409 | * that remained on nice 0. | ||
1410 | * | 652 | * |
1411 | * The "10% effect" is relative and cumulative: from _any_ nice level, | 653 | * Caller must hold rcu_lock or sufficient equivalent. |
1412 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | ||
1413 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
1414 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
1415 | * the relative distance between them is ~25%.) | ||
1416 | */ | ||
1417 | static const int prio_to_weight[40] = { | ||
1418 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
1419 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
1420 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
1421 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
1422 | /* 0 */ 1024, 820, 655, 526, 423, | ||
1423 | /* 5 */ 335, 272, 215, 172, 137, | ||
1424 | /* 10 */ 110, 87, 70, 56, 45, | ||
1425 | /* 15 */ 36, 29, 23, 18, 15, | ||
1426 | }; | ||
1427 | |||
1428 | /* | ||
1429 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
1430 | * | ||
1431 | * In cases where the weight does not change often, we can use the | ||
1432 | * precalculated inverse to speed up arithmetics by turning divisions | ||
1433 | * into multiplications: | ||
1434 | */ | ||
1435 | static const u32 prio_to_wmult[40] = { | ||
1436 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
1437 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
1438 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
1439 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
1440 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
1441 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
1442 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
1443 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
1444 | }; | ||
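The two tables above encode weight(nice) ≈ 1024 / 1.25^nice and its inverse 2^32 / weight, which is why calc_delta_mine() can turn delta * weight / lw->weight into a multiply and a shift instead of a division. Quick check against the tables: nice 1 gives 1024 / 1.25 ≈ 819, listed as 820, and 2^32 / 820 ≈ 5237765, exactly the prio_to_wmult entry. A small userspace-style illustration with assumed example values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t delta     = 6000000;            /* 6 ms of runtime, in ns */
            uint64_t w_nice0   = 1024, w_nice5 = 335;/* prio_to_weight[] entries */
            uint64_t inv_total = ((uint64_t)1 << 32) / (w_nice0 + w_nice5);

            /* nice-0 task's share: delta * weight / total, computed as
             * (delta * weight * inv_total) >> 32, roughly 75% of delta here */
            uint64_t share = (delta * w_nice0 * inv_total) >> 32;

            printf("share = %llu ns\n", (unsigned long long)share);
            return 0;
    }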
1445 | |||
1446 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
1447 | enum cpuacct_stat_index { | ||
1448 | CPUACCT_STAT_USER, /* ... user mode */ | ||
1449 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
1450 | |||
1451 | CPUACCT_STAT_NSTATS, | ||
1452 | }; | ||
1453 | |||
1454 | #ifdef CONFIG_CGROUP_CPUACCT | ||
1455 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
1456 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
1457 | enum cpuacct_stat_index idx, cputime_t val); | ||
1458 | #else | ||
1459 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
1460 | static inline void cpuacct_update_stats(struct task_struct *tsk, | ||
1461 | enum cpuacct_stat_index idx, cputime_t val) {} | ||
1462 | #endif | ||
1463 | |||
1464 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1465 | { | ||
1466 | update_load_add(&rq->load, load); | ||
1467 | } | ||
1468 | |||
1469 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1470 | { | ||
1471 | update_load_sub(&rq->load, load); | ||
1472 | } | ||
1473 | |||
1474 | #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) | ||
1475 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
1476 | |||
1477 | /* | ||
1478 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
1479 | * leaving it for the final time. | ||
1480 | */ | 654 | */ |
1481 | static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | 655 | int walk_tg_tree_from(struct task_group *from, |
656 | tg_visitor down, tg_visitor up, void *data) | ||
1482 | { | 657 | { |
1483 | struct task_group *parent, *child; | 658 | struct task_group *parent, *child; |
1484 | int ret; | 659 | int ret; |
1485 | 660 | ||
1486 | rcu_read_lock(); | 661 | parent = from; |
1487 | parent = &root_task_group; | 662 | |
1488 | down: | 663 | down: |
1489 | ret = (*down)(parent, data); | 664 | ret = (*down)(parent, data); |
1490 | if (ret) | 665 | if (ret) |
1491 | goto out_unlock; | 666 | goto out; |
1492 | list_for_each_entry_rcu(child, &parent->children, siblings) { | 667 | list_for_each_entry_rcu(child, &parent->children, siblings) { |
1493 | parent = child; | 668 | parent = child; |
1494 | goto down; | 669 | goto down; |
@@ -1497,273 +672,24 @@ up: | |||
1497 | continue; | 672 | continue; |
1498 | } | 673 | } |
1499 | ret = (*up)(parent, data); | 674 | ret = (*up)(parent, data); |
1500 | if (ret) | 675 | if (ret || parent == from) |
1501 | goto out_unlock; | 676 | goto out; |
1502 | 677 | ||
1503 | child = parent; | 678 | child = parent; |
1504 | parent = parent->parent; | 679 | parent = parent->parent; |
1505 | if (parent) | 680 | if (parent) |
1506 | goto up; | 681 | goto up; |
1507 | out_unlock: | 682 | out: |
1508 | rcu_read_unlock(); | ||
1509 | |||
1510 | return ret; | 683 | return ret; |
1511 | } | 684 | } |
1512 | 685 | ||
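
A hedged illustration of the @down/@up visitor contract implemented by walk_tg_tree_from() above: @down runs when a node is first entered, @up when it is left for the final time, and a non-zero return stops the walk. The toy node type, the callbacks and the recursion are stand-ins, not the kernel's task_group machinery:

#include <stdio.h>

struct node {
	const char *name;
	struct node *child[2];		/* toy tree: at most two children */
};

typedef int (*visitor)(struct node *, void *);

/* Recursive stand-in for the goto-based walk above. */
static int walk_from(struct node *n, visitor down, visitor up, void *data)
{
	int i, ret = (*down)(n, data);

	if (ret)
		return ret;
	for (i = 0; i < 2 && n->child[i]; i++) {
		ret = walk_from(n->child[i], down, up, data);
		if (ret)
			return ret;
	}
	return (*up)(n, data);
}

static int print_down(struct node *n, void *d) { (void)d; printf("enter %s\n", n->name); return 0; }
static int print_up(struct node *n, void *d)   { (void)d; printf("leave %s\n", n->name); return 0; }

int main(void)
{
	struct node a = { "a" }, b = { "b" };
	struct node root = { "root", { &a, &b } };

	return walk_from(&root, print_down, print_up, NULL);
}
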
1513 | static int tg_nop(struct task_group *tg, void *data) | 686 | int tg_nop(struct task_group *tg, void *data) |
1514 | { | 687 | { |
1515 | return 0; | 688 | return 0; |
1516 | } | 689 | } |
1517 | #endif | 690 | #endif |
1518 | 691 | ||
1519 | #ifdef CONFIG_SMP | 692 | void update_cpu_load(struct rq *this_rq); |
1520 | /* Used instead of source_load when we know the type == 0 */ | ||
1521 | static unsigned long weighted_cpuload(const int cpu) | ||
1522 | { | ||
1523 | return cpu_rq(cpu)->load.weight; | ||
1524 | } | ||
1525 | |||
1526 | /* | ||
1527 | * Return a low guess at the load of a migration-source cpu weighted | ||
1528 | * according to the scheduling class and "nice" value. | ||
1529 | * | ||
1530 | * We want to under-estimate the load of migration sources, to | ||
1531 | * balance conservatively. | ||
1532 | */ | ||
1533 | static unsigned long source_load(int cpu, int type) | ||
1534 | { | ||
1535 | struct rq *rq = cpu_rq(cpu); | ||
1536 | unsigned long total = weighted_cpuload(cpu); | ||
1537 | |||
1538 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1539 | return total; | ||
1540 | |||
1541 | return min(rq->cpu_load[type-1], total); | ||
1542 | } | ||
1543 | |||
1544 | /* | ||
1545 | * Return a high guess at the load of a migration-target cpu weighted | ||
1546 | * according to the scheduling class and "nice" value. | ||
1547 | */ | ||
1548 | static unsigned long target_load(int cpu, int type) | ||
1549 | { | ||
1550 | struct rq *rq = cpu_rq(cpu); | ||
1551 | unsigned long total = weighted_cpuload(cpu); | ||
1552 | |||
1553 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1554 | return total; | ||
1555 | |||
1556 | return max(rq->cpu_load[type-1], total); | ||
1557 | } | ||
1558 | |||
1559 | static unsigned long power_of(int cpu) | ||
1560 | { | ||
1561 | return cpu_rq(cpu)->cpu_power; | ||
1562 | } | ||
1563 | |||
1564 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1565 | |||
1566 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
1567 | { | ||
1568 | struct rq *rq = cpu_rq(cpu); | ||
1569 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
1570 | |||
1571 | if (nr_running) | ||
1572 | rq->avg_load_per_task = rq->load.weight / nr_running; | ||
1573 | else | ||
1574 | rq->avg_load_per_task = 0; | ||
1575 | |||
1576 | return rq->avg_load_per_task; | ||
1577 | } | ||
1578 | |||
1579 | #ifdef CONFIG_PREEMPT | ||
1580 | |||
1581 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
1582 | |||
1583 | /* | ||
1584 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | ||
1585 | * way at the expense of forcing extra atomic operations in all | ||
1586 | * invocations. This ensures that the double_lock is acquired using the | ||
1587 | * same underlying policy as the spinlock_t on this architecture, which | ||
1588 | * reduces latency compared to the unfair variant below. However, it | ||
1589 | * also adds more overhead and therefore may reduce throughput. | ||
1590 | */ | ||
1591 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1592 | __releases(this_rq->lock) | ||
1593 | __acquires(busiest->lock) | ||
1594 | __acquires(this_rq->lock) | ||
1595 | { | ||
1596 | raw_spin_unlock(&this_rq->lock); | ||
1597 | double_rq_lock(this_rq, busiest); | ||
1598 | |||
1599 | return 1; | ||
1600 | } | ||
1601 | |||
1602 | #else | ||
1603 | /* | ||
1604 | * Unfair double_lock_balance: Optimizes throughput at the expense of | ||
1605 | * latency by eliminating extra atomic operations when the locks are | ||
1606 | * already in proper order on entry. This favors lower cpu-ids and will | ||
1607 | * grant the double lock to lower cpus over higher ids under contention, | ||
1608 | * regardless of entry order into the function. | ||
1609 | */ | ||
1610 | static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1611 | __releases(this_rq->lock) | ||
1612 | __acquires(busiest->lock) | ||
1613 | __acquires(this_rq->lock) | ||
1614 | { | ||
1615 | int ret = 0; | ||
1616 | |||
1617 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | ||
1618 | if (busiest < this_rq) { | ||
1619 | raw_spin_unlock(&this_rq->lock); | ||
1620 | raw_spin_lock(&busiest->lock); | ||
1621 | raw_spin_lock_nested(&this_rq->lock, | ||
1622 | SINGLE_DEPTH_NESTING); | ||
1623 | ret = 1; | ||
1624 | } else | ||
1625 | raw_spin_lock_nested(&busiest->lock, | ||
1626 | SINGLE_DEPTH_NESTING); | ||
1627 | } | ||
1628 | return ret; | ||
1629 | } | ||
1630 | |||
1631 | #endif /* CONFIG_PREEMPT */ | ||
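
Both variants above boil down to one rule for taking two runqueue locks: when the opportunistic trylock fails, fall back to a fixed acquisition order (lower address first) so that two CPUs can never each hold one lock while waiting for the other. A userspace sketch of the same idea with pthread mutexes, assuming the caller already holds one of them; all names here are invented:

#include <pthread.h>
#include <stdio.h>

/* Take 'want' while already holding 'held', without ABBA deadlock.
 * If we have to drop 'held' to respect the address order, the caller
 * would normally need to revalidate whatever 'held' was protecting. */
static void double_lock(pthread_mutex_t *held, pthread_mutex_t *want)
{
	if (pthread_mutex_trylock(want) == 0)
		return;
	if (want < held) {
		pthread_mutex_unlock(held);
		pthread_mutex_lock(want);
		pthread_mutex_lock(held);
	} else {
		pthread_mutex_lock(want);
	}
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&a);
	double_lock(&a, &b);			/* now holds both */
	printf("both locks held\n");
	pthread_mutex_unlock(&b);
	pthread_mutex_unlock(&a);
	return 0;
}
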
1632 | |||
1633 | /* | ||
1634 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1635 | */ | ||
1636 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1637 | { | ||
1638 | if (unlikely(!irqs_disabled())) { | ||
1639 | /* printk() doesn't work well under rq->lock */ | ||
1640 | raw_spin_unlock(&this_rq->lock); | ||
1641 | BUG_ON(1); | ||
1642 | } | ||
1643 | |||
1644 | return _double_lock_balance(this_rq, busiest); | ||
1645 | } | ||
1646 | |||
1647 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
1648 | __releases(busiest->lock) | ||
1649 | { | ||
1650 | raw_spin_unlock(&busiest->lock); | ||
1651 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
1652 | } | ||
1653 | |||
1654 | /* | ||
1655 | * double_rq_lock - safely lock two runqueues | ||
1656 | * | ||
1657 | * Note this does not disable interrupts like task_rq_lock, | ||
1658 | * you need to do so manually before calling. | ||
1659 | */ | ||
1660 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1661 | __acquires(rq1->lock) | ||
1662 | __acquires(rq2->lock) | ||
1663 | { | ||
1664 | BUG_ON(!irqs_disabled()); | ||
1665 | if (rq1 == rq2) { | ||
1666 | raw_spin_lock(&rq1->lock); | ||
1667 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1668 | } else { | ||
1669 | if (rq1 < rq2) { | ||
1670 | raw_spin_lock(&rq1->lock); | ||
1671 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1672 | } else { | ||
1673 | raw_spin_lock(&rq2->lock); | ||
1674 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1675 | } | ||
1676 | } | ||
1677 | } | ||
1678 | |||
1679 | /* | ||
1680 | * double_rq_unlock - safely unlock two runqueues | ||
1681 | * | ||
1682 | * Note this does not restore interrupts like task_rq_unlock, | ||
1683 | * you need to do so manually after calling. | ||
1684 | */ | ||
1685 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1686 | __releases(rq1->lock) | ||
1687 | __releases(rq2->lock) | ||
1688 | { | ||
1689 | raw_spin_unlock(&rq1->lock); | ||
1690 | if (rq1 != rq2) | ||
1691 | raw_spin_unlock(&rq2->lock); | ||
1692 | else | ||
1693 | __release(rq2->lock); | ||
1694 | } | ||
1695 | |||
1696 | #else /* CONFIG_SMP */ | ||
1697 | |||
1698 | /* | ||
1699 | * double_rq_lock - safely lock two runqueues | ||
1700 | * | ||
1701 | * Note this does not disable interrupts like task_rq_lock, | ||
1702 | * you need to do so manually before calling. | ||
1703 | */ | ||
1704 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1705 | __acquires(rq1->lock) | ||
1706 | __acquires(rq2->lock) | ||
1707 | { | ||
1708 | BUG_ON(!irqs_disabled()); | ||
1709 | BUG_ON(rq1 != rq2); | ||
1710 | raw_spin_lock(&rq1->lock); | ||
1711 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1712 | } | ||
1713 | |||
1714 | /* | ||
1715 | * double_rq_unlock - safely unlock two runqueues | ||
1716 | * | ||
1717 | * Note this does not restore interrupts like task_rq_unlock, | ||
1718 | * you need to do so manually after calling. | ||
1719 | */ | ||
1720 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1721 | __releases(rq1->lock) | ||
1722 | __releases(rq2->lock) | ||
1723 | { | ||
1724 | BUG_ON(rq1 != rq2); | ||
1725 | raw_spin_unlock(&rq1->lock); | ||
1726 | __release(rq2->lock); | ||
1727 | } | ||
1728 | |||
1729 | #endif | ||
1730 | |||
1731 | static void calc_load_account_idle(struct rq *this_rq); | ||
1732 | static void update_sysctl(void); | ||
1733 | static int get_update_sysctl_factor(void); | ||
1734 | static void update_cpu_load(struct rq *this_rq); | ||
1735 | |||
1736 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
1737 | { | ||
1738 | set_task_rq(p, cpu); | ||
1739 | #ifdef CONFIG_SMP | ||
1740 | /* | ||
1741 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
1742 | * successfully executed on another CPU. We must ensure that updates of | ||
1743 | * per-task data have been completed by this moment. | ||
1744 | */ | ||
1745 | smp_wmb(); | ||
1746 | task_thread_info(p)->cpu = cpu; | ||
1747 | #endif | ||
1748 | } | ||
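
The smp_wmb() above is the usual publish pattern: finish every write to the payload, then make the value that other CPUs key off (here ->cpu) visible. A loose userspace analogue with C11 release/acquire ordering; this is only an illustration, since the kernel primitive is a write barrier paired with ordering on the reader side, not a C11 atomic:

#include <stdatomic.h>
#include <stdio.h>

struct payload { int a, b; };

static struct payload data;
static atomic_int published;

static void writer(void)
{
	data.a = 1;				/* per-task data updates ... */
	data.b = 2;
	/* ... must be visible before the published flag is. */
	atomic_store_explicit(&published, 1, memory_order_release);
}

static void reader(void)
{
	if (atomic_load_explicit(&published, memory_order_acquire))
		printf("%d %d\n", data.a, data.b);	/* always "1 2" */
}

int main(void)
{
	writer();
	reader();
	return 0;
}
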
1749 | |||
1750 | static const struct sched_class rt_sched_class; | ||
1751 | |||
1752 | #define sched_class_highest (&stop_sched_class) | ||
1753 | #define for_each_class(class) \ | ||
1754 | for (class = sched_class_highest; class; class = class->next) | ||
1755 | |||
1756 | #include "sched_stats.h" | ||
1757 | |||
1758 | static void inc_nr_running(struct rq *rq) | ||
1759 | { | ||
1760 | rq->nr_running++; | ||
1761 | } | ||
1762 | |||
1763 | static void dec_nr_running(struct rq *rq) | ||
1764 | { | ||
1765 | rq->nr_running--; | ||
1766 | } | ||
1767 | 693 | ||
1768 | static void set_load_weight(struct task_struct *p) | 694 | static void set_load_weight(struct task_struct *p) |
1769 | { | 695 | { |
@@ -1800,25 +726,23 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1800 | /* | 726 | /* |
1801 | * activate_task - move a task to the runqueue. | 727 | * activate_task - move a task to the runqueue. |
1802 | */ | 728 | */ |
1803 | static void activate_task(struct rq *rq, struct task_struct *p, int flags) | 729 | void activate_task(struct rq *rq, struct task_struct *p, int flags) |
1804 | { | 730 | { |
1805 | if (task_contributes_to_load(p)) | 731 | if (task_contributes_to_load(p)) |
1806 | rq->nr_uninterruptible--; | 732 | rq->nr_uninterruptible--; |
1807 | 733 | ||
1808 | enqueue_task(rq, p, flags); | 734 | enqueue_task(rq, p, flags); |
1809 | inc_nr_running(rq); | ||
1810 | } | 735 | } |
1811 | 736 | ||
1812 | /* | 737 | /* |
1813 | * deactivate_task - remove a task from the runqueue. | 738 | * deactivate_task - remove a task from the runqueue. |
1814 | */ | 739 | */ |
1815 | static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | 740 | void deactivate_task(struct rq *rq, struct task_struct *p, int flags) |
1816 | { | 741 | { |
1817 | if (task_contributes_to_load(p)) | 742 | if (task_contributes_to_load(p)) |
1818 | rq->nr_uninterruptible++; | 743 | rq->nr_uninterruptible++; |
1819 | 744 | ||
1820 | dequeue_task(rq, p, flags); | 745 | dequeue_task(rq, p, flags); |
1821 | dec_nr_running(rq); | ||
1822 | } | 746 | } |
1823 | 747 | ||
1824 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 748 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -2004,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
2004 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 928 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
2005 | static int irqtime_account_hi_update(void) | 929 | static int irqtime_account_hi_update(void) |
2006 | { | 930 | { |
2007 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 931 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
2008 | unsigned long flags; | 932 | unsigned long flags; |
2009 | u64 latest_ns; | 933 | u64 latest_ns; |
2010 | int ret = 0; | 934 | int ret = 0; |
2011 | 935 | ||
2012 | local_irq_save(flags); | 936 | local_irq_save(flags); |
2013 | latest_ns = this_cpu_read(cpu_hardirq_time); | 937 | latest_ns = this_cpu_read(cpu_hardirq_time); |
2014 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | 938 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) |
2015 | ret = 1; | 939 | ret = 1; |
2016 | local_irq_restore(flags); | 940 | local_irq_restore(flags); |
2017 | return ret; | 941 | return ret; |
@@ -2019,14 +943,14 @@ static int irqtime_account_hi_update(void) | |||
2019 | 943 | ||
2020 | static int irqtime_account_si_update(void) | 944 | static int irqtime_account_si_update(void) |
2021 | { | 945 | { |
2022 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 946 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
2023 | unsigned long flags; | 947 | unsigned long flags; |
2024 | u64 latest_ns; | 948 | u64 latest_ns; |
2025 | int ret = 0; | 949 | int ret = 0; |
2026 | 950 | ||
2027 | local_irq_save(flags); | 951 | local_irq_save(flags); |
2028 | latest_ns = this_cpu_read(cpu_softirq_time); | 952 | latest_ns = this_cpu_read(cpu_softirq_time); |
2029 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | 953 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) |
2030 | ret = 1; | 954 | ret = 1; |
2031 | local_irq_restore(flags); | 955 | local_irq_restore(flags); |
2032 | return ret; | 956 | return ret; |
@@ -2038,15 +962,6 @@ static int irqtime_account_si_update(void) | |||
2038 | 962 | ||
2039 | #endif | 963 | #endif |
2040 | 964 | ||
2041 | #include "sched_idletask.c" | ||
2042 | #include "sched_fair.c" | ||
2043 | #include "sched_rt.c" | ||
2044 | #include "sched_autogroup.c" | ||
2045 | #include "sched_stoptask.c" | ||
2046 | #ifdef CONFIG_SCHED_DEBUG | ||
2047 | # include "sched_debug.c" | ||
2048 | #endif | ||
2049 | |||
2050 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 965 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
2051 | { | 966 | { |
2052 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 967 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -2144,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
2144 | p->sched_class->prio_changed(rq, p, oldprio); | 1059 | p->sched_class->prio_changed(rq, p, oldprio); |
2145 | } | 1060 | } |
2146 | 1061 | ||
2147 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 1062 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
2148 | { | 1063 | { |
2149 | const struct sched_class *class; | 1064 | const struct sched_class *class; |
2150 | 1065 | ||
@@ -2170,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2170 | } | 1085 | } |
2171 | 1086 | ||
2172 | #ifdef CONFIG_SMP | 1087 | #ifdef CONFIG_SMP |
2173 | /* | ||
2174 | * Is this task likely cache-hot: | ||
2175 | */ | ||
2176 | static int | ||
2177 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
2178 | { | ||
2179 | s64 delta; | ||
2180 | |||
2181 | if (p->sched_class != &fair_sched_class) | ||
2182 | return 0; | ||
2183 | |||
2184 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2185 | return 0; | ||
2186 | |||
2187 | /* | ||
2188 | * Buddy candidates are cache hot: | ||
2189 | */ | ||
2190 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
2191 | (&p->se == cfs_rq_of(&p->se)->next || | ||
2192 | &p->se == cfs_rq_of(&p->se)->last)) | ||
2193 | return 1; | ||
2194 | |||
2195 | if (sysctl_sched_migration_cost == -1) | ||
2196 | return 1; | ||
2197 | if (sysctl_sched_migration_cost == 0) | ||
2198 | return 0; | ||
2199 | |||
2200 | delta = now - p->se.exec_start; | ||
2201 | |||
2202 | return delta < (s64)sysctl_sched_migration_cost; | ||
2203 | } | ||
2204 | |||
2205 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1088 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
2206 | { | 1089 | { |
2207 | #ifdef CONFIG_SCHED_DEBUG | 1090 | #ifdef CONFIG_SCHED_DEBUG |
@@ -2390,11 +1273,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2390 | 1273 | ||
2391 | /* Look for allowed, online CPU in same node. */ | 1274 | /* Look for allowed, online CPU in same node. */ |
2392 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) | 1275 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
2393 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 1276 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
2394 | return dest_cpu; | 1277 | return dest_cpu; |
2395 | 1278 | ||
2396 | /* Any allowed, online CPU? */ | 1279 | /* Any allowed, online CPU? */ |
2397 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); | 1280 | dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); |
2398 | if (dest_cpu < nr_cpu_ids) | 1281 | if (dest_cpu < nr_cpu_ids) |
2399 | return dest_cpu; | 1282 | return dest_cpu; |
2400 | 1283 | ||
@@ -2431,7 +1314,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | |||
2431 | * [ this allows ->select_task() to simply return task_cpu(p) and | 1314 | * [ this allows ->select_task() to simply return task_cpu(p) and |
2432 | * not worry about this generic constraint ] | 1315 | * not worry about this generic constraint ] |
2433 | */ | 1316 | */ |
2434 | if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || | 1317 | if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || |
2435 | !cpu_online(cpu))) | 1318 | !cpu_online(cpu))) |
2436 | cpu = select_fallback_rq(task_cpu(p), p); | 1319 | cpu = select_fallback_rq(task_cpu(p), p); |
2437 | 1320 | ||
@@ -2556,42 +1439,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
2556 | } | 1439 | } |
2557 | 1440 | ||
2558 | #ifdef CONFIG_SMP | 1441 | #ifdef CONFIG_SMP |
2559 | static void sched_ttwu_do_pending(struct task_struct *list) | 1442 | static void sched_ttwu_pending(void) |
2560 | { | 1443 | { |
2561 | struct rq *rq = this_rq(); | 1444 | struct rq *rq = this_rq(); |
1445 | struct llist_node *llist = llist_del_all(&rq->wake_list); | ||
1446 | struct task_struct *p; | ||
2562 | 1447 | ||
2563 | raw_spin_lock(&rq->lock); | 1448 | raw_spin_lock(&rq->lock); |
2564 | 1449 | ||
2565 | while (list) { | 1450 | while (llist) { |
2566 | struct task_struct *p = list; | 1451 | p = llist_entry(llist, struct task_struct, wake_entry); |
2567 | list = list->wake_entry; | 1452 | llist = llist_next(llist); |
2568 | ttwu_do_activate(rq, p, 0); | 1453 | ttwu_do_activate(rq, p, 0); |
2569 | } | 1454 | } |
2570 | 1455 | ||
2571 | raw_spin_unlock(&rq->lock); | 1456 | raw_spin_unlock(&rq->lock); |
2572 | } | 1457 | } |
2573 | 1458 | ||
2574 | #ifdef CONFIG_HOTPLUG_CPU | ||
2575 | |||
2576 | static void sched_ttwu_pending(void) | ||
2577 | { | ||
2578 | struct rq *rq = this_rq(); | ||
2579 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2580 | |||
2581 | if (!list) | ||
2582 | return; | ||
2583 | |||
2584 | sched_ttwu_do_pending(list); | ||
2585 | } | ||
2586 | |||
2587 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
2588 | |||
2589 | void scheduler_ipi(void) | 1459 | void scheduler_ipi(void) |
2590 | { | 1460 | { |
2591 | struct rq *rq = this_rq(); | 1461 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) |
2592 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2593 | |||
2594 | if (!list) | ||
2595 | return; | 1462 | return; |
2596 | 1463 | ||
2597 | /* | 1464 | /* |
@@ -2608,25 +1475,21 @@ void scheduler_ipi(void) | |||
2608 | * somewhat pessimize the simple resched case. | 1475 | * somewhat pessimize the simple resched case. |
2609 | */ | 1476 | */ |
2610 | irq_enter(); | 1477 | irq_enter(); |
2611 | sched_ttwu_do_pending(list); | 1478 | sched_ttwu_pending(); |
1479 | |||
1480 | /* | ||
1481 | * Check if someone kicked us for doing the nohz idle load balance. | ||
1482 | */ | ||
1483 | if (unlikely(got_nohz_idle_kick() && !need_resched())) { | ||
1484 | this_rq()->idle_balance = 1; | ||
1485 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
1486 | } | ||
2612 | irq_exit(); | 1487 | irq_exit(); |
2613 | } | 1488 | } |
2614 | 1489 | ||
2615 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | 1490 | static void ttwu_queue_remote(struct task_struct *p, int cpu) |
2616 | { | 1491 | { |
2617 | struct rq *rq = cpu_rq(cpu); | 1492 | if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) |
2618 | struct task_struct *next = rq->wake_list; | ||
2619 | |||
2620 | for (;;) { | ||
2621 | struct task_struct *old = next; | ||
2622 | |||
2623 | p->wake_entry = next; | ||
2624 | next = cmpxchg(&rq->wake_list, old, p); | ||
2625 | if (next == old) | ||
2626 | break; | ||
2627 | } | ||
2628 | |||
2629 | if (!next) | ||
2630 | smp_send_reschedule(cpu); | 1493 | smp_send_reschedule(cpu); |
2631 | } | 1494 | } |
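
The llist conversion above hinges on llist_add() reporting whether the list was empty beforehand, so only the waker that makes the list non-empty pays for the reschedule IPI. A userspace sketch of that "push, and kick only on the first insert" idea using C11 atomics; the kernel's llist API is not reproduced here:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct wake_node {
	struct wake_node *next;
	int tid;
};

static _Atomic(struct wake_node *) wake_list;

/* Lock-free push; returns true when the list was empty, i.e. the caller
 * should send the wakeup IPI (here we merely report it). */
static bool wake_push(struct wake_node *n)
{
	struct wake_node *old = atomic_load(&wake_list);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak(&wake_list, &old, n));

	return old == NULL;
}

int main(void)
{
	struct wake_node a = { .tid = 1 }, b = { .tid = 2 };

	printf("first push kicks : %d\n", wake_push(&a));	/* 1 */
	printf("second push kicks: %d\n", wake_push(&b));	/* 0 */
	return 0;
}
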
2632 | 1495 | ||
@@ -2648,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | |||
2648 | 1511 | ||
2649 | } | 1512 | } |
2650 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1513 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
1514 | |||
1515 | static inline int ttwu_share_cache(int this_cpu, int that_cpu) | ||
1516 | { | ||
1517 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | ||
1518 | } | ||
2651 | #endif /* CONFIG_SMP */ | 1519 | #endif /* CONFIG_SMP */ |
2652 | 1520 | ||
2653 | static void ttwu_queue(struct task_struct *p, int cpu) | 1521 | static void ttwu_queue(struct task_struct *p, int cpu) |
@@ -2655,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
2655 | struct rq *rq = cpu_rq(cpu); | 1523 | struct rq *rq = cpu_rq(cpu); |
2656 | 1524 | ||
2657 | #if defined(CONFIG_SMP) | 1525 | #if defined(CONFIG_SMP) |
2658 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | 1526 | if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { |
2659 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1527 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ |
2660 | ttwu_queue_remote(p, cpu); | 1528 | ttwu_queue_remote(p, cpu); |
2661 | return; | 1529 | return; |
@@ -2848,19 +1716,23 @@ void sched_fork(struct task_struct *p) | |||
2848 | p->state = TASK_RUNNING; | 1716 | p->state = TASK_RUNNING; |
2849 | 1717 | ||
2850 | /* | 1718 | /* |
1719 | * Make sure we do not leak PI boosting priority to the child. | ||
1720 | */ | ||
1721 | p->prio = current->normal_prio; | ||
1722 | |||
1723 | /* | ||
2851 | * Revert to default priority/policy on fork if requested. | 1724 | * Revert to default priority/policy on fork if requested. |
2852 | */ | 1725 | */ |
2853 | if (unlikely(p->sched_reset_on_fork)) { | 1726 | if (unlikely(p->sched_reset_on_fork)) { |
2854 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | 1727 | if (task_has_rt_policy(p)) { |
2855 | p->policy = SCHED_NORMAL; | 1728 | p->policy = SCHED_NORMAL; |
2856 | p->normal_prio = p->static_prio; | ||
2857 | } | ||
2858 | |||
2859 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2860 | p->static_prio = NICE_TO_PRIO(0); | 1729 | p->static_prio = NICE_TO_PRIO(0); |
2861 | p->normal_prio = p->static_prio; | 1730 | p->rt_priority = 0; |
2862 | set_load_weight(p); | 1731 | } else if (PRIO_TO_NICE(p->static_prio) < 0) |
2863 | } | 1732 | p->static_prio = NICE_TO_PRIO(0); |
1733 | |||
1734 | p->prio = p->normal_prio = __normal_prio(p); | ||
1735 | set_load_weight(p); | ||
2864 | 1736 | ||
2865 | /* | 1737 | /* |
2866 | * We don't need the reset flag anymore after the fork. It has | 1738 | * We don't need the reset flag anymore after the fork. It has |
@@ -2869,11 +1741,6 @@ void sched_fork(struct task_struct *p) | |||
2869 | p->sched_reset_on_fork = 0; | 1741 | p->sched_reset_on_fork = 0; |
2870 | } | 1742 | } |
2871 | 1743 | ||
2872 | /* | ||
2873 | * Make sure we do not leak PI boosting priority to the child. | ||
2874 | */ | ||
2875 | p->prio = current->normal_prio; | ||
2876 | |||
2877 | if (!rt_prio(p->prio)) | 1744 | if (!rt_prio(p->prio)) |
2878 | p->sched_class = &fair_sched_class; | 1745 | p->sched_class = &fair_sched_class; |
2879 | 1746 | ||
@@ -3070,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
3070 | local_irq_enable(); | 1937 | local_irq_enable(); |
3071 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | 1938 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ |
3072 | finish_lock_switch(rq, prev); | 1939 | finish_lock_switch(rq, prev); |
1940 | trace_sched_stat_sleeptime(current, rq->clock); | ||
3073 | 1941 | ||
3074 | fire_sched_in_preempt_notifiers(current); | 1942 | fire_sched_in_preempt_notifiers(current); |
3075 | if (mm) | 1943 | if (mm) |
@@ -3305,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
3305 | */ | 2173 | */ |
3306 | static atomic_long_t calc_load_tasks_idle; | 2174 | static atomic_long_t calc_load_tasks_idle; |
3307 | 2175 | ||
3308 | static void calc_load_account_idle(struct rq *this_rq) | 2176 | void calc_load_account_idle(struct rq *this_rq) |
3309 | { | 2177 | { |
3310 | long delta; | 2178 | long delta; |
3311 | 2179 | ||
@@ -3449,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks) | |||
3449 | */ | 2317 | */ |
3450 | } | 2318 | } |
3451 | #else | 2319 | #else |
3452 | static void calc_load_account_idle(struct rq *this_rq) | 2320 | void calc_load_account_idle(struct rq *this_rq) |
3453 | { | 2321 | { |
3454 | } | 2322 | } |
3455 | 2323 | ||
@@ -3592,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
3592 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2460 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
3593 | * every tick. We fix it up based on jiffies. | 2461 | * every tick. We fix it up based on jiffies. |
3594 | */ | 2462 | */ |
3595 | static void update_cpu_load(struct rq *this_rq) | 2463 | void update_cpu_load(struct rq *this_rq) |
3596 | { | 2464 | { |
3597 | unsigned long this_load = this_rq->load.weight; | 2465 | unsigned long this_load = this_rq->load.weight; |
3598 | unsigned long curr_jiffies = jiffies; | 2466 | unsigned long curr_jiffies = jiffies; |
@@ -3670,8 +2538,10 @@ unlock: | |||
3670 | #endif | 2538 | #endif |
3671 | 2539 | ||
3672 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 2540 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
2541 | DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); | ||
3673 | 2542 | ||
3674 | EXPORT_PER_CPU_SYMBOL(kstat); | 2543 | EXPORT_PER_CPU_SYMBOL(kstat); |
2544 | EXPORT_PER_CPU_SYMBOL(kernel_cpustat); | ||
3675 | 2545 | ||
3676 | /* | 2546 | /* |
3677 | * Return any ns on the sched_clock that have not yet been accounted in | 2547 | * Return any ns on the sched_clock that have not yet been accounted in |
@@ -3724,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3724 | return ns; | 2594 | return ns; |
3725 | } | 2595 | } |
3726 | 2596 | ||
2597 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2598 | struct cgroup_subsys cpuacct_subsys; | ||
2599 | struct cpuacct root_cpuacct; | ||
2600 | #endif | ||
2601 | |||
2602 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
2603 | u64 tmp) | ||
2604 | { | ||
2605 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2606 | struct kernel_cpustat *kcpustat; | ||
2607 | struct cpuacct *ca; | ||
2608 | #endif | ||
2609 | /* | ||
2610 | * Since all updates are sure to touch the root cgroup, we | ||
2611 | * get ourselves ahead and touch it first. If the root cgroup | ||
2612 | * is the only cgroup, then nothing else should be necessary. | ||
2613 | * | ||
2614 | */ | ||
2615 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
2616 | |||
2617 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2618 | if (unlikely(!cpuacct_subsys.active)) | ||
2619 | return; | ||
2620 | |||
2621 | rcu_read_lock(); | ||
2622 | ca = task_ca(p); | ||
2623 | while (ca && (ca != &root_cpuacct)) { | ||
2624 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
2625 | kcpustat->cpustat[index] += tmp; | ||
2626 | ca = parent_ca(ca); | ||
2627 | } | ||
2628 | rcu_read_unlock(); | ||
2629 | #endif | ||
2630 | } | ||
2631 | |||
2632 | |||
3727 | /* | 2633 | /* |
3728 | * Account user cpu time to a process. | 2634 | * Account user cpu time to a process. |
3729 | * @p: the process that the cpu time gets accounted to | 2635 | * @p: the process that the cpu time gets accounted to |
@@ -3733,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3733 | void account_user_time(struct task_struct *p, cputime_t cputime, | 2639 | void account_user_time(struct task_struct *p, cputime_t cputime, |
3734 | cputime_t cputime_scaled) | 2640 | cputime_t cputime_scaled) |
3735 | { | 2641 | { |
3736 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2642 | int index; |
3737 | cputime64_t tmp; | ||
3738 | 2643 | ||
3739 | /* Add user time to process. */ | 2644 | /* Add user time to process. */ |
3740 | p->utime = cputime_add(p->utime, cputime); | 2645 | p->utime += cputime; |
3741 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 2646 | p->utimescaled += cputime_scaled; |
3742 | account_group_user_time(p, cputime); | 2647 | account_group_user_time(p, cputime); |
3743 | 2648 | ||
2649 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
2650 | |||
3744 | /* Add user time to cpustat. */ | 2651 | /* Add user time to cpustat. */ |
3745 | tmp = cputime_to_cputime64(cputime); | 2652 | task_group_account_field(p, index, (__force u64) cputime); |
3746 | if (TASK_NICE(p) > 0) | ||
3747 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | ||
3748 | else | ||
3749 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
3750 | 2653 | ||
3751 | cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); | ||
3752 | /* Account for user time used */ | 2654 | /* Account for user time used */ |
3753 | acct_update_integrals(p); | 2655 | acct_update_integrals(p); |
3754 | } | 2656 | } |
@@ -3762,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime, | |||
3762 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | 2664 | static void account_guest_time(struct task_struct *p, cputime_t cputime, |
3763 | cputime_t cputime_scaled) | 2665 | cputime_t cputime_scaled) |
3764 | { | 2666 | { |
3765 | cputime64_t tmp; | 2667 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3766 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3767 | |||
3768 | tmp = cputime_to_cputime64(cputime); | ||
3769 | 2668 | ||
3770 | /* Add guest time to process. */ | 2669 | /* Add guest time to process. */ |
3771 | p->utime = cputime_add(p->utime, cputime); | 2670 | p->utime += cputime; |
3772 | p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); | 2671 | p->utimescaled += cputime_scaled; |
3773 | account_group_user_time(p, cputime); | 2672 | account_group_user_time(p, cputime); |
3774 | p->gtime = cputime_add(p->gtime, cputime); | 2673 | p->gtime += cputime; |
3775 | 2674 | ||
3776 | /* Add guest time to cpustat. */ | 2675 | /* Add guest time to cpustat. */ |
3777 | if (TASK_NICE(p) > 0) { | 2676 | if (TASK_NICE(p) > 0) { |
3778 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 2677 | cpustat[CPUTIME_NICE] += (__force u64) cputime; |
3779 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); | 2678 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; |
3780 | } else { | 2679 | } else { |
3781 | cpustat->user = cputime64_add(cpustat->user, tmp); | 2680 | cpustat[CPUTIME_USER] += (__force u64) cputime; |
3782 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 2681 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; |
3783 | } | 2682 | } |
3784 | } | 2683 | } |
3785 | 2684 | ||
@@ -3792,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3792 | */ | 2691 | */ |
3793 | static inline | 2692 | static inline |
3794 | void __account_system_time(struct task_struct *p, cputime_t cputime, | 2693 | void __account_system_time(struct task_struct *p, cputime_t cputime, |
3795 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | 2694 | cputime_t cputime_scaled, int index) |
3796 | { | 2695 | { |
3797 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3798 | |||
3799 | /* Add system time to process. */ | 2696 | /* Add system time to process. */ |
3800 | p->stime = cputime_add(p->stime, cputime); | 2697 | p->stime += cputime; |
3801 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | 2698 | p->stimescaled += cputime_scaled; |
3802 | account_group_system_time(p, cputime); | 2699 | account_group_system_time(p, cputime); |
3803 | 2700 | ||
3804 | /* Add system time to cpustat. */ | 2701 | /* Add system time to cpustat. */ |
3805 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | 2702 | task_group_account_field(p, index, (__force u64) cputime); |
3806 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3807 | 2703 | ||
3808 | /* Account for system time used */ | 2704 | /* Account for system time used */ |
3809 | acct_update_integrals(p); | 2705 | acct_update_integrals(p); |
@@ -3819,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, | |||
3819 | void account_system_time(struct task_struct *p, int hardirq_offset, | 2715 | void account_system_time(struct task_struct *p, int hardirq_offset, |
3820 | cputime_t cputime, cputime_t cputime_scaled) | 2716 | cputime_t cputime, cputime_t cputime_scaled) |
3821 | { | 2717 | { |
3822 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2718 | int index; |
3823 | cputime64_t *target_cputime64; | ||
3824 | 2719 | ||
3825 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 2720 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3826 | account_guest_time(p, cputime, cputime_scaled); | 2721 | account_guest_time(p, cputime, cputime_scaled); |
@@ -3828,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3828 | } | 2723 | } |
3829 | 2724 | ||
3830 | if (hardirq_count() - hardirq_offset) | 2725 | if (hardirq_count() - hardirq_offset) |
3831 | target_cputime64 = &cpustat->irq; | 2726 | index = CPUTIME_IRQ; |
3832 | else if (in_serving_softirq()) | 2727 | else if (in_serving_softirq()) |
3833 | target_cputime64 = &cpustat->softirq; | 2728 | index = CPUTIME_SOFTIRQ; |
3834 | else | 2729 | else |
3835 | target_cputime64 = &cpustat->system; | 2730 | index = CPUTIME_SYSTEM; |
3836 | 2731 | ||
3837 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); | 2732 | __account_system_time(p, cputime, cputime_scaled, index); |
3838 | } | 2733 | } |
3839 | 2734 | ||
3840 | /* | 2735 | /* |
@@ -3843,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3843 | */ | 2738 | */ |
3844 | void account_steal_time(cputime_t cputime) | 2739 | void account_steal_time(cputime_t cputime) |
3845 | { | 2740 | { |
3846 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2741 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3847 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | ||
3848 | 2742 | ||
3849 | cpustat->steal = cputime64_add(cpustat->steal, cputime64); | 2743 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; |
3850 | } | 2744 | } |
3851 | 2745 | ||
3852 | /* | 2746 | /* |
@@ -3855,14 +2749,13 @@ void account_steal_time(cputime_t cputime) | |||
3855 | */ | 2749 | */ |
3856 | void account_idle_time(cputime_t cputime) | 2750 | void account_idle_time(cputime_t cputime) |
3857 | { | 2751 | { |
3858 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2752 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3859 | cputime64_t cputime64 = cputime_to_cputime64(cputime); | ||
3860 | struct rq *rq = this_rq(); | 2753 | struct rq *rq = this_rq(); |
3861 | 2754 | ||
3862 | if (atomic_read(&rq->nr_iowait) > 0) | 2755 | if (atomic_read(&rq->nr_iowait) > 0) |
3863 | cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); | 2756 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; |
3864 | else | 2757 | else |
3865 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); | 2758 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; |
3866 | } | 2759 | } |
3867 | 2760 | ||
3868 | static __always_inline bool steal_account_process_tick(void) | 2761 | static __always_inline bool steal_account_process_tick(void) |
@@ -3912,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
3912 | struct rq *rq) | 2805 | struct rq *rq) |
3913 | { | 2806 | { |
3914 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 2807 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3915 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | 2808 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
3916 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3917 | 2809 | ||
3918 | if (steal_account_process_tick()) | 2810 | if (steal_account_process_tick()) |
3919 | return; | 2811 | return; |
3920 | 2812 | ||
3921 | if (irqtime_account_hi_update()) { | 2813 | if (irqtime_account_hi_update()) { |
3922 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 2814 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; |
3923 | } else if (irqtime_account_si_update()) { | 2815 | } else if (irqtime_account_si_update()) { |
3924 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 2816 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; |
3925 | } else if (this_cpu_ksoftirqd() == p) { | 2817 | } else if (this_cpu_ksoftirqd() == p) { |
3926 | /* | 2818 | /* |
3927 | * ksoftirqd time do not get accounted in cpu_softirq_time. | 2819 | * ksoftirqd time do not get accounted in cpu_softirq_time. |
@@ -3929,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
3929 | * Also, p->stime needs to be updated for ksoftirqd. | 2821 | * Also, p->stime needs to be updated for ksoftirqd. |
3930 | */ | 2822 | */ |
3931 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2823 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
3932 | &cpustat->softirq); | 2824 | CPUTIME_SOFTIRQ); |
3933 | } else if (user_tick) { | 2825 | } else if (user_tick) { |
3934 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2826 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3935 | } else if (p == rq->idle) { | 2827 | } else if (p == rq->idle) { |
@@ -3938,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
3938 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | 2830 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3939 | } else { | 2831 | } else { |
3940 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | 2832 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, |
3941 | &cpustat->system); | 2833 | CPUTIME_SYSTEM); |
3942 | } | 2834 | } |
3943 | } | 2835 | } |
3944 | 2836 | ||
@@ -4037,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4037 | 2929 | ||
4038 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | 2930 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
4039 | { | 2931 | { |
4040 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); | 2932 | cputime_t rtime, utime = p->utime, total = utime + p->stime; |
4041 | 2933 | ||
4042 | /* | 2934 | /* |
4043 | * Use CFS's precise accounting: | 2935 | * Use CFS's precise accounting: |
@@ -4045,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4045 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | 2937 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
4046 | 2938 | ||
4047 | if (total) { | 2939 | if (total) { |
4048 | u64 temp = rtime; | 2940 | u64 temp = (__force u64) rtime; |
4049 | 2941 | ||
4050 | temp *= utime; | 2942 | temp *= (__force u64) utime; |
4051 | do_div(temp, total); | 2943 | do_div(temp, (__force u32) total); |
4052 | utime = (cputime_t)temp; | 2944 | utime = (__force cputime_t) temp; |
4053 | } else | 2945 | } else |
4054 | utime = rtime; | 2946 | utime = rtime; |
4055 | 2947 | ||
@@ -4057,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4057 | * Compare with previous values, to keep monotonicity: | 2949 | * Compare with previous values, to keep monotonicity: |
4058 | */ | 2950 | */ |
4059 | p->prev_utime = max(p->prev_utime, utime); | 2951 | p->prev_utime = max(p->prev_utime, utime); |
4060 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); | 2952 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); |
4061 | 2953 | ||
4062 | *ut = p->prev_utime; | 2954 | *ut = p->prev_utime; |
4063 | *st = p->prev_stime; | 2955 | *st = p->prev_stime; |
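
task_times() above (and thread_group_times() in the next hunk) does the same arithmetic: split the precise CFS runtime in the ratio of the sampled utime/stime, then clamp against the previously reported values so the numbers never go backwards. A plain-integer sketch of just the proportional split, with cputime_t and do_div() replaced by ordinary 64-bit arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t utime = 300, stime = 100;	/* sampled ticks             */
	uint64_t rtime = 500;			/* precise runtime, in ticks */
	uint64_t total = utime + stime;

	/* Scale utime so utime' + stime' == rtime while keeping the ratio. */
	uint64_t utime_scaled = total ? rtime * utime / total : rtime;
	uint64_t stime_scaled = rtime - utime_scaled;

	printf("utime %llu -> %llu, stime %llu -> %llu\n",
	       (unsigned long long)utime, (unsigned long long)utime_scaled,
	       (unsigned long long)stime, (unsigned long long)stime_scaled);
	return 0;
}
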
@@ -4074,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
4074 | 2966 | ||
4075 | thread_group_cputime(p, &cputime); | 2967 | thread_group_cputime(p, &cputime); |
4076 | 2968 | ||
4077 | total = cputime_add(cputime.utime, cputime.stime); | 2969 | total = cputime.utime + cputime.stime; |
4078 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | 2970 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
4079 | 2971 | ||
4080 | if (total) { | 2972 | if (total) { |
4081 | u64 temp = rtime; | 2973 | u64 temp = (__force u64) rtime; |
4082 | 2974 | ||
4083 | temp *= cputime.utime; | 2975 | temp *= (__force u64) cputime.utime; |
4084 | do_div(temp, total); | 2976 | do_div(temp, (__force u32) total); |
4085 | utime = (cputime_t)temp; | 2977 | utime = (__force cputime_t) temp; |
4086 | } else | 2978 | } else |
4087 | utime = rtime; | 2979 | utime = rtime; |
4088 | 2980 | ||
4089 | sig->prev_utime = max(sig->prev_utime, utime); | 2981 | sig->prev_utime = max(sig->prev_utime, utime); |
4090 | sig->prev_stime = max(sig->prev_stime, | 2982 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); |
4091 | cputime_sub(rtime, sig->prev_utime)); | ||
4092 | 2983 | ||
4093 | *ut = sig->prev_utime; | 2984 | *ut = sig->prev_utime; |
4094 | *st = sig->prev_stime; | 2985 | *st = sig->prev_stime; |
@@ -4116,7 +3007,7 @@ void scheduler_tick(void) | |||
4116 | perf_event_task_tick(); | 3007 | perf_event_task_tick(); |
4117 | 3008 | ||
4118 | #ifdef CONFIG_SMP | 3009 | #ifdef CONFIG_SMP |
4119 | rq->idle_at_tick = idle_cpu(cpu); | 3010 | rq->idle_balance = idle_cpu(cpu); |
4120 | trigger_load_balance(rq, cpu); | 3011 | trigger_load_balance(rq, cpu); |
4121 | #endif | 3012 | #endif |
4122 | } | 3013 | } |
@@ -4187,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
4187 | { | 3078 | { |
4188 | struct pt_regs *regs = get_irq_regs(); | 3079 | struct pt_regs *regs = get_irq_regs(); |
4189 | 3080 | ||
3081 | if (oops_in_progress) | ||
3082 | return; | ||
3083 | |||
4190 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", | 3084 | printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", |
4191 | prev->comm, prev->pid, preempt_count()); | 3085 | prev->comm, prev->pid, preempt_count()); |
4192 | 3086 | ||
@@ -4213,6 +3107,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4213 | */ | 3107 | */ |
4214 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) | 3108 | if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) |
4215 | __schedule_bug(prev); | 3109 | __schedule_bug(prev); |
3110 | rcu_sleep_check(); | ||
4216 | 3111 | ||
4217 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3112 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
4218 | 3113 | ||
@@ -4239,7 +3134,7 @@ pick_next_task(struct rq *rq) | |||
4239 | * Optimization: we know that if all tasks are in | 3134 | * Optimization: we know that if all tasks are in |
4240 | * the fair class we can call that function directly: | 3135 | * the fair class we can call that function directly: |
4241 | */ | 3136 | */ |
4242 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 3137 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
4243 | p = fair_sched_class.pick_next_task(rq); | 3138 | p = fair_sched_class.pick_next_task(rq); |
4244 | if (likely(p)) | 3139 | if (likely(p)) |
4245 | return p; | 3140 | return p; |
@@ -4676,6 +3571,9 @@ EXPORT_SYMBOL(wait_for_completion); | |||
4676 | * This waits for either a completion of a specific task to be signaled or for a | 3571 | * This waits for either a completion of a specific task to be signaled or for a |
4677 | * specified timeout to expire. The timeout is in jiffies. It is not | 3572 | * specified timeout to expire. The timeout is in jiffies. It is not |
4678 | * interruptible. | 3573 | * interruptible. |
3574 | * | ||
3575 | * The return value is 0 if timed out, and positive (at least 1, or number of | ||
3576 | * jiffies left till timeout) if completed. | ||
4679 | */ | 3577 | */ |
4680 | unsigned long __sched | 3578 | unsigned long __sched |
4681 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 3579 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
@@ -4690,6 +3588,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout); | |||
4690 | * | 3588 | * |
4691 | * This waits for completion of a specific task to be signaled. It is | 3589 | * This waits for completion of a specific task to be signaled. It is |
4692 | * interruptible. | 3590 | * interruptible. |
3591 | * | ||
3592 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | ||
4693 | */ | 3593 | */ |
4694 | int __sched wait_for_completion_interruptible(struct completion *x) | 3594 | int __sched wait_for_completion_interruptible(struct completion *x) |
4695 | { | 3595 | { |
@@ -4707,6 +3607,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4707 | * | 3607 | * |
4708 | * This waits for either a completion of a specific task to be signaled or for a | 3608 | * This waits for either a completion of a specific task to be signaled or for a |
4709 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 3609 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
3610 | * | ||
3611 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | ||
3612 | * positive (at least 1, or number of jiffies left till timeout) if completed. | ||
4710 | */ | 3613 | */ |
4711 | long __sched | 3614 | long __sched |
4712 | wait_for_completion_interruptible_timeout(struct completion *x, | 3615 | wait_for_completion_interruptible_timeout(struct completion *x, |
@@ -4722,6 +3625,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | |||
4722 | * | 3625 | * |
4723 | * This waits to be signaled for completion of a specific task. It can be | 3626 | * This waits to be signaled for completion of a specific task. It can be |
4724 | * interrupted by a kill signal. | 3627 | * interrupted by a kill signal. |
3628 | * | ||
3629 | * The return value is -ERESTARTSYS if interrupted, 0 if completed. | ||
4725 | */ | 3630 | */ |
4726 | int __sched wait_for_completion_killable(struct completion *x) | 3631 | int __sched wait_for_completion_killable(struct completion *x) |
4727 | { | 3632 | { |
@@ -4740,6 +3645,9 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4740 | * This waits for either a completion of a specific task to be | 3645 | * This waits for either a completion of a specific task to be |
4741 | * signaled or for a specified timeout to expire. It can be | 3646 | * signaled or for a specified timeout to expire. It can be |
4742 | * interrupted by a kill signal. The timeout is in jiffies. | 3647 | * interrupted by a kill signal. The timeout is in jiffies. |
3648 | * | ||
3649 | * The return value is -ERESTARTSYS if interrupted, 0 if timed out, | ||
3650 | * positive (at least 1, or number of jiffies left till timeout) if completed. | ||
4743 | */ | 3651 | */ |
4744 | long __sched | 3652 | long __sched |
4745 | wait_for_completion_killable_timeout(struct completion *x, | 3653 | wait_for_completion_killable_timeout(struct completion *x, |
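
The return-value documentation added in the hunks above is easiest to read next to a caller. A hedged kernel-style sketch of how those values are usually consumed; the function, the completion and the 100ms budget are invented for illustration:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

/* Hypothetical caller: wait up to 100ms for someone to call complete(). */
static int wait_for_device(struct completion *done)
{
	unsigned long left;

	left = wait_for_completion_timeout(done, msecs_to_jiffies(100));
	if (!left)
		return -ETIMEDOUT;	/* 0 means the timeout expired       */

	return 0;			/* > 0 means completed, jiffies left */
}
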
@@ -5025,7 +3933,20 @@ EXPORT_SYMBOL(task_nice); | |||
5025 | */ | 3933 | */ |
5026 | int idle_cpu(int cpu) | 3934 | int idle_cpu(int cpu) |
5027 | { | 3935 | { |
5028 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 3936 | struct rq *rq = cpu_rq(cpu); |
3937 | |||
3938 | if (rq->curr != rq->idle) | ||
3939 | return 0; | ||
3940 | |||
3941 | if (rq->nr_running) | ||
3942 | return 0; | ||
3943 | |||
3944 | #ifdef CONFIG_SMP | ||
3945 | if (!llist_empty(&rq->wake_list)) | ||
3946 | return 0; | ||
3947 | #endif | ||
3948 | |||
3949 | return 1; | ||
5029 | } | 3950 | } |
5030 | 3951 | ||
5031 | /** | 3952 | /** |
@@ -5691,6 +4612,13 @@ again: | |||
5691 | */ | 4612 | */ |
5692 | if (preempt && rq != p_rq) | 4613 | if (preempt && rq != p_rq) |
5693 | resched_task(p_rq->curr); | 4614 | resched_task(p_rq->curr); |
4615 | } else { | ||
4616 | /* | ||
4617 | * We might have set it in task_yield_fair(), but are | ||
4618 | * not going to schedule(), so don't want to skip | ||
4619 | * the next update. | ||
4620 | */ | ||
4621 | rq->skip_clock_update = 0; | ||
5694 | } | 4622 | } |
5695 | 4623 | ||
5696 | out: | 4624 | out: |
@@ -5858,7 +4786,7 @@ void sched_show_task(struct task_struct *p) | |||
5858 | free = stack_not_used(p); | 4786 | free = stack_not_used(p); |
5859 | #endif | 4787 | #endif |
5860 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, | 4788 | printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, |
5861 | task_pid_nr(p), task_pid_nr(p->real_parent), | 4789 | task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), |
5862 | (unsigned long)task_thread_info(p)->flags); | 4790 | (unsigned long)task_thread_info(p)->flags); |
5863 | 4791 | ||
5864 | show_stack(p, NULL); | 4792 | show_stack(p, NULL); |
@@ -5875,7 +4803,7 @@ void show_state_filter(unsigned long state_filter) | |||
5875 | printk(KERN_INFO | 4803 | printk(KERN_INFO |
5876 | " task PC stack pid father\n"); | 4804 | " task PC stack pid father\n"); |
5877 | #endif | 4805 | #endif |
5878 | read_lock(&tasklist_lock); | 4806 | rcu_read_lock(); |
5879 | do_each_thread(g, p) { | 4807 | do_each_thread(g, p) { |
5880 | /* | 4808 | /* |
5881 | * reset the NMI-timeout, listing all files on a slow | 4809 | * reset the NMI-timeout, listing all files on a slow |
@@ -5891,7 +4819,7 @@ void show_state_filter(unsigned long state_filter) | |||
5891 | #ifdef CONFIG_SCHED_DEBUG | 4819 | #ifdef CONFIG_SCHED_DEBUG |
5892 | sysrq_sched_debug_show(); | 4820 | sysrq_sched_debug_show(); |
5893 | #endif | 4821 | #endif |
5894 | read_unlock(&tasklist_lock); | 4822 | rcu_read_unlock(); |
5895 | /* | 4823 | /* |
5896 | * Only show locks if all tasks are dumped: | 4824 | * Only show locks if all tasks are dumped: |
5897 | */ | 4825 | */ |
@@ -5952,62 +4880,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5952 | */ | 4880 | */ |
5953 | idle->sched_class = &idle_sched_class; | 4881 | idle->sched_class = &idle_sched_class; |
5954 | ftrace_graph_init_idle_task(idle, cpu); | 4882 | ftrace_graph_init_idle_task(idle, cpu); |
5955 | } | 4883 | #if defined(CONFIG_SMP) |
5956 | 4884 | sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); | |
5957 | /* | 4885 | #endif |
5958 | * In a system that switches off the HZ timer nohz_cpu_mask | ||
5959 | * indicates which cpus entered this state. This is used | ||
5960 | * in the rcu update to wait only for active cpus. For systems | ||
5961 | * which do not switch off the HZ timer nohz_cpu_mask should | ||
5962 | * always be CPU_BITS_NONE. | ||
5963 | */ | ||
5964 | cpumask_var_t nohz_cpu_mask; | ||
5965 | |||
5966 | /* | ||
5967 | * Increase the granularity value when there are more CPUs, | ||
5968 | * because with more CPUs the 'effective latency' as visible | ||
5969 | * to users decreases. But the relationship is not linear, | ||
5970 | * so pick a second-best guess by going with the log2 of the | ||
5971 | * number of CPUs. | ||
5972 | * | ||
5973 | * This idea comes from the SD scheduler of Con Kolivas: | ||
5974 | */ | ||
5975 | static int get_update_sysctl_factor(void) | ||
5976 | { | ||
5977 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
5978 | unsigned int factor; | ||
5979 | |||
5980 | switch (sysctl_sched_tunable_scaling) { | ||
5981 | case SCHED_TUNABLESCALING_NONE: | ||
5982 | factor = 1; | ||
5983 | break; | ||
5984 | case SCHED_TUNABLESCALING_LINEAR: | ||
5985 | factor = cpus; | ||
5986 | break; | ||
5987 | case SCHED_TUNABLESCALING_LOG: | ||
5988 | default: | ||
5989 | factor = 1 + ilog2(cpus); | ||
5990 | break; | ||
5991 | } | ||
5992 | |||
5993 | return factor; | ||
5994 | } | ||
5995 | |||
5996 | static void update_sysctl(void) | ||
5997 | { | ||
5998 | unsigned int factor = get_update_sysctl_factor(); | ||
5999 | |||
6000 | #define SET_SYSCTL(name) \ | ||
6001 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
6002 | SET_SYSCTL(sched_min_granularity); | ||
6003 | SET_SYSCTL(sched_latency); | ||
6004 | SET_SYSCTL(sched_wakeup_granularity); | ||
6005 | #undef SET_SYSCTL | ||
6006 | } | ||
6007 | |||
6008 | static inline void sched_init_granularity(void) | ||
6009 | { | ||
6010 | update_sysctl(); | ||
6011 | } | 4886 | } |
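
The helper removed above scales the CFS granularity sysctls by a factor of 1 + ilog2(min(ncpus, 8)) in the default LOG mode. A quick userspace check of what that factor works out to for a few CPU counts:

#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned int cpus[] = { 1, 2, 4, 8, 16, 64 };
	unsigned int i;

	for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
		unsigned int capped = cpus[i] < 8 ? cpus[i] : 8;

		/* SCHED_TUNABLESCALING_LOG, the default case above */
		printf("%3u cpus -> factor %u\n", cpus[i], 1 + ilog2_u(capped));
	}
	return 0;
}
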
6012 | 4887 | ||
6013 | #ifdef CONFIG_SMP | 4888 | #ifdef CONFIG_SMP |
@@ -6015,10 +4890,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
6015 | { | 4890 | { |
6016 | if (p->sched_class && p->sched_class->set_cpus_allowed) | 4891 | if (p->sched_class && p->sched_class->set_cpus_allowed) |
6017 | p->sched_class->set_cpus_allowed(p, new_mask); | 4892 | p->sched_class->set_cpus_allowed(p, new_mask); |
6018 | else { | 4893 | |
6019 | cpumask_copy(&p->cpus_allowed, new_mask); | 4894 | cpumask_copy(&p->cpus_allowed, new_mask); |
6020 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 4895 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); |
6021 | } | ||
6022 | } | 4896 | } |
6023 | 4897 | ||
6024 | /* | 4898 | /* |
@@ -6116,7 +4990,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
6116 | if (task_cpu(p) != src_cpu) | 4990 | if (task_cpu(p) != src_cpu) |
6117 | goto done; | 4991 | goto done; |
6118 | /* Affinity changed (again). */ | 4992 | /* Affinity changed (again). */ |
6119 | if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 4993 | if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) |
6120 | goto fail; | 4994 | goto fail; |
6121 | 4995 | ||
6122 | /* | 4996 | /* |
@@ -6222,6 +5096,9 @@ static void migrate_tasks(unsigned int dead_cpu) | |||
6222 | */ | 5096 | */ |
6223 | rq->stop = NULL; | 5097 | rq->stop = NULL; |
6224 | 5098 | ||
5099 | /* Ensure any throttled groups are reachable by pick_next_task */ | ||
5100 | unthrottle_offline_cfs_rqs(rq); | ||
5101 | |||
6225 | for ( ; ; ) { | 5102 | for ( ; ; ) { |
6226 | /* | 5103 | /* |
6227 | * There's this thread running, bail when that's the only | 5104 | * There's this thread running, bail when that's the only |
@@ -6299,7 +5176,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
6299 | static void | 5176 | static void |
6300 | set_table_entry(struct ctl_table *entry, | 5177 | set_table_entry(struct ctl_table *entry, |
6301 | const char *procname, void *data, int maxlen, | 5178 | const char *procname, void *data, int maxlen, |
6302 | mode_t mode, proc_handler *proc_handler) | 5179 | umode_t mode, proc_handler *proc_handler) |
6303 | { | 5180 | { |
6304 | entry->procname = procname; | 5181 | entry->procname = procname; |
6305 | entry->data = data; | 5182 | entry->data = data; |
@@ -6799,6 +5676,12 @@ out: | |||
6799 | return -ENOMEM; | 5676 | return -ENOMEM; |
6800 | } | 5677 | } |
6801 | 5678 | ||
5679 | /* | ||
5680 | * By default the system creates a single root-domain with all cpus as | ||
5681 | * members (mimicking the global state we have today). | ||
5682 | */ | ||
5683 | struct root_domain def_root_domain; | ||
5684 | |||
6802 | static void init_defrootdomain(void) | 5685 | static void init_defrootdomain(void) |
6803 | { | 5686 | { |
6804 | init_rootdomain(&def_root_domain); | 5687 | init_rootdomain(&def_root_domain); |
@@ -6870,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) | |||
6870 | } | 5753 | } |
6871 | 5754 | ||
6872 | /* | 5755 | /* |
5756 | * Keep a special pointer to the highest sched_domain that has | ||
5757 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain); this | ||
5758 | * allows us to avoid some pointer chasing in select_idle_sibling(). | ||
5759 | * | ||
5760 | * Also keep a unique ID per domain (we use the first cpu number in | ||
5761 | * the cpumask of the domain), this allows us to quickly tell if | ||
5762 | * two cpus are in the same cache domain, see ttwu_share_cache(). | ||
5763 | */ | ||
5764 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
5765 | DEFINE_PER_CPU(int, sd_llc_id); | ||
5766 | |||
5767 | static void update_top_cache_domain(int cpu) | ||
5768 | { | ||
5769 | struct sched_domain *sd; | ||
5770 | int id = cpu; | ||
5771 | |||
5772 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
5773 | if (sd) | ||
5774 | id = cpumask_first(sched_domain_span(sd)); | ||
5775 | |||
5776 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
5777 | per_cpu(sd_llc_id, cpu) = id; | ||
5778 | } | ||
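Aside: the payoff of caching sd_llc_id is that "do these two cpus share a last-level cache?" becomes a single integer compare, which is presumably all ttwu_share_cache() needs. A userspace model of that check; the helper name and the hard-coded two-LLC topology below are assumptions for illustration only:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Model of the per-cpu sd_llc_id cache above: cpus that resolve to the
     * same first-cpu-of-LLC-domain id are treated as cache-affine. The array
     * contents assume two LLC domains of four cpus each.
     */
    static int sd_llc_id[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };

    static bool cpus_share_llc(int this_cpu, int that_cpu)
    {
        return sd_llc_id[this_cpu] == sd_llc_id[that_cpu];
    }

    int main(void)
    {
        printf("cpu1 vs cpu3: %d\n", cpus_share_llc(1, 3)); /* 1: same LLC */
        printf("cpu1 vs cpu5: %d\n", cpus_share_llc(1, 5)); /* 0: different LLC */
        return 0;
    }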
5779 | |||
5780 | /* | ||
6873 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 5781 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6874 | * hold the hotplug lock. | 5782 | * hold the hotplug lock. |
6875 | */ | 5783 | */ |
@@ -6908,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6908 | tmp = rq->sd; | 5816 | tmp = rq->sd; |
6909 | rcu_assign_pointer(rq->sd, sd); | 5817 | rcu_assign_pointer(rq->sd, sd); |
6910 | destroy_sched_domains(tmp, cpu); | 5818 | destroy_sched_domains(tmp, cpu); |
5819 | |||
5820 | update_top_cache_domain(cpu); | ||
6911 | } | 5821 | } |
6912 | 5822 | ||
6913 | /* cpus with isolated domains */ | 5823 | /* cpus with isolated domains */ |
@@ -6923,8 +5833,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6923 | 5833 | ||
6924 | __setup("isolcpus=", isolated_cpu_setup); | 5834 | __setup("isolcpus=", isolated_cpu_setup); |
6925 | 5835 | ||
6926 | #define SD_NODES_PER_DOMAIN 16 | ||
6927 | |||
6928 | #ifdef CONFIG_NUMA | 5836 | #ifdef CONFIG_NUMA |
6929 | 5837 | ||
6930 | /** | 5838 | /** |
@@ -7069,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
7069 | continue; | 5977 | continue; |
7070 | 5978 | ||
7071 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 5979 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
7072 | GFP_KERNEL, cpu_to_node(i)); | 5980 | GFP_KERNEL, cpu_to_node(cpu)); |
7073 | 5981 | ||
7074 | if (!sg) | 5982 | if (!sg) |
7075 | goto fail; | 5983 | goto fail; |
@@ -7207,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7207 | return; | 6115 | return; |
7208 | 6116 | ||
7209 | update_group_power(sd, cpu); | 6117 | update_group_power(sd, cpu); |
6118 | atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); | ||
6119 | } | ||
6120 | |||
6121 | int __weak arch_sd_sibling_asym_packing(void) | ||
6122 | { | ||
6123 | return 0*SD_ASYM_PACKING; | ||
7210 | } | 6124 | } |
7211 | 6125 | ||
7212 | /* | 6126 | /* |
@@ -7761,54 +6675,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7761 | } | 6675 | } |
7762 | 6676 | ||
7763 | #ifdef CONFIG_SCHED_MC | 6677 | #ifdef CONFIG_SCHED_MC |
7764 | static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, | 6678 | static ssize_t sched_mc_power_savings_show(struct device *dev, |
7765 | struct sysdev_class_attribute *attr, | 6679 | struct device_attribute *attr, |
7766 | char *page) | 6680 | char *buf) |
7767 | { | 6681 | { |
7768 | return sprintf(page, "%u\n", sched_mc_power_savings); | 6682 | return sprintf(buf, "%u\n", sched_mc_power_savings); |
7769 | } | 6683 | } |
7770 | static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, | 6684 | static ssize_t sched_mc_power_savings_store(struct device *dev, |
7771 | struct sysdev_class_attribute *attr, | 6685 | struct device_attribute *attr, |
7772 | const char *buf, size_t count) | 6686 | const char *buf, size_t count) |
7773 | { | 6687 | { |
7774 | return sched_power_savings_store(buf, count, 0); | 6688 | return sched_power_savings_store(buf, count, 0); |
7775 | } | 6689 | } |
7776 | static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, | 6690 | static DEVICE_ATTR(sched_mc_power_savings, 0644, |
7777 | sched_mc_power_savings_show, | 6691 | sched_mc_power_savings_show, |
7778 | sched_mc_power_savings_store); | 6692 | sched_mc_power_savings_store); |
7779 | #endif | 6693 | #endif |
7780 | 6694 | ||
7781 | #ifdef CONFIG_SCHED_SMT | 6695 | #ifdef CONFIG_SCHED_SMT |
7782 | static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, | 6696 | static ssize_t sched_smt_power_savings_show(struct device *dev, |
7783 | struct sysdev_class_attribute *attr, | 6697 | struct device_attribute *attr, |
7784 | char *page) | 6698 | char *buf) |
7785 | { | 6699 | { |
7786 | return sprintf(page, "%u\n", sched_smt_power_savings); | 6700 | return sprintf(buf, "%u\n", sched_smt_power_savings); |
7787 | } | 6701 | } |
7788 | static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, | 6702 | static ssize_t sched_smt_power_savings_store(struct device *dev, |
7789 | struct sysdev_class_attribute *attr, | 6703 | struct device_attribute *attr, |
7790 | const char *buf, size_t count) | 6704 | const char *buf, size_t count) |
7791 | { | 6705 | { |
7792 | return sched_power_savings_store(buf, count, 1); | 6706 | return sched_power_savings_store(buf, count, 1); |
7793 | } | 6707 | } |
7794 | static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, | 6708 | static DEVICE_ATTR(sched_smt_power_savings, 0644, |
7795 | sched_smt_power_savings_show, | 6709 | sched_smt_power_savings_show, |
7796 | sched_smt_power_savings_store); | 6710 | sched_smt_power_savings_store); |
7797 | #endif | 6711 | #endif |
7798 | 6712 | ||
7799 | int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | 6713 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) |
7800 | { | 6714 | { |
7801 | int err = 0; | 6715 | int err = 0; |
7802 | 6716 | ||
7803 | #ifdef CONFIG_SCHED_SMT | 6717 | #ifdef CONFIG_SCHED_SMT |
7804 | if (smt_capable()) | 6718 | if (smt_capable()) |
7805 | err = sysfs_create_file(&cls->kset.kobj, | 6719 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); |
7806 | &attr_sched_smt_power_savings.attr); | ||
7807 | #endif | 6720 | #endif |
7808 | #ifdef CONFIG_SCHED_MC | 6721 | #ifdef CONFIG_SCHED_MC |
7809 | if (!err && mc_capable()) | 6722 | if (!err && mc_capable()) |
7810 | err = sysfs_create_file(&cls->kset.kobj, | 6723 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); |
7811 | &attr_sched_mc_power_savings.attr); | ||
7812 | #endif | 6724 | #endif |
7813 | return err; | 6725 | return err; |
7814 | } | 6726 | } |
@@ -7844,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | |||
7844 | } | 6756 | } |
7845 | } | 6757 | } |
7846 | 6758 | ||
7847 | static int update_runtime(struct notifier_block *nfb, | ||
7848 | unsigned long action, void *hcpu) | ||
7849 | { | ||
7850 | int cpu = (int)(long)hcpu; | ||
7851 | |||
7852 | switch (action) { | ||
7853 | case CPU_DOWN_PREPARE: | ||
7854 | case CPU_DOWN_PREPARE_FROZEN: | ||
7855 | disable_runtime(cpu_rq(cpu)); | ||
7856 | return NOTIFY_OK; | ||
7857 | |||
7858 | case CPU_DOWN_FAILED: | ||
7859 | case CPU_DOWN_FAILED_FROZEN: | ||
7860 | case CPU_ONLINE: | ||
7861 | case CPU_ONLINE_FROZEN: | ||
7862 | enable_runtime(cpu_rq(cpu)); | ||
7863 | return NOTIFY_OK; | ||
7864 | |||
7865 | default: | ||
7866 | return NOTIFY_DONE; | ||
7867 | } | ||
7868 | } | ||
7869 | |||
7870 | void __init sched_init_smp(void) | 6759 | void __init sched_init_smp(void) |
7871 | { | 6760 | { |
7872 | cpumask_var_t non_isolated_cpus; | 6761 | cpumask_var_t non_isolated_cpus; |
@@ -7915,103 +6804,11 @@ int in_sched_functions(unsigned long addr) | |||
7915 | && addr < (unsigned long)__sched_text_end); | 6804 | && addr < (unsigned long)__sched_text_end); |
7916 | } | 6805 | } |
7917 | 6806 | ||
7918 | static void init_cfs_rq(struct cfs_rq *cfs_rq) | 6807 | #ifdef CONFIG_CGROUP_SCHED |
7919 | { | 6808 | struct task_group root_task_group; |
7920 | cfs_rq->tasks_timeline = RB_ROOT; | ||
7921 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
7922 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
7923 | #ifndef CONFIG_64BIT | ||
7924 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
7925 | #endif | ||
7926 | } | ||
7927 | |||
7928 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
7929 | { | ||
7930 | struct rt_prio_array *array; | ||
7931 | int i; | ||
7932 | |||
7933 | array = &rt_rq->active; | ||
7934 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
7935 | INIT_LIST_HEAD(array->queue + i); | ||
7936 | __clear_bit(i, array->bitmap); | ||
7937 | } | ||
7938 | /* delimiter for bitsearch: */ | ||
7939 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
7940 | |||
7941 | #if defined CONFIG_SMP | ||
7942 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
7943 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
7944 | rt_rq->rt_nr_migratory = 0; | ||
7945 | rt_rq->overloaded = 0; | ||
7946 | plist_head_init(&rt_rq->pushable_tasks); | ||
7947 | #endif | ||
7948 | |||
7949 | rt_rq->rt_time = 0; | ||
7950 | rt_rq->rt_throttled = 0; | ||
7951 | rt_rq->rt_runtime = 0; | ||
7952 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
7953 | } | ||
7954 | |||
7955 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7956 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
7957 | struct sched_entity *se, int cpu, | ||
7958 | struct sched_entity *parent) | ||
7959 | { | ||
7960 | struct rq *rq = cpu_rq(cpu); | ||
7961 | |||
7962 | cfs_rq->tg = tg; | ||
7963 | cfs_rq->rq = rq; | ||
7964 | #ifdef CONFIG_SMP | ||
7965 | /* allow initial update_cfs_load() to truncate */ | ||
7966 | cfs_rq->load_stamp = 1; | ||
7967 | #endif | 6809 | #endif |
7968 | 6810 | ||
7969 | tg->cfs_rq[cpu] = cfs_rq; | 6811 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
7970 | tg->se[cpu] = se; | ||
7971 | |||
7972 | /* se could be NULL for root_task_group */ | ||
7973 | if (!se) | ||
7974 | return; | ||
7975 | |||
7976 | if (!parent) | ||
7977 | se->cfs_rq = &rq->cfs; | ||
7978 | else | ||
7979 | se->cfs_rq = parent->my_q; | ||
7980 | |||
7981 | se->my_q = cfs_rq; | ||
7982 | update_load_set(&se->load, 0); | ||
7983 | se->parent = parent; | ||
7984 | } | ||
7985 | #endif | ||
7986 | |||
7987 | #ifdef CONFIG_RT_GROUP_SCHED | ||
7988 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
7989 | struct sched_rt_entity *rt_se, int cpu, | ||
7990 | struct sched_rt_entity *parent) | ||
7991 | { | ||
7992 | struct rq *rq = cpu_rq(cpu); | ||
7993 | |||
7994 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
7995 | rt_rq->rt_nr_boosted = 0; | ||
7996 | rt_rq->rq = rq; | ||
7997 | rt_rq->tg = tg; | ||
7998 | |||
7999 | tg->rt_rq[cpu] = rt_rq; | ||
8000 | tg->rt_se[cpu] = rt_se; | ||
8001 | |||
8002 | if (!rt_se) | ||
8003 | return; | ||
8004 | |||
8005 | if (!parent) | ||
8006 | rt_se->rt_rq = &rq->rt; | ||
8007 | else | ||
8008 | rt_se->rt_rq = parent->my_q; | ||
8009 | |||
8010 | rt_se->my_q = rt_rq; | ||
8011 | rt_se->parent = parent; | ||
8012 | INIT_LIST_HEAD(&rt_se->run_list); | ||
8013 | } | ||
8014 | #endif | ||
8015 | 6812 | ||
8016 | void __init sched_init(void) | 6813 | void __init sched_init(void) |
8017 | { | 6814 | { |
@@ -8069,9 +6866,17 @@ void __init sched_init(void) | |||
8069 | #ifdef CONFIG_CGROUP_SCHED | 6866 | #ifdef CONFIG_CGROUP_SCHED |
8070 | list_add(&root_task_group.list, &task_groups); | 6867 | list_add(&root_task_group.list, &task_groups); |
8071 | INIT_LIST_HEAD(&root_task_group.children); | 6868 | INIT_LIST_HEAD(&root_task_group.children); |
6869 | INIT_LIST_HEAD(&root_task_group.siblings); | ||
8072 | autogroup_init(&init_task); | 6870 | autogroup_init(&init_task); |
6871 | |||
8073 | #endif /* CONFIG_CGROUP_SCHED */ | 6872 | #endif /* CONFIG_CGROUP_SCHED */ |
8074 | 6873 | ||
6874 | #ifdef CONFIG_CGROUP_CPUACCT | ||
6875 | root_cpuacct.cpustat = &kernel_cpustat; | ||
6876 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
6877 | /* Too early, not expected to fail */ | ||
6878 | BUG_ON(!root_cpuacct.cpuusage); | ||
6879 | #endif | ||
8075 | for_each_possible_cpu(i) { | 6880 | for_each_possible_cpu(i) { |
8076 | struct rq *rq; | 6881 | struct rq *rq; |
8077 | 6882 | ||
@@ -8083,7 +6888,7 @@ void __init sched_init(void) | |||
8083 | init_cfs_rq(&rq->cfs); | 6888 | init_cfs_rq(&rq->cfs); |
8084 | init_rt_rq(&rq->rt, rq); | 6889 | init_rt_rq(&rq->rt, rq); |
8085 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6890 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8086 | root_task_group.shares = root_task_group_load; | 6891 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
8087 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6892 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
8088 | /* | 6893 | /* |
8089 | * How much cpu bandwidth does root_task_group get? | 6894 | * How much cpu bandwidth does root_task_group get? |
@@ -8104,6 +6909,7 @@ void __init sched_init(void) | |||
8104 | * We achieve this by letting root_task_group's tasks sit | 6909 | * We achieve this by letting root_task_group's tasks sit |
8105 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). | 6910 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
8106 | */ | 6911 | */ |
6912 | init_cfs_bandwidth(&root_task_group.cfs_bandwidth); | ||
8107 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); | 6913 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
8108 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 6914 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8109 | 6915 | ||
@@ -8132,8 +6938,7 @@ void __init sched_init(void) | |||
8132 | rq->avg_idle = 2*sysctl_sched_migration_cost; | 6938 | rq->avg_idle = 2*sysctl_sched_migration_cost; |
8133 | rq_attach_root(rq, &def_root_domain); | 6939 | rq_attach_root(rq, &def_root_domain); |
8134 | #ifdef CONFIG_NO_HZ | 6940 | #ifdef CONFIG_NO_HZ |
8135 | rq->nohz_balance_kick = 0; | 6941 | rq->nohz_flags = 0; |
8136 | init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); | ||
8137 | #endif | 6942 | #endif |
8138 | #endif | 6943 | #endif |
8139 | init_rq_hrtick(rq); | 6944 | init_rq_hrtick(rq); |
@@ -8146,10 +6951,6 @@ void __init sched_init(void) | |||
8146 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6951 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
8147 | #endif | 6952 | #endif |
8148 | 6953 | ||
8149 | #ifdef CONFIG_SMP | ||
8150 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
8151 | #endif | ||
8152 | |||
8153 | #ifdef CONFIG_RT_MUTEXES | 6954 | #ifdef CONFIG_RT_MUTEXES |
8154 | plist_head_init(&init_task.pi_waiters); | 6955 | plist_head_init(&init_task.pi_waiters); |
8155 | #endif | 6956 | #endif |
@@ -8175,21 +6976,13 @@ void __init sched_init(void) | |||
8175 | */ | 6976 | */ |
8176 | current->sched_class = &fair_sched_class; | 6977 | current->sched_class = &fair_sched_class; |
8177 | 6978 | ||
8178 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | ||
8179 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | ||
8180 | #ifdef CONFIG_SMP | 6979 | #ifdef CONFIG_SMP |
8181 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | 6980 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); |
8182 | #ifdef CONFIG_NO_HZ | ||
8183 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
8184 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | ||
8185 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
8186 | atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); | ||
8187 | atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); | ||
8188 | #endif | ||
8189 | /* May be allocated at isolcpus cmdline parse time */ | 6981 | /* May be allocated at isolcpus cmdline parse time */ |
8190 | if (cpu_isolated_map == NULL) | 6982 | if (cpu_isolated_map == NULL) |
8191 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 6983 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8192 | #endif /* SMP */ | 6984 | #endif |
6985 | init_sched_fair_class(); | ||
8193 | 6986 | ||
8194 | scheduler_running = 1; | 6987 | scheduler_running = 1; |
8195 | } | 6988 | } |
@@ -8206,6 +6999,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
8206 | { | 6999 | { |
8207 | static unsigned long prev_jiffy; /* ratelimiting */ | 7000 | static unsigned long prev_jiffy; /* ratelimiting */ |
8208 | 7001 | ||
7002 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | ||
8209 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || | 7003 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
8210 | system_state != SYSTEM_RUNNING || oops_in_progress) | 7004 | system_state != SYSTEM_RUNNING || oops_in_progress) |
8211 | return; | 7005 | return; |
@@ -8340,165 +7134,10 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
8340 | 7134 | ||
8341 | #endif | 7135 | #endif |
8342 | 7136 | ||
8343 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
8344 | static void free_fair_sched_group(struct task_group *tg) | ||
8345 | { | ||
8346 | int i; | ||
8347 | |||
8348 | for_each_possible_cpu(i) { | ||
8349 | if (tg->cfs_rq) | ||
8350 | kfree(tg->cfs_rq[i]); | ||
8351 | if (tg->se) | ||
8352 | kfree(tg->se[i]); | ||
8353 | } | ||
8354 | |||
8355 | kfree(tg->cfs_rq); | ||
8356 | kfree(tg->se); | ||
8357 | } | ||
8358 | |||
8359 | static | ||
8360 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
8361 | { | ||
8362 | struct cfs_rq *cfs_rq; | ||
8363 | struct sched_entity *se; | ||
8364 | int i; | ||
8365 | |||
8366 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
8367 | if (!tg->cfs_rq) | ||
8368 | goto err; | ||
8369 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
8370 | if (!tg->se) | ||
8371 | goto err; | ||
8372 | |||
8373 | tg->shares = NICE_0_LOAD; | ||
8374 | |||
8375 | for_each_possible_cpu(i) { | ||
8376 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
8377 | GFP_KERNEL, cpu_to_node(i)); | ||
8378 | if (!cfs_rq) | ||
8379 | goto err; | ||
8380 | |||
8381 | se = kzalloc_node(sizeof(struct sched_entity), | ||
8382 | GFP_KERNEL, cpu_to_node(i)); | ||
8383 | if (!se) | ||
8384 | goto err_free_rq; | ||
8385 | |||
8386 | init_cfs_rq(cfs_rq); | ||
8387 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
8388 | } | ||
8389 | |||
8390 | return 1; | ||
8391 | |||
8392 | err_free_rq: | ||
8393 | kfree(cfs_rq); | ||
8394 | err: | ||
8395 | return 0; | ||
8396 | } | ||
8397 | |||
8398 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
8399 | { | ||
8400 | struct rq *rq = cpu_rq(cpu); | ||
8401 | unsigned long flags; | ||
8402 | |||
8403 | /* | ||
8404 | * Only empty task groups can be destroyed; so we can speculatively | ||
8405 | * check on_list without danger of it being re-added. | ||
8406 | */ | ||
8407 | if (!tg->cfs_rq[cpu]->on_list) | ||
8408 | return; | ||
8409 | |||
8410 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8411 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8412 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8413 | } | ||
8414 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | ||
8415 | static inline void free_fair_sched_group(struct task_group *tg) | ||
8416 | { | ||
8417 | } | ||
8418 | |||
8419 | static inline | ||
8420 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
8421 | { | ||
8422 | return 1; | ||
8423 | } | ||
8424 | |||
8425 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
8426 | { | ||
8427 | } | ||
8428 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
8429 | |||
8430 | #ifdef CONFIG_RT_GROUP_SCHED | ||
8431 | static void free_rt_sched_group(struct task_group *tg) | ||
8432 | { | ||
8433 | int i; | ||
8434 | |||
8435 | if (tg->rt_se) | ||
8436 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
8437 | |||
8438 | for_each_possible_cpu(i) { | ||
8439 | if (tg->rt_rq) | ||
8440 | kfree(tg->rt_rq[i]); | ||
8441 | if (tg->rt_se) | ||
8442 | kfree(tg->rt_se[i]); | ||
8443 | } | ||
8444 | |||
8445 | kfree(tg->rt_rq); | ||
8446 | kfree(tg->rt_se); | ||
8447 | } | ||
8448 | |||
8449 | static | ||
8450 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
8451 | { | ||
8452 | struct rt_rq *rt_rq; | ||
8453 | struct sched_rt_entity *rt_se; | ||
8454 | int i; | ||
8455 | |||
8456 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
8457 | if (!tg->rt_rq) | ||
8458 | goto err; | ||
8459 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
8460 | if (!tg->rt_se) | ||
8461 | goto err; | ||
8462 | |||
8463 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
8464 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
8465 | |||
8466 | for_each_possible_cpu(i) { | ||
8467 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
8468 | GFP_KERNEL, cpu_to_node(i)); | ||
8469 | if (!rt_rq) | ||
8470 | goto err; | ||
8471 | |||
8472 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
8473 | GFP_KERNEL, cpu_to_node(i)); | ||
8474 | if (!rt_se) | ||
8475 | goto err_free_rq; | ||
8476 | |||
8477 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
8478 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
8479 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
8480 | } | ||
8481 | |||
8482 | return 1; | ||
8483 | |||
8484 | err_free_rq: | ||
8485 | kfree(rt_rq); | ||
8486 | err: | ||
8487 | return 0; | ||
8488 | } | ||
8489 | #else /* !CONFIG_RT_GROUP_SCHED */ | ||
8490 | static inline void free_rt_sched_group(struct task_group *tg) | ||
8491 | { | ||
8492 | } | ||
8493 | |||
8494 | static inline | ||
8495 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
8496 | { | ||
8497 | return 1; | ||
8498 | } | ||
8499 | #endif /* CONFIG_RT_GROUP_SCHED */ | ||
8500 | |||
8501 | #ifdef CONFIG_CGROUP_SCHED | 7137 | #ifdef CONFIG_CGROUP_SCHED |
7138 | /* task_group_lock serializes the addition/removal of task groups */ | ||
7139 | static DEFINE_SPINLOCK(task_group_lock); | ||
7140 | |||
8502 | static void free_sched_group(struct task_group *tg) | 7141 | static void free_sched_group(struct task_group *tg) |
8503 | { | 7142 | { |
8504 | free_fair_sched_group(tg); | 7143 | free_fair_sched_group(tg); |
@@ -8603,47 +7242,13 @@ void sched_move_task(struct task_struct *tsk) | |||
8603 | } | 7242 | } |
8604 | #endif /* CONFIG_CGROUP_SCHED */ | 7243 | #endif /* CONFIG_CGROUP_SCHED */ |
8605 | 7244 | ||
8606 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7245 | #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) |
8607 | static DEFINE_MUTEX(shares_mutex); | 7246 | static unsigned long to_ratio(u64 period, u64 runtime) |
8608 | |||
8609 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
8610 | { | 7247 | { |
8611 | int i; | 7248 | if (runtime == RUNTIME_INF) |
8612 | unsigned long flags; | 7249 | return 1ULL << 20; |
8613 | |||
8614 | /* | ||
8615 | * We can't change the weight of the root cgroup. | ||
8616 | */ | ||
8617 | if (!tg->se[0]) | ||
8618 | return -EINVAL; | ||
8619 | |||
8620 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
8621 | |||
8622 | mutex_lock(&shares_mutex); | ||
8623 | if (tg->shares == shares) | ||
8624 | goto done; | ||
8625 | |||
8626 | tg->shares = shares; | ||
8627 | for_each_possible_cpu(i) { | ||
8628 | struct rq *rq = cpu_rq(i); | ||
8629 | struct sched_entity *se; | ||
8630 | |||
8631 | se = tg->se[i]; | ||
8632 | /* Propagate contribution to hierarchy */ | ||
8633 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8634 | for_each_sched_entity(se) | ||
8635 | update_cfs_shares(group_cfs_rq(se)); | ||
8636 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8637 | } | ||
8638 | |||
8639 | done: | ||
8640 | mutex_unlock(&shares_mutex); | ||
8641 | return 0; | ||
8642 | } | ||
8643 | 7250 | ||
8644 | unsigned long sched_group_shares(struct task_group *tg) | 7251 | return div64_u64(runtime << 20, period); |
8645 | { | ||
8646 | return tg->shares; | ||
8647 | } | 7252 | } |
8648 | #endif | 7253 | #endif |
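Aside: to_ratio() expresses runtime/period as a 20-bit fixed-point fraction, with RUNTIME_INF pinned to exactly 1 << 20 (100%). A small standalone model showing the arithmetic; the example values are illustrative:

    #include <stdio.h>
    #include <stdint.h>

    #define RUNTIME_INF ((uint64_t)~0ULL)

    /* Userspace model of to_ratio(): runtime/period in 20-bit fixed point. */
    static uint64_t to_ratio(uint64_t period, uint64_t runtime)
    {
        if (runtime == RUNTIME_INF)
            return 1ULL << 20;

        return (runtime << 20) / period;
    }

    int main(void)
    {
        /* 250ms of runtime every 1s -> 0.25 of a CPU -> 262144 (2^20 / 4) */
        printf("%llu\n", (unsigned long long)to_ratio(1000000000ULL, 250000000ULL));
        /* 2s of runtime per 1s period -> two full CPUs -> 2097152 (2 << 20) */
        printf("%llu\n", (unsigned long long)to_ratio(1000000000ULL, 2000000000ULL));
        return 0;
    }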
8649 | 7254 | ||
@@ -8653,21 +7258,13 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
8653 | */ | 7258 | */ |
8654 | static DEFINE_MUTEX(rt_constraints_mutex); | 7259 | static DEFINE_MUTEX(rt_constraints_mutex); |
8655 | 7260 | ||
8656 | static unsigned long to_ratio(u64 period, u64 runtime) | ||
8657 | { | ||
8658 | if (runtime == RUNTIME_INF) | ||
8659 | return 1ULL << 20; | ||
8660 | |||
8661 | return div64_u64(runtime << 20, period); | ||
8662 | } | ||
8663 | |||
8664 | /* Must be called with tasklist_lock held */ | 7261 | /* Must be called with tasklist_lock held */ |
8665 | static inline int tg_has_rt_tasks(struct task_group *tg) | 7262 | static inline int tg_has_rt_tasks(struct task_group *tg) |
8666 | { | 7263 | { |
8667 | struct task_struct *g, *p; | 7264 | struct task_struct *g, *p; |
8668 | 7265 | ||
8669 | do_each_thread(g, p) { | 7266 | do_each_thread(g, p) { |
8670 | if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) | 7267 | if (rt_task(p) && task_rq(p)->rt.tg == tg) |
8671 | return 1; | 7268 | return 1; |
8672 | } while_each_thread(g, p); | 7269 | } while_each_thread(g, p); |
8673 | 7270 | ||
@@ -8680,7 +7277,7 @@ struct rt_schedulable_data { | |||
8680 | u64 rt_runtime; | 7277 | u64 rt_runtime; |
8681 | }; | 7278 | }; |
8682 | 7279 | ||
8683 | static int tg_schedulable(struct task_group *tg, void *data) | 7280 | static int tg_rt_schedulable(struct task_group *tg, void *data) |
8684 | { | 7281 | { |
8685 | struct rt_schedulable_data *d = data; | 7282 | struct rt_schedulable_data *d = data; |
8686 | struct task_group *child; | 7283 | struct task_group *child; |
@@ -8738,16 +7335,22 @@ static int tg_schedulable(struct task_group *tg, void *data) | |||
8738 | 7335 | ||
8739 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) | 7336 | static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) |
8740 | { | 7337 | { |
7338 | int ret; | ||
7339 | |||
8741 | struct rt_schedulable_data data = { | 7340 | struct rt_schedulable_data data = { |
8742 | .tg = tg, | 7341 | .tg = tg, |
8743 | .rt_period = period, | 7342 | .rt_period = period, |
8744 | .rt_runtime = runtime, | 7343 | .rt_runtime = runtime, |
8745 | }; | 7344 | }; |
8746 | 7345 | ||
8747 | return walk_tg_tree(tg_schedulable, tg_nop, &data); | 7346 | rcu_read_lock(); |
7347 | ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); | ||
7348 | rcu_read_unlock(); | ||
7349 | |||
7350 | return ret; | ||
8748 | } | 7351 | } |
8749 | 7352 | ||
8750 | static int tg_set_bandwidth(struct task_group *tg, | 7353 | static int tg_set_rt_bandwidth(struct task_group *tg, |
8751 | u64 rt_period, u64 rt_runtime) | 7354 | u64 rt_period, u64 rt_runtime) |
8752 | { | 7355 | { |
8753 | int i, err = 0; | 7356 | int i, err = 0; |
@@ -8786,7 +7389,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) | |||
8786 | if (rt_runtime_us < 0) | 7389 | if (rt_runtime_us < 0) |
8787 | rt_runtime = RUNTIME_INF; | 7390 | rt_runtime = RUNTIME_INF; |
8788 | 7391 | ||
8789 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 7392 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8790 | } | 7393 | } |
8791 | 7394 | ||
8792 | long sched_group_rt_runtime(struct task_group *tg) | 7395 | long sched_group_rt_runtime(struct task_group *tg) |
@@ -8811,7 +7414,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) | |||
8811 | if (rt_period == 0) | 7414 | if (rt_period == 0) |
8812 | return -EINVAL; | 7415 | return -EINVAL; |
8813 | 7416 | ||
8814 | return tg_set_bandwidth(tg, rt_period, rt_runtime); | 7417 | return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); |
8815 | } | 7418 | } |
8816 | 7419 | ||
8817 | long sched_group_rt_period(struct task_group *tg) | 7420 | long sched_group_rt_period(struct task_group *tg) |
@@ -8953,24 +7556,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8953 | sched_destroy_group(tg); | 7556 | sched_destroy_group(tg); |
8954 | } | 7557 | } |
8955 | 7558 | ||
8956 | static int | 7559 | static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8957 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 7560 | struct cgroup_taskset *tset) |
8958 | { | 7561 | { |
7562 | struct task_struct *task; | ||
7563 | |||
7564 | cgroup_taskset_for_each(task, cgrp, tset) { | ||
8959 | #ifdef CONFIG_RT_GROUP_SCHED | 7565 | #ifdef CONFIG_RT_GROUP_SCHED |
8960 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) | 7566 | if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) |
8961 | return -EINVAL; | 7567 | return -EINVAL; |
8962 | #else | 7568 | #else |
8963 | /* We don't support RT-tasks being in separate groups */ | 7569 | /* We don't support RT-tasks being in separate groups */ |
8964 | if (tsk->sched_class != &fair_sched_class) | 7570 | if (task->sched_class != &fair_sched_class) |
8965 | return -EINVAL; | 7571 | return -EINVAL; |
8966 | #endif | 7572 | #endif |
7573 | } | ||
8967 | return 0; | 7574 | return 0; |
8968 | } | 7575 | } |
8969 | 7576 | ||
8970 | static void | 7577 | static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
8971 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 7578 | struct cgroup_taskset *tset) |
8972 | { | 7579 | { |
8973 | sched_move_task(tsk); | 7580 | struct task_struct *task; |
7581 | |||
7582 | cgroup_taskset_for_each(task, cgrp, tset) | ||
7583 | sched_move_task(task); | ||
8974 | } | 7584 | } |
8975 | 7585 | ||
8976 | static void | 7586 | static void |
@@ -9001,6 +7611,237 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | |||
9001 | 7611 | ||
9002 | return (u64) scale_load_down(tg->shares); | 7612 | return (u64) scale_load_down(tg->shares); |
9003 | } | 7613 | } |
7614 | |||
7615 | #ifdef CONFIG_CFS_BANDWIDTH | ||
7616 | static DEFINE_MUTEX(cfs_constraints_mutex); | ||
7617 | |||
7618 | const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ | ||
7619 | const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ | ||
7620 | |||
7621 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); | ||
7622 | |||
7623 | static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | ||
7624 | { | ||
7625 | int i, ret = 0, runtime_enabled, runtime_was_enabled; | ||
7626 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | ||
7627 | |||
7628 | if (tg == &root_task_group) | ||
7629 | return -EINVAL; | ||
7630 | |||
7631 | /* | ||
7632 | * Ensure we have some amount of bandwidth every period. This is | ||
7633 | * to prevent reaching a state of large arrears when throttled via | ||
7634 | * entity_tick() resulting in prolonged exit starvation. | ||
7635 | */ | ||
7636 | if (quota < min_cfs_quota_period || period < min_cfs_quota_period) | ||
7637 | return -EINVAL; | ||
7638 | |||
7639 | /* | ||
7640 | * Likewise, bound things on the other side by preventing insane quota | ||
7641 | * periods. This also allows us to normalize in computing quota | ||
7642 | * feasibility. | ||
7643 | */ | ||
7644 | if (period > max_cfs_quota_period) | ||
7645 | return -EINVAL; | ||
7646 | |||
7647 | mutex_lock(&cfs_constraints_mutex); | ||
7648 | ret = __cfs_schedulable(tg, period, quota); | ||
7649 | if (ret) | ||
7650 | goto out_unlock; | ||
7651 | |||
7652 | runtime_enabled = quota != RUNTIME_INF; | ||
7653 | runtime_was_enabled = cfs_b->quota != RUNTIME_INF; | ||
7654 | account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); | ||
7655 | raw_spin_lock_irq(&cfs_b->lock); | ||
7656 | cfs_b->period = ns_to_ktime(period); | ||
7657 | cfs_b->quota = quota; | ||
7658 | |||
7659 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
7660 | /* restart the period timer (if active) to handle new period expiry */ | ||
7661 | if (runtime_enabled && cfs_b->timer_active) { | ||
7662 | /* force a reprogram */ | ||
7663 | cfs_b->timer_active = 0; | ||
7664 | __start_cfs_bandwidth(cfs_b); | ||
7665 | } | ||
7666 | raw_spin_unlock_irq(&cfs_b->lock); | ||
7667 | |||
7668 | for_each_possible_cpu(i) { | ||
7669 | struct cfs_rq *cfs_rq = tg->cfs_rq[i]; | ||
7670 | struct rq *rq = cfs_rq->rq; | ||
7671 | |||
7672 | raw_spin_lock_irq(&rq->lock); | ||
7673 | cfs_rq->runtime_enabled = runtime_enabled; | ||
7674 | cfs_rq->runtime_remaining = 0; | ||
7675 | |||
7676 | if (cfs_rq->throttled) | ||
7677 | unthrottle_cfs_rq(cfs_rq); | ||
7678 | raw_spin_unlock_irq(&rq->lock); | ||
7679 | } | ||
7680 | out_unlock: | ||
7681 | mutex_unlock(&cfs_constraints_mutex); | ||
7682 | |||
7683 | return ret; | ||
7684 | } | ||
7685 | |||
7686 | int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) | ||
7687 | { | ||
7688 | u64 quota, period; | ||
7689 | |||
7690 | period = ktime_to_ns(tg->cfs_bandwidth.period); | ||
7691 | if (cfs_quota_us < 0) | ||
7692 | quota = RUNTIME_INF; | ||
7693 | else | ||
7694 | quota = (u64)cfs_quota_us * NSEC_PER_USEC; | ||
7695 | |||
7696 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
7697 | } | ||
7698 | |||
7699 | long tg_get_cfs_quota(struct task_group *tg) | ||
7700 | { | ||
7701 | u64 quota_us; | ||
7702 | |||
7703 | if (tg->cfs_bandwidth.quota == RUNTIME_INF) | ||
7704 | return -1; | ||
7705 | |||
7706 | quota_us = tg->cfs_bandwidth.quota; | ||
7707 | do_div(quota_us, NSEC_PER_USEC); | ||
7708 | |||
7709 | return quota_us; | ||
7710 | } | ||
7711 | |||
7712 | int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) | ||
7713 | { | ||
7714 | u64 quota, period; | ||
7715 | |||
7716 | period = (u64)cfs_period_us * NSEC_PER_USEC; | ||
7717 | quota = tg->cfs_bandwidth.quota; | ||
7718 | |||
7719 | return tg_set_cfs_bandwidth(tg, period, quota); | ||
7720 | } | ||
7721 | |||
7722 | long tg_get_cfs_period(struct task_group *tg) | ||
7723 | { | ||
7724 | u64 cfs_period_us; | ||
7725 | |||
7726 | cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); | ||
7727 | do_div(cfs_period_us, NSEC_PER_USEC); | ||
7728 | |||
7729 | return cfs_period_us; | ||
7730 | } | ||
7731 | |||
7732 | static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) | ||
7733 | { | ||
7734 | return tg_get_cfs_quota(cgroup_tg(cgrp)); | ||
7735 | } | ||
7736 | |||
7737 | static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, | ||
7738 | s64 cfs_quota_us) | ||
7739 | { | ||
7740 | return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); | ||
7741 | } | ||
7742 | |||
7743 | static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) | ||
7744 | { | ||
7745 | return tg_get_cfs_period(cgroup_tg(cgrp)); | ||
7746 | } | ||
7747 | |||
7748 | static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, | ||
7749 | u64 cfs_period_us) | ||
7750 | { | ||
7751 | return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); | ||
7752 | } | ||
7753 | |||
7754 | struct cfs_schedulable_data { | ||
7755 | struct task_group *tg; | ||
7756 | u64 period, quota; | ||
7757 | }; | ||
7758 | |||
7759 | /* | ||
7760 | * normalize group quota/period to be quota/max_period | ||
7761 | * note: units are usecs | ||
7762 | */ | ||
7763 | static u64 normalize_cfs_quota(struct task_group *tg, | ||
7764 | struct cfs_schedulable_data *d) | ||
7765 | { | ||
7766 | u64 quota, period; | ||
7767 | |||
7768 | if (tg == d->tg) { | ||
7769 | period = d->period; | ||
7770 | quota = d->quota; | ||
7771 | } else { | ||
7772 | period = tg_get_cfs_period(tg); | ||
7773 | quota = tg_get_cfs_quota(tg); | ||
7774 | } | ||
7775 | |||
7776 | /* note: these should typically be equivalent */ | ||
7777 | if (quota == RUNTIME_INF || quota == -1) | ||
7778 | return RUNTIME_INF; | ||
7779 | |||
7780 | return to_ratio(period, quota); | ||
7781 | } | ||
7782 | |||
7783 | static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | ||
7784 | { | ||
7785 | struct cfs_schedulable_data *d = data; | ||
7786 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | ||
7787 | s64 quota = 0, parent_quota = -1; | ||
7788 | |||
7789 | if (!tg->parent) { | ||
7790 | quota = RUNTIME_INF; | ||
7791 | } else { | ||
7792 | struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; | ||
7793 | |||
7794 | quota = normalize_cfs_quota(tg, d); | ||
7795 | parent_quota = parent_b->hierarchal_quota; | ||
7796 | |||
7797 | /* | ||
7798 | * ensure max(child_quota) <= parent_quota, inherit when no | ||
7799 | * limit is set | ||
7800 | */ | ||
7801 | if (quota == RUNTIME_INF) | ||
7802 | quota = parent_quota; | ||
7803 | else if (parent_quota != RUNTIME_INF && quota > parent_quota) | ||
7804 | return -EINVAL; | ||
7805 | } | ||
7806 | cfs_b->hierarchal_quota = quota; | ||
7807 | |||
7808 | return 0; | ||
7809 | } | ||
7810 | |||
7811 | static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) | ||
7812 | { | ||
7813 | int ret; | ||
7814 | struct cfs_schedulable_data data = { | ||
7815 | .tg = tg, | ||
7816 | .period = period, | ||
7817 | .quota = quota, | ||
7818 | }; | ||
7819 | |||
7820 | if (quota != RUNTIME_INF) { | ||
7821 | do_div(data.period, NSEC_PER_USEC); | ||
7822 | do_div(data.quota, NSEC_PER_USEC); | ||
7823 | } | ||
7824 | |||
7825 | rcu_read_lock(); | ||
7826 | ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); | ||
7827 | rcu_read_unlock(); | ||
7828 | |||
7829 | return ret; | ||
7830 | } | ||
7831 | |||
7832 | static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
7833 | struct cgroup_map_cb *cb) | ||
7834 | { | ||
7835 | struct task_group *tg = cgroup_tg(cgrp); | ||
7836 | struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; | ||
7837 | |||
7838 | cb->fill(cb, "nr_periods", cfs_b->nr_periods); | ||
7839 | cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); | ||
7840 | cb->fill(cb, "throttled_time", cfs_b->throttled_time); | ||
7841 | |||
7842 | return 0; | ||
7843 | } | ||
7844 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
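Aside: the feasibility walk above (__cfs_schedulable() via tg_cfs_schedulable_down()) boils down to "a child's normalized quota/period ratio may not exceed its parent's, and an unlimited child inherits the parent's limit". A deliberately simplified standalone model of that comparison; the single parent/child shape and microsecond inputs are assumptions for illustration, not the kernel's exact walk:

    #include <stdio.h>
    #include <stdint.h>

    #define QUOTA_INF ((int64_t)-1)

    /* Compare quota/period ratios in 20-bit fixed point, as to_ratio() does. */
    static int child_quota_ok(int64_t parent_quota, int64_t parent_period,
                              int64_t child_quota, int64_t child_period)
    {
        if (child_quota == QUOTA_INF)   /* unlimited child inherits parent limit */
            return 1;
        if (parent_quota == QUOTA_INF)  /* unlimited parent constrains nothing */
            return 1;
        return (child_quota << 20) / child_period <=
               (parent_quota << 20) / parent_period;
    }

    int main(void)
    {
        /* parent: 200ms per 100ms period (2 CPUs); child wants 50ms per 100ms */
        printf("%d\n", child_quota_ok(200000, 100000, 50000, 100000));  /* 1 */
        /* child wants 300ms per 100ms (3 CPUs), exceeding the parent */
        printf("%d\n", child_quota_ok(200000, 100000, 300000, 100000)); /* 0 */
        return 0;
    }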
9004 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7845 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
9005 | 7846 | ||
9006 | #ifdef CONFIG_RT_GROUP_SCHED | 7847 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -9035,6 +7876,22 @@ static struct cftype cpu_files[] = { | |||
9035 | .write_u64 = cpu_shares_write_u64, | 7876 | .write_u64 = cpu_shares_write_u64, |
9036 | }, | 7877 | }, |
9037 | #endif | 7878 | #endif |
7879 | #ifdef CONFIG_CFS_BANDWIDTH | ||
7880 | { | ||
7881 | .name = "cfs_quota_us", | ||
7882 | .read_s64 = cpu_cfs_quota_read_s64, | ||
7883 | .write_s64 = cpu_cfs_quota_write_s64, | ||
7884 | }, | ||
7885 | { | ||
7886 | .name = "cfs_period_us", | ||
7887 | .read_u64 = cpu_cfs_period_read_u64, | ||
7888 | .write_u64 = cpu_cfs_period_write_u64, | ||
7889 | }, | ||
7890 | { | ||
7891 | .name = "stat", | ||
7892 | .read_map = cpu_stats_show, | ||
7893 | }, | ||
7894 | #endif | ||
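Aside: the three cftype entries added above surface as cpu.cfs_quota_us, cpu.cfs_period_us and cpu.stat inside each cpu-cgroup directory. A hedged userspace sketch of capping a group at half a CPU; the cgroup mount point and group path are assumptions about a typical setup, not something this patch defines:

    #include <stdio.h>

    static int write_val(const char *dir, const char *file, long val)
    {
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path), "%s/%s", dir, file);
        f = fopen(path, "w");
        if (!f)
            return -1;
        fprintf(f, "%ld\n", val);
        return fclose(f);
    }

    int main(void)
    {
        const char *grp = "/sys/fs/cgroup/cpu/demo"; /* assumed mount/group */

        /* allow 50ms of runtime every 100ms period, i.e. half a CPU */
        write_val(grp, "cpu.cfs_period_us", 100000);
        write_val(grp, "cpu.cfs_quota_us", 50000);
        return 0;
    }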
9038 | #ifdef CONFIG_RT_GROUP_SCHED | 7895 | #ifdef CONFIG_RT_GROUP_SCHED |
9039 | { | 7896 | { |
9040 | .name = "rt_runtime_us", | 7897 | .name = "rt_runtime_us", |
@@ -9058,8 +7915,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9058 | .name = "cpu", | 7915 | .name = "cpu", |
9059 | .create = cpu_cgroup_create, | 7916 | .create = cpu_cgroup_create, |
9060 | .destroy = cpu_cgroup_destroy, | 7917 | .destroy = cpu_cgroup_destroy, |
9061 | .can_attach_task = cpu_cgroup_can_attach_task, | 7918 | .can_attach = cpu_cgroup_can_attach, |
9062 | .attach_task = cpu_cgroup_attach_task, | 7919 | .attach = cpu_cgroup_attach, |
9063 | .exit = cpu_cgroup_exit, | 7920 | .exit = cpu_cgroup_exit, |
9064 | .populate = cpu_cgroup_populate, | 7921 | .populate = cpu_cgroup_populate, |
9065 | .subsys_id = cpu_cgroup_subsys_id, | 7922 | .subsys_id = cpu_cgroup_subsys_id, |
@@ -9077,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
9077 | * (balbir@in.ibm.com). | 7934 | * (balbir@in.ibm.com). |
9078 | */ | 7935 | */ |
9079 | 7936 | ||
9080 | /* track cpu usage of a group of tasks and its child groups */ | ||
9081 | struct cpuacct { | ||
9082 | struct cgroup_subsys_state css; | ||
9083 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
9084 | u64 __percpu *cpuusage; | ||
9085 | struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; | ||
9086 | struct cpuacct *parent; | ||
9087 | }; | ||
9088 | |||
9089 | struct cgroup_subsys cpuacct_subsys; | ||
9090 | |||
9091 | /* return cpu accounting group corresponding to this container */ | ||
9092 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
9093 | { | ||
9094 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
9095 | struct cpuacct, css); | ||
9096 | } | ||
9097 | |||
9098 | /* return cpu accounting group to which this task belongs */ | ||
9099 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
9100 | { | ||
9101 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
9102 | struct cpuacct, css); | ||
9103 | } | ||
9104 | |||
9105 | /* create a new cpu accounting group */ | 7937 | /* create a new cpu accounting group */ |
9106 | static struct cgroup_subsys_state *cpuacct_create( | 7938 | static struct cgroup_subsys_state *cpuacct_create( |
9107 | struct cgroup_subsys *ss, struct cgroup *cgrp) | 7939 | struct cgroup_subsys *ss, struct cgroup *cgrp) |
9108 | { | 7940 | { |
9109 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | 7941 | struct cpuacct *ca; |
9110 | int i; | 7942 | |
7943 | if (!cgrp->parent) | ||
7944 | return &root_cpuacct.css; | ||
9111 | 7945 | ||
7946 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
9112 | if (!ca) | 7947 | if (!ca) |
9113 | goto out; | 7948 | goto out; |
9114 | 7949 | ||
@@ -9116,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create( | |||
9116 | if (!ca->cpuusage) | 7951 | if (!ca->cpuusage) |
9117 | goto out_free_ca; | 7952 | goto out_free_ca; |
9118 | 7953 | ||
9119 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7954 | ca->cpustat = alloc_percpu(struct kernel_cpustat); |
9120 | if (percpu_counter_init(&ca->cpustat[i], 0)) | 7955 | if (!ca->cpustat) |
9121 | goto out_free_counters; | 7956 | goto out_free_cpuusage; |
9122 | |||
9123 | if (cgrp->parent) | ||
9124 | ca->parent = cgroup_ca(cgrp->parent); | ||
9125 | 7957 | ||
9126 | return &ca->css; | 7958 | return &ca->css; |
9127 | 7959 | ||
9128 | out_free_counters: | 7960 | out_free_cpuusage: |
9129 | while (--i >= 0) | ||
9130 | percpu_counter_destroy(&ca->cpustat[i]); | ||
9131 | free_percpu(ca->cpuusage); | 7961 | free_percpu(ca->cpuusage); |
9132 | out_free_ca: | 7962 | out_free_ca: |
9133 | kfree(ca); | 7963 | kfree(ca); |
@@ -9140,10 +7970,8 @@ static void | |||
9140 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | 7970 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
9141 | { | 7971 | { |
9142 | struct cpuacct *ca = cgroup_ca(cgrp); | 7972 | struct cpuacct *ca = cgroup_ca(cgrp); |
9143 | int i; | ||
9144 | 7973 | ||
9145 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) | 7974 | free_percpu(ca->cpustat); |
9146 | percpu_counter_destroy(&ca->cpustat[i]); | ||
9147 | free_percpu(ca->cpuusage); | 7975 | free_percpu(ca->cpuusage); |
9148 | kfree(ca); | 7976 | kfree(ca); |
9149 | } | 7977 | } |
@@ -9236,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = { | |||
9236 | }; | 8064 | }; |
9237 | 8065 | ||
9238 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | 8066 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, |
9239 | struct cgroup_map_cb *cb) | 8067 | struct cgroup_map_cb *cb) |
9240 | { | 8068 | { |
9241 | struct cpuacct *ca = cgroup_ca(cgrp); | 8069 | struct cpuacct *ca = cgroup_ca(cgrp); |
9242 | int i; | 8070 | int cpu; |
8071 | s64 val = 0; | ||
8072 | |||
8073 | for_each_online_cpu(cpu) { | ||
8074 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8075 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
8076 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
8077 | } | ||
8078 | val = cputime64_to_clock_t(val); | ||
8079 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
9243 | 8080 | ||
9244 | for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { | 8081 | val = 0; |
9245 | s64 val = percpu_counter_read(&ca->cpustat[i]); | 8082 | for_each_online_cpu(cpu) { |
9246 | val = cputime64_to_clock_t(val); | 8083 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); |
9247 | cb->fill(cb, cpuacct_stat_desc[i], val); | 8084 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; |
8085 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
8086 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
9248 | } | 8087 | } |
8088 | |||
8089 | val = cputime64_to_clock_t(val); | ||
8090 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
8091 | |||
9249 | return 0; | 8092 | return 0; |
9250 | } | 8093 | } |
9251 | 8094 | ||
@@ -9275,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
9275 | * | 8118 | * |
9276 | * called with rq->lock held. | 8119 | * called with rq->lock held. |
9277 | */ | 8120 | */ |
9278 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | 8121 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) |
9279 | { | 8122 | { |
9280 | struct cpuacct *ca; | 8123 | struct cpuacct *ca; |
9281 | int cpu; | 8124 | int cpu; |
@@ -9289,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
9289 | 8132 | ||
9290 | ca = task_ca(tsk); | 8133 | ca = task_ca(tsk); |
9291 | 8134 | ||
9292 | for (; ca; ca = ca->parent) { | 8135 | for (; ca; ca = parent_ca(ca)) { |
9293 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | 8136 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); |
9294 | *cpuusage += cputime; | 8137 | *cpuusage += cputime; |
9295 | } | 8138 | } |
@@ -9297,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | |||
9297 | rcu_read_unlock(); | 8140 | rcu_read_unlock(); |
9298 | } | 8141 | } |
9299 | 8142 | ||
9300 | /* | ||
9301 | * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large | ||
9302 | * in cputime_t units. As a result, cpuacct_update_stats calls | ||
9303 | * percpu_counter_add with values large enough to always overflow the | ||
9304 | * per cpu batch limit causing bad SMP scalability. | ||
9305 | * | ||
9306 | * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we | ||
9307 | * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled | ||
9308 | * and enabled. We cap it at INT_MAX which is the largest allowed batch value. | ||
9309 | */ | ||
9310 | #ifdef CONFIG_SMP | ||
9311 | #define CPUACCT_BATCH \ | ||
9312 | min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) | ||
9313 | #else | ||
9314 | #define CPUACCT_BATCH 0 | ||
9315 | #endif | ||
9316 | |||
9317 | /* | ||
9318 | * Charge the system/user time to the task's accounting group. | ||
9319 | */ | ||
9320 | static void cpuacct_update_stats(struct task_struct *tsk, | ||
9321 | enum cpuacct_stat_index idx, cputime_t val) | ||
9322 | { | ||
9323 | struct cpuacct *ca; | ||
9324 | int batch = CPUACCT_BATCH; | ||
9325 | |||
9326 | if (unlikely(!cpuacct_subsys.active)) | ||
9327 | return; | ||
9328 | |||
9329 | rcu_read_lock(); | ||
9330 | ca = task_ca(tsk); | ||
9331 | |||
9332 | do { | ||
9333 | __percpu_counter_add(&ca->cpustat[idx], val, batch); | ||
9334 | ca = ca->parent; | ||
9335 | } while (ca); | ||
9336 | rcu_read_unlock(); | ||
9337 | } | ||
9338 | |||
9339 | struct cgroup_subsys cpuacct_subsys = { | 8143 | struct cgroup_subsys cpuacct_subsys = { |
9340 | .name = "cpuacct", | 8144 | .name = "cpuacct", |
9341 | .create = cpuacct_create, | 8145 | .create = cpuacct_create, |
@@ -9344,4 +8148,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9344 | .subsys_id = cpuacct_subsys_id, | 8148 | .subsys_id = cpuacct_subsys_id, |
9345 | }; | 8149 | }; |
9346 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8150 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9347 | |||
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c index 2722dc1b4138..b0d798eaf130 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched_cpupri.c | 2 | * kernel/sched/cpupri.c |
3 | * | 3 | * |
4 | * CPU priority management | 4 | * CPU priority management |
5 | * | 5 | * |
@@ -28,7 +28,7 @@ | |||
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/gfp.h> | 30 | #include <linux/gfp.h> |
31 | #include "sched_cpupri.h" | 31 | #include "cpupri.h" |
32 | 32 | ||
33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 33 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
34 | static int convert_prio(int prio) | 34 | static int convert_prio(int prio) |
@@ -47,9 +47,6 @@ static int convert_prio(int prio) | |||
47 | return cpupri; | 47 | return cpupri; |
48 | } | 48 | } |
49 | 49 | ||
50 | #define for_each_cpupri_active(array, idx) \ | ||
51 | for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) | ||
52 | |||
53 | /** | 50 | /** |
54 | * cpupri_find - find the best (lowest-pri) CPU in the system | 51 | * cpupri_find - find the best (lowest-pri) CPU in the system |
55 | * @cp: The cpupri context | 52 | * @cp: The cpupri context |
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
71 | int idx = 0; | 68 | int idx = 0; |
72 | int task_pri = convert_prio(p->prio); | 69 | int task_pri = convert_prio(p->prio); |
73 | 70 | ||
74 | for_each_cpupri_active(cp->pri_active, idx) { | 71 | if (task_pri >= MAX_RT_PRIO) |
75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; | 72 | return 0; |
76 | 73 | ||
77 | if (idx >= task_pri) | 74 | for (idx = 0; idx < task_pri; idx++) { |
78 | break; | 75 | struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; |
76 | int skip = 0; | ||
77 | |||
78 | if (!atomic_read(&(vec)->count)) | ||
79 | skip = 1; | ||
80 | /* | ||
81 | * When looking at the vector, we need to read the counter, | ||
82 | * do a memory barrier, then read the mask. | ||
83 | * | ||
84 | * Note: This is still all racy, but we can deal with it. | ||
85 | * Ideally, we only want to look at masks that are set. | ||
86 | * | ||
87 | * If a mask is not set, then the only thing wrong is that we | ||
88 | * did a little more work than necessary. | ||
89 | * | ||
90 | * If we read a zero count but the mask is set, because of the | ||
91 | * memory barriers, that can only happen when the highest prio | ||
92 | * task for a run queue has left the run queue, in which case, | ||
93 | * it will be followed by a pull. If the task we are processing | ||
94 | * fails to find a proper place to go, that pull request will | ||
95 | * pull this task if the run queue is running at a lower | ||
96 | * priority. | ||
97 | */ | ||
98 | smp_rmb(); | ||
99 | |||
100 | /* Need to do the rmb for every iteration */ | ||
101 | if (skip) | ||
102 | continue; | ||
79 | 103 | ||
80 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) | 104 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) |
81 | continue; | 105 | continue; |
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
115 | { | 139 | { |
116 | int *currpri = &cp->cpu_to_pri[cpu]; | 140 | int *currpri = &cp->cpu_to_pri[cpu]; |
117 | int oldpri = *currpri; | 141 | int oldpri = *currpri; |
118 | unsigned long flags; | 142 | int do_mb = 0; |
119 | 143 | ||
120 | newpri = convert_prio(newpri); | 144 | newpri = convert_prio(newpri); |
121 | 145 | ||
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
128 | * If the cpu was currently mapped to a different value, we | 152 | * If the cpu was currently mapped to a different value, we |
129 | * need to map it to the new value then remove the old value. | 153 | * need to map it to the new value then remove the old value. |
130 | * Note, we must add the new value first, otherwise we risk the | 154 | * Note, we must add the new value first, otherwise we risk the |
131 | * cpu being cleared from pri_active, and this cpu could be | 155 | * cpu being missed by the priority loop in cpupri_find. |
132 | * missed for a push or pull. | ||
133 | */ | 156 | */ |
134 | if (likely(newpri != CPUPRI_INVALID)) { | 157 | if (likely(newpri != CPUPRI_INVALID)) { |
135 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; | 158 | struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; |
136 | 159 | ||
137 | raw_spin_lock_irqsave(&vec->lock, flags); | ||
138 | |||
139 | cpumask_set_cpu(cpu, vec->mask); | 160 | cpumask_set_cpu(cpu, vec->mask); |
140 | vec->count++; | 161 | /* |
141 | if (vec->count == 1) | 162 | * When adding a new vector, we update the mask first, |
142 | set_bit(newpri, cp->pri_active); | 163 | * do a write memory barrier, and then update the count, to |
143 | 164 | * make sure the vector is visible when count is set. | |
144 | raw_spin_unlock_irqrestore(&vec->lock, flags); | 165 | */ |
166 | smp_mb__before_atomic_inc(); | ||
167 | atomic_inc(&(vec)->count); | ||
168 | do_mb = 1; | ||
145 | } | 169 | } |
146 | if (likely(oldpri != CPUPRI_INVALID)) { | 170 | if (likely(oldpri != CPUPRI_INVALID)) { |
147 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; | 171 | struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; |
148 | 172 | ||
149 | raw_spin_lock_irqsave(&vec->lock, flags); | 173 | /* |
150 | 174 | * Because the order of modification of the vec->count | |
151 | vec->count--; | 175 | * is important, we must make sure that the update |
152 | if (!vec->count) | 176 | * of the new prio is seen before we decrement the |
153 | clear_bit(oldpri, cp->pri_active); | 177 | * old prio. This makes sure that the loop sees |
178 | * one or the other when we raise the priority of | ||
179 | * the run queue. We don't care about when we lower the | ||
180 | * priority, as that will trigger an rt pull anyway. | ||
181 | * | ||
182 | * We only need to do a memory barrier if we updated | ||
183 | * the new priority vec. | ||
184 | */ | ||
185 | if (do_mb) | ||
186 | smp_mb__after_atomic_inc(); | ||
187 | |||
188 | /* | ||
189 | * When removing from the vector, we decrement the counter first, | ||
190 | * do a memory barrier and then clear the mask. | ||
191 | */ | ||
192 | atomic_dec(&(vec)->count); | ||
193 | smp_mb__after_atomic_inc(); | ||
154 | cpumask_clear_cpu(cpu, vec->mask); | 194 | cpumask_clear_cpu(cpu, vec->mask); |
155 | |||
156 | raw_spin_unlock_irqrestore(&vec->lock, flags); | ||
157 | } | 195 | } |
158 | 196 | ||
159 | *currpri = newpri; | 197 | *currpri = newpri; |
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp) | |||
175 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { | 213 | for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { |
176 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; | 214 | struct cpupri_vec *vec = &cp->pri_to_cpu[i]; |
177 | 215 | ||
178 | raw_spin_lock_init(&vec->lock); | 216 | atomic_set(&vec->count, 0); |
179 | vec->count = 0; | ||
180 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) | 217 | if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) |
181 | goto cleanup; | 218 | goto cleanup; |
182 | } | 219 | } |
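Aside: the cpupri rework above drops the per-vector raw spinlock in favour of an atomic count plus explicit barriers. The contract spelled out in the comments is: set the mask before bumping the count when adding, drop the count before clearing the mask when removing, and have the reader order its count read before its mask scan. A loose userspace model using C11 atomics; the kernel's smp_mb*() primitives are full barriers, so the acquire/release fences here are a simplifying assumption:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct vec_model {
        atomic_int count;
        atomic_ulong mask;  /* one bit per cpu, stand-in for cpumask_var_t */
    };

    static void vec_add(struct vec_model *v, int cpu)
    {
        atomic_fetch_or_explicit(&v->mask, 1UL << cpu, memory_order_relaxed);
        atomic_thread_fence(memory_order_release); /* ~smp_mb__before_atomic_inc() */
        atomic_fetch_add_explicit(&v->count, 1, memory_order_relaxed);
    }

    static void vec_del(struct vec_model *v, int cpu)
    {
        atomic_fetch_sub_explicit(&v->count, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_release); /* ~smp_mb__after_atomic_inc() */
        atomic_fetch_and_explicit(&v->mask, ~(1UL << cpu), memory_order_relaxed);
    }

    static bool vec_scan(struct vec_model *v, int cpu)
    {
        bool empty = !atomic_load_explicit(&v->count, memory_order_relaxed);

        atomic_thread_fence(memory_order_acquire); /* ~smp_rmb(), every pass */
        if (empty)
            return false;
        return atomic_load_explicit(&v->mask, memory_order_relaxed) & (1UL << cpu);
    }

    int main(void)
    {
        struct vec_model v;

        atomic_init(&v.count, 0);
        atomic_init(&v.mask, 0);
        vec_add(&v, 3);
        printf("cpu3 eligible: %d\n", vec_scan(&v, 3)); /* 1 */
        vec_del(&v, 3);
        printf("cpu3 eligible: %d\n", vec_scan(&v, 3)); /* 0 */
        return 0;
    }

The worst case the comments allow for, reading a stale combination of count and mask, only costs a wasted scan or a later rt pull, which is why the lock could be removed.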
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h index 9fc7d386fea4..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched/cpupri.h | |||
@@ -4,7 +4,6 @@ | |||
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | 5 | ||
6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 6 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
7 | #define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) | ||
8 | 7 | ||
9 | #define CPUPRI_INVALID -1 | 8 | #define CPUPRI_INVALID -1 |
10 | #define CPUPRI_IDLE 0 | 9 | #define CPUPRI_IDLE 0 |
@@ -12,14 +11,12 @@ | |||
12 | /* values 2-101 are RT priorities 0-99 */ | 11 | /* values 2-101 are RT priorities 0-99 */ |
13 | 12 | ||
14 | struct cpupri_vec { | 13 | struct cpupri_vec { |
15 | raw_spinlock_t lock; | 14 | atomic_t count; |
16 | int count; | 15 | cpumask_var_t mask; |
17 | cpumask_var_t mask; | ||
18 | }; | 16 | }; |
19 | 17 | ||
20 | struct cpupri { | 18 | struct cpupri { |
21 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 19 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
22 | long pri_active[CPUPRI_NR_PRI_WORDS]; | ||
23 | int cpu_to_pri[NR_CPUS]; | 20 | int cpu_to_pri[NR_CPUS]; |
24 | }; | 21 | }; |
25 | 22 | ||
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c index a6710a112b4f..2a075e10004b 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched/debug.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/time/sched_debug.c | 2 | * kernel/sched/debug.c |
3 | * | 3 | * |
4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree |
5 | * | 5 | * |
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/kallsyms.h> | 16 | #include <linux/kallsyms.h> |
17 | #include <linux/utsname.h> | 17 | #include <linux/utsname.h> |
18 | 18 | ||
19 | #include "sched.h" | ||
20 | |||
19 | static DEFINE_SPINLOCK(sched_debug_lock); | 21 | static DEFINE_SPINLOCK(sched_debug_lock); |
20 | 22 | ||
21 | /* | 23 | /* |
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
373 | return 0; | 375 | return 0; |
374 | } | 376 | } |
375 | 377 | ||
376 | static void sysrq_sched_debug_show(void) | 378 | void sysrq_sched_debug_show(void) |
377 | { | 379 | { |
378 | sched_debug_show(NULL, NULL); | 380 | sched_debug_show(NULL, NULL); |
379 | } | 381 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c index bc8ee9993814..84adb2d66cbd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched/fair.c | |||
@@ -23,6 +23,13 @@ | |||
23 | #include <linux/latencytop.h> | 23 | #include <linux/latencytop.h> |
24 | #include <linux/sched.h> | 24 | #include <linux/sched.h> |
25 | #include <linux/cpumask.h> | 25 | #include <linux/cpumask.h> |
26 | #include <linux/slab.h> | ||
27 | #include <linux/profile.h> | ||
28 | #include <linux/interrupt.h> | ||
29 | |||
30 | #include <trace/events/sched.h> | ||
31 | |||
32 | #include "sched.h" | ||
26 | 33 | ||
27 | /* | 34 | /* |
28 | * Targeted preemption latency for CPU-bound tasks: | 35 | * Targeted preemption latency for CPU-bound tasks: |
@@ -89,7 +96,124 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
89 | */ | 96 | */ |
90 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | 97 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; |
91 | 98 | ||
92 | static const struct sched_class fair_sched_class; | 99 | #ifdef CONFIG_CFS_BANDWIDTH |
100 | /* | ||
101 | * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool | ||
102 | * each time a cfs_rq requests quota. | ||
103 | * | ||
104 | * Note: in the case that the slice exceeds the runtime remaining (either due | ||
105 | * to consumption or the quota being specified to be smaller than the slice) | ||
106 | * we will always only issue the remaining available time. | ||
107 | * | ||
108 | * default: 5 msec, units: microseconds | ||
109 | */ | ||
110 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | ||
111 | #endif | ||
112 | |||
113 | /* | ||
114 | * Increase the granularity value when there are more CPUs, | ||
115 | * because with more CPUs the 'effective latency' as visible | ||
116 | * to users decreases. But the relationship is not linear, | ||
117 | * so pick a second-best guess by going with the log2 of the | ||
118 | * number of CPUs. | ||
119 | * | ||
120 | * This idea comes from the SD scheduler of Con Kolivas: | ||
121 | */ | ||
122 | static int get_update_sysctl_factor(void) | ||
123 | { | ||
124 | unsigned int cpus = min_t(int, num_online_cpus(), 8); | ||
125 | unsigned int factor; | ||
126 | |||
127 | switch (sysctl_sched_tunable_scaling) { | ||
128 | case SCHED_TUNABLESCALING_NONE: | ||
129 | factor = 1; | ||
130 | break; | ||
131 | case SCHED_TUNABLESCALING_LINEAR: | ||
132 | factor = cpus; | ||
133 | break; | ||
134 | case SCHED_TUNABLESCALING_LOG: | ||
135 | default: | ||
136 | factor = 1 + ilog2(cpus); | ||
137 | break; | ||
138 | } | ||
139 | |||
140 | return factor; | ||
141 | } | ||
142 | |||
143 | static void update_sysctl(void) | ||
144 | { | ||
145 | unsigned int factor = get_update_sysctl_factor(); | ||
146 | |||
147 | #define SET_SYSCTL(name) \ | ||
148 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
149 | SET_SYSCTL(sched_min_granularity); | ||
150 | SET_SYSCTL(sched_latency); | ||
151 | SET_SYSCTL(sched_wakeup_granularity); | ||
152 | #undef SET_SYSCTL | ||
153 | } | ||
154 | |||
155 | void sched_init_granularity(void) | ||
156 | { | ||
157 | update_sysctl(); | ||
158 | } | ||
159 | |||
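
With the default SCHED_TUNABLESCALING_LOG policy the factor saturates at 1 + ilog2(8) = 4, so the derived tunables grow only logarithmically with CPU count and stop growing past 8 CPUs. A quick illustration follows; the 6 ms base value is an assumed example, not necessarily the running kernel's default.

/* How SCHED_TUNABLESCALING_LOG scales a base tunable with CPU count. */
#include <stdio.h>

static unsigned int ilog2_u(unsigned int v)
{
    unsigned int l = 0;

    while (v >>= 1)
        l++;
    return l;
}

int main(void)
{
    const unsigned int base_latency_ns = 6000000;   /* assumed base value */

    for (unsigned int cpus = 1; cpus <= 64; cpus *= 2) {
        unsigned int capped = cpus < 8 ? cpus : 8;
        unsigned int factor = 1 + ilog2_u(capped);

        printf("%2u cpus -> factor %u -> latency %u ns\n",
               cpus, factor, factor * base_latency_ns);
    }
    return 0;
}
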
160 | #if BITS_PER_LONG == 32 | ||
161 | # define WMULT_CONST (~0UL) | ||
162 | #else | ||
163 | # define WMULT_CONST (1UL << 32) | ||
164 | #endif | ||
165 | |||
166 | #define WMULT_SHIFT 32 | ||
167 | |||
168 | /* | ||
169 | * Shift right and round: | ||
170 | */ | ||
171 | #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) | ||
172 | |||
173 | /* | ||
174 | * delta *= weight / lw | ||
175 | */ | ||
176 | static unsigned long | ||
177 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | ||
178 | struct load_weight *lw) | ||
179 | { | ||
180 | u64 tmp; | ||
181 | |||
182 | /* | ||
183 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
184 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
185 | * 2^SCHED_LOAD_RESOLUTION. | ||
186 | */ | ||
187 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
188 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
189 | else | ||
190 | tmp = (u64)delta_exec; | ||
191 | |||
192 | if (!lw->inv_weight) { | ||
193 | unsigned long w = scale_load_down(lw->weight); | ||
194 | |||
195 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
196 | lw->inv_weight = 1; | ||
197 | else if (unlikely(!w)) | ||
198 | lw->inv_weight = WMULT_CONST; | ||
199 | else | ||
200 | lw->inv_weight = WMULT_CONST / w; | ||
201 | } | ||
202 | |||
203 | /* | ||
204 | * Check whether we'd overflow the 64-bit multiplication: | ||
205 | */ | ||
206 | if (unlikely(tmp > WMULT_CONST)) | ||
207 | tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, | ||
208 | WMULT_SHIFT/2); | ||
209 | else | ||
210 | tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); | ||
211 | |||
212 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | ||
213 | } | ||
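
As a sanity check of the fixed-point arithmetic, the sketch below re-implements the calculation in userspace with scale_load_down() treated as a no-op (SCHED_LOAD_RESOLUTION == 0) and the 32-bit WMULT_CONST flavour chosen for simplicity. Charging 10 ms of execution to a weight-1024 entity against a runqueue load of 3072 yields roughly a third of the wall time, as expected.

/* Userspace sketch of calc_delta_mine(): delta *= weight / lw->weight. */
#include <stdint.h>
#include <stdio.h>

#define WMULT_CONST  (~0U)      /* 32-bit build flavour */
#define WMULT_SHIFT  32
/* Shift right and round */
#define SRR(x, y)    (((x) + (1ULL << ((y) - 1))) >> (y))

struct load_weight { unsigned long weight; uint32_t inv_weight; };

static unsigned long calc_delta(unsigned long delta_exec,
                                unsigned long weight, struct load_weight *lw)
{
    uint64_t tmp = (uint64_t)delta_exec * weight;

    if (!lw->inv_weight)
        lw->inv_weight = WMULT_CONST / lw->weight;

    if (tmp > WMULT_CONST)      /* avoid overflowing the 64-bit product */
        tmp = SRR(SRR(tmp, WMULT_SHIFT / 2) * lw->inv_weight,
                  WMULT_SHIFT / 2);
    else
        tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

    return (unsigned long)tmp;
}

int main(void)
{
    struct load_weight rq_load = { .weight = 3072 };    /* 3 nice-0 tasks */

    /* 10,000,000 ns of CPU time -> ~3,333,333, i.e. a 1024/3072 share */
    printf("%lu\n", calc_delta(10000000UL, 1024, &rq_load));
    return 0;
}
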
214 | |||
215 | |||
216 | const struct sched_class fair_sched_class; | ||
93 | 217 | ||
94 | /************************************************************** | 218 | /************************************************************** |
95 | * CFS operations on generic schedulable entities: | 219 | * CFS operations on generic schedulable entities: |
@@ -292,6 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) | |||
292 | 416 | ||
293 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 417 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
294 | 418 | ||
419 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
420 | unsigned long delta_exec); | ||
295 | 421 | ||
296 | /************************************************************** | 422 | /************************************************************** |
297 | * Scheduling class tree data structure manipulation methods: | 423 | * Scheduling class tree data structure manipulation methods: |
@@ -397,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
397 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 523 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
398 | } | 524 | } |
399 | 525 | ||
400 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) | 526 | struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
401 | { | 527 | { |
402 | struct rb_node *left = cfs_rq->rb_leftmost; | 528 | struct rb_node *left = cfs_rq->rb_leftmost; |
403 | 529 | ||
@@ -418,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) | |||
418 | } | 544 | } |
419 | 545 | ||
420 | #ifdef CONFIG_SCHED_DEBUG | 546 | #ifdef CONFIG_SCHED_DEBUG |
421 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 547 | struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
422 | { | 548 | { |
423 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 549 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
424 | 550 | ||
@@ -583,6 +709,8 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
583 | cpuacct_charge(curtask, delta_exec); | 709 | cpuacct_charge(curtask, delta_exec); |
584 | account_group_exec_runtime(curtask, delta_exec); | 710 | account_group_exec_runtime(curtask, delta_exec); |
585 | } | 711 | } |
712 | |||
713 | account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
586 | } | 714 | } |
587 | 715 | ||
588 | static inline void | 716 | static inline void |
@@ -666,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
666 | { | 794 | { |
667 | update_load_add(&cfs_rq->load, se->load.weight); | 795 | update_load_add(&cfs_rq->load, se->load.weight); |
668 | if (!parent_entity(se)) | 796 | if (!parent_entity(se)) |
669 | inc_cpu_load(rq_of(cfs_rq), se->load.weight); | 797 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); |
670 | if (entity_is_task(se)) { | 798 | if (entity_is_task(se)) { |
671 | add_cfs_task_weight(cfs_rq, se->load.weight); | 799 | add_cfs_task_weight(cfs_rq, se->load.weight); |
672 | list_add(&se->group_node, &cfs_rq->tasks); | 800 | list_add(&se->group_node, &cfs_rq->tasks); |
@@ -679,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
679 | { | 807 | { |
680 | update_load_sub(&cfs_rq->load, se->load.weight); | 808 | update_load_sub(&cfs_rq->load, se->load.weight); |
681 | if (!parent_entity(se)) | 809 | if (!parent_entity(se)) |
682 | dec_cpu_load(rq_of(cfs_rq), se->load.weight); | 810 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); |
683 | if (entity_is_task(se)) { | 811 | if (entity_is_task(se)) { |
684 | add_cfs_task_weight(cfs_rq, -se->load.weight); | 812 | add_cfs_task_weight(cfs_rq, -se->load.weight); |
685 | list_del_init(&se->group_node); | 813 | list_del_init(&se->group_node); |
@@ -688,6 +816,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
688 | } | 816 | } |
689 | 817 | ||
690 | #ifdef CONFIG_FAIR_GROUP_SCHED | 818 | #ifdef CONFIG_FAIR_GROUP_SCHED |
819 | /* we need this in update_cfs_load and load-balance functions below */ | ||
820 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | ||
691 | # ifdef CONFIG_SMP | 821 | # ifdef CONFIG_SMP |
692 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, | 822 | static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, |
693 | int global_update) | 823 | int global_update) |
@@ -710,7 +840,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
710 | u64 now, delta; | 840 | u64 now, delta; |
711 | unsigned long load = cfs_rq->load.weight; | 841 | unsigned long load = cfs_rq->load.weight; |
712 | 842 | ||
713 | if (cfs_rq->tg == &root_task_group) | 843 | if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) |
714 | return; | 844 | return; |
715 | 845 | ||
716 | now = rq_of(cfs_rq)->clock_task; | 846 | now = rq_of(cfs_rq)->clock_task; |
@@ -752,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
752 | list_del_leaf_cfs_rq(cfs_rq); | 882 | list_del_leaf_cfs_rq(cfs_rq); |
753 | } | 883 | } |
754 | 884 | ||
885 | static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) | ||
886 | { | ||
887 | long tg_weight; | ||
888 | |||
889 | /* | ||
890 | * Use this CPU's actual weight instead of the last load_contribution | ||
891 | * to gain a more accurate current total weight. See | ||
892 | * update_cfs_rq_load_contribution(). | ||
893 | */ | ||
894 | tg_weight = atomic_read(&tg->load_weight); | ||
895 | tg_weight -= cfs_rq->load_contribution; | ||
896 | tg_weight += cfs_rq->load.weight; | ||
897 | |||
898 | return tg_weight; | ||
899 | } | ||
900 | |||
755 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | 901 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
756 | { | 902 | { |
757 | long load_weight, load, shares; | 903 | long tg_weight, load, shares; |
758 | 904 | ||
905 | tg_weight = calc_tg_weight(tg, cfs_rq); | ||
759 | load = cfs_rq->load.weight; | 906 | load = cfs_rq->load.weight; |
760 | 907 | ||
761 | load_weight = atomic_read(&tg->load_weight); | ||
762 | load_weight += load; | ||
763 | load_weight -= cfs_rq->load_contribution; | ||
764 | |||
765 | shares = (tg->shares * load); | 908 | shares = (tg->shares * load); |
766 | if (load_weight) | 909 | if (tg_weight) |
767 | shares /= load_weight; | 910 | shares /= tg_weight; |
768 | 911 | ||
769 | if (shares < MIN_SHARES) | 912 | if (shares < MIN_SHARES) |
770 | shares = MIN_SHARES; | 913 | shares = MIN_SHARES; |
@@ -819,7 +962,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
819 | 962 | ||
820 | tg = cfs_rq->tg; | 963 | tg = cfs_rq->tg; |
821 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 964 | se = tg->se[cpu_of(rq_of(cfs_rq))]; |
822 | if (!se) | 965 | if (!se || throttled_hierarchy(cfs_rq)) |
823 | return; | 966 | return; |
824 | #ifndef CONFIG_SMP | 967 | #ifndef CONFIG_SMP |
825 | if (likely(se->load.weight == tg->shares)) | 968 | if (likely(se->load.weight == tg->shares)) |
@@ -860,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
860 | if (unlikely(delta > se->statistics.sleep_max)) | 1003 | if (unlikely(delta > se->statistics.sleep_max)) |
861 | se->statistics.sleep_max = delta; | 1004 | se->statistics.sleep_max = delta; |
862 | 1005 | ||
863 | se->statistics.sleep_start = 0; | ||
864 | se->statistics.sum_sleep_runtime += delta; | 1006 | se->statistics.sum_sleep_runtime += delta; |
865 | 1007 | ||
866 | if (tsk) { | 1008 | if (tsk) { |
@@ -877,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
877 | if (unlikely(delta > se->statistics.block_max)) | 1019 | if (unlikely(delta > se->statistics.block_max)) |
878 | se->statistics.block_max = delta; | 1020 | se->statistics.block_max = delta; |
879 | 1021 | ||
880 | se->statistics.block_start = 0; | ||
881 | se->statistics.sum_sleep_runtime += delta; | 1022 | se->statistics.sum_sleep_runtime += delta; |
882 | 1023 | ||
883 | if (tsk) { | 1024 | if (tsk) { |
@@ -887,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
887 | trace_sched_stat_iowait(tsk, delta); | 1028 | trace_sched_stat_iowait(tsk, delta); |
888 | } | 1029 | } |
889 | 1030 | ||
1031 | trace_sched_stat_blocked(tsk, delta); | ||
1032 | |||
890 | /* | 1033 | /* |
891 | * Blocking time is in units of nanosecs, so shift by | 1034 | * Blocking time is in units of nanosecs, so shift by |
892 | * 20 to get a milliseconds-range estimation of the | 1035 | * 20 to get a milliseconds-range estimation of the |
@@ -950,6 +1093,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
950 | se->vruntime = vruntime; | 1093 | se->vruntime = vruntime; |
951 | } | 1094 | } |
952 | 1095 | ||
1096 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq); | ||
1097 | |||
953 | static void | 1098 | static void |
954 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 1099 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
955 | { | 1100 | { |
@@ -979,8 +1124,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
979 | __enqueue_entity(cfs_rq, se); | 1124 | __enqueue_entity(cfs_rq, se); |
980 | se->on_rq = 1; | 1125 | se->on_rq = 1; |
981 | 1126 | ||
982 | if (cfs_rq->nr_running == 1) | 1127 | if (cfs_rq->nr_running == 1) { |
983 | list_add_leaf_cfs_rq(cfs_rq); | 1128 | list_add_leaf_cfs_rq(cfs_rq); |
1129 | check_enqueue_throttle(cfs_rq); | ||
1130 | } | ||
984 | } | 1131 | } |
985 | 1132 | ||
986 | static void __clear_buddies_last(struct sched_entity *se) | 1133 | static void __clear_buddies_last(struct sched_entity *se) |
@@ -1028,6 +1175,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1028 | __clear_buddies_skip(se); | 1175 | __clear_buddies_skip(se); |
1029 | } | 1176 | } |
1030 | 1177 | ||
1178 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1179 | |||
1031 | static void | 1180 | static void |
1032 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | 1181 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) |
1033 | { | 1182 | { |
@@ -1066,6 +1215,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1066 | if (!(flags & DEQUEUE_SLEEP)) | 1215 | if (!(flags & DEQUEUE_SLEEP)) |
1067 | se->vruntime -= cfs_rq->min_vruntime; | 1216 | se->vruntime -= cfs_rq->min_vruntime; |
1068 | 1217 | ||
1218 | /* return excess runtime on last dequeue */ | ||
1219 | return_cfs_rq_runtime(cfs_rq); | ||
1220 | |||
1069 | update_min_vruntime(cfs_rq); | 1221 | update_min_vruntime(cfs_rq); |
1070 | update_cfs_shares(cfs_rq); | 1222 | update_cfs_shares(cfs_rq); |
1071 | } | 1223 | } |
@@ -1077,6 +1229,8 @@ static void | |||
1077 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 1229 | check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
1078 | { | 1230 | { |
1079 | unsigned long ideal_runtime, delta_exec; | 1231 | unsigned long ideal_runtime, delta_exec; |
1232 | struct sched_entity *se; | ||
1233 | s64 delta; | ||
1080 | 1234 | ||
1081 | ideal_runtime = sched_slice(cfs_rq, curr); | 1235 | ideal_runtime = sched_slice(cfs_rq, curr); |
1082 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 1236 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
@@ -1095,22 +1249,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
1095 | * narrow margin doesn't have to wait for a full slice. | 1249 | * narrow margin doesn't have to wait for a full slice. |
1096 | * This also mitigates buddy induced latencies under load. | 1250 | * This also mitigates buddy induced latencies under load. |
1097 | */ | 1251 | */ |
1098 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1099 | return; | ||
1100 | |||
1101 | if (delta_exec < sysctl_sched_min_granularity) | 1252 | if (delta_exec < sysctl_sched_min_granularity) |
1102 | return; | 1253 | return; |
1103 | 1254 | ||
1104 | if (cfs_rq->nr_running > 1) { | 1255 | se = __pick_first_entity(cfs_rq); |
1105 | struct sched_entity *se = __pick_first_entity(cfs_rq); | 1256 | delta = curr->vruntime - se->vruntime; |
1106 | s64 delta = curr->vruntime - se->vruntime; | ||
1107 | 1257 | ||
1108 | if (delta < 0) | 1258 | if (delta < 0) |
1109 | return; | 1259 | return; |
1110 | 1260 | ||
1111 | if (delta > ideal_runtime) | 1261 | if (delta > ideal_runtime) |
1112 | resched_task(rq_of(cfs_rq)->curr); | 1262 | resched_task(rq_of(cfs_rq)->curr); |
1113 | } | ||
1114 | } | 1263 | } |
1115 | 1264 | ||
1116 | static void | 1265 | static void |
@@ -1185,6 +1334,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
1185 | return se; | 1334 | return se; |
1186 | } | 1335 | } |
1187 | 1336 | ||
1337 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); | ||
1338 | |||
1188 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | 1339 | static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) |
1189 | { | 1340 | { |
1190 | /* | 1341 | /* |
@@ -1194,6 +1345,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1194 | if (prev->on_rq) | 1345 | if (prev->on_rq) |
1195 | update_curr(cfs_rq); | 1346 | update_curr(cfs_rq); |
1196 | 1347 | ||
1348 | /* throttle cfs_rqs exceeding runtime */ | ||
1349 | check_cfs_rq_runtime(cfs_rq); | ||
1350 | |||
1197 | check_spread(cfs_rq, prev); | 1351 | check_spread(cfs_rq, prev); |
1198 | if (prev->on_rq) { | 1352 | if (prev->on_rq) { |
1199 | update_stats_wait_start(cfs_rq, prev); | 1353 | update_stats_wait_start(cfs_rq, prev); |
@@ -1233,10 +1387,742 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1233 | return; | 1387 | return; |
1234 | #endif | 1388 | #endif |
1235 | 1389 | ||
1236 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 1390 | if (cfs_rq->nr_running > 1) |
1237 | check_preempt_tick(cfs_rq, curr); | 1391 | check_preempt_tick(cfs_rq, curr); |
1238 | } | 1392 | } |
1239 | 1393 | ||
1394 | |||
1395 | /************************************************** | ||
1396 | * CFS bandwidth control machinery | ||
1397 | */ | ||
1398 | |||
1399 | #ifdef CONFIG_CFS_BANDWIDTH | ||
1400 | |||
1401 | #ifdef HAVE_JUMP_LABEL | ||
1402 | static struct jump_label_key __cfs_bandwidth_used; | ||
1403 | |||
1404 | static inline bool cfs_bandwidth_used(void) | ||
1405 | { | ||
1406 | return static_branch(&__cfs_bandwidth_used); | ||
1407 | } | ||
1408 | |||
1409 | void account_cfs_bandwidth_used(int enabled, int was_enabled) | ||
1410 | { | ||
1411 | /* only need to count groups transitioning between enabled/!enabled */ | ||
1412 | if (enabled && !was_enabled) | ||
1413 | jump_label_inc(&__cfs_bandwidth_used); | ||
1414 | else if (!enabled && was_enabled) | ||
1415 | jump_label_dec(&__cfs_bandwidth_used); | ||
1416 | } | ||
1417 | #else /* HAVE_JUMP_LABEL */ | ||
1418 | static bool cfs_bandwidth_used(void) | ||
1419 | { | ||
1420 | return true; | ||
1421 | } | ||
1422 | |||
1423 | void account_cfs_bandwidth_used(int enabled, int was_enabled) {} | ||
1424 | #endif /* HAVE_JUMP_LABEL */ | ||
1425 | |||
1426 | /* | ||
1427 | * default period for cfs group bandwidth. | ||
1428 | * default: 0.1s, units: nanoseconds | ||
1429 | */ | ||
1430 | static inline u64 default_cfs_period(void) | ||
1431 | { | ||
1432 | return 100000000ULL; | ||
1433 | } | ||
1434 | |||
1435 | static inline u64 sched_cfs_bandwidth_slice(void) | ||
1436 | { | ||
1437 | return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; | ||
1438 | } | ||
1439 | |||
1440 | /* | ||
1441 | * Replenish runtime according to assigned quota and update expiration time. | ||
1442 | * We use sched_clock_cpu directly instead of rq->clock to avoid adding | ||
1443 | * additional synchronization around rq->lock. | ||
1444 | * | ||
1445 | * requires cfs_b->lock | ||
1446 | */ | ||
1447 | void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | ||
1448 | { | ||
1449 | u64 now; | ||
1450 | |||
1451 | if (cfs_b->quota == RUNTIME_INF) | ||
1452 | return; | ||
1453 | |||
1454 | now = sched_clock_cpu(smp_processor_id()); | ||
1455 | cfs_b->runtime = cfs_b->quota; | ||
1456 | cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); | ||
1457 | } | ||
1458 | |||
1459 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
1460 | { | ||
1461 | return &tg->cfs_bandwidth; | ||
1462 | } | ||
1463 | |||
1464 | /* returns 0 on failure to allocate runtime */ | ||
1465 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1466 | { | ||
1467 | struct task_group *tg = cfs_rq->tg; | ||
1468 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); | ||
1469 | u64 amount = 0, min_amount, expires; | ||
1470 | |||
1471 | /* note: this is a positive sum as runtime_remaining <= 0 */ | ||
1472 | min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; | ||
1473 | |||
1474 | raw_spin_lock(&cfs_b->lock); | ||
1475 | if (cfs_b->quota == RUNTIME_INF) | ||
1476 | amount = min_amount; | ||
1477 | else { | ||
1478 | /* | ||
1479 | * If the bandwidth pool has become inactive, then at least one | ||
1480 | * period must have elapsed since the last consumption. | ||
1481 | * Refresh the global state and ensure bandwidth timer becomes | ||
1482 | * active. | ||
1483 | */ | ||
1484 | if (!cfs_b->timer_active) { | ||
1485 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1486 | __start_cfs_bandwidth(cfs_b); | ||
1487 | } | ||
1488 | |||
1489 | if (cfs_b->runtime > 0) { | ||
1490 | amount = min(cfs_b->runtime, min_amount); | ||
1491 | cfs_b->runtime -= amount; | ||
1492 | cfs_b->idle = 0; | ||
1493 | } | ||
1494 | } | ||
1495 | expires = cfs_b->runtime_expires; | ||
1496 | raw_spin_unlock(&cfs_b->lock); | ||
1497 | |||
1498 | cfs_rq->runtime_remaining += amount; | ||
1499 | /* | ||
1500 | * we may have advanced our local expiration to account for allowed | ||
1501 | * spread between our sched_clock and the one on which runtime was | ||
1502 | * issued. | ||
1503 | */ | ||
1504 | if ((s64)(expires - cfs_rq->runtime_expires) > 0) | ||
1505 | cfs_rq->runtime_expires = expires; | ||
1506 | |||
1507 | return cfs_rq->runtime_remaining > 0; | ||
1508 | } | ||
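
The handoff above is plain bookkeeping: a cfs_rq that has run its local pool to (or below) zero asks the global pool for enough to get one slice ahead, and is refused once the period's quota is gone. A userspace sketch of that accounting, with locking, expiry and the timer re-arm left out; all names here are made up.

/* Sketch of slice allocation from a global quota pool to a per-rq pool. */
#include <stdio.h>

#define NSEC_PER_MSEC 1000000LL

struct global_pool { long long quota, runtime; };
struct local_pool  { long long runtime_remaining; };

static const long long slice = 5 * NSEC_PER_MSEC;

/* returns 1 if the local pool ends up with usable runtime */
static int assign_runtime(struct global_pool *g, struct local_pool *l)
{
    /* positive request: refill up to one slice above zero */
    long long want = slice - l->runtime_remaining;
    long long grant = g->runtime < want ? g->runtime : want;

    if (grant > 0) {
        g->runtime -= grant;
        l->runtime_remaining += grant;
    }
    return l->runtime_remaining > 0;
}

int main(void)
{
    struct global_pool g = { .quota = 20 * NSEC_PER_MSEC,
                             .runtime = 20 * NSEC_PER_MSEC };
    struct local_pool cpu0 = { 0 };

    for (int i = 0; i < 6; i++) {
        cpu0.runtime_remaining -= 5 * NSEC_PER_MSEC;   /* ran 5 ms */
        if (!assign_runtime(&g, &cpu0)) {
            printf("throttle after %d ms of runtime\n", (i + 1) * 5);
            break;
        }
    }
    return 0;
}

With a 20 ms quota and 5 ms slices, the fourth refill request comes back empty and the cfs_rq throttles after 20 ms of CPU time.
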
1509 | |||
1510 | /* | ||
1511 | * Note: This depends on the synchronization provided by sched_clock and the | ||
1512 | * fact that rq->clock snapshots this value. | ||
1513 | */ | ||
1514 | static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1515 | { | ||
1516 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1517 | struct rq *rq = rq_of(cfs_rq); | ||
1518 | |||
1519 | /* if the deadline is ahead of our clock, nothing to do */ | ||
1520 | if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) | ||
1521 | return; | ||
1522 | |||
1523 | if (cfs_rq->runtime_remaining < 0) | ||
1524 | return; | ||
1525 | |||
1526 | /* | ||
1527 | * If the local deadline has passed we have to consider the | ||
1528 | * possibility that our sched_clock is 'fast' and the global deadline | ||
1529 | * has not truly expired. | ||
1530 | * | ||
1531 | 	 * Fortunately we can determine whether this is the case by checking | ||
1532 | * whether the global deadline has advanced. | ||
1533 | */ | ||
1534 | |||
1535 | if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { | ||
1536 | /* extend local deadline, drift is bounded above by 2 ticks */ | ||
1537 | cfs_rq->runtime_expires += TICK_NSEC; | ||
1538 | } else { | ||
1539 | /* global deadline is ahead, expiration has passed */ | ||
1540 | cfs_rq->runtime_remaining = 0; | ||
1541 | } | ||
1542 | } | ||
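
The two (s64) casts above are the usual wrap-safe clock comparison: subtracting two u64 timestamps and testing the sign answers "is a after b?" correctly even across a counter wrap, which a direct comparison does not. A minimal illustration:

/* Wrap-safe "is a after b?" comparison as used for runtime_expires. */
#include <stdint.h>
#include <stdio.h>

static int time_after64(uint64_t a, uint64_t b)
{
    return (int64_t)(a - b) > 0;
}

int main(void)
{
    uint64_t near_wrap = UINT64_MAX - 100;   /* just before wrapping */
    uint64_t wrapped   = 50;                 /* just after wrapping  */

    printf("naive  a > b : %d\n", wrapped > near_wrap);              /* 0: wrong */
    printf("wrap-safe    : %d\n", time_after64(wrapped, near_wrap)); /* 1 */
    return 0;
}
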
1543 | |||
1544 | static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1545 | unsigned long delta_exec) | ||
1546 | { | ||
1547 | /* dock delta_exec before expiring quota (as it could span periods) */ | ||
1548 | cfs_rq->runtime_remaining -= delta_exec; | ||
1549 | expire_cfs_rq_runtime(cfs_rq); | ||
1550 | |||
1551 | if (likely(cfs_rq->runtime_remaining > 0)) | ||
1552 | return; | ||
1553 | |||
1554 | /* | ||
1555 | * if we're unable to extend our runtime we resched so that the active | ||
1556 | * hierarchy can be throttled | ||
1557 | */ | ||
1558 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | ||
1559 | resched_task(rq_of(cfs_rq)->curr); | ||
1560 | } | ||
1561 | |||
1562 | static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
1563 | unsigned long delta_exec) | ||
1564 | { | ||
1565 | if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) | ||
1566 | return; | ||
1567 | |||
1568 | __account_cfs_rq_runtime(cfs_rq, delta_exec); | ||
1569 | } | ||
1570 | |||
1571 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
1572 | { | ||
1573 | return cfs_bandwidth_used() && cfs_rq->throttled; | ||
1574 | } | ||
1575 | |||
1576 | /* check whether cfs_rq, or any parent, is throttled */ | ||
1577 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
1578 | { | ||
1579 | return cfs_bandwidth_used() && cfs_rq->throttle_count; | ||
1580 | } | ||
1581 | |||
1582 | /* | ||
1583 | * Ensure that neither of the group entities corresponding to src_cpu or | ||
1584 | * dest_cpu are members of a throttled hierarchy when performing group | ||
1585 | * load-balance operations. | ||
1586 | */ | ||
1587 | static inline int throttled_lb_pair(struct task_group *tg, | ||
1588 | int src_cpu, int dest_cpu) | ||
1589 | { | ||
1590 | struct cfs_rq *src_cfs_rq, *dest_cfs_rq; | ||
1591 | |||
1592 | src_cfs_rq = tg->cfs_rq[src_cpu]; | ||
1593 | dest_cfs_rq = tg->cfs_rq[dest_cpu]; | ||
1594 | |||
1595 | return throttled_hierarchy(src_cfs_rq) || | ||
1596 | throttled_hierarchy(dest_cfs_rq); | ||
1597 | } | ||
1598 | |||
1599 | /* updated child weight may affect parent so we have to do this bottom up */ | ||
1600 | static int tg_unthrottle_up(struct task_group *tg, void *data) | ||
1601 | { | ||
1602 | struct rq *rq = data; | ||
1603 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1604 | |||
1605 | cfs_rq->throttle_count--; | ||
1606 | #ifdef CONFIG_SMP | ||
1607 | if (!cfs_rq->throttle_count) { | ||
1608 | u64 delta = rq->clock_task - cfs_rq->load_stamp; | ||
1609 | |||
1610 | /* leaving throttled state, advance shares averaging windows */ | ||
1611 | cfs_rq->load_stamp += delta; | ||
1612 | cfs_rq->load_last += delta; | ||
1613 | |||
1614 | /* update entity weight now that we are on_rq again */ | ||
1615 | update_cfs_shares(cfs_rq); | ||
1616 | } | ||
1617 | #endif | ||
1618 | |||
1619 | return 0; | ||
1620 | } | ||
1621 | |||
1622 | static int tg_throttle_down(struct task_group *tg, void *data) | ||
1623 | { | ||
1624 | struct rq *rq = data; | ||
1625 | struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; | ||
1626 | |||
1627 | /* group is entering throttled state, record last load */ | ||
1628 | if (!cfs_rq->throttle_count) | ||
1629 | update_cfs_load(cfs_rq, 0); | ||
1630 | cfs_rq->throttle_count++; | ||
1631 | |||
1632 | return 0; | ||
1633 | } | ||
1634 | |||
1635 | static void throttle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1636 | { | ||
1637 | struct rq *rq = rq_of(cfs_rq); | ||
1638 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1639 | struct sched_entity *se; | ||
1640 | long task_delta, dequeue = 1; | ||
1641 | |||
1642 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1643 | |||
1644 | /* account load preceding throttle */ | ||
1645 | rcu_read_lock(); | ||
1646 | walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); | ||
1647 | rcu_read_unlock(); | ||
1648 | |||
1649 | task_delta = cfs_rq->h_nr_running; | ||
1650 | for_each_sched_entity(se) { | ||
1651 | struct cfs_rq *qcfs_rq = cfs_rq_of(se); | ||
1652 | /* throttled entity or throttle-on-deactivate */ | ||
1653 | if (!se->on_rq) | ||
1654 | break; | ||
1655 | |||
1656 | if (dequeue) | ||
1657 | dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); | ||
1658 | qcfs_rq->h_nr_running -= task_delta; | ||
1659 | |||
1660 | if (qcfs_rq->load.weight) | ||
1661 | dequeue = 0; | ||
1662 | } | ||
1663 | |||
1664 | if (!se) | ||
1665 | rq->nr_running -= task_delta; | ||
1666 | |||
1667 | cfs_rq->throttled = 1; | ||
1668 | cfs_rq->throttled_timestamp = rq->clock; | ||
1669 | raw_spin_lock(&cfs_b->lock); | ||
1670 | list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); | ||
1671 | raw_spin_unlock(&cfs_b->lock); | ||
1672 | } | ||
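
Throttling walks the group's sched_entity up through its ancestors: the entity is dequeued at each level until some parent still has other runnable weight, while the h_nr_running adjustment keeps propagating as long as the entities remain queued. A toy model of that walk; the three-level hierarchy and its numbers are invented.

/* Toy model of the upward walk done when throttling a cfs_rq. */
#include <stdio.h>

struct level { const char *name; int other_weight; int h_nr_running; };

int main(void)
{
    /* cfs_rq being throttled runs 3 tasks; ancestors from child to root */
    struct level path[] = {
        { "tg-child parent", 0, 3 },   /* only our entity queued here  */
        { "tg-root  parent", 2, 5 },   /* sibling group still runnable */
        { "root cfs_rq",     4, 9 },
    };
    int task_delta = 3, dequeue = 1;

    for (int i = 0; i < 3; i++) {
        if (dequeue)
            printf("dequeue group se from %s\n", path[i].name);
        path[i].h_nr_running -= task_delta;
        if (path[i].other_weight)      /* parent still has load */
            dequeue = 0;
    }
    for (int i = 0; i < 3; i++)
        printf("%s: h_nr_running now %d\n",
               path[i].name, path[i].h_nr_running);
    return 0;
}
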
1673 | |||
1674 | void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | ||
1675 | { | ||
1676 | struct rq *rq = rq_of(cfs_rq); | ||
1677 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1678 | struct sched_entity *se; | ||
1679 | int enqueue = 1; | ||
1680 | long task_delta; | ||
1681 | |||
1682 | se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; | ||
1683 | |||
1684 | cfs_rq->throttled = 0; | ||
1685 | raw_spin_lock(&cfs_b->lock); | ||
1686 | cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; | ||
1687 | list_del_rcu(&cfs_rq->throttled_list); | ||
1688 | raw_spin_unlock(&cfs_b->lock); | ||
1689 | cfs_rq->throttled_timestamp = 0; | ||
1690 | |||
1691 | update_rq_clock(rq); | ||
1692 | /* update hierarchical throttle state */ | ||
1693 | walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); | ||
1694 | |||
1695 | if (!cfs_rq->load.weight) | ||
1696 | return; | ||
1697 | |||
1698 | task_delta = cfs_rq->h_nr_running; | ||
1699 | for_each_sched_entity(se) { | ||
1700 | if (se->on_rq) | ||
1701 | enqueue = 0; | ||
1702 | |||
1703 | cfs_rq = cfs_rq_of(se); | ||
1704 | if (enqueue) | ||
1705 | enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); | ||
1706 | cfs_rq->h_nr_running += task_delta; | ||
1707 | |||
1708 | if (cfs_rq_throttled(cfs_rq)) | ||
1709 | break; | ||
1710 | } | ||
1711 | |||
1712 | if (!se) | ||
1713 | rq->nr_running += task_delta; | ||
1714 | |||
1715 | /* determine whether we need to wake up potentially idle cpu */ | ||
1716 | if (rq->curr == rq->idle && rq->cfs.nr_running) | ||
1717 | resched_task(rq->curr); | ||
1718 | } | ||
1719 | |||
1720 | static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, | ||
1721 | u64 remaining, u64 expires) | ||
1722 | { | ||
1723 | struct cfs_rq *cfs_rq; | ||
1724 | u64 runtime = remaining; | ||
1725 | |||
1726 | rcu_read_lock(); | ||
1727 | list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, | ||
1728 | throttled_list) { | ||
1729 | struct rq *rq = rq_of(cfs_rq); | ||
1730 | |||
1731 | raw_spin_lock(&rq->lock); | ||
1732 | if (!cfs_rq_throttled(cfs_rq)) | ||
1733 | goto next; | ||
1734 | |||
1735 | runtime = -cfs_rq->runtime_remaining + 1; | ||
1736 | if (runtime > remaining) | ||
1737 | runtime = remaining; | ||
1738 | remaining -= runtime; | ||
1739 | |||
1740 | cfs_rq->runtime_remaining += runtime; | ||
1741 | cfs_rq->runtime_expires = expires; | ||
1742 | |||
1743 | /* we check whether we're throttled above */ | ||
1744 | if (cfs_rq->runtime_remaining > 0) | ||
1745 | unthrottle_cfs_rq(cfs_rq); | ||
1746 | |||
1747 | next: | ||
1748 | raw_spin_unlock(&rq->lock); | ||
1749 | |||
1750 | if (!remaining) | ||
1751 | break; | ||
1752 | } | ||
1753 | rcu_read_unlock(); | ||
1754 | |||
1755 | return remaining; | ||
1756 | } | ||
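
Each throttled queue is only topped up to a 1 ns surplus; anything more it will pull itself through the normal assignment path once it runs again, so even a small refill can unthrottle several queues. A userspace sketch of the loop:

/* Sketch of handing fresh runtime to throttled runqueues. */
#include <stdio.h>

struct cfs_rq_sim { long long runtime_remaining; int throttled; };

static long long distribute(struct cfs_rq_sim *rqs, int n, long long remaining)
{
    for (int i = 0; i < n && remaining; i++) {
        long long want;

        if (!rqs[i].throttled)
            continue;
        /* just enough to make runtime_remaining positive */
        want = -rqs[i].runtime_remaining + 1;
        if (want > remaining)
            want = remaining;
        remaining -= want;
        rqs[i].runtime_remaining += want;
        if (rqs[i].runtime_remaining > 0)
            rqs[i].throttled = 0;          /* unthrottle */
    }
    return remaining;
}

int main(void)
{
    struct cfs_rq_sim rqs[] = {
        { .runtime_remaining = -3000, .throttled = 1 },
        { .runtime_remaining = -9000, .throttled = 1 },
    };
    long long left = distribute(rqs, 2, 5000);

    printf("rq0 %s, rq1 %s, %lld ns left in pool\n",
           rqs[0].throttled ? "throttled" : "running",
           rqs[1].throttled ? "throttled" : "running", left);
    return 0;
}
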
1757 | |||
1758 | /* | ||
1759 | * Responsible for refilling a task_group's bandwidth and unthrottling its | ||
1760 | * cfs_rqs as appropriate. If there has been no activity within the last | ||
1761 | * period the timer is deactivated until scheduling resumes; cfs_b->idle is | ||
1762 | * used to track this state. | ||
1763 | */ | ||
1764 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) | ||
1765 | { | ||
1766 | u64 runtime, runtime_expires; | ||
1767 | int idle = 1, throttled; | ||
1768 | |||
1769 | raw_spin_lock(&cfs_b->lock); | ||
1770 | /* no need to continue the timer with no bandwidth constraint */ | ||
1771 | if (cfs_b->quota == RUNTIME_INF) | ||
1772 | goto out_unlock; | ||
1773 | |||
1774 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1775 | /* idle depends on !throttled (for the case of a large deficit) */ | ||
1776 | idle = cfs_b->idle && !throttled; | ||
1777 | cfs_b->nr_periods += overrun; | ||
1778 | |||
1779 | /* if we're going inactive then everything else can be deferred */ | ||
1780 | if (idle) | ||
1781 | goto out_unlock; | ||
1782 | |||
1783 | __refill_cfs_bandwidth_runtime(cfs_b); | ||
1784 | |||
1785 | if (!throttled) { | ||
1786 | /* mark as potentially idle for the upcoming period */ | ||
1787 | cfs_b->idle = 1; | ||
1788 | goto out_unlock; | ||
1789 | } | ||
1790 | |||
1791 | /* account preceding periods in which throttling occurred */ | ||
1792 | cfs_b->nr_throttled += overrun; | ||
1793 | |||
1794 | /* | ||
1795 | * There are throttled entities so we must first use the new bandwidth | ||
1796 | * to unthrottle them before making it generally available. This | ||
1797 | * ensures that all existing debts will be paid before a new cfs_rq is | ||
1798 | * allowed to run. | ||
1799 | */ | ||
1800 | runtime = cfs_b->runtime; | ||
1801 | runtime_expires = cfs_b->runtime_expires; | ||
1802 | cfs_b->runtime = 0; | ||
1803 | |||
1804 | /* | ||
1805 | * This check is repeated as we are holding onto the new bandwidth | ||
1806 | * while we unthrottle. This can potentially race with an unthrottled | ||
1807 | * group trying to acquire new bandwidth from the global pool. | ||
1808 | */ | ||
1809 | while (throttled && runtime > 0) { | ||
1810 | raw_spin_unlock(&cfs_b->lock); | ||
1811 | /* we can't nest cfs_b->lock while distributing bandwidth */ | ||
1812 | runtime = distribute_cfs_runtime(cfs_b, runtime, | ||
1813 | runtime_expires); | ||
1814 | raw_spin_lock(&cfs_b->lock); | ||
1815 | |||
1816 | throttled = !list_empty(&cfs_b->throttled_cfs_rq); | ||
1817 | } | ||
1818 | |||
1819 | /* return (any) remaining runtime */ | ||
1820 | cfs_b->runtime = runtime; | ||
1821 | /* | ||
1822 | * While we are ensured activity in the period following an | ||
1823 | * unthrottle, this also covers the case in which the new bandwidth is | ||
1824 | * insufficient to cover the existing bandwidth deficit. (Forcing the | ||
1825 | * timer to remain active while there are any throttled entities.) | ||
1826 | */ | ||
1827 | cfs_b->idle = 0; | ||
1828 | out_unlock: | ||
1829 | if (idle) | ||
1830 | cfs_b->timer_active = 0; | ||
1831 | raw_spin_unlock(&cfs_b->lock); | ||
1832 | |||
1833 | return idle; | ||
1834 | } | ||
1835 | |||
1836 | /* a cfs_rq won't donate quota below this amount */ | ||
1837 | static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; | ||
1838 | /* minimum remaining period time to redistribute slack quota */ | ||
1839 | static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; | ||
1840 | /* how long we wait to gather additional slack before distributing */ | ||
1841 | static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; | ||
1842 | |||
1843 | /* are we near the end of the current quota period? */ | ||
1844 | static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) | ||
1845 | { | ||
1846 | struct hrtimer *refresh_timer = &cfs_b->period_timer; | ||
1847 | u64 remaining; | ||
1848 | |||
1849 | 	/* if the call-back is running, a quota refresh is already occurring */ | ||
1850 | if (hrtimer_callback_running(refresh_timer)) | ||
1851 | return 1; | ||
1852 | |||
1853 | /* is a quota refresh about to occur? */ | ||
1854 | remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); | ||
1855 | if (remaining < min_expire) | ||
1856 | return 1; | ||
1857 | |||
1858 | return 0; | ||
1859 | } | ||
1860 | |||
1861 | static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | ||
1862 | { | ||
1863 | u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; | ||
1864 | |||
1865 | /* if there's a quota refresh soon don't bother with slack */ | ||
1866 | if (runtime_refresh_within(cfs_b, min_left)) | ||
1867 | return; | ||
1868 | |||
1869 | start_bandwidth_timer(&cfs_b->slack_timer, | ||
1870 | ns_to_ktime(cfs_bandwidth_slack_period)); | ||
1871 | } | ||
1872 | |||
1873 | /* we know any runtime found here is valid as update_curr() precedes return */ | ||
1874 | static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1875 | { | ||
1876 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
1877 | s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; | ||
1878 | |||
1879 | if (slack_runtime <= 0) | ||
1880 | return; | ||
1881 | |||
1882 | raw_spin_lock(&cfs_b->lock); | ||
1883 | if (cfs_b->quota != RUNTIME_INF && | ||
1884 | cfs_rq->runtime_expires == cfs_b->runtime_expires) { | ||
1885 | cfs_b->runtime += slack_runtime; | ||
1886 | |||
1887 | /* we are under rq->lock, defer unthrottling using a timer */ | ||
1888 | if (cfs_b->runtime > sched_cfs_bandwidth_slice() && | ||
1889 | !list_empty(&cfs_b->throttled_cfs_rq)) | ||
1890 | start_cfs_slack_bandwidth(cfs_b); | ||
1891 | } | ||
1892 | raw_spin_unlock(&cfs_b->lock); | ||
1893 | |||
1894 | /* even if it's not valid for return we don't want to try again */ | ||
1895 | cfs_rq->runtime_remaining -= slack_runtime; | ||
1896 | } | ||
1897 | |||
1898 | static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1899 | { | ||
1900 | if (!cfs_bandwidth_used()) | ||
1901 | return; | ||
1902 | |||
1903 | if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) | ||
1904 | return; | ||
1905 | |||
1906 | __return_cfs_rq_runtime(cfs_rq); | ||
1907 | } | ||
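
On its final dequeue a cfs_rq keeps min_cfs_rq_runtime (1 ms) as a cushion for a quick wakeup and hands the rest back; the slack timer is armed only when the pool has regained more than a slice, something is still throttled, and the next period refresh is not imminent. A condensed sketch of that decision; the 60 ms "time left in the period" is an arbitrary example.

/* Sketch of the slack-return decision made on dequeue. */
#include <stdio.h>

#define NSEC_PER_MSEC 1000000LL

static const long long min_cfs_rq_runtime       = 1 * NSEC_PER_MSEC;
static const long long slice                    = 5 * NSEC_PER_MSEC;
static const long long min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
static const long long slack_period             = 5 * NSEC_PER_MSEC;

struct pool { long long runtime; int has_throttled; long long period_left; };

static void return_runtime(struct pool *p, long long *rq_runtime)
{
    long long slack = *rq_runtime - min_cfs_rq_runtime;

    if (slack <= 0)
        return;
    *rq_runtime -= slack;              /* keep only the 1 ms cushion */
    p->runtime += slack;

    if (p->runtime > slice && p->has_throttled &&
        p->period_left >= slack_period + min_bandwidth_expiration)
        printf("arm slack timer (%lld ns back in the pool)\n", p->runtime);
}

int main(void)
{
    struct pool p = { .runtime = 2 * NSEC_PER_MSEC, .has_throttled = 1,
                      .period_left = 60 * NSEC_PER_MSEC };
    long long rq_runtime = 4500000;    /* 4.5 ms unused on this cpu */

    return_runtime(&p, &rq_runtime);
    printf("cpu keeps %lld ns\n", rq_runtime);
    return 0;
}
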
1908 | |||
1909 | /* | ||
1910 | * This is done with a timer (instead of inline with bandwidth return) since | ||
1911 | * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. | ||
1912 | */ | ||
1913 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | ||
1914 | { | ||
1915 | u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); | ||
1916 | u64 expires; | ||
1917 | |||
1918 | /* confirm we're still not at a refresh boundary */ | ||
1919 | if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) | ||
1920 | return; | ||
1921 | |||
1922 | raw_spin_lock(&cfs_b->lock); | ||
1923 | if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { | ||
1924 | runtime = cfs_b->runtime; | ||
1925 | cfs_b->runtime = 0; | ||
1926 | } | ||
1927 | expires = cfs_b->runtime_expires; | ||
1928 | raw_spin_unlock(&cfs_b->lock); | ||
1929 | |||
1930 | if (!runtime) | ||
1931 | return; | ||
1932 | |||
1933 | runtime = distribute_cfs_runtime(cfs_b, runtime, expires); | ||
1934 | |||
1935 | raw_spin_lock(&cfs_b->lock); | ||
1936 | if (expires == cfs_b->runtime_expires) | ||
1937 | cfs_b->runtime = runtime; | ||
1938 | raw_spin_unlock(&cfs_b->lock); | ||
1939 | } | ||
1940 | |||
1941 | /* | ||
1942 | * When a group wakes up we want to make sure that its quota is not already | ||
1943 | * expired/exceeded, otherwise it may be allowed to steal additional ticks of | ||
1944 | 	 * runtime as update_curr() throttling cannot trigger until it's on-rq. | ||
1945 | */ | ||
1946 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) | ||
1947 | { | ||
1948 | if (!cfs_bandwidth_used()) | ||
1949 | return; | ||
1950 | |||
1951 | /* an active group must be handled by the update_curr()->put() path */ | ||
1952 | if (!cfs_rq->runtime_enabled || cfs_rq->curr) | ||
1953 | return; | ||
1954 | |||
1955 | /* ensure the group is not already throttled */ | ||
1956 | if (cfs_rq_throttled(cfs_rq)) | ||
1957 | return; | ||
1958 | |||
1959 | /* update runtime allocation */ | ||
1960 | account_cfs_rq_runtime(cfs_rq, 0); | ||
1961 | if (cfs_rq->runtime_remaining <= 0) | ||
1962 | throttle_cfs_rq(cfs_rq); | ||
1963 | } | ||
1964 | |||
1965 | /* conditionally throttle active cfs_rq's from put_prev_entity() */ | ||
1966 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
1967 | { | ||
1968 | if (!cfs_bandwidth_used()) | ||
1969 | return; | ||
1970 | |||
1971 | if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) | ||
1972 | return; | ||
1973 | |||
1974 | /* | ||
1975 | * it's possible for a throttled entity to be forced into a running | ||
1976 | 	 * state (e.g. set_curr_task); in this case we're finished. | ||
1977 | */ | ||
1978 | if (cfs_rq_throttled(cfs_rq)) | ||
1979 | return; | ||
1980 | |||
1981 | throttle_cfs_rq(cfs_rq); | ||
1982 | } | ||
1983 | |||
1984 | static inline u64 default_cfs_period(void); | ||
1985 | static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); | ||
1986 | static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); | ||
1987 | |||
1988 | static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) | ||
1989 | { | ||
1990 | struct cfs_bandwidth *cfs_b = | ||
1991 | container_of(timer, struct cfs_bandwidth, slack_timer); | ||
1992 | do_sched_cfs_slack_timer(cfs_b); | ||
1993 | |||
1994 | return HRTIMER_NORESTART; | ||
1995 | } | ||
1996 | |||
1997 | static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) | ||
1998 | { | ||
1999 | struct cfs_bandwidth *cfs_b = | ||
2000 | container_of(timer, struct cfs_bandwidth, period_timer); | ||
2001 | ktime_t now; | ||
2002 | int overrun; | ||
2003 | int idle = 0; | ||
2004 | |||
2005 | for (;;) { | ||
2006 | now = hrtimer_cb_get_time(timer); | ||
2007 | overrun = hrtimer_forward(timer, now, cfs_b->period); | ||
2008 | |||
2009 | if (!overrun) | ||
2010 | break; | ||
2011 | |||
2012 | idle = do_sched_cfs_period_timer(cfs_b, overrun); | ||
2013 | } | ||
2014 | |||
2015 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
2016 | } | ||
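
The for (;;) loop is the standard periodic-hrtimer pattern: hrtimer_forward() pushes the expiry forward by whole periods and returns how many were skipped, so a callback that fires late still accounts every missed period exactly once. A simplified model of that catch-up loop:

/* Model of the hrtimer_forward() catch-up loop in sched_cfs_period_timer. */
#include <stdio.h>

static long long expiry;    /* absolute expiry time */

/* advance expiry past 'now' by whole periods, return how many were added */
static int timer_forward(long long now, long long period)
{
    int overrun = 0;

    while (expiry <= now) {
        expiry += period;
        overrun++;
    }
    return overrun;
}

int main(void)
{
    const long long period = 100;   /* arbitrary units */
    long long now = 350;            /* callback ran late, 3.5 periods in */
    int total = 0;

    expiry = 100;
    for (;;) {
        int overrun = timer_forward(now, period);

        if (!overrun)
            break;
        total += overrun;           /* stands in for do_sched_cfs_period_timer() */
    }
    printf("accounted %d periods, next expiry at %lld\n", total, expiry);
    return 0;
}
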
2017 | |||
2018 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2019 | { | ||
2020 | raw_spin_lock_init(&cfs_b->lock); | ||
2021 | cfs_b->runtime = 0; | ||
2022 | cfs_b->quota = RUNTIME_INF; | ||
2023 | cfs_b->period = ns_to_ktime(default_cfs_period()); | ||
2024 | |||
2025 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | ||
2026 | hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2027 | cfs_b->period_timer.function = sched_cfs_period_timer; | ||
2028 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
2029 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | ||
2030 | } | ||
2031 | |||
2032 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | ||
2033 | { | ||
2034 | cfs_rq->runtime_enabled = 0; | ||
2035 | INIT_LIST_HEAD(&cfs_rq->throttled_list); | ||
2036 | } | ||
2037 | |||
2038 | /* requires cfs_b->lock, may release to reprogram timer */ | ||
2039 | void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2040 | { | ||
2041 | /* | ||
2042 | * The timer may be active because we're trying to set a new bandwidth | ||
2043 | * period or because we're racing with the tear-down path | ||
2044 | * (timer_active==0 becomes visible before the hrtimer call-back | ||
2045 | 	 * terminates). In either case we ensure that it's re-programmed. | ||
2046 | */ | ||
2047 | while (unlikely(hrtimer_active(&cfs_b->period_timer))) { | ||
2048 | raw_spin_unlock(&cfs_b->lock); | ||
2049 | /* ensure cfs_b->lock is available while we wait */ | ||
2050 | hrtimer_cancel(&cfs_b->period_timer); | ||
2051 | |||
2052 | raw_spin_lock(&cfs_b->lock); | ||
2053 | /* if someone else restarted the timer then we're done */ | ||
2054 | if (cfs_b->timer_active) | ||
2055 | return; | ||
2056 | } | ||
2057 | |||
2058 | cfs_b->timer_active = 1; | ||
2059 | start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); | ||
2060 | } | ||
2061 | |||
2062 | static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | ||
2063 | { | ||
2064 | hrtimer_cancel(&cfs_b->period_timer); | ||
2065 | hrtimer_cancel(&cfs_b->slack_timer); | ||
2066 | } | ||
2067 | |||
2068 | void unthrottle_offline_cfs_rqs(struct rq *rq) | ||
2069 | { | ||
2070 | struct cfs_rq *cfs_rq; | ||
2071 | |||
2072 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
2073 | struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); | ||
2074 | |||
2075 | if (!cfs_rq->runtime_enabled) | ||
2076 | continue; | ||
2077 | |||
2078 | /* | ||
2079 | * clock_task is not advancing so we just need to make sure | ||
2080 | * there's some valid quota amount | ||
2081 | */ | ||
2082 | cfs_rq->runtime_remaining = cfs_b->quota; | ||
2083 | if (cfs_rq_throttled(cfs_rq)) | ||
2084 | unthrottle_cfs_rq(cfs_rq); | ||
2085 | } | ||
2086 | } | ||
2087 | |||
2088 | #else /* CONFIG_CFS_BANDWIDTH */ | ||
2089 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, | ||
2090 | unsigned long delta_exec) {} | ||
2091 | static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
2092 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | ||
2093 | static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
2094 | |||
2095 | static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) | ||
2096 | { | ||
2097 | return 0; | ||
2098 | } | ||
2099 | |||
2100 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) | ||
2101 | { | ||
2102 | return 0; | ||
2103 | } | ||
2104 | |||
2105 | static inline int throttled_lb_pair(struct task_group *tg, | ||
2106 | int src_cpu, int dest_cpu) | ||
2107 | { | ||
2108 | return 0; | ||
2109 | } | ||
2110 | |||
2111 | void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
2112 | |||
2113 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
2114 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} | ||
2115 | #endif | ||
2116 | |||
2117 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | ||
2118 | { | ||
2119 | return NULL; | ||
2120 | } | ||
2121 | static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} | ||
2122 | void unthrottle_offline_cfs_rqs(struct rq *rq) {} | ||
2123 | |||
2124 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
2125 | |||
1240 | /************************************************** | 2126 | /************************************************** |
1241 | * CFS operations on tasks: | 2127 | * CFS operations on tasks: |
1242 | */ | 2128 | */ |
@@ -1249,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | |||
1249 | 2135 | ||
1250 | WARN_ON(task_rq(p) != rq); | 2136 | WARN_ON(task_rq(p) != rq); |
1251 | 2137 | ||
1252 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | 2138 | if (cfs_rq->nr_running > 1) { |
1253 | u64 slice = sched_slice(cfs_rq, se); | 2139 | u64 slice = sched_slice(cfs_rq, se); |
1254 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | 2140 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; |
1255 | s64 delta = slice - ran; | 2141 | s64 delta = slice - ran; |
@@ -1280,7 +2166,7 @@ static void hrtick_update(struct rq *rq) | |||
1280 | { | 2166 | { |
1281 | struct task_struct *curr = rq->curr; | 2167 | struct task_struct *curr = rq->curr; |
1282 | 2168 | ||
1283 | if (curr->sched_class != &fair_sched_class) | 2169 | if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) |
1284 | return; | 2170 | return; |
1285 | 2171 | ||
1286 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) | 2172 | if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) |
@@ -1313,16 +2199,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1313 | break; | 2199 | break; |
1314 | cfs_rq = cfs_rq_of(se); | 2200 | cfs_rq = cfs_rq_of(se); |
1315 | enqueue_entity(cfs_rq, se, flags); | 2201 | enqueue_entity(cfs_rq, se, flags); |
2202 | |||
2203 | /* | ||
2204 | * end evaluation on encountering a throttled cfs_rq | ||
2205 | * | ||
2206 | * note: in the case of encountering a throttled cfs_rq we will | ||
2207 | * post the final h_nr_running increment below. | ||
2208 | */ | ||
2209 | if (cfs_rq_throttled(cfs_rq)) | ||
2210 | break; | ||
2211 | cfs_rq->h_nr_running++; | ||
2212 | |||
1316 | flags = ENQUEUE_WAKEUP; | 2213 | flags = ENQUEUE_WAKEUP; |
1317 | } | 2214 | } |
1318 | 2215 | ||
1319 | for_each_sched_entity(se) { | 2216 | for_each_sched_entity(se) { |
1320 | cfs_rq = cfs_rq_of(se); | 2217 | cfs_rq = cfs_rq_of(se); |
2218 | cfs_rq->h_nr_running++; | ||
2219 | |||
2220 | if (cfs_rq_throttled(cfs_rq)) | ||
2221 | break; | ||
1321 | 2222 | ||
1322 | update_cfs_load(cfs_rq, 0); | 2223 | update_cfs_load(cfs_rq, 0); |
1323 | update_cfs_shares(cfs_rq); | 2224 | update_cfs_shares(cfs_rq); |
1324 | } | 2225 | } |
1325 | 2226 | ||
2227 | if (!se) | ||
2228 | inc_nr_running(rq); | ||
1326 | hrtick_update(rq); | 2229 | hrtick_update(rq); |
1327 | } | 2230 | } |
1328 | 2231 | ||
@@ -1343,6 +2246,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1343 | cfs_rq = cfs_rq_of(se); | 2246 | cfs_rq = cfs_rq_of(se); |
1344 | dequeue_entity(cfs_rq, se, flags); | 2247 | dequeue_entity(cfs_rq, se, flags); |
1345 | 2248 | ||
2249 | /* | ||
2250 | * end evaluation on encountering a throttled cfs_rq | ||
2251 | * | ||
2252 | * note: in the case of encountering a throttled cfs_rq we will | ||
2253 | * post the final h_nr_running decrement below. | ||
2254 | */ | ||
2255 | if (cfs_rq_throttled(cfs_rq)) | ||
2256 | break; | ||
2257 | cfs_rq->h_nr_running--; | ||
2258 | |||
1346 | /* Don't dequeue parent if it has other entities besides us */ | 2259 | /* Don't dequeue parent if it has other entities besides us */ |
1347 | if (cfs_rq->load.weight) { | 2260 | if (cfs_rq->load.weight) { |
1348 | /* | 2261 | /* |
@@ -1361,15 +2274,76 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1361 | 2274 | ||
1362 | for_each_sched_entity(se) { | 2275 | for_each_sched_entity(se) { |
1363 | cfs_rq = cfs_rq_of(se); | 2276 | cfs_rq = cfs_rq_of(se); |
2277 | cfs_rq->h_nr_running--; | ||
2278 | |||
2279 | if (cfs_rq_throttled(cfs_rq)) | ||
2280 | break; | ||
1364 | 2281 | ||
1365 | update_cfs_load(cfs_rq, 0); | 2282 | update_cfs_load(cfs_rq, 0); |
1366 | update_cfs_shares(cfs_rq); | 2283 | update_cfs_shares(cfs_rq); |
1367 | } | 2284 | } |
1368 | 2285 | ||
2286 | if (!se) | ||
2287 | dec_nr_running(rq); | ||
1369 | hrtick_update(rq); | 2288 | hrtick_update(rq); |
1370 | } | 2289 | } |
1371 | 2290 | ||
1372 | #ifdef CONFIG_SMP | 2291 | #ifdef CONFIG_SMP |
2292 | /* Used instead of source_load when we know the type == 0 */ | ||
2293 | static unsigned long weighted_cpuload(const int cpu) | ||
2294 | { | ||
2295 | return cpu_rq(cpu)->load.weight; | ||
2296 | } | ||
2297 | |||
2298 | /* | ||
2299 | * Return a low guess at the load of a migration-source cpu weighted | ||
2300 | * according to the scheduling class and "nice" value. | ||
2301 | * | ||
2302 | * We want to under-estimate the load of migration sources, to | ||
2303 | * balance conservatively. | ||
2304 | */ | ||
2305 | static unsigned long source_load(int cpu, int type) | ||
2306 | { | ||
2307 | struct rq *rq = cpu_rq(cpu); | ||
2308 | unsigned long total = weighted_cpuload(cpu); | ||
2309 | |||
2310 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2311 | return total; | ||
2312 | |||
2313 | return min(rq->cpu_load[type-1], total); | ||
2314 | } | ||
2315 | |||
2316 | /* | ||
2317 | * Return a high guess at the load of a migration-target cpu weighted | ||
2318 | * according to the scheduling class and "nice" value. | ||
2319 | */ | ||
2320 | static unsigned long target_load(int cpu, int type) | ||
2321 | { | ||
2322 | struct rq *rq = cpu_rq(cpu); | ||
2323 | unsigned long total = weighted_cpuload(cpu); | ||
2324 | |||
2325 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2326 | return total; | ||
2327 | |||
2328 | return max(rq->cpu_load[type-1], total); | ||
2329 | } | ||
2330 | |||
2331 | static unsigned long power_of(int cpu) | ||
2332 | { | ||
2333 | return cpu_rq(cpu)->cpu_power; | ||
2334 | } | ||
2335 | |||
2336 | static unsigned long cpu_avg_load_per_task(int cpu) | ||
2337 | { | ||
2338 | struct rq *rq = cpu_rq(cpu); | ||
2339 | unsigned long nr_running = ACCESS_ONCE(rq->nr_running); | ||
2340 | |||
2341 | if (nr_running) | ||
2342 | return rq->load.weight / nr_running; | ||
2343 | |||
2344 | return 0; | ||
2345 | } | ||
2346 | |||
1373 | 2347 | ||
1374 | static void task_waking_fair(struct task_struct *p) | 2348 | static void task_waking_fair(struct task_struct *p) |
1375 | { | 2349 | { |
@@ -1399,42 +2373,105 @@ static void task_waking_fair(struct task_struct *p) | |||
1399 | * Adding load to a group doesn't make a group heavier, but can cause movement | 2373 | * Adding load to a group doesn't make a group heavier, but can cause movement |
1400 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 2374 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
1401 | * can calculate the shift in shares. | 2375 | * can calculate the shift in shares. |
2376 | * | ||
2377 | * Calculate the effective load difference if @wl is added (subtracted) to @tg | ||
2378 | * on this @cpu and results in a total addition (subtraction) of @wg to the | ||
2379 | * total group weight. | ||
2380 | * | ||
2381 | * Given a runqueue weight distribution (rw_i) we can compute a shares | ||
2382 | * distribution (s_i) using: | ||
2383 | * | ||
2384 | * s_i = rw_i / \Sum rw_j (1) | ||
2385 | * | ||
2386 | * Suppose we have 4 CPUs and our @tg is a direct child of the root group and | ||
2387 | * has 7 equal weight tasks, distributed as below (rw_i), with the resulting | ||
2388 | * shares distribution (s_i): | ||
2389 | * | ||
2390 | * rw_i = { 2, 4, 1, 0 } | ||
2391 | * s_i = { 2/7, 4/7, 1/7, 0 } | ||
2392 | * | ||
2393 | * As per wake_affine() we're interested in the load of two CPUs (the CPU the | ||
2394 | * task used to run on and the CPU the waker is running on), we need to | ||
2395 | * compute the effect of waking a task on either CPU and, in case of a sync | ||
2396 | * wakeup, compute the effect of the current task going to sleep. | ||
2397 | * | ||
2398 | * So for a change of @wl to the local @cpu with an overall group weight change | ||
2399 | * of @wg we can compute the new shares distribution (s'_i) using: | ||
2400 | * | ||
2401 | * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2) | ||
2402 | * | ||
2403 | * Suppose we're interested in CPUs 0 and 1, and want to compute the load | ||
2404 | * differences in waking a task to CPU 0. The additional task changes the | ||
2405 | * weight and shares distributions like: | ||
2406 | * | ||
2407 | * rw'_i = { 3, 4, 1, 0 } | ||
2408 | * s'_i = { 3/8, 4/8, 1/8, 0 } | ||
2409 | * | ||
2410 | * We can then compute the difference in effective weight by using: | ||
2411 | * | ||
2412 | * dw_i = S * (s'_i - s_i) (3) | ||
2413 | * | ||
2414 | * Where 'S' is the group weight as seen by its parent. | ||
2415 | * | ||
2416 | * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7) | ||
2417 | * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 - | ||
2418 | * 4/7) times the weight of the group. | ||
1402 | */ | 2419 | */ |
1403 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) | 2420 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
1404 | { | 2421 | { |
1405 | struct sched_entity *se = tg->se[cpu]; | 2422 | struct sched_entity *se = tg->se[cpu]; |
1406 | 2423 | ||
1407 | if (!tg->parent) | 2424 | if (!tg->parent) /* the trivial, non-cgroup case */ |
1408 | return wl; | 2425 | return wl; |
1409 | 2426 | ||
1410 | for_each_sched_entity(se) { | 2427 | for_each_sched_entity(se) { |
1411 | long lw, w; | 2428 | long w, W; |
1412 | 2429 | ||
1413 | tg = se->my_q->tg; | 2430 | tg = se->my_q->tg; |
1414 | w = se->my_q->load.weight; | ||
1415 | 2431 | ||
1416 | /* use this cpu's instantaneous contribution */ | 2432 | /* |
1417 | lw = atomic_read(&tg->load_weight); | 2433 | * W = @wg + \Sum rw_j |
1418 | lw -= se->my_q->load_contribution; | 2434 | */ |
1419 | lw += w + wg; | 2435 | W = wg + calc_tg_weight(tg, se->my_q); |
1420 | 2436 | ||
1421 | wl += w; | 2437 | /* |
2438 | * w = rw_i + @wl | ||
2439 | */ | ||
2440 | w = se->my_q->load.weight + wl; | ||
1422 | 2441 | ||
1423 | if (lw > 0 && wl < lw) | 2442 | /* |
1424 | wl = (wl * tg->shares) / lw; | 2443 | * wl = S * s'_i; see (2) |
2444 | */ | ||
2445 | if (W > 0 && w < W) | ||
2446 | wl = (w * tg->shares) / W; | ||
1425 | else | 2447 | else |
1426 | wl = tg->shares; | 2448 | wl = tg->shares; |
1427 | 2449 | ||
1428 | /* zero point is MIN_SHARES */ | 2450 | /* |
2451 | * Per the above, wl is the new se->load.weight value; since | ||
2452 | * those are clipped to [MIN_SHARES, ...) do so now. See | ||
2453 | * calc_cfs_shares(). | ||
2454 | */ | ||
1429 | if (wl < MIN_SHARES) | 2455 | if (wl < MIN_SHARES) |
1430 | wl = MIN_SHARES; | 2456 | wl = MIN_SHARES; |
2457 | |||
2458 | /* | ||
2459 | * wl = dw_i = S * (s'_i - s_i); see (3) | ||
2460 | */ | ||
1431 | wl -= se->load.weight; | 2461 | wl -= se->load.weight; |
2462 | |||
2463 | /* | ||
2464 | * Recursively apply this logic to all parent groups to compute | ||
2465 | * the final effective load change on the root group. Since | ||
2466 | * only the @tg group gets extra weight, all parent groups can | ||
2467 | * only redistribute existing shares. @wl is the shift in shares | ||
2468 | * resulting from this level per the above. | ||
2469 | */ | ||
1432 | wg = 0; | 2470 | wg = 0; |
1433 | } | 2471 | } |
1434 | 2472 | ||
1435 | return wl; | 2473 | return wl; |
1436 | } | 2474 | } |
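A small standalone program that reproduces the 5/56 and -4/56 figures from the rw_i = { 2, 4, 1, 0 } example in the comment above, applying equations (1)-(3); the parent-visible group weight S = 1024 and the unit task weight are assumptions for illustration, not values from the patch:

/* Standalone sketch of the worked example above; S and the weights are illustrative. */
#include <stdio.h>

int main(void)
{
	double rw[4] = { 2, 4, 1, 0 };	/* per-cpu runqueue weights, in task weights  */
	double sum = rw[0] + rw[1] + rw[2] + rw[3];
	double S = 1024;		/* assumed group weight as seen by its parent */
	double wl = 1, wg = 1;		/* one extra task wakes up on CPU 0           */
	int i;

	for (i = 0; i < 2; i++) {
		double s_old = rw[i] / sum;				 /* (1) */
		double s_new = (rw[i] + (i == 0 ? wl : 0)) / (wg + sum); /* (2) */

		printf("cpu%d: s'-s = %+f, dw = %+f\n",
		       i, s_new - s_old, S * (s_new - s_old));		 /* (3) */
	}
	return 0;	/* +5/56 (~ +0.0893) of S for cpu0, -4/56 (~ -0.0714) for cpu1 */
}

The loop in effective_load() repeats this computation once per level of the group hierarchy, with wg forced to 0 above the first level since only @tg itself gains weight.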
1437 | |||
1438 | #else | 2475 | #else |
1439 | 2476 | ||
1440 | static inline unsigned long effective_load(struct task_group *tg, int cpu, | 2477 | static inline unsigned long effective_load(struct task_group *tg, int cpu, |
@@ -1547,7 +2584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
1547 | 2584 | ||
1548 | /* Skip over this group if it has no CPUs allowed */ | 2585 | /* Skip over this group if it has no CPUs allowed */ |
1549 | if (!cpumask_intersects(sched_group_cpus(group), | 2586 | if (!cpumask_intersects(sched_group_cpus(group), |
1550 | &p->cpus_allowed)) | 2587 | tsk_cpus_allowed(p))) |
1551 | continue; | 2588 | continue; |
1552 | 2589 | ||
1553 | local_group = cpumask_test_cpu(this_cpu, | 2590 | local_group = cpumask_test_cpu(this_cpu, |
@@ -1593,7 +2630,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1593 | int i; | 2630 | int i; |
1594 | 2631 | ||
1595 | /* Traverse only the allowed CPUs */ | 2632 | /* Traverse only the allowed CPUs */ |
1596 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | 2633 | for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { |
1597 | load = weighted_cpuload(i); | 2634 | load = weighted_cpuload(i); |
1598 | 2635 | ||
1599 | if (load < min_load || (load == min_load && i == this_cpu)) { | 2636 | if (load < min_load || (load == min_load && i == this_cpu)) { |
@@ -1613,6 +2650,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1613 | int cpu = smp_processor_id(); | 2650 | int cpu = smp_processor_id(); |
1614 | int prev_cpu = task_cpu(p); | 2651 | int prev_cpu = task_cpu(p); |
1615 | struct sched_domain *sd; | 2652 | struct sched_domain *sd; |
2653 | struct sched_group *sg; | ||
1616 | int i; | 2654 | int i; |
1617 | 2655 | ||
1618 | /* | 2656 | /* |
@@ -1633,25 +2671,28 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1633 | * Otherwise, iterate the domains and find an eligible idle cpu. | 2671 | * Otherwise, iterate the domains and find an eligible idle cpu. |
1634 | */ | 2672 | */ |
1635 | rcu_read_lock(); | 2673 | rcu_read_lock(); |
1636 | for_each_domain(target, sd) { | ||
1637 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | ||
1638 | break; | ||
1639 | 2674 | ||
1640 | for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { | 2675 | sd = rcu_dereference(per_cpu(sd_llc, target)); |
1641 | if (idle_cpu(i)) { | 2676 | for_each_lower_domain(sd) { |
1642 | target = i; | 2677 | sg = sd->groups; |
1643 | break; | 2678 | do { |
2679 | if (!cpumask_intersects(sched_group_cpus(sg), | ||
2680 | tsk_cpus_allowed(p))) | ||
2681 | goto next; | ||
2682 | |||
2683 | for_each_cpu(i, sched_group_cpus(sg)) { | ||
2684 | if (!idle_cpu(i)) | ||
2685 | goto next; | ||
1644 | } | 2686 | } |
1645 | } | ||
1646 | 2687 | ||
1647 | /* | 2688 | target = cpumask_first_and(sched_group_cpus(sg), |
1648 | * Lets stop looking for an idle sibling when we reached | 2689 | tsk_cpus_allowed(p)); |
1649 | * the domain that spans the current cpu and prev_cpu. | 2690 | goto done; |
1650 | */ | 2691 | next: |
1651 | if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && | 2692 | sg = sg->next; |
1652 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 2693 | } while (sg != sd->groups); |
1653 | break; | ||
1654 | } | 2694 | } |
2695 | done: | ||
1655 | rcu_read_unlock(); | 2696 | rcu_read_unlock(); |
1656 | 2697 | ||
1657 | return target; | 2698 | return target; |
@@ -1679,8 +2720,11 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
1679 | int want_sd = 1; | 2720 | int want_sd = 1; |
1680 | int sync = wake_flags & WF_SYNC; | 2721 | int sync = wake_flags & WF_SYNC; |
1681 | 2722 | ||
2723 | if (p->rt.nr_cpus_allowed == 1) | ||
2724 | return prev_cpu; | ||
2725 | |||
1682 | if (sd_flag & SD_BALANCE_WAKE) { | 2726 | if (sd_flag & SD_BALANCE_WAKE) { |
1683 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) | 2727 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) |
1684 | want_affine = 1; | 2728 | want_affine = 1; |
1685 | new_cpu = prev_cpu; | 2729 | new_cpu = prev_cpu; |
1686 | } | 2730 | } |
@@ -1875,6 +2919,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1875 | if (unlikely(se == pse)) | 2919 | if (unlikely(se == pse)) |
1876 | return; | 2920 | return; |
1877 | 2921 | ||
2922 | /* | ||
2923 | * This is possible from callers such as pull_task(), in which we | ||
2924 | * unconditionally check_prempt_curr() after an enqueue (which may have | ||
2925 | * lead to a throttle). This both saves work and prevents false | ||
2926 | * next-buddy nomination below. | ||
2927 | */ | ||
2928 | if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) | ||
2929 | return; | ||
2930 | |||
1878 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { | 2931 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1879 | set_next_buddy(pse); | 2932 | set_next_buddy(pse); |
1880 | next_buddy_marked = 1; | 2933 | next_buddy_marked = 1; |
@@ -1883,6 +2936,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1883 | /* | 2936 | /* |
1884 | * We can come here with TIF_NEED_RESCHED already set from new task | 2937 | * We can come here with TIF_NEED_RESCHED already set from new task |
1885 | * wake up path. | 2938 | * wake up path. |
2939 | * | ||
2940 | * Note: this also catches the edge-case of curr being in a throttled | ||
2941 | * group (e.g. via set_curr_task), since update_curr() (in the | ||
2942 | * enqueue of curr) will have resulted in resched being set. This | ||
2943 | * prevents us from potentially nominating it as a false LAST_BUDDY | ||
2944 | * below. | ||
1886 | */ | 2945 | */ |
1887 | if (test_tsk_need_resched(curr)) | 2946 | if (test_tsk_need_resched(curr)) |
1888 | return; | 2947 | return; |
@@ -1899,10 +2958,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1899 | if (unlikely(p->policy != SCHED_NORMAL)) | 2958 | if (unlikely(p->policy != SCHED_NORMAL)) |
1900 | return; | 2959 | return; |
1901 | 2960 | ||
1902 | |||
1903 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
1904 | return; | ||
1905 | |||
1906 | find_matching_se(&se, &pse); | 2961 | find_matching_se(&se, &pse); |
1907 | update_curr(cfs_rq_of(se)); | 2962 | update_curr(cfs_rq_of(se)); |
1908 | BUG_ON(!pse); | 2963 | BUG_ON(!pse); |
@@ -1952,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
1952 | } while (cfs_rq); | 3007 | } while (cfs_rq); |
1953 | 3008 | ||
1954 | p = task_of(se); | 3009 | p = task_of(se); |
1955 | hrtick_start_fair(rq, p); | 3010 | if (hrtick_enabled(rq)) |
3011 | hrtick_start_fair(rq, p); | ||
1956 | 3012 | ||
1957 | return p; | 3013 | return p; |
1958 | } | 3014 | } |
@@ -1996,6 +3052,12 @@ static void yield_task_fair(struct rq *rq) | |||
1996 | * Update run-time statistics of the 'current'. | 3052 | * Update run-time statistics of the 'current'. |
1997 | */ | 3053 | */ |
1998 | update_curr(cfs_rq); | 3054 | update_curr(cfs_rq); |
3055 | /* | ||
3056 | * Tell update_rq_clock() that we've just updated, | ||
3057 | * so we don't do microscopic update in schedule() | ||
3058 | * and double the fastpath cost. | ||
3059 | */ | ||
3060 | rq->skip_clock_update = 1; | ||
1999 | } | 3061 | } |
2000 | 3062 | ||
2001 | set_skip_buddy(se); | 3063 | set_skip_buddy(se); |
@@ -2005,7 +3067,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
2005 | { | 3067 | { |
2006 | struct sched_entity *se = &p->se; | 3068 | struct sched_entity *se = &p->se; |
2007 | 3069 | ||
2008 | if (!se->on_rq) | 3070 | /* throttled hierarchies are not runnable */ |
3071 | if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) | ||
2009 | return false; | 3072 | return false; |
2010 | 3073 | ||
2011 | /* Tell the scheduler that we'd really like pse to run next. */ | 3074 | /* Tell the scheduler that we'd really like pse to run next. */ |
@@ -2035,12 +3098,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
2035 | } | 3098 | } |
2036 | 3099 | ||
2037 | /* | 3100 | /* |
3101 | * Is this task likely cache-hot: | ||
3102 | */ | ||
3103 | static int | ||
3104 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | ||
3105 | { | ||
3106 | s64 delta; | ||
3107 | |||
3108 | if (p->sched_class != &fair_sched_class) | ||
3109 | return 0; | ||
3110 | |||
3111 | if (unlikely(p->policy == SCHED_IDLE)) | ||
3112 | return 0; | ||
3113 | |||
3114 | /* | ||
3115 | * Buddy candidates are cache hot: | ||
3116 | */ | ||
3117 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && | ||
3118 | (&p->se == cfs_rq_of(&p->se)->next || | ||
3119 | &p->se == cfs_rq_of(&p->se)->last)) | ||
3120 | return 1; | ||
3121 | |||
3122 | if (sysctl_sched_migration_cost == -1) | ||
3123 | return 1; | ||
3124 | if (sysctl_sched_migration_cost == 0) | ||
3125 | return 0; | ||
3126 | |||
3127 | delta = now - p->se.exec_start; | ||
3128 | |||
3129 | return delta < (s64)sysctl_sched_migration_cost; | ||
3130 | } | ||
3131 | |||
3132 | #define LBF_ALL_PINNED 0x01 | ||
3133 | #define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ | ||
3134 | #define LBF_HAD_BREAK 0x04 | ||
3135 | #define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ | ||
3136 | #define LBF_ABORT 0x10 | ||
3137 | |||
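The two LBF_HAD_BREAK bits act as a retry counter: load_balance() converts each NEED_BREAK into a HAD_BREAK with lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK, and the fourth such addition carries into LBF_ABORT. A standalone sketch of just that arithmetic, with the flag values copied from the #defines above (the five-pass loop is only illustrative):

/* Sketch of the lb_flags break-counting arithmetic; the loop is illustrative. */
#include <stdio.h>

#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02	/* clears into HAD_BREAK */
#define LBF_HAD_BREAK	0x04
#define LBF_HAD_BREAKS	0x0C	/* count of HAD_BREAKs, overflows into ABORT */
#define LBF_ABORT	0x10

int main(void)
{
	int lb_flags = 0, pass;

	for (pass = 1; pass <= 5; pass++) {
		lb_flags |= LBF_NEED_BREAK;	/* sysctl_sched_nr_migrate was hit   */
		lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; /* clear NEED, count one */
		printf("pass %d: lb_flags=%#x abort=%d\n",
		       pass, lb_flags, !!(lb_flags & LBF_ABORT));
	}
	return 0;	/* LBF_ABORT becomes set on the fourth pass */
}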
3138 | /* | ||
2038 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 3139 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
2039 | */ | 3140 | */ |
2040 | static | 3141 | static |
2041 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 3142 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
2042 | struct sched_domain *sd, enum cpu_idle_type idle, | 3143 | struct sched_domain *sd, enum cpu_idle_type idle, |
2043 | int *all_pinned) | 3144 | int *lb_flags) |
2044 | { | 3145 | { |
2045 | int tsk_cache_hot = 0; | 3146 | int tsk_cache_hot = 0; |
2046 | /* | 3147 | /* |
@@ -2049,11 +3150,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2049 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 3150 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2050 | * 3) are cache-hot on their current CPU. | 3151 | * 3) are cache-hot on their current CPU. |
2051 | */ | 3152 | */ |
2052 | if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { | 3153 | if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { |
2053 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3154 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
2054 | return 0; | 3155 | return 0; |
2055 | } | 3156 | } |
2056 | *all_pinned = 0; | 3157 | *lb_flags &= ~LBF_ALL_PINNED; |
2057 | 3158 | ||
2058 | if (task_running(rq, p)) { | 3159 | if (task_running(rq, p)) { |
2059 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); | 3160 | schedstat_inc(p, se.statistics.nr_failed_migrations_running); |
@@ -2102,6 +3203,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2102 | 3203 | ||
2103 | for_each_leaf_cfs_rq(busiest, cfs_rq) { | 3204 | for_each_leaf_cfs_rq(busiest, cfs_rq) { |
2104 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { | 3205 | list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { |
3206 | if (throttled_lb_pair(task_group(p), | ||
3207 | busiest->cpu, this_cpu)) | ||
3208 | break; | ||
2105 | 3209 | ||
2106 | if (!can_migrate_task(p, busiest, this_cpu, | 3210 | if (!can_migrate_task(p, busiest, this_cpu, |
2107 | sd, idle, &pinned)) | 3211 | sd, idle, &pinned)) |
@@ -2124,7 +3228,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2124 | static unsigned long | 3228 | static unsigned long |
2125 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3229 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2126 | unsigned long max_load_move, struct sched_domain *sd, | 3230 | unsigned long max_load_move, struct sched_domain *sd, |
2127 | enum cpu_idle_type idle, int *all_pinned, | 3231 | enum cpu_idle_type idle, int *lb_flags, |
2128 | struct cfs_rq *busiest_cfs_rq) | 3232 | struct cfs_rq *busiest_cfs_rq) |
2129 | { | 3233 | { |
2130 | int loops = 0, pulled = 0; | 3234 | int loops = 0, pulled = 0; |
@@ -2135,12 +3239,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2135 | goto out; | 3239 | goto out; |
2136 | 3240 | ||
2137 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 3241 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
2138 | if (loops++ > sysctl_sched_nr_migrate) | 3242 | if (loops++ > sysctl_sched_nr_migrate) { |
3243 | *lb_flags |= LBF_NEED_BREAK; | ||
2139 | break; | 3244 | break; |
3245 | } | ||
2140 | 3246 | ||
2141 | if ((p->se.load.weight >> 1) > rem_load_move || | 3247 | if ((p->se.load.weight >> 1) > rem_load_move || |
2142 | !can_migrate_task(p, busiest, this_cpu, sd, idle, | 3248 | !can_migrate_task(p, busiest, this_cpu, sd, idle, |
2143 | all_pinned)) | 3249 | lb_flags)) |
2144 | continue; | 3250 | continue; |
2145 | 3251 | ||
2146 | pull_task(busiest, p, this_rq, this_cpu); | 3252 | pull_task(busiest, p, this_rq, this_cpu); |
@@ -2153,8 +3259,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2153 | * kernels will stop after the first task is pulled to minimize | 3259 | * kernels will stop after the first task is pulled to minimize |
2154 | * the critical section. | 3260 | * the critical section. |
2155 | */ | 3261 | */ |
2156 | if (idle == CPU_NEWLY_IDLE) | 3262 | if (idle == CPU_NEWLY_IDLE) { |
3263 | *lb_flags |= LBF_ABORT; | ||
2157 | break; | 3264 | break; |
3265 | } | ||
2158 | #endif | 3266 | #endif |
2159 | 3267 | ||
2160 | /* | 3268 | /* |
@@ -2217,8 +3325,13 @@ static void update_shares(int cpu) | |||
2217 | * Iterates the task_group tree in a bottom up fashion, see | 3325 | * Iterates the task_group tree in a bottom up fashion, see |
2218 | * list_add_leaf_cfs_rq() for details. | 3326 | * list_add_leaf_cfs_rq() for details. |
2219 | */ | 3327 | */ |
2220 | for_each_leaf_cfs_rq(rq, cfs_rq) | 3328 | for_each_leaf_cfs_rq(rq, cfs_rq) { |
3329 | /* throttled entities do not contribute to load */ | ||
3330 | if (throttled_hierarchy(cfs_rq)) | ||
3331 | continue; | ||
3332 | |||
2221 | update_shares_cpu(cfs_rq->tg, cpu); | 3333 | update_shares_cpu(cfs_rq->tg, cpu); |
3334 | } | ||
2222 | rcu_read_unlock(); | 3335 | rcu_read_unlock(); |
2223 | } | 3336 | } |
2224 | 3337 | ||
@@ -2254,7 +3367,7 @@ static unsigned long | |||
2254 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3367 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2255 | unsigned long max_load_move, | 3368 | unsigned long max_load_move, |
2256 | struct sched_domain *sd, enum cpu_idle_type idle, | 3369 | struct sched_domain *sd, enum cpu_idle_type idle, |
2257 | int *all_pinned) | 3370 | int *lb_flags) |
2258 | { | 3371 | { |
2259 | long rem_load_move = max_load_move; | 3372 | long rem_load_move = max_load_move; |
2260 | struct cfs_rq *busiest_cfs_rq; | 3373 | struct cfs_rq *busiest_cfs_rq; |
@@ -2267,17 +3380,21 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2267 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | 3380 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; |
2268 | u64 rem_load, moved_load; | 3381 | u64 rem_load, moved_load; |
2269 | 3382 | ||
3383 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3384 | break; | ||
3385 | |||
2270 | /* | 3386 | /* |
2271 | * empty group | 3387 | * empty group or part of a throttled hierarchy |
2272 | */ | 3388 | */ |
2273 | if (!busiest_cfs_rq->task_weight) | 3389 | if (!busiest_cfs_rq->task_weight || |
3390 | throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) | ||
2274 | continue; | 3391 | continue; |
2275 | 3392 | ||
2276 | rem_load = (u64)rem_load_move * busiest_weight; | 3393 | rem_load = (u64)rem_load_move * busiest_weight; |
2277 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 3394 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
2278 | 3395 | ||
2279 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 3396 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
2280 | rem_load, sd, idle, all_pinned, | 3397 | rem_load, sd, idle, lb_flags, |
2281 | busiest_cfs_rq); | 3398 | busiest_cfs_rq); |
2282 | 3399 | ||
2283 | if (!moved_load) | 3400 | if (!moved_load) |
@@ -2303,10 +3420,10 @@ static unsigned long | |||
2303 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3420 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2304 | unsigned long max_load_move, | 3421 | unsigned long max_load_move, |
2305 | struct sched_domain *sd, enum cpu_idle_type idle, | 3422 | struct sched_domain *sd, enum cpu_idle_type idle, |
2306 | int *all_pinned) | 3423 | int *lb_flags) |
2307 | { | 3424 | { |
2308 | return balance_tasks(this_rq, this_cpu, busiest, | 3425 | return balance_tasks(this_rq, this_cpu, busiest, |
2309 | max_load_move, sd, idle, all_pinned, | 3426 | max_load_move, sd, idle, lb_flags, |
2310 | &busiest->cfs); | 3427 | &busiest->cfs); |
2311 | } | 3428 | } |
2312 | #endif | 3429 | #endif |
@@ -2321,29 +3438,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2321 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3438 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2322 | unsigned long max_load_move, | 3439 | unsigned long max_load_move, |
2323 | struct sched_domain *sd, enum cpu_idle_type idle, | 3440 | struct sched_domain *sd, enum cpu_idle_type idle, |
2324 | int *all_pinned) | 3441 | int *lb_flags) |
2325 | { | 3442 | { |
2326 | unsigned long total_load_moved = 0, load_moved; | 3443 | unsigned long total_load_moved = 0, load_moved; |
2327 | 3444 | ||
2328 | do { | 3445 | do { |
2329 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 3446 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
2330 | max_load_move - total_load_moved, | 3447 | max_load_move - total_load_moved, |
2331 | sd, idle, all_pinned); | 3448 | sd, idle, lb_flags); |
2332 | 3449 | ||
2333 | total_load_moved += load_moved; | 3450 | total_load_moved += load_moved; |
2334 | 3451 | ||
3452 | if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) | ||
3453 | break; | ||
3454 | |||
2335 | #ifdef CONFIG_PREEMPT | 3455 | #ifdef CONFIG_PREEMPT |
2336 | /* | 3456 | /* |
2337 | * NEWIDLE balancing is a source of latency, so preemptible | 3457 | * NEWIDLE balancing is a source of latency, so preemptible |
2338 | * kernels will stop after the first task is pulled to minimize | 3458 | * kernels will stop after the first task is pulled to minimize |
2339 | * the critical section. | 3459 | * the critical section. |
2340 | */ | 3460 | */ |
2341 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) | 3461 | if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { |
2342 | break; | 3462 | *lb_flags |= LBF_ABORT; |
2343 | |||
2344 | if (raw_spin_is_contended(&this_rq->lock) || | ||
2345 | raw_spin_is_contended(&busiest->lock)) | ||
2346 | break; | 3463 | break; |
3464 | } | ||
2347 | #endif | 3465 | #endif |
2348 | } while (load_moved && max_load_move > total_load_moved); | 3466 | } while (load_moved && max_load_move > total_load_moved); |
2349 | 3467 | ||
@@ -2405,15 +3523,6 @@ struct sg_lb_stats { | |||
2405 | }; | 3523 | }; |
2406 | 3524 | ||
2407 | /** | 3525 | /** |
2408 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | ||
2409 | * @group: The group whose first cpu is to be returned. | ||
2410 | */ | ||
2411 | static inline unsigned int group_first_cpu(struct sched_group *group) | ||
2412 | { | ||
2413 | return cpumask_first(sched_group_cpus(group)); | ||
2414 | } | ||
2415 | |||
2416 | /** | ||
2417 | * get_sd_load_idx - Obtain the load index for a given sched domain. | 3526 | * get_sd_load_idx - Obtain the load index for a given sched domain. |
2418 | * @sd: The sched_domain whose load_idx is to be obtained. | 3527 | * @sd: The sched_domain whose load_idx is to be obtained. |
2419 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. | 3528 | * @idle: The Idle status of the CPU for whose sd load_idx is obtained. |
@@ -2662,7 +3771,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
2662 | sdg->sgp->power = power; | 3771 | sdg->sgp->power = power; |
2663 | } | 3772 | } |
2664 | 3773 | ||
2665 | static void update_group_power(struct sched_domain *sd, int cpu) | 3774 | void update_group_power(struct sched_domain *sd, int cpu) |
2666 | { | 3775 | { |
2667 | struct sched_domain *child = sd->child; | 3776 | struct sched_domain *child = sd->child; |
2668 | struct sched_group *group, *sdg = sd->groups; | 3777 | struct sched_group *group, *sdg = sd->groups; |
@@ -2854,7 +3963,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
2854 | } | 3963 | } |
2855 | 3964 | ||
2856 | /** | 3965 | /** |
2857 | * update_sd_lb_stats - Update sched_group's statistics for load balancing. | 3966 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
2858 | * @sd: sched_domain whose statistics are to be updated. | 3967 | * @sd: sched_domain whose statistics are to be updated. |
2859 | * @this_cpu: Cpu for which load balance is currently performed. | 3968 | * @this_cpu: Cpu for which load balance is currently performed. |
2860 | * @idle: Idle status of this_cpu | 3969 | * @idle: Idle status of this_cpu |
@@ -2928,11 +4037,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2928 | } while (sg != sd->groups); | 4037 | } while (sg != sd->groups); |
2929 | } | 4038 | } |
2930 | 4039 | ||
2931 | int __weak arch_sd_sibling_asym_packing(void) | ||
2932 | { | ||
2933 | return 0*SD_ASYM_PACKING; | ||
2934 | } | ||
2935 | |||
2936 | /** | 4040 | /** |
2937 | * check_asym_packing - Check to see if the group is packed into the | 4041 | * check_asym_packing - Check to see if the group is packed into the |
2938 | * sched domain. | 4042 | * sched domain. |
@@ -3296,7 +4400,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
3296 | #define MAX_PINNED_INTERVAL 512 | 4400 | #define MAX_PINNED_INTERVAL 512 |
3297 | 4401 | ||
3298 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4402 | /* Working cpumask for load_balance and load_balance_newidle. */ |
3299 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4403 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
3300 | 4404 | ||
3301 | static int need_active_balance(struct sched_domain *sd, int idle, | 4405 | static int need_active_balance(struct sched_domain *sd, int idle, |
3302 | int busiest_cpu, int this_cpu) | 4406 | int busiest_cpu, int this_cpu) |
@@ -3347,7 +4451,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3347 | struct sched_domain *sd, enum cpu_idle_type idle, | 4451 | struct sched_domain *sd, enum cpu_idle_type idle, |
3348 | int *balance) | 4452 | int *balance) |
3349 | { | 4453 | { |
3350 | int ld_moved, all_pinned = 0, active_balance = 0; | 4454 | int ld_moved, lb_flags = 0, active_balance = 0; |
3351 | struct sched_group *group; | 4455 | struct sched_group *group; |
3352 | unsigned long imbalance; | 4456 | unsigned long imbalance; |
3353 | struct rq *busiest; | 4457 | struct rq *busiest; |
@@ -3388,11 +4492,11 @@ redo: | |||
3388 | * still unbalanced. ld_moved simply stays zero, so it is | 4492 | * still unbalanced. ld_moved simply stays zero, so it is |
3389 | * correctly treated as an imbalance. | 4493 | * correctly treated as an imbalance. |
3390 | */ | 4494 | */ |
3391 | all_pinned = 1; | 4495 | lb_flags |= LBF_ALL_PINNED; |
3392 | local_irq_save(flags); | 4496 | local_irq_save(flags); |
3393 | double_rq_lock(this_rq, busiest); | 4497 | double_rq_lock(this_rq, busiest); |
3394 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 4498 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
3395 | imbalance, sd, idle, &all_pinned); | 4499 | imbalance, sd, idle, &lb_flags); |
3396 | double_rq_unlock(this_rq, busiest); | 4500 | double_rq_unlock(this_rq, busiest); |
3397 | local_irq_restore(flags); | 4501 | local_irq_restore(flags); |
3398 | 4502 | ||
@@ -3402,8 +4506,18 @@ redo: | |||
3402 | if (ld_moved && this_cpu != smp_processor_id()) | 4506 | if (ld_moved && this_cpu != smp_processor_id()) |
3403 | resched_cpu(this_cpu); | 4507 | resched_cpu(this_cpu); |
3404 | 4508 | ||
4509 | if (lb_flags & LBF_ABORT) | ||
4510 | goto out_balanced; | ||
4511 | |||
4512 | if (lb_flags & LBF_NEED_BREAK) { | ||
4513 | lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; | ||
4514 | if (lb_flags & LBF_ABORT) | ||
4515 | goto out_balanced; | ||
4516 | goto redo; | ||
4517 | } | ||
4518 | |||
3405 | /* All tasks on this runqueue were pinned by CPU affinity */ | 4519 | /* All tasks on this runqueue were pinned by CPU affinity */ |
3406 | if (unlikely(all_pinned)) { | 4520 | if (unlikely(lb_flags & LBF_ALL_PINNED)) { |
3407 | cpumask_clear_cpu(cpu_of(busiest), cpus); | 4521 | cpumask_clear_cpu(cpu_of(busiest), cpus); |
3408 | if (!cpumask_empty(cpus)) | 4522 | if (!cpumask_empty(cpus)) |
3409 | goto redo; | 4523 | goto redo; |
@@ -3430,10 +4544,10 @@ redo: | |||
3430 | * moved to this_cpu | 4544 | * moved to this_cpu |
3431 | */ | 4545 | */ |
3432 | if (!cpumask_test_cpu(this_cpu, | 4546 | if (!cpumask_test_cpu(this_cpu, |
3433 | &busiest->curr->cpus_allowed)) { | 4547 | tsk_cpus_allowed(busiest->curr))) { |
3434 | raw_spin_unlock_irqrestore(&busiest->lock, | 4548 | raw_spin_unlock_irqrestore(&busiest->lock, |
3435 | flags); | 4549 | flags); |
3436 | all_pinned = 1; | 4550 | lb_flags |= LBF_ALL_PINNED; |
3437 | goto out_one_pinned; | 4551 | goto out_one_pinned; |
3438 | } | 4552 | } |
3439 | 4553 | ||
@@ -3486,7 +4600,8 @@ out_balanced: | |||
3486 | 4600 | ||
3487 | out_one_pinned: | 4601 | out_one_pinned: |
3488 | /* tune up the balancing interval */ | 4602 | /* tune up the balancing interval */ |
3489 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 4603 | if (((lb_flags & LBF_ALL_PINNED) && |
4604 | sd->balance_interval < MAX_PINNED_INTERVAL) || | ||
3490 | (sd->balance_interval < sd->max_interval)) | 4605 | (sd->balance_interval < sd->max_interval)) |
3491 | sd->balance_interval *= 2; | 4606 | sd->balance_interval *= 2; |
3492 | 4607 | ||
@@ -3499,7 +4614,7 @@ out: | |||
3499 | * idle_balance is called by schedule() if this_cpu is about to become | 4614 | * idle_balance is called by schedule() if this_cpu is about to become |
3500 | * idle. Attempts to pull tasks from other CPUs. | 4615 | * idle. Attempts to pull tasks from other CPUs. |
3501 | */ | 4616 | */ |
3502 | static void idle_balance(int this_cpu, struct rq *this_rq) | 4617 | void idle_balance(int this_cpu, struct rq *this_rq) |
3503 | { | 4618 | { |
3504 | struct sched_domain *sd; | 4619 | struct sched_domain *sd; |
3505 | int pulled_task = 0; | 4620 | int pulled_task = 0; |
@@ -3612,46 +4727,18 @@ out_unlock: | |||
3612 | } | 4727 | } |
3613 | 4728 | ||
3614 | #ifdef CONFIG_NO_HZ | 4729 | #ifdef CONFIG_NO_HZ |
3615 | |||
3616 | static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); | ||
3617 | |||
3618 | static void trigger_sched_softirq(void *data) | ||
3619 | { | ||
3620 | raise_softirq_irqoff(SCHED_SOFTIRQ); | ||
3621 | } | ||
3622 | |||
3623 | static inline void init_sched_softirq_csd(struct call_single_data *csd) | ||
3624 | { | ||
3625 | csd->func = trigger_sched_softirq; | ||
3626 | csd->info = NULL; | ||
3627 | csd->flags = 0; | ||
3628 | csd->priv = 0; | ||
3629 | } | ||
3630 | |||
3631 | /* | 4730 | /* |
3632 | * idle load balancing details | 4731 | * idle load balancing details |
3633 | * - One of the idle CPUs nominates itself as idle load_balancer, while | ||
3634 | * entering idle. | ||
3635 | * - This idle load balancer CPU will also go into tickless mode when | ||
3636 | * it is idle, just like all other idle CPUs | ||
3637 | * - When one of the busy CPUs notices that there may be an idle rebalancing | 4732 | * - When one of the busy CPUs notices that there may be an idle rebalancing |
3638 | * needed, they will kick the idle load balancer, which then does idle | 4733 | * needed, they will kick the idle load balancer, which then does idle |
3639 | * load balancing for all the idle CPUs. | 4734 | * load balancing for all the idle CPUs. |
3640 | */ | 4735 | */ |
3641 | static struct { | 4736 | static struct { |
3642 | atomic_t load_balancer; | ||
3643 | atomic_t first_pick_cpu; | ||
3644 | atomic_t second_pick_cpu; | ||
3645 | cpumask_var_t idle_cpus_mask; | 4737 | cpumask_var_t idle_cpus_mask; |
3646 | cpumask_var_t grp_idle_mask; | 4738 | atomic_t nr_cpus; |
3647 | unsigned long next_balance; /* in jiffy units */ | 4739 | unsigned long next_balance; /* in jiffy units */ |
3648 | } nohz ____cacheline_aligned; | 4740 | } nohz ____cacheline_aligned; |
3649 | 4741 | ||
3650 | int get_nohz_load_balancer(void) | ||
3651 | { | ||
3652 | return atomic_read(&nohz.load_balancer); | ||
3653 | } | ||
3654 | |||
3655 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 4742 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
3656 | /** | 4743 | /** |
3657 | * lowest_flag_domain - Return lowest sched_domain containing flag. | 4744 | * lowest_flag_domain - Return lowest sched_domain containing flag. |
@@ -3667,7 +4754,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3667 | struct sched_domain *sd; | 4754 | struct sched_domain *sd; |
3668 | 4755 | ||
3669 | for_each_domain(cpu, sd) | 4756 | for_each_domain(cpu, sd) |
3670 | if (sd && (sd->flags & flag)) | 4757 | if (sd->flags & flag) |
3671 | break; | 4758 | break; |
3672 | 4759 | ||
3673 | return sd; | 4760 | return sd; |
@@ -3688,33 +4775,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) | |||
3688 | (sd && (sd->flags & flag)); sd = sd->parent) | 4775 | (sd && (sd->flags & flag)); sd = sd->parent) |
3689 | 4776 | ||
3690 | /** | 4777 | /** |
3691 | * is_semi_idle_group - Checks if the given sched_group is semi-idle. | ||
3692 | * @ilb_group: group to be checked for semi-idleness | ||
3693 | * | ||
3694 | * Returns: 1 if the group is semi-idle. 0 otherwise. | ||
3695 | * | ||
3696 | * We define a sched_group to be semi idle if it has atleast one idle-CPU | ||
3697 | * and atleast one non-idle CPU. This helper function checks if the given | ||
3698 | * sched_group is semi-idle or not. | ||
3699 | */ | ||
3700 | static inline int is_semi_idle_group(struct sched_group *ilb_group) | ||
3701 | { | ||
3702 | cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, | ||
3703 | sched_group_cpus(ilb_group)); | ||
3704 | |||
3705 | /* | ||
3706 | * A sched_group is semi-idle when it has atleast one busy cpu | ||
3707 | * and atleast one idle cpu. | ||
3708 | */ | ||
3709 | if (cpumask_empty(nohz.grp_idle_mask)) | ||
3710 | return 0; | ||
3711 | |||
3712 | if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) | ||
3713 | return 0; | ||
3714 | |||
3715 | return 1; | ||
3716 | } | ||
3717 | /** | ||
3718 | * find_new_ilb - Finds the optimum idle load balancer for nomination. | 4778 | * find_new_ilb - Finds the optimum idle load balancer for nomination. |
3719 | * @cpu: The cpu which is nominating a new idle_load_balancer. | 4779 | * @cpu: The cpu which is nominating a new idle_load_balancer. |
3720 | * | 4780 | * |
@@ -3728,9 +4788,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group) | |||
3728 | */ | 4788 | */ |
3729 | static int find_new_ilb(int cpu) | 4789 | static int find_new_ilb(int cpu) |
3730 | { | 4790 | { |
4791 | int ilb = cpumask_first(nohz.idle_cpus_mask); | ||
4792 | struct sched_group *ilbg; | ||
3731 | struct sched_domain *sd; | 4793 | struct sched_domain *sd; |
3732 | struct sched_group *ilb_group; | ||
3733 | int ilb = nr_cpu_ids; | ||
3734 | 4794 | ||
3735 | /* | 4795 | /* |
3736 | * Have idle load balancer selection from semi-idle packages only | 4796 | * Have idle load balancer selection from semi-idle packages only |
@@ -3748,23 +4808,28 @@ static int find_new_ilb(int cpu) | |||
3748 | 4808 | ||
3749 | rcu_read_lock(); | 4809 | rcu_read_lock(); |
3750 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 4810 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
3751 | ilb_group = sd->groups; | 4811 | ilbg = sd->groups; |
3752 | 4812 | ||
3753 | do { | 4813 | do { |
3754 | if (is_semi_idle_group(ilb_group)) { | 4814 | if (ilbg->group_weight != |
3755 | ilb = cpumask_first(nohz.grp_idle_mask); | 4815 | atomic_read(&ilbg->sgp->nr_busy_cpus)) { |
4816 | ilb = cpumask_first_and(nohz.idle_cpus_mask, | ||
4817 | sched_group_cpus(ilbg)); | ||
3756 | goto unlock; | 4818 | goto unlock; |
3757 | } | 4819 | } |
3758 | 4820 | ||
3759 | ilb_group = ilb_group->next; | 4821 | ilbg = ilbg->next; |
3760 | 4822 | ||
3761 | } while (ilb_group != sd->groups); | 4823 | } while (ilbg != sd->groups); |
3762 | } | 4824 | } |
3763 | unlock: | 4825 | unlock: |
3764 | rcu_read_unlock(); | 4826 | rcu_read_unlock(); |
3765 | 4827 | ||
3766 | out_done: | 4828 | out_done: |
3767 | return ilb; | 4829 | if (ilb < nr_cpu_ids && idle_cpu(ilb)) |
4830 | return ilb; | ||
4831 | |||
4832 | return nr_cpu_ids; | ||
3768 | } | 4833 | } |
3769 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 4834 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3770 | static inline int find_new_ilb(int call_cpu) | 4835 | static inline int find_new_ilb(int call_cpu) |
@@ -3784,94 +4849,68 @@ static void nohz_balancer_kick(int cpu) | |||
3784 | 4849 | ||
3785 | nohz.next_balance++; | 4850 | nohz.next_balance++; |
3786 | 4851 | ||
3787 | ilb_cpu = get_nohz_load_balancer(); | 4852 | ilb_cpu = find_new_ilb(cpu); |
3788 | |||
3789 | if (ilb_cpu >= nr_cpu_ids) { | ||
3790 | ilb_cpu = cpumask_first(nohz.idle_cpus_mask); | ||
3791 | if (ilb_cpu >= nr_cpu_ids) | ||
3792 | return; | ||
3793 | } | ||
3794 | 4853 | ||
3795 | if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { | 4854 | if (ilb_cpu >= nr_cpu_ids) |
3796 | struct call_single_data *cp; | 4855 | return; |
3797 | 4856 | ||
3798 | cpu_rq(ilb_cpu)->nohz_balance_kick = 1; | 4857 | if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) |
3799 | cp = &per_cpu(remote_sched_softirq_cb, cpu); | 4858 | return; |
3800 | __smp_call_function_single(ilb_cpu, cp, 0); | 4859 | /* |
3801 | } | 4860 | * Use smp_send_reschedule() instead of resched_cpu(). |
4861 | * This way we generate a sched IPI on the target cpu which | ||
4862 | * is idle. And the softirq performing nohz idle load balance | ||
4863 | * will be run before returning from the IPI. | ||
4864 | */ | ||
4865 | smp_send_reschedule(ilb_cpu); | ||
3802 | return; | 4866 | return; |
3803 | } | 4867 | } |
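A rough standalone sketch of the kick handshake: the busy cpu sends the reschedule IPI only when it is the first to set the pending bit, and the target clears the bit once nohz_idle_balance() has run, so at most one kick is in flight per idle load balancer. Plain C11 atomics stand in for the kernel's per-cpu nohz_flags() and test_and_set_bit(); KICK_PENDING and the function names below are illustrative:

/* Sketch only: models the at-most-one-kick-in-flight property, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

#define KICK_PENDING 0x1		/* stands in for the NOHZ_BALANCE_KICK bit */

static atomic_uint ilb_nohz_flags;	/* per-cpu nohz_flags(ilb_cpu) in the kernel */

static void kick_ilb(void)
{
	/* test_and_set_bit(): only the first setter sends the IPI */
	if (atomic_fetch_or(&ilb_nohz_flags, KICK_PENDING) & KICK_PENDING)
		return;				/* a kick is already pending */
	printf("IPI sent to ilb cpu\n");	/* smp_send_reschedule(ilb_cpu) */
}

static void ilb_done(void)
{
	/* end of nohz_idle_balance(): allow the next kick */
	atomic_fetch_and(&ilb_nohz_flags, ~(unsigned int)KICK_PENDING);
}

int main(void)
{
	kick_ilb();	/* sends the IPI     */
	kick_ilb();	/* suppressed        */
	ilb_done();	/* balancing done    */
	kick_ilb();	/* sends another IPI */
	return 0;
}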
3804 | 4868 | ||
3805 | /* | 4869 | static inline void set_cpu_sd_state_busy(void) |
3806 | * This routine will try to nominate the ilb (idle load balancing) | ||
3807 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | ||
3808 | * load balancing on behalf of all those cpus. | ||
3809 | * | ||
3810 | * When the ilb owner becomes busy, we will not have new ilb owner until some | ||
3811 | * idle CPU wakes up and goes back to idle or some busy CPU tries to kick | ||
3812 | * idle load balancing by kicking one of the idle CPUs. | ||
3813 | * | ||
3814 | * Ticks are stopped for the ilb owner as well, with busy CPU kicking this | ||
3815 | * ilb owner CPU in future (when there is a need for idle load balancing on | ||
3816 | * behalf of all idle CPUs). | ||
3817 | */ | ||
3818 | void select_nohz_load_balancer(int stop_tick) | ||
3819 | { | 4870 | { |
4871 | struct sched_domain *sd; | ||
3820 | int cpu = smp_processor_id(); | 4872 | int cpu = smp_processor_id(); |
3821 | 4873 | ||
3822 | if (stop_tick) { | 4874 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
3823 | if (!cpu_active(cpu)) { | 4875 | return; |
3824 | if (atomic_read(&nohz.load_balancer) != cpu) | 4876 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); |
3825 | return; | ||
3826 | |||
3827 | /* | ||
3828 | * If we are going offline and still the leader, | ||
3829 | * give up! | ||
3830 | */ | ||
3831 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | ||
3832 | nr_cpu_ids) != cpu) | ||
3833 | BUG(); | ||
3834 | 4877 | ||
3835 | return; | 4878 | rcu_read_lock(); |
3836 | } | 4879 | for_each_domain(cpu, sd) |
4880 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | ||
4881 | rcu_read_unlock(); | ||
4882 | } | ||
3837 | 4883 | ||
3838 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 4884 | void set_cpu_sd_state_idle(void) |
4885 | { | ||
4886 | struct sched_domain *sd; | ||
4887 | int cpu = smp_processor_id(); | ||
3839 | 4888 | ||
3840 | if (atomic_read(&nohz.first_pick_cpu) == cpu) | 4889 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) |
3841 | atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); | 4890 | return; |
3842 | if (atomic_read(&nohz.second_pick_cpu) == cpu) | 4891 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); |
3843 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | ||
3844 | 4892 | ||
3845 | if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { | 4893 | rcu_read_lock(); |
3846 | int new_ilb; | 4894 | for_each_domain(cpu, sd) |
4895 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | ||
4896 | rcu_read_unlock(); | ||
4897 | } | ||
3847 | 4898 | ||
3848 | /* make me the ilb owner */ | 4899 | /* |
3849 | if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, | 4900 | * This routine will record that this cpu is going idle with tick stopped. |
3850 | cpu) != nr_cpu_ids) | 4901 | * This info will be used in performing idle load balancing in the future. |
3851 | return; | 4902 | */ |
4903 | void select_nohz_load_balancer(int stop_tick) | ||
4904 | { | ||
4905 | int cpu = smp_processor_id(); | ||
3852 | 4906 | ||
3853 | /* | 4907 | if (stop_tick) { |
3854 | * Check to see if there is a more power-efficient | 4908 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
3855 | * ilb. | ||
3856 | */ | ||
3857 | new_ilb = find_new_ilb(cpu); | ||
3858 | if (new_ilb < nr_cpu_ids && new_ilb != cpu) { | ||
3859 | atomic_set(&nohz.load_balancer, nr_cpu_ids); | ||
3860 | resched_cpu(new_ilb); | ||
3861 | return; | ||
3862 | } | ||
3863 | return; | ||
3864 | } | ||
3865 | } else { | ||
3866 | if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) | ||
3867 | return; | 4909 | return; |
3868 | 4910 | ||
3869 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 4911 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
3870 | 4912 | atomic_inc(&nohz.nr_cpus); | |
3871 | if (atomic_read(&nohz.load_balancer) == cpu) | 4913 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
3872 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, | ||
3873 | nr_cpu_ids) != cpu) | ||
3874 | BUG(); | ||
3875 | } | 4914 | } |
3876 | return; | 4915 | return; |
3877 | } | 4916 | } |
@@ -3885,7 +4924,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; | |||
3885 | * Scale the max load_balance interval with the number of CPUs in the system. | 4924 | * Scale the max load_balance interval with the number of CPUs in the system. |
3886 | * This trades load-balance latency on larger machines for less cross talk. | 4925 | * This trades load-balance latency on larger machines for less cross talk. |
3887 | */ | 4926 | */ |
3888 | static void update_max_interval(void) | 4927 | void update_max_interval(void) |
3889 | { | 4928 | { |
3890 | max_load_balance_interval = HZ*num_online_cpus()/10; | 4929 | max_load_balance_interval = HZ*num_online_cpus()/10; |
3891 | } | 4930 | } |
@@ -3977,11 +5016,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
3977 | struct rq *rq; | 5016 | struct rq *rq; |
3978 | int balance_cpu; | 5017 | int balance_cpu; |
3979 | 5018 | ||
3980 | if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) | 5019 | if (idle != CPU_IDLE || |
3981 | return; | 5020 | !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) |
5021 | goto end; | ||
3982 | 5022 | ||
3983 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { | 5023 | for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { |
3984 | if (balance_cpu == this_cpu) | 5024 | if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) |
3985 | continue; | 5025 | continue; |
3986 | 5026 | ||
3987 | /* | 5027 | /* |
@@ -3989,10 +5029,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
3989 | * work being done for other cpus. Next load | 5029 | * work being done for other cpus. Next load |
3990 | * balancing owner will pick it up. | 5030 | * balancing owner will pick it up. |
3991 | */ | 5031 | */ |
3992 | if (need_resched()) { | 5032 | if (need_resched()) |
3993 | this_rq->nohz_balance_kick = 0; | ||
3994 | break; | 5033 | break; |
3995 | } | ||
3996 | 5034 | ||
3997 | raw_spin_lock_irq(&this_rq->lock); | 5035 | raw_spin_lock_irq(&this_rq->lock); |
3998 | update_rq_clock(this_rq); | 5036 | update_rq_clock(this_rq); |
@@ -4006,53 +5044,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4006 | this_rq->next_balance = rq->next_balance; | 5044 | this_rq->next_balance = rq->next_balance; |
4007 | } | 5045 | } |
4008 | nohz.next_balance = this_rq->next_balance; | 5046 | nohz.next_balance = this_rq->next_balance; |
4009 | this_rq->nohz_balance_kick = 0; | 5047 | end: |
5048 | clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); | ||
4010 | } | 5049 | } |
4011 | 5050 | ||
4012 | /* | 5051 | /* |
4013 | * Current heuristic for kicking the idle load balancer | 5052 | * Current heuristic for kicking the idle load balancer in the presence |
4014 | * - first_pick_cpu is the one of the busy CPUs. It will kick | 5053 | * of an idle cpu in the system. |
4015 | * idle load balancer when it has more than one process active. This | 5054 | * - This rq has more than one task. |
4016 | * eliminates the need for idle load balancing altogether when we have | 5055 | * - At any scheduler domain level, this cpu's scheduler group has multiple |
4017 | * only one running process in the system (common case). | 5056 | * busy cpus exceeding the group's power. |
4018 | * - If there are more than one busy CPU, idle load balancer may have | 5057 | * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler |
4019 | * to run for active_load_balance to happen (i.e., two busy CPUs are | 5058 | * domain span are idle. |
4020 | * SMT or core siblings and can run better if they move to different | ||
4021 | * physical CPUs). So, second_pick_cpu is the second of the busy CPUs | ||
4022 | * which will kick idle load balancer as soon as it has any load. | ||
4023 | */ | 5059 | */ |
4024 | static inline int nohz_kick_needed(struct rq *rq, int cpu) | 5060 | static inline int nohz_kick_needed(struct rq *rq, int cpu) |
4025 | { | 5061 | { |
4026 | unsigned long now = jiffies; | 5062 | unsigned long now = jiffies; |
4027 | int ret; | 5063 | struct sched_domain *sd; |
4028 | int first_pick_cpu, second_pick_cpu; | ||
4029 | 5064 | ||
4030 | if (time_before(now, nohz.next_balance)) | 5065 | if (unlikely(idle_cpu(cpu))) |
4031 | return 0; | 5066 | return 0; |
4032 | 5067 | ||
4033 | if (rq->idle_at_tick) | 5068 | /* |
4034 | return 0; | 5069 | * We may be recently in ticked or tickless idle mode. At the first |
5070 | * busy tick after returning from idle, we will update the busy stats. | ||
5071 | */ | ||
5072 | set_cpu_sd_state_busy(); | ||
5073 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | ||
5074 | clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | ||
5075 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | ||
5076 | atomic_dec(&nohz.nr_cpus); | ||
5077 | } | ||
4035 | 5078 | ||
4036 | first_pick_cpu = atomic_read(&nohz.first_pick_cpu); | 5079 | /* |
4037 | second_pick_cpu = atomic_read(&nohz.second_pick_cpu); | 5080 | * None are in tickless mode and hence no need for NOHZ idle load |
5081 | * balancing. | ||
5082 | */ | ||
5083 | if (likely(!atomic_read(&nohz.nr_cpus))) | ||
5084 | return 0; | ||
4038 | 5085 | ||
4039 | if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && | 5086 | if (time_before(now, nohz.next_balance)) |
4040 | second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) | ||
4041 | return 0; | 5087 | return 0; |
4042 | 5088 | ||
4043 | ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); | 5089 | if (rq->nr_running >= 2) |
4044 | if (ret == nr_cpu_ids || ret == cpu) { | 5090 | goto need_kick; |
4045 | atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); | 5091 | |
4046 | if (rq->nr_running > 1) | 5092 | rcu_read_lock(); |
4047 | return 1; | 5093 | for_each_domain(cpu, sd) { |
4048 | } else { | 5094 | struct sched_group *sg = sd->groups; |
4049 | ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); | 5095 | struct sched_group_power *sgp = sg->sgp; |
4050 | if (ret == nr_cpu_ids || ret == cpu) { | 5096 | int nr_busy = atomic_read(&sgp->nr_busy_cpus); |
4051 | if (rq->nr_running) | 5097 | |
4052 | return 1; | 5098 | if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) |
4053 | } | 5099 | goto need_kick_unlock; |
5100 | |||
5101 | if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight | ||
5102 | && (cpumask_first_and(nohz.idle_cpus_mask, | ||
5103 | sched_domain_span(sd)) < cpu)) | ||
5104 | goto need_kick_unlock; | ||
5105 | |||
5106 | if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) | ||
5107 | break; | ||
4054 | } | 5108 | } |
5109 | rcu_read_unlock(); | ||
4055 | return 0; | 5110 | return 0; |
5111 | |||
5112 | need_kick_unlock: | ||
5113 | rcu_read_unlock(); | ||
5114 | need_kick: | ||
5115 | return 1; | ||
4056 | } | 5116 | } |
4057 | #else | 5117 | #else |
4058 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } | 5118 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } |
@@ -4066,7 +5126,7 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
4066 | { | 5126 | { |
4067 | int this_cpu = smp_processor_id(); | 5127 | int this_cpu = smp_processor_id(); |
4068 | struct rq *this_rq = cpu_rq(this_cpu); | 5128 | struct rq *this_rq = cpu_rq(this_cpu); |
4069 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | 5129 | enum cpu_idle_type idle = this_rq->idle_balance ? |
4070 | CPU_IDLE : CPU_NOT_IDLE; | 5130 | CPU_IDLE : CPU_NOT_IDLE; |
4071 | 5131 | ||
4072 | rebalance_domains(this_cpu, idle); | 5132 | rebalance_domains(this_cpu, idle); |
@@ -4087,14 +5147,14 @@ static inline int on_null_domain(int cpu) | |||
4087 | /* | 5147 | /* |
4088 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 5148 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
4089 | */ | 5149 | */ |
4090 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 5150 | void trigger_load_balance(struct rq *rq, int cpu) |
4091 | { | 5151 | { |
4092 | /* Don't need to rebalance while attached to NULL domain */ | 5152 | /* Don't need to rebalance while attached to NULL domain */ |
4093 | if (time_after_eq(jiffies, rq->next_balance) && | 5153 | if (time_after_eq(jiffies, rq->next_balance) && |
4094 | likely(!on_null_domain(cpu))) | 5154 | likely(!on_null_domain(cpu))) |
4095 | raise_softirq(SCHED_SOFTIRQ); | 5155 | raise_softirq(SCHED_SOFTIRQ); |
4096 | #ifdef CONFIG_NO_HZ | 5156 | #ifdef CONFIG_NO_HZ |
4097 | else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 5157 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) |
4098 | nohz_balancer_kick(cpu); | 5158 | nohz_balancer_kick(cpu); |
4099 | #endif | 5159 | #endif |
4100 | } | 5160 | } |
@@ -4109,15 +5169,6 @@ static void rq_offline_fair(struct rq *rq) | |||
4109 | update_sysctl(); | 5169 | update_sysctl(); |
4110 | } | 5170 | } |
4111 | 5171 | ||
4112 | #else /* CONFIG_SMP */ | ||
4113 | |||
4114 | /* | ||
4115 | * on UP we do not need to balance between CPUs: | ||
4116 | */ | ||
4117 | static inline void idle_balance(int cpu, struct rq *rq) | ||
4118 | { | ||
4119 | } | ||
4120 | |||
4121 | #endif /* CONFIG_SMP */ | 5172 | #endif /* CONFIG_SMP */ |
4122 | 5173 | ||
4123 | /* | 5174 | /* |
@@ -4141,8 +5192,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
4141 | */ | 5192 | */ |
4142 | static void task_fork_fair(struct task_struct *p) | 5193 | static void task_fork_fair(struct task_struct *p) |
4143 | { | 5194 | { |
4144 | struct cfs_rq *cfs_rq = task_cfs_rq(current); | 5195 | struct cfs_rq *cfs_rq; |
4145 | struct sched_entity *se = &p->se, *curr = cfs_rq->curr; | 5196 | struct sched_entity *se = &p->se, *curr; |
4146 | int this_cpu = smp_processor_id(); | 5197 | int this_cpu = smp_processor_id(); |
4147 | struct rq *rq = this_rq(); | 5198 | struct rq *rq = this_rq(); |
4148 | unsigned long flags; | 5199 | unsigned long flags; |
@@ -4151,6 +5202,9 @@ static void task_fork_fair(struct task_struct *p) | |||
4151 | 5202 | ||
4152 | update_rq_clock(rq); | 5203 | update_rq_clock(rq); |
4153 | 5204 | ||
5205 | cfs_rq = task_cfs_rq(current); | ||
5206 | curr = cfs_rq->curr; | ||
5207 | |||
4154 | if (unlikely(task_cpu(p) != this_cpu)) { | 5208 | if (unlikely(task_cpu(p) != this_cpu)) { |
4155 | rcu_read_lock(); | 5209 | rcu_read_lock(); |
4156 | __set_task_cpu(p, this_cpu); | 5210 | __set_task_cpu(p, this_cpu); |
@@ -4251,8 +5305,23 @@ static void set_curr_task_fair(struct rq *rq) | |||
4251 | { | 5305 | { |
4252 | struct sched_entity *se = &rq->curr->se; | 5306 | struct sched_entity *se = &rq->curr->se; |
4253 | 5307 | ||
4254 | for_each_sched_entity(se) | 5308 | for_each_sched_entity(se) { |
4255 | set_next_entity(cfs_rq_of(se), se); | 5309 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
5310 | |||
5311 | set_next_entity(cfs_rq, se); | ||
5312 | /* ensure bandwidth has been allocated on our new cfs_rq */ | ||
5313 | account_cfs_rq_runtime(cfs_rq, 0); | ||
5314 | } | ||
5315 | } | ||
5316 | |||
5317 | void init_cfs_rq(struct cfs_rq *cfs_rq) | ||
5318 | { | ||
5319 | cfs_rq->tasks_timeline = RB_ROOT; | ||
5320 | INIT_LIST_HEAD(&cfs_rq->tasks); | ||
5321 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | ||
5322 | #ifndef CONFIG_64BIT | ||
5323 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
5324 | #endif | ||
4256 | } | 5325 | } |
4257 | 5326 | ||
4258 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5327 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -4271,13 +5340,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) | |||
4271 | * to another cgroup's rq. This does somewhat interfere with the | 5340 | * to another cgroup's rq. This does somewhat interfere with the |
4272 | * fair sleeper stuff for the first placement, but who cares. | 5341 | * fair sleeper stuff for the first placement, but who cares. |
4273 | */ | 5342 | */ |
5343 | /* | ||
5344 | * When !on_rq, vruntime of the task has usually NOT been normalized. | ||
5345 | * But there are some cases where it has already been normalized: | ||
5346 | * | ||
5347 | * - Moving a forked child which is waiting for being woken up by | ||
5348 | * wake_up_new_task(). | ||
5349 | * - Moving a task which has been woken up by try_to_wake_up() and | ||
5350 | * waiting for actually being woken up by sched_ttwu_pending(). | ||
5351 | * | ||
5352 | * To prevent boost or penalty in the new cfs_rq caused by delta | ||
5353 | * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. | ||
5354 | */ | ||
5355 | if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) | ||
5356 | on_rq = 1; | ||
5357 | |||
4274 | if (!on_rq) | 5358 | if (!on_rq) |
4275 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; | 5359 | p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; |
4276 | set_task_rq(p, task_cpu(p)); | 5360 | set_task_rq(p, task_cpu(p)); |
4277 | if (!on_rq) | 5361 | if (!on_rq) |
4278 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; | 5362 | p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; |
4279 | } | 5363 | } |
5364 | |||
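The adjustment the comment above refers to amounts to storing vruntime relative to the old queue's min_vruntime and re-adding the new queue's, as the two lines around set_task_rq() do. A tiny sketch with made-up numbers (1000/900/5000 are purely illustrative):

/* Sketch (made-up numbers): why vruntime is re-based when changing cfs_rqs. */
#include <stdio.h>

int main(void)
{
	unsigned long long vruntime = 1000;	/* task's vruntime on the old cfs_rq */
	unsigned long long old_min  = 900;	/* old cfs_rq->min_vruntime          */
	unsigned long long new_min  = 5000;	/* new cfs_rq->min_vruntime          */

	vruntime -= old_min;	/* now relative to the old queue: 100                */
	/* set_task_rq(p, task_cpu(p)) switches the task to the new cfs_rq here     */
	vruntime += new_min;	/* re-based on the new queue: 5100                   */

	printf("vruntime = %llu\n", vruntime);
	return 0;	/* keeping 1000 unchanged would look ~4000 behind: an unfair boost */
}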
5365 | void free_fair_sched_group(struct task_group *tg) | ||
5366 | { | ||
5367 | int i; | ||
5368 | |||
5369 | destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
5370 | |||
5371 | for_each_possible_cpu(i) { | ||
5372 | if (tg->cfs_rq) | ||
5373 | kfree(tg->cfs_rq[i]); | ||
5374 | if (tg->se) | ||
5375 | kfree(tg->se[i]); | ||
5376 | } | ||
5377 | |||
5378 | kfree(tg->cfs_rq); | ||
5379 | kfree(tg->se); | ||
5380 | } | ||
5381 | |||
5382 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
5383 | { | ||
5384 | struct cfs_rq *cfs_rq; | ||
5385 | struct sched_entity *se; | ||
5386 | int i; | ||
5387 | |||
5388 | tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); | ||
5389 | if (!tg->cfs_rq) | ||
5390 | goto err; | ||
5391 | tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); | ||
5392 | if (!tg->se) | ||
5393 | goto err; | ||
5394 | |||
5395 | tg->shares = NICE_0_LOAD; | ||
5396 | |||
5397 | init_cfs_bandwidth(tg_cfs_bandwidth(tg)); | ||
5398 | |||
5399 | for_each_possible_cpu(i) { | ||
5400 | cfs_rq = kzalloc_node(sizeof(struct cfs_rq), | ||
5401 | GFP_KERNEL, cpu_to_node(i)); | ||
5402 | if (!cfs_rq) | ||
5403 | goto err; | ||
5404 | |||
5405 | se = kzalloc_node(sizeof(struct sched_entity), | ||
5406 | GFP_KERNEL, cpu_to_node(i)); | ||
5407 | if (!se) | ||
5408 | goto err_free_rq; | ||
5409 | |||
5410 | init_cfs_rq(cfs_rq); | ||
5411 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | ||
5412 | } | ||
5413 | |||
5414 | return 1; | ||
5415 | |||
5416 | err_free_rq: | ||
5417 | kfree(cfs_rq); | ||
5418 | err: | ||
5419 | return 0; | ||
5420 | } | ||
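alloc_fair_sched_group() above uses the usual goto-unwind idiom: the err_free_rq label frees only what the current loop pass allocated, and returning 0 tells the caller to run the normal free path, which the zero-allocated top-level arrays make safe for passes that never ran. A minimal user-space sketch of the same pattern, with hypothetical names:

#include <stdlib.h>

struct item { int payload; };

/* Returns 1 on success, 0 on failure, mirroring alloc_fair_sched_group(). */
static int alloc_pairs(struct item **a, struct item **b, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		a[i] = calloc(1, sizeof(*a[i]));
		if (!a[i])
			goto err;

		b[i] = calloc(1, sizeof(*b[i]));
		if (!b[i])
			goto err_free_a;	/* undo only this pass */
	}
	return 1;

err_free_a:
	free(a[i]);
	a[i] = NULL;
err:
	return 0;	/* caller's free path handles earlier passes */
}

int main(void)
{
	struct item *a[4] = { NULL }, *b[4] = { NULL };

	return alloc_pairs(a, b, 4) ? 0 : 1;
}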
5421 | |||
5422 | void unregister_fair_sched_group(struct task_group *tg, int cpu) | ||
5423 | { | ||
5424 | struct rq *rq = cpu_rq(cpu); | ||
5425 | unsigned long flags; | ||
5426 | |||
5427 | /* | ||
5428 | * Only empty task groups can be destroyed; so we can speculatively | ||
5429 | * check on_list without danger of it being re-added. | ||
5430 | */ | ||
5431 | if (!tg->cfs_rq[cpu]->on_list) | ||
5432 | return; | ||
5433 | |||
5434 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5435 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
5436 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5437 | } | ||
5438 | |||
5439 | void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
5440 | struct sched_entity *se, int cpu, | ||
5441 | struct sched_entity *parent) | ||
5442 | { | ||
5443 | struct rq *rq = cpu_rq(cpu); | ||
5444 | |||
5445 | cfs_rq->tg = tg; | ||
5446 | cfs_rq->rq = rq; | ||
5447 | #ifdef CONFIG_SMP | ||
5448 | /* allow initial update_cfs_load() to truncate */ | ||
5449 | cfs_rq->load_stamp = 1; | ||
4280 | #endif | 5450 | #endif |
5451 | init_cfs_rq_runtime(cfs_rq); | ||
5452 | |||
5453 | tg->cfs_rq[cpu] = cfs_rq; | ||
5454 | tg->se[cpu] = se; | ||
5455 | |||
5456 | /* se could be NULL for root_task_group */ | ||
5457 | if (!se) | ||
5458 | return; | ||
5459 | |||
5460 | if (!parent) | ||
5461 | se->cfs_rq = &rq->cfs; | ||
5462 | else | ||
5463 | se->cfs_rq = parent->my_q; | ||
5464 | |||
5465 | se->my_q = cfs_rq; | ||
5466 | update_load_set(&se->load, 0); | ||
5467 | se->parent = parent; | ||
5468 | } | ||
5469 | |||
5470 | static DEFINE_MUTEX(shares_mutex); | ||
5471 | |||
5472 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | ||
5473 | { | ||
5474 | int i; | ||
5475 | unsigned long flags; | ||
5476 | |||
5477 | /* | ||
5478 | * We can't change the weight of the root cgroup. | ||
5479 | */ | ||
5480 | if (!tg->se[0]) | ||
5481 | return -EINVAL; | ||
5482 | |||
5483 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); | ||
5484 | |||
5485 | mutex_lock(&shares_mutex); | ||
5486 | if (tg->shares == shares) | ||
5487 | goto done; | ||
5488 | |||
5489 | tg->shares = shares; | ||
5490 | for_each_possible_cpu(i) { | ||
5491 | struct rq *rq = cpu_rq(i); | ||
5492 | struct sched_entity *se; | ||
5493 | |||
5494 | se = tg->se[i]; | ||
5495 | /* Propagate contribution to hierarchy */ | ||
5496 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5497 | for_each_sched_entity(se) | ||
5498 | update_cfs_shares(group_cfs_rq(se)); | ||
5499 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5500 | } | ||
5501 | |||
5502 | done: | ||
5503 | mutex_unlock(&shares_mutex); | ||
5504 | return 0; | ||
5505 | } | ||
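The clamp at the top of sched_group_set_shares() bounds the request to [scale_load(MIN_SHARES), scale_load(MAX_SHARES)]; MIN_SHARES and MAX_SHARES are defined as 2 and 1<<18 in sched.h below. A tiny user-space sketch of that bounding (treating the scale_load() factor as 1 for simplicity):

#include <stdio.h>

#define MIN_SHARES (1UL << 1)	/* 2 */
#define MAX_SHARES (1UL << 18)	/* 262144 */

/* same effect as clamp(shares, MIN_SHARES, MAX_SHARES) */
static unsigned long clamp_shares(unsigned long val)
{
	if (val < MIN_SHARES)
		return MIN_SHARES;
	if (val > MAX_SHARES)
		return MAX_SHARES;
	return val;
}

int main(void)
{
	printf("%lu %lu %lu\n",
	       clamp_shares(0),			/* 2 */
	       clamp_shares(1024),		/* 1024, the default */
	       clamp_shares(1UL << 20));	/* 262144 */
	return 0;
}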
5506 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
5507 | |||
5508 | void free_fair_sched_group(struct task_group *tg) { } | ||
5509 | |||
5510 | int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | ||
5511 | { | ||
5512 | return 1; | ||
5513 | } | ||
5514 | |||
5515 | void unregister_fair_sched_group(struct task_group *tg, int cpu) { } | ||
5516 | |||
5517 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
5518 | |||
4281 | 5519 | ||
4282 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) | 5520 | static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) |
4283 | { | 5521 | { |
@@ -4297,7 +5535,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task | |||
4297 | /* | 5535 | /* |
4298 | * All the scheduling class methods: | 5536 | * All the scheduling class methods: |
4299 | */ | 5537 | */ |
4300 | static const struct sched_class fair_sched_class = { | 5538 | const struct sched_class fair_sched_class = { |
4301 | .next = &idle_sched_class, | 5539 | .next = &idle_sched_class, |
4302 | .enqueue_task = enqueue_task_fair, | 5540 | .enqueue_task = enqueue_task_fair, |
4303 | .dequeue_task = dequeue_task_fair, | 5541 | .dequeue_task = dequeue_task_fair, |
@@ -4334,7 +5572,7 @@ static const struct sched_class fair_sched_class = { | |||
4334 | }; | 5572 | }; |
4335 | 5573 | ||
4336 | #ifdef CONFIG_SCHED_DEBUG | 5574 | #ifdef CONFIG_SCHED_DEBUG |
4337 | static void print_cfs_stats(struct seq_file *m, int cpu) | 5575 | void print_cfs_stats(struct seq_file *m, int cpu) |
4338 | { | 5576 | { |
4339 | struct cfs_rq *cfs_rq; | 5577 | struct cfs_rq *cfs_rq; |
4340 | 5578 | ||
@@ -4344,3 +5582,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
4344 | rcu_read_unlock(); | 5582 | rcu_read_unlock(); |
4345 | } | 5583 | } |
4346 | #endif | 5584 | #endif |
5585 | |||
5586 | __init void init_sched_fair_class(void) | ||
5587 | { | ||
5588 | #ifdef CONFIG_SMP | ||
5589 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | ||
5590 | |||
5591 | #ifdef CONFIG_NO_HZ | ||
5592 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | ||
5593 | #endif | ||
5594 | #endif /* SMP */ | ||
5595 | |||
5596 | } | ||
diff --git a/kernel/sched_features.h b/kernel/sched/features.h index 2e74677cb040..e61fd73913d0 100644 --- a/kernel/sched_features.h +++ b/kernel/sched/features.h | |||
@@ -3,18 +3,13 @@ | |||
3 | * them to run sooner, but does not allow tons of sleepers to | 3 | * them to run sooner, but does not allow tons of sleepers to |
4 | * rip the spread apart. | 4 | * rip the spread apart. |
5 | */ | 5 | */ |
6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) | 6 | SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) |
7 | 7 | ||
8 | /* | 8 | /* |
9 | * Place new tasks ahead so that they do not starve already running | 9 | * Place new tasks ahead so that they do not starve already running |
10 | * tasks | 10 | * tasks |
11 | */ | 11 | */ |
12 | SCHED_FEAT(START_DEBIT, 1) | 12 | SCHED_FEAT(START_DEBIT, true) |
13 | |||
14 | /* | ||
15 | * Should wakeups try to preempt running tasks. | ||
16 | */ | ||
17 | SCHED_FEAT(WAKEUP_PREEMPT, 1) | ||
18 | 13 | ||
19 | /* | 14 | /* |
20 | * Based on load and program behaviour, see if it makes sense to place | 15 | * Based on load and program behaviour, see if it makes sense to place |
@@ -22,53 +17,54 @@ SCHED_FEAT(WAKEUP_PREEMPT, 1) | |||
22 | * improve cache locality. Typically used with SYNC wakeups as | 17 | * improve cache locality. Typically used with SYNC wakeups as |
23 | * generated by pipes and the like, see also SYNC_WAKEUPS. | 18 | * generated by pipes and the like, see also SYNC_WAKEUPS. |
24 | */ | 19 | */ |
25 | SCHED_FEAT(AFFINE_WAKEUPS, 1) | 20 | SCHED_FEAT(AFFINE_WAKEUPS, true) |
26 | 21 | ||
27 | /* | 22 | /* |
28 | * Prefer to schedule the task we woke last (assuming it failed | 23 | * Prefer to schedule the task we woke last (assuming it failed |
29 | * wakeup-preemption), since it's likely going to consume data we | 24 | * wakeup-preemption), since it's likely going to consume data we |
30 | * touched, increases cache locality. | 25 | * touched, increases cache locality. |
31 | */ | 26 | */ |
32 | SCHED_FEAT(NEXT_BUDDY, 0) | 27 | SCHED_FEAT(NEXT_BUDDY, false) |
33 | 28 | ||
34 | /* | 29 | /* |
35 | * Prefer to schedule the task that ran last (when we did | 30 | * Prefer to schedule the task that ran last (when we did |
36 | * wake-preempt) as that likely will touch the same data, increases | 31 | * wake-preempt) as that likely will touch the same data, increases |
37 | * cache locality. | 32 | * cache locality. |
38 | */ | 33 | */ |
39 | SCHED_FEAT(LAST_BUDDY, 1) | 34 | SCHED_FEAT(LAST_BUDDY, true) |
40 | 35 | ||
41 | /* | 36 | /* |
42 | * Consider buddies to be cache hot, decreases the likelihood of a | 37 | * Consider buddies to be cache hot, decreases the likelihood of a |
43 | * cache buddy being migrated away, increases cache locality. | 38 | * cache buddy being migrated away, increases cache locality. |
44 | */ | 39 | */ |
45 | SCHED_FEAT(CACHE_HOT_BUDDY, 1) | 40 | SCHED_FEAT(CACHE_HOT_BUDDY, true) |
46 | 41 | ||
47 | /* | 42 | /* |
48 | * Use arch dependent cpu power functions | 43 | * Use arch dependent cpu power functions |
49 | */ | 44 | */ |
50 | SCHED_FEAT(ARCH_POWER, 0) | 45 | SCHED_FEAT(ARCH_POWER, false) |
51 | 46 | ||
52 | SCHED_FEAT(HRTICK, 0) | 47 | SCHED_FEAT(HRTICK, false) |
53 | SCHED_FEAT(DOUBLE_TICK, 0) | 48 | SCHED_FEAT(DOUBLE_TICK, false) |
54 | SCHED_FEAT(LB_BIAS, 1) | 49 | SCHED_FEAT(LB_BIAS, true) |
55 | 50 | ||
56 | /* | 51 | /* |
57 | * Spin-wait on mutex acquisition when the mutex owner is running on | 52 | * Spin-wait on mutex acquisition when the mutex owner is running on |
58 | * another cpu -- assumes that when the owner is running, it will soon | 53 | * another cpu -- assumes that when the owner is running, it will soon |
59 | * release the lock. Decreases scheduling overhead. | 54 | * release the lock. Decreases scheduling overhead. |
60 | */ | 55 | */ |
61 | SCHED_FEAT(OWNER_SPIN, 1) | 56 | SCHED_FEAT(OWNER_SPIN, true) |
62 | 57 | ||
63 | /* | 58 | /* |
64 | * Decrement CPU power based on time not spent running tasks | 59 | * Decrement CPU power based on time not spent running tasks |
65 | */ | 60 | */ |
66 | SCHED_FEAT(NONTASK_POWER, 1) | 61 | SCHED_FEAT(NONTASK_POWER, true) |
67 | 62 | ||
68 | /* | 63 | /* |
69 | * Queue remote wakeups on the target CPU and process them | 64 | * Queue remote wakeups on the target CPU and process them |
70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | 65 | * using the scheduler IPI. Reduces rq->lock contention/bounces. |
71 | */ | 66 | */ |
72 | SCHED_FEAT(TTWU_QUEUE, 1) | 67 | SCHED_FEAT(TTWU_QUEUE, true) |
73 | 68 | ||
74 | SCHED_FEAT(FORCE_SD_OVERLAP, 0) | 69 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
70 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | ||
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c index 0a51882534ea..91b4c957f289 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched/idle_task.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #include "sched.h" | ||
2 | |||
1 | /* | 3 | /* |
2 | * idle-task scheduling class. | 4 | * idle-task scheduling class. |
3 | * | 5 | * |
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task | |||
71 | /* | 73 | /* |
72 | * Simple, special scheduling class for the per-CPU idle tasks: | 74 | * Simple, special scheduling class for the per-CPU idle tasks: |
73 | */ | 75 | */ |
74 | static const struct sched_class idle_sched_class = { | 76 | const struct sched_class idle_sched_class = { |
75 | /* .next is NULL */ | 77 | /* .next is NULL */ |
76 | /* no enqueue/yield_task for idle tasks */ | 78 | /* no enqueue/yield_task for idle tasks */ |
77 | 79 | ||
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c index af1177858be3..3640ebbb466b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched/rt.c | |||
@@ -3,7 +3,92 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #include "sched.h" | ||
7 | |||
8 | #include <linux/slab.h> | ||
9 | |||
10 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | ||
11 | |||
12 | struct rt_bandwidth def_rt_bandwidth; | ||
13 | |||
14 | static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) | ||
15 | { | ||
16 | struct rt_bandwidth *rt_b = | ||
17 | container_of(timer, struct rt_bandwidth, rt_period_timer); | ||
18 | ktime_t now; | ||
19 | int overrun; | ||
20 | int idle = 0; | ||
21 | |||
22 | for (;;) { | ||
23 | now = hrtimer_cb_get_time(timer); | ||
24 | overrun = hrtimer_forward(timer, now, rt_b->rt_period); | ||
25 | |||
26 | if (!overrun) | ||
27 | break; | ||
28 | |||
29 | idle = do_sched_rt_period_timer(rt_b, overrun); | ||
30 | } | ||
31 | |||
32 | return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; | ||
33 | } | ||
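sched_rt_period_timer() above follows the standard periodic-hrtimer pattern: hrtimer_forward() pushes the expiry ahead by whole periods and returns how many periods were skipped, so each missed period is accounted before the timer is re-armed. A stand-alone sketch of that overrun loop with plain integers instead of ktime:

#include <stdio.h>

/* like hrtimer_forward(): advance *expiry past now, count the periods */
static unsigned long forward(unsigned long *expiry, unsigned long now,
			     unsigned long period)
{
	unsigned long overrun = 0;

	while (*expiry <= now) {
		*expiry += period;
		overrun++;
	}
	return overrun;
}

int main(void)
{
	unsigned long expiry = 100, period = 100, now = 350;
	unsigned long overrun;

	while ((overrun = forward(&expiry, now, period)))
		printf("accounting %lu missed period(s)\n", overrun);	/* 3 */

	printf("next expiry: %lu\n", expiry);				/* 400 */
	return 0;
}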
34 | |||
35 | void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | ||
36 | { | ||
37 | rt_b->rt_period = ns_to_ktime(period); | ||
38 | rt_b->rt_runtime = runtime; | ||
39 | |||
40 | raw_spin_lock_init(&rt_b->rt_runtime_lock); | ||
41 | |||
42 | hrtimer_init(&rt_b->rt_period_timer, | ||
43 | CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
44 | rt_b->rt_period_timer.function = sched_rt_period_timer; | ||
45 | } | ||
46 | |||
47 | static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
48 | { | ||
49 | if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) | ||
50 | return; | ||
51 | |||
52 | if (hrtimer_active(&rt_b->rt_period_timer)) | ||
53 | return; | ||
54 | |||
55 | raw_spin_lock(&rt_b->rt_runtime_lock); | ||
56 | start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); | ||
57 | raw_spin_unlock(&rt_b->rt_runtime_lock); | ||
58 | } | ||
59 | |||
60 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
61 | { | ||
62 | struct rt_prio_array *array; | ||
63 | int i; | ||
64 | |||
65 | array = &rt_rq->active; | ||
66 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
67 | INIT_LIST_HEAD(array->queue + i); | ||
68 | __clear_bit(i, array->bitmap); | ||
69 | } | ||
70 | /* delimiter for bitsearch: */ | ||
71 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
72 | |||
73 | #if defined CONFIG_SMP | ||
74 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
75 | rt_rq->highest_prio.next = MAX_RT_PRIO; | ||
76 | rt_rq->rt_nr_migratory = 0; | ||
77 | rt_rq->overloaded = 0; | ||
78 | plist_head_init(&rt_rq->pushable_tasks); | ||
79 | #endif | ||
80 | |||
81 | rt_rq->rt_time = 0; | ||
82 | rt_rq->rt_throttled = 0; | ||
83 | rt_rq->rt_runtime = 0; | ||
84 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | ||
85 | } | ||
86 | |||
6 | #ifdef CONFIG_RT_GROUP_SCHED | 87 | #ifdef CONFIG_RT_GROUP_SCHED |
88 | static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | ||
89 | { | ||
90 | hrtimer_cancel(&rt_b->rt_period_timer); | ||
91 | } | ||
7 | 92 | ||
8 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) | 93 | #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) |
9 | 94 | ||
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
25 | return rt_se->rt_rq; | 110 | return rt_se->rt_rq; |
26 | } | 111 | } |
27 | 112 | ||
113 | void free_rt_sched_group(struct task_group *tg) | ||
114 | { | ||
115 | int i; | ||
116 | |||
117 | if (tg->rt_se) | ||
118 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
119 | |||
120 | for_each_possible_cpu(i) { | ||
121 | if (tg->rt_rq) | ||
122 | kfree(tg->rt_rq[i]); | ||
123 | if (tg->rt_se) | ||
124 | kfree(tg->rt_se[i]); | ||
125 | } | ||
126 | |||
127 | kfree(tg->rt_rq); | ||
128 | kfree(tg->rt_se); | ||
129 | } | ||
130 | |||
131 | void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
132 | struct sched_rt_entity *rt_se, int cpu, | ||
133 | struct sched_rt_entity *parent) | ||
134 | { | ||
135 | struct rq *rq = cpu_rq(cpu); | ||
136 | |||
137 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | ||
138 | rt_rq->rt_nr_boosted = 0; | ||
139 | rt_rq->rq = rq; | ||
140 | rt_rq->tg = tg; | ||
141 | |||
142 | tg->rt_rq[cpu] = rt_rq; | ||
143 | tg->rt_se[cpu] = rt_se; | ||
144 | |||
145 | if (!rt_se) | ||
146 | return; | ||
147 | |||
148 | if (!parent) | ||
149 | rt_se->rt_rq = &rq->rt; | ||
150 | else | ||
151 | rt_se->rt_rq = parent->my_q; | ||
152 | |||
153 | rt_se->my_q = rt_rq; | ||
154 | rt_se->parent = parent; | ||
155 | INIT_LIST_HEAD(&rt_se->run_list); | ||
156 | } | ||
157 | |||
158 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
159 | { | ||
160 | struct rt_rq *rt_rq; | ||
161 | struct sched_rt_entity *rt_se; | ||
162 | int i; | ||
163 | |||
164 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | ||
165 | if (!tg->rt_rq) | ||
166 | goto err; | ||
167 | tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); | ||
168 | if (!tg->rt_se) | ||
169 | goto err; | ||
170 | |||
171 | init_rt_bandwidth(&tg->rt_bandwidth, | ||
172 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | ||
173 | |||
174 | for_each_possible_cpu(i) { | ||
175 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | ||
176 | GFP_KERNEL, cpu_to_node(i)); | ||
177 | if (!rt_rq) | ||
178 | goto err; | ||
179 | |||
180 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | ||
181 | GFP_KERNEL, cpu_to_node(i)); | ||
182 | if (!rt_se) | ||
183 | goto err_free_rq; | ||
184 | |||
185 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
186 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
187 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | ||
188 | } | ||
189 | |||
190 | return 1; | ||
191 | |||
192 | err_free_rq: | ||
193 | kfree(rt_rq); | ||
194 | err: | ||
195 | return 0; | ||
196 | } | ||
197 | |||
28 | #else /* CONFIG_RT_GROUP_SCHED */ | 198 | #else /* CONFIG_RT_GROUP_SCHED */ |
29 | 199 | ||
30 | #define rt_entity_is_task(rt_se) (1) | 200 | #define rt_entity_is_task(rt_se) (1) |
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | |||
47 | return &rq->rt; | 217 | return &rq->rt; |
48 | } | 218 | } |
49 | 219 | ||
220 | void free_rt_sched_group(struct task_group *tg) { } | ||
221 | |||
222 | int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | ||
223 | { | ||
224 | return 1; | ||
225 | } | ||
50 | #endif /* CONFIG_RT_GROUP_SCHED */ | 226 | #endif /* CONFIG_RT_GROUP_SCHED */ |
51 | 227 | ||
52 | #ifdef CONFIG_SMP | 228 | #ifdef CONFIG_SMP |
@@ -124,21 +300,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
124 | update_rt_migration(rt_rq); | 300 | update_rt_migration(rt_rq); |
125 | } | 301 | } |
126 | 302 | ||
303 | static inline int has_pushable_tasks(struct rq *rq) | ||
304 | { | ||
305 | return !plist_head_empty(&rq->rt.pushable_tasks); | ||
306 | } | ||
307 | |||
127 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) | 308 | static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) |
128 | { | 309 | { |
129 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 310 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
130 | plist_node_init(&p->pushable_tasks, p->prio); | 311 | plist_node_init(&p->pushable_tasks, p->prio); |
131 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); | 312 | plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); |
313 | |||
314 | /* Update the highest prio pushable task */ | ||
315 | if (p->prio < rq->rt.highest_prio.next) | ||
316 | rq->rt.highest_prio.next = p->prio; | ||
132 | } | 317 | } |
133 | 318 | ||
134 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) | 319 | static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) |
135 | { | 320 | { |
136 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); | 321 | plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); |
137 | } | ||
138 | 322 | ||
139 | static inline int has_pushable_tasks(struct rq *rq) | 323 | /* Update the new highest prio pushable task */ |
140 | { | 324 | if (has_pushable_tasks(rq)) { |
141 | return !plist_head_empty(&rq->rt.pushable_tasks); | 325 | p = plist_first_entry(&rq->rt.pushable_tasks, |
326 | struct task_struct, pushable_tasks); | ||
327 | rq->rt.highest_prio.next = p->prio; | ||
328 | } else | ||
329 | rq->rt.highest_prio.next = MAX_RT_PRIO; | ||
142 | } | 330 | } |
143 | 331 | ||
144 | #else | 332 | #else |
@@ -544,10 +732,35 @@ static void enable_runtime(struct rq *rq) | |||
544 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 732 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
545 | } | 733 | } |
546 | 734 | ||
735 | int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
736 | { | ||
737 | int cpu = (int)(long)hcpu; | ||
738 | |||
739 | switch (action) { | ||
740 | case CPU_DOWN_PREPARE: | ||
741 | case CPU_DOWN_PREPARE_FROZEN: | ||
742 | disable_runtime(cpu_rq(cpu)); | ||
743 | return NOTIFY_OK; | ||
744 | |||
745 | case CPU_DOWN_FAILED: | ||
746 | case CPU_DOWN_FAILED_FROZEN: | ||
747 | case CPU_ONLINE: | ||
748 | case CPU_ONLINE_FROZEN: | ||
749 | enable_runtime(cpu_rq(cpu)); | ||
750 | return NOTIFY_OK; | ||
751 | |||
752 | default: | ||
753 | return NOTIFY_DONE; | ||
754 | } | ||
755 | } | ||
756 | |||
547 | static int balance_runtime(struct rt_rq *rt_rq) | 757 | static int balance_runtime(struct rt_rq *rt_rq) |
548 | { | 758 | { |
549 | int more = 0; | 759 | int more = 0; |
550 | 760 | ||
761 | if (!sched_feat(RT_RUNTIME_SHARE)) | ||
762 | return more; | ||
763 | |||
551 | if (rt_rq->rt_time > rt_rq->rt_runtime) { | 764 | if (rt_rq->rt_time > rt_rq->rt_runtime) { |
552 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 765 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
553 | more = do_balance_runtime(rt_rq); | 766 | more = do_balance_runtime(rt_rq); |
@@ -633,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
633 | if (rt_rq->rt_throttled) | 846 | if (rt_rq->rt_throttled) |
634 | return rt_rq_throttled(rt_rq); | 847 | return rt_rq_throttled(rt_rq); |
635 | 848 | ||
636 | if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) | 849 | if (runtime >= sched_rt_period(rt_rq)) |
637 | return 0; | 850 | return 0; |
638 | 851 | ||
639 | balance_runtime(rt_rq); | 852 | balance_runtime(rt_rq); |
@@ -643,6 +856,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) | |||
643 | 856 | ||
644 | if (rt_rq->rt_time > runtime) { | 857 | if (rt_rq->rt_time > runtime) { |
645 | rt_rq->rt_throttled = 1; | 858 | rt_rq->rt_throttled = 1; |
859 | printk_once(KERN_WARNING "sched: RT throttling activated\n"); | ||
646 | if (rt_rq_throttled(rt_rq)) { | 860 | if (rt_rq_throttled(rt_rq)) { |
647 | sched_rt_rq_dequeue(rt_rq); | 861 | sched_rt_rq_dequeue(rt_rq); |
648 | return 1; | 862 | return 1; |
@@ -698,47 +912,13 @@ static void update_curr_rt(struct rq *rq) | |||
698 | 912 | ||
699 | #if defined CONFIG_SMP | 913 | #if defined CONFIG_SMP |
700 | 914 | ||
701 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); | ||
702 | |||
703 | static inline int next_prio(struct rq *rq) | ||
704 | { | ||
705 | struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); | ||
706 | |||
707 | if (next && rt_prio(next->prio)) | ||
708 | return next->prio; | ||
709 | else | ||
710 | return MAX_RT_PRIO; | ||
711 | } | ||
712 | |||
713 | static void | 915 | static void |
714 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | 916 | inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) |
715 | { | 917 | { |
716 | struct rq *rq = rq_of_rt_rq(rt_rq); | 918 | struct rq *rq = rq_of_rt_rq(rt_rq); |
717 | 919 | ||
718 | if (prio < prev_prio) { | 920 | if (rq->online && prio < prev_prio) |
719 | 921 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | |
720 | /* | ||
721 | * If the new task is higher in priority than anything on the | ||
722 | * run-queue, we know that the previous high becomes our | ||
723 | * next-highest. | ||
724 | */ | ||
725 | rt_rq->highest_prio.next = prev_prio; | ||
726 | |||
727 | if (rq->online) | ||
728 | cpupri_set(&rq->rd->cpupri, rq->cpu, prio); | ||
729 | |||
730 | } else if (prio == rt_rq->highest_prio.curr) | ||
731 | /* | ||
732 | * If the next task is equal in priority to the highest on | ||
733 | * the run-queue, then we implicitly know that the next highest | ||
734 | * task cannot be any lower than current | ||
735 | */ | ||
736 | rt_rq->highest_prio.next = prio; | ||
737 | else if (prio < rt_rq->highest_prio.next) | ||
738 | /* | ||
739 | * Otherwise, we need to recompute next-highest | ||
740 | */ | ||
741 | rt_rq->highest_prio.next = next_prio(rq); | ||
742 | } | 922 | } |
743 | 923 | ||
744 | static void | 924 | static void |
@@ -746,9 +926,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) | |||
746 | { | 926 | { |
747 | struct rq *rq = rq_of_rt_rq(rt_rq); | 927 | struct rq *rq = rq_of_rt_rq(rt_rq); |
748 | 928 | ||
749 | if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) | ||
750 | rt_rq->highest_prio.next = next_prio(rq); | ||
751 | |||
752 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) | 929 | if (rq->online && rt_rq->highest_prio.curr != prev_prio) |
753 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); | 930 | cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); |
754 | } | 931 | } |
@@ -961,6 +1138,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
961 | 1138 | ||
962 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1139 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) |
963 | enqueue_pushable_task(rq, p); | 1140 | enqueue_pushable_task(rq, p); |
1141 | |||
1142 | inc_nr_running(rq); | ||
964 | } | 1143 | } |
965 | 1144 | ||
966 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | 1145 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) |
@@ -971,11 +1150,13 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
971 | dequeue_rt_entity(rt_se); | 1150 | dequeue_rt_entity(rt_se); |
972 | 1151 | ||
973 | dequeue_pushable_task(rq, p); | 1152 | dequeue_pushable_task(rq, p); |
1153 | |||
1154 | dec_nr_running(rq); | ||
974 | } | 1155 | } |
975 | 1156 | ||
976 | /* | 1157 | /* |
977 | * Put task to the end of the run list without the overhead of dequeue | 1158 | * Put task to the head or the end of the run list without the overhead of |
978 | * followed by enqueue. | 1159 | * dequeue followed by enqueue. |
979 | */ | 1160 | */ |
980 | static void | 1161 | static void |
981 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) | 1162 | requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) |
@@ -1017,10 +1198,15 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1017 | struct rq *rq; | 1198 | struct rq *rq; |
1018 | int cpu; | 1199 | int cpu; |
1019 | 1200 | ||
1020 | if (sd_flag != SD_BALANCE_WAKE) | ||
1021 | return smp_processor_id(); | ||
1022 | |||
1023 | cpu = task_cpu(p); | 1201 | cpu = task_cpu(p); |
1202 | |||
1203 | if (p->rt.nr_cpus_allowed == 1) | ||
1204 | goto out; | ||
1205 | |||
1206 | /* For anything but wake ups, just return the task_cpu */ | ||
1207 | if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) | ||
1208 | goto out; | ||
1209 | |||
1024 | rq = cpu_rq(cpu); | 1210 | rq = cpu_rq(cpu); |
1025 | 1211 | ||
1026 | rcu_read_lock(); | 1212 | rcu_read_lock(); |
@@ -1059,6 +1245,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
1059 | } | 1245 | } |
1060 | rcu_read_unlock(); | 1246 | rcu_read_unlock(); |
1061 | 1247 | ||
1248 | out: | ||
1062 | return cpu; | 1249 | return cpu; |
1063 | } | 1250 | } |
1064 | 1251 | ||
@@ -1178,7 +1365,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) | |||
1178 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 1365 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
1179 | { | 1366 | { |
1180 | update_curr_rt(rq); | 1367 | update_curr_rt(rq); |
1181 | p->se.exec_start = 0; | ||
1182 | 1368 | ||
1183 | /* | 1369 | /* |
1184 | * The previous task needs to be made eligible for pushing | 1370 | * The previous task needs to be made eligible for pushing |
@@ -1193,12 +1379,10 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1193 | /* Only try algorithms three times */ | 1379 | /* Only try algorithms three times */ |
1194 | #define RT_MAX_TRIES 3 | 1380 | #define RT_MAX_TRIES 3 |
1195 | 1381 | ||
1196 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); | ||
1197 | |||
1198 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1382 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
1199 | { | 1383 | { |
1200 | if (!task_running(rq, p) && | 1384 | if (!task_running(rq, p) && |
1201 | (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && | 1385 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
1202 | (p->rt.nr_cpus_allowed > 1)) | 1386 | (p->rt.nr_cpus_allowed > 1)) |
1203 | return 1; | 1387 | return 1; |
1204 | return 0; | 1388 | return 0; |
@@ -1343,7 +1527,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1343 | */ | 1527 | */ |
1344 | if (unlikely(task_rq(task) != rq || | 1528 | if (unlikely(task_rq(task) != rq || |
1345 | !cpumask_test_cpu(lowest_rq->cpu, | 1529 | !cpumask_test_cpu(lowest_rq->cpu, |
1346 | &task->cpus_allowed) || | 1530 | tsk_cpus_allowed(task)) || |
1347 | task_running(rq, task) || | 1531 | task_running(rq, task) || |
1348 | !task->on_rq)) { | 1532 | !task->on_rq)) { |
1349 | 1533 | ||
@@ -1394,6 +1578,7 @@ static int push_rt_task(struct rq *rq) | |||
1394 | { | 1578 | { |
1395 | struct task_struct *next_task; | 1579 | struct task_struct *next_task; |
1396 | struct rq *lowest_rq; | 1580 | struct rq *lowest_rq; |
1581 | int ret = 0; | ||
1397 | 1582 | ||
1398 | if (!rq->rt.overloaded) | 1583 | if (!rq->rt.overloaded) |
1399 | return 0; | 1584 | return 0; |
@@ -1426,7 +1611,7 @@ retry: | |||
1426 | if (!lowest_rq) { | 1611 | if (!lowest_rq) { |
1427 | struct task_struct *task; | 1612 | struct task_struct *task; |
1428 | /* | 1613 | /* |
1429 | * find lock_lowest_rq releases rq->lock | 1614 | * find_lock_lowest_rq releases rq->lock |
1430 | * so it is possible that next_task has migrated. | 1615 | * so it is possible that next_task has migrated. |
1431 | * | 1616 | * |
1432 | * We need to make sure that the task is still on the same | 1617 | * We need to make sure that the task is still on the same |
@@ -1436,12 +1621,11 @@ retry: | |||
1436 | task = pick_next_pushable_task(rq); | 1621 | task = pick_next_pushable_task(rq); |
1437 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1622 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
1438 | /* | 1623 | /* |
1439 | * If we get here, the task hasn't moved at all, but | 1624 | * The task hasn't migrated, and is still the next |
1440 | * it has failed to push. We will not try again, | 1625 | * eligible task, but we failed to find a run-queue |
1441 | * since the other cpus will pull from us when they | 1626 | * to push it to. Do not retry in this case, since |
1442 | * are ready. | 1627 | * other cpus will pull from us when ready. |
1443 | */ | 1628 | */ |
1444 | dequeue_pushable_task(rq, next_task); | ||
1445 | goto out; | 1629 | goto out; |
1446 | } | 1630 | } |
1447 | 1631 | ||
@@ -1460,6 +1644,7 @@ retry: | |||
1460 | deactivate_task(rq, next_task, 0); | 1644 | deactivate_task(rq, next_task, 0); |
1461 | set_task_cpu(next_task, lowest_rq->cpu); | 1645 | set_task_cpu(next_task, lowest_rq->cpu); |
1462 | activate_task(lowest_rq, next_task, 0); | 1646 | activate_task(lowest_rq, next_task, 0); |
1647 | ret = 1; | ||
1463 | 1648 | ||
1464 | resched_task(lowest_rq->curr); | 1649 | resched_task(lowest_rq->curr); |
1465 | 1650 | ||
@@ -1468,7 +1653,7 @@ retry: | |||
1468 | out: | 1653 | out: |
1469 | put_task_struct(next_task); | 1654 | put_task_struct(next_task); |
1470 | 1655 | ||
1471 | return 1; | 1656 | return ret; |
1472 | } | 1657 | } |
1473 | 1658 | ||
1474 | static void push_rt_tasks(struct rq *rq) | 1659 | static void push_rt_tasks(struct rq *rq) |
@@ -1626,9 +1811,6 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1626 | 1811 | ||
1627 | update_rt_migration(&rq->rt); | 1812 | update_rt_migration(&rq->rt); |
1628 | } | 1813 | } |
1629 | |||
1630 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
1631 | p->rt.nr_cpus_allowed = weight; | ||
1632 | } | 1814 | } |
1633 | 1815 | ||
1634 | /* Assumes rq->lock is held */ | 1816 | /* Assumes rq->lock is held */ |
@@ -1670,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1670 | pull_rt_task(rq); | 1852 | pull_rt_task(rq); |
1671 | } | 1853 | } |
1672 | 1854 | ||
1673 | static inline void init_sched_rt_class(void) | 1855 | void init_sched_rt_class(void) |
1674 | { | 1856 | { |
1675 | unsigned int i; | 1857 | unsigned int i; |
1676 | 1858 | ||
1677 | for_each_possible_cpu(i) | 1859 | for_each_possible_cpu(i) { |
1678 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), | 1860 | zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), |
1679 | GFP_KERNEL, cpu_to_node(i)); | 1861 | GFP_KERNEL, cpu_to_node(i)); |
1862 | } | ||
1680 | } | 1863 | } |
1681 | #endif /* CONFIG_SMP */ | 1864 | #endif /* CONFIG_SMP */ |
1682 | 1865 | ||
@@ -1817,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) | |||
1817 | return 0; | 2000 | return 0; |
1818 | } | 2001 | } |
1819 | 2002 | ||
1820 | static const struct sched_class rt_sched_class = { | 2003 | const struct sched_class rt_sched_class = { |
1821 | .next = &fair_sched_class, | 2004 | .next = &fair_sched_class, |
1822 | .enqueue_task = enqueue_task_rt, | 2005 | .enqueue_task = enqueue_task_rt, |
1823 | .dequeue_task = dequeue_task_rt, | 2006 | .dequeue_task = dequeue_task_rt, |
@@ -1852,7 +2035,7 @@ static const struct sched_class rt_sched_class = { | |||
1852 | #ifdef CONFIG_SCHED_DEBUG | 2035 | #ifdef CONFIG_SCHED_DEBUG |
1853 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | 2036 | extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); |
1854 | 2037 | ||
1855 | static void print_rt_stats(struct seq_file *m, int cpu) | 2038 | void print_rt_stats(struct seq_file *m, int cpu) |
1856 | { | 2039 | { |
1857 | rt_rq_iter_t iter; | 2040 | rt_rq_iter_t iter; |
1858 | struct rt_rq *rt_rq; | 2041 | struct rt_rq *rt_rq; |
@@ -1863,4 +2046,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) | |||
1863 | rcu_read_unlock(); | 2046 | rcu_read_unlock(); |
1864 | } | 2047 | } |
1865 | #endif /* CONFIG_SCHED_DEBUG */ | 2048 | #endif /* CONFIG_SCHED_DEBUG */ |
1866 | |||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 000000000000..98c0c2623db8 --- /dev/null +++ b/kernel/sched/sched.h | |||
@@ -0,0 +1,1166 @@ | |||
1 | |||
2 | #include <linux/sched.h> | ||
3 | #include <linux/mutex.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/stop_machine.h> | ||
6 | |||
7 | #include "cpupri.h" | ||
8 | |||
9 | extern __read_mostly int scheduler_running; | ||
10 | |||
11 | /* | ||
12 | * Convert user-nice values [ -20 ... 0 ... 19 ] | ||
13 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | ||
14 | * and back. | ||
15 | */ | ||
16 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | ||
17 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | ||
18 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | ||
19 | |||
20 | /* | ||
21 | * 'User priority' is the nice value converted to something we | ||
22 | * can work with better when scaling various scheduler parameters, | ||
23 | * it's a [ 0 ... 39 ] range. | ||
24 | */ | ||
25 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | ||
26 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | ||
27 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | ||
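A quick worked example of these mappings, assuming the usual MAX_RT_PRIO == 100 and MAX_PRIO == 140 from include/linux/sched.h:

#include <stdio.h>

#define MAX_RT_PRIO 100			/* assumed value */
#define MAX_PRIO    (MAX_RT_PRIO + 40)

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)

int main(void)
{
	printf("nice -20 -> prio %d\n", NICE_TO_PRIO(-20));	/* 100 */
	printf("nice   0 -> prio %d\n", NICE_TO_PRIO(0));	/* 120 */
	printf("nice  19 -> prio %d\n", NICE_TO_PRIO(19));	/* 139 */
	printf("prio 139 -> nice %d\n", PRIO_TO_NICE(139));	/* 19 */
	printf("prio 120 -> user prio %d\n", USER_PRIO(120));	/* 20, in 0..39 */
	return 0;
}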
28 | |||
29 | /* | ||
30 | * Helpers for converting nanosecond timing to jiffy resolution | ||
31 | */ | ||
32 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) | ||
33 | |||
34 | #define NICE_0_LOAD SCHED_LOAD_SCALE | ||
35 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | ||
36 | |||
37 | /* | ||
38 | * These are the 'tuning knobs' of the scheduler: | ||
39 | * | ||
40 | * default timeslice is 100 msecs (used only for SCHED_RR tasks). | ||
41 | * Timeslices get refilled after they expire. | ||
42 | */ | ||
43 | #define DEF_TIMESLICE (100 * HZ / 1000) | ||
44 | |||
45 | /* | ||
46 | * single value that denotes runtime == period, ie unlimited time. | ||
47 | */ | ||
48 | #define RUNTIME_INF ((u64)~0ULL) | ||
49 | |||
50 | static inline int rt_policy(int policy) | ||
51 | { | ||
52 | if (policy == SCHED_FIFO || policy == SCHED_RR) | ||
53 | return 1; | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static inline int task_has_rt_policy(struct task_struct *p) | ||
58 | { | ||
59 | return rt_policy(p->policy); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * This is the priority-queue data structure of the RT scheduling class: | ||
64 | */ | ||
65 | struct rt_prio_array { | ||
66 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | ||
67 | struct list_head queue[MAX_RT_PRIO]; | ||
68 | }; | ||
69 | |||
70 | struct rt_bandwidth { | ||
71 | /* nests inside the rq lock: */ | ||
72 | raw_spinlock_t rt_runtime_lock; | ||
73 | ktime_t rt_period; | ||
74 | u64 rt_runtime; | ||
75 | struct hrtimer rt_period_timer; | ||
76 | }; | ||
77 | |||
78 | extern struct mutex sched_domains_mutex; | ||
79 | |||
80 | #ifdef CONFIG_CGROUP_SCHED | ||
81 | |||
82 | #include <linux/cgroup.h> | ||
83 | |||
84 | struct cfs_rq; | ||
85 | struct rt_rq; | ||
86 | |||
87 | static LIST_HEAD(task_groups); | ||
88 | |||
89 | struct cfs_bandwidth { | ||
90 | #ifdef CONFIG_CFS_BANDWIDTH | ||
91 | raw_spinlock_t lock; | ||
92 | ktime_t period; | ||
93 | u64 quota, runtime; | ||
94 | s64 hierarchal_quota; | ||
95 | u64 runtime_expires; | ||
96 | |||
97 | int idle, timer_active; | ||
98 | struct hrtimer period_timer, slack_timer; | ||
99 | struct list_head throttled_cfs_rq; | ||
100 | |||
101 | /* statistics */ | ||
102 | int nr_periods, nr_throttled; | ||
103 | u64 throttled_time; | ||
104 | #endif | ||
105 | }; | ||
106 | |||
107 | /* task group related information */ | ||
108 | struct task_group { | ||
109 | struct cgroup_subsys_state css; | ||
110 | |||
111 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
112 | /* schedulable entities of this group on each cpu */ | ||
113 | struct sched_entity **se; | ||
114 | /* runqueue "owned" by this group on each cpu */ | ||
115 | struct cfs_rq **cfs_rq; | ||
116 | unsigned long shares; | ||
117 | |||
118 | atomic_t load_weight; | ||
119 | #endif | ||
120 | |||
121 | #ifdef CONFIG_RT_GROUP_SCHED | ||
122 | struct sched_rt_entity **rt_se; | ||
123 | struct rt_rq **rt_rq; | ||
124 | |||
125 | struct rt_bandwidth rt_bandwidth; | ||
126 | #endif | ||
127 | |||
128 | struct rcu_head rcu; | ||
129 | struct list_head list; | ||
130 | |||
131 | struct task_group *parent; | ||
132 | struct list_head siblings; | ||
133 | struct list_head children; | ||
134 | |||
135 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
136 | struct autogroup *autogroup; | ||
137 | #endif | ||
138 | |||
139 | struct cfs_bandwidth cfs_bandwidth; | ||
140 | }; | ||
141 | |||
142 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
143 | #define ROOT_TASK_GROUP_LOAD NICE_0_LOAD | ||
144 | |||
145 | /* | ||
146 | * A weight of 0 or 1 can cause arithmetic problems. | ||
147 | * The weight of a cfs_rq is the sum of the weights of the entities | ||
148 | * queued on it, so the weight of an entity should not be too large, | ||
149 | * and neither should the shares value of a task group. | ||
150 | * (The default weight is 1024 - so there's no practical | ||
151 | * limitation from this.) | ||
152 | */ | ||
153 | #define MIN_SHARES (1UL << 1) | ||
154 | #define MAX_SHARES (1UL << 18) | ||
155 | #endif | ||
156 | |||
157 | /* Default task group. | ||
158 | * Every task in the system belongs to this group at bootup. | ||
159 | */ | ||
160 | extern struct task_group root_task_group; | ||
161 | |||
162 | typedef int (*tg_visitor)(struct task_group *, void *); | ||
163 | |||
164 | extern int walk_tg_tree_from(struct task_group *from, | ||
165 | tg_visitor down, tg_visitor up, void *data); | ||
166 | |||
167 | /* | ||
168 | * Iterate the full tree, calling @down when first entering a node and @up when | ||
169 | * leaving it for the final time. | ||
170 | * | ||
171 | * Caller must hold rcu_lock or sufficient equivalent. | ||
172 | */ | ||
173 | static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) | ||
174 | { | ||
175 | return walk_tg_tree_from(&root_task_group, down, up, data); | ||
176 | } | ||
177 | |||
178 | extern int tg_nop(struct task_group *tg, void *data); | ||
179 | |||
180 | extern void free_fair_sched_group(struct task_group *tg); | ||
181 | extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); | ||
182 | extern void unregister_fair_sched_group(struct task_group *tg, int cpu); | ||
183 | extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | ||
184 | struct sched_entity *se, int cpu, | ||
185 | struct sched_entity *parent); | ||
186 | extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
187 | extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); | ||
188 | |||
189 | extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); | ||
190 | extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); | ||
191 | extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); | ||
192 | |||
193 | extern void free_rt_sched_group(struct task_group *tg); | ||
194 | extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); | ||
195 | extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | ||
196 | struct sched_rt_entity *rt_se, int cpu, | ||
197 | struct sched_rt_entity *parent); | ||
198 | |||
199 | #else /* CONFIG_CGROUP_SCHED */ | ||
200 | |||
201 | struct cfs_bandwidth { }; | ||
202 | |||
203 | #endif /* CONFIG_CGROUP_SCHED */ | ||
204 | |||
205 | /* CFS-related fields in a runqueue */ | ||
206 | struct cfs_rq { | ||
207 | struct load_weight load; | ||
208 | unsigned long nr_running, h_nr_running; | ||
209 | |||
210 | u64 exec_clock; | ||
211 | u64 min_vruntime; | ||
212 | #ifndef CONFIG_64BIT | ||
213 | u64 min_vruntime_copy; | ||
214 | #endif | ||
215 | |||
216 | struct rb_root tasks_timeline; | ||
217 | struct rb_node *rb_leftmost; | ||
218 | |||
219 | struct list_head tasks; | ||
220 | struct list_head *balance_iterator; | ||
221 | |||
222 | /* | ||
223 | * 'curr' points to the currently running entity on this cfs_rq. | ||
224 | * It is set to NULL otherwise (i.e. when none is currently running). | ||
225 | */ | ||
226 | struct sched_entity *curr, *next, *last, *skip; | ||
227 | |||
228 | #ifdef CONFIG_SCHED_DEBUG | ||
229 | unsigned int nr_spread_over; | ||
230 | #endif | ||
231 | |||
232 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
233 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | ||
234 | |||
235 | /* | ||
236 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
237 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | ||
238 | * (like users, containers etc.) | ||
239 | * | ||
240 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | ||
241 | * list is used during load balance. | ||
242 | */ | ||
243 | int on_list; | ||
244 | struct list_head leaf_cfs_rq_list; | ||
245 | struct task_group *tg; /* group that "owns" this runqueue */ | ||
246 | |||
247 | #ifdef CONFIG_SMP | ||
248 | /* | ||
249 | * the part of load.weight contributed by tasks | ||
250 | */ | ||
251 | unsigned long task_weight; | ||
252 | |||
253 | /* | ||
254 | * h_load = weight * f(tg) | ||
255 | * | ||
256 | * Where f(tg) is the recursive weight fraction assigned to | ||
257 | * this group. | ||
258 | */ | ||
259 | unsigned long h_load; | ||
260 | |||
261 | /* | ||
262 | * Maintaining per-cpu shares distribution for group scheduling | ||
263 | * | ||
264 | * load_stamp is the last time we updated the load average | ||
265 | * load_last is the last time we updated the load average and saw load | ||
266 | * load_unacc_exec_time is currently unaccounted execution time | ||
267 | */ | ||
268 | u64 load_avg; | ||
269 | u64 load_period; | ||
270 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
271 | |||
272 | unsigned long load_contribution; | ||
273 | #endif /* CONFIG_SMP */ | ||
274 | #ifdef CONFIG_CFS_BANDWIDTH | ||
275 | int runtime_enabled; | ||
276 | u64 runtime_expires; | ||
277 | s64 runtime_remaining; | ||
278 | |||
279 | u64 throttled_timestamp; | ||
280 | int throttled, throttle_count; | ||
281 | struct list_head throttled_list; | ||
282 | #endif /* CONFIG_CFS_BANDWIDTH */ | ||
283 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
284 | }; | ||
285 | |||
286 | static inline int rt_bandwidth_enabled(void) | ||
287 | { | ||
288 | return sysctl_sched_rt_runtime >= 0; | ||
289 | } | ||
290 | |||
291 | /* Real-Time classes' related field in a runqueue: */ | ||
292 | struct rt_rq { | ||
293 | struct rt_prio_array active; | ||
294 | unsigned long rt_nr_running; | ||
295 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | ||
296 | struct { | ||
297 | int curr; /* highest queued rt task prio */ | ||
298 | #ifdef CONFIG_SMP | ||
299 | int next; /* next highest */ | ||
300 | #endif | ||
301 | } highest_prio; | ||
302 | #endif | ||
303 | #ifdef CONFIG_SMP | ||
304 | unsigned long rt_nr_migratory; | ||
305 | unsigned long rt_nr_total; | ||
306 | int overloaded; | ||
307 | struct plist_head pushable_tasks; | ||
308 | #endif | ||
309 | int rt_throttled; | ||
310 | u64 rt_time; | ||
311 | u64 rt_runtime; | ||
312 | /* Nests inside the rq lock: */ | ||
313 | raw_spinlock_t rt_runtime_lock; | ||
314 | |||
315 | #ifdef CONFIG_RT_GROUP_SCHED | ||
316 | unsigned long rt_nr_boosted; | ||
317 | |||
318 | struct rq *rq; | ||
319 | struct list_head leaf_rt_rq_list; | ||
320 | struct task_group *tg; | ||
321 | #endif | ||
322 | }; | ||
323 | |||
324 | #ifdef CONFIG_SMP | ||
325 | |||
326 | /* | ||
327 | * We add the notion of a root-domain which will be used to define per-domain | ||
328 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
329 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
330 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
331 | * object. | ||
332 | * | ||
333 | */ | ||
334 | struct root_domain { | ||
335 | atomic_t refcount; | ||
336 | atomic_t rto_count; | ||
337 | struct rcu_head rcu; | ||
338 | cpumask_var_t span; | ||
339 | cpumask_var_t online; | ||
340 | |||
341 | /* | ||
342 | * The "RT overload" flag: it gets set if a CPU has more than | ||
343 | * one runnable RT task. | ||
344 | */ | ||
345 | cpumask_var_t rto_mask; | ||
346 | struct cpupri cpupri; | ||
347 | }; | ||
348 | |||
349 | extern struct root_domain def_root_domain; | ||
350 | |||
351 | #endif /* CONFIG_SMP */ | ||
352 | |||
353 | /* | ||
354 | * This is the main, per-CPU runqueue data structure. | ||
355 | * | ||
356 | * Locking rule: in places that lock multiple runqueues (such as the | ||
357 | * load balancing or the thread migration code), lock acquire | ||
358 | * operations must be ordered by ascending &runqueue. | ||
359 | */ | ||
360 | struct rq { | ||
361 | /* runqueue lock: */ | ||
362 | raw_spinlock_t lock; | ||
363 | |||
364 | /* | ||
365 | * nr_running and cpu_load should be in the same cacheline because | ||
366 | * remote CPUs use both these fields when doing load calculation. | ||
367 | */ | ||
368 | unsigned long nr_running; | ||
369 | #define CPU_LOAD_IDX_MAX 5 | ||
370 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
371 | unsigned long last_load_update_tick; | ||
372 | #ifdef CONFIG_NO_HZ | ||
373 | u64 nohz_stamp; | ||
374 | unsigned long nohz_flags; | ||
375 | #endif | ||
376 | int skip_clock_update; | ||
377 | |||
378 | /* capture load from *all* tasks on this cpu: */ | ||
379 | struct load_weight load; | ||
380 | unsigned long nr_load_updates; | ||
381 | u64 nr_switches; | ||
382 | |||
383 | struct cfs_rq cfs; | ||
384 | struct rt_rq rt; | ||
385 | |||
386 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
387 | /* list of leaf cfs_rq on this cpu: */ | ||
388 | struct list_head leaf_cfs_rq_list; | ||
389 | #endif | ||
390 | #ifdef CONFIG_RT_GROUP_SCHED | ||
391 | struct list_head leaf_rt_rq_list; | ||
392 | #endif | ||
393 | |||
394 | /* | ||
395 | * This is part of a global counter where only the total sum | ||
396 | * over all CPUs matters. A task can increase this counter on | ||
397 | * one CPU and if it got migrated afterwards it may decrease | ||
398 | * it on another CPU. Always updated under the runqueue lock: | ||
399 | */ | ||
400 | unsigned long nr_uninterruptible; | ||
401 | |||
402 | struct task_struct *curr, *idle, *stop; | ||
403 | unsigned long next_balance; | ||
404 | struct mm_struct *prev_mm; | ||
405 | |||
406 | u64 clock; | ||
407 | u64 clock_task; | ||
408 | |||
409 | atomic_t nr_iowait; | ||
410 | |||
411 | #ifdef CONFIG_SMP | ||
412 | struct root_domain *rd; | ||
413 | struct sched_domain *sd; | ||
414 | |||
415 | unsigned long cpu_power; | ||
416 | |||
417 | unsigned char idle_balance; | ||
418 | /* For active balancing */ | ||
419 | int post_schedule; | ||
420 | int active_balance; | ||
421 | int push_cpu; | ||
422 | struct cpu_stop_work active_balance_work; | ||
423 | /* cpu of this runqueue: */ | ||
424 | int cpu; | ||
425 | int online; | ||
426 | |||
427 | u64 rt_avg; | ||
428 | u64 age_stamp; | ||
429 | u64 idle_stamp; | ||
430 | u64 avg_idle; | ||
431 | #endif | ||
432 | |||
433 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
434 | u64 prev_irq_time; | ||
435 | #endif | ||
436 | #ifdef CONFIG_PARAVIRT | ||
437 | u64 prev_steal_time; | ||
438 | #endif | ||
439 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
440 | u64 prev_steal_time_rq; | ||
441 | #endif | ||
442 | |||
443 | /* calc_load related fields */ | ||
444 | unsigned long calc_load_update; | ||
445 | long calc_load_active; | ||
446 | |||
447 | #ifdef CONFIG_SCHED_HRTICK | ||
448 | #ifdef CONFIG_SMP | ||
449 | int hrtick_csd_pending; | ||
450 | struct call_single_data hrtick_csd; | ||
451 | #endif | ||
452 | struct hrtimer hrtick_timer; | ||
453 | #endif | ||
454 | |||
455 | #ifdef CONFIG_SCHEDSTATS | ||
456 | /* latency stats */ | ||
457 | struct sched_info rq_sched_info; | ||
458 | unsigned long long rq_cpu_time; | ||
459 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | ||
460 | |||
461 | /* sys_sched_yield() stats */ | ||
462 | unsigned int yld_count; | ||
463 | |||
464 | /* schedule() stats */ | ||
465 | unsigned int sched_switch; | ||
466 | unsigned int sched_count; | ||
467 | unsigned int sched_goidle; | ||
468 | |||
469 | /* try_to_wake_up() stats */ | ||
470 | unsigned int ttwu_count; | ||
471 | unsigned int ttwu_local; | ||
472 | #endif | ||
473 | |||
474 | #ifdef CONFIG_SMP | ||
475 | struct llist_head wake_list; | ||
476 | #endif | ||
477 | }; | ||
478 | |||
479 | static inline int cpu_of(struct rq *rq) | ||
480 | { | ||
481 | #ifdef CONFIG_SMP | ||
482 | return rq->cpu; | ||
483 | #else | ||
484 | return 0; | ||
485 | #endif | ||
486 | } | ||
487 | |||
488 | DECLARE_PER_CPU(struct rq, runqueues); | ||
489 | |||
490 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | ||
491 | #define this_rq() (&__get_cpu_var(runqueues)) | ||
492 | #define task_rq(p) cpu_rq(task_cpu(p)) | ||
493 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | ||
494 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
495 | |||
496 | #ifdef CONFIG_SMP | ||
497 | |||
498 | #define rcu_dereference_check_sched_domain(p) \ | ||
499 | rcu_dereference_check((p), \ | ||
500 | lockdep_is_held(&sched_domains_mutex)) | ||
501 | |||
502 | /* | ||
503 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
504 | * See detach_destroy_domains: synchronize_sched for details. | ||
505 | * | ||
506 | * The domain tree of any CPU may only be accessed from within | ||
507 | * preempt-disabled sections. | ||
508 | */ | ||
509 | #define for_each_domain(cpu, __sd) \ | ||
510 | for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ | ||
511 | __sd; __sd = __sd->parent) | ||
512 | |||
513 | #define for_each_lower_domain(sd) for (; sd; sd = sd->child) | ||
514 | |||
515 | /** | ||
516 | * highest_flag_domain - Return highest sched_domain containing flag. | ||
517 | * @cpu: The cpu whose highest level of sched domain is to | ||
518 | * be returned. | ||
519 | * @flag: The flag to check for the highest sched_domain | ||
520 | * for the given cpu. | ||
521 | * | ||
522 | * Returns the highest sched_domain of a cpu which contains the given flag. | ||
523 | */ | ||
524 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | ||
525 | { | ||
526 | struct sched_domain *sd, *hsd = NULL; | ||
527 | |||
528 | for_each_domain(cpu, sd) { | ||
529 | if (!(sd->flags & flag)) | ||
530 | break; | ||
531 | hsd = sd; | ||
532 | } | ||
533 | |||
534 | return hsd; | ||
535 | } | ||
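highest_flag_domain() relies on the flag being set contiguously from the bottom of the hierarchy: it walks upward, remembers the last level that still carries the flag, and stops at the first one that does not. The same shape on a toy parent-linked chain (struct and flag names here are made up):

#include <stdio.h>

struct dom {
	int flags;
	const char *name;
	struct dom *parent;
};

#define FLAG_SHARE_PKG 0x1

static struct dom *highest_with_flag(struct dom *sd, int flag)
{
	struct dom *hsd = NULL;

	for (; sd; sd = sd->parent) {
		if (!(sd->flags & flag))
			break;		/* flag is contiguous from the bottom */
		hsd = sd;
	}
	return hsd;
}

int main(void)
{
	struct dom numa = { 0,              "NUMA", NULL  };
	struct dom mc   = { FLAG_SHARE_PKG, "MC",   &numa };
	struct dom smt  = { FLAG_SHARE_PKG, "SMT",  &mc   };

	printf("%s\n", highest_with_flag(&smt, FLAG_SHARE_PKG)->name);	/* MC */
	return 0;
}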
536 | |||
537 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | ||
538 | DECLARE_PER_CPU(int, sd_llc_id); | ||
539 | |||
540 | #endif /* CONFIG_SMP */ | ||
541 | |||
542 | #include "stats.h" | ||
543 | #include "auto_group.h" | ||
544 | |||
545 | #ifdef CONFIG_CGROUP_SCHED | ||
546 | |||
547 | /* | ||
548 | * Return the group to which this task belongs. | ||
549 | * | ||
550 | * We use task_subsys_state_check() and extend the RCU verification with | ||
551 | * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each | ||
552 | * task it moves into the cgroup. Therefore by holding either of those locks, | ||
553 | * we pin the task to the current cgroup. | ||
554 | */ | ||
555 | static inline struct task_group *task_group(struct task_struct *p) | ||
556 | { | ||
557 | struct task_group *tg; | ||
558 | struct cgroup_subsys_state *css; | ||
559 | |||
560 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | ||
561 | lockdep_is_held(&p->pi_lock) || | ||
562 | lockdep_is_held(&task_rq(p)->lock)); | ||
563 | tg = container_of(css, struct task_group, css); | ||
564 | |||
565 | return autogroup_task_group(p, tg); | ||
566 | } | ||
567 | |||
568 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | ||
569 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | ||
570 | { | ||
571 | #if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) | ||
572 | struct task_group *tg = task_group(p); | ||
573 | #endif | ||
574 | |||
575 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
576 | p->se.cfs_rq = tg->cfs_rq[cpu]; | ||
577 | p->se.parent = tg->se[cpu]; | ||
578 | #endif | ||
579 | |||
580 | #ifdef CONFIG_RT_GROUP_SCHED | ||
581 | p->rt.rt_rq = tg->rt_rq[cpu]; | ||
582 | p->rt.parent = tg->rt_se[cpu]; | ||
583 | #endif | ||
584 | } | ||
585 | |||
586 | #else /* CONFIG_CGROUP_SCHED */ | ||
587 | |||
588 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | ||
589 | static inline struct task_group *task_group(struct task_struct *p) | ||
590 | { | ||
591 | return NULL; | ||
592 | } | ||
593 | |||
594 | #endif /* CONFIG_CGROUP_SCHED */ | ||
595 | |||
596 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
597 | { | ||
598 | set_task_rq(p, cpu); | ||
599 | #ifdef CONFIG_SMP | ||
600 | /* | ||
601 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
602 | * successfully executed on another CPU. We must ensure that updates of | ||
603 | * per-task data have been completed by this moment. | ||
604 | */ | ||
605 | smp_wmb(); | ||
606 | task_thread_info(p)->cpu = cpu; | ||
607 | #endif | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | ||
612 | */ | ||
613 | #ifdef CONFIG_SCHED_DEBUG | ||
614 | # include <linux/jump_label.h> | ||
615 | # define const_debug __read_mostly | ||
616 | #else | ||
617 | # define const_debug const | ||
618 | #endif | ||
619 | |||
620 | extern const_debug unsigned int sysctl_sched_features; | ||
621 | |||
622 | #define SCHED_FEAT(name, enabled) \ | ||
623 | __SCHED_FEAT_##name , | ||
624 | |||
625 | enum { | ||
626 | #include "features.h" | ||
627 | __SCHED_FEAT_NR, | ||
628 | }; | ||
629 | |||
630 | #undef SCHED_FEAT | ||
631 | |||
632 | #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) | ||
633 | static __always_inline bool static_branch__true(struct jump_label_key *key) | ||
634 | { | ||
635 | return likely(static_branch(key)); /* Not out of line branch. */ | ||
636 | } | ||
637 | |||
638 | static __always_inline bool static_branch__false(struct jump_label_key *key) | ||
639 | { | ||
640 | return unlikely(static_branch(key)); /* Out of line branch. */ | ||
641 | } | ||
642 | |||
643 | #define SCHED_FEAT(name, enabled) \ | ||
644 | static __always_inline bool static_branch_##name(struct jump_label_key *key) \ | ||
645 | { \ | ||
646 | return static_branch__##enabled(key); \ | ||
647 | } | ||
648 | |||
649 | #include "features.h" | ||
650 | |||
651 | #undef SCHED_FEAT | ||
652 | |||
653 | extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; | ||
654 | #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) | ||
655 | #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ | ||
656 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | ||
657 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | ||
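The block above is an x-macro: "features.h" is included twice with different definitions of SCHED_FEAT(), once to build the __SCHED_FEAT_* enum and once to build per-feature static-branch helpers. A minimal userspace sketch of the same pattern (using a list macro instead of re-including a header, and with invented feature names) looks roughly like this:

/* Sketch of the SCHED_FEAT() x-macro pattern; feature names are invented. */
#include <stdio.h>

#define FEAT_LIST(F)            \
        F(GENTLE_SLEEPERS, 1)   \
        F(EXAMPLE_HRTICK,  0)

/* First expansion: generate the enum of feature indices. */
#define SCHED_FEAT(name, enabled) __FEAT_##name,
enum { FEAT_LIST(SCHED_FEAT) __FEAT_NR };
#undef SCHED_FEAT

/* Second expansion: pack the defaults into a bitmask (the !jump-label path). */
#define SCHED_FEAT(name, enabled) ((enabled) << __FEAT_##name) |
static const unsigned int sched_features = FEAT_LIST(SCHED_FEAT) 0;
#undef SCHED_FEAT

#define sched_feat(x) (sched_features & (1U << __FEAT_##x))

int main(void)
{
        printf("GENTLE_SLEEPERS=%d EXAMPLE_HRTICK=%d NR=%d\n",
               !!sched_feat(GENTLE_SLEEPERS), !!sched_feat(EXAMPLE_HRTICK),
               __FEAT_NR);
        return 0;
}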
658 | |||
659 | static inline u64 global_rt_period(void) | ||
660 | { | ||
661 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | ||
662 | } | ||
663 | |||
664 | static inline u64 global_rt_runtime(void) | ||
665 | { | ||
666 | if (sysctl_sched_rt_runtime < 0) | ||
667 | return RUNTIME_INF; | ||
668 | |||
669 | return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; | ||
670 | } | ||
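For concreteness, assuming the usual sysctl defaults (sched_rt_period_us = 1000000 and sched_rt_runtime_us = 950000, which are not visible in this hunk), global_rt_period() returns 1,000,000,000 ns and global_rt_runtime() returns 950,000,000 ns, i.e. realtime tasks may consume at most 95% of each one-second period. Writing -1 to the runtime sysctl makes global_rt_runtime() return RUNTIME_INF, disabling the throttling.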
671 | |||
672 | |||
673 | |||
674 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
675 | { | ||
676 | return rq->curr == p; | ||
677 | } | ||
678 | |||
679 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
680 | { | ||
681 | #ifdef CONFIG_SMP | ||
682 | return p->on_cpu; | ||
683 | #else | ||
684 | return task_current(rq, p); | ||
685 | #endif | ||
686 | } | ||
687 | |||
688 | |||
689 | #ifndef prepare_arch_switch | ||
690 | # define prepare_arch_switch(next) do { } while (0) | ||
691 | #endif | ||
692 | #ifndef finish_arch_switch | ||
693 | # define finish_arch_switch(prev) do { } while (0) | ||
694 | #endif | ||
695 | |||
696 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
697 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
698 | { | ||
699 | #ifdef CONFIG_SMP | ||
700 | /* | ||
701 | * We can optimise this out completely for !SMP, because the | ||
702 | * SMP rebalancing from interrupt is the only thing that cares | ||
703 | * here. | ||
704 | */ | ||
705 | next->on_cpu = 1; | ||
706 | #endif | ||
707 | } | ||
708 | |||
709 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
710 | { | ||
711 | #ifdef CONFIG_SMP | ||
712 | /* | ||
713 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
714 | * We must ensure this doesn't happen until the switch is completely | ||
715 | * finished. | ||
716 | */ | ||
717 | smp_wmb(); | ||
718 | prev->on_cpu = 0; | ||
719 | #endif | ||
720 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
721 | /* this is a valid case when another task releases the spinlock */ | ||
722 | rq->lock.owner = current; | ||
723 | #endif | ||
724 | /* | ||
725 | * If we are tracking spinlock dependencies then we have to | ||
726 | * fix up the runqueue lock - which gets 'carried over' from | ||
727 | * prev into current: | ||
728 | */ | ||
729 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | ||
730 | |||
731 | raw_spin_unlock_irq(&rq->lock); | ||
732 | } | ||
733 | |||
734 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
735 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | ||
736 | { | ||
737 | #ifdef CONFIG_SMP | ||
738 | /* | ||
739 | * We can optimise this out completely for !SMP, because the | ||
740 | * SMP rebalancing from interrupt is the only thing that cares | ||
741 | * here. | ||
742 | */ | ||
743 | next->on_cpu = 1; | ||
744 | #endif | ||
745 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
746 | raw_spin_unlock_irq(&rq->lock); | ||
747 | #else | ||
748 | raw_spin_unlock(&rq->lock); | ||
749 | #endif | ||
750 | } | ||
751 | |||
752 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | ||
753 | { | ||
754 | #ifdef CONFIG_SMP | ||
755 | /* | ||
756 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
757 | * We must ensure this doesn't happen until the switch is completely | ||
758 | * finished. | ||
759 | */ | ||
760 | smp_wmb(); | ||
761 | prev->on_cpu = 0; | ||
762 | #endif | ||
763 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
764 | local_irq_enable(); | ||
765 | #endif | ||
766 | } | ||
767 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
768 | |||
769 | |||
770 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | ||
771 | { | ||
772 | lw->weight += inc; | ||
773 | lw->inv_weight = 0; | ||
774 | } | ||
775 | |||
776 | static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | ||
777 | { | ||
778 | lw->weight -= dec; | ||
779 | lw->inv_weight = 0; | ||
780 | } | ||
781 | |||
782 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
783 | { | ||
784 | lw->weight = w; | ||
785 | lw->inv_weight = 0; | ||
786 | } | ||
787 | |||
788 | /* | ||
789 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
790 | * of tasks with abnormal "nice" values across CPUs, the contribution that | ||
791 | * each task makes to its run queue's load is weighted according to its | ||
792 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
793 | * scaled version of the new time slice allocation that they receive on time | ||
794 | * slice expiry etc. | ||
795 | */ | ||
796 | |||
797 | #define WEIGHT_IDLEPRIO 3 | ||
798 | #define WMULT_IDLEPRIO 1431655765 | ||
799 | |||
800 | /* | ||
801 | * Nice levels are multiplicative, with a gentle 10% change for every | ||
802 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | ||
803 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | ||
804 | * that remained on nice 0. | ||
805 | * | ||
806 | * The "10% effect" is relative and cumulative: from _any_ nice level, | ||
807 | * if you go up 1 level, it's -10% CPU usage; if you go down 1 level | ||
808 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | ||
809 | * If a task goes up by ~10% and another task goes down by ~10% then | ||
810 | * the relative distance between them is ~25%.) | ||
811 | */ | ||
812 | static const int prio_to_weight[40] = { | ||
813 | /* -20 */ 88761, 71755, 56483, 46273, 36291, | ||
814 | /* -15 */ 29154, 23254, 18705, 14949, 11916, | ||
815 | /* -10 */ 9548, 7620, 6100, 4904, 3906, | ||
816 | /* -5 */ 3121, 2501, 1991, 1586, 1277, | ||
817 | /* 0 */ 1024, 820, 655, 526, 423, | ||
818 | /* 5 */ 335, 272, 215, 172, 137, | ||
819 | /* 10 */ 110, 87, 70, 56, 45, | ||
820 | /* 15 */ 36, 29, 23, 18, 15, | ||
821 | }; | ||
822 | |||
823 | /* | ||
824 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | ||
825 | * | ||
826 | * In cases where the weight does not change often, we can use the | ||
827 | * precalculated inverse to speed up arithmetic by turning divisions | ||
828 | * into multiplications: | ||
829 | */ | ||
830 | static const u32 prio_to_wmult[40] = { | ||
831 | /* -20 */ 48388, 59856, 76040, 92818, 118348, | ||
832 | /* -15 */ 147320, 184698, 229616, 287308, 360437, | ||
833 | /* -10 */ 449829, 563644, 704093, 875809, 1099582, | ||
834 | /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, | ||
835 | /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, | ||
836 | /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, | ||
837 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | ||
838 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | ||
839 | }; | ||
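Two properties of these tables can be checked by hand: adjacent weights differ by the documented ~1.25 factor (1024/820 ≈ 1.25, 820/655 ≈ 1.25), and each wmult entry is roughly 2^32 divided by the matching weight (4194304 = 2^32/1024). The following standalone sketch shows the multiply-and-shift trick the inverse enables; it is not the kernel's actual delta-scaling code, just the arithmetic idea:

/* Illustration: dividing by a weight via its precomputed 2^32/weight inverse. */
#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

static uint64_t div_by_weight(uint64_t delta, uint32_t inv_weight)
{
        /* delta / weight, since inv_weight ~= 2^32 / weight */
        return (delta * inv_weight) >> 32;
}

int main(void)
{
        /* nice  0: weight 1024, inverse 4194304  */
        printf("%" PRIu64 "\n", div_by_weight(3000000, 4194304));   /* ~2929 */
        /* nice +5: weight  335, inverse 12820798 */
        printf("%" PRIu64 "\n", div_by_weight(3000000, 12820798));  /* ~8955 */
        return 0;
}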
840 | |||
841 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
842 | enum cpuacct_stat_index { | ||
843 | CPUACCT_STAT_USER, /* ... user mode */ | ||
844 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
845 | |||
846 | CPUACCT_STAT_NSTATS, | ||
847 | }; | ||
848 | |||
849 | |||
850 | #define sched_class_highest (&stop_sched_class) | ||
851 | #define for_each_class(class) \ | ||
852 | for (class = sched_class_highest; class; class = class->next) | ||
853 | |||
854 | extern const struct sched_class stop_sched_class; | ||
855 | extern const struct sched_class rt_sched_class; | ||
856 | extern const struct sched_class fair_sched_class; | ||
857 | extern const struct sched_class idle_sched_class; | ||
858 | |||
859 | |||
860 | #ifdef CONFIG_SMP | ||
861 | |||
862 | extern void trigger_load_balance(struct rq *rq, int cpu); | ||
863 | extern void idle_balance(int this_cpu, struct rq *this_rq); | ||
864 | |||
865 | #else /* CONFIG_SMP */ | ||
866 | |||
867 | static inline void idle_balance(int cpu, struct rq *rq) | ||
868 | { | ||
869 | } | ||
870 | |||
871 | #endif | ||
872 | |||
873 | extern void sysrq_sched_debug_show(void); | ||
874 | extern void sched_init_granularity(void); | ||
875 | extern void update_max_interval(void); | ||
876 | extern void update_group_power(struct sched_domain *sd, int cpu); | ||
877 | extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); | ||
878 | extern void init_sched_rt_class(void); | ||
879 | extern void init_sched_fair_class(void); | ||
880 | |||
881 | extern void resched_task(struct task_struct *p); | ||
882 | extern void resched_cpu(int cpu); | ||
883 | |||
884 | extern struct rt_bandwidth def_rt_bandwidth; | ||
885 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | ||
886 | |||
887 | extern void update_cpu_load(struct rq *this_rq); | ||
888 | |||
889 | #ifdef CONFIG_CGROUP_CPUACCT | ||
890 | #include <linux/cgroup.h> | ||
891 | /* track cpu usage of a group of tasks and its child groups */ | ||
892 | struct cpuacct { | ||
893 | struct cgroup_subsys_state css; | ||
894 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
895 | u64 __percpu *cpuusage; | ||
896 | struct kernel_cpustat __percpu *cpustat; | ||
897 | }; | ||
898 | |||
899 | /* return cpu accounting group corresponding to this container */ | ||
900 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
901 | { | ||
902 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
903 | struct cpuacct, css); | ||
904 | } | ||
905 | |||
906 | /* return cpu accounting group to which this task belongs */ | ||
907 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
908 | { | ||
909 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
910 | struct cpuacct, css); | ||
911 | } | ||
912 | |||
913 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
914 | { | ||
915 | if (!ca || !ca->css.cgroup->parent) | ||
916 | return NULL; | ||
917 | return cgroup_ca(ca->css.cgroup->parent); | ||
918 | } | ||
919 | |||
920 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
921 | #else | ||
922 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
923 | #endif | ||
924 | |||
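For orientation, the three helpers above are meant to compose into the charge path roughly as follows (CONFIG_CGROUP_CPUACCT assumed); this is a sketch of the expected shape of cpuacct_charge(), not a verbatim copy of it:

/* Sketch: charge @cputime to the task's cpuacct group and all its ancestors. */
void cpuacct_charge_sketch(struct task_struct *tsk, u64 cputime)
{
        struct cpuacct *ca;
        int cpu = task_cpu(tsk);

        rcu_read_lock();
        for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
                *per_cpu_ptr(ca->cpuusage, cpu) += cputime;
        rcu_read_unlock();
}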
925 | static inline void inc_nr_running(struct rq *rq) | ||
926 | { | ||
927 | rq->nr_running++; | ||
928 | } | ||
929 | |||
930 | static inline void dec_nr_running(struct rq *rq) | ||
931 | { | ||
932 | rq->nr_running--; | ||
933 | } | ||
934 | |||
935 | extern void update_rq_clock(struct rq *rq); | ||
936 | |||
937 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | ||
938 | extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); | ||
939 | |||
940 | extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); | ||
941 | |||
942 | extern const_debug unsigned int sysctl_sched_time_avg; | ||
943 | extern const_debug unsigned int sysctl_sched_nr_migrate; | ||
944 | extern const_debug unsigned int sysctl_sched_migration_cost; | ||
945 | |||
946 | static inline u64 sched_avg_period(void) | ||
947 | { | ||
948 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
949 | } | ||
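Assuming the default sysctl_sched_time_avg of 1000 (milliseconds; the default is set elsewhere and is stated here as an assumption), this evaluates to 1000 * NSEC_PER_MSEC / 2 = 500,000,000 ns, i.e. a half-second averaging period.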
950 | |||
951 | void calc_load_account_idle(struct rq *this_rq); | ||
952 | |||
953 | #ifdef CONFIG_SCHED_HRTICK | ||
954 | |||
955 | /* | ||
956 | * Use hrtick when: | ||
957 | * - enabled by features | ||
958 | * - hrtimer is actually high res | ||
959 | */ | ||
960 | static inline int hrtick_enabled(struct rq *rq) | ||
961 | { | ||
962 | if (!sched_feat(HRTICK)) | ||
963 | return 0; | ||
964 | if (!cpu_active(cpu_of(rq))) | ||
965 | return 0; | ||
966 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
967 | } | ||
968 | |||
969 | void hrtick_start(struct rq *rq, u64 delay); | ||
970 | |||
971 | #else | ||
972 | |||
973 | static inline int hrtick_enabled(struct rq *rq) | ||
974 | { | ||
975 | return 0; | ||
976 | } | ||
977 | |||
978 | #endif /* CONFIG_SCHED_HRTICK */ | ||
979 | |||
980 | #ifdef CONFIG_SMP | ||
981 | extern void sched_avg_update(struct rq *rq); | ||
982 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
983 | { | ||
984 | rq->rt_avg += rt_delta; | ||
985 | sched_avg_update(rq); | ||
986 | } | ||
987 | #else | ||
988 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } | ||
989 | static inline void sched_avg_update(struct rq *rq) { } | ||
990 | #endif | ||
991 | |||
992 | extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); | ||
993 | |||
994 | #ifdef CONFIG_SMP | ||
995 | #ifdef CONFIG_PREEMPT | ||
996 | |||
997 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
998 | |||
999 | /* | ||
1000 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | ||
1001 | * way at the expense of forcing extra atomic operations in all | ||
1002 | * invocations. This assures that the double_lock is acquired using the | ||
1003 | * same underlying policy as the spinlock_t on this architecture, which | ||
1004 | * reduces latency compared to the unfair variant below. However, it | ||
1005 | * also adds more overhead and therefore may reduce throughput. | ||
1006 | */ | ||
1007 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1008 | __releases(this_rq->lock) | ||
1009 | __acquires(busiest->lock) | ||
1010 | __acquires(this_rq->lock) | ||
1011 | { | ||
1012 | raw_spin_unlock(&this_rq->lock); | ||
1013 | double_rq_lock(this_rq, busiest); | ||
1014 | |||
1015 | return 1; | ||
1016 | } | ||
1017 | |||
1018 | #else | ||
1019 | /* | ||
1020 | * Unfair double_lock_balance: Optimizes throughput at the expense of | ||
1021 | * latency by eliminating extra atomic operations when the locks are | ||
1022 | * already in proper order on entry. This favors lower cpu-ids and will | ||
1023 | * grant the double lock to lower cpus over higher ids under contention, | ||
1024 | * regardless of entry order into the function. | ||
1025 | */ | ||
1026 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1027 | __releases(this_rq->lock) | ||
1028 | __acquires(busiest->lock) | ||
1029 | __acquires(this_rq->lock) | ||
1030 | { | ||
1031 | int ret = 0; | ||
1032 | |||
1033 | if (unlikely(!raw_spin_trylock(&busiest->lock))) { | ||
1034 | if (busiest < this_rq) { | ||
1035 | raw_spin_unlock(&this_rq->lock); | ||
1036 | raw_spin_lock(&busiest->lock); | ||
1037 | raw_spin_lock_nested(&this_rq->lock, | ||
1038 | SINGLE_DEPTH_NESTING); | ||
1039 | ret = 1; | ||
1040 | } else | ||
1041 | raw_spin_lock_nested(&busiest->lock, | ||
1042 | SINGLE_DEPTH_NESTING); | ||
1043 | } | ||
1044 | return ret; | ||
1045 | } | ||
1046 | |||
1047 | #endif /* CONFIG_PREEMPT */ | ||
1048 | |||
1049 | /* | ||
1050 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | ||
1051 | */ | ||
1052 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) | ||
1053 | { | ||
1054 | if (unlikely(!irqs_disabled())) { | ||
1055 | /* printk() doesn't work well under rq->lock */ | ||
1056 | raw_spin_unlock(&this_rq->lock); | ||
1057 | BUG_ON(1); | ||
1058 | } | ||
1059 | |||
1060 | return _double_lock_balance(this_rq, busiest); | ||
1061 | } | ||
1062 | |||
1063 | static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) | ||
1064 | __releases(busiest->lock) | ||
1065 | { | ||
1066 | raw_spin_unlock(&busiest->lock); | ||
1067 | lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); | ||
1068 | } | ||
1069 | |||
1070 | /* | ||
1071 | * double_rq_lock - safely lock two runqueues | ||
1072 | * | ||
1073 | * Note this does not disable interrupts like task_rq_lock; | ||
1074 | * you need to do so manually before calling. | ||
1075 | */ | ||
1076 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1077 | __acquires(rq1->lock) | ||
1078 | __acquires(rq2->lock) | ||
1079 | { | ||
1080 | BUG_ON(!irqs_disabled()); | ||
1081 | if (rq1 == rq2) { | ||
1082 | raw_spin_lock(&rq1->lock); | ||
1083 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1084 | } else { | ||
1085 | if (rq1 < rq2) { | ||
1086 | raw_spin_lock(&rq1->lock); | ||
1087 | raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); | ||
1088 | } else { | ||
1089 | raw_spin_lock(&rq2->lock); | ||
1090 | raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); | ||
1091 | } | ||
1092 | } | ||
1093 | } | ||
1094 | |||
1095 | /* | ||
1096 | * double_rq_unlock - safely unlock two runqueues | ||
1097 | * | ||
1098 | * Note this does not restore interrupts like task_rq_unlock; | ||
1099 | * you need to do so manually after calling. | ||
1100 | */ | ||
1101 | static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1102 | __releases(rq1->lock) | ||
1103 | __releases(rq2->lock) | ||
1104 | { | ||
1105 | raw_spin_unlock(&rq1->lock); | ||
1106 | if (rq1 != rq2) | ||
1107 | raw_spin_unlock(&rq2->lock); | ||
1108 | else | ||
1109 | __release(rq2->lock); | ||
1110 | } | ||
1111 | |||
1112 | #else /* CONFIG_SMP */ | ||
1113 | |||
1114 | /* | ||
1115 | * double_rq_lock - safely lock two runqueues | ||
1116 | * | ||
1117 | * Note this does not disable interrupts like task_rq_lock; | ||
1118 | * you need to do so manually before calling. | ||
1119 | */ | ||
1120 | static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1121 | __acquires(rq1->lock) | ||
1122 | __acquires(rq2->lock) | ||
1123 | { | ||
1124 | BUG_ON(!irqs_disabled()); | ||
1125 | BUG_ON(rq1 != rq2); | ||
1126 | raw_spin_lock(&rq1->lock); | ||
1127 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1128 | } | ||
1129 | |||
1130 | /* | ||
1131 | * double_rq_unlock - safely unlock two runqueues | ||
1132 | * | ||
1133 | * Note this does not restore interrupts like task_rq_unlock; | ||
1134 | * you need to do so manually after calling. | ||
1135 | */ | ||
1136 | static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1137 | __releases(rq1->lock) | ||
1138 | __releases(rq2->lock) | ||
1139 | { | ||
1140 | BUG_ON(rq1 != rq2); | ||
1141 | raw_spin_unlock(&rq1->lock); | ||
1142 | __release(rq2->lock); | ||
1143 | } | ||
1144 | |||
1145 | #endif | ||
1146 | |||
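Since double_rq_lock()/double_rq_unlock() deliberately leave interrupt state to the caller, callers are expected to bracket them roughly like this (a sketch only; the helper name is invented):

/* Sketch: the irq discipline expected around double_rq_lock(). */
static inline void example_move_between_rqs(struct rq *src, struct rq *dst)
{
        unsigned long flags;

        local_irq_save(flags);          /* caller must disable interrupts... */
        double_rq_lock(src, dst);
        /* ...migrate tasks, fix up load, etc... */
        double_rq_unlock(src, dst);
        local_irq_restore(flags);       /* ...and restore them afterwards */
}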
1147 | extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); | ||
1148 | extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); | ||
1149 | extern void print_cfs_stats(struct seq_file *m, int cpu); | ||
1150 | extern void print_rt_stats(struct seq_file *m, int cpu); | ||
1151 | |||
1152 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | ||
1153 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | ||
1154 | extern void unthrottle_offline_cfs_rqs(struct rq *rq); | ||
1155 | |||
1156 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | ||
1157 | |||
1158 | #ifdef CONFIG_NO_HZ | ||
1159 | enum rq_nohz_flag_bits { | ||
1160 | NOHZ_TICK_STOPPED, | ||
1161 | NOHZ_BALANCE_KICK, | ||
1162 | NOHZ_IDLE, | ||
1163 | }; | ||
1164 | |||
1165 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | ||
1166 | #endif | ||
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 000000000000..2a581ba8e190 --- /dev/null +++ b/kernel/sched/stats.c | |||
@@ -0,0 +1,111 @@ | |||
1 | |||
2 | #include <linux/slab.h> | ||
3 | #include <linux/fs.h> | ||
4 | #include <linux/seq_file.h> | ||
5 | #include <linux/proc_fs.h> | ||
6 | |||
7 | #include "sched.h" | ||
8 | |||
9 | /* | ||
10 | * bump this up when changing the output format or the meaning of an existing | ||
11 | * format, so that tools can adapt (or abort) | ||
12 | */ | ||
13 | #define SCHEDSTAT_VERSION 15 | ||
14 | |||
15 | static int show_schedstat(struct seq_file *seq, void *v) | ||
16 | { | ||
17 | int cpu; | ||
18 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
19 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
20 | |||
21 | if (mask_str == NULL) | ||
22 | return -ENOMEM; | ||
23 | |||
24 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
25 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
26 | for_each_online_cpu(cpu) { | ||
27 | struct rq *rq = cpu_rq(cpu); | ||
28 | #ifdef CONFIG_SMP | ||
29 | struct sched_domain *sd; | ||
30 | int dcount = 0; | ||
31 | #endif | ||
32 | |||
33 | /* runqueue-specific stats */ | ||
34 | seq_printf(seq, | ||
35 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
36 | cpu, rq->yld_count, | ||
37 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
38 | rq->ttwu_count, rq->ttwu_local, | ||
39 | rq->rq_cpu_time, | ||
40 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
41 | |||
42 | seq_printf(seq, "\n"); | ||
43 | |||
44 | #ifdef CONFIG_SMP | ||
45 | /* domain-specific stats */ | ||
46 | rcu_read_lock(); | ||
47 | for_each_domain(cpu, sd) { | ||
48 | enum cpu_idle_type itype; | ||
49 | |||
50 | cpumask_scnprintf(mask_str, mask_len, | ||
51 | sched_domain_span(sd)); | ||
52 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
53 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
54 | itype++) { | ||
55 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
56 | sd->lb_count[itype], | ||
57 | sd->lb_balanced[itype], | ||
58 | sd->lb_failed[itype], | ||
59 | sd->lb_imbalance[itype], | ||
60 | sd->lb_gained[itype], | ||
61 | sd->lb_hot_gained[itype], | ||
62 | sd->lb_nobusyq[itype], | ||
63 | sd->lb_nobusyg[itype]); | ||
64 | } | ||
65 | seq_printf(seq, | ||
66 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
67 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
68 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
69 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
70 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
71 | sd->ttwu_move_balance); | ||
72 | } | ||
73 | rcu_read_unlock(); | ||
74 | #endif | ||
75 | } | ||
76 | kfree(mask_str); | ||
77 | return 0; | ||
78 | } | ||
79 | |||
80 | static int schedstat_open(struct inode *inode, struct file *file) | ||
81 | { | ||
82 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
83 | char *buf = kmalloc(size, GFP_KERNEL); | ||
84 | struct seq_file *m; | ||
85 | int res; | ||
86 | |||
87 | if (!buf) | ||
88 | return -ENOMEM; | ||
89 | res = single_open(file, show_schedstat, NULL); | ||
90 | if (!res) { | ||
91 | m = file->private_data; | ||
92 | m->buf = buf; | ||
93 | m->size = size; | ||
94 | } else | ||
95 | kfree(buf); | ||
96 | return res; | ||
97 | } | ||
98 | |||
99 | static const struct file_operations proc_schedstat_operations = { | ||
100 | .open = schedstat_open, | ||
101 | .read = seq_read, | ||
102 | .llseek = seq_lseek, | ||
103 | .release = single_release, | ||
104 | }; | ||
105 | |||
106 | static int __init proc_schedstat_init(void) | ||
107 | { | ||
108 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
109 | return 0; | ||
110 | } | ||
111 | module_init(proc_schedstat_init); | ||
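A userspace sketch of the consumption side the SCHEDSTAT_VERSION comment has in mind: verify the version line first, then read the timestamp and the per-cpu lines (domain lines are skipped here for brevity):

/* Sketch: minimal consumer of /proc/schedstat as formatted above. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/schedstat", "r");
        unsigned long timestamp;
        char line[1024];
        int version;

        if (!f)
                return 1;
        if (fscanf(f, "version %d\n", &version) != 1 || version != 15) {
                fclose(f);              /* unknown format: adapt or abort */
                return 1;
        }
        if (fscanf(f, "timestamp %lu\n", &timestamp) != 1) {
                fclose(f);
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                if (line[0] == 'c')     /* "cpuN ..." runqueue stats */
                        fputs(line, stdout);
        fclose(f);
        return 0;
}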
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h index 331e01bcd026..2ef90a51ec5e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched/stats.h | |||
@@ -1,108 +1,5 @@ | |||
1 | 1 | ||
2 | #ifdef CONFIG_SCHEDSTATS | 2 | #ifdef CONFIG_SCHEDSTATS |
3 | /* | ||
4 | * bump this up when changing the output format or the meaning of an existing | ||
5 | * format, so that tools can adapt (or abort) | ||
6 | */ | ||
7 | #define SCHEDSTAT_VERSION 15 | ||
8 | |||
9 | static int show_schedstat(struct seq_file *seq, void *v) | ||
10 | { | ||
11 | int cpu; | ||
12 | int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; | ||
13 | char *mask_str = kmalloc(mask_len, GFP_KERNEL); | ||
14 | |||
15 | if (mask_str == NULL) | ||
16 | return -ENOMEM; | ||
17 | |||
18 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | ||
19 | seq_printf(seq, "timestamp %lu\n", jiffies); | ||
20 | for_each_online_cpu(cpu) { | ||
21 | struct rq *rq = cpu_rq(cpu); | ||
22 | #ifdef CONFIG_SMP | ||
23 | struct sched_domain *sd; | ||
24 | int dcount = 0; | ||
25 | #endif | ||
26 | |||
27 | /* runqueue-specific stats */ | ||
28 | seq_printf(seq, | ||
29 | "cpu%d %u %u %u %u %u %u %llu %llu %lu", | ||
30 | cpu, rq->yld_count, | ||
31 | rq->sched_switch, rq->sched_count, rq->sched_goidle, | ||
32 | rq->ttwu_count, rq->ttwu_local, | ||
33 | rq->rq_cpu_time, | ||
34 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); | ||
35 | |||
36 | seq_printf(seq, "\n"); | ||
37 | |||
38 | #ifdef CONFIG_SMP | ||
39 | /* domain-specific stats */ | ||
40 | rcu_read_lock(); | ||
41 | for_each_domain(cpu, sd) { | ||
42 | enum cpu_idle_type itype; | ||
43 | |||
44 | cpumask_scnprintf(mask_str, mask_len, | ||
45 | sched_domain_span(sd)); | ||
46 | seq_printf(seq, "domain%d %s", dcount++, mask_str); | ||
47 | for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; | ||
48 | itype++) { | ||
49 | seq_printf(seq, " %u %u %u %u %u %u %u %u", | ||
50 | sd->lb_count[itype], | ||
51 | sd->lb_balanced[itype], | ||
52 | sd->lb_failed[itype], | ||
53 | sd->lb_imbalance[itype], | ||
54 | sd->lb_gained[itype], | ||
55 | sd->lb_hot_gained[itype], | ||
56 | sd->lb_nobusyq[itype], | ||
57 | sd->lb_nobusyg[itype]); | ||
58 | } | ||
59 | seq_printf(seq, | ||
60 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
61 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | ||
62 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | ||
63 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | ||
64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | ||
65 | sd->ttwu_move_balance); | ||
66 | } | ||
67 | rcu_read_unlock(); | ||
68 | #endif | ||
69 | } | ||
70 | kfree(mask_str); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static int schedstat_open(struct inode *inode, struct file *file) | ||
75 | { | ||
76 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | ||
77 | char *buf = kmalloc(size, GFP_KERNEL); | ||
78 | struct seq_file *m; | ||
79 | int res; | ||
80 | |||
81 | if (!buf) | ||
82 | return -ENOMEM; | ||
83 | res = single_open(file, show_schedstat, NULL); | ||
84 | if (!res) { | ||
85 | m = file->private_data; | ||
86 | m->buf = buf; | ||
87 | m->size = size; | ||
88 | } else | ||
89 | kfree(buf); | ||
90 | return res; | ||
91 | } | ||
92 | |||
93 | static const struct file_operations proc_schedstat_operations = { | ||
94 | .open = schedstat_open, | ||
95 | .read = seq_read, | ||
96 | .llseek = seq_lseek, | ||
97 | .release = single_release, | ||
98 | }; | ||
99 | |||
100 | static int __init proc_schedstat_init(void) | ||
101 | { | ||
102 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | ||
103 | return 0; | ||
104 | } | ||
105 | module_init(proc_schedstat_init); | ||
106 | 3 | ||
107 | /* | 4 | /* |
108 | * Expects runqueue lock to be held for atomicity of update | 5 | * Expects runqueue lock to be held for atomicity of update |
@@ -282,10 +179,9 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
282 | if (!cputimer->running) | 179 | if (!cputimer->running) |
283 | return; | 180 | return; |
284 | 181 | ||
285 | spin_lock(&cputimer->lock); | 182 | raw_spin_lock(&cputimer->lock); |
286 | cputimer->cputime.utime = | 183 | cputimer->cputime.utime += cputime; |
287 | cputime_add(cputimer->cputime.utime, cputime); | 184 | raw_spin_unlock(&cputimer->lock); |
288 | spin_unlock(&cputimer->lock); | ||
289 | } | 185 | } |
290 | 186 | ||
291 | /** | 187 | /** |
@@ -306,10 +202,9 @@ static inline void account_group_system_time(struct task_struct *tsk, | |||
306 | if (!cputimer->running) | 202 | if (!cputimer->running) |
307 | return; | 203 | return; |
308 | 204 | ||
309 | spin_lock(&cputimer->lock); | 205 | raw_spin_lock(&cputimer->lock); |
310 | cputimer->cputime.stime = | 206 | cputimer->cputime.stime += cputime; |
311 | cputime_add(cputimer->cputime.stime, cputime); | 207 | raw_spin_unlock(&cputimer->lock); |
312 | spin_unlock(&cputimer->lock); | ||
313 | } | 208 | } |
314 | 209 | ||
315 | /** | 210 | /** |
@@ -330,7 +225,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, | |||
330 | if (!cputimer->running) | 225 | if (!cputimer->running) |
331 | return; | 226 | return; |
332 | 227 | ||
333 | spin_lock(&cputimer->lock); | 228 | raw_spin_lock(&cputimer->lock); |
334 | cputimer->cputime.sum_exec_runtime += ns; | 229 | cputimer->cputime.sum_exec_runtime += ns; |
335 | spin_unlock(&cputimer->lock); | 230 | raw_spin_unlock(&cputimer->lock); |
336 | } | 231 | } |
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c index 6f437632afab..7b386e86fd23 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched/stop_task.c | |||
@@ -1,3 +1,5 @@ | |||
1 | #include "sched.h" | ||
2 | |||
1 | /* | 3 | /* |
2 | * stop-task scheduling class. | 4 | * stop-task scheduling class. |
3 | * | 5 | * |
@@ -34,11 +36,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
34 | static void | 36 | static void |
35 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 37 | enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
36 | { | 38 | { |
39 | inc_nr_running(rq); | ||
37 | } | 40 | } |
38 | 41 | ||
39 | static void | 42 | static void |
40 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) | 43 | dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) |
41 | { | 44 | { |
45 | dec_nr_running(rq); | ||
42 | } | 46 | } |
43 | 47 | ||
44 | static void yield_task_stop(struct rq *rq) | 48 | static void yield_task_stop(struct rq *rq) |
@@ -78,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) | |||
78 | /* | 82 | /* |
79 | * Simple, special scheduling class for the per-CPU stop tasks: | 83 | * Simple, special scheduling class for the per-CPU stop tasks: |
80 | */ | 84 | */ |
81 | static const struct sched_class stop_sched_class = { | 85 | const struct sched_class stop_sched_class = { |
82 | .next = &rt_sched_class, | 86 | .next = &rt_sched_class, |
83 | 87 | ||
84 | .enqueue_task = enqueue_task_stop, | 88 | .enqueue_task = enqueue_task_stop, |
diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0d4ade..60636a4e25c3 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c | |||
@@ -27,7 +27,7 @@ | |||
27 | 27 | ||
28 | #include <linux/compiler.h> | 28 | #include <linux/compiler.h> |
29 | #include <linux/kernel.h> | 29 | #include <linux/kernel.h> |
30 | #include <linux/module.h> | 30 | #include <linux/export.h> |
31 | #include <linux/sched.h> | 31 | #include <linux/sched.h> |
32 | #include <linux/semaphore.h> | 32 | #include <linux/semaphore.h> |
33 | #include <linux/spinlock.h> | 33 | #include <linux/spinlock.h> |
@@ -54,12 +54,12 @@ void down(struct semaphore *sem) | |||
54 | { | 54 | { |
55 | unsigned long flags; | 55 | unsigned long flags; |
56 | 56 | ||
57 | spin_lock_irqsave(&sem->lock, flags); | 57 | raw_spin_lock_irqsave(&sem->lock, flags); |
58 | if (likely(sem->count > 0)) | 58 | if (likely(sem->count > 0)) |
59 | sem->count--; | 59 | sem->count--; |
60 | else | 60 | else |
61 | __down(sem); | 61 | __down(sem); |
62 | spin_unlock_irqrestore(&sem->lock, flags); | 62 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
63 | } | 63 | } |
64 | EXPORT_SYMBOL(down); | 64 | EXPORT_SYMBOL(down); |
65 | 65 | ||
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem) | |||
77 | unsigned long flags; | 77 | unsigned long flags; |
78 | int result = 0; | 78 | int result = 0; |
79 | 79 | ||
80 | spin_lock_irqsave(&sem->lock, flags); | 80 | raw_spin_lock_irqsave(&sem->lock, flags); |
81 | if (likely(sem->count > 0)) | 81 | if (likely(sem->count > 0)) |
82 | sem->count--; | 82 | sem->count--; |
83 | else | 83 | else |
84 | result = __down_interruptible(sem); | 84 | result = __down_interruptible(sem); |
85 | spin_unlock_irqrestore(&sem->lock, flags); | 85 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
86 | 86 | ||
87 | return result; | 87 | return result; |
88 | } | 88 | } |
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) | |||
103 | unsigned long flags; | 103 | unsigned long flags; |
104 | int result = 0; | 104 | int result = 0; |
105 | 105 | ||
106 | spin_lock_irqsave(&sem->lock, flags); | 106 | raw_spin_lock_irqsave(&sem->lock, flags); |
107 | if (likely(sem->count > 0)) | 107 | if (likely(sem->count > 0)) |
108 | sem->count--; | 108 | sem->count--; |
109 | else | 109 | else |
110 | result = __down_killable(sem); | 110 | result = __down_killable(sem); |
111 | spin_unlock_irqrestore(&sem->lock, flags); | 111 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
112 | 112 | ||
113 | return result; | 113 | return result; |
114 | } | 114 | } |
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) | |||
132 | unsigned long flags; | 132 | unsigned long flags; |
133 | int count; | 133 | int count; |
134 | 134 | ||
135 | spin_lock_irqsave(&sem->lock, flags); | 135 | raw_spin_lock_irqsave(&sem->lock, flags); |
136 | count = sem->count - 1; | 136 | count = sem->count - 1; |
137 | if (likely(count >= 0)) | 137 | if (likely(count >= 0)) |
138 | sem->count = count; | 138 | sem->count = count; |
139 | spin_unlock_irqrestore(&sem->lock, flags); | 139 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
140 | 140 | ||
141 | return (count < 0); | 141 | return (count < 0); |
142 | } | 142 | } |
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies) | |||
157 | unsigned long flags; | 157 | unsigned long flags; |
158 | int result = 0; | 158 | int result = 0; |
159 | 159 | ||
160 | spin_lock_irqsave(&sem->lock, flags); | 160 | raw_spin_lock_irqsave(&sem->lock, flags); |
161 | if (likely(sem->count > 0)) | 161 | if (likely(sem->count > 0)) |
162 | sem->count--; | 162 | sem->count--; |
163 | else | 163 | else |
164 | result = __down_timeout(sem, jiffies); | 164 | result = __down_timeout(sem, jiffies); |
165 | spin_unlock_irqrestore(&sem->lock, flags); | 165 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
166 | 166 | ||
167 | return result; | 167 | return result; |
168 | } | 168 | } |
@@ -179,12 +179,12 @@ void up(struct semaphore *sem) | |||
179 | { | 179 | { |
180 | unsigned long flags; | 180 | unsigned long flags; |
181 | 181 | ||
182 | spin_lock_irqsave(&sem->lock, flags); | 182 | raw_spin_lock_irqsave(&sem->lock, flags); |
183 | if (likely(list_empty(&sem->wait_list))) | 183 | if (likely(list_empty(&sem->wait_list))) |
184 | sem->count++; | 184 | sem->count++; |
185 | else | 185 | else |
186 | __up(sem); | 186 | __up(sem); |
187 | spin_unlock_irqrestore(&sem->lock, flags); | 187 | raw_spin_unlock_irqrestore(&sem->lock, flags); |
188 | } | 188 | } |
189 | EXPORT_SYMBOL(up); | 189 | EXPORT_SYMBOL(up); |
190 | 190 | ||
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state, | |||
217 | if (timeout <= 0) | 217 | if (timeout <= 0) |
218 | goto timed_out; | 218 | goto timed_out; |
219 | __set_task_state(task, state); | 219 | __set_task_state(task, state); |
220 | spin_unlock_irq(&sem->lock); | 220 | raw_spin_unlock_irq(&sem->lock); |
221 | timeout = schedule_timeout(timeout); | 221 | timeout = schedule_timeout(timeout); |
222 | spin_lock_irq(&sem->lock); | 222 | raw_spin_lock_irq(&sem->lock); |
223 | if (waiter.up) | 223 | if (waiter.up) |
224 | return 0; | 224 | return 0; |
225 | } | 225 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index 291c9700be75..c73c4284160e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -11,7 +11,7 @@ | |||
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/module.h> | 14 | #include <linux/export.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/freezer.h> | 28 | #include <linux/freezer.h> |
29 | #include <linux/pid_namespace.h> | 29 | #include <linux/pid_namespace.h> |
30 | #include <linux/nsproxy.h> | 30 | #include <linux/nsproxy.h> |
31 | #include <linux/user_namespace.h> | ||
31 | #define CREATE_TRACE_POINTS | 32 | #define CREATE_TRACE_POINTS |
32 | #include <trace/events/signal.h> | 33 | #include <trace/events/signal.h> |
33 | 34 | ||
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) | |||
1019 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | 1020 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); |
1020 | } | 1021 | } |
1021 | 1022 | ||
1023 | /* | ||
1024 | * map the uid in struct cred into user namespace *ns | ||
1025 | */ | ||
1026 | static inline uid_t map_cred_ns(const struct cred *cred, | ||
1027 | struct user_namespace *ns) | ||
1028 | { | ||
1029 | return user_ns_map_uid(ns, cred, cred->uid); | ||
1030 | } | ||
1031 | |||
1032 | #ifdef CONFIG_USER_NS | ||
1033 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | ||
1034 | { | ||
1035 | if (current_user_ns() == task_cred_xxx(t, user_ns)) | ||
1036 | return; | ||
1037 | |||
1038 | if (SI_FROMKERNEL(info)) | ||
1039 | return; | ||
1040 | |||
1041 | info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), | ||
1042 | current_cred(), info->si_uid); | ||
1043 | } | ||
1044 | #else | ||
1045 | static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) | ||
1046 | { | ||
1047 | return; | ||
1048 | } | ||
1049 | #endif | ||
1050 | |||
1022 | static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | 1051 | static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, |
1023 | int group, int from_ancestor_ns) | 1052 | int group, int from_ancestor_ns) |
1024 | { | 1053 | { |
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
1088 | q->info.si_pid = 0; | 1117 | q->info.si_pid = 0; |
1089 | break; | 1118 | break; |
1090 | } | 1119 | } |
1120 | |||
1121 | userns_fixup_signal_uid(&q->info, t); | ||
1122 | |||
1091 | } else if (!is_si_special(info)) { | 1123 | } else if (!is_si_special(info)) { |
1092 | if (sig >= SIGRTMIN && info->si_code != SI_USER) { | 1124 | if (sig >= SIGRTMIN && info->si_code != SI_USER) { |
1093 | /* | 1125 | /* |
@@ -1344,13 +1376,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
1344 | return error; | 1376 | return error; |
1345 | } | 1377 | } |
1346 | 1378 | ||
1379 | static int kill_as_cred_perm(const struct cred *cred, | ||
1380 | struct task_struct *target) | ||
1381 | { | ||
1382 | const struct cred *pcred = __task_cred(target); | ||
1383 | if (cred->user_ns != pcred->user_ns) | ||
1384 | return 0; | ||
1385 | if (cred->euid != pcred->suid && cred->euid != pcred->uid && | ||
1386 | cred->uid != pcred->suid && cred->uid != pcred->uid) | ||
1387 | return 0; | ||
1388 | return 1; | ||
1389 | } | ||
1390 | |||
1347 | /* like kill_pid_info(), but doesn't use uid/euid of "current" */ | 1391 | /* like kill_pid_info(), but doesn't use uid/euid of "current" */ |
1348 | int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, | 1392 | int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, |
1349 | uid_t uid, uid_t euid, u32 secid) | 1393 | const struct cred *cred, u32 secid) |
1350 | { | 1394 | { |
1351 | int ret = -EINVAL; | 1395 | int ret = -EINVAL; |
1352 | struct task_struct *p; | 1396 | struct task_struct *p; |
1353 | const struct cred *pcred; | ||
1354 | unsigned long flags; | 1397 | unsigned long flags; |
1355 | 1398 | ||
1356 | if (!valid_signal(sig)) | 1399 | if (!valid_signal(sig)) |
@@ -1362,10 +1405,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, | |||
1362 | ret = -ESRCH; | 1405 | ret = -ESRCH; |
1363 | goto out_unlock; | 1406 | goto out_unlock; |
1364 | } | 1407 | } |
1365 | pcred = __task_cred(p); | 1408 | if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { |
1366 | if (si_fromuser(info) && | ||
1367 | euid != pcred->suid && euid != pcred->uid && | ||
1368 | uid != pcred->suid && uid != pcred->uid) { | ||
1369 | ret = -EPERM; | 1409 | ret = -EPERM; |
1370 | goto out_unlock; | 1410 | goto out_unlock; |
1371 | } | 1411 | } |
@@ -1384,7 +1424,7 @@ out_unlock: | |||
1384 | rcu_read_unlock(); | 1424 | rcu_read_unlock(); |
1385 | return ret; | 1425 | return ret; |
1386 | } | 1426 | } |
1387 | EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); | 1427 | EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); |
1388 | 1428 | ||
1389 | /* | 1429 | /* |
1390 | * kill_something_info() interprets pid in interesting ways just like kill(2). | 1430 | * kill_something_info() interprets pid in interesting ways just like kill(2). |
@@ -1618,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1618 | */ | 1658 | */ |
1619 | rcu_read_lock(); | 1659 | rcu_read_lock(); |
1620 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); | 1660 | info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); |
1621 | info.si_uid = __task_cred(tsk)->uid; | 1661 | info.si_uid = map_cred_ns(__task_cred(tsk), |
1662 | task_cred_xxx(tsk->parent, user_ns)); | ||
1622 | rcu_read_unlock(); | 1663 | rcu_read_unlock(); |
1623 | 1664 | ||
1624 | info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, | 1665 | info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); |
1625 | tsk->signal->utime)); | 1666 | info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); |
1626 | info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, | ||
1627 | tsk->signal->stime)); | ||
1628 | 1667 | ||
1629 | info.si_status = tsk->exit_code & 0x7f; | 1668 | info.si_status = tsk->exit_code & 0x7f; |
1630 | if (tsk->exit_code & 0x80) | 1669 | if (tsk->exit_code & 0x80) |
@@ -1703,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1703 | */ | 1742 | */ |
1704 | rcu_read_lock(); | 1743 | rcu_read_lock(); |
1705 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); | 1744 | info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); |
1706 | info.si_uid = __task_cred(tsk)->uid; | 1745 | info.si_uid = map_cred_ns(__task_cred(tsk), |
1746 | task_cred_xxx(parent, user_ns)); | ||
1707 | rcu_read_unlock(); | 1747 | rcu_read_unlock(); |
1708 | 1748 | ||
1709 | info.si_utime = cputime_to_clock_t(tsk->utime); | 1749 | info.si_utime = cputime_to_clock_t(tsk->utime); |
@@ -1986,8 +2026,6 @@ static bool do_signal_stop(int signr) | |||
1986 | */ | 2026 | */ |
1987 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) | 2027 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) |
1988 | sig->group_exit_code = signr; | 2028 | sig->group_exit_code = signr; |
1989 | else | ||
1990 | WARN_ON_ONCE(!current->ptrace); | ||
1991 | 2029 | ||
1992 | sig->group_stop_count = 0; | 2030 | sig->group_stop_count = 0; |
1993 | 2031 | ||
@@ -2121,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, | |||
2121 | info->si_signo = signr; | 2159 | info->si_signo = signr; |
2122 | info->si_errno = 0; | 2160 | info->si_errno = 0; |
2123 | info->si_code = SI_USER; | 2161 | info->si_code = SI_USER; |
2162 | rcu_read_lock(); | ||
2124 | info->si_pid = task_pid_vnr(current->parent); | 2163 | info->si_pid = task_pid_vnr(current->parent); |
2125 | info->si_uid = task_uid(current->parent); | 2164 | info->si_uid = map_cred_ns(__task_cred(current->parent), |
2165 | current_user_ns()); | ||
2166 | rcu_read_unlock(); | ||
2126 | } | 2167 | } |
2127 | 2168 | ||
2128 | /* If the (new) signal is now blocked, requeue it. */ | 2169 | /* If the (new) signal is now blocked, requeue it. */ |
@@ -2314,6 +2355,27 @@ relock: | |||
2314 | return signr; | 2355 | return signr; |
2315 | } | 2356 | } |
2316 | 2357 | ||
2358 | /** | ||
2359 | * block_sigmask - add @ka's signal mask to current->blocked | ||
2360 | * @ka: action for @signr | ||
2361 | * @signr: signal that has been successfully delivered | ||
2362 | * | ||
2363 | * This function should be called when a signal has successfully been | ||
2364 | * delivered. It adds the mask of signals for @ka to current->blocked | ||
2365 | * so that they are blocked during the execution of the signal | ||
2366 | * handler. In addition, @signr will be blocked unless %SA_NODEFER is | ||
2367 | * set in @ka->sa.sa_flags. | ||
2368 | */ | ||
2369 | void block_sigmask(struct k_sigaction *ka, int signr) | ||
2370 | { | ||
2371 | sigset_t blocked; | ||
2372 | |||
2373 | sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); | ||
2374 | if (!(ka->sa.sa_flags & SA_NODEFER)) | ||
2375 | sigaddset(&blocked, signr); | ||
2376 | set_current_blocked(&blocked); | ||
2377 | } | ||
2378 | |||
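block_sigmask() centralizes behaviour the architecture signal-delivery paths already implement, and the effect it encodes is observable from userspace. A small check (illustration only) that the delivered signal is blocked inside its own handler unless SA_NODEFER is set:

#include <signal.h>
#include <unistd.h>

static void handler(int sig)
{
        sigset_t cur;
        char msg[] = "SIGUSR1 blocked in handler: ?\n";

        (void)sig;
        sigprocmask(SIG_BLOCK, NULL, &cur);
        msg[sizeof(msg) - 3] = sigismember(&cur, SIGUSR1) ? '1' : '0';
        write(STDOUT_FILENO, msg, sizeof(msg) - 1);
}

int main(void)
{
        struct sigaction sa = { .sa_handler = handler };

        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0;                /* flip to SA_NODEFER and this prints 0 */
        sigaction(SIGUSR1, &sa, NULL);
        raise(SIGUSR1);
        return 0;
}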
2317 | /* | 2379 | /* |
2318 | * It could be that complete_signal() picked us to notify about the | 2380 | * It could be that complete_signal() picked us to notify about the |
2319 | * group-wide signal. Other threads should be notified now to take | 2381 | * group-wide signal. Other threads should be notified now to take |
@@ -2351,8 +2413,15 @@ void exit_signals(struct task_struct *tsk) | |||
2351 | int group_stop = 0; | 2413 | int group_stop = 0; |
2352 | sigset_t unblocked; | 2414 | sigset_t unblocked; |
2353 | 2415 | ||
2416 | /* | ||
2417 | * @tsk is about to have PF_EXITING set - lock out users which | ||
2418 | * expect stable threadgroup. | ||
2419 | */ | ||
2420 | threadgroup_change_begin(tsk); | ||
2421 | |||
2354 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { | 2422 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { |
2355 | tsk->flags |= PF_EXITING; | 2423 | tsk->flags |= PF_EXITING; |
2424 | threadgroup_change_end(tsk); | ||
2356 | return; | 2425 | return; |
2357 | } | 2426 | } |
2358 | 2427 | ||
@@ -2362,6 +2431,9 @@ void exit_signals(struct task_struct *tsk) | |||
2362 | * see wants_signal(), do_signal_stop(). | 2431 | * see wants_signal(), do_signal_stop(). |
2363 | */ | 2432 | */ |
2364 | tsk->flags |= PF_EXITING; | 2433 | tsk->flags |= PF_EXITING; |
2434 | |||
2435 | threadgroup_change_end(tsk); | ||
2436 | |||
2365 | if (!signal_pending(tsk)) | 2437 | if (!signal_pending(tsk)) |
2366 | goto out; | 2438 | goto out; |
2367 | 2439 | ||
diff --git a/kernel/smp.c b/kernel/smp.c index fb67dfa8394e..db197d60489b 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
@@ -6,7 +6,7 @@ | |||
6 | #include <linux/rcupdate.h> | 6 | #include <linux/rcupdate.h> |
7 | #include <linux/rculist.h> | 7 | #include <linux/rculist.h> |
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/module.h> | 9 | #include <linux/export.h> |
10 | #include <linux/percpu.h> | 10 | #include <linux/percpu.h> |
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
diff --git a/kernel/softirq.c b/kernel/softirq.c index fca82c32042b..4eb3a0fa351e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * Remote softirq infrastructure is by Jens Axboe. | 10 | * Remote softirq infrastructure is by Jens Axboe. |
11 | */ | 11 | */ |
12 | 12 | ||
13 | #include <linux/module.h> | 13 | #include <linux/export.h> |
14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
15 | #include <linux/interrupt.h> | 15 | #include <linux/interrupt.h> |
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
@@ -347,12 +347,12 @@ void irq_exit(void) | |||
347 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
348 | invoke_softirq(); | 348 | invoke_softirq(); |
349 | 349 | ||
350 | rcu_irq_exit(); | ||
351 | #ifdef CONFIG_NO_HZ | 350 | #ifdef CONFIG_NO_HZ |
352 | /* Make sure that timer wheel updates are propagated */ | 351 | /* Make sure that timer wheel updates are propagated */ |
353 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) | 352 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) |
354 | tick_nohz_stop_sched_tick(0); | 353 | tick_nohz_irq_exit(); |
355 | #endif | 354 | #endif |
355 | rcu_irq_exit(); | ||
356 | preempt_enable_no_resched(); | 356 | preempt_enable_no_resched(); |
357 | } | 357 | } |
358 | 358 | ||
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index be6517fb9c14..84c7d96918bf 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -19,7 +19,7 @@ | |||
19 | #include <linux/spinlock.h> | 19 | #include <linux/spinlock.h> |
20 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
21 | #include <linux/debug_locks.h> | 21 | #include <linux/debug_locks.h> |
22 | #include <linux/module.h> | 22 | #include <linux/export.h> |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * If lockdep is enabled then we use the non-preemption spin-ops | 25 | * If lockdep is enabled then we use the non-preemption spin-ops |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 73ce23feaea9..0febf61e1aa3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
@@ -24,7 +24,7 @@ | |||
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/module.h> | 27 | #include <linux/export.h> |
28 | #include <linux/mutex.h> | 28 | #include <linux/mutex.h> |
29 | #include <linux/percpu.h> | 29 | #include <linux/percpu.h> |
30 | #include <linux/preempt.h> | 30 | #include <linux/preempt.h> |
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index d20c6983aad9..00fe55cc5a82 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
@@ -7,7 +7,7 @@ | |||
7 | */ | 7 | */ |
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
10 | #include <linux/module.h> | 10 | #include <linux/export.h> |
11 | #include <linux/kallsyms.h> | 11 | #include <linux/kallsyms.h> |
12 | #include <linux/stacktrace.h> | 12 | #include <linux/stacktrace.h> |
13 | 13 | ||
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ba5070ce5765..2f194e965715 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/cpu.h> | 12 | #include <linux/cpu.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
15 | #include <linux/module.h> | 15 | #include <linux/export.h> |
16 | #include <linux/percpu.h> | 16 | #include <linux/percpu.h> |
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | #include <linux/stop_machine.h> | 18 | #include <linux/stop_machine.h> |
@@ -41,6 +41,7 @@ struct cpu_stopper { | |||
41 | }; | 41 | }; |
42 | 42 | ||
43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); | 43 | static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); |
44 | static bool stop_machine_initialized = false; | ||
44 | 45 | ||
45 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) | 46 | static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) |
46 | { | 47 | { |
@@ -386,6 +387,8 @@ static int __init cpu_stop_init(void) | |||
386 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); | 387 | cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); |
387 | register_cpu_notifier(&cpu_stop_cpu_notifier); | 388 | register_cpu_notifier(&cpu_stop_cpu_notifier); |
388 | 389 | ||
390 | stop_machine_initialized = true; | ||
391 | |||
389 | return 0; | 392 | return 0; |
390 | } | 393 | } |
391 | early_initcall(cpu_stop_init); | 394 | early_initcall(cpu_stop_init); |
@@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
485 | .num_threads = num_online_cpus(), | 488 | .num_threads = num_online_cpus(), |
486 | .active_cpus = cpus }; | 489 | .active_cpus = cpus }; |
487 | 490 | ||
491 | if (!stop_machine_initialized) { | ||
492 | /* | ||
493 | * Handle the case where stop_machine() is called | ||
494 | * early in boot, before the stop_machine | ||
495 | * infrastructure has been initialized. | ||
496 | */ | ||
497 | unsigned long flags; | ||
498 | int ret; | ||
499 | |||
500 | WARN_ON_ONCE(smdata.num_threads != 1); | ||
501 | |||
502 | local_irq_save(flags); | ||
503 | hard_irq_disable(); | ||
504 | ret = (*fn)(data); | ||
505 | local_irq_restore(flags); | ||
506 | |||
507 | return ret; | ||
508 | } | ||
509 | |||
488 | /* Set the initial state and stop all online cpus. */ | 510 | /* Set the initial state and stop all online cpus. */ |
489 | set_state(&smdata, STOPMACHINE_PREPARE); | 511 | set_state(&smdata, STOPMACHINE_PREPARE); |
490 | return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); | 512 | return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); |
diff --git a/kernel/sys.c b/kernel/sys.c index 1dbbe695a5ef..40701538fbd1 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/module.h> | 7 | #include <linux/export.h> |
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/utsname.h> | 9 | #include <linux/utsname.h> |
10 | #include <linux/mman.h> | 10 | #include <linux/mman.h> |
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/prctl.h> | 12 | #include <linux/prctl.h> |
13 | #include <linux/highuid.h> | 13 | #include <linux/highuid.h> |
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | #include <linux/kmod.h> | ||
15 | #include <linux/perf_event.h> | 16 | #include <linux/perf_event.h> |
16 | #include <linux/resource.h> | 17 | #include <linux/resource.h> |
17 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
@@ -1286,6 +1287,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) | |||
1286 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); | 1287 | memset(u->nodename + len, 0, sizeof(u->nodename) - len); |
1287 | errno = 0; | 1288 | errno = 0; |
1288 | } | 1289 | } |
1290 | uts_proc_notify(UTS_PROC_HOSTNAME); | ||
1289 | up_write(&uts_sem); | 1291 | up_write(&uts_sem); |
1290 | return errno; | 1292 | return errno; |
1291 | } | 1293 | } |
@@ -1336,6 +1338,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) | |||
1336 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); | 1338 | memset(u->domainname + len, 0, sizeof(u->domainname) - len); |
1337 | errno = 0; | 1339 | errno = 0; |
1338 | } | 1340 | } |
1341 | uts_proc_notify(UTS_PROC_DOMAINNAME); | ||
1339 | up_write(&uts_sem); | 1342 | up_write(&uts_sem); |
1340 | return errno; | 1343 | return errno; |
1341 | } | 1344 | } |
@@ -1602,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1602 | unsigned long maxrss = 0; | 1605 | unsigned long maxrss = 0; |
1603 | 1606 | ||
1604 | memset((char *) r, 0, sizeof *r); | 1607 | memset((char *) r, 0, sizeof *r); |
1605 | utime = stime = cputime_zero; | 1608 | utime = stime = 0; |
1606 | 1609 | ||
1607 | if (who == RUSAGE_THREAD) { | 1610 | if (who == RUSAGE_THREAD) { |
1608 | task_times(current, &utime, &stime); | 1611 | task_times(current, &utime, &stime); |
@@ -1632,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1632 | 1635 | ||
1633 | case RUSAGE_SELF: | 1636 | case RUSAGE_SELF: |
1634 | thread_group_times(p, &tgutime, &tgstime); | 1637 | thread_group_times(p, &tgutime, &tgstime); |
1635 | utime = cputime_add(utime, tgutime); | 1638 | utime += tgutime; |
1636 | stime = cputime_add(stime, tgstime); | 1639 | stime += tgstime; |
1637 | r->ru_nvcsw += p->signal->nvcsw; | 1640 | r->ru_nvcsw += p->signal->nvcsw; |
1638 | r->ru_nivcsw += p->signal->nivcsw; | 1641 | r->ru_nivcsw += p->signal->nivcsw; |
1639 | r->ru_minflt += p->signal->min_flt; | 1642 | r->ru_minflt += p->signal->min_flt; |
@@ -1689,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1689 | return mask; | 1692 | return mask; |
1690 | } | 1693 | } |
1691 | 1694 | ||
1695 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1696 | static int prctl_set_mm(int opt, unsigned long addr, | ||
1697 | unsigned long arg4, unsigned long arg5) | ||
1698 | { | ||
1699 | unsigned long rlim = rlimit(RLIMIT_DATA); | ||
1700 | unsigned long vm_req_flags; | ||
1701 | unsigned long vm_bad_flags; | ||
1702 | struct vm_area_struct *vma; | ||
1703 | int error = 0; | ||
1704 | struct mm_struct *mm = current->mm; | ||
1705 | |||
1706 | if (arg4 | arg5) | ||
1707 | return -EINVAL; | ||
1708 | |||
1709 | if (!capable(CAP_SYS_ADMIN)) | ||
1710 | return -EPERM; | ||
1711 | |||
1712 | if (addr >= TASK_SIZE) | ||
1713 | return -EINVAL; | ||
1714 | |||
1715 | down_read(&mm->mmap_sem); | ||
1716 | vma = find_vma(mm, addr); | ||
1717 | |||
1718 | if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { | ||
1719 | /* It must be existing VMA */ | ||
1720 | if (!vma || vma->vm_start > addr) | ||
1721 | goto out; | ||
1722 | } | ||
1723 | |||
1724 | error = -EINVAL; | ||
1725 | switch (opt) { | ||
1726 | case PR_SET_MM_START_CODE: | ||
1727 | case PR_SET_MM_END_CODE: | ||
1728 | vm_req_flags = VM_READ | VM_EXEC; | ||
1729 | vm_bad_flags = VM_WRITE | VM_MAYSHARE; | ||
1730 | |||
1731 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1732 | (vma->vm_flags & vm_bad_flags)) | ||
1733 | goto out; | ||
1734 | |||
1735 | if (opt == PR_SET_MM_START_CODE) | ||
1736 | mm->start_code = addr; | ||
1737 | else | ||
1738 | mm->end_code = addr; | ||
1739 | break; | ||
1740 | |||
1741 | case PR_SET_MM_START_DATA: | ||
1742 | case PR_SET_MM_END_DATA: | ||
1743 | vm_req_flags = VM_READ | VM_WRITE; | ||
1744 | vm_bad_flags = VM_EXEC | VM_MAYSHARE; | ||
1745 | |||
1746 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags || | ||
1747 | (vma->vm_flags & vm_bad_flags)) | ||
1748 | goto out; | ||
1749 | |||
1750 | if (opt == PR_SET_MM_START_DATA) | ||
1751 | mm->start_data = addr; | ||
1752 | else | ||
1753 | mm->end_data = addr; | ||
1754 | break; | ||
1755 | |||
1756 | case PR_SET_MM_START_STACK: | ||
1757 | |||
1758 | #ifdef CONFIG_STACK_GROWSUP | ||
1759 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; | ||
1760 | #else | ||
1761 | vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; | ||
1762 | #endif | ||
1763 | if ((vma->vm_flags & vm_req_flags) != vm_req_flags) | ||
1764 | goto out; | ||
1765 | |||
1766 | mm->start_stack = addr; | ||
1767 | break; | ||
1768 | |||
1769 | case PR_SET_MM_START_BRK: | ||
1770 | if (addr <= mm->end_data) | ||
1771 | goto out; | ||
1772 | |||
1773 | if (rlim < RLIM_INFINITY && | ||
1774 | (mm->brk - addr) + | ||
1775 | (mm->end_data - mm->start_data) > rlim) | ||
1776 | goto out; | ||
1777 | |||
1778 | mm->start_brk = addr; | ||
1779 | break; | ||
1780 | |||
1781 | case PR_SET_MM_BRK: | ||
1782 | if (addr <= mm->end_data) | ||
1783 | goto out; | ||
1784 | |||
1785 | if (rlim < RLIM_INFINITY && | ||
1786 | (addr - mm->start_brk) + | ||
1787 | (mm->end_data - mm->start_data) > rlim) | ||
1788 | goto out; | ||
1789 | |||
1790 | mm->brk = addr; | ||
1791 | break; | ||
1792 | |||
1793 | default: | ||
1794 | error = -EINVAL; | ||
1795 | goto out; | ||
1796 | } | ||
1797 | |||
1798 | error = 0; | ||
1799 | |||
1800 | out: | ||
1801 | up_read(&mm->mmap_sem); | ||
1802 | |||
1803 | return error; | ||
1804 | } | ||
1805 | #else /* CONFIG_CHECKPOINT_RESTORE */ | ||
1806 | static int prctl_set_mm(int opt, unsigned long addr, | ||
1807 | unsigned long arg4, unsigned long arg5) | ||
1808 | { | ||
1809 | return -EINVAL; | ||
1810 | } | ||
1811 | #endif | ||
1812 | |||
1692 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | 1813 | SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, |
1693 | unsigned long, arg4, unsigned long, arg5) | 1814 | unsigned long, arg4, unsigned long, arg5) |
1694 | { | 1815 | { |
@@ -1759,6 +1880,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1759 | sizeof(me->comm) - 1) < 0) | 1880 | sizeof(me->comm) - 1) < 0) |
1760 | return -EFAULT; | 1881 | return -EFAULT; |
1761 | set_task_comm(me, comm); | 1882 | set_task_comm(me, comm); |
1883 | proc_comm_connector(me); | ||
1762 | return 0; | 1884 | return 0; |
1763 | case PR_GET_NAME: | 1885 | case PR_GET_NAME: |
1764 | get_task_comm(comm, me); | 1886 | get_task_comm(comm, me); |
@@ -1837,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1837 | else | 1959 | else |
1838 | error = PR_MCE_KILL_DEFAULT; | 1960 | error = PR_MCE_KILL_DEFAULT; |
1839 | break; | 1961 | break; |
1962 | case PR_SET_MM: | ||
1963 | error = prctl_set_mm(arg2, arg3, arg4, arg5); | ||
1964 | break; | ||
1840 | default: | 1965 | default: |
1841 | error = -EINVAL; | 1966 | error = -EINVAL; |
1842 | break; | 1967 | break; |
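The PR_SET_MM prctl added above is only built with CONFIG_CHECKPOINT_RESTORE and requires CAP_SYS_ADMIN. A minimal userspace sketch (not part of the patch) of driving it follows; the fallback constants are assumptions taken from the uapi <linux/prctl.h> of this era, so verify them against your tree.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MM
#define PR_SET_MM		35	/* assumed values; check <linux/prctl.h> */
#define PR_SET_MM_START_BRK	6
#define PR_SET_MM_BRK		7
#endif

int main(void)
{
	/* Hypothetical address; a real restorer would use a checkpointed value. */
	unsigned long new_brk = 0x0804c000UL;

	/* Fails with EPERM without CAP_SYS_ADMIN, or EINVAL when the address
	 * is at or below mm->end_data or the RLIMIT_DATA check in
	 * prctl_set_mm() rejects it. arg4 and arg5 must be zero. */
	if (prctl(PR_SET_MM, PR_SET_MM_BRK, new_brk, 0, 0))
		perror("prctl(PR_SET_MM_BRK)");
	return 0;
}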
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a9a5de07c4f1..47bfa16430d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -145,6 +145,10 @@ cond_syscall(sys_io_submit); | |||
145 | cond_syscall(sys_io_cancel); | 145 | cond_syscall(sys_io_cancel); |
146 | cond_syscall(sys_io_getevents); | 146 | cond_syscall(sys_io_getevents); |
147 | cond_syscall(sys_syslog); | 147 | cond_syscall(sys_syslog); |
148 | cond_syscall(sys_process_vm_readv); | ||
149 | cond_syscall(sys_process_vm_writev); | ||
150 | cond_syscall(compat_sys_process_vm_readv); | ||
151 | cond_syscall(compat_sys_process_vm_writev); | ||
148 | 152 | ||
149 | /* arch-specific weak syscall entries */ | 153 | /* arch-specific weak syscall entries */ |
150 | cond_syscall(sys_pciconfig_read); | 154 | cond_syscall(sys_pciconfig_read); |
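The new cond_syscall() stubs above make process_vm_readv()/process_vm_writev() return -ENOSYS on kernels built without the cross-memory-attach code. A small userspace sketch (not part of the patch), assuming a libc that already ships the process_vm_readv() wrapper; otherwise the raw syscall() form would be needed:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char src[32] = "hello from the source buffer";
	char dst[32] = { 0 };
	struct iovec local  = { .iov_base = dst, .iov_len = sizeof(dst) };
	struct iovec remote = { .iov_base = src, .iov_len = sizeof(src) };

	/* Reading from our own pid just demonstrates the call; it fails
	 * with ENOSYS where only the cond_syscall() stubs are present. */
	ssize_t n = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("copied %zd bytes: %s\n", n, dst);
	return 0;
}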
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..f487f257e05e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/pipe_fs_i.h> | 57 | #include <linux/pipe_fs_i.h> |
58 | #include <linux/oom.h> | 58 | #include <linux/oom.h> |
59 | #include <linux/kmod.h> | 59 | #include <linux/kmod.h> |
60 | #include <linux/capability.h> | ||
60 | 61 | ||
61 | #include <asm/uaccess.h> | 62 | #include <asm/uaccess.h> |
62 | #include <asm/processor.h> | 63 | #include <asm/processor.h> |
@@ -134,6 +135,7 @@ static int minolduid; | |||
134 | static int min_percpu_pagelist_fract = 8; | 135 | static int min_percpu_pagelist_fract = 8; |
135 | 136 | ||
136 | static int ngroups_max = NGROUPS_MAX; | 137 | static int ngroups_max = NGROUPS_MAX; |
138 | static const int cap_last_cap = CAP_LAST_CAP; | ||
137 | 139 | ||
138 | #ifdef CONFIG_INOTIFY_USER | 140 | #ifdef CONFIG_INOTIFY_USER |
139 | #include <linux/inotify.h> | 141 | #include <linux/inotify.h> |
@@ -151,14 +153,6 @@ extern int pwrsw_enabled; | |||
151 | extern int unaligned_enabled; | 153 | extern int unaligned_enabled; |
152 | #endif | 154 | #endif |
153 | 155 | ||
154 | #ifdef CONFIG_S390 | ||
155 | #ifdef CONFIG_MATHEMU | ||
156 | extern int sysctl_ieee_emulation_warnings; | ||
157 | #endif | ||
158 | extern int sysctl_userprocess_debug; | ||
159 | extern int spin_retry; | ||
160 | #endif | ||
161 | |||
162 | #ifdef CONFIG_IA64 | 156 | #ifdef CONFIG_IA64 |
163 | extern int no_unaligned_warning; | 157 | extern int no_unaligned_warning; |
164 | extern int unaligned_dump_stack; | 158 | extern int unaligned_dump_stack; |
@@ -379,6 +373,16 @@ static struct ctl_table kern_table[] = { | |||
379 | .extra2 = &one, | 373 | .extra2 = &one, |
380 | }, | 374 | }, |
381 | #endif | 375 | #endif |
376 | #ifdef CONFIG_CFS_BANDWIDTH | ||
377 | { | ||
378 | .procname = "sched_cfs_bandwidth_slice_us", | ||
379 | .data = &sysctl_sched_cfs_bandwidth_slice, | ||
380 | .maxlen = sizeof(unsigned int), | ||
381 | .mode = 0644, | ||
382 | .proc_handler = proc_dointvec_minmax, | ||
383 | .extra1 = &one, | ||
384 | }, | ||
385 | #endif | ||
382 | #ifdef CONFIG_PROVE_LOCKING | 386 | #ifdef CONFIG_PROVE_LOCKING |
383 | { | 387 | { |
384 | .procname = "prove_locking", | 388 | .procname = "prove_locking", |
@@ -730,6 +734,13 @@ static struct ctl_table kern_table[] = { | |||
730 | .mode = 0444, | 734 | .mode = 0444, |
731 | .proc_handler = proc_dointvec, | 735 | .proc_handler = proc_dointvec, |
732 | }, | 736 | }, |
737 | { | ||
738 | .procname = "cap_last_cap", | ||
739 | .data = (void *)&cap_last_cap, | ||
740 | .maxlen = sizeof(int), | ||
741 | .mode = 0444, | ||
742 | .proc_handler = proc_dointvec, | ||
743 | }, | ||
733 | #if defined(CONFIG_LOCKUP_DETECTOR) | 744 | #if defined(CONFIG_LOCKUP_DETECTOR) |
734 | { | 745 | { |
735 | .procname = "watchdog", | 746 | .procname = "watchdog", |
@@ -792,6 +803,15 @@ static struct ctl_table kern_table[] = { | |||
792 | .mode = 0644, | 803 | .mode = 0644, |
793 | .proc_handler = proc_dointvec, | 804 | .proc_handler = proc_dointvec, |
794 | }, | 805 | }, |
806 | #ifdef CONFIG_DEBUG_STACKOVERFLOW | ||
807 | { | ||
808 | .procname = "panic_on_stackoverflow", | ||
809 | .data = &sysctl_panic_on_stackoverflow, | ||
810 | .maxlen = sizeof(int), | ||
811 | .mode = 0644, | ||
812 | .proc_handler = proc_dointvec, | ||
813 | }, | ||
814 | #endif | ||
795 | { | 815 | { |
796 | .procname = "bootloader_type", | 816 | .procname = "bootloader_type", |
797 | .data = &bootloader_type, | 817 | .data = &bootloader_type, |
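The read-only kernel.cap_last_cap entry added above exports CAP_LAST_CAP so userspace can size capability bitmaps without guessing the kernel version. A trivial reader (not part of the patch):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/cap_last_cap", "r");
	int last_cap;

	if (!f) {
		perror("cap_last_cap");
		return 1;
	}
	if (fscanf(f, "%d", &last_cap) == 1)
		printf("highest capability known to this kernel: %d\n", last_cap);
	fclose(f);
	return 0;
}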
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8bffbe2ba4b..a650694883a1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = { | |||
214 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, | 214 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, |
215 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, | 215 | { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, |
216 | { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, | 216 | { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, |
217 | { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, | 217 | /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ |
218 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, | 218 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, |
219 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, | 219 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, |
220 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, | 220 | { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, |
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, | |||
1354 | 1354 | ||
1355 | fput(file); | 1355 | fput(file); |
1356 | out_putname: | 1356 | out_putname: |
1357 | putname(pathname); | 1357 | __putname(pathname); |
1358 | out: | 1358 | out: |
1359 | return result; | 1359 | return result; |
1360 | } | 1360 | } |
diff --git a/kernel/time.c b/kernel/time.c index 8e8dc6d705c9..73e416db0a1e 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -27,7 +27,7 @@ | |||
27 | * with nanosecond accuracy | 27 | * with nanosecond accuracy |
28 | */ | 28 | */ |
29 | 29 | ||
30 | #include <linux/module.h> | 30 | #include <linux/export.h> |
31 | #include <linux/timex.h> | 31 | #include <linux/timex.h> |
32 | #include <linux/capability.h> | 32 | #include <linux/capability.h> |
33 | #include <linux/clocksource.h> | 33 | #include <linux/clocksource.h> |
@@ -575,7 +575,7 @@ EXPORT_SYMBOL(jiffies_to_timeval); | |||
575 | /* | 575 | /* |
576 | * Convert jiffies/jiffies_64 to clock_t and back. | 576 | * Convert jiffies/jiffies_64 to clock_t and back. |
577 | */ | 577 | */ |
578 | clock_t jiffies_to_clock_t(long x) | 578 | clock_t jiffies_to_clock_t(unsigned long x) |
579 | { | 579 | { |
580 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 | 580 | #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 |
581 | # if HZ < USER_HZ | 581 | # if HZ < USER_HZ |
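jiffies_to_clock_t() now takes an unsigned long, so very large jiffies counts are no longer interpreted as negative before the division. A standalone illustration of the arithmetic in the evenly-dividing HZ >= USER_HZ branch (the HZ and USER_HZ values here are invented for the example, not taken from the patch):

#include <stdio.h>

#define HZ      1000UL	/* example kernel tick rate */
#define USER_HZ  100UL	/* rate exposed to userspace as clock_t */

/* Same math as the HZ >= USER_HZ branch of jiffies_to_clock_t(). */
static unsigned long jiffies_to_clock_t_demo(unsigned long x)
{
	return x / (HZ / USER_HZ);
}

int main(void)
{
	/* Larger than LONG_MAX on 32-bit: a signed argument would go negative. */
	unsigned long j = 3210000000UL;

	printf("%lu jiffies -> %lu clock ticks\n", j, jiffies_to_clock_t_demo(j));
	return 0;
}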
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f06a8a365648..2cf9cc7aa103 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -25,5 +25,7 @@ config HIGH_RES_TIMERS | |||
25 | config GENERIC_CLOCKEVENTS_BUILD | 25 | config GENERIC_CLOCKEVENTS_BUILD |
26 | bool | 26 | bool |
27 | default y | 27 | default y |
28 | depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR | 28 | depends on GENERIC_CLOCKEVENTS |
29 | 29 | ||
30 | config GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
31 | bool | ||
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ea5e1a928d5b..8a46f5d64504 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -53,27 +53,6 @@ static struct rtc_device *rtcdev; | |||
53 | static DEFINE_SPINLOCK(rtcdev_lock); | 53 | static DEFINE_SPINLOCK(rtcdev_lock); |
54 | 54 | ||
55 | /** | 55 | /** |
56 | * has_wakealarm - check rtc device has wakealarm ability | ||
57 | * @dev: current device | ||
58 | * @name_ptr: name to be returned | ||
59 | * | ||
60 | * This helper function checks to see if the rtc device can wake | ||
61 | * from suspend. | ||
62 | */ | ||
63 | static int has_wakealarm(struct device *dev, void *name_ptr) | ||
64 | { | ||
65 | struct rtc_device *candidate = to_rtc_device(dev); | ||
66 | |||
67 | if (!candidate->ops->set_alarm) | ||
68 | return 0; | ||
69 | if (!device_may_wakeup(candidate->dev.parent)) | ||
70 | return 0; | ||
71 | |||
72 | *(const char **)name_ptr = dev_name(dev); | ||
73 | return 1; | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * alarmtimer_get_rtcdev - Return selected rtcdevice | 56 | * alarmtimer_get_rtcdev - Return selected rtcdevice |
78 | * | 57 | * |
79 | * This function returns the rtc device to use for wakealarms. | 58 | * This function returns the rtc device to use for wakealarms. |
@@ -82,37 +61,64 @@ static int has_wakealarm(struct device *dev, void *name_ptr) | |||
82 | */ | 61 | */ |
83 | static struct rtc_device *alarmtimer_get_rtcdev(void) | 62 | static struct rtc_device *alarmtimer_get_rtcdev(void) |
84 | { | 63 | { |
85 | struct device *dev; | ||
86 | char *str; | ||
87 | unsigned long flags; | 64 | unsigned long flags; |
88 | struct rtc_device *ret; | 65 | struct rtc_device *ret; |
89 | 66 | ||
90 | spin_lock_irqsave(&rtcdev_lock, flags); | 67 | spin_lock_irqsave(&rtcdev_lock, flags); |
91 | if (!rtcdev) { | ||
92 | /* Find an rtc device and init the rtc_timer */ | ||
93 | dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); | ||
94 | /* If we have a device then str is valid. See has_wakealarm() */ | ||
95 | if (dev) { | ||
96 | rtcdev = rtc_class_open(str); | ||
97 | /* | ||
98 | * Drop the reference we got in class_find_device, | ||
99 | * rtc_open takes its own. | ||
100 | */ | ||
101 | put_device(dev); | ||
102 | rtc_timer_init(&rtctimer, NULL, NULL); | ||
103 | } | ||
104 | } | ||
105 | ret = rtcdev; | 68 | ret = rtcdev; |
106 | spin_unlock_irqrestore(&rtcdev_lock, flags); | 69 | spin_unlock_irqrestore(&rtcdev_lock, flags); |
107 | 70 | ||
108 | return ret; | 71 | return ret; |
109 | } | 72 | } |
73 | |||
74 | |||
75 | static int alarmtimer_rtc_add_device(struct device *dev, | ||
76 | struct class_interface *class_intf) | ||
77 | { | ||
78 | unsigned long flags; | ||
79 | struct rtc_device *rtc = to_rtc_device(dev); | ||
80 | |||
81 | if (rtcdev) | ||
82 | return -EBUSY; | ||
83 | |||
84 | if (!rtc->ops->set_alarm) | ||
85 | return -1; | ||
86 | if (!device_may_wakeup(rtc->dev.parent)) | ||
87 | return -1; | ||
88 | |||
89 | spin_lock_irqsave(&rtcdev_lock, flags); | ||
90 | if (!rtcdev) { | ||
91 | rtcdev = rtc; | ||
92 | /* hold a reference so it doesn't go away */ | ||
93 | get_device(dev); | ||
94 | } | ||
95 | spin_unlock_irqrestore(&rtcdev_lock, flags); | ||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | static struct class_interface alarmtimer_rtc_interface = { | ||
100 | .add_dev = &alarmtimer_rtc_add_device, | ||
101 | }; | ||
102 | |||
103 | static int alarmtimer_rtc_interface_setup(void) | ||
104 | { | ||
105 | alarmtimer_rtc_interface.class = rtc_class; | ||
106 | return class_interface_register(&alarmtimer_rtc_interface); | ||
107 | } | ||
108 | static void alarmtimer_rtc_interface_remove(void) | ||
109 | { | ||
110 | class_interface_unregister(&alarmtimer_rtc_interface); | ||
111 | } | ||
110 | #else | 112 | #else |
111 | #define alarmtimer_get_rtcdev() (0) | 113 | static inline struct rtc_device *alarmtimer_get_rtcdev(void) |
112 | #define rtcdev (0) | 114 | { |
115 | return NULL; | ||
116 | } | ||
117 | #define rtcdev (NULL) | ||
118 | static inline int alarmtimer_rtc_interface_setup(void) { return 0; } | ||
119 | static inline void alarmtimer_rtc_interface_remove(void) { } | ||
113 | #endif | 120 | #endif |
114 | 121 | ||
115 | |||
116 | /** | 122 | /** |
117 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue | 123 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue |
118 | * @base: pointer to the base where the timer is being run | 124 | * @base: pointer to the base where the timer is being run |
@@ -126,6 +132,8 @@ static struct rtc_device *alarmtimer_get_rtcdev(void) | |||
126 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) | 132 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) |
127 | { | 133 | { |
128 | timerqueue_add(&base->timerqueue, &alarm->node); | 134 | timerqueue_add(&base->timerqueue, &alarm->node); |
135 | alarm->state |= ALARMTIMER_STATE_ENQUEUED; | ||
136 | |||
129 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { | 137 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { |
130 | hrtimer_try_to_cancel(&base->timer); | 138 | hrtimer_try_to_cancel(&base->timer); |
131 | hrtimer_start(&base->timer, alarm->node.expires, | 139 | hrtimer_start(&base->timer, alarm->node.expires, |
@@ -147,7 +155,12 @@ static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | |||
147 | { | 155 | { |
148 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); | 156 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); |
149 | 157 | ||
158 | if (!(alarm->state & ALARMTIMER_STATE_ENQUEUED)) | ||
159 | return; | ||
160 | |||
150 | timerqueue_del(&base->timerqueue, &alarm->node); | 161 | timerqueue_del(&base->timerqueue, &alarm->node); |
162 | alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; | ||
163 | |||
151 | if (next == &alarm->node) { | 164 | if (next == &alarm->node) { |
152 | hrtimer_try_to_cancel(&base->timer); | 165 | hrtimer_try_to_cancel(&base->timer); |
153 | next = timerqueue_getnext(&base->timerqueue); | 166 | next = timerqueue_getnext(&base->timerqueue); |
@@ -174,6 +187,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
174 | unsigned long flags; | 187 | unsigned long flags; |
175 | ktime_t now; | 188 | ktime_t now; |
176 | int ret = HRTIMER_NORESTART; | 189 | int ret = HRTIMER_NORESTART; |
190 | int restart = ALARMTIMER_NORESTART; | ||
177 | 191 | ||
178 | spin_lock_irqsave(&base->lock, flags); | 192 | spin_lock_irqsave(&base->lock, flags); |
179 | now = base->gettime(); | 193 | now = base->gettime(); |
@@ -181,23 +195,25 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | |||
181 | struct alarm *alarm; | 195 | struct alarm *alarm; |
182 | ktime_t expired = next->expires; | 196 | ktime_t expired = next->expires; |
183 | 197 | ||
184 | if (expired.tv64 >= now.tv64) | 198 | if (expired.tv64 > now.tv64) |
185 | break; | 199 | break; |
186 | 200 | ||
187 | alarm = container_of(next, struct alarm, node); | 201 | alarm = container_of(next, struct alarm, node); |
188 | 202 | ||
189 | timerqueue_del(&base->timerqueue, &alarm->node); | 203 | timerqueue_del(&base->timerqueue, &alarm->node); |
190 | alarm->enabled = 0; | 204 | alarm->state &= ~ALARMTIMER_STATE_ENQUEUED; |
191 | /* Re-add periodic timers */ | 205 | |
192 | if (alarm->period.tv64) { | 206 | alarm->state |= ALARMTIMER_STATE_CALLBACK; |
193 | alarm->node.expires = ktime_add(expired, alarm->period); | ||
194 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
195 | alarm->enabled = 1; | ||
196 | } | ||
197 | spin_unlock_irqrestore(&base->lock, flags); | 207 | spin_unlock_irqrestore(&base->lock, flags); |
198 | if (alarm->function) | 208 | if (alarm->function) |
199 | alarm->function(alarm); | 209 | restart = alarm->function(alarm, now); |
200 | spin_lock_irqsave(&base->lock, flags); | 210 | spin_lock_irqsave(&base->lock, flags); |
211 | alarm->state &= ~ALARMTIMER_STATE_CALLBACK; | ||
212 | |||
213 | if (restart != ALARMTIMER_NORESTART) { | ||
214 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
215 | alarm->state |= ALARMTIMER_STATE_ENQUEUED; | ||
216 | } | ||
201 | } | 217 | } |
202 | 218 | ||
203 | if (next) { | 219 | if (next) { |
@@ -234,7 +250,7 @@ static int alarmtimer_suspend(struct device *dev) | |||
234 | freezer_delta = ktime_set(0, 0); | 250 | freezer_delta = ktime_set(0, 0); |
235 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | 251 | spin_unlock_irqrestore(&freezer_delta_lock, flags); |
236 | 252 | ||
237 | rtc = rtcdev; | 253 | rtc = alarmtimer_get_rtcdev(); |
238 | /* If we have no rtcdev, just return */ | 254 | /* If we have no rtcdev, just return */ |
239 | if (!rtc) | 255 | if (!rtc) |
240 | return 0; | 256 | return 0; |
@@ -299,53 +315,111 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | |||
299 | * @function: callback that is run when the alarm fires | 315 | * @function: callback that is run when the alarm fires |
300 | */ | 316 | */ |
301 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | 317 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, |
302 | void (*function)(struct alarm *)) | 318 | enum alarmtimer_restart (*function)(struct alarm *, ktime_t)) |
303 | { | 319 | { |
304 | timerqueue_init(&alarm->node); | 320 | timerqueue_init(&alarm->node); |
305 | alarm->period = ktime_set(0, 0); | ||
306 | alarm->function = function; | 321 | alarm->function = function; |
307 | alarm->type = type; | 322 | alarm->type = type; |
308 | alarm->enabled = 0; | 323 | alarm->state = ALARMTIMER_STATE_INACTIVE; |
309 | } | 324 | } |
310 | 325 | ||
311 | /** | 326 | /** |
312 | * alarm_start - Sets an alarm to fire | 327 | * alarm_start - Sets an alarm to fire |
313 | * @alarm: ptr to alarm to set | 328 | * @alarm: ptr to alarm to set |
314 | * @start: time to run the alarm | 329 | * @start: time to run the alarm |
315 | * @period: period at which the alarm will recur | ||
316 | */ | 330 | */ |
317 | void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) | 331 | void alarm_start(struct alarm *alarm, ktime_t start) |
318 | { | 332 | { |
319 | struct alarm_base *base = &alarm_bases[alarm->type]; | 333 | struct alarm_base *base = &alarm_bases[alarm->type]; |
320 | unsigned long flags; | 334 | unsigned long flags; |
321 | 335 | ||
322 | spin_lock_irqsave(&base->lock, flags); | 336 | spin_lock_irqsave(&base->lock, flags); |
323 | if (alarm->enabled) | 337 | if (alarmtimer_active(alarm)) |
324 | alarmtimer_remove(base, alarm); | 338 | alarmtimer_remove(base, alarm); |
325 | alarm->node.expires = start; | 339 | alarm->node.expires = start; |
326 | alarm->period = period; | ||
327 | alarmtimer_enqueue(base, alarm); | 340 | alarmtimer_enqueue(base, alarm); |
328 | alarm->enabled = 1; | ||
329 | spin_unlock_irqrestore(&base->lock, flags); | 341 | spin_unlock_irqrestore(&base->lock, flags); |
330 | } | 342 | } |
331 | 343 | ||
332 | /** | 344 | /** |
333 | * alarm_cancel - Tries to cancel an alarm timer | 345 | * alarm_try_to_cancel - Tries to cancel an alarm timer |
334 | * @alarm: ptr to alarm to be canceled | 346 | * @alarm: ptr to alarm to be canceled |
347 | * | ||
348 | * Returns 1 if the timer was canceled, 0 if it was not running, | ||
349 | * and -1 if the callback was running | ||
335 | */ | 350 | */ |
336 | void alarm_cancel(struct alarm *alarm) | 351 | int alarm_try_to_cancel(struct alarm *alarm) |
337 | { | 352 | { |
338 | struct alarm_base *base = &alarm_bases[alarm->type]; | 353 | struct alarm_base *base = &alarm_bases[alarm->type]; |
339 | unsigned long flags; | 354 | unsigned long flags; |
340 | 355 | int ret = -1; | |
341 | spin_lock_irqsave(&base->lock, flags); | 356 | spin_lock_irqsave(&base->lock, flags); |
342 | if (alarm->enabled) | 357 | |
358 | if (alarmtimer_callback_running(alarm)) | ||
359 | goto out; | ||
360 | |||
361 | if (alarmtimer_is_queued(alarm)) { | ||
343 | alarmtimer_remove(base, alarm); | 362 | alarmtimer_remove(base, alarm); |
344 | alarm->enabled = 0; | 363 | ret = 1; |
364 | } else | ||
365 | ret = 0; | ||
366 | out: | ||
345 | spin_unlock_irqrestore(&base->lock, flags); | 367 | spin_unlock_irqrestore(&base->lock, flags); |
368 | return ret; | ||
369 | } | ||
370 | |||
371 | |||
372 | /** | ||
373 | * alarm_cancel - Spins trying to cancel an alarm timer until it is done | ||
374 | * @alarm: ptr to alarm to be canceled | ||
375 | * | ||
376 | * Returns 1 if the timer was canceled, 0 if it was not active. | ||
377 | */ | ||
378 | int alarm_cancel(struct alarm *alarm) | ||
379 | { | ||
380 | for (;;) { | ||
381 | int ret = alarm_try_to_cancel(alarm); | ||
382 | if (ret >= 0) | ||
383 | return ret; | ||
384 | cpu_relax(); | ||
385 | } | ||
386 | } | ||
387 | |||
388 | |||
389 | u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) | ||
390 | { | ||
391 | u64 overrun = 1; | ||
392 | ktime_t delta; | ||
393 | |||
394 | delta = ktime_sub(now, alarm->node.expires); | ||
395 | |||
396 | if (delta.tv64 < 0) | ||
397 | return 0; | ||
398 | |||
399 | if (unlikely(delta.tv64 >= interval.tv64)) { | ||
400 | s64 incr = ktime_to_ns(interval); | ||
401 | |||
402 | overrun = ktime_divns(delta, incr); | ||
403 | |||
404 | alarm->node.expires = ktime_add_ns(alarm->node.expires, | ||
405 | incr*overrun); | ||
406 | |||
407 | if (alarm->node.expires.tv64 > now.tv64) | ||
408 | return overrun; | ||
409 | /* | ||
410 | * This (and the ktime_add() below) is the | ||
411 | * correction for exact: | ||
412 | */ | ||
413 | overrun++; | ||
414 | } | ||
415 | |||
416 | alarm->node.expires = ktime_add(alarm->node.expires, interval); | ||
417 | return overrun; | ||
346 | } | 418 | } |
347 | 419 | ||
348 | 420 | ||
421 | |||
422 | |||
349 | /** | 423 | /** |
350 | * clock2alarm - helper that converts from clockid to alarmtypes | 424 | * clock2alarm - helper that converts from clockid to alarmtypes |
351 | * @clockid: clockid. | 425 | * @clockid: clockid. |
@@ -365,12 +439,21 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) | |||
365 | * | 439 | * |
366 | * Posix timer callback for expired alarm timers. | 440 | * Posix timer callback for expired alarm timers. |
367 | */ | 441 | */ |
368 | static void alarm_handle_timer(struct alarm *alarm) | 442 | static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, |
443 | ktime_t now) | ||
369 | { | 444 | { |
370 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | 445 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, |
371 | it.alarmtimer); | 446 | it.alarm.alarmtimer); |
372 | if (posix_timer_event(ptr, 0) != 0) | 447 | if (posix_timer_event(ptr, 0) != 0) |
373 | ptr->it_overrun++; | 448 | ptr->it_overrun++; |
449 | |||
450 | /* Re-add periodic timers */ | ||
451 | if (ptr->it.alarm.interval.tv64) { | ||
452 | ptr->it_overrun += alarm_forward(alarm, now, | ||
453 | ptr->it.alarm.interval); | ||
454 | return ALARMTIMER_RESTART; | ||
455 | } | ||
456 | return ALARMTIMER_NORESTART; | ||
374 | } | 457 | } |
375 | 458 | ||
376 | /** | 459 | /** |
@@ -427,7 +510,7 @@ static int alarm_timer_create(struct k_itimer *new_timer) | |||
427 | 510 | ||
428 | type = clock2alarm(new_timer->it_clock); | 511 | type = clock2alarm(new_timer->it_clock); |
429 | base = &alarm_bases[type]; | 512 | base = &alarm_bases[type]; |
430 | alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); | 513 | alarm_init(&new_timer->it.alarm.alarmtimer, type, alarm_handle_timer); |
431 | return 0; | 514 | return 0; |
432 | } | 515 | } |
433 | 516 | ||
@@ -444,9 +527,9 @@ static void alarm_timer_get(struct k_itimer *timr, | |||
444 | memset(cur_setting, 0, sizeof(struct itimerspec)); | 527 | memset(cur_setting, 0, sizeof(struct itimerspec)); |
445 | 528 | ||
446 | cur_setting->it_interval = | 529 | cur_setting->it_interval = |
447 | ktime_to_timespec(timr->it.alarmtimer.period); | 530 | ktime_to_timespec(timr->it.alarm.interval); |
448 | cur_setting->it_value = | 531 | cur_setting->it_value = |
449 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | 532 | ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires); |
450 | return; | 533 | return; |
451 | } | 534 | } |
452 | 535 | ||
@@ -461,7 +544,9 @@ static int alarm_timer_del(struct k_itimer *timr) | |||
461 | if (!rtcdev) | 544 | if (!rtcdev) |
462 | return -ENOTSUPP; | 545 | return -ENOTSUPP; |
463 | 546 | ||
464 | alarm_cancel(&timr->it.alarmtimer); | 547 | if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) |
548 | return TIMER_RETRY; | ||
549 | |||
465 | return 0; | 550 | return 0; |
466 | } | 551 | } |
467 | 552 | ||
@@ -481,25 +566,17 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
481 | if (!rtcdev) | 566 | if (!rtcdev) |
482 | return -ENOTSUPP; | 567 | return -ENOTSUPP; |
483 | 568 | ||
484 | /* | ||
485 | * XXX HACK! Currently we can DOS a system if the interval | ||
486 | * period on alarmtimers is too small. Cap the interval here | ||
487 | * to 100us and solve this properly in a future patch! -jstultz | ||
488 | */ | ||
489 | if ((new_setting->it_interval.tv_sec == 0) && | ||
490 | (new_setting->it_interval.tv_nsec < 100000)) | ||
491 | new_setting->it_interval.tv_nsec = 100000; | ||
492 | |||
493 | if (old_setting) | 569 | if (old_setting) |
494 | alarm_timer_get(timr, old_setting); | 570 | alarm_timer_get(timr, old_setting); |
495 | 571 | ||
496 | /* If the timer was already set, cancel it */ | 572 | /* If the timer was already set, cancel it */ |
497 | alarm_cancel(&timr->it.alarmtimer); | 573 | if (alarm_try_to_cancel(&timr->it.alarm.alarmtimer) < 0) |
574 | return TIMER_RETRY; | ||
498 | 575 | ||
499 | /* start the timer */ | 576 | /* start the timer */ |
500 | alarm_start(&timr->it.alarmtimer, | 577 | timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); |
501 | timespec_to_ktime(new_setting->it_value), | 578 | alarm_start(&timr->it.alarm.alarmtimer, |
502 | timespec_to_ktime(new_setting->it_interval)); | 579 | timespec_to_ktime(new_setting->it_value)); |
503 | return 0; | 580 | return 0; |
504 | } | 581 | } |
505 | 582 | ||
@@ -509,13 +586,15 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, | |||
509 | * | 586 | * |
510 | * Wakes up the task that set the alarmtimer | 587 | * Wakes up the task that set the alarmtimer |
511 | */ | 588 | */ |
512 | static void alarmtimer_nsleep_wakeup(struct alarm *alarm) | 589 | static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, |
590 | ktime_t now) | ||
513 | { | 591 | { |
514 | struct task_struct *task = (struct task_struct *)alarm->data; | 592 | struct task_struct *task = (struct task_struct *)alarm->data; |
515 | 593 | ||
516 | alarm->data = NULL; | 594 | alarm->data = NULL; |
517 | if (task) | 595 | if (task) |
518 | wake_up_process(task); | 596 | wake_up_process(task); |
597 | return ALARMTIMER_NORESTART; | ||
519 | } | 598 | } |
520 | 599 | ||
521 | /** | 600 | /** |
@@ -530,7 +609,7 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | |||
530 | alarm->data = (void *)current; | 609 | alarm->data = (void *)current; |
531 | do { | 610 | do { |
532 | set_current_state(TASK_INTERRUPTIBLE); | 611 | set_current_state(TASK_INTERRUPTIBLE); |
533 | alarm_start(alarm, absexp, ktime_set(0, 0)); | 612 | alarm_start(alarm, absexp); |
534 | if (likely(alarm->data)) | 613 | if (likely(alarm->data)) |
535 | schedule(); | 614 | schedule(); |
536 | 615 | ||
@@ -691,6 +770,7 @@ static struct platform_driver alarmtimer_driver = { | |||
691 | */ | 770 | */ |
692 | static int __init alarmtimer_init(void) | 771 | static int __init alarmtimer_init(void) |
693 | { | 772 | { |
773 | struct platform_device *pdev; | ||
694 | int error = 0; | 774 | int error = 0; |
695 | int i; | 775 | int i; |
696 | struct k_clock alarm_clock = { | 776 | struct k_clock alarm_clock = { |
@@ -719,10 +799,26 @@ static int __init alarmtimer_init(void) | |||
719 | HRTIMER_MODE_ABS); | 799 | HRTIMER_MODE_ABS); |
720 | alarm_bases[i].timer.function = alarmtimer_fired; | 800 | alarm_bases[i].timer.function = alarmtimer_fired; |
721 | } | 801 | } |
802 | |||
803 | error = alarmtimer_rtc_interface_setup(); | ||
804 | if (error) | ||
805 | return error; | ||
806 | |||
722 | error = platform_driver_register(&alarmtimer_driver); | 807 | error = platform_driver_register(&alarmtimer_driver); |
723 | platform_device_register_simple("alarmtimer", -1, NULL, 0); | 808 | if (error) |
809 | goto out_if; | ||
724 | 810 | ||
811 | pdev = platform_device_register_simple("alarmtimer", -1, NULL, 0); | ||
812 | if (IS_ERR(pdev)) { | ||
813 | error = PTR_ERR(pdev); | ||
814 | goto out_drv; | ||
815 | } | ||
816 | return 0; | ||
817 | |||
818 | out_drv: | ||
819 | platform_driver_unregister(&alarmtimer_driver); | ||
820 | out_if: | ||
821 | alarmtimer_rtc_interface_remove(); | ||
725 | return error; | 822 | return error; |
726 | } | 823 | } |
727 | device_initcall(alarmtimer_init); | 824 | device_initcall(alarmtimer_init); |
728 | |||
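The alarmtimer rework above changes the callback signature to return enum alarmtimer_restart and drops the per-alarm period field; periodic behaviour is now built from alarm_forward() plus ALARMTIMER_RESTART, and the wakealarm-capable RTC is picked up through a class_interface instead of a one-shot class_find_device() scan. A sketch of a kernel-side user of the new API (not part of the patch; assumes module context and would be wired up from an init path):

#include <linux/alarmtimer.h>
#include <linux/kernel.h>
#include <linux/ktime.h>

static struct alarm demo_alarm;
static ktime_t demo_interval;

static enum alarmtimer_restart demo_fired(struct alarm *alarm, ktime_t now)
{
	pr_info("alarmtimer demo fired\n");
	/* Re-arm relative to 'now'; alarm_forward() also accounts for
	 * missed intervals and returns the overrun count. */
	alarm_forward(alarm, now, demo_interval);
	return ALARMTIMER_RESTART;
}

static void demo_start(void)
{
	demo_interval = ktime_set(1, 0);	/* one second */
	alarm_init(&demo_alarm, ALARM_REALTIME, demo_fired);
	alarm_start(&demo_alarm, ktime_add(ktime_get_real(), demo_interval));
}

Cancellation follows the hrtimer model: alarm_try_to_cancel() may return -1 while the callback is running, and alarm_cancel() spins until the cancel succeeds.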
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index e4c699dfa4e8..9cd928f7a7c6 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/module.h> | 17 | #include <linux/module.h> |
18 | #include <linux/notifier.h> | 18 | #include <linux/notifier.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/sysdev.h> | ||
21 | 20 | ||
22 | #include "tick-internal.h" | 21 | #include "tick-internal.h" |
23 | 22 | ||
@@ -94,42 +93,143 @@ void clockevents_shutdown(struct clock_event_device *dev) | |||
94 | dev->next_event.tv64 = KTIME_MAX; | 93 | dev->next_event.tv64 = KTIME_MAX; |
95 | } | 94 | } |
96 | 95 | ||
96 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | ||
97 | |||
98 | /* Limit min_delta to a jiffie */ | ||
99 | #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) | ||
100 | |||
101 | /** | ||
102 | * clockevents_increase_min_delta - raise minimum delta of a clock event device | ||
103 | * @dev: device to increase the minimum delta | ||
104 | * | ||
105 | * Returns 0 on success, -ETIME when the minimum delta reached the limit. | ||
106 | */ | ||
107 | static int clockevents_increase_min_delta(struct clock_event_device *dev) | ||
108 | { | ||
109 | /* Nothing to do if we already reached the limit */ | ||
110 | if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { | ||
111 | printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); | ||
112 | dev->next_event.tv64 = KTIME_MAX; | ||
113 | return -ETIME; | ||
114 | } | ||
115 | |||
116 | if (dev->min_delta_ns < 5000) | ||
117 | dev->min_delta_ns = 5000; | ||
118 | else | ||
119 | dev->min_delta_ns += dev->min_delta_ns >> 1; | ||
120 | |||
121 | if (dev->min_delta_ns > MIN_DELTA_LIMIT) | ||
122 | dev->min_delta_ns = MIN_DELTA_LIMIT; | ||
123 | |||
124 | printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", | ||
125 | dev->name ? dev->name : "?", | ||
126 | (unsigned long long) dev->min_delta_ns); | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | * clockevents_program_min_delta - Set clock event device to the minimum delay. | ||
132 | * @dev: device to program | ||
133 | * | ||
134 | * Returns 0 on success, -ETIME when the retry loop failed. | ||
135 | */ | ||
136 | static int clockevents_program_min_delta(struct clock_event_device *dev) | ||
137 | { | ||
138 | unsigned long long clc; | ||
139 | int64_t delta; | ||
140 | int i; | ||
141 | |||
142 | for (i = 0;;) { | ||
143 | delta = dev->min_delta_ns; | ||
144 | dev->next_event = ktime_add_ns(ktime_get(), delta); | ||
145 | |||
146 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
147 | return 0; | ||
148 | |||
149 | dev->retries++; | ||
150 | clc = ((unsigned long long) delta * dev->mult) >> dev->shift; | ||
151 | if (dev->set_next_event((unsigned long) clc, dev) == 0) | ||
152 | return 0; | ||
153 | |||
154 | if (++i > 2) { | ||
155 | /* | ||
156 | * We tried 3 times to program the device with the | ||
157 | * given min_delta_ns. Try to increase the minimum | ||
158 | * delta, if that fails as well get out of here. | ||
159 | */ | ||
160 | if (clockevents_increase_min_delta(dev)) | ||
161 | return -ETIME; | ||
162 | i = 0; | ||
163 | } | ||
164 | } | ||
165 | } | ||
166 | |||
167 | #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ | ||
168 | |||
169 | /** | ||
170 | * clockevents_program_min_delta - Set clock event device to the minimum delay. | ||
171 | * @dev: device to program | ||
172 | * | ||
173 | * Returns 0 on success, -ETIME when the retry loop failed. | ||
174 | */ | ||
175 | static int clockevents_program_min_delta(struct clock_event_device *dev) | ||
176 | { | ||
177 | unsigned long long clc; | ||
178 | int64_t delta; | ||
179 | |||
180 | delta = dev->min_delta_ns; | ||
181 | dev->next_event = ktime_add_ns(ktime_get(), delta); | ||
182 | |||
183 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | ||
184 | return 0; | ||
185 | |||
186 | dev->retries++; | ||
187 | clc = ((unsigned long long) delta * dev->mult) >> dev->shift; | ||
188 | return dev->set_next_event((unsigned long) clc, dev); | ||
189 | } | ||
190 | |||
191 | #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ | ||
192 | |||
97 | /** | 193 | /** |
98 | * clockevents_program_event - Reprogram the clock event device. | 194 | * clockevents_program_event - Reprogram the clock event device. |
195 | * @dev: device to program | ||
99 | * @expires: absolute expiry time (monotonic clock) | 196 | * @expires: absolute expiry time (monotonic clock) |
197 | * @force: program minimum delay if expires can not be set | ||
100 | * | 198 | * |
101 | * Returns 0 on success, -ETIME when the event is in the past. | 199 | * Returns 0 on success, -ETIME when the event is in the past. |
102 | */ | 200 | */ |
103 | int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | 201 | int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, |
104 | ktime_t now) | 202 | bool force) |
105 | { | 203 | { |
106 | unsigned long long clc; | 204 | unsigned long long clc; |
107 | int64_t delta; | 205 | int64_t delta; |
206 | int rc; | ||
108 | 207 | ||
109 | if (unlikely(expires.tv64 < 0)) { | 208 | if (unlikely(expires.tv64 < 0)) { |
110 | WARN_ON_ONCE(1); | 209 | WARN_ON_ONCE(1); |
111 | return -ETIME; | 210 | return -ETIME; |
112 | } | 211 | } |
113 | 212 | ||
114 | delta = ktime_to_ns(ktime_sub(expires, now)); | ||
115 | |||
116 | if (delta <= 0) | ||
117 | return -ETIME; | ||
118 | |||
119 | dev->next_event = expires; | 213 | dev->next_event = expires; |
120 | 214 | ||
121 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 215 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) |
122 | return 0; | 216 | return 0; |
123 | 217 | ||
124 | if (delta > dev->max_delta_ns) | 218 | /* Shortcut for clockevent devices that can deal with ktime. */ |
125 | delta = dev->max_delta_ns; | 219 | if (dev->features & CLOCK_EVT_FEAT_KTIME) |
126 | if (delta < dev->min_delta_ns) | 220 | return dev->set_next_ktime(expires, dev); |
127 | delta = dev->min_delta_ns; | 221 | |
222 | delta = ktime_to_ns(ktime_sub(expires, ktime_get())); | ||
223 | if (delta <= 0) | ||
224 | return force ? clockevents_program_min_delta(dev) : -ETIME; | ||
128 | 225 | ||
129 | clc = delta * dev->mult; | 226 | delta = min(delta, (int64_t) dev->max_delta_ns); |
130 | clc >>= dev->shift; | 227 | delta = max(delta, (int64_t) dev->min_delta_ns); |
131 | 228 | ||
132 | return dev->set_next_event((unsigned long) clc, dev); | 229 | clc = ((unsigned long long) delta * dev->mult) >> dev->shift; |
230 | rc = dev->set_next_event((unsigned long) clc, dev); | ||
231 | |||
232 | return (rc && force) ? clockevents_program_min_delta(dev) : rc; | ||
133 | } | 233 | } |
134 | 234 | ||
135 | /** | 235 | /** |
@@ -258,7 +358,7 @@ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) | |||
258 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | 358 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) |
259 | return 0; | 359 | return 0; |
260 | 360 | ||
261 | return clockevents_program_event(dev, dev->next_event, ktime_get()); | 361 | return clockevents_program_event(dev, dev->next_event, false); |
262 | } | 362 | } |
263 | 363 | ||
264 | /* | 364 | /* |
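clockevents_program_event() now reads ktime_get() itself, hands the expiry straight to CLOCK_EVT_FEAT_KTIME devices, and only falls back to the minimum-delta path when the caller passes force=true; with CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST the failure path grows min_delta_ns by 50% per attempt, capped at one jiffy. The same integer math in isolation (the device mult/shift and HZ are invented example values, not from the patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t mult = 57982058;	/* roughly a 13.5 MHz event device */
	uint32_t shift = 32;
	int64_t delta_ns = 1000000;	/* program an event 1 ms out */

	uint64_t clc = ((uint64_t)delta_ns * mult) >> shift;
	printf("1 ms -> %llu device cycles\n", (unsigned long long)clc);

	/* Back-off from clockevents_increase_min_delta(): start at 5 us,
	 * then add 50% per failure, clamped to one jiffy. */
	uint64_t jiffy_ns = 1000000000ULL / 250;	/* HZ=250 assumed */
	uint64_t min_delta = 1000;

	while (min_delta < jiffy_ns) {
		min_delta = min_delta < 5000 ? 5000 : min_delta + (min_delta >> 1);
		if (min_delta > jiffy_ns)
			min_delta = jiffy_ns;
		printf("min_delta_ns -> %llu\n", (unsigned long long)min_delta);
	}
	return 0;
}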
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e0980f0d9a0a..a45ca167ab24 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -23,8 +23,8 @@ | |||
23 | * o Allow clocksource drivers to be unregistered | 23 | * o Allow clocksource drivers to be unregistered |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/device.h> | ||
26 | #include <linux/clocksource.h> | 27 | #include <linux/clocksource.h> |
27 | #include <linux/sysdev.h> | ||
28 | #include <linux/init.h> | 28 | #include <linux/init.h> |
29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ | 30 | #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ |
@@ -186,6 +186,7 @@ static struct timer_list watchdog_timer; | |||
186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); | 186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); |
187 | static DEFINE_SPINLOCK(watchdog_lock); | 187 | static DEFINE_SPINLOCK(watchdog_lock); |
188 | static int watchdog_running; | 188 | static int watchdog_running; |
189 | static atomic_t watchdog_reset_pending; | ||
189 | 190 | ||
190 | static int clocksource_watchdog_kthread(void *data); | 191 | static int clocksource_watchdog_kthread(void *data); |
191 | static void __clocksource_change_rating(struct clocksource *cs, int rating); | 192 | static void __clocksource_change_rating(struct clocksource *cs, int rating); |
@@ -247,12 +248,14 @@ static void clocksource_watchdog(unsigned long data) | |||
247 | struct clocksource *cs; | 248 | struct clocksource *cs; |
248 | cycle_t csnow, wdnow; | 249 | cycle_t csnow, wdnow; |
249 | int64_t wd_nsec, cs_nsec; | 250 | int64_t wd_nsec, cs_nsec; |
250 | int next_cpu; | 251 | int next_cpu, reset_pending; |
251 | 252 | ||
252 | spin_lock(&watchdog_lock); | 253 | spin_lock(&watchdog_lock); |
253 | if (!watchdog_running) | 254 | if (!watchdog_running) |
254 | goto out; | 255 | goto out; |
255 | 256 | ||
257 | reset_pending = atomic_read(&watchdog_reset_pending); | ||
258 | |||
256 | list_for_each_entry(cs, &watchdog_list, wd_list) { | 259 | list_for_each_entry(cs, &watchdog_list, wd_list) { |
257 | 260 | ||
258 | /* Clocksource already marked unstable? */ | 261 | /* Clocksource already marked unstable? */ |
@@ -268,7 +271,8 @@ static void clocksource_watchdog(unsigned long data) | |||
268 | local_irq_enable(); | 271 | local_irq_enable(); |
269 | 272 | ||
270 | /* Clocksource initialized ? */ | 273 | /* Clocksource initialized ? */ |
271 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { | 274 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || |
275 | atomic_read(&watchdog_reset_pending)) { | ||
272 | cs->flags |= CLOCK_SOURCE_WATCHDOG; | 276 | cs->flags |= CLOCK_SOURCE_WATCHDOG; |
273 | cs->wd_last = wdnow; | 277 | cs->wd_last = wdnow; |
274 | cs->cs_last = csnow; | 278 | cs->cs_last = csnow; |
@@ -283,8 +287,11 @@ static void clocksource_watchdog(unsigned long data) | |||
283 | cs->cs_last = csnow; | 287 | cs->cs_last = csnow; |
284 | cs->wd_last = wdnow; | 288 | cs->wd_last = wdnow; |
285 | 289 | ||
290 | if (atomic_read(&watchdog_reset_pending)) | ||
291 | continue; | ||
292 | |||
286 | /* Check the deviation from the watchdog clocksource. */ | 293 | /* Check the deviation from the watchdog clocksource. */ |
287 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { | 294 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { |
288 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 295 | clocksource_unstable(cs, cs_nsec - wd_nsec); |
289 | continue; | 296 | continue; |
290 | } | 297 | } |
@@ -303,6 +310,13 @@ static void clocksource_watchdog(unsigned long data) | |||
303 | } | 310 | } |
304 | 311 | ||
305 | /* | 312 | /* |
313 | * Only clear watchdog_reset_pending once we have done a | ||
314 | * full cycle through all clocksources. | ||
315 | */ | ||
316 | if (reset_pending) | ||
317 | atomic_dec(&watchdog_reset_pending); | ||
318 | |||
319 | /* | ||
306 | * Cycle through CPUs to check if the CPUs stay synchronized | 320 | * Cycle through CPUs to check if the CPUs stay synchronized |
307 | * to each other. | 321 | * to each other. |
308 | */ | 322 | */ |
@@ -344,23 +358,7 @@ static inline void clocksource_reset_watchdog(void) | |||
344 | 358 | ||
345 | static void clocksource_resume_watchdog(void) | 359 | static void clocksource_resume_watchdog(void) |
346 | { | 360 | { |
347 | unsigned long flags; | 361 | atomic_inc(&watchdog_reset_pending); |
348 | |||
349 | /* | ||
350 | * We use trylock here to avoid a potential dead lock when | ||
351 | * kgdb calls this code after the kernel has been stopped with | ||
352 | * watchdog_lock held. When watchdog_lock is held we just | ||
353 | * return and accept, that the watchdog might trigger and mark | ||
354 | * the monitored clock source (usually TSC) unstable. | ||
355 | * | ||
356 | * This does not affect the other caller clocksource_resume() | ||
357 | * because at this point the kernel is UP, interrupts are | ||
358 | * disabled and nothing can hold watchdog_lock. | ||
359 | */ | ||
360 | if (!spin_trylock_irqsave(&watchdog_lock, flags)) | ||
361 | return; | ||
362 | clocksource_reset_watchdog(); | ||
363 | spin_unlock_irqrestore(&watchdog_lock, flags); | ||
364 | } | 362 | } |
365 | 363 | ||
366 | static void clocksource_enqueue_watchdog(struct clocksource *cs) | 364 | static void clocksource_enqueue_watchdog(struct clocksource *cs) |
@@ -494,6 +492,22 @@ void clocksource_touch_watchdog(void) | |||
494 | } | 492 | } |
495 | 493 | ||
496 | /** | 494 | /** |
495 | * clocksource_max_adjustment - Returns max adjustment amount | ||
496 | * @cs: Pointer to clocksource | ||
497 | * | ||
498 | */ | ||
499 | static u32 clocksource_max_adjustment(struct clocksource *cs) | ||
500 | { | ||
501 | u64 ret; | ||
502 | /* | ||
503 | * We won't try to correct for more than 11% adjustments (110,000 ppm). | ||
504 | */ | ||
505 | ret = (u64)cs->mult * 11; | ||
506 | do_div(ret, 100); | ||
507 | return (u32)ret; | ||
508 | } | ||
509 | |||
510 | /** | ||
497 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 511 | * clocksource_max_deferment - Returns max time the clocksource can be deferred |
498 | * @cs: Pointer to clocksource | 512 | * @cs: Pointer to clocksource |
499 | * | 513 | * |
@@ -505,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
505 | /* | 519 | /* |
506 | * Calculate the maximum number of cycles that we can pass to the | 520 | * Calculate the maximum number of cycles that we can pass to the |
507 | * cyc2ns function without overflowing a 64-bit signed result. The | 521 | * cyc2ns function without overflowing a 64-bit signed result. The |
508 | * maximum number of cycles is equal to ULLONG_MAX/cs->mult which | 522 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) |
509 | * is equivalent to the below. | 523 | * which is equivalent to the below. |
510 | * max_cycles < (2^63)/cs->mult | 524 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) |
511 | * max_cycles < 2^(log2((2^63)/cs->mult)) | 525 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) |
512 | * max_cycles < 2^(log2(2^63) - log2(cs->mult)) | 526 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) |
513 | * max_cycles < 2^(63 - log2(cs->mult)) | 527 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) |
514 | * max_cycles < 1 << (63 - log2(cs->mult)) | 528 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) |
515 | * Please note that we add 1 to the result of the log2 to account for | 529 | * Please note that we add 1 to the result of the log2 to account for |
516 | * any rounding errors, ensure the above inequality is satisfied and | 530 | * any rounding errors, ensure the above inequality is satisfied and |
517 | * no overflow will occur. | 531 | * no overflow will occur. |
518 | */ | 532 | */ |
519 | max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); | 533 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); |
520 | 534 | ||
521 | /* | 535 | /* |
522 | * The actual maximum number of cycles we can defer the clocksource is | 536 | * The actual maximum number of cycles we can defer the clocksource is |
523 | * determined by the minimum of max_cycles and cs->mask. | 537 | * determined by the minimum of max_cycles and cs->mask. |
538 | * Note: Here we subtract the maxadj to make sure we don't sleep for | ||
539 | * too long if there's a large negative adjustment. | ||
524 | */ | 540 | */ |
525 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 541 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); |
526 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); | 542 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, |
543 | cs->shift); | ||
527 | 544 | ||
528 | /* | 545 | /* |
529 | * To ensure that the clocksource does not wrap whilst we are idle, | 546 | * To ensure that the clocksource does not wrap whilst we are idle, |
@@ -531,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
531 | * note a margin of 12.5% is used because this can be computed with | 548 | * note a margin of 12.5% is used because this can be computed with |
532 | * a shift, versus say 10% which would require division. | 549 | * a shift, versus say 10% which would require division. |
533 | */ | 550 | */ |
534 | return max_nsecs - (max_nsecs >> 5); | 551 | return max_nsecs - (max_nsecs >> 3); |
535 | } | 552 | } |
536 | 553 | ||
537 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 554 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET |
@@ -630,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
630 | 647 | ||
631 | /** | 648 | /** |
632 | * __clocksource_updatefreq_scale - Used update clocksource with new freq | 649 | * __clocksource_updatefreq_scale - Used update clocksource with new freq |
633 | * @t: clocksource to be registered | 650 | * @cs: clocksource to be registered |
634 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz |
635 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale |
636 | * | 653 | * |
@@ -642,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
642 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 659 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
643 | { | 660 | { |
644 | u64 sec; | 661 | u64 sec; |
645 | |||
646 | /* | 662 | /* |
647 | * Calc the maximum number of seconds which we can run before | 663 | * Calc the maximum number of seconds which we can run before |
648 | * wrapping around. For clocksources which have a mask > 32bit | 664 | * wrapping around. For clocksources which have a mask > 32bit |
@@ -653,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
653 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | 669 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% |
654 | * margin as we do in clocksource_max_deferment() | 670 | * margin as we do in clocksource_max_deferment() |
655 | */ | 671 | */ |
656 | sec = (cs->mask - (cs->mask >> 5)); | 672 | sec = (cs->mask - (cs->mask >> 3)); |
657 | do_div(sec, freq); | 673 | do_div(sec, freq); |
658 | do_div(sec, scale); | 674 | do_div(sec, scale); |
659 | if (!sec) | 675 | if (!sec) |
@@ -663,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
663 | 679 | ||
664 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 680 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
665 | NSEC_PER_SEC / scale, sec * scale); | 681 | NSEC_PER_SEC / scale, sec * scale); |
682 | |||
683 | /* | ||
684 | * For clocksources with large mults, reduce mult/shift to avoid overflow. | ||
685 | * Since mult may be adjusted by ntp, add a safety extra margin. | ||
686 | * | ||
687 | */ | ||
688 | cs->maxadj = clocksource_max_adjustment(cs); | ||
689 | while ((cs->mult + cs->maxadj < cs->mult) | ||
690 | || (cs->mult - cs->maxadj > cs->mult)) { | ||
691 | cs->mult >>= 1; | ||
692 | cs->shift--; | ||
693 | cs->maxadj = clocksource_max_adjustment(cs); | ||
694 | } | ||
695 | |||
666 | cs->max_idle_ns = clocksource_max_deferment(cs); | 696 | cs->max_idle_ns = clocksource_max_deferment(cs); |
667 | } | 697 | } |
668 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 698 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
669 | 699 | ||
670 | /** | 700 | /** |
671 | * __clocksource_register_scale - Used to install new clocksources | 701 | * __clocksource_register_scale - Used to install new clocksources |
672 | * @t: clocksource to be registered | 702 | * @cs: clocksource to be registered |
673 | * @scale: Scale factor multiplied against freq to get clocksource hz | 703 | * @scale: Scale factor multiplied against freq to get clocksource hz |
674 | * @freq: clocksource frequency (cycles per second) divided by scale | 704 | * @freq: clocksource frequency (cycles per second) divided by scale |
675 | * | 705 | * |
@@ -697,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); | |||
697 | 727 | ||
698 | /** | 728 | /** |
699 | * clocksource_register - Used to install new clocksources | 729 | * clocksource_register - Used to install new clocksources |
700 | * @t: clocksource to be registered | 730 | * @cs: clocksource to be registered |
701 | * | 731 | * |
702 | * Returns -EBUSY if registration fails, zero otherwise. | 732 | * Returns -EBUSY if registration fails, zero otherwise. |
703 | */ | 733 | */ |
704 | int clocksource_register(struct clocksource *cs) | 734 | int clocksource_register(struct clocksource *cs) |
705 | { | 735 | { |
736 | /* calculate max adjustment for given mult/shift */ | ||
737 | cs->maxadj = clocksource_max_adjustment(cs); | ||
738 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
739 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
740 | cs->name); | ||
741 | |||
706 | /* calculate max idle time permitted for this clocksource */ | 742 | /* calculate max idle time permitted for this clocksource */ |
707 | cs->max_idle_ns = clocksource_max_deferment(cs); | 743 | cs->max_idle_ns = clocksource_max_deferment(cs); |
708 | 744 | ||
@@ -725,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) | |||
725 | 761 | ||
726 | /** | 762 | /** |
727 | * clocksource_change_rating - Change the rating of a registered clocksource | 763 | * clocksource_change_rating - Change the rating of a registered clocksource |
764 | * @cs: clocksource to be changed | ||
765 | * @rating: new rating | ||
728 | */ | 766 | */ |
729 | void clocksource_change_rating(struct clocksource *cs, int rating) | 767 | void clocksource_change_rating(struct clocksource *cs, int rating) |
730 | { | 768 | { |
@@ -736,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating); | |||
736 | 774 | ||
737 | /** | 775 | /** |
738 | * clocksource_unregister - remove a registered clocksource | 776 | * clocksource_unregister - remove a registered clocksource |
777 | * @cs: clocksource to be unregistered | ||
739 | */ | 778 | */ |
740 | void clocksource_unregister(struct clocksource *cs) | 779 | void clocksource_unregister(struct clocksource *cs) |
741 | { | 780 | { |
@@ -751,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister); | |||
751 | /** | 790 | /** |
752 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 791 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
753 | * @dev: unused | 792 | * @dev: unused |
793 | * @attr: unused | ||
754 | * @buf: char buffer to be filled with clocksource list | 794 | * @buf: char buffer to be filled with clocksource list |
755 | * | 795 | * |
756 | * Provides sysfs interface for listing current clocksource. | 796 | * Provides sysfs interface for listing current clocksource. |
757 | */ | 797 | */ |
758 | static ssize_t | 798 | static ssize_t |
759 | sysfs_show_current_clocksources(struct sys_device *dev, | 799 | sysfs_show_current_clocksources(struct device *dev, |
760 | struct sysdev_attribute *attr, char *buf) | 800 | struct device_attribute *attr, char *buf) |
761 | { | 801 | { |
762 | ssize_t count = 0; | 802 | ssize_t count = 0; |
763 | 803 | ||
@@ -771,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev, | |||
771 | /** | 811 | /** |
772 | * sysfs_override_clocksource - interface for manually overriding clocksource | 812 | * sysfs_override_clocksource - interface for manually overriding clocksource |
773 | * @dev: unused | 813 | * @dev: unused |
814 | * @attr: unused | ||
774 | * @buf: name of override clocksource | 815 | * @buf: name of override clocksource |
775 | * @count: length of buffer | 816 | * @count: length of buffer |
776 | * | 817 | * |
777 | * Takes input from sysfs interface for manually overriding the default | 818 | * Takes input from sysfs interface for manually overriding the default |
778 | * clocksource selection. | 819 | * clocksource selection. |
779 | */ | 820 | */ |
780 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | 821 | static ssize_t sysfs_override_clocksource(struct device *dev, |
781 | struct sysdev_attribute *attr, | 822 | struct device_attribute *attr, |
782 | const char *buf, size_t count) | 823 | const char *buf, size_t count) |
783 | { | 824 | { |
784 | size_t ret = count; | 825 | size_t ret = count; |
@@ -806,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, | |||
806 | /** | 847 | /** |
807 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource | 848 | * sysfs_show_available_clocksources - sysfs interface for listing clocksource |
808 | * @dev: unused | 849 | * @dev: unused |
850 | * @attr: unused | ||
809 | * @buf: char buffer to be filled with clocksource list | 851 | * @buf: char buffer to be filled with clocksource list |
810 | * | 852 | * |
811 | * Provides sysfs interface for listing registered clocksources | 853 | * Provides sysfs interface for listing registered clocksources |
812 | */ | 854 | */ |
813 | static ssize_t | 855 | static ssize_t |
814 | sysfs_show_available_clocksources(struct sys_device *dev, | 856 | sysfs_show_available_clocksources(struct device *dev, |
815 | struct sysdev_attribute *attr, | 857 | struct device_attribute *attr, |
816 | char *buf) | 858 | char *buf) |
817 | { | 859 | { |
818 | struct clocksource *src; | 860 | struct clocksource *src; |
@@ -841,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev, | |||
841 | /* | 883 | /* |
842 | * Sysfs setup bits: | 884 | * Sysfs setup bits: |
843 | */ | 885 | */ |
844 | static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, | 886 | static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, |
845 | sysfs_override_clocksource); | 887 | sysfs_override_clocksource); |
846 | 888 | ||
847 | static SYSDEV_ATTR(available_clocksource, 0444, | 889 | static DEVICE_ATTR(available_clocksource, 0444, |
848 | sysfs_show_available_clocksources, NULL); | 890 | sysfs_show_available_clocksources, NULL); |
849 | 891 | ||
850 | static struct sysdev_class clocksource_sysclass = { | 892 | static struct bus_type clocksource_subsys = { |
851 | .name = "clocksource", | 893 | .name = "clocksource", |
894 | .dev_name = "clocksource", | ||
852 | }; | 895 | }; |
853 | 896 | ||
854 | static struct sys_device device_clocksource = { | 897 | static struct device device_clocksource = { |
855 | .id = 0, | 898 | .id = 0, |
856 | .cls = &clocksource_sysclass, | 899 | .bus = &clocksource_subsys, |
857 | }; | 900 | }; |
858 | 901 | ||
859 | static int __init init_clocksource_sysfs(void) | 902 | static int __init init_clocksource_sysfs(void) |
860 | { | 903 | { |
861 | int error = sysdev_class_register(&clocksource_sysclass); | 904 | int error = subsys_system_register(&clocksource_subsys, NULL); |
862 | 905 | ||
863 | if (!error) | 906 | if (!error) |
864 | error = sysdev_register(&device_clocksource); | 907 | error = device_register(&device_clocksource); |
865 | if (!error) | 908 | if (!error) |
866 | error = sysdev_create_file( | 909 | error = device_create_file( |
867 | &device_clocksource, | 910 | &device_clocksource, |
868 | &attr_current_clocksource); | 911 | &dev_attr_current_clocksource); |
869 | if (!error) | 912 | if (!error) |
870 | error = sysdev_create_file( | 913 | error = device_create_file( |
871 | &device_clocksource, | 914 | &device_clocksource, |
872 | &attr_available_clocksource); | 915 | &dev_attr_available_clocksource); |
873 | return error; | 916 | return error; |
874 | } | 917 | } |
875 | 918 | ||
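
The clocksource.c hunks above finish converting the clocksource sysfs glue from the old sysdev API (SYSDEV_ATTR, sysdev_class, sysdev_register) to the regular driver core (DEVICE_ATTR, bus_type, subsys_system_register). A condensed sketch of the resulting pattern follows; the foo_* names are illustrative stand-ins, not identifiers from this patch:

    #include <linux/device.h>
    #include <linux/init.h>
    #include <linux/kernel.h>

    static ssize_t foo_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
    {
            return sprintf(buf, "example\n");       /* report the current value */
    }

    static ssize_t foo_store(struct device *dev,
                             struct device_attribute *attr,
                             const char *buf, size_t count)
    {
            return count;                           /* accept and ignore input */
    }

    /* DEVICE_ATTR(foo, ...) generates a struct device_attribute named dev_attr_foo */
    static DEVICE_ATTR(foo, 0644, foo_show, foo_store);

    static struct bus_type foo_subsys = {
            .name           = "foo",
            .dev_name       = "foo",
    };

    static struct device device_foo = {
            .id     = 0,
            .bus    = &foo_subsys,
    };

    static int __init foo_sysfs_init(void)
    {
            int error = subsys_system_register(&foo_subsys, NULL);

            if (!error)
                    error = device_register(&device_foo);
            if (!error)
                    error = device_create_file(&device_foo, &dev_attr_foo);
            return error;
    }
    device_initcall(foo_sysfs_init);

Since subsys_system_register() keeps the subsystem under /sys/devices/system/, paths such as /sys/devices/system/clocksource/clocksource0/current_clocksource should remain unchanged for userspace after the conversion.
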
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index c340ca658f37..ce033c7aa2e8 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c | |||
@@ -18,6 +18,7 @@ | |||
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | */ | 19 | */ |
20 | #include <linux/device.h> | 20 | #include <linux/device.h> |
21 | #include <linux/export.h> | ||
21 | #include <linux/file.h> | 22 | #include <linux/file.h> |
22 | #include <linux/posix-clock.h> | 23 | #include <linux/posix-clock.h> |
23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c7218d132738..fd4a7b1625a2 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev) | |||
71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) | 71 | (dev->features & CLOCK_EVT_FEAT_C3STOP)) |
72 | return 0; | 72 | return 0; |
73 | 73 | ||
74 | clockevents_exchange_device(NULL, dev); | 74 | clockevents_exchange_device(tick_broadcast_device.evtdev, dev); |
75 | tick_broadcast_device.evtdev = dev; | 75 | tick_broadcast_device.evtdev = dev; |
76 | if (!cpumask_empty(tick_get_broadcast_mask())) | 76 | if (!cpumask_empty(tick_get_broadcast_mask())) |
77 | tick_broadcast_start_periodic(dev); | 77 | tick_broadcast_start_periodic(dev); |
@@ -194,7 +194,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
194 | for (next = dev->next_event; ;) { | 194 | for (next = dev->next_event; ;) { |
195 | next = ktime_add(next, tick_period); | 195 | next = ktime_add(next, tick_period); |
196 | 196 | ||
197 | if (!clockevents_program_event(dev, next, ktime_get())) | 197 | if (!clockevents_program_event(dev, next, false)) |
198 | return; | 198 | return; |
199 | tick_do_periodic_broadcast(); | 199 | tick_do_periodic_broadcast(); |
200 | } | 200 | } |
@@ -373,7 +373,7 @@ static int tick_broadcast_set_event(ktime_t expires, int force) | |||
373 | { | 373 | { |
374 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 374 | struct clock_event_device *bc = tick_broadcast_device.evtdev; |
375 | 375 | ||
376 | return tick_dev_program_event(bc, expires, force); | 376 | return clockevents_program_event(bc, expires, force); |
377 | } | 377 | } |
378 | 378 | ||
379 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 379 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 119528de8235..da6c9ecad4e4 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -94,7 +94,7 @@ void tick_handle_periodic(struct clock_event_device *dev) | |||
94 | */ | 94 | */ |
95 | next = ktime_add(dev->next_event, tick_period); | 95 | next = ktime_add(dev->next_event, tick_period); |
96 | for (;;) { | 96 | for (;;) { |
97 | if (!clockevents_program_event(dev, next, ktime_get())) | 97 | if (!clockevents_program_event(dev, next, false)) |
98 | return; | 98 | return; |
99 | /* | 99 | /* |
100 | * Have to be careful here. If we're in oneshot mode, | 100 | * Have to be careful here. If we're in oneshot mode, |
@@ -137,7 +137,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 137 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
138 | 138 | ||
139 | for (;;) { | 139 | for (;;) { |
140 | if (!clockevents_program_event(dev, next, ktime_get())) | 140 | if (!clockevents_program_event(dev, next, false)) |
141 | return; | 141 | return; |
142 | next = ktime_add(next, tick_period); | 142 | next = ktime_add(next, tick_period); |
143 | } | 143 | } |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 1009b06d6f89..4e265b901fed 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -26,8 +26,6 @@ extern void clockevents_shutdown(struct clock_event_device *dev); | |||
26 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 26 | extern void tick_setup_oneshot(struct clock_event_device *newdev, |
27 | void (*handler)(struct clock_event_device *), | 27 | void (*handler)(struct clock_event_device *), |
28 | ktime_t nextevt); | 28 | ktime_t nextevt); |
29 | extern int tick_dev_program_event(struct clock_event_device *dev, | ||
30 | ktime_t expires, int force); | ||
31 | extern int tick_program_event(ktime_t expires, int force); | 29 | extern int tick_program_event(ktime_t expires, int force); |
32 | extern void tick_oneshot_notify(void); | 30 | extern void tick_oneshot_notify(void); |
33 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 31 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); |
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2d04411a5f05..824109060a33 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
@@ -21,74 +21,6 @@ | |||
21 | 21 | ||
22 | #include "tick-internal.h" | 22 | #include "tick-internal.h" |
23 | 23 | ||
24 | /* Limit min_delta to a jiffie */ | ||
25 | #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) | ||
26 | |||
27 | static int tick_increase_min_delta(struct clock_event_device *dev) | ||
28 | { | ||
29 | /* Nothing to do if we already reached the limit */ | ||
30 | if (dev->min_delta_ns >= MIN_DELTA_LIMIT) | ||
31 | return -ETIME; | ||
32 | |||
33 | if (dev->min_delta_ns < 5000) | ||
34 | dev->min_delta_ns = 5000; | ||
35 | else | ||
36 | dev->min_delta_ns += dev->min_delta_ns >> 1; | ||
37 | |||
38 | if (dev->min_delta_ns > MIN_DELTA_LIMIT) | ||
39 | dev->min_delta_ns = MIN_DELTA_LIMIT; | ||
40 | |||
41 | printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", | ||
42 | dev->name ? dev->name : "?", | ||
43 | (unsigned long long) dev->min_delta_ns); | ||
44 | return 0; | ||
45 | } | ||
46 | |||
47 | /** | ||
48 | * tick_program_event internal worker function | ||
49 | */ | ||
50 | int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, | ||
51 | int force) | ||
52 | { | ||
53 | ktime_t now = ktime_get(); | ||
54 | int i; | ||
55 | |||
56 | for (i = 0;;) { | ||
57 | int ret = clockevents_program_event(dev, expires, now); | ||
58 | |||
59 | if (!ret || !force) | ||
60 | return ret; | ||
61 | |||
62 | dev->retries++; | ||
63 | /* | ||
64 | * We tried 3 times to program the device with the given | ||
65 | * min_delta_ns. If that's not working then we increase it | ||
66 | * and emit a warning. | ||
67 | */ | ||
68 | if (++i > 2) { | ||
69 | /* Increase the min. delta and try again */ | ||
70 | if (tick_increase_min_delta(dev)) { | ||
71 | /* | ||
72 | * Get out of the loop if min_delta_ns | ||
73 | * hit the limit already. That's | ||
74 | * better than staying here forever. | ||
75 | * | ||
76 | * We clear next_event so we have a | ||
77 | * chance that the box survives. | ||
78 | */ | ||
79 | printk(KERN_WARNING | ||
80 | "CE: Reprogramming failure. Giving up\n"); | ||
81 | dev->next_event.tv64 = KTIME_MAX; | ||
82 | return -ETIME; | ||
83 | } | ||
84 | i = 0; | ||
85 | } | ||
86 | |||
87 | now = ktime_get(); | ||
88 | expires = ktime_add_ns(now, dev->min_delta_ns); | ||
89 | } | ||
90 | } | ||
91 | |||
92 | /** | 24 | /** |
93 | * tick_program_event | 25 | * tick_program_event |
94 | */ | 26 | */ |
@@ -96,7 +28,7 @@ int tick_program_event(ktime_t expires, int force) | |||
96 | { | 28 | { |
97 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 29 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
98 | 30 | ||
99 | return tick_dev_program_event(dev, expires, force); | 31 | return clockevents_program_event(dev, expires, force); |
100 | } | 32 | } |
101 | 33 | ||
102 | /** | 34 | /** |
@@ -104,11 +36,10 @@ int tick_program_event(ktime_t expires, int force) | |||
104 | */ | 36 | */ |
105 | void tick_resume_oneshot(void) | 37 | void tick_resume_oneshot(void) |
106 | { | 38 | { |
107 | struct tick_device *td = &__get_cpu_var(tick_cpu_device); | 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); |
108 | struct clock_event_device *dev = td->evtdev; | ||
109 | 40 | ||
110 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 41 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); |
111 | tick_program_event(ktime_get(), 1); | 42 | clockevents_program_event(dev, ktime_get(), true); |
112 | } | 43 | } |
113 | 44 | ||
114 | /** | 45 | /** |
@@ -120,7 +51,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
120 | { | 51 | { |
121 | newdev->event_handler = handler; | 52 | newdev->event_handler = handler; |
122 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 53 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); |
123 | tick_dev_program_event(newdev, next_event, 1); | 54 | clockevents_program_event(newdev, next_event, true); |
124 | } | 55 | } |
125 | 56 | ||
126 | /** | 57 | /** |
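
The min_delta escalation loop deleted from tick-oneshot.c (tick_dev_program_event() and tick_increase_min_delta()) is not gone: this series folds that policy into clockevents_program_event() itself, whose third argument changes from the current time to a bool force, as the call-site updates in tick-broadcast.c, tick-common.c and this file show. The real implementation lives in kernel/time/clockevents.c, which is outside this excerpt; the following is only a rough sketch of the new contract, reconstructed from the call sites, and details may differ:

    #include <linux/clockchips.h>
    #include <linux/hrtimer.h>
    #include <linux/kernel.h>
    #include <linux/errno.h>

    static int program_event_sketch(struct clock_event_device *dev,
                                    ktime_t expires, bool force)
    {
            int64_t delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
            unsigned long long clc;

            dev->next_event = expires;

            if (delta <= 0) {
                    if (!force)
                            return -ETIME;  /* unforced callers handle the miss */
                    /*
                     * Forced callers fall back to the minimum programmable
                     * delta; the real code also grows min_delta_ns on repeated
                     * failures, much like the removed tick_increase_min_delta().
                     */
                    delta = dev->min_delta_ns;
            }

            delta = min(delta, (int64_t) dev->max_delta_ns);
            delta = max(delta, (int64_t) dev->min_delta_ns);

            /* convert nanoseconds to device ticks and program the hardware */
            clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
            return dev->set_next_event((unsigned long) clc, dev);
    }

The periodic paths (tick_handle_periodic(), the periodic broadcast handler) pass force=false and simply advance "next" on failure, while the one-shot setup and resume paths pass force=true so the core retries internally.
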
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d5097c44b407..7656642e4b8e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now) | |||
139 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 139 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
140 | unsigned long flags; | 140 | unsigned long flags; |
141 | 141 | ||
142 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
143 | ts->idle_waketime = now; | 142 | ts->idle_waketime = now; |
144 | 143 | ||
145 | local_irq_save(flags); | 144 | local_irq_save(flags); |
@@ -159,9 +158,10 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda | |||
159 | 158 | ||
160 | if (ts->idle_active) { | 159 | if (ts->idle_active) { |
161 | delta = ktime_sub(now, ts->idle_entrytime); | 160 | delta = ktime_sub(now, ts->idle_entrytime); |
162 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
163 | if (nr_iowait_cpu(cpu) > 0) | 161 | if (nr_iowait_cpu(cpu) > 0) |
164 | ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); | 162 | ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); |
163 | else | ||
164 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
165 | ts->idle_entrytime = now; | 165 | ts->idle_entrytime = now; |
166 | } | 166 | } |
167 | 167 | ||
@@ -197,11 +197,11 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) | |||
197 | /** | 197 | /** |
198 | * get_cpu_idle_time_us - get the total idle time of a cpu | 198 | * get_cpu_idle_time_us - get the total idle time of a cpu |
199 | * @cpu: CPU number to query | 199 | * @cpu: CPU number to query |
200 | * @last_update_time: variable to store update time in | 200 | * @last_update_time: variable to store update time in. Do not update |
201 | * counters if NULL. | ||
201 | * | 202 | * |
202 | * Return the cumulative idle time (since boot) for a given | 203 | * Return the cumulative idle time (since boot) for a given |
203 | * CPU, in microseconds. The idle time returned includes | 204 | * CPU, in microseconds. |
204 | * the iowait time (unlike what "top" and co report). | ||
205 | * | 205 | * |
206 | * This time is measured via accounting rather than sampling, | 206 | * This time is measured via accounting rather than sampling, |
207 | * and is as accurate as ktime_get() is. | 207 | * and is as accurate as ktime_get() is. |
@@ -211,20 +211,35 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) | |||
211 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | 211 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) |
212 | { | 212 | { |
213 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 213 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
214 | ktime_t now, idle; | ||
214 | 215 | ||
215 | if (!tick_nohz_enabled) | 216 | if (!tick_nohz_enabled) |
216 | return -1; | 217 | return -1; |
217 | 218 | ||
218 | update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); | 219 | now = ktime_get(); |
220 | if (last_update_time) { | ||
221 | update_ts_time_stats(cpu, ts, now, last_update_time); | ||
222 | idle = ts->idle_sleeptime; | ||
223 | } else { | ||
224 | if (ts->idle_active && !nr_iowait_cpu(cpu)) { | ||
225 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); | ||
226 | |||
227 | idle = ktime_add(ts->idle_sleeptime, delta); | ||
228 | } else { | ||
229 | idle = ts->idle_sleeptime; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | return ktime_to_us(idle); | ||
219 | 234 | ||
220 | return ktime_to_us(ts->idle_sleeptime); | ||
221 | } | 235 | } |
222 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); | 236 | EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); |
223 | 237 | ||
224 | /* | 238 | /** |
225 | * get_cpu_iowait_time_us - get the total iowait time of a cpu | 239 | * get_cpu_iowait_time_us - get the total iowait time of a cpu |
226 | * @cpu: CPU number to query | 240 | * @cpu: CPU number to query |
227 | * @last_update_time: variable to store update time in | 241 | * @last_update_time: variable to store update time in. Do not update |
242 | * counters if NULL. | ||
228 | * | 243 | * |
229 | * Return the cumulative iowait time (since boot) for a given | 244 | * Return the cumulative iowait time (since boot) for a given |
230 | * CPU, in microseconds. | 245 | * CPU, in microseconds. |
@@ -237,52 +252,40 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); | |||
237 | u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) | 252 | u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) |
238 | { | 253 | { |
239 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 254 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
255 | ktime_t now, iowait; | ||
240 | 256 | ||
241 | if (!tick_nohz_enabled) | 257 | if (!tick_nohz_enabled) |
242 | return -1; | 258 | return -1; |
243 | 259 | ||
244 | update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); | 260 | now = ktime_get(); |
261 | if (last_update_time) { | ||
262 | update_ts_time_stats(cpu, ts, now, last_update_time); | ||
263 | iowait = ts->iowait_sleeptime; | ||
264 | } else { | ||
265 | if (ts->idle_active && nr_iowait_cpu(cpu) > 0) { | ||
266 | ktime_t delta = ktime_sub(now, ts->idle_entrytime); | ||
245 | 267 | ||
246 | return ktime_to_us(ts->iowait_sleeptime); | 268 | iowait = ktime_add(ts->iowait_sleeptime, delta); |
269 | } else { | ||
270 | iowait = ts->iowait_sleeptime; | ||
271 | } | ||
272 | } | ||
273 | |||
274 | return ktime_to_us(iowait); | ||
247 | } | 275 | } |
248 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); | 276 | EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); |
249 | 277 | ||
250 | /** | 278 | static void tick_nohz_stop_sched_tick(struct tick_sched *ts) |
251 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | ||
252 | * | ||
253 | * When the next event is more than a tick into the future, stop the idle tick | ||
254 | * Called either from the idle loop or from irq_exit() when an idle period was | ||
255 | * just interrupted by an interrupt which did not cause a reschedule. | ||
256 | */ | ||
257 | void tick_nohz_stop_sched_tick(int inidle) | ||
258 | { | 279 | { |
259 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 280 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; |
260 | struct tick_sched *ts; | ||
261 | ktime_t last_update, expires, now; | 281 | ktime_t last_update, expires, now; |
262 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 282 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
263 | u64 time_delta; | 283 | u64 time_delta; |
264 | int cpu; | 284 | int cpu; |
265 | 285 | ||
266 | local_irq_save(flags); | ||
267 | |||
268 | cpu = smp_processor_id(); | 286 | cpu = smp_processor_id(); |
269 | ts = &per_cpu(tick_cpu_sched, cpu); | 287 | ts = &per_cpu(tick_cpu_sched, cpu); |
270 | 288 | ||
271 | /* | ||
272 | * Call to tick_nohz_start_idle stops the last_update_time from being | ||
273 | * updated. Thus, it must not be called in the event we are called from | ||
274 | * irq_exit() with the prior state different than idle. | ||
275 | */ | ||
276 | if (!inidle && !ts->inidle) | ||
277 | goto end; | ||
278 | |||
279 | /* | ||
280 | * Set ts->inidle unconditionally. Even if the system did not | ||
281 | * switch to NOHZ mode the cpu frequency governers rely on the | ||
282 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
283 | */ | ||
284 | ts->inidle = 1; | ||
285 | |||
286 | now = tick_nohz_start_idle(cpu, ts); | 289 | now = tick_nohz_start_idle(cpu, ts); |
287 | 290 | ||
288 | /* | 291 | /* |
@@ -298,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
298 | } | 301 | } |
299 | 302 | ||
300 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) | 303 | if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) |
301 | goto end; | 304 | return; |
302 | 305 | ||
303 | if (need_resched()) | 306 | if (need_resched()) |
304 | goto end; | 307 | return; |
305 | 308 | ||
306 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | 309 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { |
307 | static int ratelimit; | 310 | static int ratelimit; |
@@ -311,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
311 | (unsigned int) local_softirq_pending()); | 314 | (unsigned int) local_softirq_pending()); |
312 | ratelimit++; | 315 | ratelimit++; |
313 | } | 316 | } |
314 | goto end; | 317 | return; |
315 | } | 318 | } |
316 | 319 | ||
317 | ts->idle_calls++; | 320 | ts->idle_calls++; |
@@ -389,9 +392,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
389 | else | 392 | else |
390 | expires.tv64 = KTIME_MAX; | 393 | expires.tv64 = KTIME_MAX; |
391 | 394 | ||
392 | if (delta_jiffies > 1) | ||
393 | cpumask_set_cpu(cpu, nohz_cpu_mask); | ||
394 | |||
395 | /* Skip reprogram of event if its not changed */ | 395 | /* Skip reprogram of event if its not changed */ |
396 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) | 396 | if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) |
397 | goto out; | 397 | goto out; |
@@ -409,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
409 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); | 409 | ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); |
410 | ts->tick_stopped = 1; | 410 | ts->tick_stopped = 1; |
411 | ts->idle_jiffies = last_jiffies; | 411 | ts->idle_jiffies = last_jiffies; |
412 | rcu_enter_nohz(); | ||
413 | } | 412 | } |
414 | 413 | ||
415 | ts->idle_sleeps++; | 414 | ts->idle_sleeps++; |
@@ -441,15 +440,70 @@ void tick_nohz_stop_sched_tick(int inidle) | |||
441 | * softirq. | 440 | * softirq. |
442 | */ | 441 | */ |
443 | tick_do_update_jiffies64(ktime_get()); | 442 | tick_do_update_jiffies64(ktime_get()); |
444 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
445 | } | 443 | } |
446 | raise_softirq_irqoff(TIMER_SOFTIRQ); | 444 | raise_softirq_irqoff(TIMER_SOFTIRQ); |
447 | out: | 445 | out: |
448 | ts->next_jiffies = next_jiffies; | 446 | ts->next_jiffies = next_jiffies; |
449 | ts->last_jiffies = last_jiffies; | 447 | ts->last_jiffies = last_jiffies; |
450 | ts->sleep_length = ktime_sub(dev->next_event, now); | 448 | ts->sleep_length = ktime_sub(dev->next_event, now); |
451 | end: | 449 | } |
452 | local_irq_restore(flags); | 450 | |
451 | /** | ||
452 | * tick_nohz_idle_enter - stop the idle tick from the idle task | ||
453 | * | ||
454 | * When the next event is more than a tick into the future, stop the idle tick | ||
455 | * Called when we start the idle loop. | ||
456 | * | ||
457 | * The arch is responsible for calling: | ||
458 | * | ||
459 | * - rcu_idle_enter() after its last use of RCU before the CPU is put | ||
460 | * to sleep. | ||
461 | * - rcu_idle_exit() before the first use of RCU after the CPU is woken up. | ||
462 | */ | ||
463 | void tick_nohz_idle_enter(void) | ||
464 | { | ||
465 | struct tick_sched *ts; | ||
466 | |||
467 | WARN_ON_ONCE(irqs_disabled()); | ||
468 | |||
469 | /* | ||
470 | * Update the idle state in the scheduler domain hierarchy | ||
471 | * when tick_nohz_stop_sched_tick() is called from the idle loop. | ||
472 | * State will be updated to busy during the first busy tick after | ||
473 | * exiting idle. | ||
474 | */ | ||
475 | set_cpu_sd_state_idle(); | ||
476 | |||
477 | local_irq_disable(); | ||
478 | |||
479 | ts = &__get_cpu_var(tick_cpu_sched); | ||
480 | /* | ||
481 | * set ts->inidle unconditionally. even if the system did not | ||
482 | * switch to nohz mode the cpu frequency governers rely on the | ||
483 | * update of the idle time accounting in tick_nohz_start_idle(). | ||
484 | */ | ||
485 | ts->inidle = 1; | ||
486 | tick_nohz_stop_sched_tick(ts); | ||
487 | |||
488 | local_irq_enable(); | ||
489 | } | ||
490 | |||
491 | /** | ||
492 | * tick_nohz_irq_exit - update next tick event from interrupt exit | ||
493 | * | ||
494 | * When an interrupt fires while we are idle and it doesn't cause | ||
495 | * a reschedule, it may still add, modify or delete a timer, enqueue | ||
496 | * an RCU callback, etc... | ||
497 | * So we need to re-calculate and reprogram the next tick event. | ||
498 | */ | ||
499 | void tick_nohz_irq_exit(void) | ||
500 | { | ||
501 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
502 | |||
503 | if (!ts->inidle) | ||
504 | return; | ||
505 | |||
506 | tick_nohz_stop_sched_tick(ts); | ||
453 | } | 507 | } |
454 | 508 | ||
455 | /** | 509 | /** |
@@ -491,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
491 | } | 545 | } |
492 | 546 | ||
493 | /** | 547 | /** |
494 | * tick_nohz_restart_sched_tick - restart the idle tick from the idle task | 548 | * tick_nohz_idle_exit - restart the idle tick from the idle task |
495 | * | 549 | * |
496 | * Restart the idle tick when the CPU is woken up from idle | 550 | * Restart the idle tick when the CPU is woken up from idle |
551 | * This also exits the RCU extended quiescent state. The CPU | ||
552 | * can use RCU again after this function is called. | ||
497 | */ | 553 | */ |
498 | void tick_nohz_restart_sched_tick(void) | 554 | void tick_nohz_idle_exit(void) |
499 | { | 555 | { |
500 | int cpu = smp_processor_id(); | 556 | int cpu = smp_processor_id(); |
501 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 557 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
@@ -505,6 +561,7 @@ void tick_nohz_restart_sched_tick(void) | |||
505 | ktime_t now; | 561 | ktime_t now; |
506 | 562 | ||
507 | local_irq_disable(); | 563 | local_irq_disable(); |
564 | |||
508 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) | 565 | if (ts->idle_active || (ts->inidle && ts->tick_stopped)) |
509 | now = ktime_get(); | 566 | now = ktime_get(); |
510 | 567 | ||
@@ -519,12 +576,9 @@ void tick_nohz_restart_sched_tick(void) | |||
519 | 576 | ||
520 | ts->inidle = 0; | 577 | ts->inidle = 0; |
521 | 578 | ||
522 | rcu_exit_nohz(); | ||
523 | |||
524 | /* Update jiffies first */ | 579 | /* Update jiffies first */ |
525 | select_nohz_load_balancer(0); | 580 | select_nohz_load_balancer(0); |
526 | tick_do_update_jiffies64(now); | 581 | tick_do_update_jiffies64(now); |
527 | cpumask_clear_cpu(cpu, nohz_cpu_mask); | ||
528 | 582 | ||
529 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 583 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
530 | /* | 584 | /* |
@@ -640,8 +694,6 @@ static void tick_nohz_switch_to_nohz(void) | |||
640 | next = ktime_add(next, tick_period); | 694 | next = ktime_add(next, tick_period); |
641 | } | 695 | } |
642 | local_irq_enable(); | 696 | local_irq_enable(); |
643 | |||
644 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
645 | } | 697 | } |
646 | 698 | ||
647 | /* | 699 | /* |
@@ -793,10 +845,8 @@ void tick_setup_sched_timer(void) | |||
793 | } | 845 | } |
794 | 846 | ||
795 | #ifdef CONFIG_NO_HZ | 847 | #ifdef CONFIG_NO_HZ |
796 | if (tick_nohz_enabled) { | 848 | if (tick_nohz_enabled) |
797 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 849 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
798 | printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); | ||
799 | } | ||
800 | #endif | 850 | #endif |
801 | } | 851 | } |
802 | #endif /* HIGH_RES_TIMERS */ | 852 | #endif /* HIGH_RES_TIMERS */ |
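
tick-sched.c splits the old tick_nohz_stop_sched_tick(inidle) entry point in two, tick_nohz_idle_enter() for the idle task and tick_nohz_irq_exit() for interrupt return, and renames tick_nohz_restart_sched_tick() to tick_nohz_idle_exit(). The rcu_enter_nohz()/rcu_exit_nohz() calls are dropped here because, as the new comment states, the architecture idle loop is now responsible for the RCU transitions. A hedged sketch of how such an idle loop is expected to string the new calls together; arch_cpu_sleep() is a made-up placeholder for the architecture's low-power wait, not a real interface:

    #include <linux/tick.h>
    #include <linux/rcupdate.h>
    #include <linux/sched.h>

    static void arch_cpu_sleep(void)
    {
            /* placeholder: wait for an interrupt in a low-power state */
    }

    static void cpu_idle_loop_sketch(void)
    {
            while (1) {
                    tick_nohz_idle_enter();   /* stop the tick if it is worth it */
                    rcu_idle_enter();         /* last RCU use before sleeping */

                    while (!need_resched())
                            arch_cpu_sleep();

                    rcu_idle_exit();          /* RCU is usable again */
                    tick_nohz_idle_exit();    /* restart the tick, catch up jiffies */

                    schedule();
            }
    }

Interrupts that arrive during the sleep and do not wake a task go through irq_exit(), which is where tick_nohz_irq_exit() recalculates and reprograms the next event, replacing the old direct call into tick_nohz_stop_sched_tick().
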
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..0c6358186401 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void) | |||
131 | /* calculate the delta since the last update_wall_time: */ | 131 | /* calculate the delta since the last update_wall_time: */ |
132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | 132 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; |
133 | 133 | ||
134 | /* return delta converted to nanoseconds using ntp adjusted mult. */ | 134 | /* return delta converted to nanoseconds. */ |
135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); | 135 | return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); |
136 | } | 136 | } |
137 | 137 | ||
@@ -249,6 +249,8 @@ ktime_t ktime_get(void) | |||
249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; | 249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; |
250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; | 250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; |
251 | nsecs += timekeeping_get_ns(); | 251 | nsecs += timekeeping_get_ns(); |
252 | /* If arch requires, add in gettimeoffset() */ | ||
253 | nsecs += arch_gettimeoffset(); | ||
252 | 254 | ||
253 | } while (read_seqretry(&xtime_lock, seq)); | 255 | } while (read_seqretry(&xtime_lock, seq)); |
254 | /* | 256 | /* |
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) | |||
280 | *ts = xtime; | 282 | *ts = xtime; |
281 | tomono = wall_to_monotonic; | 283 | tomono = wall_to_monotonic; |
282 | nsecs = timekeeping_get_ns(); | 284 | nsecs = timekeeping_get_ns(); |
285 | /* If arch requires, add in gettimeoffset() */ | ||
286 | nsecs += arch_gettimeoffset(); | ||
283 | 287 | ||
284 | } while (read_seqretry(&xtime_lock, seq)); | 288 | } while (read_seqretry(&xtime_lock, seq)); |
285 | 289 | ||
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset) | |||
802 | s64 error, interval = timekeeper.cycle_interval; | 806 | s64 error, interval = timekeeper.cycle_interval; |
803 | int adj; | 807 | int adj; |
804 | 808 | ||
809 | /* | ||
810 | * The point of this is to check if the error is greater than half | ||
811 | * an interval. | ||
812 | * | ||
813 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | ||
814 | * | ||
815 | * Note we subtract one in the shift, so that error is really error*2. | ||
816 | * This "saves" dividing(shifting) interval twice, but keeps the | ||
817 | * (error > interval) comparison as still measuring if error is | ||
818 | * larger than half an interval. | ||
819 | * | ||
820 | * Note: It does not "save" on aggravation when reading the code. | ||
821 | */ | ||
805 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); | 822 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); |
806 | if (error > interval) { | 823 | if (error > interval) { |
824 | /* | ||
825 | * We now divide error by 4 (via shift), which checks if | ||
826 | * the error is greater than twice the interval. | ||
827 | * If it is greater, we need a bigadjust; if it's smaller, | ||
828 | * we can adjust by 1. | ||
829 | */ | ||
807 | error >>= 2; | 830 | error >>= 2; |
831 | /* | ||
832 | * XXX - In update_wall_time, we round up to the next | ||
833 | * nanosecond, and store the amount rounded up into | ||
834 | * the error. This causes the likely below to be unlikely. | ||
835 | * | ||
836 | * The proper fix is to avoid rounding up by using | ||
837 | * the high precision timekeeper.xtime_nsec instead of | ||
838 | * xtime.tv_nsec everywhere. Fixing this will take some | ||
839 | * time. | ||
840 | */ | ||
808 | if (likely(error <= interval)) | 841 | if (likely(error <= interval)) |
809 | adj = 1; | 842 | adj = 1; |
810 | else | 843 | else |
811 | adj = timekeeping_bigadjust(error, &interval, &offset); | 844 | adj = timekeeping_bigadjust(error, &interval, &offset); |
812 | } else if (error < -interval) { | 845 | } else if (error < -interval) { |
846 | /* See comment above, this is just switched for the negative */ | ||
813 | error >>= 2; | 847 | error >>= 2; |
814 | if (likely(error >= -interval)) { | 848 | if (likely(error >= -interval)) { |
815 | adj = -1; | 849 | adj = -1; |
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset) | |||
817 | offset = -offset; | 851 | offset = -offset; |
818 | } else | 852 | } else |
819 | adj = timekeeping_bigadjust(error, &interval, &offset); | 853 | adj = timekeeping_bigadjust(error, &interval, &offset); |
820 | } else | 854 | } else /* No adjustment needed */ |
821 | return; | 855 | return; |
822 | 856 | ||
857 | WARN_ONCE(timekeeper.clock->maxadj && | ||
858 | (timekeeper.mult + adj > timekeeper.clock->mult + | ||
859 | timekeeper.clock->maxadj), | ||
860 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | ||
861 | timekeeper.clock->name, (long)timekeeper.mult + adj, | ||
862 | (long)timekeeper.clock->mult + | ||
863 | timekeeper.clock->maxadj); | ||
864 | /* | ||
865 | * So the following can be confusing. | ||
866 | * | ||
867 | * To keep things simple, let's assume adj == 1 for now. | ||
868 | * | ||
869 | * When adj != 1, remember that the interval and offset values | ||
870 | * have been appropriately scaled so the math is the same. | ||
871 | * | ||
872 | * The basic idea here is that we're increasing the multiplier | ||
873 | * by one; this causes the xtime_interval to be incremented by | ||
874 | * one cycle_interval. This is because: | ||
875 | * xtime_interval = cycle_interval * mult | ||
876 | * So if mult is being incremented by one: | ||
877 | * xtime_interval = cycle_interval * (mult + 1) | ||
878 | * It's the same as: | ||
879 | * xtime_interval = (cycle_interval * mult) + cycle_interval | ||
880 | * Which can be shortened to: | ||
881 | * xtime_interval += cycle_interval | ||
882 | * | ||
883 | * So offset stores the non-accumulated cycles. Thus the current | ||
884 | * time (in shifted nanoseconds) is: | ||
885 | * now = (offset * adj) + xtime_nsec | ||
886 | * Now, even though we're adjusting the clock frequency, we have | ||
887 | * to keep time consistent. In other words, we can't jump back | ||
888 | * in time, and we also want to avoid jumping forward in time. | ||
889 | * | ||
890 | * So given the same offset value, we need the time to be the same | ||
891 | * both before and after the freq adjustment. | ||
892 | * now = (offset * adj_1) + xtime_nsec_1 | ||
893 | * now = (offset * adj_2) + xtime_nsec_2 | ||
894 | * So: | ||
895 | * (offset * adj_1) + xtime_nsec_1 = | ||
896 | * (offset * adj_2) + xtime_nsec_2 | ||
897 | * And we know: | ||
898 | * adj_2 = adj_1 + 1 | ||
899 | * So: | ||
900 | * (offset * adj_1) + xtime_nsec_1 = | ||
901 | * (offset * (adj_1+1)) + xtime_nsec_2 | ||
902 | * (offset * adj_1) + xtime_nsec_1 = | ||
903 | * (offset * adj_1) + offset + xtime_nsec_2 | ||
904 | * Canceling the sides: | ||
905 | * xtime_nsec_1 = offset + xtime_nsec_2 | ||
906 | * Which gives us: | ||
907 | * xtime_nsec_2 = xtime_nsec_1 - offset | ||
908 | * Which simplifies to: | ||
909 | * xtime_nsec -= offset | ||
910 | * | ||
911 | * XXX - TODO: Doc ntp_error calculation. | ||
912 | */ | ||
823 | timekeeper.mult += adj; | 913 | timekeeper.mult += adj; |
824 | timekeeper.xtime_interval += interval; | 914 | timekeeper.xtime_interval += interval; |
825 | timekeeper.xtime_nsec -= offset; | 915 | timekeeper.xtime_nsec -= offset; |
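
The block comment added to timekeeping_adjust() derives that, for a given unaccumulated offset, bumping the multiplier while subtracting offset from xtime_nsec leaves the computed time unchanged (xtime_nsec_2 = xtime_nsec_1 - offset). A toy user-space check of that algebra, with made-up numbers and plain integers in place of the kernel's shifted fixed-point values:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t offset = 1000;            /* non-accumulated cycles (made up) */
            uint64_t adj1 = 5;                 /* current multiplier (made up)     */
            uint64_t xtime_nsec1 = 123456;

            uint64_t now_before = offset * adj1 + xtime_nsec1;

            /* frequency adjustment: mult += 1, xtime_nsec -= offset */
            uint64_t adj2 = adj1 + 1;
            uint64_t xtime_nsec2 = xtime_nsec1 - offset;

            uint64_t now_after = offset * adj2 + xtime_nsec2;

            assert(now_before == now_after);   /* no jump forward or backward */
            printf("now stays at %llu\n", (unsigned long long) now_before);
            return 0;
    }
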
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index a5d0a3a85dd8..0b537f27b559 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -81,7 +81,7 @@ struct entry { | |||
81 | /* | 81 | /* |
82 | * Spinlock protecting the tables - not taken during lookup: | 82 | * Spinlock protecting the tables - not taken during lookup: |
83 | */ | 83 | */ |
84 | static DEFINE_SPINLOCK(table_lock); | 84 | static DEFINE_RAW_SPINLOCK(table_lock); |
85 | 85 | ||
86 | /* | 86 | /* |
87 | * Per-CPU lookup locks for fast hash lookup: | 87 | * Per-CPU lookup locks for fast hash lookup: |
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) | |||
188 | prev = NULL; | 188 | prev = NULL; |
189 | curr = *head; | 189 | curr = *head; |
190 | 190 | ||
191 | spin_lock(&table_lock); | 191 | raw_spin_lock(&table_lock); |
192 | /* | 192 | /* |
193 | * Make sure we have not raced with another CPU: | 193 | * Make sure we have not raced with another CPU: |
194 | */ | 194 | */ |
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) | |||
215 | *head = curr; | 215 | *head = curr; |
216 | } | 216 | } |
217 | out_unlock: | 217 | out_unlock: |
218 | spin_unlock(&table_lock); | 218 | raw_spin_unlock(&table_lock); |
219 | 219 | ||
220 | return curr; | 220 | return curr; |
221 | } | 221 | } |
diff --git a/kernel/timer.c b/kernel/timer.c index 8cff36119e4d..a297ffcf888e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -20,7 +20,7 @@ | |||
20 | */ | 20 | */ |
21 | 21 | ||
22 | #include <linux/kernel_stat.h> | 22 | #include <linux/kernel_stat.h> |
23 | #include <linux/module.h> | 23 | #include <linux/export.h> |
24 | #include <linux/interrupt.h> | 24 | #include <linux/interrupt.h> |
25 | #include <linux/percpu.h> | 25 | #include <linux/percpu.h> |
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) | |||
427 | } | 427 | } |
428 | } | 428 | } |
429 | 429 | ||
430 | /* Stub timer callback for improperly used timers. */ | ||
431 | static void stub_timer(unsigned long data) | ||
432 | { | ||
433 | WARN_ON(1); | ||
434 | } | ||
435 | |||
430 | /* | 436 | /* |
431 | * fixup_activate is called when: | 437 | * fixup_activate is called when: |
432 | * - an active object is activated | 438 | * - an active object is activated |
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) | |||
450 | debug_object_activate(timer, &timer_debug_descr); | 456 | debug_object_activate(timer, &timer_debug_descr); |
451 | return 0; | 457 | return 0; |
452 | } else { | 458 | } else { |
453 | WARN_ON_ONCE(1); | 459 | setup_timer(timer, stub_timer, 0); |
460 | return 1; | ||
454 | } | 461 | } |
455 | return 0; | 462 | return 0; |
456 | 463 | ||
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) | |||
480 | } | 487 | } |
481 | } | 488 | } |
482 | 489 | ||
490 | /* | ||
491 | * fixup_assert_init is called when: | ||
492 | * - an untracked/uninit-ed object is found | ||
493 | */ | ||
494 | static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) | ||
495 | { | ||
496 | struct timer_list *timer = addr; | ||
497 | |||
498 | switch (state) { | ||
499 | case ODEBUG_STATE_NOTAVAILABLE: | ||
500 | if (timer->entry.prev == TIMER_ENTRY_STATIC) { | ||
501 | /* | ||
502 | * This is not really a fixup. The timer was | ||
503 | * statically initialized. We just make sure that it | ||
504 | * is tracked in the object tracker. | ||
505 | */ | ||
506 | debug_object_init(timer, &timer_debug_descr); | ||
507 | return 0; | ||
508 | } else { | ||
509 | setup_timer(timer, stub_timer, 0); | ||
510 | return 1; | ||
511 | } | ||
512 | default: | ||
513 | return 0; | ||
514 | } | ||
515 | } | ||
516 | |||
483 | static struct debug_obj_descr timer_debug_descr = { | 517 | static struct debug_obj_descr timer_debug_descr = { |
484 | .name = "timer_list", | 518 | .name = "timer_list", |
485 | .debug_hint = timer_debug_hint, | 519 | .debug_hint = timer_debug_hint, |
486 | .fixup_init = timer_fixup_init, | 520 | .fixup_init = timer_fixup_init, |
487 | .fixup_activate = timer_fixup_activate, | 521 | .fixup_activate = timer_fixup_activate, |
488 | .fixup_free = timer_fixup_free, | 522 | .fixup_free = timer_fixup_free, |
523 | .fixup_assert_init = timer_fixup_assert_init, | ||
489 | }; | 524 | }; |
490 | 525 | ||
491 | static inline void debug_timer_init(struct timer_list *timer) | 526 | static inline void debug_timer_init(struct timer_list *timer) |
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer) | |||
508 | debug_object_free(timer, &timer_debug_descr); | 543 | debug_object_free(timer, &timer_debug_descr); |
509 | } | 544 | } |
510 | 545 | ||
546 | static inline void debug_timer_assert_init(struct timer_list *timer) | ||
547 | { | ||
548 | debug_object_assert_init(timer, &timer_debug_descr); | ||
549 | } | ||
550 | |||
511 | static void __init_timer(struct timer_list *timer, | 551 | static void __init_timer(struct timer_list *timer, |
512 | const char *name, | 552 | const char *name, |
513 | struct lock_class_key *key); | 553 | struct lock_class_key *key); |
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack); | |||
531 | static inline void debug_timer_init(struct timer_list *timer) { } | 571 | static inline void debug_timer_init(struct timer_list *timer) { } |
532 | static inline void debug_timer_activate(struct timer_list *timer) { } | 572 | static inline void debug_timer_activate(struct timer_list *timer) { } |
533 | static inline void debug_timer_deactivate(struct timer_list *timer) { } | 573 | static inline void debug_timer_deactivate(struct timer_list *timer) { } |
574 | static inline void debug_timer_assert_init(struct timer_list *timer) { } | ||
534 | #endif | 575 | #endif |
535 | 576 | ||
536 | static inline void debug_init(struct timer_list *timer) | 577 | static inline void debug_init(struct timer_list *timer) |
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer) | |||
552 | trace_timer_cancel(timer); | 593 | trace_timer_cancel(timer); |
553 | } | 594 | } |
554 | 595 | ||
596 | static inline void debug_assert_init(struct timer_list *timer) | ||
597 | { | ||
598 | debug_timer_assert_init(timer); | ||
599 | } | ||
600 | |||
555 | static void __init_timer(struct timer_list *timer, | 601 | static void __init_timer(struct timer_list *timer, |
556 | const char *name, | 602 | const char *name, |
557 | struct lock_class_key *key) | 603 | struct lock_class_key *key) |
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer) | |||
902 | unsigned long flags; | 948 | unsigned long flags; |
903 | int ret = 0; | 949 | int ret = 0; |
904 | 950 | ||
951 | debug_assert_init(timer); | ||
952 | |||
905 | timer_stats_timer_clear_start_info(timer); | 953 | timer_stats_timer_clear_start_info(timer); |
906 | if (timer_pending(timer)) { | 954 | if (timer_pending(timer)) { |
907 | base = lock_timer_base(timer, &flags); | 955 | base = lock_timer_base(timer, &flags); |
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
932 | unsigned long flags; | 980 | unsigned long flags; |
933 | int ret = -1; | 981 | int ret = -1; |
934 | 982 | ||
983 | debug_assert_init(timer); | ||
984 | |||
935 | base = lock_timer_base(timer, &flags); | 985 | base = lock_timer_base(timer, &flags); |
936 | 986 | ||
937 | if (base->running_timer == timer) | 987 | if (base->running_timer == timer) |
@@ -1368,7 +1418,7 @@ SYSCALL_DEFINE0(getppid) | |||
1368 | int pid; | 1418 | int pid; |
1369 | 1419 | ||
1370 | rcu_read_lock(); | 1420 | rcu_read_lock(); |
1371 | pid = task_tgid_vnr(current->real_parent); | 1421 | pid = task_tgid_vnr(rcu_dereference(current->real_parent)); |
1372 | rcu_read_unlock(); | 1422 | rcu_read_unlock(); |
1373 | 1423 | ||
1374 | return pid; | 1424 | return pid; |
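
The timer.c debugobjects hunks above add a stub_timer() callback, let timer_fixup_activate() repair an uninitialized timer instead of only warning, and introduce a fixup_assert_init hook that del_timer() and try_to_del_timer_sync() now trigger via debug_assert_init(). A sketch of the misuse this is aimed at; it is illustrative only, struct foo is made up, and the checks are active only with CONFIG_DEBUG_OBJECTS_TIMERS:

    #include <linux/timer.h>
    #include <linux/slab.h>

    struct foo {
            struct timer_list timer;   /* never passed through setup_timer() */
    };

    static void debug_objects_timer_example(void)
    {
            struct foo *p = kmalloc(sizeof(*p), GFP_KERNEL);

            if (!p)
                    return;

            /*
             * Tearing down an object whose timer was never initialized:
             * del_timer() now calls debug_assert_init() first, and the new
             * timer_fixup_assert_init() hook re-initializes the stray timer
             * with stub_timer() instead of letting lock_timer_base() chase a
             * garbage base pointer.
             */
            del_timer(&p->timer);
            kfree(p);
    }
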
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..5f39a07fe5ea 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES | |||
15 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING | 15 | KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING |
16 | endif | 16 | endif |
17 | 17 | ||
18 | CFLAGS_trace_events_filter.o := -I$(src) | ||
19 | |||
18 | # | 20 | # |
19 | # Make the trace clocks available generally: it's infrastructure | 21 | # Make the trace clocks available generally: it's infrastructure |
20 | # relied on by ptrace for example: | 22 | # relied on by ptrace for example: |
@@ -53,6 +55,9 @@ endif | |||
53 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
54 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o |
55 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o |
58 | ifeq ($(CONFIG_PM_RUNTIME),y) | ||
59 | obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o | ||
60 | endif | ||
56 | ifeq ($(CONFIG_TRACING),y) | 61 | ifeq ($(CONFIG_TRACING),y) |
57 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o | 62 | obj-$(CONFIG_KGDB_KDB) += trace_kdb.o |
58 | endif | 63 | endif |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7c910a5593a6..cdea7b56b0c9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mutex.h> | 23 | #include <linux/mutex.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/debugfs.h> | 25 | #include <linux/debugfs.h> |
26 | #include <linux/export.h> | ||
26 | #include <linux/time.h> | 27 | #include <linux/time.h> |
27 | #include <linux/uaccess.h> | 28 | #include <linux/uaccess.h> |
28 | 29 | ||
@@ -401,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry) | |||
401 | 402 | ||
402 | static struct dentry *blk_create_buf_file_callback(const char *filename, | 403 | static struct dentry *blk_create_buf_file_callback(const char *filename, |
403 | struct dentry *parent, | 404 | struct dentry *parent, |
404 | int mode, | 405 | umode_t mode, |
405 | struct rchan_buf *buf, | 406 | struct rchan_buf *buf, |
406 | int *is_global) | 407 | int *is_global) |
407 | { | 408 | { |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c3e4575e7829..b1e8943fed1d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> |
24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> |
25 | #include <linux/module.h> | ||
25 | #include <linux/ftrace.h> | 26 | #include <linux/ftrace.h> |
26 | #include <linux/sysctl.h> | 27 | #include <linux/sysctl.h> |
27 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
@@ -151,7 +152,6 @@ void clear_ftrace_function(void) | |||
151 | ftrace_pid_function = ftrace_stub; | 152 | ftrace_pid_function = ftrace_stub; |
152 | } | 153 | } |
153 | 154 | ||
154 | #undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
155 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 155 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
156 | /* | 156 | /* |
157 | * For those archs that do not test ftrace_trace_stop in their | 157 | * For those archs that do not test ftrace_trace_stop in their |
@@ -1211,7 +1211,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, | |||
1211 | if (!src->count) { | 1211 | if (!src->count) { |
1212 | free_ftrace_hash_rcu(*dst); | 1212 | free_ftrace_hash_rcu(*dst); |
1213 | rcu_assign_pointer(*dst, EMPTY_HASH); | 1213 | rcu_assign_pointer(*dst, EMPTY_HASH); |
1214 | return 0; | 1214 | /* still need to update the function records */ |
1215 | ret = 0; | ||
1216 | goto out; | ||
1215 | } | 1217 | } |
1216 | 1218 | ||
1217 | /* | 1219 | /* |
@@ -3863,6 +3865,14 @@ void ftrace_kill(void) | |||
3863 | } | 3865 | } |
3864 | 3866 | ||
3865 | /** | 3867 | /** |
3868 | * Test if ftrace is dead or not. | ||
3869 | */ | ||
3870 | int ftrace_is_dead(void) | ||
3871 | { | ||
3872 | return ftrace_disabled; | ||
3873 | } | ||
3874 | |||
3875 | /** | ||
3866 | * register_ftrace_function - register a function for profiling | 3876 | * register_ftrace_function - register a function for profiling |
3867 | * @ops - ops structure that holds the function for profiling. | 3877 | * @ops - ops structure that holds the function for profiling. |
3868 | * | 3878 | * |
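
ftrace.c gains a trivial ftrace_is_dead() accessor for ftrace_disabled, so other tracing code can tell when ftrace has shut itself down (via ftrace_kill() or an FTRACE_WARN_ON trip). The real consumer lives in kernel/trace/trace.c, outside this excerpt; the function name and messages below are therefore only an illustrative sketch of the intended use:

    #include <linux/ftrace.h>
    #include <linux/seq_file.h>

    static void print_ftrace_health(struct seq_file *m)
    {
            /* flag a dead ftrace prominently in the trace output header */
            if (ftrace_is_dead()) {
                    seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
                    seq_printf(m, "#          MAY BE MISSING FUNCTION EVENTS\n");
            }
    }
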
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..f5b7b5c1195b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu { | |||
478 | int cpu; | 478 | int cpu; |
479 | atomic_t record_disabled; | 479 | atomic_t record_disabled; |
480 | struct ring_buffer *buffer; | 480 | struct ring_buffer *buffer; |
481 | spinlock_t reader_lock; /* serialize readers */ | 481 | raw_spinlock_t reader_lock; /* serialize readers */ |
482 | arch_spinlock_t lock; | 482 | arch_spinlock_t lock; |
483 | struct lock_class_key lock_key; | 483 | struct lock_class_key lock_key; |
484 | struct list_head *pages; | 484 | struct list_head *pages; |
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu { | |||
488 | struct buffer_page *reader_page; | 488 | struct buffer_page *reader_page; |
489 | unsigned long lost_events; | 489 | unsigned long lost_events; |
490 | unsigned long last_overrun; | 490 | unsigned long last_overrun; |
491 | local_t entries_bytes; | ||
491 | local_t commit_overrun; | 492 | local_t commit_overrun; |
492 | local_t overrun; | 493 | local_t overrun; |
493 | local_t entries; | 494 | local_t entries; |
494 | local_t committing; | 495 | local_t committing; |
495 | local_t commits; | 496 | local_t commits; |
496 | unsigned long read; | 497 | unsigned long read; |
498 | unsigned long read_bytes; | ||
497 | u64 write_stamp; | 499 | u64 write_stamp; |
498 | u64 read_stamp; | 500 | u64 read_stamp; |
499 | }; | 501 | }; |
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
1062 | 1064 | ||
1063 | cpu_buffer->cpu = cpu; | 1065 | cpu_buffer->cpu = cpu; |
1064 | cpu_buffer->buffer = buffer; | 1066 | cpu_buffer->buffer = buffer; |
1065 | spin_lock_init(&cpu_buffer->reader_lock); | 1067 | raw_spin_lock_init(&cpu_buffer->reader_lock); |
1066 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 1068 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
1067 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; | 1069 | cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; |
1068 | 1070 | ||
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
1259 | struct list_head *p; | 1261 | struct list_head *p; |
1260 | unsigned i; | 1262 | unsigned i; |
1261 | 1263 | ||
1262 | spin_lock_irq(&cpu_buffer->reader_lock); | 1264 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1263 | rb_head_page_deactivate(cpu_buffer); | 1265 | rb_head_page_deactivate(cpu_buffer); |
1264 | 1266 | ||
1265 | for (i = 0; i < nr_pages; i++) { | 1267 | for (i = 0; i < nr_pages; i++) { |
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
1277 | rb_check_pages(cpu_buffer); | 1279 | rb_check_pages(cpu_buffer); |
1278 | 1280 | ||
1279 | out: | 1281 | out: |
1280 | spin_unlock_irq(&cpu_buffer->reader_lock); | 1282 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1281 | } | 1283 | } |
1282 | 1284 | ||
1283 | static void | 1285 | static void |
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1288 | struct list_head *p; | 1290 | struct list_head *p; |
1289 | unsigned i; | 1291 | unsigned i; |
1290 | 1292 | ||
1291 | spin_lock_irq(&cpu_buffer->reader_lock); | 1293 | raw_spin_lock_irq(&cpu_buffer->reader_lock); |
1292 | rb_head_page_deactivate(cpu_buffer); | 1294 | rb_head_page_deactivate(cpu_buffer); |
1293 | 1295 | ||
1294 | for (i = 0; i < nr_pages; i++) { | 1296 | for (i = 0; i < nr_pages; i++) { |
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
1303 | rb_check_pages(cpu_buffer); | 1305 | rb_check_pages(cpu_buffer); |
1304 | 1306 | ||
1305 | out: | 1307 | out: |
1306 | spin_unlock_irq(&cpu_buffer->reader_lock); | 1308 | raw_spin_unlock_irq(&cpu_buffer->reader_lock); |
1307 | } | 1309 | } |
1308 | 1310 | ||
1309 | /** | 1311 | /** |
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, | |||
1708 | * the counters. | 1710 | * the counters. |
1709 | */ | 1711 | */ |
1710 | local_add(entries, &cpu_buffer->overrun); | 1712 | local_add(entries, &cpu_buffer->overrun); |
1713 | local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); | ||
1711 | 1714 | ||
1712 | /* | 1715 | /* |
1713 | * The entries will be zeroed out when we move the | 1716 | * The entries will be zeroed out when we move the |
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1863 | event = __rb_page_index(tail_page, tail); | 1866 | event = __rb_page_index(tail_page, tail); |
1864 | kmemcheck_annotate_bitfield(event, bitfield); | 1867 | kmemcheck_annotate_bitfield(event, bitfield); |
1865 | 1868 | ||
1869 | /* account for padding bytes */ | ||
1870 | local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); | ||
1871 | |||
1866 | /* | 1872 | /* |
1867 | * Save the original length to the meta data. | 1873 | * Save the original length to the meta data. |
1868 | * This will be used by the reader to add lost event | 1874 | * This will be used by the reader to add lost event |
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
2054 | if (!tail) | 2060 | if (!tail) |
2055 | tail_page->page->time_stamp = ts; | 2061 | tail_page->page->time_stamp = ts; |
2056 | 2062 | ||
2063 | /* account for these added bytes */ | ||
2064 | local_add(length, &cpu_buffer->entries_bytes); | ||
2065 | |||
2057 | return event; | 2066 | return event; |
2058 | } | 2067 | } |
2059 | 2068 | ||
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
2076 | if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { | 2085 | if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { |
2077 | unsigned long write_mask = | 2086 | unsigned long write_mask = |
2078 | local_read(&bpage->write) & ~RB_WRITE_MASK; | 2087 | local_read(&bpage->write) & ~RB_WRITE_MASK; |
2088 | unsigned long event_length = rb_event_length(event); | ||
2079 | /* | 2089 | /* |
2080 | * This is on the tail page. It is possible that | 2090 | * This is on the tail page. It is possible that |
2081 | * a write could come in and move the tail page | 2091 | * a write could come in and move the tail page |
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
2085 | old_index += write_mask; | 2095 | old_index += write_mask; |
2086 | new_index += write_mask; | 2096 | new_index += write_mask; |
2087 | index = local_cmpxchg(&bpage->write, old_index, new_index); | 2097 | index = local_cmpxchg(&bpage->write, old_index, new_index); |
2088 | if (index == old_index) | 2098 | if (index == old_index) { |
2099 | /* update counters */ | ||
2100 | local_sub(event_length, &cpu_buffer->entries_bytes); | ||
2089 | return 1; | 2101 | return 1; |
2102 | } | ||
2090 | } | 2103 | } |
2091 | 2104 | ||
2092 | /* could not discard */ | 2105 | /* could not discard */ |
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) | |||
2661 | } | 2674 | } |
2662 | 2675 | ||
2663 | /** | 2676 | /** |
2677 | * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer | ||
2678 | * @buffer: The ring buffer | ||
2679 | * @cpu: The per CPU buffer to read from. | ||
2680 | */ | ||
2681 | unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) | ||
2682 | { | ||
2683 | unsigned long flags; | ||
2684 | struct ring_buffer_per_cpu *cpu_buffer; | ||
2685 | struct buffer_page *bpage; | ||
2686 | unsigned long ret; | ||
2687 | |||
2688 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
2689 | return 0; | ||
2690 | |||
2691 | cpu_buffer = buffer->buffers[cpu]; | ||
2692 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | ||
2693 | /* | ||
2694 | * if the tail is on reader_page, oldest time stamp is on the reader | ||
2695 | * page | ||
2696 | */ | ||
2697 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) | ||
2698 | bpage = cpu_buffer->reader_page; | ||
2699 | else | ||
2700 | bpage = rb_set_head_page(cpu_buffer); | ||
2701 | ret = bpage->page->time_stamp; | ||
2702 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | ||
2703 | |||
2704 | return ret; | ||
2705 | } | ||
2706 | EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); | ||
2707 | |||
2708 | /** | ||
2709 | * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer | ||
2710 | * @buffer: The ring buffer | ||
2711 | * @cpu: The per CPU buffer to read from. | ||
2712 | */ | ||
2713 | unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) | ||
2714 | { | ||
2715 | struct ring_buffer_per_cpu *cpu_buffer; | ||
2716 | unsigned long ret; | ||
2717 | |||
2718 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
2719 | return 0; | ||
2720 | |||
2721 | cpu_buffer = buffer->buffers[cpu]; | ||
2722 | ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; | ||
2723 | |||
2724 | return ret; | ||
2725 | } | ||
2726 | EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); | ||
2727 | |||
2728 | /** | ||
2664 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer | 2729 | * ring_buffer_entries_cpu - get the number of entries in a cpu buffer |
2665 | * @buffer: The ring buffer | 2730 | * @buffer: The ring buffer |
2666 | * @cpu: The per CPU buffer to get the entries from. | 2731 | * @cpu: The per CPU buffer to get the entries from. |
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) | |||
2804 | 2869 | ||
2805 | cpu_buffer = iter->cpu_buffer; | 2870 | cpu_buffer = iter->cpu_buffer; |
2806 | 2871 | ||
2807 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 2872 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
2808 | rb_iter_reset(iter); | 2873 | rb_iter_reset(iter); |
2809 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 2874 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
2810 | } | 2875 | } |
2811 | EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); | 2876 | EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); |
2812 | 2877 | ||
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, | |||
3265 | again: | 3330 | again: |
3266 | local_irq_save(flags); | 3331 | local_irq_save(flags); |
3267 | if (dolock) | 3332 | if (dolock) |
3268 | spin_lock(&cpu_buffer->reader_lock); | 3333 | raw_spin_lock(&cpu_buffer->reader_lock); |
3269 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); | 3334 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); |
3270 | if (event && event->type_len == RINGBUF_TYPE_PADDING) | 3335 | if (event && event->type_len == RINGBUF_TYPE_PADDING) |
3271 | rb_advance_reader(cpu_buffer); | 3336 | rb_advance_reader(cpu_buffer); |
3272 | if (dolock) | 3337 | if (dolock) |
3273 | spin_unlock(&cpu_buffer->reader_lock); | 3338 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3274 | local_irq_restore(flags); | 3339 | local_irq_restore(flags); |
3275 | 3340 | ||
3276 | if (event && event->type_len == RINGBUF_TYPE_PADDING) | 3341 | if (event && event->type_len == RINGBUF_TYPE_PADDING) |
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) | |||
3295 | unsigned long flags; | 3360 | unsigned long flags; |
3296 | 3361 | ||
3297 | again: | 3362 | again: |
3298 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3363 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3299 | event = rb_iter_peek(iter, ts); | 3364 | event = rb_iter_peek(iter, ts); |
3300 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3365 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3301 | 3366 | ||
3302 | if (event && event->type_len == RINGBUF_TYPE_PADDING) | 3367 | if (event && event->type_len == RINGBUF_TYPE_PADDING) |
3303 | goto again; | 3368 | goto again; |
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, | |||
3337 | cpu_buffer = buffer->buffers[cpu]; | 3402 | cpu_buffer = buffer->buffers[cpu]; |
3338 | local_irq_save(flags); | 3403 | local_irq_save(flags); |
3339 | if (dolock) | 3404 | if (dolock) |
3340 | spin_lock(&cpu_buffer->reader_lock); | 3405 | raw_spin_lock(&cpu_buffer->reader_lock); |
3341 | 3406 | ||
3342 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); | 3407 | event = rb_buffer_peek(cpu_buffer, ts, lost_events); |
3343 | if (event) { | 3408 | if (event) { |
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, | |||
3346 | } | 3411 | } |
3347 | 3412 | ||
3348 | if (dolock) | 3413 | if (dolock) |
3349 | spin_unlock(&cpu_buffer->reader_lock); | 3414 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3350 | local_irq_restore(flags); | 3415 | local_irq_restore(flags); |
3351 | 3416 | ||
3352 | out: | 3417 | out: |
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) | |||
3438 | 3503 | ||
3439 | cpu_buffer = iter->cpu_buffer; | 3504 | cpu_buffer = iter->cpu_buffer; |
3440 | 3505 | ||
3441 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3506 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3442 | arch_spin_lock(&cpu_buffer->lock); | 3507 | arch_spin_lock(&cpu_buffer->lock); |
3443 | rb_iter_reset(iter); | 3508 | rb_iter_reset(iter); |
3444 | arch_spin_unlock(&cpu_buffer->lock); | 3509 | arch_spin_unlock(&cpu_buffer->lock); |
3445 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3510 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3446 | } | 3511 | } |
3447 | EXPORT_SYMBOL_GPL(ring_buffer_read_start); | 3512 | EXPORT_SYMBOL_GPL(ring_buffer_read_start); |
3448 | 3513 | ||
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) | |||
3477 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; | 3542 | struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; |
3478 | unsigned long flags; | 3543 | unsigned long flags; |
3479 | 3544 | ||
3480 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3545 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3481 | again: | 3546 | again: |
3482 | event = rb_iter_peek(iter, ts); | 3547 | event = rb_iter_peek(iter, ts); |
3483 | if (!event) | 3548 | if (!event) |
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) | |||
3488 | 3553 | ||
3489 | rb_advance_iter(iter); | 3554 | rb_advance_iter(iter); |
3490 | out: | 3555 | out: |
3491 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3556 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3492 | 3557 | ||
3493 | return event; | 3558 | return event; |
3494 | } | 3559 | } |
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
3527 | cpu_buffer->reader_page->read = 0; | 3592 | cpu_buffer->reader_page->read = 0; |
3528 | 3593 | ||
3529 | local_set(&cpu_buffer->commit_overrun, 0); | 3594 | local_set(&cpu_buffer->commit_overrun, 0); |
3595 | local_set(&cpu_buffer->entries_bytes, 0); | ||
3530 | local_set(&cpu_buffer->overrun, 0); | 3596 | local_set(&cpu_buffer->overrun, 0); |
3531 | local_set(&cpu_buffer->entries, 0); | 3597 | local_set(&cpu_buffer->entries, 0); |
3532 | local_set(&cpu_buffer->committing, 0); | 3598 | local_set(&cpu_buffer->committing, 0); |
3533 | local_set(&cpu_buffer->commits, 0); | 3599 | local_set(&cpu_buffer->commits, 0); |
3534 | cpu_buffer->read = 0; | 3600 | cpu_buffer->read = 0; |
3601 | cpu_buffer->read_bytes = 0; | ||
3535 | 3602 | ||
3536 | cpu_buffer->write_stamp = 0; | 3603 | cpu_buffer->write_stamp = 0; |
3537 | cpu_buffer->read_stamp = 0; | 3604 | cpu_buffer->read_stamp = 0; |
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3557 | 3624 | ||
3558 | atomic_inc(&cpu_buffer->record_disabled); | 3625 | atomic_inc(&cpu_buffer->record_disabled); |
3559 | 3626 | ||
3560 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3627 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3561 | 3628 | ||
3562 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) | 3629 | if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) |
3563 | goto out; | 3630 | goto out; |
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) | |||
3569 | arch_spin_unlock(&cpu_buffer->lock); | 3636 | arch_spin_unlock(&cpu_buffer->lock); |
3570 | 3637 | ||
3571 | out: | 3638 | out: |
3572 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 3639 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3573 | 3640 | ||
3574 | atomic_dec(&cpu_buffer->record_disabled); | 3641 | atomic_dec(&cpu_buffer->record_disabled); |
3575 | } | 3642 | } |
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) | |||
3607 | cpu_buffer = buffer->buffers[cpu]; | 3674 | cpu_buffer = buffer->buffers[cpu]; |
3608 | local_irq_save(flags); | 3675 | local_irq_save(flags); |
3609 | if (dolock) | 3676 | if (dolock) |
3610 | spin_lock(&cpu_buffer->reader_lock); | 3677 | raw_spin_lock(&cpu_buffer->reader_lock); |
3611 | ret = rb_per_cpu_empty(cpu_buffer); | 3678 | ret = rb_per_cpu_empty(cpu_buffer); |
3612 | if (dolock) | 3679 | if (dolock) |
3613 | spin_unlock(&cpu_buffer->reader_lock); | 3680 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3614 | local_irq_restore(flags); | 3681 | local_irq_restore(flags); |
3615 | 3682 | ||
3616 | if (!ret) | 3683 | if (!ret) |
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) | |||
3641 | cpu_buffer = buffer->buffers[cpu]; | 3708 | cpu_buffer = buffer->buffers[cpu]; |
3642 | local_irq_save(flags); | 3709 | local_irq_save(flags); |
3643 | if (dolock) | 3710 | if (dolock) |
3644 | spin_lock(&cpu_buffer->reader_lock); | 3711 | raw_spin_lock(&cpu_buffer->reader_lock); |
3645 | ret = rb_per_cpu_empty(cpu_buffer); | 3712 | ret = rb_per_cpu_empty(cpu_buffer); |
3646 | if (dolock) | 3713 | if (dolock) |
3647 | spin_unlock(&cpu_buffer->reader_lock); | 3714 | raw_spin_unlock(&cpu_buffer->reader_lock); |
3648 | local_irq_restore(flags); | 3715 | local_irq_restore(flags); |
3649 | 3716 | ||
3650 | return ret; | 3717 | return ret; |
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3841 | if (!bpage) | 3908 | if (!bpage) |
3842 | goto out; | 3909 | goto out; |
3843 | 3910 | ||
3844 | spin_lock_irqsave(&cpu_buffer->reader_lock, flags); | 3911 | raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); |
3845 | 3912 | ||
3846 | reader = rb_get_reader_page(cpu_buffer); | 3913 | reader = rb_get_reader_page(cpu_buffer); |
3847 | if (!reader) | 3914 | if (!reader) |
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3918 | } else { | 3985 | } else { |
3919 | /* update the entry counter */ | 3986 | /* update the entry counter */ |
3920 | cpu_buffer->read += rb_page_entries(reader); | 3987 | cpu_buffer->read += rb_page_entries(reader); |
3988 | cpu_buffer->read_bytes += BUF_PAGE_SIZE; | ||
3921 | 3989 | ||
3922 | /* swap the pages */ | 3990 | /* swap the pages */ |
3923 | rb_init_page(bpage); | 3991 | rb_init_page(bpage); |
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3964 | memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); | 4032 | memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); |
3965 | 4033 | ||
3966 | out_unlock: | 4034 | out_unlock: |
3967 | spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); | 4035 | raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); |
3968 | 4036 | ||
3969 | out: | 4037 | out: |
3970 | return ret; | 4038 | return ret; |
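
The ring_buffer.c hunks above add per-cpu byte accounting (entries_bytes/read_bytes) and two new accessors, ring_buffer_bytes_cpu() and ring_buffer_oldest_event_ts(). A minimal sketch of a consumer, assuming only the prototypes introduced in this diff; report_cpu_buffer_stats() itself is a hypothetical helper, not part of the patch:

/*
 * Hypothetical helper (not in the patch): report the per-cpu statistics
 * exported by the new ring buffer accessors.
 */
#include <linux/ring_buffer.h>
#include <linux/printk.h>

static void report_cpu_buffer_stats(struct ring_buffer *buffer, int cpu)
{
	/* bytes written to this CPU's buffer that have not been read yet */
	unsigned long pending = ring_buffer_bytes_cpu(buffer, cpu);
	/* raw buffer timestamp of the oldest event still held in the buffer */
	unsigned long oldest = ring_buffer_oldest_event_ts(buffer, cpu);

	pr_info("cpu%d: %lu bytes pending, oldest event ts %lu\n",
		cpu, pending, oldest);
}

The same pair of calls is what tracing_stats_read() in trace.c (further down in this diff) uses to fill the new "bytes:" and "oldest event ts:" lines of the per_cpu stats file, converting the timestamp with ns2usecs().
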
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c new file mode 100644 index 000000000000..4b3b5eaf94d1 --- /dev/null +++ b/kernel/trace/rpm-traces.c | |||
@@ -0,0 +1,20 @@ | |||
1 | /* | ||
2 | * Power trace points | ||
3 | * | ||
4 | * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/string.h> | ||
8 | #include <linux/types.h> | ||
9 | #include <linux/workqueue.h> | ||
10 | #include <linux/sched.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/usb.h> | ||
13 | |||
14 | #define CREATE_TRACE_POINTS | ||
15 | #include <trace/events/rpm.h> | ||
16 | |||
17 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); | ||
18 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); | ||
19 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); | ||
20 | EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); | ||
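
rpm-traces.c only compiles and exports the runtime-PM tracepoints; the call sites live in the PM core (drivers/base/power/runtime.c). The fragment below is a rough sketch of what such a call site is expected to look like. The trace_rpm_* argument lists come from include/trace/events/rpm.h, which this diff does not show, so treat them as assumptions:

#include <linux/device.h>
#include <linux/kernel.h>
#include <trace/events/rpm.h>

/* Hypothetical call site sketch; not part of this patch. */
static int rpm_suspend_traced(struct device *dev, int rpmflags)
{
	int retval;

	trace_rpm_suspend(dev, rpmflags);	/* assumed (dev, flags) prototype */

	retval = 0;	/* the actual suspend work would happen here */

	trace_rpm_return_int(dev, _THIS_IP_, retval);	/* assumed (dev, ip, ret) prototype */
	return retval;
}
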
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..a3f1bc5d2a00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -338,10 +338,11 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
338 | /* trace_flags holds trace_options default values */ | 338 | /* trace_flags holds trace_options default values */ |
339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 339 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 340 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; | 341 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | |
342 | TRACE_ITER_IRQ_INFO; | ||
342 | 343 | ||
343 | static int trace_stop_count; | 344 | static int trace_stop_count; |
344 | static DEFINE_SPINLOCK(tracing_start_lock); | 345 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
345 | 346 | ||
346 | static void wakeup_work_handler(struct work_struct *work) | 347 | static void wakeup_work_handler(struct work_struct *work) |
347 | { | 348 | { |
@@ -426,6 +427,7 @@ static const char *trace_options[] = { | |||
426 | "record-cmd", | 427 | "record-cmd", |
427 | "overwrite", | 428 | "overwrite", |
428 | "disable_on_free", | 429 | "disable_on_free", |
430 | "irq-info", | ||
429 | NULL | 431 | NULL |
430 | }; | 432 | }; |
431 | 433 | ||
@@ -435,6 +437,7 @@ static struct { | |||
435 | } trace_clocks[] = { | 437 | } trace_clocks[] = { |
436 | { trace_clock_local, "local" }, | 438 | { trace_clock_local, "local" }, |
437 | { trace_clock_global, "global" }, | 439 | { trace_clock_global, "global" }, |
440 | { trace_clock_counter, "counter" }, | ||
438 | }; | 441 | }; |
439 | 442 | ||
440 | int trace_clock_id; | 443 | int trace_clock_id; |
@@ -960,7 +963,7 @@ void tracing_start(void) | |||
960 | if (tracing_disabled) | 963 | if (tracing_disabled) |
961 | return; | 964 | return; |
962 | 965 | ||
963 | spin_lock_irqsave(&tracing_start_lock, flags); | 966 | raw_spin_lock_irqsave(&tracing_start_lock, flags); |
964 | if (--trace_stop_count) { | 967 | if (--trace_stop_count) { |
965 | if (trace_stop_count < 0) { | 968 | if (trace_stop_count < 0) { |
966 | /* Someone screwed up their debugging */ | 969 | /* Someone screwed up their debugging */ |
@@ -985,7 +988,7 @@ void tracing_start(void) | |||
985 | 988 | ||
986 | ftrace_start(); | 989 | ftrace_start(); |
987 | out: | 990 | out: |
988 | spin_unlock_irqrestore(&tracing_start_lock, flags); | 991 | raw_spin_unlock_irqrestore(&tracing_start_lock, flags); |
989 | } | 992 | } |
990 | 993 | ||
991 | /** | 994 | /** |
@@ -1000,7 +1003,7 @@ void tracing_stop(void) | |||
1000 | unsigned long flags; | 1003 | unsigned long flags; |
1001 | 1004 | ||
1002 | ftrace_stop(); | 1005 | ftrace_stop(); |
1003 | spin_lock_irqsave(&tracing_start_lock, flags); | 1006 | raw_spin_lock_irqsave(&tracing_start_lock, flags); |
1004 | if (trace_stop_count++) | 1007 | if (trace_stop_count++) |
1005 | goto out; | 1008 | goto out; |
1006 | 1009 | ||
@@ -1018,7 +1021,7 @@ void tracing_stop(void) | |||
1018 | arch_spin_unlock(&ftrace_max_lock); | 1021 | arch_spin_unlock(&ftrace_max_lock); |
1019 | 1022 | ||
1020 | out: | 1023 | out: |
1021 | spin_unlock_irqrestore(&tracing_start_lock, flags); | 1024 | raw_spin_unlock_irqrestore(&tracing_start_lock, flags); |
1022 | } | 1025 | } |
1023 | 1026 | ||
1024 | void trace_stop_cmdline_recording(void); | 1027 | void trace_stop_cmdline_recording(void); |
@@ -1842,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p) | |||
1842 | trace_event_read_unlock(); | 1845 | trace_event_read_unlock(); |
1843 | } | 1846 | } |
1844 | 1847 | ||
1848 | static void | ||
1849 | get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) | ||
1850 | { | ||
1851 | unsigned long count; | ||
1852 | int cpu; | ||
1853 | |||
1854 | *total = 0; | ||
1855 | *entries = 0; | ||
1856 | |||
1857 | for_each_tracing_cpu(cpu) { | ||
1858 | count = ring_buffer_entries_cpu(tr->buffer, cpu); | ||
1859 | /* | ||
1860 | * If this buffer has skipped entries, then we hold all | ||
1861 | * entries for the trace and we need to ignore the | ||
1862 | * ones before the time stamp. | ||
1863 | */ | ||
1864 | if (tr->data[cpu]->skipped_entries) { | ||
1865 | count -= tr->data[cpu]->skipped_entries; | ||
1866 | /* total is the same as the entries */ | ||
1867 | *total += count; | ||
1868 | } else | ||
1869 | *total += count + | ||
1870 | ring_buffer_overrun_cpu(tr->buffer, cpu); | ||
1871 | *entries += count; | ||
1872 | } | ||
1873 | } | ||
1874 | |||
1845 | static void print_lat_help_header(struct seq_file *m) | 1875 | static void print_lat_help_header(struct seq_file *m) |
1846 | { | 1876 | { |
1847 | seq_puts(m, "# _------=> CPU# \n"); | 1877 | seq_puts(m, "# _------=> CPU# \n"); |
@@ -1854,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m) | |||
1854 | seq_puts(m, "# \\ / ||||| \\ | / \n"); | 1884 | seq_puts(m, "# \\ / ||||| \\ | / \n"); |
1855 | } | 1885 | } |
1856 | 1886 | ||
1857 | static void print_func_help_header(struct seq_file *m) | 1887 | static void print_event_info(struct trace_array *tr, struct seq_file *m) |
1888 | { | ||
1889 | unsigned long total; | ||
1890 | unsigned long entries; | ||
1891 | |||
1892 | get_total_entries(tr, &total, &entries); | ||
1893 | seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", | ||
1894 | entries, total, num_online_cpus()); | ||
1895 | seq_puts(m, "#\n"); | ||
1896 | } | ||
1897 | |||
1898 | static void print_func_help_header(struct trace_array *tr, struct seq_file *m) | ||
1858 | { | 1899 | { |
1859 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | 1900 | print_event_info(tr, m); |
1901 | seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); | ||
1860 | seq_puts(m, "# | | | | |\n"); | 1902 | seq_puts(m, "# | | | | |\n"); |
1861 | } | 1903 | } |
1862 | 1904 | ||
1905 | static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) | ||
1906 | { | ||
1907 | print_event_info(tr, m); | ||
1908 | seq_puts(m, "# _-----=> irqs-off\n"); | ||
1909 | seq_puts(m, "# / _----=> need-resched\n"); | ||
1910 | seq_puts(m, "# | / _---=> hardirq/softirq\n"); | ||
1911 | seq_puts(m, "# || / _--=> preempt-depth\n"); | ||
1912 | seq_puts(m, "# ||| / delay\n"); | ||
1913 | seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); | ||
1914 | seq_puts(m, "# | | | |||| | |\n"); | ||
1915 | } | ||
1863 | 1916 | ||
1864 | void | 1917 | void |
1865 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) | 1918 | print_trace_header(struct seq_file *m, struct trace_iterator *iter) |
@@ -1868,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) | |||
1868 | struct trace_array *tr = iter->tr; | 1921 | struct trace_array *tr = iter->tr; |
1869 | struct trace_array_cpu *data = tr->data[tr->cpu]; | 1922 | struct trace_array_cpu *data = tr->data[tr->cpu]; |
1870 | struct tracer *type = current_trace; | 1923 | struct tracer *type = current_trace; |
1871 | unsigned long entries = 0; | 1924 | unsigned long entries; |
1872 | unsigned long total = 0; | 1925 | unsigned long total; |
1873 | unsigned long count; | ||
1874 | const char *name = "preemption"; | 1926 | const char *name = "preemption"; |
1875 | int cpu; | ||
1876 | 1927 | ||
1877 | if (type) | 1928 | if (type) |
1878 | name = type->name; | 1929 | name = type->name; |
1879 | 1930 | ||
1880 | 1931 | get_total_entries(tr, &total, &entries); | |
1881 | for_each_tracing_cpu(cpu) { | ||
1882 | count = ring_buffer_entries_cpu(tr->buffer, cpu); | ||
1883 | /* | ||
1884 | * If this buffer has skipped entries, then we hold all | ||
1885 | * entries for the trace and we need to ignore the | ||
1886 | * ones before the time stamp. | ||
1887 | */ | ||
1888 | if (tr->data[cpu]->skipped_entries) { | ||
1889 | count -= tr->data[cpu]->skipped_entries; | ||
1890 | /* total is the same as the entries */ | ||
1891 | total += count; | ||
1892 | } else | ||
1893 | total += count + | ||
1894 | ring_buffer_overrun_cpu(tr->buffer, cpu); | ||
1895 | entries += count; | ||
1896 | } | ||
1897 | 1932 | ||
1898 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", | 1933 | seq_printf(m, "# %s latency trace v1.1.5 on %s\n", |
1899 | name, UTS_RELEASE); | 1934 | name, UTS_RELEASE); |
@@ -2139,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
2139 | return print_trace_fmt(iter); | 2174 | return print_trace_fmt(iter); |
2140 | } | 2175 | } |
2141 | 2176 | ||
2177 | void trace_latency_header(struct seq_file *m) | ||
2178 | { | ||
2179 | struct trace_iterator *iter = m->private; | ||
2180 | |||
2181 | /* print nothing if the buffers are empty */ | ||
2182 | if (trace_empty(iter)) | ||
2183 | return; | ||
2184 | |||
2185 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) | ||
2186 | print_trace_header(m, iter); | ||
2187 | |||
2188 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | ||
2189 | print_lat_help_header(m); | ||
2190 | } | ||
2191 | |||
2142 | void trace_default_header(struct seq_file *m) | 2192 | void trace_default_header(struct seq_file *m) |
2143 | { | 2193 | { |
2144 | struct trace_iterator *iter = m->private; | 2194 | struct trace_iterator *iter = m->private; |
@@ -2154,11 +2204,23 @@ void trace_default_header(struct seq_file *m) | |||
2154 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | 2204 | if (!(trace_flags & TRACE_ITER_VERBOSE)) |
2155 | print_lat_help_header(m); | 2205 | print_lat_help_header(m); |
2156 | } else { | 2206 | } else { |
2157 | if (!(trace_flags & TRACE_ITER_VERBOSE)) | 2207 | if (!(trace_flags & TRACE_ITER_VERBOSE)) { |
2158 | print_func_help_header(m); | 2208 | if (trace_flags & TRACE_ITER_IRQ_INFO) |
2209 | print_func_help_header_irq(iter->tr, m); | ||
2210 | else | ||
2211 | print_func_help_header(iter->tr, m); | ||
2212 | } | ||
2159 | } | 2213 | } |
2160 | } | 2214 | } |
2161 | 2215 | ||
2216 | static void test_ftrace_alive(struct seq_file *m) | ||
2217 | { | ||
2218 | if (!ftrace_is_dead()) | ||
2219 | return; | ||
2220 | seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | ||
2221 | seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); | ||
2222 | } | ||
2223 | |||
2162 | static int s_show(struct seq_file *m, void *v) | 2224 | static int s_show(struct seq_file *m, void *v) |
2163 | { | 2225 | { |
2164 | struct trace_iterator *iter = v; | 2226 | struct trace_iterator *iter = v; |
@@ -2168,6 +2230,7 @@ static int s_show(struct seq_file *m, void *v) | |||
2168 | if (iter->tr) { | 2230 | if (iter->tr) { |
2169 | seq_printf(m, "# tracer: %s\n", iter->trace->name); | 2231 | seq_printf(m, "# tracer: %s\n", iter->trace->name); |
2170 | seq_puts(m, "#\n"); | 2232 | seq_puts(m, "#\n"); |
2233 | test_ftrace_alive(m); | ||
2171 | } | 2234 | } |
2172 | if (iter->trace && iter->trace->print_header) | 2235 | if (iter->trace && iter->trace->print_header) |
2173 | iter->trace->print_header(m); | 2236 | iter->trace->print_header(m); |
@@ -2710,9 +2773,9 @@ static const char readme_msg[] = | |||
2710 | "# cat /sys/kernel/debug/tracing/trace_options\n" | 2773 | "# cat /sys/kernel/debug/tracing/trace_options\n" |
2711 | "noprint-parent nosym-offset nosym-addr noverbose\n" | 2774 | "noprint-parent nosym-offset nosym-addr noverbose\n" |
2712 | "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" | 2775 | "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" |
2713 | "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" | 2776 | "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" |
2714 | "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" | 2777 | "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" |
2715 | "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" | 2778 | "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" |
2716 | ; | 2779 | ; |
2717 | 2780 | ||
2718 | static ssize_t | 2781 | static ssize_t |
@@ -3569,6 +3632,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3569 | } | 3632 | } |
3570 | 3633 | ||
3571 | static ssize_t | 3634 | static ssize_t |
3635 | tracing_total_entries_read(struct file *filp, char __user *ubuf, | ||
3636 | size_t cnt, loff_t *ppos) | ||
3637 | { | ||
3638 | struct trace_array *tr = filp->private_data; | ||
3639 | char buf[64]; | ||
3640 | int r, cpu; | ||
3641 | unsigned long size = 0, expanded_size = 0; | ||
3642 | |||
3643 | mutex_lock(&trace_types_lock); | ||
3644 | for_each_tracing_cpu(cpu) { | ||
3645 | size += tr->entries >> 10; | ||
3646 | if (!ring_buffer_expanded) | ||
3647 | expanded_size += trace_buf_size >> 10; | ||
3648 | } | ||
3649 | if (ring_buffer_expanded) | ||
3650 | r = sprintf(buf, "%lu\n", size); | ||
3651 | else | ||
3652 | r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); | ||
3653 | mutex_unlock(&trace_types_lock); | ||
3654 | |||
3655 | return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); | ||
3656 | } | ||
3657 | |||
3658 | static ssize_t | ||
3572 | tracing_free_buffer_write(struct file *filp, const char __user *ubuf, | 3659 | tracing_free_buffer_write(struct file *filp, const char __user *ubuf, |
3573 | size_t cnt, loff_t *ppos) | 3660 | size_t cnt, loff_t *ppos) |
3574 | { | 3661 | { |
@@ -3594,22 +3681,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) | |||
3594 | return 0; | 3681 | return 0; |
3595 | } | 3682 | } |
3596 | 3683 | ||
3597 | static int mark_printk(const char *fmt, ...) | ||
3598 | { | ||
3599 | int ret; | ||
3600 | va_list args; | ||
3601 | va_start(args, fmt); | ||
3602 | ret = trace_vprintk(0, fmt, args); | ||
3603 | va_end(args); | ||
3604 | return ret; | ||
3605 | } | ||
3606 | |||
3607 | static ssize_t | 3684 | static ssize_t |
3608 | tracing_mark_write(struct file *filp, const char __user *ubuf, | 3685 | tracing_mark_write(struct file *filp, const char __user *ubuf, |
3609 | size_t cnt, loff_t *fpos) | 3686 | size_t cnt, loff_t *fpos) |
3610 | { | 3687 | { |
3611 | char *buf; | 3688 | unsigned long addr = (unsigned long)ubuf; |
3612 | size_t written; | 3689 | struct ring_buffer_event *event; |
3690 | struct ring_buffer *buffer; | ||
3691 | struct print_entry *entry; | ||
3692 | unsigned long irq_flags; | ||
3693 | struct page *pages[2]; | ||
3694 | int nr_pages = 1; | ||
3695 | ssize_t written; | ||
3696 | void *page1; | ||
3697 | void *page2; | ||
3698 | int offset; | ||
3699 | int size; | ||
3700 | int len; | ||
3701 | int ret; | ||
3613 | 3702 | ||
3614 | if (tracing_disabled) | 3703 | if (tracing_disabled) |
3615 | return -EINVAL; | 3704 | return -EINVAL; |
@@ -3617,28 +3706,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
3617 | if (cnt > TRACE_BUF_SIZE) | 3706 | if (cnt > TRACE_BUF_SIZE) |
3618 | cnt = TRACE_BUF_SIZE; | 3707 | cnt = TRACE_BUF_SIZE; |
3619 | 3708 | ||
3620 | buf = kmalloc(cnt + 2, GFP_KERNEL); | 3709 | /* |
3621 | if (buf == NULL) | 3710 | * Userspace is injecting traces into the kernel trace buffer. |
3622 | return -ENOMEM; | 3711 | * We want to be as non intrusive as possible. |
3712 | * To do so, we do not want to allocate any special buffers | ||
3713 | * or take any locks, but instead write the userspace data | ||
3714 | * straight into the ring buffer. | ||
3715 | * | ||
3716 | * First we need to pin the userspace buffer into memory, | ||
3717 | * which, most likely it is, because it just referenced it. | ||
3718 | * But there's no guarantee that it is. By using get_user_pages_fast() | ||
3719 | * and kmap_atomic/kunmap_atomic() we can get access to the | ||
3720 | * pages directly. We then write the data directly into the | ||
3721 | * ring buffer. | ||
3722 | */ | ||
3723 | BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); | ||
3623 | 3724 | ||
3624 | if (copy_from_user(buf, ubuf, cnt)) { | 3725 | /* check if we cross pages */ |
3625 | kfree(buf); | 3726 | if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) |
3626 | return -EFAULT; | 3727 | nr_pages = 2; |
3728 | |||
3729 | offset = addr & (PAGE_SIZE - 1); | ||
3730 | addr &= PAGE_MASK; | ||
3731 | |||
3732 | ret = get_user_pages_fast(addr, nr_pages, 0, pages); | ||
3733 | if (ret < nr_pages) { | ||
3734 | while (--ret >= 0) | ||
3735 | put_page(pages[ret]); | ||
3736 | written = -EFAULT; | ||
3737 | goto out; | ||
3627 | } | 3738 | } |
3628 | if (buf[cnt-1] != '\n') { | 3739 | |
3629 | buf[cnt] = '\n'; | 3740 | page1 = kmap_atomic(pages[0]); |
3630 | buf[cnt+1] = '\0'; | 3741 | if (nr_pages == 2) |
3742 | page2 = kmap_atomic(pages[1]); | ||
3743 | |||
3744 | local_save_flags(irq_flags); | ||
3745 | size = sizeof(*entry) + cnt + 2; /* possible \n added */ | ||
3746 | buffer = global_trace.buffer; | ||
3747 | event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, | ||
3748 | irq_flags, preempt_count()); | ||
3749 | if (!event) { | ||
3750 | /* Ring buffer disabled, return as if not open for write */ | ||
3751 | written = -EBADF; | ||
3752 | goto out_unlock; | ||
3753 | } | ||
3754 | |||
3755 | entry = ring_buffer_event_data(event); | ||
3756 | entry->ip = _THIS_IP_; | ||
3757 | |||
3758 | if (nr_pages == 2) { | ||
3759 | len = PAGE_SIZE - offset; | ||
3760 | memcpy(&entry->buf, page1 + offset, len); | ||
3761 | memcpy(&entry->buf[len], page2, cnt - len); | ||
3631 | } else | 3762 | } else |
3632 | buf[cnt] = '\0'; | 3763 | memcpy(&entry->buf, page1 + offset, cnt); |
3633 | 3764 | ||
3634 | written = mark_printk("%s", buf); | 3765 | if (entry->buf[cnt - 1] != '\n') { |
3635 | kfree(buf); | 3766 | entry->buf[cnt] = '\n'; |
3636 | *fpos += written; | 3767 | entry->buf[cnt + 1] = '\0'; |
3768 | } else | ||
3769 | entry->buf[cnt] = '\0'; | ||
3770 | |||
3771 | ring_buffer_unlock_commit(buffer, event); | ||
3772 | |||
3773 | written = cnt; | ||
3637 | 3774 | ||
3638 | /* don't tell userspace we wrote more - it might confuse them */ | 3775 | *fpos += written; |
3639 | if (written > cnt) | ||
3640 | written = cnt; | ||
3641 | 3776 | ||
3777 | out_unlock: | ||
3778 | if (nr_pages == 2) | ||
3779 | kunmap_atomic(page2); | ||
3780 | kunmap_atomic(page1); | ||
3781 | while (nr_pages > 0) | ||
3782 | put_page(pages[--nr_pages]); | ||
3783 | out: | ||
3642 | return written; | 3784 | return written; |
3643 | } | 3785 | } |
3644 | 3786 | ||
@@ -3739,6 +3881,12 @@ static const struct file_operations tracing_entries_fops = { | |||
3739 | .llseek = generic_file_llseek, | 3881 | .llseek = generic_file_llseek, |
3740 | }; | 3882 | }; |
3741 | 3883 | ||
3884 | static const struct file_operations tracing_total_entries_fops = { | ||
3885 | .open = tracing_open_generic, | ||
3886 | .read = tracing_total_entries_read, | ||
3887 | .llseek = generic_file_llseek, | ||
3888 | }; | ||
3889 | |||
3742 | static const struct file_operations tracing_free_buffer_fops = { | 3890 | static const struct file_operations tracing_free_buffer_fops = { |
3743 | .write = tracing_free_buffer_write, | 3891 | .write = tracing_free_buffer_write, |
3744 | .release = tracing_free_buffer_release, | 3892 | .release = tracing_free_buffer_release, |
@@ -3808,8 +3956,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3808 | if (info->read < PAGE_SIZE) | 3956 | if (info->read < PAGE_SIZE) |
3809 | goto read; | 3957 | goto read; |
3810 | 3958 | ||
3811 | info->read = 0; | ||
3812 | |||
3813 | trace_access_lock(info->cpu); | 3959 | trace_access_lock(info->cpu); |
3814 | ret = ring_buffer_read_page(info->tr->buffer, | 3960 | ret = ring_buffer_read_page(info->tr->buffer, |
3815 | &info->spare, | 3961 | &info->spare, |
@@ -3819,6 +3965,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
3819 | if (ret < 0) | 3965 | if (ret < 0) |
3820 | return 0; | 3966 | return 0; |
3821 | 3967 | ||
3968 | info->read = 0; | ||
3969 | |||
3822 | read: | 3970 | read: |
3823 | size = PAGE_SIZE - info->read; | 3971 | size = PAGE_SIZE - info->read; |
3824 | if (size > count) | 3972 | if (size > count) |
@@ -4026,6 +4174,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4026 | struct trace_array *tr = &global_trace; | 4174 | struct trace_array *tr = &global_trace; |
4027 | struct trace_seq *s; | 4175 | struct trace_seq *s; |
4028 | unsigned long cnt; | 4176 | unsigned long cnt; |
4177 | unsigned long long t; | ||
4178 | unsigned long usec_rem; | ||
4029 | 4179 | ||
4030 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 4180 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
4031 | if (!s) | 4181 | if (!s) |
@@ -4042,6 +4192,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
4042 | cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); | 4192 | cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); |
4043 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); | 4193 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); |
4044 | 4194 | ||
4195 | cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); | ||
4196 | trace_seq_printf(s, "bytes: %ld\n", cnt); | ||
4197 | |||
4198 | t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); | ||
4199 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4200 | trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); | ||
4201 | |||
4202 | t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); | ||
4203 | usec_rem = do_div(t, USEC_PER_SEC); | ||
4204 | trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); | ||
4205 | |||
4045 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 4206 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
4046 | 4207 | ||
4047 | kfree(s); | 4208 | kfree(s); |
@@ -4277,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = { | |||
4277 | }; | 4438 | }; |
4278 | 4439 | ||
4279 | struct dentry *trace_create_file(const char *name, | 4440 | struct dentry *trace_create_file(const char *name, |
4280 | mode_t mode, | 4441 | umode_t mode, |
4281 | struct dentry *parent, | 4442 | struct dentry *parent, |
4282 | void *data, | 4443 | void *data, |
4283 | const struct file_operations *fops) | 4444 | const struct file_operations *fops) |
@@ -4450,6 +4611,9 @@ static __init int tracer_init_debugfs(void) | |||
4450 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4611 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
4451 | &global_trace, &tracing_entries_fops); | 4612 | &global_trace, &tracing_entries_fops); |
4452 | 4613 | ||
4614 | trace_create_file("buffer_total_size_kb", 0444, d_tracer, | ||
4615 | &global_trace, &tracing_total_entries_fops); | ||
4616 | |||
4453 | trace_create_file("free_buffer", 0644, d_tracer, | 4617 | trace_create_file("free_buffer", 0644, d_tracer, |
4454 | &global_trace, &tracing_free_buffer_fops); | 4618 | &global_trace, &tracing_free_buffer_fops); |
4455 | 4619 | ||
@@ -4566,6 +4730,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) | |||
4566 | 4730 | ||
4567 | tracing_off(); | 4731 | tracing_off(); |
4568 | 4732 | ||
4733 | /* Did function tracer already get disabled? */ | ||
4734 | if (ftrace_is_dead()) { | ||
4735 | printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); | ||
4736 | printk("# MAY BE MISSING FUNCTION EVENTS\n"); | ||
4737 | } | ||
4738 | |||
4569 | if (disable_tracing) | 4739 | if (disable_tracing) |
4570 | ftrace_kill(); | 4740 | ftrace_kill(); |
4571 | 4741 | ||
@@ -4658,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
4658 | { | 4828 | { |
4659 | __ftrace_dump(true, oops_dump_mode); | 4829 | __ftrace_dump(true, oops_dump_mode); |
4660 | } | 4830 | } |
4831 | EXPORT_SYMBOL_GPL(ftrace_dump); | ||
4661 | 4832 | ||
4662 | __init static int tracer_alloc_buffers(void) | 4833 | __init static int tracer_alloc_buffers(void) |
4663 | { | 4834 | { |
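
The trace.c changes rework tracing_mark_write() so that data written to trace_marker is pinned with get_user_pages_fast() and copied straight into the ring buffer as a print entry, instead of going through a temporary kmalloc buffer and trace_vprintk(). The user-visible interface is unchanged; a small userspace sketch follows (the debugfs mount point is an assumption about the running system):

/* Hypothetical userspace helper: annotate the trace via trace_marker. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int trace_marker_write(const char *msg)
{
	int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	/* the kernel appends a trailing '\n' if the message lacks one */
	n = write(fd, msg, strlen(msg));
	close(fd);

	return n < 0 ? -1 : 0;
}
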
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 616846bcfee5..b93ecbadad6d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu); | |||
312 | void tracing_reset_current_online_cpus(void); | 312 | void tracing_reset_current_online_cpus(void); |
313 | int tracing_open_generic(struct inode *inode, struct file *filp); | 313 | int tracing_open_generic(struct inode *inode, struct file *filp); |
314 | struct dentry *trace_create_file(const char *name, | 314 | struct dentry *trace_create_file(const char *name, |
315 | mode_t mode, | 315 | umode_t mode, |
316 | struct dentry *parent, | 316 | struct dentry *parent, |
317 | void *data, | 317 | void *data, |
318 | const struct file_operations *fops); | 318 | const struct file_operations *fops); |
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, | |||
370 | unsigned long ip, | 370 | unsigned long ip, |
371 | unsigned long parent_ip, | 371 | unsigned long parent_ip, |
372 | unsigned long flags, int pc); | 372 | unsigned long flags, int pc); |
373 | void trace_latency_header(struct seq_file *m); | ||
373 | void trace_default_header(struct seq_file *m); | 374 | void trace_default_header(struct seq_file *m); |
374 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); | 375 | void print_trace_header(struct seq_file *m, struct trace_iterator *iter); |
375 | int trace_empty(struct trace_iterator *iter); | 376 | int trace_empty(struct trace_iterator *iter); |
@@ -579,11 +580,13 @@ static inline int ftrace_trace_task(struct task_struct *task) | |||
579 | 580 | ||
580 | return test_tsk_trace_trace(task); | 581 | return test_tsk_trace_trace(task); |
581 | } | 582 | } |
583 | extern int ftrace_is_dead(void); | ||
582 | #else | 584 | #else |
583 | static inline int ftrace_trace_task(struct task_struct *task) | 585 | static inline int ftrace_trace_task(struct task_struct *task) |
584 | { | 586 | { |
585 | return 1; | 587 | return 1; |
586 | } | 588 | } |
589 | static inline int ftrace_is_dead(void) { return 0; } | ||
587 | #endif | 590 | #endif |
588 | 591 | ||
589 | /* | 592 | /* |
@@ -652,6 +655,7 @@ enum trace_iterator_flags { | |||
652 | TRACE_ITER_RECORD_CMD = 0x100000, | 655 | TRACE_ITER_RECORD_CMD = 0x100000, |
653 | TRACE_ITER_OVERWRITE = 0x200000, | 656 | TRACE_ITER_OVERWRITE = 0x200000, |
654 | TRACE_ITER_STOP_ON_FREE = 0x400000, | 657 | TRACE_ITER_STOP_ON_FREE = 0x400000, |
658 | TRACE_ITER_IRQ_INFO = 0x800000, | ||
655 | }; | 659 | }; |
656 | 660 | ||
657 | /* | 661 | /* |
@@ -761,16 +765,10 @@ struct filter_pred { | |||
761 | filter_pred_fn_t fn; | 765 | filter_pred_fn_t fn; |
762 | u64 val; | 766 | u64 val; |
763 | struct regex regex; | 767 | struct regex regex; |
764 | /* | 768 | unsigned short *ops; |
765 | * Leaf nodes use field_name, ops is used by AND and OR | 769 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
766 | * nodes. The field_name is always freed when freeing a pred. | 770 | struct ftrace_event_field *field; |
767 | * We can overload field_name for ops and have it freed | 771 | #endif |
768 | * as well. | ||
769 | */ | ||
770 | union { | ||
771 | char *field_name; | ||
772 | unsigned short *ops; | ||
773 | }; | ||
774 | int offset; | 772 | int offset; |
775 | int not; | 773 | int not; |
776 | int op; | 774 | int op; |
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6302747a1398..394783531cbb 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void) | |||
113 | 113 | ||
114 | return now; | 114 | return now; |
115 | } | 115 | } |
116 | |||
117 | static atomic64_t trace_counter; | ||
118 | |||
119 | /* | ||
120 | * trace_clock_counter(): simply an atomic counter. | ||
121 | * Use the trace_counter "counter" for cases where you do not care | ||
122 | * about timings, but are interested in strict ordering. | ||
123 | */ | ||
124 | u64 notrace trace_clock_counter(void) | ||
125 | { | ||
126 | return atomic64_add_return(1, &trace_counter); | ||
127 | } | ||
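
trace_clock_counter() backs the new "counter" entry in trace_clocks[]: a bare atomic counter for cases where strict ordering of events matters more than timing. Selecting it is just a write to the trace_clock file; a short sketch, again assuming the usual debugfs mount point:

/* Hypothetical userspace snippet: switch the trace clock to "counter". */
#include <stdio.h>

static int use_counter_clock(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/trace_clock", "w");

	if (!f)
		return -1;
	/* reading the file back shows the active clock in [brackets] */
	fputs("counter", f);

	return fclose(f) == EOF ? -1 : 0;
}
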
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 581876f9f387..c212a7f934ec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events) | |||
1078 | /* First see if we did not already create this dir */ | 1078 | /* First see if we did not already create this dir */ |
1079 | list_for_each_entry(system, &event_subsystems, list) { | 1079 | list_for_each_entry(system, &event_subsystems, list) { |
1080 | if (strcmp(system->name, name) == 0) { | 1080 | if (strcmp(system->name, name) == 0) { |
1081 | __get_system(system); | ||
1082 | system->nr_events++; | 1081 | system->nr_events++; |
1083 | return system->entry; | 1082 | return system->entry; |
1084 | } | 1083 | } |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 256764ecccd6..f04cc3136bd3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -27,6 +27,12 @@ | |||
27 | #include "trace.h" | 27 | #include "trace.h" |
28 | #include "trace_output.h" | 28 | #include "trace_output.h" |
29 | 29 | ||
30 | #define DEFAULT_SYS_FILTER_MESSAGE \ | ||
31 | "### global filter ###\n" \ | ||
32 | "# Use this to set filters for multiple events.\n" \ | ||
33 | "# Only events with the given fields will be affected.\n" \ | ||
34 | "# If no events are modified, an error message will be displayed here" | ||
35 | |||
30 | enum filter_op_ids | 36 | enum filter_op_ids |
31 | { | 37 | { |
32 | OP_OR, | 38 | OP_OR, |
@@ -381,6 +387,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, | |||
381 | return pred; | 387 | return pred; |
382 | } | 388 | } |
383 | 389 | ||
390 | enum walk_return { | ||
391 | WALK_PRED_ABORT, | ||
392 | WALK_PRED_PARENT, | ||
393 | WALK_PRED_DEFAULT, | ||
394 | }; | ||
395 | |||
396 | typedef int (*filter_pred_walkcb_t) (enum move_type move, | ||
397 | struct filter_pred *pred, | ||
398 | int *err, void *data); | ||
399 | |||
400 | static int walk_pred_tree(struct filter_pred *preds, | ||
401 | struct filter_pred *root, | ||
402 | filter_pred_walkcb_t cb, void *data) | ||
403 | { | ||
404 | struct filter_pred *pred = root; | ||
405 | enum move_type move = MOVE_DOWN; | ||
406 | int done = 0; | ||
407 | |||
408 | if (!preds) | ||
409 | return -EINVAL; | ||
410 | |||
411 | do { | ||
412 | int err = 0, ret; | ||
413 | |||
414 | ret = cb(move, pred, &err, data); | ||
415 | if (ret == WALK_PRED_ABORT) | ||
416 | return err; | ||
417 | if (ret == WALK_PRED_PARENT) | ||
418 | goto get_parent; | ||
419 | |||
420 | switch (move) { | ||
421 | case MOVE_DOWN: | ||
422 | if (pred->left != FILTER_PRED_INVALID) { | ||
423 | pred = &preds[pred->left]; | ||
424 | continue; | ||
425 | } | ||
426 | goto get_parent; | ||
427 | case MOVE_UP_FROM_LEFT: | ||
428 | pred = &preds[pred->right]; | ||
429 | move = MOVE_DOWN; | ||
430 | continue; | ||
431 | case MOVE_UP_FROM_RIGHT: | ||
432 | get_parent: | ||
433 | if (pred == root) | ||
434 | break; | ||
435 | pred = get_pred_parent(pred, preds, | ||
436 | pred->parent, | ||
437 | &move); | ||
438 | continue; | ||
439 | } | ||
440 | done = 1; | ||
441 | } while (!done); | ||
442 | |||
443 | /* We are fine. */ | ||
444 | return 0; | ||
445 | } | ||
446 | |||
384 | /* | 447 | /* |
385 | * A series of AND or ORs where found together. Instead of | 448 | * A series of AND or ORs where found together. Instead of |
386 | * climbing up and down the tree branches, an array of the | 449 | * climbing up and down the tree branches, an array of the |
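
walk_pred_tree() factors the iterative tree walk out of its callers: each caller now only supplies a visitor that returns one of the WALK_PRED_* codes and keeps its state behind the opaque data pointer. As a minimal illustration (not part of the patch), a visitor that counts every node in the tree could look like this:

/* Hypothetical visitor: each node is entered exactly once with MOVE_DOWN. */
static int count_nodes_cb(enum move_type move, struct filter_pred *pred,
			  int *err, void *data)
{
	int *count = data;

	if (move == MOVE_DOWN)
		(*count)++;
	/* let walk_pred_tree() make all the traversal decisions */
	return WALK_PRED_DEFAULT;
}

It would be invoked as walk_pred_tree(filter->preds, root, count_nodes_cb, &count), exactly like the real callbacks (filter_match_preds_cb, check_pred_tree_cb, count_leafs_cb) added further down in this file.
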
@@ -410,99 +473,91 @@ static int process_ops(struct filter_pred *preds, | |||
410 | 473 | ||
411 | for (i = 0; i < op->val; i++) { | 474 | for (i = 0; i < op->val; i++) { |
412 | pred = &preds[op->ops[i]]; | 475 | pred = &preds[op->ops[i]]; |
413 | match = pred->fn(pred, rec); | 476 | if (!WARN_ON_ONCE(!pred->fn)) |
477 | match = pred->fn(pred, rec); | ||
414 | if (!!match == type) | 478 | if (!!match == type) |
415 | return match; | 479 | return match; |
416 | } | 480 | } |
417 | return match; | 481 | return match; |
418 | } | 482 | } |
419 | 483 | ||
484 | struct filter_match_preds_data { | ||
485 | struct filter_pred *preds; | ||
486 | int match; | ||
487 | void *rec; | ||
488 | }; | ||
489 | |||
490 | static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, | ||
491 | int *err, void *data) | ||
492 | { | ||
493 | struct filter_match_preds_data *d = data; | ||
494 | |||
495 | *err = 0; | ||
496 | switch (move) { | ||
497 | case MOVE_DOWN: | ||
498 | /* only AND and OR have children */ | ||
499 | if (pred->left != FILTER_PRED_INVALID) { | ||
500 | /* If ops is set, then it was folded. */ | ||
501 | if (!pred->ops) | ||
502 | return WALK_PRED_DEFAULT; | ||
503 | /* We can treat folded ops as a leaf node */ | ||
504 | d->match = process_ops(d->preds, pred, d->rec); | ||
505 | } else { | ||
506 | if (!WARN_ON_ONCE(!pred->fn)) | ||
507 | d->match = pred->fn(pred, d->rec); | ||
508 | } | ||
509 | |||
510 | return WALK_PRED_PARENT; | ||
511 | case MOVE_UP_FROM_LEFT: | ||
512 | /* | ||
513 | * Check for short circuits. | ||
514 | * | ||
515 | * Optimization: !!match == (pred->op == OP_OR) | ||
516 | * is the same as: | ||
517 | * if ((match && pred->op == OP_OR) || | ||
518 | * (!match && pred->op == OP_AND)) | ||
519 | */ | ||
520 | if (!!d->match == (pred->op == OP_OR)) | ||
521 | return WALK_PRED_PARENT; | ||
522 | break; | ||
523 | case MOVE_UP_FROM_RIGHT: | ||
524 | break; | ||
525 | } | ||
526 | |||
527 | return WALK_PRED_DEFAULT; | ||
528 | } | ||
529 | |||
420 | /* return 1 if event matches, 0 otherwise (discard) */ | 530 | /* return 1 if event matches, 0 otherwise (discard) */ |
421 | int filter_match_preds(struct event_filter *filter, void *rec) | 531 | int filter_match_preds(struct event_filter *filter, void *rec) |
422 | { | 532 | { |
423 | int match = -1; | ||
424 | enum move_type move = MOVE_DOWN; | ||
425 | struct filter_pred *preds; | 533 | struct filter_pred *preds; |
426 | struct filter_pred *pred; | ||
427 | struct filter_pred *root; | 534 | struct filter_pred *root; |
428 | int n_preds; | 535 | struct filter_match_preds_data data = { |
429 | int done = 0; | 536 | /* match is currently meaningless */ |
537 | .match = -1, | ||
538 | .rec = rec, | ||
539 | }; | ||
540 | int n_preds, ret; | ||
430 | 541 | ||
431 | /* no filter is considered a match */ | 542 | /* no filter is considered a match */ |
432 | if (!filter) | 543 | if (!filter) |
433 | return 1; | 544 | return 1; |
434 | 545 | ||
435 | n_preds = filter->n_preds; | 546 | n_preds = filter->n_preds; |
436 | |||
437 | if (!n_preds) | 547 | if (!n_preds) |
438 | return 1; | 548 | return 1; |
439 | 549 | ||
440 | /* | 550 | /* |
441 | * n_preds, root and filter->preds are protect with preemption disabled. | 551 | * n_preds, root and filter->preds are protect with preemption disabled. |
442 | */ | 552 | */ |
443 | preds = rcu_dereference_sched(filter->preds); | ||
444 | root = rcu_dereference_sched(filter->root); | 553 | root = rcu_dereference_sched(filter->root); |
445 | if (!root) | 554 | if (!root) |
446 | return 1; | 555 | return 1; |
447 | 556 | ||
448 | pred = root; | 557 | data.preds = preds = rcu_dereference_sched(filter->preds); |
449 | 558 | ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); | |
450 | /* match is currently meaningless */ | 559 | WARN_ON(ret); |
451 | match = -1; | 560 | return data.match; |
452 | |||
453 | do { | ||
454 | switch (move) { | ||
455 | case MOVE_DOWN: | ||
456 | /* only AND and OR have children */ | ||
457 | if (pred->left != FILTER_PRED_INVALID) { | ||
458 | /* If ops is set, then it was folded. */ | ||
459 | if (!pred->ops) { | ||
460 | /* keep going to down the left side */ | ||
461 | pred = &preds[pred->left]; | ||
462 | continue; | ||
463 | } | ||
464 | /* We can treat folded ops as a leaf node */ | ||
465 | match = process_ops(preds, pred, rec); | ||
466 | } else | ||
467 | match = pred->fn(pred, rec); | ||
468 | /* If this pred is the only pred */ | ||
469 | if (pred == root) | ||
470 | break; | ||
471 | pred = get_pred_parent(pred, preds, | ||
472 | pred->parent, &move); | ||
473 | continue; | ||
474 | case MOVE_UP_FROM_LEFT: | ||
475 | /* | ||
476 | * Check for short circuits. | ||
477 | * | ||
478 | * Optimization: !!match == (pred->op == OP_OR) | ||
479 | * is the same as: | ||
480 | * if ((match && pred->op == OP_OR) || | ||
481 | * (!match && pred->op == OP_AND)) | ||
482 | */ | ||
483 | if (!!match == (pred->op == OP_OR)) { | ||
484 | if (pred == root) | ||
485 | break; | ||
486 | pred = get_pred_parent(pred, preds, | ||
487 | pred->parent, &move); | ||
488 | continue; | ||
489 | } | ||
490 | /* now go down the right side of the tree. */ | ||
491 | pred = &preds[pred->right]; | ||
492 | move = MOVE_DOWN; | ||
493 | continue; | ||
494 | case MOVE_UP_FROM_RIGHT: | ||
495 | /* We finished this equation. */ | ||
496 | if (pred == root) | ||
497 | break; | ||
498 | pred = get_pred_parent(pred, preds, | ||
499 | pred->parent, &move); | ||
500 | continue; | ||
501 | } | ||
502 | done = 1; | ||
503 | } while (!done); | ||
504 | |||
505 | return match; | ||
506 | } | 561 | } |
507 | EXPORT_SYMBOL_GPL(filter_match_preds); | 562 | EXPORT_SYMBOL_GPL(filter_match_preds); |
508 | 563 | ||
@@ -597,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, | |||
597 | if (filter && filter->filter_string) | 652 | if (filter && filter->filter_string) |
598 | trace_seq_printf(s, "%s\n", filter->filter_string); | 653 | trace_seq_printf(s, "%s\n", filter->filter_string); |
599 | else | 654 | else |
600 | trace_seq_printf(s, "none\n"); | 655 | trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); |
601 | mutex_unlock(&event_mutex); | 656 | mutex_unlock(&event_mutex); |
602 | } | 657 | } |
603 | 658 | ||
@@ -628,22 +683,6 @@ find_event_field(struct ftrace_event_call *call, char *name) | |||
628 | return __find_event_field(head, name); | 683 | return __find_event_field(head, name); |
629 | } | 684 | } |
630 | 685 | ||
631 | static void filter_free_pred(struct filter_pred *pred) | ||
632 | { | ||
633 | if (!pred) | ||
634 | return; | ||
635 | |||
636 | kfree(pred->field_name); | ||
637 | kfree(pred); | ||
638 | } | ||
639 | |||
640 | static void filter_clear_pred(struct filter_pred *pred) | ||
641 | { | ||
642 | kfree(pred->field_name); | ||
643 | pred->field_name = NULL; | ||
644 | pred->regex.len = 0; | ||
645 | } | ||
646 | |||
647 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) | 686 | static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) |
648 | { | 687 | { |
649 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); | 688 | stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); |
@@ -689,20 +728,13 @@ __pop_pred_stack(struct pred_stack *stack) | |||
689 | static int filter_set_pred(struct event_filter *filter, | 728 | static int filter_set_pred(struct event_filter *filter, |
690 | int idx, | 729 | int idx, |
691 | struct pred_stack *stack, | 730 | struct pred_stack *stack, |
692 | struct filter_pred *src, | 731 | struct filter_pred *src) |
693 | filter_pred_fn_t fn) | ||
694 | { | 732 | { |
695 | struct filter_pred *dest = &filter->preds[idx]; | 733 | struct filter_pred *dest = &filter->preds[idx]; |
696 | struct filter_pred *left; | 734 | struct filter_pred *left; |
697 | struct filter_pred *right; | 735 | struct filter_pred *right; |
698 | 736 | ||
699 | *dest = *src; | 737 | *dest = *src; |
700 | if (src->field_name) { | ||
701 | dest->field_name = kstrdup(src->field_name, GFP_KERNEL); | ||
702 | if (!dest->field_name) | ||
703 | return -ENOMEM; | ||
704 | } | ||
705 | dest->fn = fn; | ||
706 | dest->index = idx; | 738 | dest->index = idx; |
707 | 739 | ||
708 | if (dest->op == OP_OR || dest->op == OP_AND) { | 740 | if (dest->op == OP_OR || dest->op == OP_AND) { |
@@ -743,11 +775,7 @@ static int filter_set_pred(struct event_filter *filter, | |||
743 | 775 | ||
744 | static void __free_preds(struct event_filter *filter) | 776 | static void __free_preds(struct event_filter *filter) |
745 | { | 777 | { |
746 | int i; | ||
747 | |||
748 | if (filter->preds) { | 778 | if (filter->preds) { |
749 | for (i = 0; i < filter->a_preds; i++) | ||
750 | kfree(filter->preds[i].field_name); | ||
751 | kfree(filter->preds); | 779 | kfree(filter->preds); |
752 | filter->preds = NULL; | 780 | filter->preds = NULL; |
753 | } | 781 | } |
@@ -840,23 +868,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system) | |||
840 | } | 868 | } |
841 | } | 869 | } |
842 | 870 | ||
843 | static int filter_add_pred_fn(struct filter_parse_state *ps, | 871 | static int filter_add_pred(struct filter_parse_state *ps, |
844 | struct ftrace_event_call *call, | 872 | struct event_filter *filter, |
845 | struct event_filter *filter, | 873 | struct filter_pred *pred, |
846 | struct filter_pred *pred, | 874 | struct pred_stack *stack) |
847 | struct pred_stack *stack, | ||
848 | filter_pred_fn_t fn) | ||
849 | { | 875 | { |
850 | int idx, err; | 876 | int err; |
851 | 877 | ||
852 | if (WARN_ON(filter->n_preds == filter->a_preds)) { | 878 | if (WARN_ON(filter->n_preds == filter->a_preds)) { |
853 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); | 879 | parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); |
854 | return -ENOSPC; | 880 | return -ENOSPC; |
855 | } | 881 | } |
856 | 882 | ||
857 | idx = filter->n_preds; | 883 | err = filter_set_pred(filter, filter->n_preds, stack, pred); |
858 | filter_clear_pred(&filter->preds[idx]); | ||
859 | err = filter_set_pred(filter, idx, stack, pred, fn); | ||
860 | if (err) | 884 | if (err) |
861 | return err; | 885 | return err; |
862 | 886 | ||
@@ -937,31 +961,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, | |||
937 | return fn; | 961 | return fn; |
938 | } | 962 | } |
939 | 963 | ||
940 | static int filter_add_pred(struct filter_parse_state *ps, | 964 | static int init_pred(struct filter_parse_state *ps, |
941 | struct ftrace_event_call *call, | 965 | struct ftrace_event_field *field, |
942 | struct event_filter *filter, | 966 | struct filter_pred *pred) |
943 | struct filter_pred *pred, | 967 | |
944 | struct pred_stack *stack, | ||
945 | bool dry_run) | ||
946 | { | 968 | { |
947 | struct ftrace_event_field *field; | 969 | filter_pred_fn_t fn = filter_pred_none; |
948 | filter_pred_fn_t fn; | ||
949 | unsigned long long val; | 970 | unsigned long long val; |
950 | int ret; | 971 | int ret; |
951 | 972 | ||
952 | fn = pred->fn = filter_pred_none; | ||
953 | |||
954 | if (pred->op == OP_AND) | ||
955 | goto add_pred_fn; | ||
956 | else if (pred->op == OP_OR) | ||
957 | goto add_pred_fn; | ||
958 | |||
959 | field = find_event_field(call, pred->field_name); | ||
960 | if (!field) { | ||
961 | parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); | ||
962 | return -EINVAL; | ||
963 | } | ||
964 | |||
965 | pred->offset = field->offset; | 973 | pred->offset = field->offset; |
966 | 974 | ||
967 | if (!is_legal_op(field, pred->op)) { | 975 | if (!is_legal_op(field, pred->op)) { |
@@ -1001,9 +1009,7 @@ static int filter_add_pred(struct filter_parse_state *ps, | |||
1001 | if (pred->op == OP_NE) | 1009 | if (pred->op == OP_NE) |
1002 | pred->not = 1; | 1010 | pred->not = 1; |
1003 | 1011 | ||
1004 | add_pred_fn: | 1012 | pred->fn = fn; |
1005 | if (!dry_run) | ||
1006 | return filter_add_pred_fn(ps, call, filter, pred, stack, fn); | ||
1007 | return 0; | 1013 | return 0; |
1008 | } | 1014 | } |
1009 | 1015 | ||
@@ -1302,39 +1308,37 @@ parse_operand: | |||
1302 | return 0; | 1308 | return 0; |
1303 | } | 1309 | } |
1304 | 1310 | ||
1305 | static struct filter_pred *create_pred(int op, char *operand1, char *operand2) | 1311 | static struct filter_pred *create_pred(struct filter_parse_state *ps, |
1312 | struct ftrace_event_call *call, | ||
1313 | int op, char *operand1, char *operand2) | ||
1306 | { | 1314 | { |
1307 | struct filter_pred *pred; | 1315 | struct ftrace_event_field *field; |
1316 | static struct filter_pred pred; | ||
1308 | 1317 | ||
1309 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | 1318 | memset(&pred, 0, sizeof(pred)); |
1310 | if (!pred) | 1319 | pred.op = op; |
1311 | return NULL; | ||
1312 | 1320 | ||
1313 | pred->field_name = kstrdup(operand1, GFP_KERNEL); | 1321 | if (op == OP_AND || op == OP_OR) |
1314 | if (!pred->field_name) { | 1322 | return &pred; |
1315 | kfree(pred); | 1323 | |
1324 | if (!operand1 || !operand2) { | ||
1325 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | ||
1316 | return NULL; | 1326 | return NULL; |
1317 | } | 1327 | } |
1318 | 1328 | ||
1319 | strcpy(pred->regex.pattern, operand2); | 1329 | field = find_event_field(call, operand1); |
1320 | pred->regex.len = strlen(pred->regex.pattern); | 1330 | if (!field) { |
1321 | 1331 | parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); | |
1322 | pred->op = op; | ||
1323 | |||
1324 | return pred; | ||
1325 | } | ||
1326 | |||
1327 | static struct filter_pred *create_logical_pred(int op) | ||
1328 | { | ||
1329 | struct filter_pred *pred; | ||
1330 | |||
1331 | pred = kzalloc(sizeof(*pred), GFP_KERNEL); | ||
1332 | if (!pred) | ||
1333 | return NULL; | 1332 | return NULL; |
1333 | } | ||
1334 | 1334 | ||
1335 | pred->op = op; | 1335 | strcpy(pred.regex.pattern, operand2); |
1336 | pred.regex.len = strlen(pred.regex.pattern); | ||
1336 | 1337 | ||
1337 | return pred; | 1338 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
1339 | pred.field = field; | ||
1340 | #endif | ||
1341 | return init_pred(ps, field, &pred) ? NULL : &pred; | ||
1338 | } | 1342 | } |
1339 | 1343 | ||
1340 | static int check_preds(struct filter_parse_state *ps) | 1344 | static int check_preds(struct filter_parse_state *ps) |
@@ -1375,6 +1379,23 @@ static int count_preds(struct filter_parse_state *ps) | |||
1375 | return n_preds; | 1379 | return n_preds; |
1376 | } | 1380 | } |
1377 | 1381 | ||
1382 | struct check_pred_data { | ||
1383 | int count; | ||
1384 | int max; | ||
1385 | }; | ||
1386 | |||
1387 | static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, | ||
1388 | int *err, void *data) | ||
1389 | { | ||
1390 | struct check_pred_data *d = data; | ||
1391 | |||
1392 | if (WARN_ON(d->count++ > d->max)) { | ||
1393 | *err = -EINVAL; | ||
1394 | return WALK_PRED_ABORT; | ||
1395 | } | ||
1396 | return WALK_PRED_DEFAULT; | ||
1397 | } | ||
1398 | |||
1378 | /* | 1399 | /* |
1379 | * The tree is walked at filtering of an event. If the tree is not correctly | 1400 | * The tree is walked at filtering of an event. If the tree is not correctly |
1380 | * built, it may cause an infinite loop. Check here that the tree does | 1401 | * built, it may cause an infinite loop. Check here that the tree does |
@@ -1383,107 +1404,76 @@ static int count_preds(struct filter_parse_state *ps) | |||
1383 | static int check_pred_tree(struct event_filter *filter, | 1404 | static int check_pred_tree(struct event_filter *filter, |
1384 | struct filter_pred *root) | 1405 | struct filter_pred *root) |
1385 | { | 1406 | { |
1386 | struct filter_pred *preds; | 1407 | struct check_pred_data data = { |
1387 | struct filter_pred *pred; | 1408 | /* |
1388 | enum move_type move = MOVE_DOWN; | 1409 | * The max that we can hit a node is three times. |
1389 | int count = 0; | 1410 | * Once going down, once coming up from left, and |
1390 | int done = 0; | 1411 | * once coming up from right. This is more than enough |
1391 | int max; | 1412 | * since leafs are only hit a single time. |
1392 | 1413 | */ | |
1393 | /* | 1414 | .max = 3 * filter->n_preds, |
1394 | * The max that we can hit a node is three times. | 1415 | .count = 0, |
1395 | * Once going down, once coming up from left, and | 1416 | }; |
1396 | * once coming up from right. This is more than enough | ||
1397 | * since leafs are only hit a single time. | ||
1398 | */ | ||
1399 | max = 3 * filter->n_preds; | ||
1400 | 1417 | ||
1401 | preds = filter->preds; | 1418 | return walk_pred_tree(filter->preds, root, |
1402 | if (!preds) | 1419 | check_pred_tree_cb, &data); |
1403 | return -EINVAL; | 1420 | } |
1404 | pred = root; | ||
1405 | 1421 | ||
1406 | do { | 1422 | static int count_leafs_cb(enum move_type move, struct filter_pred *pred, |
1407 | if (WARN_ON(count++ > max)) | 1423 | int *err, void *data) |
1408 | return -EINVAL; | 1424 | { |
1425 | int *count = data; | ||
1409 | 1426 | ||
1410 | switch (move) { | 1427 | if ((move == MOVE_DOWN) && |
1411 | case MOVE_DOWN: | 1428 | (pred->left == FILTER_PRED_INVALID)) |
1412 | if (pred->left != FILTER_PRED_INVALID) { | 1429 | (*count)++; |
1413 | pred = &preds[pred->left]; | ||
1414 | continue; | ||
1415 | } | ||
1416 | /* A leaf at the root is just a leaf in the tree */ | ||
1417 | if (pred == root) | ||
1418 | break; | ||
1419 | pred = get_pred_parent(pred, preds, | ||
1420 | pred->parent, &move); | ||
1421 | continue; | ||
1422 | case MOVE_UP_FROM_LEFT: | ||
1423 | pred = &preds[pred->right]; | ||
1424 | move = MOVE_DOWN; | ||
1425 | continue; | ||
1426 | case MOVE_UP_FROM_RIGHT: | ||
1427 | if (pred == root) | ||
1428 | break; | ||
1429 | pred = get_pred_parent(pred, preds, | ||
1430 | pred->parent, &move); | ||
1431 | continue; | ||
1432 | } | ||
1433 | done = 1; | ||
1434 | } while (!done); | ||
1435 | 1430 | ||
1436 | /* We are fine. */ | 1431 | return WALK_PRED_DEFAULT; |
1437 | return 0; | ||
1438 | } | 1432 | } |
1439 | 1433 | ||
1440 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) | 1434 | static int count_leafs(struct filter_pred *preds, struct filter_pred *root) |
1441 | { | 1435 | { |
1442 | struct filter_pred *pred; | 1436 | int count = 0, ret; |
1443 | enum move_type move = MOVE_DOWN; | ||
1444 | int count = 0; | ||
1445 | int done = 0; | ||
1446 | 1437 | ||
1447 | pred = root; | 1438 | ret = walk_pred_tree(preds, root, count_leafs_cb, &count); |
1439 | WARN_ON(ret); | ||
1440 | return count; | ||
1441 | } | ||
1448 | 1442 | ||
1449 | do { | 1443 | struct fold_pred_data { |
1450 | switch (move) { | 1444 | struct filter_pred *root; |
1451 | case MOVE_DOWN: | 1445 | int count; |
1452 | if (pred->left != FILTER_PRED_INVALID) { | 1446 | int children; |
1453 | pred = &preds[pred->left]; | 1447 | }; |
1454 | continue; | ||
1455 | } | ||
1456 | /* A leaf at the root is just a leaf in the tree */ | ||
1457 | if (pred == root) | ||
1458 | return 1; | ||
1459 | count++; | ||
1460 | pred = get_pred_parent(pred, preds, | ||
1461 | pred->parent, &move); | ||
1462 | continue; | ||
1463 | case MOVE_UP_FROM_LEFT: | ||
1464 | pred = &preds[pred->right]; | ||
1465 | move = MOVE_DOWN; | ||
1466 | continue; | ||
1467 | case MOVE_UP_FROM_RIGHT: | ||
1468 | if (pred == root) | ||
1469 | break; | ||
1470 | pred = get_pred_parent(pred, preds, | ||
1471 | pred->parent, &move); | ||
1472 | continue; | ||
1473 | } | ||
1474 | done = 1; | ||
1475 | } while (!done); | ||
1476 | 1448 | ||
1477 | return count; | 1449 | static int fold_pred_cb(enum move_type move, struct filter_pred *pred, |
1450 | int *err, void *data) | ||
1451 | { | ||
1452 | struct fold_pred_data *d = data; | ||
1453 | struct filter_pred *root = d->root; | ||
1454 | |||
1455 | if (move != MOVE_DOWN) | ||
1456 | return WALK_PRED_DEFAULT; | ||
1457 | if (pred->left != FILTER_PRED_INVALID) | ||
1458 | return WALK_PRED_DEFAULT; | ||
1459 | |||
1460 | if (WARN_ON(d->count == d->children)) { | ||
1461 | *err = -EINVAL; | ||
1462 | return WALK_PRED_ABORT; | ||
1463 | } | ||
1464 | |||
1465 | pred->index &= ~FILTER_PRED_FOLD; | ||
1466 | root->ops[d->count++] = pred->index; | ||
1467 | return WALK_PRED_DEFAULT; | ||
1478 | } | 1468 | } |
1479 | 1469 | ||
1480 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | 1470 | static int fold_pred(struct filter_pred *preds, struct filter_pred *root) |
1481 | { | 1471 | { |
1482 | struct filter_pred *pred; | 1472 | struct fold_pred_data data = { |
1483 | enum move_type move = MOVE_DOWN; | 1473 | .root = root, |
1484 | int count = 0; | 1474 | .count = 0, |
1475 | }; | ||
1485 | int children; | 1476 | int children; |
1486 | int done = 0; | ||
1487 | 1477 | ||
1488 | /* No need to keep the fold flag */ | 1478 | /* No need to keep the fold flag */ |
1489 | root->index &= ~FILTER_PRED_FOLD; | 1479 | root->index &= ~FILTER_PRED_FOLD; |
@@ -1501,37 +1491,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | |||
1501 | return -ENOMEM; | 1491 | return -ENOMEM; |
1502 | 1492 | ||
1503 | root->val = children; | 1493 | root->val = children; |
1494 | data.children = children; | ||
1495 | return walk_pred_tree(preds, root, fold_pred_cb, &data); | ||
1496 | } | ||
1504 | 1497 | ||
1505 | pred = root; | 1498 | static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, |
1506 | do { | 1499 | int *err, void *data) |
1507 | switch (move) { | 1500 | { |
1508 | case MOVE_DOWN: | 1501 | struct filter_pred *preds = data; |
1509 | if (pred->left != FILTER_PRED_INVALID) { | ||
1510 | pred = &preds[pred->left]; | ||
1511 | continue; | ||
1512 | } | ||
1513 | if (WARN_ON(count == children)) | ||
1514 | return -EINVAL; | ||
1515 | pred->index &= ~FILTER_PRED_FOLD; | ||
1516 | root->ops[count++] = pred->index; | ||
1517 | pred = get_pred_parent(pred, preds, | ||
1518 | pred->parent, &move); | ||
1519 | continue; | ||
1520 | case MOVE_UP_FROM_LEFT: | ||
1521 | pred = &preds[pred->right]; | ||
1522 | move = MOVE_DOWN; | ||
1523 | continue; | ||
1524 | case MOVE_UP_FROM_RIGHT: | ||
1525 | if (pred == root) | ||
1526 | break; | ||
1527 | pred = get_pred_parent(pred, preds, | ||
1528 | pred->parent, &move); | ||
1529 | continue; | ||
1530 | } | ||
1531 | done = 1; | ||
1532 | } while (!done); | ||
1533 | 1502 | ||
1534 | return 0; | 1503 | if (move != MOVE_DOWN) |
1504 | return WALK_PRED_DEFAULT; | ||
1505 | if (!(pred->index & FILTER_PRED_FOLD)) | ||
1506 | return WALK_PRED_DEFAULT; | ||
1507 | |||
1508 | *err = fold_pred(preds, pred); | ||
1509 | if (*err) | ||
1510 | return WALK_PRED_ABORT; | ||
1511 | |||
1512 | /* everything below is folded, continue with parent */ | ||
1513 | return WALK_PRED_PARENT; | ||
1535 | } | 1514 | } |
1536 | 1515 | ||
1537 | /* | 1516 | /* |
@@ -1542,51 +1521,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) | |||
1542 | static int fold_pred_tree(struct event_filter *filter, | 1521 | static int fold_pred_tree(struct event_filter *filter, |
1543 | struct filter_pred *root) | 1522 | struct filter_pred *root) |
1544 | { | 1523 | { |
1545 | struct filter_pred *preds; | 1524 | return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, |
1546 | struct filter_pred *pred; | 1525 | filter->preds); |
1547 | enum move_type move = MOVE_DOWN; | ||
1548 | int done = 0; | ||
1549 | int err; | ||
1550 | |||
1551 | preds = filter->preds; | ||
1552 | if (!preds) | ||
1553 | return -EINVAL; | ||
1554 | pred = root; | ||
1555 | |||
1556 | do { | ||
1557 | switch (move) { | ||
1558 | case MOVE_DOWN: | ||
1559 | if (pred->index & FILTER_PRED_FOLD) { | ||
1560 | err = fold_pred(preds, pred); | ||
1561 | if (err) | ||
1562 | return err; | ||
1563 | /* Folded nodes are like leafs */ | ||
1564 | } else if (pred->left != FILTER_PRED_INVALID) { | ||
1565 | pred = &preds[pred->left]; | ||
1566 | continue; | ||
1567 | } | ||
1568 | |||
1569 | /* A leaf at the root is just a leaf in the tree */ | ||
1570 | if (pred == root) | ||
1571 | break; | ||
1572 | pred = get_pred_parent(pred, preds, | ||
1573 | pred->parent, &move); | ||
1574 | continue; | ||
1575 | case MOVE_UP_FROM_LEFT: | ||
1576 | pred = &preds[pred->right]; | ||
1577 | move = MOVE_DOWN; | ||
1578 | continue; | ||
1579 | case MOVE_UP_FROM_RIGHT: | ||
1580 | if (pred == root) | ||
1581 | break; | ||
1582 | pred = get_pred_parent(pred, preds, | ||
1583 | pred->parent, &move); | ||
1584 | continue; | ||
1585 | } | ||
1586 | done = 1; | ||
1587 | } while (!done); | ||
1588 | |||
1589 | return 0; | ||
1590 | } | 1526 | } |
1591 | 1527 | ||
1592 | static int replace_preds(struct ftrace_event_call *call, | 1528 | static int replace_preds(struct ftrace_event_call *call, |
@@ -1643,27 +1579,17 @@ static int replace_preds(struct ftrace_event_call *call, | |||
1643 | goto fail; | 1579 | goto fail; |
1644 | } | 1580 | } |
1645 | 1581 | ||
1646 | if (elt->op == OP_AND || elt->op == OP_OR) { | 1582 | pred = create_pred(ps, call, elt->op, operand1, operand2); |
1647 | pred = create_logical_pred(elt->op); | 1583 | if (!pred) { |
1648 | goto add_pred; | ||
1649 | } | ||
1650 | |||
1651 | if (!operand1 || !operand2) { | ||
1652 | parse_error(ps, FILT_ERR_MISSING_FIELD, 0); | ||
1653 | err = -EINVAL; | 1584 | err = -EINVAL; |
1654 | goto fail; | 1585 | goto fail; |
1655 | } | 1586 | } |
1656 | 1587 | ||
1657 | pred = create_pred(elt->op, operand1, operand2); | 1588 | if (!dry_run) { |
1658 | add_pred: | 1589 | err = filter_add_pred(ps, filter, pred, &stack); |
1659 | if (!pred) { | 1590 | if (err) |
1660 | err = -ENOMEM; | 1591 | goto fail; |
1661 | goto fail; | ||
1662 | } | 1592 | } |
1663 | err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); | ||
1664 | filter_free_pred(pred); | ||
1665 | if (err) | ||
1666 | goto fail; | ||
1667 | 1593 | ||
1668 | operand1 = operand2 = NULL; | 1594 | operand1 = operand2 = NULL; |
1669 | } | 1595 | } |
@@ -1729,7 +1655,9 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1729 | */ | 1655 | */ |
1730 | err = replace_preds(call, NULL, ps, filter_string, true); | 1656 | err = replace_preds(call, NULL, ps, filter_string, true); |
1731 | if (err) | 1657 | if (err) |
1732 | goto fail; | 1658 | call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; |
1659 | else | ||
1660 | call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; | ||
1733 | } | 1661 | } |
1734 | 1662 | ||
1735 | list_for_each_entry(call, &ftrace_events, list) { | 1663 | list_for_each_entry(call, &ftrace_events, list) { |
@@ -1738,6 +1666,9 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1738 | if (strcmp(call->class->system, system->name) != 0) | 1666 | if (strcmp(call->class->system, system->name) != 0) |
1739 | continue; | 1667 | continue; |
1740 | 1668 | ||
1669 | if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) | ||
1670 | continue; | ||
1671 | |||
1741 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); | 1672 | filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); |
1742 | if (!filter_item) | 1673 | if (!filter_item) |
1743 | goto fail_mem; | 1674 | goto fail_mem; |
@@ -1766,7 +1697,7 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1766 | * replace the filter for the call. | 1697 | * replace the filter for the call. |
1767 | */ | 1698 | */ |
1768 | filter = call->filter; | 1699 | filter = call->filter; |
1769 | call->filter = filter_item->filter; | 1700 | rcu_assign_pointer(call->filter, filter_item->filter); |
1770 | filter_item->filter = filter; | 1701 | filter_item->filter = filter; |
1771 | 1702 | ||
1772 | fail = false; | 1703 | fail = false; |
@@ -1821,7 +1752,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) | |||
1821 | filter = call->filter; | 1752 | filter = call->filter; |
1822 | if (!filter) | 1753 | if (!filter) |
1823 | goto out_unlock; | 1754 | goto out_unlock; |
1824 | call->filter = NULL; | 1755 | RCU_INIT_POINTER(call->filter, NULL); |
1825 | /* Make sure the filter is not being used */ | 1756 | /* Make sure the filter is not being used */ |
1826 | synchronize_sched(); | 1757 | synchronize_sched(); |
1827 | __free_filter(filter); | 1758 | __free_filter(filter); |
@@ -1862,7 +1793,7 @@ out: | |||
1862 | * string | 1793 | * string |
1863 | */ | 1794 | */ |
1864 | tmp = call->filter; | 1795 | tmp = call->filter; |
1865 | call->filter = filter; | 1796 | rcu_assign_pointer(call->filter, filter); |
1866 | if (tmp) { | 1797 | if (tmp) { |
1867 | /* Make sure the call is done with the filter */ | 1798 | /* Make sure the call is done with the filter */ |
1868 | synchronize_sched(); | 1799 | synchronize_sched(); |
@@ -1913,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1913 | if (!filter) | 1844 | if (!filter) |
1914 | goto out; | 1845 | goto out; |
1915 | 1846 | ||
1916 | replace_filter_string(filter, filter_string); | 1847 | /* System filters just show a default message */ |
1848 | kfree(filter->filter_string); | ||
1849 | filter->filter_string = NULL; | ||
1850 | |||
1917 | /* | 1851 | /* |
1918 | * No event actually uses the system filter | 1852 | * No event actually uses the system filter |
1919 | * we can free it without synchronize_sched(). | 1853 | * we can free it without synchronize_sched(). |
@@ -1923,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
1923 | 1857 | ||
1924 | parse_init(ps, filter_ops, filter_string); | 1858 | parse_init(ps, filter_ops, filter_string); |
1925 | err = filter_parse(ps); | 1859 | err = filter_parse(ps); |
1926 | if (err) { | 1860 | if (err) |
1927 | append_filter_err(ps, system->filter); | 1861 | goto err_filter; |
1928 | goto out; | ||
1929 | } | ||
1930 | 1862 | ||
1931 | err = replace_system_preds(system, ps, filter_string); | 1863 | err = replace_system_preds(system, ps, filter_string); |
1932 | if (err) | 1864 | if (err) |
1933 | append_filter_err(ps, system->filter); | 1865 | goto err_filter; |
1934 | 1866 | ||
1935 | out: | 1867 | out: |
1936 | filter_opstack_clear(ps); | 1868 | filter_opstack_clear(ps); |
@@ -1940,6 +1872,11 @@ out_unlock: | |||
1940 | mutex_unlock(&event_mutex); | 1872 | mutex_unlock(&event_mutex); |
1941 | 1873 | ||
1942 | return err; | 1874 | return err; |
1875 | |||
1876 | err_filter: | ||
1877 | replace_filter_string(filter, filter_string); | ||
1878 | append_filter_err(ps, system->filter); | ||
1879 | goto out; | ||
1943 | } | 1880 | } |
1944 | 1881 | ||
1945 | #ifdef CONFIG_PERF_EVENTS | 1882 | #ifdef CONFIG_PERF_EVENTS |
@@ -1958,17 +1895,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
1958 | int err; | 1895 | int err; |
1959 | struct event_filter *filter; | 1896 | struct event_filter *filter; |
1960 | struct filter_parse_state *ps; | 1897 | struct filter_parse_state *ps; |
1961 | struct ftrace_event_call *call = NULL; | 1898 | struct ftrace_event_call *call; |
1962 | 1899 | ||
1963 | mutex_lock(&event_mutex); | 1900 | mutex_lock(&event_mutex); |
1964 | 1901 | ||
1965 | list_for_each_entry(call, &ftrace_events, list) { | 1902 | call = event->tp_event; |
1966 | if (call->event.type == event_id) | ||
1967 | break; | ||
1968 | } | ||
1969 | 1903 | ||
1970 | err = -EINVAL; | 1904 | err = -EINVAL; |
1971 | if (&call->list == &ftrace_events) | 1905 | if (!call) |
1972 | goto out_unlock; | 1906 | goto out_unlock; |
1973 | 1907 | ||
1974 | err = -EEXIST; | 1908 | err = -EEXIST; |
@@ -2012,3 +1946,215 @@ out_unlock: | |||
2012 | 1946 | ||
2013 | #endif /* CONFIG_PERF_EVENTS */ | 1947 | #endif /* CONFIG_PERF_EVENTS */ |
2014 | 1948 | ||
1949 | #ifdef CONFIG_FTRACE_STARTUP_TEST | ||
1950 | |||
1951 | #include <linux/types.h> | ||
1952 | #include <linux/tracepoint.h> | ||
1953 | |||
1954 | #define CREATE_TRACE_POINTS | ||
1955 | #include "trace_events_filter_test.h" | ||
1956 | |||
1957 | static int test_get_filter(char *filter_str, struct ftrace_event_call *call, | ||
1958 | struct event_filter **pfilter) | ||
1959 | { | ||
1960 | struct event_filter *filter; | ||
1961 | struct filter_parse_state *ps; | ||
1962 | int err = -ENOMEM; | ||
1963 | |||
1964 | filter = __alloc_filter(); | ||
1965 | if (!filter) | ||
1966 | goto out; | ||
1967 | |||
1968 | ps = kzalloc(sizeof(*ps), GFP_KERNEL); | ||
1969 | if (!ps) | ||
1970 | goto free_filter; | ||
1971 | |||
1972 | parse_init(ps, filter_ops, filter_str); | ||
1973 | err = filter_parse(ps); | ||
1974 | if (err) | ||
1975 | goto free_ps; | ||
1976 | |||
1977 | err = replace_preds(call, filter, ps, filter_str, false); | ||
1978 | if (!err) | ||
1979 | *pfilter = filter; | ||
1980 | |||
1981 | free_ps: | ||
1982 | filter_opstack_clear(ps); | ||
1983 | postfix_clear(ps); | ||
1984 | kfree(ps); | ||
1985 | |||
1986 | free_filter: | ||
1987 | if (err) | ||
1988 | __free_filter(filter); | ||
1989 | |||
1990 | out: | ||
1991 | return err; | ||
1992 | } | ||
1993 | |||
1994 | #define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ | ||
1995 | { \ | ||
1996 | .filter = FILTER, \ | ||
1997 | .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ | ||
1998 | .e = ve, .f = vf, .g = vg, .h = vh }, \ | ||
1999 | .match = m, \ | ||
2000 | .not_visited = nvisit, \ | ||
2001 | } | ||
2002 | #define YES 1 | ||
2003 | #define NO 0 | ||
2004 | |||
2005 | static struct test_filter_data_t { | ||
2006 | char *filter; | ||
2007 | struct ftrace_raw_ftrace_test_filter rec; | ||
2008 | int match; | ||
2009 | char *not_visited; | ||
2010 | } test_filter_data[] = { | ||
2011 | #define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ | ||
2012 | "e == 1 && f == 1 && g == 1 && h == 1" | ||
2013 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), | ||
2014 | DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), | ||
2015 | DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), | ||
2016 | #undef FILTER | ||
2017 | #define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ | ||
2018 | "e == 1 || f == 1 || g == 1 || h == 1" | ||
2019 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), | ||
2020 | DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), | ||
2021 | DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), | ||
2022 | #undef FILTER | ||
2023 | #define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ | ||
2024 | "(e == 1 || f == 1) && (g == 1 || h == 1)" | ||
2025 | DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), | ||
2026 | DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), | ||
2027 | DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), | ||
2028 | DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), | ||
2029 | #undef FILTER | ||
2030 | #define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ | ||
2031 | "(e == 1 && f == 1) || (g == 1 && h == 1)" | ||
2032 | DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), | ||
2033 | DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), | ||
2034 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), | ||
2035 | #undef FILTER | ||
2036 | #define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \ | ||
2037 | "(e == 1 && f == 1) || (g == 1 && h == 1)" | ||
2038 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), | ||
2039 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), | ||
2040 | DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), | ||
2041 | #undef FILTER | ||
2042 | #define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ | ||
2043 | "(e == 1 || f == 1)) && (g == 1 || h == 1)" | ||
2044 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), | ||
2045 | DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), | ||
2046 | DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), | ||
2047 | #undef FILTER | ||
2048 | #define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ | ||
2049 | "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" | ||
2050 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), | ||
2051 | DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), | ||
2052 | DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), | ||
2053 | #undef FILTER | ||
2054 | #define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ | ||
2055 | "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" | ||
2056 | DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), | ||
2057 | DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), | ||
2058 | DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), | ||
2059 | }; | ||
2060 | |||
2061 | #undef DATA_REC | ||
2062 | #undef FILTER | ||
2063 | #undef YES | ||
2064 | #undef NO | ||
2065 | |||
2066 | #define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) | ||
2067 | |||
2068 | static int test_pred_visited; | ||
2069 | |||
2070 | static int test_pred_visited_fn(struct filter_pred *pred, void *event) | ||
2071 | { | ||
2072 | struct ftrace_event_field *field = pred->field; | ||
2073 | |||
2074 | test_pred_visited = 1; | ||
2075 | printk(KERN_INFO "\npred visited %s\n", field->name); | ||
2076 | return 1; | ||
2077 | } | ||
2078 | |||
2079 | static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, | ||
2080 | int *err, void *data) | ||
2081 | { | ||
2082 | char *fields = data; | ||
2083 | |||
2084 | if ((move == MOVE_DOWN) && | ||
2085 | (pred->left == FILTER_PRED_INVALID)) { | ||
2086 | struct ftrace_event_field *field = pred->field; | ||
2087 | |||
2088 | if (!field) { | ||
2089 | WARN(1, "all leafs should have field defined"); | ||
2090 | return WALK_PRED_DEFAULT; | ||
2091 | } | ||
2092 | if (!strchr(fields, *field->name)) | ||
2093 | return WALK_PRED_DEFAULT; | ||
2094 | |||
2095 | WARN_ON(!pred->fn); | ||
2096 | pred->fn = test_pred_visited_fn; | ||
2097 | } | ||
2098 | return WALK_PRED_DEFAULT; | ||
2099 | } | ||
2100 | |||
2101 | static __init int ftrace_test_event_filter(void) | ||
2102 | { | ||
2103 | int i; | ||
2104 | |||
2105 | printk(KERN_INFO "Testing ftrace filter: "); | ||
2106 | |||
2107 | for (i = 0; i < DATA_CNT; i++) { | ||
2108 | struct event_filter *filter = NULL; | ||
2109 | struct test_filter_data_t *d = &test_filter_data[i]; | ||
2110 | int err; | ||
2111 | |||
2112 | err = test_get_filter(d->filter, &event_ftrace_test_filter, | ||
2113 | &filter); | ||
2114 | if (err) { | ||
2115 | printk(KERN_INFO | ||
2116 | "Failed to get filter for '%s', err %d\n", | ||
2117 | d->filter, err); | ||
2118 | break; | ||
2119 | } | ||
2120 | |||
2121 | /* | ||
2122 | * The preemption disabling is not really needed for self | ||
2123 | * tests, but the rcu dereference will complain without it. | ||
2124 | */ | ||
2125 | preempt_disable(); | ||
2126 | if (*d->not_visited) | ||
2127 | walk_pred_tree(filter->preds, filter->root, | ||
2128 | test_walk_pred_cb, | ||
2129 | d->not_visited); | ||
2130 | |||
2131 | test_pred_visited = 0; | ||
2132 | err = filter_match_preds(filter, &d->rec); | ||
2133 | preempt_enable(); | ||
2134 | |||
2135 | __free_filter(filter); | ||
2136 | |||
2137 | if (test_pred_visited) { | ||
2138 | printk(KERN_INFO | ||
2139 | "Failed, unwanted pred visited for filter %s\n", | ||
2140 | d->filter); | ||
2141 | break; | ||
2142 | } | ||
2143 | |||
2144 | if (err != d->match) { | ||
2145 | printk(KERN_INFO | ||
2146 | "Failed to match filter '%s', expected %d\n", | ||
2147 | d->filter, d->match); | ||
2148 | break; | ||
2149 | } | ||
2150 | } | ||
2151 | |||
2152 | if (i == DATA_CNT) | ||
2153 | printk(KERN_CONT "OK\n"); | ||
2154 | |||
2155 | return 0; | ||
2156 | } | ||
2157 | |||
2158 | late_initcall(ftrace_test_event_filter); | ||
2159 | |||
2160 | #endif /* CONFIG_FTRACE_STARTUP_TEST */ | ||
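
The trace_events_filter.c changes above replace four hand-rolled iterative tree walks (check_pred_tree, count_leafs, fold_pred, fold_pred_tree) with a single walk_pred_tree() helper driven by small callbacks that return WALK_PRED_DEFAULT, WALK_PRED_PARENT or WALK_PRED_ABORT. The standalone C sketch below illustrates that callback pattern on an ordinary pointer-based tree; it is only an illustration of the idea, since the kernel walker is non-recursive and operates on an index-based filter_pred array, and the names here (struct pred, walk_tree) are invented for the example.

/* Illustrative userspace sketch of the "one generic walker, many small
 * callbacks" pattern introduced above. Recursion is used for brevity;
 * the kernel's walk_pred_tree() is iterative and index-based. */
#include <stdio.h>

enum move_type { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };
enum walk_ret  { WALK_DEFAULT, WALK_ABORT };

struct pred {
	const char *name;
	struct pred *left, *right;		/* NULL for a leaf */
};

typedef enum walk_ret (*walk_cb)(enum move_type move, struct pred *p,
				 int *err, void *data);

/* Visit every node, reporting each down/up movement to the callback. */
static int walk_tree(struct pred *p, walk_cb cb, void *data)
{
	int err = 0;

	if (!p)
		return 0;
	if (cb(MOVE_DOWN, p, &err, data) == WALK_ABORT)
		return err;
	if (p->left) {
		err = walk_tree(p->left, cb, data);
		if (err)
			return err;
		if (cb(MOVE_UP_FROM_LEFT, p, &err, data) == WALK_ABORT)
			return err;
		err = walk_tree(p->right, cb, data);
		if (err)
			return err;
		if (cb(MOVE_UP_FROM_RIGHT, p, &err, data) == WALK_ABORT)
			return err;
	}
	return err;
}

/* Example callback in the style of count_leafs_cb(): count the leaves. */
static enum walk_ret count_leaf_cb(enum move_type move, struct pred *p,
				   int *err, void *data)
{
	int *count = data;

	if (move == MOVE_DOWN && !p->left)
		(*count)++;
	return WALK_DEFAULT;
}

int main(void)
{
	struct pred a = { "a == 1" }, b = { "b == 1" }, c = { "c == 1" };
	struct pred p_and = { "&&", &a, &b };
	struct pred p_or  = { "||", &p_and, &c };
	int leaves = 0;

	walk_tree(&p_or, count_leaf_cb, &leaves);
	printf("leaves: %d\n", leaves);		/* prints 3 */
	return 0;
}

With the traversal centralized, each operation (sanity check, leaf counting, folding) shrinks to a callback of a few lines, which is exactly what check_pred_tree_cb(), count_leafs_cb(), fold_pred_cb() and fold_pred_tree_cb() do in the patch.
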
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 000000000000..bfd4dba0d603 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h | |||
@@ -0,0 +1,50 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM test | ||
3 | |||
4 | #if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_TEST_H | ||
6 | |||
7 | #include <linux/tracepoint.h> | ||
8 | |||
9 | TRACE_EVENT(ftrace_test_filter, | ||
10 | |||
11 | TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), | ||
12 | |||
13 | TP_ARGS(a, b, c, d, e, f, g, h), | ||
14 | |||
15 | TP_STRUCT__entry( | ||
16 | __field(int, a) | ||
17 | __field(int, b) | ||
18 | __field(int, c) | ||
19 | __field(int, d) | ||
20 | __field(int, e) | ||
21 | __field(int, f) | ||
22 | __field(int, g) | ||
23 | __field(int, h) | ||
24 | ), | ||
25 | |||
26 | TP_fast_assign( | ||
27 | __entry->a = a; | ||
28 | __entry->b = b; | ||
29 | __entry->c = c; | ||
30 | __entry->d = d; | ||
31 | __entry->e = e; | ||
32 | __entry->f = f; | ||
33 | __entry->g = g; | ||
34 | __entry->h = h; | ||
35 | ), | ||
36 | |||
37 | TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", | ||
38 | __entry->a, __entry->b, __entry->c, __entry->d, | ||
39 | __entry->e, __entry->f, __entry->g, __entry->h) | ||
40 | ); | ||
41 | |||
42 | #endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ | ||
43 | |||
44 | #undef TRACE_INCLUDE_PATH | ||
45 | #undef TRACE_INCLUDE_FILE | ||
46 | #define TRACE_INCLUDE_PATH . | ||
47 | #define TRACE_INCLUDE_FILE trace_events_filter_test | ||
48 | |||
49 | /* This part must be outside protection */ | ||
50 | #include <trace/define_trace.h> | ||
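
The ftrace_test_filter tracepoint above exists purely so that the CONFIG_FTRACE_STARTUP_TEST table in trace_events_filter.c has eight integer fields to filter on. Each DATA_REC row can be read as: plug the record values into the FILTER expression and compare the result with the expected match. The snippet below replays one row in plain C as a sanity check; it only shows how to read the table and is not kernel code.

/* One row of the self-test table, evaluated by hand:
 * FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) &&
 *         (e == 1 || f == 1) && (g == 1 || h == 1)"
 * DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh")		*/
#include <stdio.h>

struct rec { int a, b, c, d, e, f, g, h; };

int main(void)
{
	struct rec r = { 0, 0, 1, 1, 1, 1, 1, 1 };
	int match = (r.a == 1 || r.b == 1) && (r.c == 1 || r.d == 1) &&
		    (r.e == 1 || r.f == 1) && (r.g == 1 || r.h == 1);

	printf("match = %d (table expects NO)\n", match);	/* match = 0 */
	return 0;
}

The third value of each row ("dfh" here) lists the predicates the tree evaluation should never visit; the self test checks this by temporarily pointing their pred->fn at test_pred_visited_fn() and failing if it ever runs.
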
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..99d20e920368 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly; | |||
23 | 23 | ||
24 | static DEFINE_PER_CPU(int, tracing_cpu); | 24 | static DEFINE_PER_CPU(int, tracing_cpu); |
25 | 25 | ||
26 | static DEFINE_SPINLOCK(max_trace_lock); | 26 | static DEFINE_RAW_SPINLOCK(max_trace_lock); |
27 | 27 | ||
28 | enum { | 28 | enum { |
29 | TRACER_IRQS_OFF = (1 << 1), | 29 | TRACER_IRQS_OFF = (1 << 1), |
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | |||
280 | } | 280 | } |
281 | 281 | ||
282 | static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } | 282 | static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } |
283 | static void irqsoff_print_header(struct seq_file *s) { } | ||
284 | static void irqsoff_trace_open(struct trace_iterator *iter) { } | 283 | static void irqsoff_trace_open(struct trace_iterator *iter) { } |
285 | static void irqsoff_trace_close(struct trace_iterator *iter) { } | 284 | static void irqsoff_trace_close(struct trace_iterator *iter) { } |
285 | |||
286 | #ifdef CONFIG_FUNCTION_TRACER | ||
287 | static void irqsoff_print_header(struct seq_file *s) | ||
288 | { | ||
289 | trace_default_header(s); | ||
290 | } | ||
291 | #else | ||
292 | static void irqsoff_print_header(struct seq_file *s) | ||
293 | { | ||
294 | trace_latency_header(s); | ||
295 | } | ||
296 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
286 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 297 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
287 | 298 | ||
288 | /* | 299 | /* |
@@ -321,7 +332,7 @@ check_critical_timing(struct trace_array *tr, | |||
321 | if (!report_latency(delta)) | 332 | if (!report_latency(delta)) |
322 | goto out; | 333 | goto out; |
323 | 334 | ||
324 | spin_lock_irqsave(&max_trace_lock, flags); | 335 | raw_spin_lock_irqsave(&max_trace_lock, flags); |
325 | 336 | ||
326 | /* check if we are still the max latency */ | 337 | /* check if we are still the max latency */ |
327 | if (!report_latency(delta)) | 338 | if (!report_latency(delta)) |
@@ -344,7 +355,7 @@ check_critical_timing(struct trace_array *tr, | |||
344 | max_sequence++; | 355 | max_sequence++; |
345 | 356 | ||
346 | out_unlock: | 357 | out_unlock: |
347 | spin_unlock_irqrestore(&max_trace_lock, flags); | 358 | raw_spin_unlock_irqrestore(&max_trace_lock, flags); |
348 | 359 | ||
349 | out: | 360 | out: |
350 | data->critical_sequence = max_sequence; | 361 | data->critical_sequence = max_sequence; |
@@ -505,13 +516,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); | |||
505 | #ifdef CONFIG_PREEMPT_TRACER | 516 | #ifdef CONFIG_PREEMPT_TRACER |
506 | void trace_preempt_on(unsigned long a0, unsigned long a1) | 517 | void trace_preempt_on(unsigned long a0, unsigned long a1) |
507 | { | 518 | { |
508 | if (preempt_trace()) | 519 | if (preempt_trace() && !irq_trace()) |
509 | stop_critical_timing(a0, a1); | 520 | stop_critical_timing(a0, a1); |
510 | } | 521 | } |
511 | 522 | ||
512 | void trace_preempt_off(unsigned long a0, unsigned long a1) | 523 | void trace_preempt_off(unsigned long a0, unsigned long a1) |
513 | { | 524 | { |
514 | if (preempt_trace()) | 525 | if (preempt_trace() && !irq_trace()) |
515 | start_critical_timing(a0, a1); | 526 | start_critical_timing(a0, a1); |
516 | } | 527 | } |
517 | #endif /* CONFIG_PREEMPT_TRACER */ | 528 | #endif /* CONFIG_PREEMPT_TRACER */ |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5fb3697bf0e5..00d527c945a4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp) | |||
836 | } | 836 | } |
837 | 837 | ||
838 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ | 838 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ |
839 | static void unregister_trace_probe(struct trace_probe *tp) | 839 | static int unregister_trace_probe(struct trace_probe *tp) |
840 | { | 840 | { |
841 | /* Enabled event can not be unregistered */ | ||
842 | if (trace_probe_is_enabled(tp)) | ||
843 | return -EBUSY; | ||
844 | |||
841 | __unregister_trace_probe(tp); | 845 | __unregister_trace_probe(tp); |
842 | list_del(&tp->list); | 846 | list_del(&tp->list); |
843 | unregister_probe_event(tp); | 847 | unregister_probe_event(tp); |
848 | |||
849 | return 0; | ||
844 | } | 850 | } |
845 | 851 | ||
846 | /* Register a trace_probe and probe_event */ | 852 | /* Register a trace_probe and probe_event */ |
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp) | |||
854 | /* Delete old (same name) event if exist */ | 860 | /* Delete old (same name) event if exist */ |
855 | old_tp = find_trace_probe(tp->call.name, tp->call.class->system); | 861 | old_tp = find_trace_probe(tp->call.name, tp->call.class->system); |
856 | if (old_tp) { | 862 | if (old_tp) { |
857 | unregister_trace_probe(old_tp); | 863 | ret = unregister_trace_probe(old_tp); |
864 | if (ret < 0) | ||
865 | goto end; | ||
858 | free_trace_probe(old_tp); | 866 | free_trace_probe(old_tp); |
859 | } | 867 | } |
860 | 868 | ||
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb, | |||
892 | mutex_lock(&probe_lock); | 900 | mutex_lock(&probe_lock); |
893 | list_for_each_entry(tp, &probe_list, list) { | 901 | list_for_each_entry(tp, &probe_list, list) { |
894 | if (trace_probe_within_module(tp, mod)) { | 902 | if (trace_probe_within_module(tp, mod)) { |
903 | /* Don't need to check busy - this should have gone. */ | ||
895 | __unregister_trace_probe(tp); | 904 | __unregister_trace_probe(tp); |
896 | ret = __register_trace_probe(tp); | 905 | ret = __register_trace_probe(tp); |
897 | if (ret) | 906 | if (ret) |
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv) | |||
1205 | return -ENOENT; | 1214 | return -ENOENT; |
1206 | } | 1215 | } |
1207 | /* delete an event */ | 1216 | /* delete an event */ |
1208 | unregister_trace_probe(tp); | 1217 | ret = unregister_trace_probe(tp); |
1209 | free_trace_probe(tp); | 1218 | if (ret == 0) |
1219 | free_trace_probe(tp); | ||
1210 | mutex_unlock(&probe_lock); | 1220 | mutex_unlock(&probe_lock); |
1211 | return 0; | 1221 | return ret; |
1212 | } | 1222 | } |
1213 | 1223 | ||
1214 | if (argc < 2) { | 1224 | if (argc < 2) { |
@@ -1317,18 +1327,29 @@ error: | |||
1317 | return ret; | 1327 | return ret; |
1318 | } | 1328 | } |
1319 | 1329 | ||
1320 | static void release_all_trace_probes(void) | 1330 | static int release_all_trace_probes(void) |
1321 | { | 1331 | { |
1322 | struct trace_probe *tp; | 1332 | struct trace_probe *tp; |
1333 | int ret = 0; | ||
1323 | 1334 | ||
1324 | mutex_lock(&probe_lock); | 1335 | mutex_lock(&probe_lock); |
1336 | /* Ensure no probe is in use. */ | ||
1337 | list_for_each_entry(tp, &probe_list, list) | ||
1338 | if (trace_probe_is_enabled(tp)) { | ||
1339 | ret = -EBUSY; | ||
1340 | goto end; | ||
1341 | } | ||
1325 | /* TODO: Use batch unregistration */ | 1342 | /* TODO: Use batch unregistration */ |
1326 | while (!list_empty(&probe_list)) { | 1343 | while (!list_empty(&probe_list)) { |
1327 | tp = list_entry(probe_list.next, struct trace_probe, list); | 1344 | tp = list_entry(probe_list.next, struct trace_probe, list); |
1328 | unregister_trace_probe(tp); | 1345 | unregister_trace_probe(tp); |
1329 | free_trace_probe(tp); | 1346 | free_trace_probe(tp); |
1330 | } | 1347 | } |
1348 | |||
1349 | end: | ||
1331 | mutex_unlock(&probe_lock); | 1350 | mutex_unlock(&probe_lock); |
1351 | |||
1352 | return ret; | ||
1332 | } | 1353 | } |
1333 | 1354 | ||
1334 | /* Probes listing interfaces */ | 1355 | /* Probes listing interfaces */ |
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = { | |||
1380 | 1401 | ||
1381 | static int probes_open(struct inode *inode, struct file *file) | 1402 | static int probes_open(struct inode *inode, struct file *file) |
1382 | { | 1403 | { |
1383 | if ((file->f_mode & FMODE_WRITE) && | 1404 | int ret; |
1384 | (file->f_flags & O_TRUNC)) | 1405 | |
1385 | release_all_trace_probes(); | 1406 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { |
1407 | ret = release_all_trace_probes(); | ||
1408 | if (ret < 0) | ||
1409 | return ret; | ||
1410 | } | ||
1386 | 1411 | ||
1387 | return seq_open(file, &probes_seq_op); | 1412 | return seq_open(file, &probes_seq_op); |
1388 | } | 1413 | } |
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void) | |||
2055 | 2080 | ||
2056 | ret = target(1, 2, 3, 4, 5, 6); | 2081 | ret = target(1, 2, 3, 4, 5, 6); |
2057 | 2082 | ||
2083 | /* Disable trace points before removing it */ | ||
2084 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); | ||
2085 | if (WARN_ON_ONCE(tp == NULL)) { | ||
2086 | pr_warning("error on getting test probe.\n"); | ||
2087 | warn++; | ||
2088 | } else | ||
2089 | disable_trace_probe(tp, TP_FLAG_TRACE); | ||
2090 | |||
2091 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); | ||
2092 | if (WARN_ON_ONCE(tp == NULL)) { | ||
2093 | pr_warning("error on getting 2nd test probe.\n"); | ||
2094 | warn++; | ||
2095 | } else | ||
2096 | disable_trace_probe(tp, TP_FLAG_TRACE); | ||
2097 | |||
2058 | ret = command_trace_probe("-:testprobe"); | 2098 | ret = command_trace_probe("-:testprobe"); |
2059 | if (WARN_ON_ONCE(ret)) { | 2099 | if (WARN_ON_ONCE(ret)) { |
2060 | pr_warning("error on deleting a probe.\n"); | 2100 | pr_warning("error on deleting a probe.\n"); |
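
The trace_kprobe.c hunks change unregister_trace_probe(), release_all_trace_probes() and probes_open() from ignore-errors to an -EBUSY contract: a probe that is still enabled can no longer be silently removed, and the error is propagated to the user writing to kprobe_events. Below is a minimal userspace sketch of that contract, with invented names, not the kernel implementation.

#include <errno.h>
#include <stdio.h>

struct probe { const char *name; int enabled; };

/* Mirrors the new unregister_trace_probe(): refuse while in use. */
static int unregister_probe(struct probe *p)
{
	if (p->enabled)
		return -EBUSY;
	printf("removed %s\n", p->name);
	return 0;
}

int main(void)
{
	struct probe p = { "testprobe", 1 };

	if (unregister_probe(&p) == -EBUSY)
		printf("%s is busy, disable it first\n", p.name);

	p.enabled = 0;				/* disable, then retry */
	return unregister_probe(&p) ? 1 : 0;
}

This is also why the startup self test above now disables both test probes before issuing "-:testprobe": deleting an enabled probe would fail with -EBUSY.
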
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 51999309a6cf..0d6ff3555942 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter) | |||
627 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); | 627 | unsigned long usec_rem = do_div(t, USEC_PER_SEC); |
628 | unsigned long secs = (unsigned long)t; | 628 | unsigned long secs = (unsigned long)t; |
629 | char comm[TASK_COMM_LEN]; | 629 | char comm[TASK_COMM_LEN]; |
630 | int ret; | ||
630 | 631 | ||
631 | trace_find_cmdline(entry->pid, comm); | 632 | trace_find_cmdline(entry->pid, comm); |
632 | 633 | ||
633 | return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", | 634 | ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", |
634 | comm, entry->pid, iter->cpu, secs, usec_rem); | 635 | comm, entry->pid, iter->cpu); |
636 | if (!ret) | ||
637 | return 0; | ||
638 | |||
639 | if (trace_flags & TRACE_ITER_IRQ_INFO) { | ||
640 | ret = trace_print_lat_fmt(s, entry); | ||
641 | if (!ret) | ||
642 | return 0; | ||
643 | } | ||
644 | |||
645 | return trace_seq_printf(s, " %5lu.%06lu: ", | ||
646 | secs, usec_rem); | ||
635 | } | 647 | } |
636 | 648 | ||
637 | int trace_print_lat_context(struct trace_iterator *iter) | 649 | int trace_print_lat_context(struct trace_iterator *iter) |
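
trace_print_context() now emits its output in stages (task/pid/cpu, optional latency flags when TRACE_ITER_IRQ_INFO is set, then the timestamp), bailing out as soon as trace_seq_printf() reports that the sequence buffer is full. The sketch below imitates that 0-on-overflow convention with a fixed buffer; trace_seq itself is more elaborate, and the buffer size and values here are made up for the example.

#include <stdarg.h>
#include <stdio.h>

struct seq { char buf[64]; size_t len; };

/* Append to the buffer; return 0 when it would overflow (caller stops). */
static int seq_printf(struct seq *s, const char *fmt, ...)
{
	size_t room = sizeof(s->buf) - s->len;
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(s->buf + s->len, room, fmt, ap);
	va_end(ap);
	if (n < 0 || (size_t)n >= room)
		return 0;
	s->len += n;
	return 1;
}

int main(void)
{
	struct seq s = { .len = 0 };

	if (!seq_printf(&s, "%16s-%-5d [%03d] ", "bash", 1234, 0))
		return 1;
	/* an optional latency-info stage would go here */
	if (!seq_printf(&s, " %5lu.%06lu: ", 10UL, 123456UL))
		return 1;
	puts(s.buf);
	return 0;
}
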
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1f06468a10d7..6fd4ffd042f9 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
59 | continue; | 59 | continue; |
60 | } | 60 | } |
61 | 61 | ||
62 | fmt = NULL; | ||
62 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); | 63 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); |
63 | if (tb_fmt) | 64 | if (tb_fmt) { |
64 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); | 65 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); |
65 | if (tb_fmt && fmt) { | 66 | if (fmt) { |
66 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); | 67 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); |
67 | strcpy(fmt, *iter); | 68 | strcpy(fmt, *iter); |
68 | tb_fmt->fmt = fmt; | 69 | tb_fmt->fmt = fmt; |
69 | *iter = tb_fmt->fmt; | 70 | } else |
70 | } else { | 71 | kfree(tb_fmt); |
71 | kfree(tb_fmt); | ||
72 | *iter = NULL; | ||
73 | } | 72 | } |
73 | *iter = fmt; | ||
74 | |||
74 | } | 75 | } |
75 | mutex_unlock(&btrace_mutex); | 76 | mutex_unlock(&btrace_mutex); |
76 | } | 77 | } |
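
The trace_printk.c hunk restructures the two-step allocation in hold_module_trace_bprintk_format(): allocate the list entry, then the format copy, and on partial failure free the entry and publish NULL instead of leaving *iter in an inconsistent state. A compressed userspace sketch of the same unwind pattern follows; the list handling is stubbed out with a comment and the names are illustrative.

#include <stdlib.h>
#include <string.h>

struct fmt_entry { char *fmt; };

/* Returns the saved copy, or NULL if either allocation failed. */
static const char *save_format(const char *src)
{
	char *fmt = NULL;
	struct fmt_entry *e = malloc(sizeof(*e));

	if (e) {
		fmt = malloc(strlen(src) + 1);
		if (fmt) {
			strcpy(fmt, src);
			e->fmt = fmt;
			/* the kernel adds e to trace_bprintk_fmt_list here */
		} else {
			free(e);	/* undo the first allocation */
		}
	}
	return fmt;			/* caller sees NULL on any failure */
}

int main(void)
{
	return save_format("hello %s\n") ? 0 : 1;
}
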
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b6..ff791ea48b57 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | |||
280 | } | 280 | } |
281 | 281 | ||
282 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } | 282 | static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } |
283 | static void wakeup_print_header(struct seq_file *s) { } | ||
284 | static void wakeup_trace_open(struct trace_iterator *iter) { } | 283 | static void wakeup_trace_open(struct trace_iterator *iter) { } |
285 | static void wakeup_trace_close(struct trace_iterator *iter) { } | 284 | static void wakeup_trace_close(struct trace_iterator *iter) { } |
285 | |||
286 | #ifdef CONFIG_FUNCTION_TRACER | ||
287 | static void wakeup_print_header(struct seq_file *s) | ||
288 | { | ||
289 | trace_default_header(s); | ||
290 | } | ||
291 | #else | ||
292 | static void wakeup_print_header(struct seq_file *s) | ||
293 | { | ||
294 | trace_latency_header(s); | ||
295 | } | ||
296 | #endif /* CONFIG_FUNCTION_TRACER */ | ||
286 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 297 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
287 | 298 | ||
288 | /* | 299 | /* |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ee7b5a0bb9f8..cb654542c1a1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <trace/events/syscalls.h> | 2 | #include <trace/events/syscalls.h> |
3 | #include <linux/slab.h> | 3 | #include <linux/slab.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ | ||
5 | #include <linux/ftrace.h> | 6 | #include <linux/ftrace.h> |
6 | #include <linux/perf_event.h> | 7 | #include <linux/perf_event.h> |
7 | #include <asm/syscall.h> | 8 | #include <asm/syscall.h> |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index b219f1449c54..db110b8ae030 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; | |||
34 | static const int tracepoint_debug; | 34 | static const int tracepoint_debug; |
35 | 35 | ||
36 | /* | 36 | /* |
37 | * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the | 37 | * Tracepoints mutex protects the builtin and module tracepoints and the hash |
38 | * builtin and module tracepoints and the hash table. | 38 | * table, as well as the local module list. |
39 | */ | 39 | */ |
40 | static DEFINE_MUTEX(tracepoints_mutex); | 40 | static DEFINE_MUTEX(tracepoints_mutex); |
41 | 41 | ||
42 | #ifdef CONFIG_MODULES | ||
43 | /* Local list of struct module */ | ||
44 | static LIST_HEAD(tracepoint_module_list); | ||
45 | #endif /* CONFIG_MODULES */ | ||
46 | |||
42 | /* | 47 | /* |
43 | * Tracepoint hash table, containing the active tracepoints. | 48 | * Tracepoint hash table, containing the active tracepoints. |
44 | * Protected by tracepoints_mutex. | 49 | * Protected by tracepoints_mutex. |
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem) | |||
292 | * @end: end of the range | 297 | * @end: end of the range |
293 | * | 298 | * |
294 | * Updates the probe callback corresponding to a range of tracepoints. | 299 | * Updates the probe callback corresponding to a range of tracepoints. |
300 | * Called with tracepoints_mutex held. | ||
295 | */ | 301 | */ |
296 | void tracepoint_update_probe_range(struct tracepoint * const *begin, | 302 | static void tracepoint_update_probe_range(struct tracepoint * const *begin, |
297 | struct tracepoint * const *end) | 303 | struct tracepoint * const *end) |
298 | { | 304 | { |
299 | struct tracepoint * const *iter; | 305 | struct tracepoint * const *iter; |
300 | struct tracepoint_entry *mark_entry; | 306 | struct tracepoint_entry *mark_entry; |
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, | |||
302 | if (!begin) | 308 | if (!begin) |
303 | return; | 309 | return; |
304 | 310 | ||
305 | mutex_lock(&tracepoints_mutex); | ||
306 | for (iter = begin; iter < end; iter++) { | 311 | for (iter = begin; iter < end; iter++) { |
307 | mark_entry = get_tracepoint((*iter)->name); | 312 | mark_entry = get_tracepoint((*iter)->name); |
308 | if (mark_entry) { | 313 | if (mark_entry) { |
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, | |||
312 | disable_tracepoint(*iter); | 317 | disable_tracepoint(*iter); |
313 | } | 318 | } |
314 | } | 319 | } |
315 | mutex_unlock(&tracepoints_mutex); | ||
316 | } | 320 | } |
317 | 321 | ||
322 | #ifdef CONFIG_MODULES | ||
323 | void module_update_tracepoints(void) | ||
324 | { | ||
325 | struct tp_module *tp_mod; | ||
326 | |||
327 | list_for_each_entry(tp_mod, &tracepoint_module_list, list) | ||
328 | tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, | ||
329 | tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); | ||
330 | } | ||
331 | #else /* CONFIG_MODULES */ | ||
332 | void module_update_tracepoints(void) | ||
333 | { | ||
334 | } | ||
335 | #endif /* CONFIG_MODULES */ | ||
336 | |||
337 | |||
318 | /* | 338 | /* |
319 | * Update probes, removing the faulty probes. | 339 | * Update probes, removing the faulty probes. |
340 | * Called with tracepoints_mutex held. | ||
320 | */ | 341 | */ |
321 | static void tracepoint_update_probes(void) | 342 | static void tracepoint_update_probes(void) |
322 | { | 343 | { |
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) | |||
359 | 380 | ||
360 | mutex_lock(&tracepoints_mutex); | 381 | mutex_lock(&tracepoints_mutex); |
361 | old = tracepoint_add_probe(name, probe, data); | 382 | old = tracepoint_add_probe(name, probe, data); |
362 | mutex_unlock(&tracepoints_mutex); | 383 | if (IS_ERR(old)) { |
363 | if (IS_ERR(old)) | 384 | mutex_unlock(&tracepoints_mutex); |
364 | return PTR_ERR(old); | 385 | return PTR_ERR(old); |
365 | 386 | } | |
366 | tracepoint_update_probes(); /* may update entry */ | 387 | tracepoint_update_probes(); /* may update entry */ |
388 | mutex_unlock(&tracepoints_mutex); | ||
367 | release_probes(old); | 389 | release_probes(old); |
368 | return 0; | 390 | return 0; |
369 | } | 391 | } |
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) | |||
402 | 424 | ||
403 | mutex_lock(&tracepoints_mutex); | 425 | mutex_lock(&tracepoints_mutex); |
404 | old = tracepoint_remove_probe(name, probe, data); | 426 | old = tracepoint_remove_probe(name, probe, data); |
405 | mutex_unlock(&tracepoints_mutex); | 427 | if (IS_ERR(old)) { |
406 | if (IS_ERR(old)) | 428 | mutex_unlock(&tracepoints_mutex); |
407 | return PTR_ERR(old); | 429 | return PTR_ERR(old); |
408 | 430 | } | |
409 | tracepoint_update_probes(); /* may update entry */ | 431 | tracepoint_update_probes(); /* may update entry */ |
432 | mutex_unlock(&tracepoints_mutex); | ||
410 | release_probes(old); | 433 | release_probes(old); |
411 | return 0; | 434 | return 0; |
412 | } | 435 | } |
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void) | |||
489 | if (!list_empty(&old_probes)) | 512 | if (!list_empty(&old_probes)) |
490 | list_replace_init(&old_probes, &release_probes); | 513 | list_replace_init(&old_probes, &release_probes); |
491 | need_update = 0; | 514 | need_update = 0; |
492 | mutex_unlock(&tracepoints_mutex); | ||
493 | |||
494 | tracepoint_update_probes(); | 515 | tracepoint_update_probes(); |
516 | mutex_unlock(&tracepoints_mutex); | ||
495 | list_for_each_entry_safe(pos, next, &release_probes, u.list) { | 517 | list_for_each_entry_safe(pos, next, &release_probes, u.list) { |
496 | list_del(&pos->u.list); | 518 | list_del(&pos->u.list); |
497 | call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); | 519 | call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); |
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); | |||
509 | * Will return the first tracepoint in the range if the input tracepoint is | 531 | * Will return the first tracepoint in the range if the input tracepoint is |
510 | * NULL. | 532 | * NULL. |
511 | */ | 533 | */ |
512 | int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, | 534 | static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, |
513 | struct tracepoint * const *begin, struct tracepoint * const *end) | 535 | struct tracepoint * const *begin, struct tracepoint * const *end) |
514 | { | 536 | { |
515 | if (!*tracepoint && begin != end) { | 537 | if (!*tracepoint && begin != end) { |
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, | |||
520 | return 1; | 542 | return 1; |
521 | return 0; | 543 | return 0; |
522 | } | 544 | } |
523 | EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); | ||
524 | 545 | ||
546 | #ifdef CONFIG_MODULES | ||
525 | static void tracepoint_get_iter(struct tracepoint_iter *iter) | 547 | static void tracepoint_get_iter(struct tracepoint_iter *iter) |
526 | { | 548 | { |
527 | int found = 0; | 549 | int found = 0; |
550 | struct tp_module *iter_mod; | ||
528 | 551 | ||
529 | /* Core kernel tracepoints */ | 552 | /* Core kernel tracepoints */ |
530 | if (!iter->module) { | 553 | if (!iter->module) { |
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) | |||
534 | if (found) | 557 | if (found) |
535 | goto end; | 558 | goto end; |
536 | } | 559 | } |
537 | /* tracepoints in modules. */ | 560 | /* Tracepoints in modules */ |
538 | found = module_get_iter_tracepoints(iter); | 561 | mutex_lock(&tracepoints_mutex); |
562 | list_for_each_entry(iter_mod, &tracepoint_module_list, list) { | ||
563 | /* | ||
564 | * Sorted module list | ||
565 | */ | ||
566 | if (iter_mod < iter->module) | ||
567 | continue; | ||
568 | else if (iter_mod > iter->module) | ||
569 | iter->tracepoint = NULL; | ||
570 | found = tracepoint_get_iter_range(&iter->tracepoint, | ||
571 | iter_mod->tracepoints_ptrs, | ||
572 | iter_mod->tracepoints_ptrs | ||
573 | + iter_mod->num_tracepoints); | ||
574 | if (found) { | ||
575 | iter->module = iter_mod; | ||
576 | break; | ||
577 | } | ||
578 | } | ||
579 | mutex_unlock(&tracepoints_mutex); | ||
539 | end: | 580 | end: |
540 | if (!found) | 581 | if (!found) |
541 | tracepoint_iter_reset(iter); | 582 | tracepoint_iter_reset(iter); |
542 | } | 583 | } |
584 | #else /* CONFIG_MODULES */ | ||
585 | static void tracepoint_get_iter(struct tracepoint_iter *iter) | ||
586 | { | ||
587 | int found = 0; | ||
588 | |||
589 | /* Core kernel tracepoints */ | ||
590 | found = tracepoint_get_iter_range(&iter->tracepoint, | ||
591 | __start___tracepoints_ptrs, | ||
592 | __stop___tracepoints_ptrs); | ||
593 | if (!found) | ||
594 | tracepoint_iter_reset(iter); | ||
595 | } | ||
596 | #endif /* CONFIG_MODULES */ | ||
543 | 597 | ||
544 | void tracepoint_iter_start(struct tracepoint_iter *iter) | 598 | void tracepoint_iter_start(struct tracepoint_iter *iter) |
545 | { | 599 | { |
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop); | |||
566 | 620 | ||
567 | void tracepoint_iter_reset(struct tracepoint_iter *iter) | 621 | void tracepoint_iter_reset(struct tracepoint_iter *iter) |
568 | { | 622 | { |
623 | #ifdef CONFIG_MODULES | ||
569 | iter->module = NULL; | 624 | iter->module = NULL; |
625 | #endif /* CONFIG_MODULES */ | ||
570 | iter->tracepoint = NULL; | 626 | iter->tracepoint = NULL; |
571 | } | 627 | } |
572 | EXPORT_SYMBOL_GPL(tracepoint_iter_reset); | 628 | EXPORT_SYMBOL_GPL(tracepoint_iter_reset); |
573 | 629 | ||
574 | #ifdef CONFIG_MODULES | 630 | #ifdef CONFIG_MODULES |
631 | static int tracepoint_module_coming(struct module *mod) | ||
632 | { | ||
633 | struct tp_module *tp_mod, *iter; | ||
634 | int ret = 0; | ||
635 | |||
636 | /* | ||
637 | * We skip modules that taint the kernel, especially those with a different | ||
638 | * module header (for forced load), to make sure we don't cause a crash. | ||
639 | */ | ||
640 | if (mod->taints) | ||
641 | return 0; | ||
642 | mutex_lock(&tracepoints_mutex); | ||
643 | tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); | ||
644 | if (!tp_mod) { | ||
645 | ret = -ENOMEM; | ||
646 | goto end; | ||
647 | } | ||
648 | tp_mod->num_tracepoints = mod->num_tracepoints; | ||
649 | tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; | ||
650 | |||
651 | /* | ||
652 | * tracepoint_module_list is kept sorted by struct module pointer | ||
653 | * address for iteration on tracepoints from a seq_file that can release | ||
654 | * the mutex between calls. | ||
655 | */ | ||
656 | list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { | ||
657 | BUG_ON(iter == tp_mod); /* Should never be in the list twice */ | ||
658 | if (iter < tp_mod) { | ||
659 | /* We belong to the location right after iter. */ | ||
660 | list_add(&tp_mod->list, &iter->list); | ||
661 | goto module_added; | ||
662 | } | ||
663 | } | ||
664 | /* We belong to the beginning of the list */ | ||
665 | list_add(&tp_mod->list, &tracepoint_module_list); | ||
666 | module_added: | ||
667 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | ||
668 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
669 | end: | ||
670 | mutex_unlock(&tracepoints_mutex); | ||
671 | return ret; | ||
672 | } | ||
673 | |||
674 | static int tracepoint_module_going(struct module *mod) | ||
675 | { | ||
676 | struct tp_module *pos; | ||
677 | |||
678 | mutex_lock(&tracepoints_mutex); | ||
679 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | ||
680 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
681 | list_for_each_entry(pos, &tracepoint_module_list, list) { | ||
682 | if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { | ||
683 | list_del(&pos->list); | ||
684 | kfree(pos); | ||
685 | break; | ||
686 | } | ||
687 | } | ||
688 | /* | ||
689 | * In the case of modules that were tainted at "coming", we'll simply | ||
690 | * walk through the list without finding it. We cannot use the "tainted" | ||
691 | * flag on "going", in case a module taints the kernel only after being | ||
692 | * loaded. | ||
693 | */ | ||
694 | mutex_unlock(&tracepoints_mutex); | ||
695 | return 0; | ||
696 | } | ||
575 | 697 | ||
576 | int tracepoint_module_notify(struct notifier_block *self, | 698 | int tracepoint_module_notify(struct notifier_block *self, |
577 | unsigned long val, void *data) | 699 | unsigned long val, void *data) |
578 | { | 700 | { |
579 | struct module *mod = data; | 701 | struct module *mod = data; |
702 | int ret = 0; | ||
580 | 703 | ||
581 | switch (val) { | 704 | switch (val) { |
582 | case MODULE_STATE_COMING: | 705 | case MODULE_STATE_COMING: |
706 | ret = tracepoint_module_coming(mod); | ||
707 | break; | ||
708 | case MODULE_STATE_LIVE: | ||
709 | break; | ||
583 | case MODULE_STATE_GOING: | 710 | case MODULE_STATE_GOING: |
584 | tracepoint_update_probe_range(mod->tracepoints_ptrs, | 711 | ret = tracepoint_module_going(mod); |
585 | mod->tracepoints_ptrs + mod->num_tracepoints); | ||
586 | break; | 712 | break; |
587 | } | 713 | } |
588 | return 0; | 714 | return ret; |
589 | } | 715 | } |
590 | 716 | ||
591 | struct notifier_block tracepoint_module_nb = { | 717 | struct notifier_block tracepoint_module_nb = { |
@@ -598,7 +724,6 @@ static int init_tracepoints(void) | |||
598 | return register_module_notifier(&tracepoint_module_nb); | 724 | return register_module_notifier(&tracepoint_module_nb); |
599 | } | 725 | } |
600 | __initcall(init_tracepoints); | 726 | __initcall(init_tracepoints); |
601 | |||
602 | #endif /* CONFIG_MODULES */ | 727 | #endif /* CONFIG_MODULES */ |
603 | 728 | ||
604 | #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS | 729 | #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS |
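
The tracepoint.c rework moves module bookkeeping out of module.c: tracepoint_module_coming()/going() maintain a local tracepoint_module_list under tracepoints_mutex, kept sorted by struct module pointer address so that tracepoint_get_iter() can drop the mutex between seq_file calls and still resume at the first module at or beyond the one it last saw. Below is a userspace sketch of that sorted-by-address insert, simplified to a singly linked list; the kernel uses list_for_each_entry_reverse() on a list_head, and the names here only mirror the patch.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct tp_module {
	struct tp_module *next;
	unsigned int num_tracepoints;
};

static struct tp_module *module_list;	/* kept sorted by pointer value */

static void module_coming(struct tp_module *mod)
{
	struct tp_module **pos = &module_list;

	/* walk to the first entry whose address is >= mod */
	while (*pos && (uintptr_t)*pos < (uintptr_t)mod)
		pos = &(*pos)->next;
	mod->next = *pos;
	*pos = mod;
}

int main(void)
{
	struct tp_module *a = calloc(1, sizeof(*a));
	struct tp_module *b = calloc(1, sizeof(*b));

	module_coming(b);
	module_coming(a);	/* insertion order does not matter */

	for (struct tp_module *m = module_list; m; m = m->next)
		printf("%p\n", (void *)m);	/* ascending address order */

	free(a);
	free(b);
	return 0;
}
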
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 5bbfac85866e..23b4d784ebdd 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
127 | 127 | ||
128 | local_irq_save(flags); | 128 | local_irq_save(flags); |
129 | time = tsk->stime + tsk->utime; | 129 | time = tsk->stime + tsk->utime; |
130 | dtime = cputime_sub(time, tsk->acct_timexpd); | 130 | dtime = time - tsk->acct_timexpd; |
131 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); | 131 | jiffies_to_timeval(cputime_to_jiffies(dtime), &value); |
132 | delta = value.tv_sec; | 132 | delta = value.tv_sec; |
133 | delta = delta * USEC_PER_SEC + value.tv_usec; | 133 | delta = delta * USEC_PER_SEC + value.tv_usec; |
diff --git a/kernel/up.c b/kernel/up.c index 1ff27a28bb7d..c54c75e9faf7 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
@@ -4,7 +4,7 @@ | |||
4 | 4 | ||
5 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/module.h> | 7 | #include <linux/export.h> |
8 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
9 | 9 | ||
10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, | 10 | int smp_call_function_single(int cpu, void (*func) (void *info), void *info, |
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 92cb706c7fc8..1744bb80f1fb 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
@@ -2,7 +2,7 @@ | |||
2 | #include <linux/user-return-notifier.h> | 2 | #include <linux/user-return-notifier.h> |
3 | #include <linux/percpu.h> | 3 | #include <linux/percpu.h> |
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | #include <linux/module.h> | 5 | #include <linux/export.h> |
6 | 6 | ||
7 | static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); | 7 | static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); |
8 | 8 | ||
diff --git a/kernel/user.c b/kernel/user.c index 9e03e9c1df8d..71dd2363ab0f 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/bitops.h> | 14 | #include <linux/bitops.h> |
15 | #include <linux/key.h> | 15 | #include <linux/key.h> |
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/module.h> | 17 | #include <linux/export.h> |
18 | #include <linux/user_namespace.h> | 18 | #include <linux/user_namespace.h> |
19 | 19 | ||
20 | /* | 20 | /* |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9da289c34f22..3b906e98b1db 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * License. | 5 | * License. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/module.h> | 8 | #include <linux/export.h> |
9 | #include <linux/nsproxy.h> | 9 | #include <linux/nsproxy.h> |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/user_namespace.h> | 11 | #include <linux/user_namespace.h> |
diff --git a/kernel/utsname.c b/kernel/utsname.c index bff131b9510a..405caf91aad5 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * License. | 9 | * License. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/module.h> | 12 | #include <linux/export.h> |
13 | #include <linux/uts.h> | 13 | #include <linux/uts.h> |
14 | #include <linux/utsname.h> | 14 | #include <linux/utsname.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index a2cd77e70d4d..63da38c2d820 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
@@ -9,10 +9,11 @@ | |||
9 | * License. | 9 | * License. |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/module.h> | 12 | #include <linux/export.h> |
13 | #include <linux/uts.h> | 13 | #include <linux/uts.h> |
14 | #include <linux/utsname.h> | 14 | #include <linux/utsname.h> |
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/wait.h> | ||
16 | 17 | ||
17 | static void *get_uts(ctl_table *table, int write) | 18 | static void *get_uts(ctl_table *table, int write) |
18 | { | 19 | { |
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write, | |||
51 | uts_table.data = get_uts(table, write); | 52 | uts_table.data = get_uts(table, write); |
52 | r = proc_dostring(&uts_table,write,buffer,lenp, ppos); | 53 | r = proc_dostring(&uts_table,write,buffer,lenp, ppos); |
53 | put_uts(table, write, uts_table.data); | 54 | put_uts(table, write, uts_table.data); |
55 | |||
56 | if (write) | ||
57 | proc_sys_poll_notify(table->poll); | ||
58 | |||
54 | return r; | 59 | return r; |
55 | } | 60 | } |
56 | #else | 61 | #else |
57 | #define proc_do_uts_string NULL | 62 | #define proc_do_uts_string NULL |
58 | #endif | 63 | #endif |
59 | 64 | ||
65 | static DEFINE_CTL_TABLE_POLL(hostname_poll); | ||
66 | static DEFINE_CTL_TABLE_POLL(domainname_poll); | ||
67 | |||
60 | static struct ctl_table uts_kern_table[] = { | 68 | static struct ctl_table uts_kern_table[] = { |
61 | { | 69 | { |
62 | .procname = "ostype", | 70 | .procname = "ostype", |
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = { | |||
85 | .maxlen = sizeof(init_uts_ns.name.nodename), | 93 | .maxlen = sizeof(init_uts_ns.name.nodename), |
86 | .mode = 0644, | 94 | .mode = 0644, |
87 | .proc_handler = proc_do_uts_string, | 95 | .proc_handler = proc_do_uts_string, |
96 | .poll = &hostname_poll, | ||
88 | }, | 97 | }, |
89 | { | 98 | { |
90 | .procname = "domainname", | 99 | .procname = "domainname", |
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = { | |||
92 | .maxlen = sizeof(init_uts_ns.name.domainname), | 101 | .maxlen = sizeof(init_uts_ns.name.domainname), |
93 | .mode = 0644, | 102 | .mode = 0644, |
94 | .proc_handler = proc_do_uts_string, | 103 | .proc_handler = proc_do_uts_string, |
104 | .poll = &domainname_poll, | ||
95 | }, | 105 | }, |
96 | {} | 106 | {} |
97 | }; | 107 | }; |
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = { | |||
105 | {} | 115 | {} |
106 | }; | 116 | }; |
107 | 117 | ||
118 | #ifdef CONFIG_PROC_SYSCTL | ||
119 | /* | ||
120 | * Notify userspace about a change in a certain entry of uts_kern_table, | ||
121 | * identified by the parameter proc. | ||
122 | */ | ||
123 | void uts_proc_notify(enum uts_proc proc) | ||
124 | { | ||
125 | struct ctl_table *table = &uts_kern_table[proc]; | ||
126 | |||
127 | proc_sys_poll_notify(table->poll); | ||
128 | } | ||
129 | #endif | ||
130 | |||
108 | static int __init utsname_sysctl_init(void) | 131 | static int __init utsname_sysctl_init(void) |
109 | { | 132 | { |
110 | register_sysctl_table(uts_root_table); | 133 | register_sysctl_table(uts_root_table); |
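With the poll tables wired up above, userspace can block until the hostname or domainname changes instead of re-reading the file on a timer. A user-space sketch, assuming the usual procfs sysctl poll semantics (a change is reported as POLLERR|POLLPRI, and the file must be re-read from offset 0 to re-arm the notification):

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[128];
            struct pollfd pfd;
            ssize_t n;

            pfd.fd = open("/proc/sys/kernel/hostname", O_RDONLY);
            if (pfd.fd < 0)
                    return 1;
            pfd.events = POLLERR | POLLPRI;

            for (;;) {
                    lseek(pfd.fd, 0, SEEK_SET);
                    n = read(pfd.fd, buf, sizeof(buf) - 1);   /* re-arms poll */
                    if (n > 0) {
                            buf[n] = '\0';
                            printf("hostname: %s", buf);
                    }
                    if (poll(&pfd, 1, -1) < 0)                /* block for change */
                            break;
            }
            close(pfd.fd);
            return 0;
    }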
diff --git a/kernel/wait.c b/kernel/wait.c index f45ea8d2a1ce..7fdd9eaca2c3 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -4,16 +4,16 @@ | |||
4 | * (C) 2004 William Irwin, Oracle | 4 | * (C) 2004 William Irwin, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include <linux/init.h> |
7 | #include <linux/module.h> | 7 | #include <linux/export.h> |
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/wait.h> | 10 | #include <linux/wait.h> |
11 | #include <linux/hash.h> | 11 | #include <linux/hash.h> |
12 | 12 | ||
13 | void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) | 13 | void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) |
14 | { | 14 | { |
15 | spin_lock_init(&q->lock); | 15 | spin_lock_init(&q->lock); |
16 | lockdep_set_class(&q->lock, key); | 16 | lockdep_set_class_and_name(&q->lock, key, name); |
17 | INIT_LIST_HEAD(&q->task_list); | 17 | INIT_LIST_HEAD(&q->task_list); |
18 | } | 18 | } |
19 | 19 | ||
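Passing a name into __init_waitqueue_head() lets lockdep report a wait queue by its variable name instead of a shared generic class. The matching init_waitqueue_head() wrapper is not part of this diff; reconstructed as a sketch, it presumably stringifies its argument along these lines:

    #define init_waitqueue_head(q)                          \
            do {                                            \
                    static struct lock_class_key __key;     \
                                                            \
                    __init_waitqueue_head((q), #q, &__key); \
            } while (0)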
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 36491cd5b7d4..1d7bca7f4f52 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
321 | */ | 321 | */ |
322 | static int watchdog(void *unused) | 322 | static int watchdog(void *unused) |
323 | { | 323 | { |
324 | static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 324 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
325 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 325 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
326 | 326 | ||
327 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 327 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
@@ -350,7 +350,8 @@ static int watchdog(void *unused) | |||
350 | set_current_state(TASK_INTERRUPTIBLE); | 350 | set_current_state(TASK_INTERRUPTIBLE); |
351 | } | 351 | } |
352 | __set_current_state(TASK_RUNNING); | 352 | __set_current_state(TASK_RUNNING); |
353 | 353 | param.sched_priority = 0; | |
354 | sched_setscheduler(current, SCHED_NORMAL, ¶m); | ||
354 | return 0; | 355 | return 0; |
355 | } | 356 | } |
356 | 357 | ||
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu) | |||
438 | 439 | ||
439 | /* create the watchdog thread */ | 440 | /* create the watchdog thread */ |
440 | if (!p) { | 441 | if (!p) { |
441 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | 442 | p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); |
442 | if (IS_ERR(p)) { | 443 | if (IS_ERR(p)) { |
443 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 444 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); |
444 | if (!err) { | 445 | if (!err) { |
@@ -480,6 +481,8 @@ static void watchdog_disable(int cpu) | |||
480 | } | 481 | } |
481 | } | 482 | } |
482 | 483 | ||
484 | /* sysctl functions */ | ||
485 | #ifdef CONFIG_SYSCTL | ||
483 | static void watchdog_enable_all_cpus(void) | 486 | static void watchdog_enable_all_cpus(void) |
484 | { | 487 | { |
485 | int cpu; | 488 | int cpu; |
@@ -509,8 +512,6 @@ static void watchdog_disable_all_cpus(void) | |||
509 | } | 512 | } |
510 | 513 | ||
511 | 514 | ||
512 | /* sysctl functions */ | ||
513 | #ifdef CONFIG_SYSCTL | ||
514 | /* | 515 | /* |
515 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh | 516 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh |
516 | */ | 517 | */ |
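Switching the watchdog to kthread_create_on_node() lets the thread's task_struct and stack be allocated on the memory node of the CPU it will be bound to. The surrounding per-CPU kthread pattern looks roughly like this (worker_fn and the error handling are placeholders, not the watchdog code itself):

    struct task_struct *p;

    p = kthread_create_on_node(worker_fn, NULL, cpu_to_node(cpu),
                               "mythread/%d", cpu);
    if (IS_ERR(p))
            return PTR_ERR(p);
    kthread_bind(p, cpu);           /* pin to the CPU it services */
    wake_up_process(p);             /* kthread_create_*() leaves it stopped */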
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1783aabc6128..bec7b5b53e03 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -23,7 +23,7 @@ | |||
23 | * Please read Documentation/workqueue.txt for details. | 23 | * Please read Documentation/workqueue.txt for details. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/module.h> | 26 | #include <linux/export.h> |
27 | #include <linux/kernel.h> | 27 | #include <linux/kernel.h> |
28 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
29 | #include <linux/init.h> | 29 | #include <linux/init.h> |
@@ -242,10 +242,10 @@ struct workqueue_struct { | |||
242 | 242 | ||
243 | int nr_drainers; /* W: drain in progress */ | 243 | int nr_drainers; /* W: drain in progress */ |
244 | int saved_max_active; /* W: saved cwq max_active */ | 244 | int saved_max_active; /* W: saved cwq max_active */ |
245 | const char *name; /* I: workqueue name */ | ||
246 | #ifdef CONFIG_LOCKDEP | 245 | #ifdef CONFIG_LOCKDEP |
247 | struct lockdep_map lockdep_map; | 246 | struct lockdep_map lockdep_map; |
248 | #endif | 247 | #endif |
248 | char name[]; /* I: workqueue name */ | ||
249 | }; | 249 | }; |
250 | 250 | ||
251 | struct workqueue_struct *system_wq __read_mostly; | 251 | struct workqueue_struct *system_wq __read_mostly; |
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, | |||
2954 | return clamp_val(max_active, 1, lim); | 2954 | return clamp_val(max_active, 1, lim); |
2955 | } | 2955 | } |
2956 | 2956 | ||
2957 | struct workqueue_struct *__alloc_workqueue_key(const char *name, | 2957 | struct workqueue_struct *__alloc_workqueue_key(const char *fmt, |
2958 | unsigned int flags, | 2958 | unsigned int flags, |
2959 | int max_active, | 2959 | int max_active, |
2960 | struct lock_class_key *key, | 2960 | struct lock_class_key *key, |
2961 | const char *lock_name) | 2961 | const char *lock_name, ...) |
2962 | { | 2962 | { |
2963 | va_list args, args1; | ||
2963 | struct workqueue_struct *wq; | 2964 | struct workqueue_struct *wq; |
2964 | unsigned int cpu; | 2965 | unsigned int cpu; |
2966 | size_t namelen; | ||
2967 | |||
2968 | /* determine namelen, allocate wq and format name */ | ||
2969 | va_start(args, lock_name); | ||
2970 | va_copy(args1, args); | ||
2971 | namelen = vsnprintf(NULL, 0, fmt, args) + 1; | ||
2972 | |||
2973 | wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); | ||
2974 | if (!wq) | ||
2975 | goto err; | ||
2976 | |||
2977 | vsnprintf(wq->name, namelen, fmt, args1); | ||
2978 | va_end(args); | ||
2979 | va_end(args1); | ||
2965 | 2980 | ||
2966 | /* | 2981 | /* |
2967 | * Workqueues which may be used during memory reclaim should | 2982 | * Workqueues which may be used during memory reclaim should |
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2978 | flags |= WQ_HIGHPRI; | 2993 | flags |= WQ_HIGHPRI; |
2979 | 2994 | ||
2980 | max_active = max_active ?: WQ_DFL_ACTIVE; | 2995 | max_active = max_active ?: WQ_DFL_ACTIVE; |
2981 | max_active = wq_clamp_max_active(max_active, flags, name); | 2996 | max_active = wq_clamp_max_active(max_active, flags, wq->name); |
2982 | |||
2983 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); | ||
2984 | if (!wq) | ||
2985 | goto err; | ||
2986 | 2997 | ||
2998 | /* init wq */ | ||
2987 | wq->flags = flags; | 2999 | wq->flags = flags; |
2988 | wq->saved_max_active = max_active; | 3000 | wq->saved_max_active = max_active; |
2989 | mutex_init(&wq->flush_mutex); | 3001 | mutex_init(&wq->flush_mutex); |
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
2991 | INIT_LIST_HEAD(&wq->flusher_queue); | 3003 | INIT_LIST_HEAD(&wq->flusher_queue); |
2992 | INIT_LIST_HEAD(&wq->flusher_overflow); | 3004 | INIT_LIST_HEAD(&wq->flusher_overflow); |
2993 | 3005 | ||
2994 | wq->name = name; | ||
2995 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); | 3006 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
2996 | INIT_LIST_HEAD(&wq->list); | 3007 | INIT_LIST_HEAD(&wq->list); |
2997 | 3008 | ||
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, | |||
3020 | if (!rescuer) | 3031 | if (!rescuer) |
3021 | goto err; | 3032 | goto err; |
3022 | 3033 | ||
3023 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); | 3034 | rescuer->task = kthread_create(rescuer_thread, wq, "%s", |
3035 | wq->name); | ||
3024 | if (IS_ERR(rescuer->task)) | 3036 | if (IS_ERR(rescuer->task)) |
3025 | goto err; | 3037 | goto err; |
3026 | 3038 | ||
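The workqueue change above stores the formatted name in a flexible array member at the end of the struct, sized with a first vsnprintf() pass against a NULL buffer. A minimal user-space sketch of that two-pass sizing idiom (struct named_obj and alloc_named_obj() are invented for illustration):

    #include <stdarg.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct named_obj {
            int flags;
            char name[];            /* flexible array, sized at allocation */
    };

    static struct named_obj *alloc_named_obj(const char *fmt, ...)
    {
            struct named_obj *obj;
            va_list args, args1;
            size_t namelen;

            va_start(args, fmt);
            va_copy(args1, args);
            namelen = vsnprintf(NULL, 0, fmt, args) + 1;       /* length pass */

            obj = calloc(1, sizeof(*obj) + namelen);
            if (obj)
                    vsnprintf(obj->name, namelen, fmt, args1); /* fill pass */

            va_end(args);
            va_end(args1);
            return obj;
    }

    int main(void)
    {
            struct named_obj *obj = alloc_named_obj("wq-%s-%d", "events", 3);

            if (obj) {
                    printf("%s\n", obj->name);
                    free(obj);
            }
            return 0;
    }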